4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
44 ===============================================================================
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
85 #include "qemu/osdep.h"
87 #include "qemu/bitops.h"
88 #include "fpu/softfloat.h"
90 /* We only need stdlib for abort() */
92 /*----------------------------------------------------------------------------
93 | Primitive arithmetic functions, including multi-word arithmetic, and
94 | division and square root approximations. (Can be specialized to target if
96 *----------------------------------------------------------------------------*/
97 #include "fpu/softfloat-macros.h"
102 * Fast emulation of guest FP instructions is challenging for two reasons.
103 * First, FP instruction semantics are similar but not identical, particularly
104 * when handling NaNs. Second, emulating at reasonable speed the guest FP
105 * exception flags is not trivial: reading the host's flags register with a
106 * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp],
107 * and trapping on every FP exception is not fast nor pleasant to work with.
109 * We address these challenges by leveraging the host FPU for a subset of the
110 * operations. To do this we expand on the idea presented in this paper:
112 * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a
113 * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615.
115 * The idea is thus to leverage the host FPU to (1) compute FP operations
116 * and (2) identify whether FP exceptions occurred while avoiding
117 * expensive exception flag register accesses.
119 * An important optimization shown in the paper is that given that exception
120 * flags are rarely cleared by the guest, we can avoid recomputing some flags.
121 * This is particularly useful for the inexact flag, which is very frequently
122 * raised in floating-point workloads.
124 * We optimize the code further by deferring to soft-fp whenever FP exception
125 * detection might get hairy. Two examples: (1) when at least one operand is
126 * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result
127 * and the result is < the minimum normal.
129 #define GEN_INPUT_FLUSH__NOCHECK(name, soft_t) \
130 static inline void name(soft_t *a, float_status *s) \
132 if (unlikely(soft_t ## _is_denormal(*a))) { \
133 *a = soft_t ## _set_sign(soft_t ## _zero, \
134 soft_t ## _is_neg(*a)); \
135 s->float_exception_flags |= float_flag_input_denormal; \
139 GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck
, float32
)
140 GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck
, float64
)
141 #undef GEN_INPUT_FLUSH__NOCHECK
143 #define GEN_INPUT_FLUSH1(name, soft_t) \
144 static inline void name(soft_t *a, float_status *s) \
146 if (likely(!s->flush_inputs_to_zero)) { \
149 soft_t ## _input_flush__nocheck(a, s); \
152 GEN_INPUT_FLUSH1(float32_input_flush1
, float32
)
153 GEN_INPUT_FLUSH1(float64_input_flush1
, float64
)
154 #undef GEN_INPUT_FLUSH1
156 #define GEN_INPUT_FLUSH2(name, soft_t) \
157 static inline void name(soft_t *a, soft_t *b, float_status *s) \
159 if (likely(!s->flush_inputs_to_zero)) { \
162 soft_t ## _input_flush__nocheck(a, s); \
163 soft_t ## _input_flush__nocheck(b, s); \
166 GEN_INPUT_FLUSH2(float32_input_flush2
, float32
)
167 GEN_INPUT_FLUSH2(float64_input_flush2
, float64
)
168 #undef GEN_INPUT_FLUSH2
170 #define GEN_INPUT_FLUSH3(name, soft_t) \
171 static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
173 if (likely(!s->flush_inputs_to_zero)) { \
176 soft_t ## _input_flush__nocheck(a, s); \
177 soft_t ## _input_flush__nocheck(b, s); \
178 soft_t ## _input_flush__nocheck(c, s); \
181 GEN_INPUT_FLUSH3(float32_input_flush3
, float32
)
182 GEN_INPUT_FLUSH3(float64_input_flush3
, float64
)
183 #undef GEN_INPUT_FLUSH3
186 * Choose whether to use fpclassify or float32/64_* primitives in the generated
187 * hardfloat functions. Each combination of number of inputs and float size
188 * gets its own value.
190 #if defined(__x86_64__)
191 # define QEMU_HARDFLOAT_1F32_USE_FP 0
192 # define QEMU_HARDFLOAT_1F64_USE_FP 1
193 # define QEMU_HARDFLOAT_2F32_USE_FP 0
194 # define QEMU_HARDFLOAT_2F64_USE_FP 1
195 # define QEMU_HARDFLOAT_3F32_USE_FP 0
196 # define QEMU_HARDFLOAT_3F64_USE_FP 1
198 # define QEMU_HARDFLOAT_1F32_USE_FP 0
199 # define QEMU_HARDFLOAT_1F64_USE_FP 0
200 # define QEMU_HARDFLOAT_2F32_USE_FP 0
201 # define QEMU_HARDFLOAT_2F64_USE_FP 0
202 # define QEMU_HARDFLOAT_3F32_USE_FP 0
203 # define QEMU_HARDFLOAT_3F64_USE_FP 0
207 * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over
208 * float{32,64}_is_infinity when !USE_FP.
209 * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup.
210 * On power64 however, using isinf() reduces fp-bench performance by up to 50%.
212 #if defined(__x86_64__) || defined(__aarch64__)
213 # define QEMU_HARDFLOAT_USE_ISINF 1
215 # define QEMU_HARDFLOAT_USE_ISINF 0
219 * Some targets clear the FP flags before most FP operations. This prevents
220 * the use of hardfloat, since hardfloat relies on the inexact flag being
223 #if defined(TARGET_PPC) || defined(__FAST_MATH__)
224 # if defined(__FAST_MATH__)
225 # warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
228 # define QEMU_NO_HARDFLOAT 1
229 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
231 # define QEMU_NO_HARDFLOAT 0
232 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
235 static inline bool can_use_fpu(const float_status
*s
)
237 if (QEMU_NO_HARDFLOAT
) {
240 return likely(s
->float_exception_flags
& float_flag_inexact
&&
241 s
->float_rounding_mode
== float_round_nearest_even
);
245 * Hardfloat generation functions. Each operation can have two flavors:
246 * either using softfloat primitives (e.g. float32_is_zero_or_normal) for
247 * most condition checks, or native ones (e.g. fpclassify).
249 * The flavor is chosen by the callers. Instead of using macros, we rely on the
250 * compiler to propagate constants and inline everything into the callers.
252 * We only generate functions for operations with two inputs, since only
253 * these are common enough to justify consolidating them into common code.
266 typedef bool (*f32_check_fn
)(union_float32 a
, union_float32 b
);
267 typedef bool (*f64_check_fn
)(union_float64 a
, union_float64 b
);
269 typedef float32 (*soft_f32_op2_fn
)(float32 a
, float32 b
, float_status
*s
);
270 typedef float64 (*soft_f64_op2_fn
)(float64 a
, float64 b
, float_status
*s
);
271 typedef float (*hard_f32_op2_fn
)(float a
, float b
);
272 typedef double (*hard_f64_op2_fn
)(double a
, double b
);
274 /* 2-input is-zero-or-normal */
275 static inline bool f32_is_zon2(union_float32 a
, union_float32 b
)
277 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
279 * Not using a temp variable for consecutive fpclassify calls ends up
280 * generating faster code.
282 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
283 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
);
285 return float32_is_zero_or_normal(a
.s
) &&
286 float32_is_zero_or_normal(b
.s
);
289 static inline bool f64_is_zon2(union_float64 a
, union_float64 b
)
291 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
292 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
293 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
);
295 return float64_is_zero_or_normal(a
.s
) &&
296 float64_is_zero_or_normal(b
.s
);
299 /* 3-input is-zero-or-normal */
301 bool f32_is_zon3(union_float32 a
, union_float32 b
, union_float32 c
)
303 if (QEMU_HARDFLOAT_3F32_USE_FP
) {
304 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
305 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
) &&
306 (fpclassify(c
.h
) == FP_NORMAL
|| fpclassify(c
.h
) == FP_ZERO
);
308 return float32_is_zero_or_normal(a
.s
) &&
309 float32_is_zero_or_normal(b
.s
) &&
310 float32_is_zero_or_normal(c
.s
);
314 bool f64_is_zon3(union_float64 a
, union_float64 b
, union_float64 c
)
316 if (QEMU_HARDFLOAT_3F64_USE_FP
) {
317 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
318 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
) &&
319 (fpclassify(c
.h
) == FP_NORMAL
|| fpclassify(c
.h
) == FP_ZERO
);
321 return float64_is_zero_or_normal(a
.s
) &&
322 float64_is_zero_or_normal(b
.s
) &&
323 float64_is_zero_or_normal(c
.s
);
326 static inline bool f32_is_inf(union_float32 a
)
328 if (QEMU_HARDFLOAT_USE_ISINF
) {
331 return float32_is_infinity(a
.s
);
334 static inline bool f64_is_inf(union_float64 a
)
336 if (QEMU_HARDFLOAT_USE_ISINF
) {
339 return float64_is_infinity(a
.s
);
342 /* Note: @fast_test and @post can be NULL */
343 static inline float32
344 float32_gen2(float32 xa
, float32 xb
, float_status
*s
,
345 hard_f32_op2_fn hard
, soft_f32_op2_fn soft
,
346 f32_check_fn pre
, f32_check_fn post
,
347 f32_check_fn fast_test
, soft_f32_op2_fn fast_op
)
349 union_float32 ua
, ub
, ur
;
354 if (unlikely(!can_use_fpu(s
))) {
358 float32_input_flush2(&ua
.s
, &ub
.s
, s
);
359 if (unlikely(!pre(ua
, ub
))) {
362 if (fast_test
&& fast_test(ua
, ub
)) {
363 return fast_op(ua
.s
, ub
.s
, s
);
366 ur
.h
= hard(ua
.h
, ub
.h
);
367 if (unlikely(f32_is_inf(ur
))) {
368 s
->float_exception_flags
|= float_flag_overflow
;
369 } else if (unlikely(fabsf(ur
.h
) <= FLT_MIN
)) {
370 if (post
== NULL
|| post(ua
, ub
)) {
377 return soft(ua
.s
, ub
.s
, s
);
380 static inline float64
381 float64_gen2(float64 xa
, float64 xb
, float_status
*s
,
382 hard_f64_op2_fn hard
, soft_f64_op2_fn soft
,
383 f64_check_fn pre
, f64_check_fn post
,
384 f64_check_fn fast_test
, soft_f64_op2_fn fast_op
)
386 union_float64 ua
, ub
, ur
;
391 if (unlikely(!can_use_fpu(s
))) {
395 float64_input_flush2(&ua
.s
, &ub
.s
, s
);
396 if (unlikely(!pre(ua
, ub
))) {
399 if (fast_test
&& fast_test(ua
, ub
)) {
400 return fast_op(ua
.s
, ub
.s
, s
);
403 ur
.h
= hard(ua
.h
, ub
.h
);
404 if (unlikely(f64_is_inf(ur
))) {
405 s
->float_exception_flags
|= float_flag_overflow
;
406 } else if (unlikely(fabs(ur
.h
) <= DBL_MIN
)) {
407 if (post
== NULL
|| post(ua
, ub
)) {
414 return soft(ua
.s
, ub
.s
, s
);
417 /*----------------------------------------------------------------------------
418 | Returns the fraction bits of the half-precision floating-point value `a'.
419 *----------------------------------------------------------------------------*/
421 static inline uint32_t extractFloat16Frac(float16 a
)
423 return float16_val(a
) & 0x3ff;
426 /*----------------------------------------------------------------------------
427 | Returns the exponent bits of the half-precision floating-point value `a'.
428 *----------------------------------------------------------------------------*/
430 static inline int extractFloat16Exp(float16 a
)
432 return (float16_val(a
) >> 10) & 0x1f;
435 /*----------------------------------------------------------------------------
436 | Returns the fraction bits of the single-precision floating-point value `a'.
437 *----------------------------------------------------------------------------*/
439 static inline uint32_t extractFloat32Frac(float32 a
)
441 return float32_val(a
) & 0x007FFFFF;
444 /*----------------------------------------------------------------------------
445 | Returns the exponent bits of the single-precision floating-point value `a'.
446 *----------------------------------------------------------------------------*/
448 static inline int extractFloat32Exp(float32 a
)
450 return (float32_val(a
) >> 23) & 0xFF;
453 /*----------------------------------------------------------------------------
454 | Returns the sign bit of the single-precision floating-point value `a'.
455 *----------------------------------------------------------------------------*/
457 static inline flag
extractFloat32Sign(float32 a
)
459 return float32_val(a
) >> 31;
462 /*----------------------------------------------------------------------------
463 | Returns the fraction bits of the double-precision floating-point value `a'.
464 *----------------------------------------------------------------------------*/
466 static inline uint64_t extractFloat64Frac(float64 a
)
468 return float64_val(a
) & LIT64(0x000FFFFFFFFFFFFF);
471 /*----------------------------------------------------------------------------
472 | Returns the exponent bits of the double-precision floating-point value `a'.
473 *----------------------------------------------------------------------------*/
475 static inline int extractFloat64Exp(float64 a
)
477 return (float64_val(a
) >> 52) & 0x7FF;
480 /*----------------------------------------------------------------------------
481 | Returns the sign bit of the double-precision floating-point value `a'.
482 *----------------------------------------------------------------------------*/
484 static inline flag
extractFloat64Sign(float64 a
)
486 return float64_val(a
) >> 63;
490 * Classify a floating point number. Everything above float_class_qnan
491 * is a NaN so cls >= float_class_qnan is any NaN.
494 typedef enum __attribute__ ((__packed__
)) {
495 float_class_unclassified
,
499 float_class_qnan
, /* all NaNs from here */
503 /* Simple helpers for checking if, or what kind of, NaN we have */
504 static inline __attribute__((unused
)) bool is_nan(FloatClass c
)
506 return unlikely(c
>= float_class_qnan
);
509 static inline __attribute__((unused
)) bool is_snan(FloatClass c
)
511 return c
== float_class_snan
;
514 static inline __attribute__((unused
)) bool is_qnan(FloatClass c
)
516 return c
== float_class_qnan
;
520 * Structure holding all of the decomposed parts of a float. The
521 * exponent is unbiased and the fraction is normalized. All
522 * calculations are done with a 64 bit fraction and then rounded as
523 * appropriate for the final format.
525 * Thanks to the packed FloatClass a decent compiler should be able to
526 * fit the whole structure into registers and avoid using the stack
527 * for parameter passing.
537 #define DECOMPOSED_BINARY_POINT (64 - 2)
538 #define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT)
539 #define DECOMPOSED_OVERFLOW_BIT (DECOMPOSED_IMPLICIT_BIT << 1)
541 /* Structure holding all of the relevant parameters for a format.
542 * exp_size: the size of the exponent field
543 * exp_bias: the offset applied to the exponent field
544 * exp_max: the maximum normalised exponent
545 * frac_size: the size of the fraction field
546 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
547 * The following are computed based the size of fraction
548 * frac_lsb: least significant bit of fraction
549 * frac_lsbm1: the bit below the least significant bit (for rounding)
550 * round_mask/roundeven_mask: masks used for rounding
551 * The following optional modifiers are available:
552 * arm_althp: handle ARM Alternative Half Precision
563 uint64_t roundeven_mask
;
567 /* Expand fields based on the size of exponent and fraction */
568 #define FLOAT_PARAMS(E, F) \
570 .exp_bias = ((1 << E) - 1) >> 1, \
571 .exp_max = (1 << E) - 1, \
573 .frac_shift = DECOMPOSED_BINARY_POINT - F, \
574 .frac_lsb = 1ull << (DECOMPOSED_BINARY_POINT - F), \
575 .frac_lsbm1 = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1), \
576 .round_mask = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1, \
577 .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1
579 static const FloatFmt float16_params
= {
583 static const FloatFmt float16_params_ahp
= {
588 static const FloatFmt float32_params
= {
592 static const FloatFmt float64_params
= {
596 /* Unpack a float to parts, but do not canonicalize. */
597 static inline FloatParts
unpack_raw(FloatFmt fmt
, uint64_t raw
)
599 const int sign_pos
= fmt
.frac_size
+ fmt
.exp_size
;
601 return (FloatParts
) {
602 .cls
= float_class_unclassified
,
603 .sign
= extract64(raw
, sign_pos
, 1),
604 .exp
= extract64(raw
, fmt
.frac_size
, fmt
.exp_size
),
605 .frac
= extract64(raw
, 0, fmt
.frac_size
),
609 static inline FloatParts
float16_unpack_raw(float16 f
)
611 return unpack_raw(float16_params
, f
);
614 static inline FloatParts
float32_unpack_raw(float32 f
)
616 return unpack_raw(float32_params
, f
);
619 static inline FloatParts
float64_unpack_raw(float64 f
)
621 return unpack_raw(float64_params
, f
);
624 /* Pack a float from parts, but do not canonicalize. */
625 static inline uint64_t pack_raw(FloatFmt fmt
, FloatParts p
)
627 const int sign_pos
= fmt
.frac_size
+ fmt
.exp_size
;
628 uint64_t ret
= deposit64(p
.frac
, fmt
.frac_size
, fmt
.exp_size
, p
.exp
);
629 return deposit64(ret
, sign_pos
, 1, p
.sign
);
632 static inline float16
float16_pack_raw(FloatParts p
)
634 return make_float16(pack_raw(float16_params
, p
));
637 static inline float32
float32_pack_raw(FloatParts p
)
639 return make_float32(pack_raw(float32_params
, p
));
642 static inline float64
float64_pack_raw(FloatParts p
)
644 return make_float64(pack_raw(float64_params
, p
));
647 /*----------------------------------------------------------------------------
648 | Functions and definitions to determine: (1) whether tininess for underflow
649 | is detected before or after rounding by default, (2) what (if anything)
650 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
651 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
652 | are propagated from function inputs to output. These details are target-
654 *----------------------------------------------------------------------------*/
655 #include "softfloat-specialize.h"
657 /* Canonicalize EXP and FRAC, setting CLS. */
658 static FloatParts
sf_canonicalize(FloatParts part
, const FloatFmt
*parm
,
659 float_status
*status
)
661 if (part
.exp
== parm
->exp_max
&& !parm
->arm_althp
) {
662 if (part
.frac
== 0) {
663 part
.cls
= float_class_inf
;
665 part
.frac
<<= parm
->frac_shift
;
666 part
.cls
= (parts_is_snan_frac(part
.frac
, status
)
667 ? float_class_snan
: float_class_qnan
);
669 } else if (part
.exp
== 0) {
670 if (likely(part
.frac
== 0)) {
671 part
.cls
= float_class_zero
;
672 } else if (status
->flush_inputs_to_zero
) {
673 float_raise(float_flag_input_denormal
, status
);
674 part
.cls
= float_class_zero
;
677 int shift
= clz64(part
.frac
) - 1;
678 part
.cls
= float_class_normal
;
679 part
.exp
= parm
->frac_shift
- parm
->exp_bias
- shift
+ 1;
683 part
.cls
= float_class_normal
;
684 part
.exp
-= parm
->exp_bias
;
685 part
.frac
= DECOMPOSED_IMPLICIT_BIT
+ (part
.frac
<< parm
->frac_shift
);
690 /* Round and uncanonicalize a floating-point number by parts. There
691 * are FRAC_SHIFT bits that may require rounding at the bottom of the
692 * fraction; these bits will be removed. The exponent will be biased
693 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
696 static FloatParts
round_canonical(FloatParts p
, float_status
*s
,
697 const FloatFmt
*parm
)
699 const uint64_t frac_lsb
= parm
->frac_lsb
;
700 const uint64_t frac_lsbm1
= parm
->frac_lsbm1
;
701 const uint64_t round_mask
= parm
->round_mask
;
702 const uint64_t roundeven_mask
= parm
->roundeven_mask
;
703 const int exp_max
= parm
->exp_max
;
704 const int frac_shift
= parm
->frac_shift
;
713 case float_class_normal
:
714 switch (s
->float_rounding_mode
) {
715 case float_round_nearest_even
:
716 overflow_norm
= false;
717 inc
= ((frac
& roundeven_mask
) != frac_lsbm1
? frac_lsbm1
: 0);
719 case float_round_ties_away
:
720 overflow_norm
= false;
723 case float_round_to_zero
:
724 overflow_norm
= true;
728 inc
= p
.sign
? 0 : round_mask
;
729 overflow_norm
= p
.sign
;
731 case float_round_down
:
732 inc
= p
.sign
? round_mask
: 0;
733 overflow_norm
= !p
.sign
;
735 case float_round_to_odd
:
736 overflow_norm
= true;
737 inc
= frac
& frac_lsb
? 0 : round_mask
;
740 g_assert_not_reached();
743 exp
+= parm
->exp_bias
;
744 if (likely(exp
> 0)) {
745 if (frac
& round_mask
) {
746 flags
|= float_flag_inexact
;
748 if (frac
& DECOMPOSED_OVERFLOW_BIT
) {
755 if (parm
->arm_althp
) {
756 /* ARM Alt HP eschews Inf and NaN for a wider exponent. */
757 if (unlikely(exp
> exp_max
)) {
758 /* Overflow. Return the maximum normal. */
759 flags
= float_flag_invalid
;
763 } else if (unlikely(exp
>= exp_max
)) {
764 flags
|= float_flag_overflow
| float_flag_inexact
;
769 p
.cls
= float_class_inf
;
773 } else if (s
->flush_to_zero
) {
774 flags
|= float_flag_output_denormal
;
775 p
.cls
= float_class_zero
;
778 bool is_tiny
= (s
->float_detect_tininess
779 == float_tininess_before_rounding
)
781 || !((frac
+ inc
) & DECOMPOSED_OVERFLOW_BIT
);
783 shift64RightJamming(frac
, 1 - exp
, &frac
);
784 if (frac
& round_mask
) {
785 /* Need to recompute round-to-even. */
786 switch (s
->float_rounding_mode
) {
787 case float_round_nearest_even
:
788 inc
= ((frac
& roundeven_mask
) != frac_lsbm1
791 case float_round_to_odd
:
792 inc
= frac
& frac_lsb
? 0 : round_mask
;
795 flags
|= float_flag_inexact
;
799 exp
= (frac
& DECOMPOSED_IMPLICIT_BIT
? 1 : 0);
802 if (is_tiny
&& (flags
& float_flag_inexact
)) {
803 flags
|= float_flag_underflow
;
805 if (exp
== 0 && frac
== 0) {
806 p
.cls
= float_class_zero
;
811 case float_class_zero
:
817 case float_class_inf
:
819 assert(!parm
->arm_althp
);
824 case float_class_qnan
:
825 case float_class_snan
:
826 assert(!parm
->arm_althp
);
828 frac
>>= parm
->frac_shift
;
832 g_assert_not_reached();
835 float_raise(flags
, s
);
841 /* Explicit FloatFmt version */
842 static FloatParts
float16a_unpack_canonical(float16 f
, float_status
*s
,
843 const FloatFmt
*params
)
845 return sf_canonicalize(float16_unpack_raw(f
), params
, s
);
848 static FloatParts
float16_unpack_canonical(float16 f
, float_status
*s
)
850 return float16a_unpack_canonical(f
, s
, &float16_params
);
853 static float16
float16a_round_pack_canonical(FloatParts p
, float_status
*s
,
854 const FloatFmt
*params
)
856 return float16_pack_raw(round_canonical(p
, s
, params
));
859 static float16
float16_round_pack_canonical(FloatParts p
, float_status
*s
)
861 return float16a_round_pack_canonical(p
, s
, &float16_params
);
864 static FloatParts
float32_unpack_canonical(float32 f
, float_status
*s
)
866 return sf_canonicalize(float32_unpack_raw(f
), &float32_params
, s
);
869 static float32
float32_round_pack_canonical(FloatParts p
, float_status
*s
)
871 return float32_pack_raw(round_canonical(p
, s
, &float32_params
));
874 static FloatParts
float64_unpack_canonical(float64 f
, float_status
*s
)
876 return sf_canonicalize(float64_unpack_raw(f
), &float64_params
, s
);
879 static float64
float64_round_pack_canonical(FloatParts p
, float_status
*s
)
881 return float64_pack_raw(round_canonical(p
, s
, &float64_params
));
884 static FloatParts
return_nan(FloatParts a
, float_status
*s
)
887 case float_class_snan
:
888 s
->float_exception_flags
|= float_flag_invalid
;
889 a
= parts_silence_nan(a
, s
);
891 case float_class_qnan
:
892 if (s
->default_nan_mode
) {
893 return parts_default_nan(s
);
898 g_assert_not_reached();
903 static FloatParts
pick_nan(FloatParts a
, FloatParts b
, float_status
*s
)
905 if (is_snan(a
.cls
) || is_snan(b
.cls
)) {
906 s
->float_exception_flags
|= float_flag_invalid
;
909 if (s
->default_nan_mode
) {
910 return parts_default_nan(s
);
912 if (pickNaN(a
.cls
, b
.cls
,
914 (a
.frac
== b
.frac
&& a
.sign
< b
.sign
))) {
917 if (is_snan(a
.cls
)) {
918 return parts_silence_nan(a
, s
);
924 static FloatParts
pick_nan_muladd(FloatParts a
, FloatParts b
, FloatParts c
,
925 bool inf_zero
, float_status
*s
)
929 if (is_snan(a
.cls
) || is_snan(b
.cls
) || is_snan(c
.cls
)) {
930 s
->float_exception_flags
|= float_flag_invalid
;
933 which
= pickNaNMulAdd(a
.cls
, b
.cls
, c
.cls
, inf_zero
, s
);
935 if (s
->default_nan_mode
) {
936 /* Note that this check is after pickNaNMulAdd so that function
937 * has an opportunity to set the Invalid flag.
952 return parts_default_nan(s
);
954 g_assert_not_reached();
957 if (is_snan(a
.cls
)) {
958 return parts_silence_nan(a
, s
);
964 * Returns the result of adding or subtracting the values of the
965 * floating-point values `a' and `b'. The operation is performed
966 * according to the IEC/IEEE Standard for Binary Floating-Point
970 static FloatParts
addsub_floats(FloatParts a
, FloatParts b
, bool subtract
,
973 bool a_sign
= a
.sign
;
974 bool b_sign
= b
.sign
^ subtract
;
976 if (a_sign
!= b_sign
) {
979 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
980 if (a
.exp
> b
.exp
|| (a
.exp
== b
.exp
&& a
.frac
>= b
.frac
)) {
981 shift64RightJamming(b
.frac
, a
.exp
- b
.exp
, &b
.frac
);
982 a
.frac
= a
.frac
- b
.frac
;
984 shift64RightJamming(a
.frac
, b
.exp
- a
.exp
, &a
.frac
);
985 a
.frac
= b
.frac
- a
.frac
;
991 a
.cls
= float_class_zero
;
992 a
.sign
= s
->float_rounding_mode
== float_round_down
;
994 int shift
= clz64(a
.frac
) - 1;
995 a
.frac
= a
.frac
<< shift
;
996 a
.exp
= a
.exp
- shift
;
1001 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
1002 return pick_nan(a
, b
, s
);
1004 if (a
.cls
== float_class_inf
) {
1005 if (b
.cls
== float_class_inf
) {
1006 float_raise(float_flag_invalid
, s
);
1007 return parts_default_nan(s
);
1011 if (a
.cls
== float_class_zero
&& b
.cls
== float_class_zero
) {
1012 a
.sign
= s
->float_rounding_mode
== float_round_down
;
1015 if (a
.cls
== float_class_zero
|| b
.cls
== float_class_inf
) {
1016 b
.sign
= a_sign
^ 1;
1019 if (b
.cls
== float_class_zero
) {
1024 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
1025 if (a
.exp
> b
.exp
) {
1026 shift64RightJamming(b
.frac
, a
.exp
- b
.exp
, &b
.frac
);
1027 } else if (a
.exp
< b
.exp
) {
1028 shift64RightJamming(a
.frac
, b
.exp
- a
.exp
, &a
.frac
);
1032 if (a
.frac
& DECOMPOSED_OVERFLOW_BIT
) {
1033 shift64RightJamming(a
.frac
, 1, &a
.frac
);
1038 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
1039 return pick_nan(a
, b
, s
);
1041 if (a
.cls
== float_class_inf
|| b
.cls
== float_class_zero
) {
1044 if (b
.cls
== float_class_inf
|| a
.cls
== float_class_zero
) {
1049 g_assert_not_reached();
1053 * Returns the result of adding or subtracting the floating-point
1054 * values `a' and `b'. The operation is performed according to the
1055 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1058 float16 QEMU_FLATTEN
float16_add(float16 a
, float16 b
, float_status
*status
)
1060 FloatParts pa
= float16_unpack_canonical(a
, status
);
1061 FloatParts pb
= float16_unpack_canonical(b
, status
);
1062 FloatParts pr
= addsub_floats(pa
, pb
, false, status
);
1064 return float16_round_pack_canonical(pr
, status
);
1067 float16 QEMU_FLATTEN
float16_sub(float16 a
, float16 b
, float_status
*status
)
1069 FloatParts pa
= float16_unpack_canonical(a
, status
);
1070 FloatParts pb
= float16_unpack_canonical(b
, status
);
1071 FloatParts pr
= addsub_floats(pa
, pb
, true, status
);
1073 return float16_round_pack_canonical(pr
, status
);
1076 static float32 QEMU_SOFTFLOAT_ATTR
1077 soft_f32_addsub(float32 a
, float32 b
, bool subtract
, float_status
*status
)
1079 FloatParts pa
= float32_unpack_canonical(a
, status
);
1080 FloatParts pb
= float32_unpack_canonical(b
, status
);
1081 FloatParts pr
= addsub_floats(pa
, pb
, subtract
, status
);
1083 return float32_round_pack_canonical(pr
, status
);
1086 static inline float32
soft_f32_add(float32 a
, float32 b
, float_status
*status
)
1088 return soft_f32_addsub(a
, b
, false, status
);
1091 static inline float32
soft_f32_sub(float32 a
, float32 b
, float_status
*status
)
1093 return soft_f32_addsub(a
, b
, true, status
);
1096 static float64 QEMU_SOFTFLOAT_ATTR
1097 soft_f64_addsub(float64 a
, float64 b
, bool subtract
, float_status
*status
)
1099 FloatParts pa
= float64_unpack_canonical(a
, status
);
1100 FloatParts pb
= float64_unpack_canonical(b
, status
);
1101 FloatParts pr
= addsub_floats(pa
, pb
, subtract
, status
);
1103 return float64_round_pack_canonical(pr
, status
);
1106 static inline float64
soft_f64_add(float64 a
, float64 b
, float_status
*status
)
1108 return soft_f64_addsub(a
, b
, false, status
);
1111 static inline float64
soft_f64_sub(float64 a
, float64 b
, float_status
*status
)
1113 return soft_f64_addsub(a
, b
, true, status
);
1116 static float hard_f32_add(float a
, float b
)
1121 static float hard_f32_sub(float a
, float b
)
1126 static double hard_f64_add(double a
, double b
)
1131 static double hard_f64_sub(double a
, double b
)
1136 static bool f32_addsub_post(union_float32 a
, union_float32 b
)
1138 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
1139 return !(fpclassify(a
.h
) == FP_ZERO
&& fpclassify(b
.h
) == FP_ZERO
);
1141 return !(float32_is_zero(a
.s
) && float32_is_zero(b
.s
));
1144 static bool f64_addsub_post(union_float64 a
, union_float64 b
)
1146 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
1147 return !(fpclassify(a
.h
) == FP_ZERO
&& fpclassify(b
.h
) == FP_ZERO
);
1149 return !(float64_is_zero(a
.s
) && float64_is_zero(b
.s
));
1153 static float32
float32_addsub(float32 a
, float32 b
, float_status
*s
,
1154 hard_f32_op2_fn hard
, soft_f32_op2_fn soft
)
1156 return float32_gen2(a
, b
, s
, hard
, soft
,
1157 f32_is_zon2
, f32_addsub_post
, NULL
, NULL
);
1160 static float64
float64_addsub(float64 a
, float64 b
, float_status
*s
,
1161 hard_f64_op2_fn hard
, soft_f64_op2_fn soft
)
1163 return float64_gen2(a
, b
, s
, hard
, soft
,
1164 f64_is_zon2
, f64_addsub_post
, NULL
, NULL
);
1167 float32 QEMU_FLATTEN
1168 float32_add(float32 a
, float32 b
, float_status
*s
)
1170 return float32_addsub(a
, b
, s
, hard_f32_add
, soft_f32_add
);
1173 float32 QEMU_FLATTEN
1174 float32_sub(float32 a
, float32 b
, float_status
*s
)
1176 return float32_addsub(a
, b
, s
, hard_f32_sub
, soft_f32_sub
);
1179 float64 QEMU_FLATTEN
1180 float64_add(float64 a
, float64 b
, float_status
*s
)
1182 return float64_addsub(a
, b
, s
, hard_f64_add
, soft_f64_add
);
1185 float64 QEMU_FLATTEN
1186 float64_sub(float64 a
, float64 b
, float_status
*s
)
1188 return float64_addsub(a
, b
, s
, hard_f64_sub
, soft_f64_sub
);
1192 * Returns the result of multiplying the floating-point values `a' and
1193 * `b'. The operation is performed according to the IEC/IEEE Standard
1194 * for Binary Floating-Point Arithmetic.
1197 static FloatParts
mul_floats(FloatParts a
, FloatParts b
, float_status
*s
)
1199 bool sign
= a
.sign
^ b
.sign
;
1201 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
1203 int exp
= a
.exp
+ b
.exp
;
1205 mul64To128(a
.frac
, b
.frac
, &hi
, &lo
);
1206 shift128RightJamming(hi
, lo
, DECOMPOSED_BINARY_POINT
, &hi
, &lo
);
1207 if (lo
& DECOMPOSED_OVERFLOW_BIT
) {
1208 shift64RightJamming(lo
, 1, &lo
);
1218 /* handle all the NaN cases */
1219 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
1220 return pick_nan(a
, b
, s
);
1222 /* Inf * Zero == NaN */
1223 if ((a
.cls
== float_class_inf
&& b
.cls
== float_class_zero
) ||
1224 (a
.cls
== float_class_zero
&& b
.cls
== float_class_inf
)) {
1225 s
->float_exception_flags
|= float_flag_invalid
;
1226 return parts_default_nan(s
);
1228 /* Multiply by 0 or Inf */
1229 if (a
.cls
== float_class_inf
|| a
.cls
== float_class_zero
) {
1233 if (b
.cls
== float_class_inf
|| b
.cls
== float_class_zero
) {
1237 g_assert_not_reached();
1240 float16 QEMU_FLATTEN
float16_mul(float16 a
, float16 b
, float_status
*status
)
1242 FloatParts pa
= float16_unpack_canonical(a
, status
);
1243 FloatParts pb
= float16_unpack_canonical(b
, status
);
1244 FloatParts pr
= mul_floats(pa
, pb
, status
);
1246 return float16_round_pack_canonical(pr
, status
);
1249 static float32 QEMU_SOFTFLOAT_ATTR
1250 soft_f32_mul(float32 a
, float32 b
, float_status
*status
)
1252 FloatParts pa
= float32_unpack_canonical(a
, status
);
1253 FloatParts pb
= float32_unpack_canonical(b
, status
);
1254 FloatParts pr
= mul_floats(pa
, pb
, status
);
1256 return float32_round_pack_canonical(pr
, status
);
1259 static float64 QEMU_SOFTFLOAT_ATTR
1260 soft_f64_mul(float64 a
, float64 b
, float_status
*status
)
1262 FloatParts pa
= float64_unpack_canonical(a
, status
);
1263 FloatParts pb
= float64_unpack_canonical(b
, status
);
1264 FloatParts pr
= mul_floats(pa
, pb
, status
);
1266 return float64_round_pack_canonical(pr
, status
);
1269 static float hard_f32_mul(float a
, float b
)
1274 static double hard_f64_mul(double a
, double b
)
1279 static bool f32_mul_fast_test(union_float32 a
, union_float32 b
)
1281 return float32_is_zero(a
.s
) || float32_is_zero(b
.s
);
1284 static bool f64_mul_fast_test(union_float64 a
, union_float64 b
)
1286 return float64_is_zero(a
.s
) || float64_is_zero(b
.s
);
1289 static float32
f32_mul_fast_op(float32 a
, float32 b
, float_status
*s
)
1291 bool signbit
= float32_is_neg(a
) ^ float32_is_neg(b
);
1293 return float32_set_sign(float32_zero
, signbit
);
1296 static float64
f64_mul_fast_op(float64 a
, float64 b
, float_status
*s
)
1298 bool signbit
= float64_is_neg(a
) ^ float64_is_neg(b
);
1300 return float64_set_sign(float64_zero
, signbit
);
1303 float32 QEMU_FLATTEN
1304 float32_mul(float32 a
, float32 b
, float_status
*s
)
1306 return float32_gen2(a
, b
, s
, hard_f32_mul
, soft_f32_mul
,
1307 f32_is_zon2
, NULL
, f32_mul_fast_test
, f32_mul_fast_op
);
1310 float64 QEMU_FLATTEN
1311 float64_mul(float64 a
, float64 b
, float_status
*s
)
1313 return float64_gen2(a
, b
, s
, hard_f64_mul
, soft_f64_mul
,
1314 f64_is_zon2
, NULL
, f64_mul_fast_test
, f64_mul_fast_op
);
1318 * Returns the result of multiplying the floating-point values `a' and
1319 * `b' then adding 'c', with no intermediate rounding step after the
1320 * multiplication. The operation is performed according to the
1321 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
1322 * The flags argument allows the caller to select negation of the
1323 * addend, the intermediate product, or the final result. (The
1324 * difference between this and having the caller do a separate
1325 * negation is that negating externally will flip the sign bit on
1329 static FloatParts
muladd_floats(FloatParts a
, FloatParts b
, FloatParts c
,
1330 int flags
, float_status
*s
)
1332 bool inf_zero
= ((1 << a
.cls
) | (1 << b
.cls
)) ==
1333 ((1 << float_class_inf
) | (1 << float_class_zero
));
1335 bool sign_flip
= flags
& float_muladd_negate_result
;
1340 /* It is implementation-defined whether the cases of (0,inf,qnan)
1341 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
1342 * they return if they do), so we have to hand this information
1343 * off to the target-specific pick-a-NaN routine.
1345 if (is_nan(a
.cls
) || is_nan(b
.cls
) || is_nan(c
.cls
)) {
1346 return pick_nan_muladd(a
, b
, c
, inf_zero
, s
);
1350 s
->float_exception_flags
|= float_flag_invalid
;
1351 return parts_default_nan(s
);
1354 if (flags
& float_muladd_negate_c
) {
1358 p_sign
= a
.sign
^ b
.sign
;
1360 if (flags
& float_muladd_negate_product
) {
1364 if (a
.cls
== float_class_inf
|| b
.cls
== float_class_inf
) {
1365 p_class
= float_class_inf
;
1366 } else if (a
.cls
== float_class_zero
|| b
.cls
== float_class_zero
) {
1367 p_class
= float_class_zero
;
1369 p_class
= float_class_normal
;
1372 if (c
.cls
== float_class_inf
) {
1373 if (p_class
== float_class_inf
&& p_sign
!= c
.sign
) {
1374 s
->float_exception_flags
|= float_flag_invalid
;
1375 return parts_default_nan(s
);
1377 a
.cls
= float_class_inf
;
1378 a
.sign
= c
.sign
^ sign_flip
;
1383 if (p_class
== float_class_inf
) {
1384 a
.cls
= float_class_inf
;
1385 a
.sign
= p_sign
^ sign_flip
;
1389 if (p_class
== float_class_zero
) {
1390 if (c
.cls
== float_class_zero
) {
1391 if (p_sign
!= c
.sign
) {
1392 p_sign
= s
->float_rounding_mode
== float_round_down
;
1395 } else if (flags
& float_muladd_halve_result
) {
1398 c
.sign
^= sign_flip
;
1402 /* a & b should be normals now... */
1403 assert(a
.cls
== float_class_normal
&&
1404 b
.cls
== float_class_normal
);
1406 p_exp
= a
.exp
+ b
.exp
;
1408 /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
1411 mul64To128(a
.frac
, b
.frac
, &hi
, &lo
);
1412 /* binary point now at bit 124 */
1414 /* check for overflow */
1415 if (hi
& (1ULL << (DECOMPOSED_BINARY_POINT
* 2 + 1 - 64))) {
1416 shift128RightJamming(hi
, lo
, 1, &hi
, &lo
);
1421 if (c
.cls
== float_class_zero
) {
1422 /* move binary point back to 62 */
1423 shift128RightJamming(hi
, lo
, DECOMPOSED_BINARY_POINT
, &hi
, &lo
);
1425 int exp_diff
= p_exp
- c
.exp
;
1426 if (p_sign
== c
.sign
) {
1428 if (exp_diff
<= 0) {
1429 shift128RightJamming(hi
, lo
,
1430 DECOMPOSED_BINARY_POINT
- exp_diff
,
1435 uint64_t c_hi
, c_lo
;
1436 /* shift c to the same binary point as the product (124) */
1439 shift128RightJamming(c_hi
, c_lo
,
1442 add128(hi
, lo
, c_hi
, c_lo
, &hi
, &lo
);
1443 /* move binary point back to 62 */
1444 shift128RightJamming(hi
, lo
, DECOMPOSED_BINARY_POINT
, &hi
, &lo
);
1447 if (lo
& DECOMPOSED_OVERFLOW_BIT
) {
1448 shift64RightJamming(lo
, 1, &lo
);
1454 uint64_t c_hi
, c_lo
;
1455 /* make C binary point match product at bit 124 */
1459 if (exp_diff
<= 0) {
1460 shift128RightJamming(hi
, lo
, -exp_diff
, &hi
, &lo
);
1463 (hi
> c_hi
|| (hi
== c_hi
&& lo
>= c_lo
))) {
1464 sub128(hi
, lo
, c_hi
, c_lo
, &hi
, &lo
);
1466 sub128(c_hi
, c_lo
, hi
, lo
, &hi
, &lo
);
1471 shift128RightJamming(c_hi
, c_lo
,
1474 sub128(hi
, lo
, c_hi
, c_lo
, &hi
, &lo
);
1477 if (hi
== 0 && lo
== 0) {
1478 a
.cls
= float_class_zero
;
1479 a
.sign
= s
->float_rounding_mode
== float_round_down
;
1480 a
.sign
^= sign_flip
;
1487 shift
= clz64(lo
) + 64;
1489 /* Normalizing to a binary point of 124 is the
1490 correct adjust for the exponent. However since we're
1491 shifting, we might as well put the binary point back
1492 at 62 where we really want it. Therefore shift as
1493 if we're leaving 1 bit at the top of the word, but
1494 adjust the exponent as if we're leaving 3 bits. */
1497 lo
= lo
<< (shift
- 64);
1499 hi
= (hi
<< shift
) | (lo
>> (64 - shift
));
1500 lo
= hi
| ((lo
<< shift
) != 0);
1507 if (flags
& float_muladd_halve_result
) {
1511 /* finally prepare our result */
1512 a
.cls
= float_class_normal
;
1513 a
.sign
= p_sign
^ sign_flip
;
1520 float16 QEMU_FLATTEN
float16_muladd(float16 a
, float16 b
, float16 c
,
1521 int flags
, float_status
*status
)
1523 FloatParts pa
= float16_unpack_canonical(a
, status
);
1524 FloatParts pb
= float16_unpack_canonical(b
, status
);
1525 FloatParts pc
= float16_unpack_canonical(c
, status
);
1526 FloatParts pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1528 return float16_round_pack_canonical(pr
, status
);
1531 static float32 QEMU_SOFTFLOAT_ATTR
1532 soft_f32_muladd(float32 a
, float32 b
, float32 c
, int flags
,
1533 float_status
*status
)
1535 FloatParts pa
= float32_unpack_canonical(a
, status
);
1536 FloatParts pb
= float32_unpack_canonical(b
, status
);
1537 FloatParts pc
= float32_unpack_canonical(c
, status
);
1538 FloatParts pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1540 return float32_round_pack_canonical(pr
, status
);
1543 static float64 QEMU_SOFTFLOAT_ATTR
1544 soft_f64_muladd(float64 a
, float64 b
, float64 c
, int flags
,
1545 float_status
*status
)
1547 FloatParts pa
= float64_unpack_canonical(a
, status
);
1548 FloatParts pb
= float64_unpack_canonical(b
, status
);
1549 FloatParts pc
= float64_unpack_canonical(c
, status
);
1550 FloatParts pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1552 return float64_round_pack_canonical(pr
, status
);
1555 static bool force_soft_fma
;
1557 float32 QEMU_FLATTEN
1558 float32_muladd(float32 xa
, float32 xb
, float32 xc
, int flags
, float_status
*s
)
1560 union_float32 ua
, ub
, uc
, ur
;
1566 if (unlikely(!can_use_fpu(s
))) {
1569 if (unlikely(flags
& float_muladd_halve_result
)) {
1573 float32_input_flush3(&ua
.s
, &ub
.s
, &uc
.s
, s
);
1574 if (unlikely(!f32_is_zon3(ua
, ub
, uc
))) {
1578 if (unlikely(force_soft_fma
)) {
1583 * When (a || b) == 0, there's no need to check for under/over flow,
1584 * since we know the addend is (normal || 0) and the product is 0.
1586 if (float32_is_zero(ua
.s
) || float32_is_zero(ub
.s
)) {
1590 prod_sign
= float32_is_neg(ua
.s
) ^ float32_is_neg(ub
.s
);
1591 prod_sign
^= !!(flags
& float_muladd_negate_product
);
1592 up
.s
= float32_set_sign(float32_zero
, prod_sign
);
1594 if (flags
& float_muladd_negate_c
) {
1599 union_float32 ua_orig
= ua
;
1600 union_float32 uc_orig
= uc
;
1602 if (flags
& float_muladd_negate_product
) {
1605 if (flags
& float_muladd_negate_c
) {
1609 ur
.h
= fmaf(ua
.h
, ub
.h
, uc
.h
);
1611 if (unlikely(f32_is_inf(ur
))) {
1612 s
->float_exception_flags
|= float_flag_overflow
;
1613 } else if (unlikely(fabsf(ur
.h
) <= FLT_MIN
)) {
1619 if (flags
& float_muladd_negate_result
) {
1620 return float32_chs(ur
.s
);
1625 return soft_f32_muladd(ua
.s
, ub
.s
, uc
.s
, flags
, s
);
1628 float64 QEMU_FLATTEN
1629 float64_muladd(float64 xa
, float64 xb
, float64 xc
, int flags
, float_status
*s
)
1631 union_float64 ua
, ub
, uc
, ur
;
1637 if (unlikely(!can_use_fpu(s
))) {
1640 if (unlikely(flags
& float_muladd_halve_result
)) {
1644 float64_input_flush3(&ua
.s
, &ub
.s
, &uc
.s
, s
);
1645 if (unlikely(!f64_is_zon3(ua
, ub
, uc
))) {
1649 if (unlikely(force_soft_fma
)) {
1654 * When (a || b) == 0, there's no need to check for under/over flow,
1655 * since we know the addend is (normal || 0) and the product is 0.
1657 if (float64_is_zero(ua
.s
) || float64_is_zero(ub
.s
)) {
1661 prod_sign
= float64_is_neg(ua
.s
) ^ float64_is_neg(ub
.s
);
1662 prod_sign
^= !!(flags
& float_muladd_negate_product
);
1663 up
.s
= float64_set_sign(float64_zero
, prod_sign
);
1665 if (flags
& float_muladd_negate_c
) {
1670 union_float64 ua_orig
= ua
;
1671 union_float64 uc_orig
= uc
;
1673 if (flags
& float_muladd_negate_product
) {
1676 if (flags
& float_muladd_negate_c
) {
1680 ur
.h
= fma(ua
.h
, ub
.h
, uc
.h
);
1682 if (unlikely(f64_is_inf(ur
))) {
1683 s
->float_exception_flags
|= float_flag_overflow
;
1684 } else if (unlikely(fabs(ur
.h
) <= FLT_MIN
)) {
1690 if (flags
& float_muladd_negate_result
) {
1691 return float64_chs(ur
.s
);
1696 return soft_f64_muladd(ua
.s
, ub
.s
, uc
.s
, flags
, s
);
1700 * Returns the result of dividing the floating-point value `a' by the
1701 * corresponding value `b'. The operation is performed according to
1702 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1705 static FloatParts
div_floats(FloatParts a
, FloatParts b
, float_status
*s
)
1707 bool sign
= a
.sign
^ b
.sign
;
1709 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
1710 uint64_t n0
, n1
, q
, r
;
1711 int exp
= a
.exp
- b
.exp
;
1714 * We want a 2*N / N-bit division to produce exactly an N-bit
1715 * result, so that we do not lose any precision and so that we
1716 * do not have to renormalize afterward. If A.frac < B.frac,
1717 * then division would produce an (N-1)-bit result; shift A left
1718 * by one to produce the an N-bit result, and decrement the
1719 * exponent to match.
1721 * The udiv_qrnnd algorithm that we're using requires normalization,
1722 * i.e. the msb of the denominator must be set. Since we know that
1723 * DECOMPOSED_BINARY_POINT is msb-1, the inputs must be shifted left
1724 * by one (more), and the remainder must be shifted right by one.
1726 if (a
.frac
< b
.frac
) {
1728 shift128Left(0, a
.frac
, DECOMPOSED_BINARY_POINT
+ 2, &n1
, &n0
);
1730 shift128Left(0, a
.frac
, DECOMPOSED_BINARY_POINT
+ 1, &n1
, &n0
);
1732 q
= udiv_qrnnd(&r
, n1
, n0
, b
.frac
<< 1);
1735 * Set lsb if there is a remainder, to set inexact.
1736 * As mentioned above, to find the actual value of the remainder we
1737 * would need to shift right, but (1) we are only concerned about
1738 * non-zero-ness, and (2) the remainder will always be even because
1739 * both inputs to the division primitive are even.
1741 a
.frac
= q
| (r
!= 0);
1746 /* handle all the NaN cases */
1747 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
1748 return pick_nan(a
, b
, s
);
1750 /* 0/0 or Inf/Inf */
1753 (a
.cls
== float_class_inf
|| a
.cls
== float_class_zero
)) {
1754 s
->float_exception_flags
|= float_flag_invalid
;
1755 return parts_default_nan(s
);
1757 /* Inf / x or 0 / x */
1758 if (a
.cls
== float_class_inf
|| a
.cls
== float_class_zero
) {
1763 if (b
.cls
== float_class_zero
) {
1764 s
->float_exception_flags
|= float_flag_divbyzero
;
1765 a
.cls
= float_class_inf
;
1770 if (b
.cls
== float_class_inf
) {
1771 a
.cls
= float_class_zero
;
1775 g_assert_not_reached();
1778 float16
float16_div(float16 a
, float16 b
, float_status
*status
)
1780 FloatParts pa
= float16_unpack_canonical(a
, status
);
1781 FloatParts pb
= float16_unpack_canonical(b
, status
);
1782 FloatParts pr
= div_floats(pa
, pb
, status
);
1784 return float16_round_pack_canonical(pr
, status
);
1787 static float32 QEMU_SOFTFLOAT_ATTR
1788 soft_f32_div(float32 a
, float32 b
, float_status
*status
)
1790 FloatParts pa
= float32_unpack_canonical(a
, status
);
1791 FloatParts pb
= float32_unpack_canonical(b
, status
);
1792 FloatParts pr
= div_floats(pa
, pb
, status
);
1794 return float32_round_pack_canonical(pr
, status
);
1797 static float64 QEMU_SOFTFLOAT_ATTR
1798 soft_f64_div(float64 a
, float64 b
, float_status
*status
)
1800 FloatParts pa
= float64_unpack_canonical(a
, status
);
1801 FloatParts pb
= float64_unpack_canonical(b
, status
);
1802 FloatParts pr
= div_floats(pa
, pb
, status
);
1804 return float64_round_pack_canonical(pr
, status
);
1807 static float hard_f32_div(float a
, float b
)
1812 static double hard_f64_div(double a
, double b
)
1817 static bool f32_div_pre(union_float32 a
, union_float32 b
)
1819 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
1820 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
1821 fpclassify(b
.h
) == FP_NORMAL
;
1823 return float32_is_zero_or_normal(a
.s
) && float32_is_normal(b
.s
);
1826 static bool f64_div_pre(union_float64 a
, union_float64 b
)
1828 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
1829 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
1830 fpclassify(b
.h
) == FP_NORMAL
;
1832 return float64_is_zero_or_normal(a
.s
) && float64_is_normal(b
.s
);
1835 static bool f32_div_post(union_float32 a
, union_float32 b
)
1837 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
1838 return fpclassify(a
.h
) != FP_ZERO
;
1840 return !float32_is_zero(a
.s
);
1843 static bool f64_div_post(union_float64 a
, union_float64 b
)
1845 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
1846 return fpclassify(a
.h
) != FP_ZERO
;
1848 return !float64_is_zero(a
.s
);
1851 float32 QEMU_FLATTEN
1852 float32_div(float32 a
, float32 b
, float_status
*s
)
1854 return float32_gen2(a
, b
, s
, hard_f32_div
, soft_f32_div
,
1855 f32_div_pre
, f32_div_post
, NULL
, NULL
);
1858 float64 QEMU_FLATTEN
1859 float64_div(float64 a
, float64 b
, float_status
*s
)
1861 return float64_gen2(a
, b
, s
, hard_f64_div
, soft_f64_div
,
1862 f64_div_pre
, f64_div_post
, NULL
, NULL
);
1866 * Float to Float conversions
1868 * Returns the result of converting one float format to another. The
1869 * conversion is performed according to the IEC/IEEE Standard for
1870 * Binary Floating-Point Arithmetic.
1872 * The float_to_float helper only needs to take care of raising
1873 * invalid exceptions and handling the conversion on NaNs.
1876 static FloatParts
float_to_float(FloatParts a
, const FloatFmt
*dstf
,
1879 if (dstf
->arm_althp
) {
1881 case float_class_qnan
:
1882 case float_class_snan
:
1883 /* There is no NaN in the destination format. Raise Invalid
1884 * and return a zero with the sign of the input NaN.
1886 s
->float_exception_flags
|= float_flag_invalid
;
1887 a
.cls
= float_class_zero
;
1892 case float_class_inf
:
1893 /* There is no Inf in the destination format. Raise Invalid
1894 * and return the maximum normal with the correct sign.
1896 s
->float_exception_flags
|= float_flag_invalid
;
1897 a
.cls
= float_class_normal
;
1898 a
.exp
= dstf
->exp_max
;
1899 a
.frac
= ((1ull << dstf
->frac_size
) - 1) << dstf
->frac_shift
;
1905 } else if (is_nan(a
.cls
)) {
1906 if (is_snan(a
.cls
)) {
1907 s
->float_exception_flags
|= float_flag_invalid
;
1908 a
= parts_silence_nan(a
, s
);
1910 if (s
->default_nan_mode
) {
1911 return parts_default_nan(s
);
1917 float32
float16_to_float32(float16 a
, bool ieee
, float_status
*s
)
1919 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
1920 FloatParts p
= float16a_unpack_canonical(a
, s
, fmt16
);
1921 FloatParts pr
= float_to_float(p
, &float32_params
, s
);
1922 return float32_round_pack_canonical(pr
, s
);
1925 float64
float16_to_float64(float16 a
, bool ieee
, float_status
*s
)
1927 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
1928 FloatParts p
= float16a_unpack_canonical(a
, s
, fmt16
);
1929 FloatParts pr
= float_to_float(p
, &float64_params
, s
);
1930 return float64_round_pack_canonical(pr
, s
);
1933 float16
float32_to_float16(float32 a
, bool ieee
, float_status
*s
)
1935 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
1936 FloatParts p
= float32_unpack_canonical(a
, s
);
1937 FloatParts pr
= float_to_float(p
, fmt16
, s
);
1938 return float16a_round_pack_canonical(pr
, s
, fmt16
);
1941 float64
float32_to_float64(float32 a
, float_status
*s
)
1943 FloatParts p
= float32_unpack_canonical(a
, s
);
1944 FloatParts pr
= float_to_float(p
, &float64_params
, s
);
1945 return float64_round_pack_canonical(pr
, s
);
1948 float16
float64_to_float16(float64 a
, bool ieee
, float_status
*s
)
1950 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
1951 FloatParts p
= float64_unpack_canonical(a
, s
);
1952 FloatParts pr
= float_to_float(p
, fmt16
, s
);
1953 return float16a_round_pack_canonical(pr
, s
, fmt16
);
1956 float32
float64_to_float32(float64 a
, float_status
*s
)
1958 FloatParts p
= float64_unpack_canonical(a
, s
);
1959 FloatParts pr
= float_to_float(p
, &float32_params
, s
);
1960 return float32_round_pack_canonical(pr
, s
);
1964 * Rounds the floating-point value `a' to an integer, and returns the
1965 * result as a floating-point value. The operation is performed
1966 * according to the IEC/IEEE Standard for Binary Floating-Point
1970 static FloatParts
round_to_int(FloatParts a
, int rmode
,
1971 int scale
, float_status
*s
)
1974 case float_class_qnan
:
1975 case float_class_snan
:
1976 return return_nan(a
, s
);
1978 case float_class_zero
:
1979 case float_class_inf
:
1980 /* already "integral" */
1983 case float_class_normal
:
1984 scale
= MIN(MAX(scale
, -0x10000), 0x10000);
1987 if (a
.exp
>= DECOMPOSED_BINARY_POINT
) {
1988 /* already integral */
1993 /* all fractional */
1994 s
->float_exception_flags
|= float_flag_inexact
;
1996 case float_round_nearest_even
:
1997 one
= a
.exp
== -1 && a
.frac
> DECOMPOSED_IMPLICIT_BIT
;
1999 case float_round_ties_away
:
2000 one
= a
.exp
== -1 && a
.frac
>= DECOMPOSED_IMPLICIT_BIT
;
2002 case float_round_to_zero
:
2005 case float_round_up
:
2008 case float_round_down
:
2011 case float_round_to_odd
:
2015 g_assert_not_reached();
2019 a
.frac
= DECOMPOSED_IMPLICIT_BIT
;
2022 a
.cls
= float_class_zero
;
2025 uint64_t frac_lsb
= DECOMPOSED_IMPLICIT_BIT
>> a
.exp
;
2026 uint64_t frac_lsbm1
= frac_lsb
>> 1;
2027 uint64_t rnd_even_mask
= (frac_lsb
- 1) | frac_lsb
;
2028 uint64_t rnd_mask
= rnd_even_mask
>> 1;
2032 case float_round_nearest_even
:
2033 inc
= ((a
.frac
& rnd_even_mask
) != frac_lsbm1
? frac_lsbm1
: 0);
2035 case float_round_ties_away
:
2038 case float_round_to_zero
:
2041 case float_round_up
:
2042 inc
= a
.sign
? 0 : rnd_mask
;
2044 case float_round_down
:
2045 inc
= a
.sign
? rnd_mask
: 0;
2047 case float_round_to_odd
:
2048 inc
= a
.frac
& frac_lsb
? 0 : rnd_mask
;
2051 g_assert_not_reached();
2054 if (a
.frac
& rnd_mask
) {
2055 s
->float_exception_flags
|= float_flag_inexact
;
2057 a
.frac
&= ~rnd_mask
;
2058 if (a
.frac
& DECOMPOSED_OVERFLOW_BIT
) {
2066 g_assert_not_reached();
2071 float16
float16_round_to_int(float16 a
, float_status
*s
)
2073 FloatParts pa
= float16_unpack_canonical(a
, s
);
2074 FloatParts pr
= round_to_int(pa
, s
->float_rounding_mode
, 0, s
);
2075 return float16_round_pack_canonical(pr
, s
);
2078 float32
float32_round_to_int(float32 a
, float_status
*s
)
2080 FloatParts pa
= float32_unpack_canonical(a
, s
);
2081 FloatParts pr
= round_to_int(pa
, s
->float_rounding_mode
, 0, s
);
2082 return float32_round_pack_canonical(pr
, s
);
2085 float64
float64_round_to_int(float64 a
, float_status
*s
)
2087 FloatParts pa
= float64_unpack_canonical(a
, s
);
2088 FloatParts pr
= round_to_int(pa
, s
->float_rounding_mode
, 0, s
);
2089 return float64_round_pack_canonical(pr
, s
);
2093 * Returns the result of converting the floating-point value `a' to
2094 * the two's complement integer format. The conversion is performed
2095 * according to the IEC/IEEE Standard for Binary Floating-Point
2096 * Arithmetic---which means in particular that the conversion is
2097 * rounded according to the current rounding mode. If `a' is a NaN,
2098 * the largest positive integer is returned. Otherwise, if the
2099 * conversion overflows, the largest integer with the same sign as `a'
2103 static int64_t round_to_int_and_pack(FloatParts in
, int rmode
, int scale
,
2104 int64_t min
, int64_t max
,
2108 int orig_flags
= get_float_exception_flags(s
);
2109 FloatParts p
= round_to_int(in
, rmode
, scale
, s
);
2112 case float_class_snan
:
2113 case float_class_qnan
:
2114 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2116 case float_class_inf
:
2117 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2118 return p
.sign
? min
: max
;
2119 case float_class_zero
:
2121 case float_class_normal
:
2122 if (p
.exp
< DECOMPOSED_BINARY_POINT
) {
2123 r
= p
.frac
>> (DECOMPOSED_BINARY_POINT
- p
.exp
);
2124 } else if (p
.exp
- DECOMPOSED_BINARY_POINT
< 2) {
2125 r
= p
.frac
<< (p
.exp
- DECOMPOSED_BINARY_POINT
);
2130 if (r
<= -(uint64_t) min
) {
2133 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2140 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2145 g_assert_not_reached();
2149 int16_t float16_to_int16_scalbn(float16 a
, int rmode
, int scale
,
2152 return round_to_int_and_pack(float16_unpack_canonical(a
, s
),
2153 rmode
, scale
, INT16_MIN
, INT16_MAX
, s
);
2156 int32_t float16_to_int32_scalbn(float16 a
, int rmode
, int scale
,
2159 return round_to_int_and_pack(float16_unpack_canonical(a
, s
),
2160 rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
2163 int64_t float16_to_int64_scalbn(float16 a
, int rmode
, int scale
,
2166 return round_to_int_and_pack(float16_unpack_canonical(a
, s
),
2167 rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
2170 int16_t float32_to_int16_scalbn(float32 a
, int rmode
, int scale
,
2173 return round_to_int_and_pack(float32_unpack_canonical(a
, s
),
2174 rmode
, scale
, INT16_MIN
, INT16_MAX
, s
);
2177 int32_t float32_to_int32_scalbn(float32 a
, int rmode
, int scale
,
2180 return round_to_int_and_pack(float32_unpack_canonical(a
, s
),
2181 rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
2184 int64_t float32_to_int64_scalbn(float32 a
, int rmode
, int scale
,
2187 return round_to_int_and_pack(float32_unpack_canonical(a
, s
),
2188 rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
2191 int16_t float64_to_int16_scalbn(float64 a
, int rmode
, int scale
,
2194 return round_to_int_and_pack(float64_unpack_canonical(a
, s
),
2195 rmode
, scale
, INT16_MIN
, INT16_MAX
, s
);
2198 int32_t float64_to_int32_scalbn(float64 a
, int rmode
, int scale
,
2201 return round_to_int_and_pack(float64_unpack_canonical(a
, s
),
2202 rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
2205 int64_t float64_to_int64_scalbn(float64 a
, int rmode
, int scale
,
2208 return round_to_int_and_pack(float64_unpack_canonical(a
, s
),
2209 rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
2212 int16_t float16_to_int16(float16 a
, float_status
*s
)
2214 return float16_to_int16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2217 int32_t float16_to_int32(float16 a
, float_status
*s
)
2219 return float16_to_int32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2222 int64_t float16_to_int64(float16 a
, float_status
*s
)
2224 return float16_to_int64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2227 int16_t float32_to_int16(float32 a
, float_status
*s
)
2229 return float32_to_int16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2232 int32_t float32_to_int32(float32 a
, float_status
*s
)
2234 return float32_to_int32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2237 int64_t float32_to_int64(float32 a
, float_status
*s
)
2239 return float32_to_int64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2242 int16_t float64_to_int16(float64 a
, float_status
*s
)
2244 return float64_to_int16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2247 int32_t float64_to_int32(float64 a
, float_status
*s
)
2249 return float64_to_int32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2252 int64_t float64_to_int64(float64 a
, float_status
*s
)
2254 return float64_to_int64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2257 int16_t float16_to_int16_round_to_zero(float16 a
, float_status
*s
)
2259 return float16_to_int16_scalbn(a
, float_round_to_zero
, 0, s
);
2262 int32_t float16_to_int32_round_to_zero(float16 a
, float_status
*s
)
2264 return float16_to_int32_scalbn(a
, float_round_to_zero
, 0, s
);
2267 int64_t float16_to_int64_round_to_zero(float16 a
, float_status
*s
)
2269 return float16_to_int64_scalbn(a
, float_round_to_zero
, 0, s
);
2272 int16_t float32_to_int16_round_to_zero(float32 a
, float_status
*s
)
2274 return float32_to_int16_scalbn(a
, float_round_to_zero
, 0, s
);
2277 int32_t float32_to_int32_round_to_zero(float32 a
, float_status
*s
)
2279 return float32_to_int32_scalbn(a
, float_round_to_zero
, 0, s
);
2282 int64_t float32_to_int64_round_to_zero(float32 a
, float_status
*s
)
2284 return float32_to_int64_scalbn(a
, float_round_to_zero
, 0, s
);
2287 int16_t float64_to_int16_round_to_zero(float64 a
, float_status
*s
)
2289 return float64_to_int16_scalbn(a
, float_round_to_zero
, 0, s
);
2292 int32_t float64_to_int32_round_to_zero(float64 a
, float_status
*s
)
2294 return float64_to_int32_scalbn(a
, float_round_to_zero
, 0, s
);
2297 int64_t float64_to_int64_round_to_zero(float64 a
, float_status
*s
)
2299 return float64_to_int64_scalbn(a
, float_round_to_zero
, 0, s
);
2303 * Returns the result of converting the floating-point value `a' to
2304 * the unsigned integer format. The conversion is performed according
2305 * to the IEC/IEEE Standard for Binary Floating-Point
2306 * Arithmetic---which means in particular that the conversion is
2307 * rounded according to the current rounding mode. If `a' is a NaN,
2308 * the largest unsigned integer is returned. Otherwise, if the
2309 * conversion overflows, the largest unsigned integer is returned. If
2310 * the 'a' is negative, the result is rounded and zero is returned;
2311 * values that do not round to zero will raise the inexact exception
2315 static uint64_t round_to_uint_and_pack(FloatParts in
, int rmode
, int scale
,
2316 uint64_t max
, float_status
*s
)
2318 int orig_flags
= get_float_exception_flags(s
);
2319 FloatParts p
= round_to_int(in
, rmode
, scale
, s
);
2323 case float_class_snan
:
2324 case float_class_qnan
:
2325 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2327 case float_class_inf
:
2328 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2329 return p
.sign
? 0 : max
;
2330 case float_class_zero
:
2332 case float_class_normal
:
2334 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2338 if (p
.exp
< DECOMPOSED_BINARY_POINT
) {
2339 r
= p
.frac
>> (DECOMPOSED_BINARY_POINT
- p
.exp
);
2340 } else if (p
.exp
- DECOMPOSED_BINARY_POINT
< 2) {
2341 r
= p
.frac
<< (p
.exp
- DECOMPOSED_BINARY_POINT
);
2343 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2347 /* For uint64 this will never trip, but if p.exp is too large
2348 * to shift a decomposed fraction we shall have exited via the
2352 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2357 g_assert_not_reached();
2361 uint16_t float16_to_uint16_scalbn(float16 a
, int rmode
, int scale
,
2364 return round_to_uint_and_pack(float16_unpack_canonical(a
, s
),
2365 rmode
, scale
, UINT16_MAX
, s
);
2368 uint32_t float16_to_uint32_scalbn(float16 a
, int rmode
, int scale
,
2371 return round_to_uint_and_pack(float16_unpack_canonical(a
, s
),
2372 rmode
, scale
, UINT32_MAX
, s
);
2375 uint64_t float16_to_uint64_scalbn(float16 a
, int rmode
, int scale
,
2378 return round_to_uint_and_pack(float16_unpack_canonical(a
, s
),
2379 rmode
, scale
, UINT64_MAX
, s
);
2382 uint16_t float32_to_uint16_scalbn(float32 a
, int rmode
, int scale
,
2385 return round_to_uint_and_pack(float32_unpack_canonical(a
, s
),
2386 rmode
, scale
, UINT16_MAX
, s
);
2389 uint32_t float32_to_uint32_scalbn(float32 a
, int rmode
, int scale
,
2392 return round_to_uint_and_pack(float32_unpack_canonical(a
, s
),
2393 rmode
, scale
, UINT32_MAX
, s
);
2396 uint64_t float32_to_uint64_scalbn(float32 a
, int rmode
, int scale
,
2399 return round_to_uint_and_pack(float32_unpack_canonical(a
, s
),
2400 rmode
, scale
, UINT64_MAX
, s
);
2403 uint16_t float64_to_uint16_scalbn(float64 a
, int rmode
, int scale
,
2406 return round_to_uint_and_pack(float64_unpack_canonical(a
, s
),
2407 rmode
, scale
, UINT16_MAX
, s
);
2410 uint32_t float64_to_uint32_scalbn(float64 a
, int rmode
, int scale
,
2413 return round_to_uint_and_pack(float64_unpack_canonical(a
, s
),
2414 rmode
, scale
, UINT32_MAX
, s
);
2417 uint64_t float64_to_uint64_scalbn(float64 a
, int rmode
, int scale
,
2420 return round_to_uint_and_pack(float64_unpack_canonical(a
, s
),
2421 rmode
, scale
, UINT64_MAX
, s
);
2424 uint16_t float16_to_uint16(float16 a
, float_status
*s
)
2426 return float16_to_uint16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2429 uint32_t float16_to_uint32(float16 a
, float_status
*s
)
2431 return float16_to_uint32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2434 uint64_t float16_to_uint64(float16 a
, float_status
*s
)
2436 return float16_to_uint64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2439 uint16_t float32_to_uint16(float32 a
, float_status
*s
)
2441 return float32_to_uint16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2444 uint32_t float32_to_uint32(float32 a
, float_status
*s
)
2446 return float32_to_uint32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2449 uint64_t float32_to_uint64(float32 a
, float_status
*s
)
2451 return float32_to_uint64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2454 uint16_t float64_to_uint16(float64 a
, float_status
*s
)
2456 return float64_to_uint16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2459 uint32_t float64_to_uint32(float64 a
, float_status
*s
)
2461 return float64_to_uint32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2464 uint64_t float64_to_uint64(float64 a
, float_status
*s
)
2466 return float64_to_uint64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2469 uint16_t float16_to_uint16_round_to_zero(float16 a
, float_status
*s
)
2471 return float16_to_uint16_scalbn(a
, float_round_to_zero
, 0, s
);
2474 uint32_t float16_to_uint32_round_to_zero(float16 a
, float_status
*s
)
2476 return float16_to_uint32_scalbn(a
, float_round_to_zero
, 0, s
);
2479 uint64_t float16_to_uint64_round_to_zero(float16 a
, float_status
*s
)
2481 return float16_to_uint64_scalbn(a
, float_round_to_zero
, 0, s
);
2484 uint16_t float32_to_uint16_round_to_zero(float32 a
, float_status
*s
)
2486 return float32_to_uint16_scalbn(a
, float_round_to_zero
, 0, s
);
2489 uint32_t float32_to_uint32_round_to_zero(float32 a
, float_status
*s
)
2491 return float32_to_uint32_scalbn(a
, float_round_to_zero
, 0, s
);
2494 uint64_t float32_to_uint64_round_to_zero(float32 a
, float_status
*s
)
2496 return float32_to_uint64_scalbn(a
, float_round_to_zero
, 0, s
);
2499 uint16_t float64_to_uint16_round_to_zero(float64 a
, float_status
*s
)
2501 return float64_to_uint16_scalbn(a
, float_round_to_zero
, 0, s
);
2504 uint32_t float64_to_uint32_round_to_zero(float64 a
, float_status
*s
)
2506 return float64_to_uint32_scalbn(a
, float_round_to_zero
, 0, s
);
2509 uint64_t float64_to_uint64_round_to_zero(float64 a
, float_status
*s
)
2511 return float64_to_uint64_scalbn(a
, float_round_to_zero
, 0, s
);
2515 * Integer to float conversions
2517 * Returns the result of converting the two's complement integer `a'
2518 * to the floating-point format. The conversion is performed according
2519 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2522 static FloatParts
int_to_float(int64_t a
, int scale
, float_status
*status
)
2524 FloatParts r
= { .sign
= false };
2527 r
.cls
= float_class_zero
;
2532 r
.cls
= float_class_normal
;
2537 shift
= clz64(f
) - 1;
2538 scale
= MIN(MAX(scale
, -0x10000), 0x10000);
2540 r
.exp
= DECOMPOSED_BINARY_POINT
- shift
+ scale
;
2541 r
.frac
= (shift
< 0 ? DECOMPOSED_IMPLICIT_BIT
: f
<< shift
);
2547 float16
int64_to_float16_scalbn(int64_t a
, int scale
, float_status
*status
)
2549 FloatParts pa
= int_to_float(a
, scale
, status
);
2550 return float16_round_pack_canonical(pa
, status
);
2553 float16
int32_to_float16_scalbn(int32_t a
, int scale
, float_status
*status
)
2555 return int64_to_float16_scalbn(a
, scale
, status
);
2558 float16
int16_to_float16_scalbn(int16_t a
, int scale
, float_status
*status
)
2560 return int64_to_float16_scalbn(a
, scale
, status
);
2563 float16
int64_to_float16(int64_t a
, float_status
*status
)
2565 return int64_to_float16_scalbn(a
, 0, status
);
2568 float16
int32_to_float16(int32_t a
, float_status
*status
)
2570 return int64_to_float16_scalbn(a
, 0, status
);
2573 float16
int16_to_float16(int16_t a
, float_status
*status
)
2575 return int64_to_float16_scalbn(a
, 0, status
);
2578 float32
int64_to_float32_scalbn(int64_t a
, int scale
, float_status
*status
)
2580 FloatParts pa
= int_to_float(a
, scale
, status
);
2581 return float32_round_pack_canonical(pa
, status
);
2584 float32
int32_to_float32_scalbn(int32_t a
, int scale
, float_status
*status
)
2586 return int64_to_float32_scalbn(a
, scale
, status
);
2589 float32
int16_to_float32_scalbn(int16_t a
, int scale
, float_status
*status
)
2591 return int64_to_float32_scalbn(a
, scale
, status
);
2594 float32
int64_to_float32(int64_t a
, float_status
*status
)
2596 return int64_to_float32_scalbn(a
, 0, status
);
2599 float32
int32_to_float32(int32_t a
, float_status
*status
)
2601 return int64_to_float32_scalbn(a
, 0, status
);
2604 float32
int16_to_float32(int16_t a
, float_status
*status
)
2606 return int64_to_float32_scalbn(a
, 0, status
);
2609 float64
int64_to_float64_scalbn(int64_t a
, int scale
, float_status
*status
)
2611 FloatParts pa
= int_to_float(a
, scale
, status
);
2612 return float64_round_pack_canonical(pa
, status
);
2615 float64
int32_to_float64_scalbn(int32_t a
, int scale
, float_status
*status
)
2617 return int64_to_float64_scalbn(a
, scale
, status
);
2620 float64
int16_to_float64_scalbn(int16_t a
, int scale
, float_status
*status
)
2622 return int64_to_float64_scalbn(a
, scale
, status
);
2625 float64
int64_to_float64(int64_t a
, float_status
*status
)
2627 return int64_to_float64_scalbn(a
, 0, status
);
2630 float64
int32_to_float64(int32_t a
, float_status
*status
)
2632 return int64_to_float64_scalbn(a
, 0, status
);
2635 float64
int16_to_float64(int16_t a
, float_status
*status
)
2637 return int64_to_float64_scalbn(a
, 0, status
);
2642 * Unsigned Integer to float conversions
2644 * Returns the result of converting the unsigned integer `a' to the
2645 * floating-point format. The conversion is performed according to the
2646 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2649 static FloatParts
uint_to_float(uint64_t a
, int scale
, float_status
*status
)
2651 FloatParts r
= { .sign
= false };
2654 r
.cls
= float_class_zero
;
2656 scale
= MIN(MAX(scale
, -0x10000), 0x10000);
2657 r
.cls
= float_class_normal
;
2658 if ((int64_t)a
< 0) {
2659 r
.exp
= DECOMPOSED_BINARY_POINT
+ 1 + scale
;
2660 shift64RightJamming(a
, 1, &a
);
2663 int shift
= clz64(a
) - 1;
2664 r
.exp
= DECOMPOSED_BINARY_POINT
- shift
+ scale
;
2665 r
.frac
= a
<< shift
;
2672 float16
uint64_to_float16_scalbn(uint64_t a
, int scale
, float_status
*status
)
2674 FloatParts pa
= uint_to_float(a
, scale
, status
);
2675 return float16_round_pack_canonical(pa
, status
);
2678 float16
uint32_to_float16_scalbn(uint32_t a
, int scale
, float_status
*status
)
2680 return uint64_to_float16_scalbn(a
, scale
, status
);
2683 float16
uint16_to_float16_scalbn(uint16_t a
, int scale
, float_status
*status
)
2685 return uint64_to_float16_scalbn(a
, scale
, status
);
2688 float16
uint64_to_float16(uint64_t a
, float_status
*status
)
2690 return uint64_to_float16_scalbn(a
, 0, status
);
2693 float16
uint32_to_float16(uint32_t a
, float_status
*status
)
2695 return uint64_to_float16_scalbn(a
, 0, status
);
2698 float16
uint16_to_float16(uint16_t a
, float_status
*status
)
2700 return uint64_to_float16_scalbn(a
, 0, status
);
2703 float32
uint64_to_float32_scalbn(uint64_t a
, int scale
, float_status
*status
)
2705 FloatParts pa
= uint_to_float(a
, scale
, status
);
2706 return float32_round_pack_canonical(pa
, status
);
2709 float32
uint32_to_float32_scalbn(uint32_t a
, int scale
, float_status
*status
)
2711 return uint64_to_float32_scalbn(a
, scale
, status
);
2714 float32
uint16_to_float32_scalbn(uint16_t a
, int scale
, float_status
*status
)
2716 return uint64_to_float32_scalbn(a
, scale
, status
);
2719 float32
uint64_to_float32(uint64_t a
, float_status
*status
)
2721 return uint64_to_float32_scalbn(a
, 0, status
);
2724 float32
uint32_to_float32(uint32_t a
, float_status
*status
)
2726 return uint64_to_float32_scalbn(a
, 0, status
);
2729 float32
uint16_to_float32(uint16_t a
, float_status
*status
)
2731 return uint64_to_float32_scalbn(a
, 0, status
);
2734 float64
uint64_to_float64_scalbn(uint64_t a
, int scale
, float_status
*status
)
2736 FloatParts pa
= uint_to_float(a
, scale
, status
);
2737 return float64_round_pack_canonical(pa
, status
);
2740 float64
uint32_to_float64_scalbn(uint32_t a
, int scale
, float_status
*status
)
2742 return uint64_to_float64_scalbn(a
, scale
, status
);
2745 float64
uint16_to_float64_scalbn(uint16_t a
, int scale
, float_status
*status
)
2747 return uint64_to_float64_scalbn(a
, scale
, status
);
2750 float64
uint64_to_float64(uint64_t a
, float_status
*status
)
2752 return uint64_to_float64_scalbn(a
, 0, status
);
2755 float64
uint32_to_float64(uint32_t a
, float_status
*status
)
2757 return uint64_to_float64_scalbn(a
, 0, status
);
2760 float64
uint16_to_float64(uint16_t a
, float_status
*status
)
2762 return uint64_to_float64_scalbn(a
, 0, status
);
2766 /* min() and max() functions. These can't be implemented as
2767 * 'compare and pick one input' because that would mishandle
2768 * NaNs and +0 vs -0.
2770 * minnum() and maxnum() functions. These are similar to the min()
2771 * and max() functions but if one of the arguments is a QNaN and
2772 * the other is numerical then the numerical argument is returned.
2773 * SNaNs will get quietened before being returned.
2774 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
2775 * and maxNum() operations. min() and max() are the typical min/max
2776 * semantics provided by many CPUs which predate that specification.
2778 * minnummag() and maxnummag() functions correspond to minNumMag()
2779 * and minNumMag() from the IEEE-754 2008.
2781 static FloatParts
minmax_floats(FloatParts a
, FloatParts b
, bool ismin
,
2782 bool ieee
, bool ismag
, float_status
*s
)
2784 if (unlikely(is_nan(a
.cls
) || is_nan(b
.cls
))) {
2786 /* Takes two floating-point values `a' and `b', one of
2787 * which is a NaN, and returns the appropriate NaN
2788 * result. If either `a' or `b' is a signaling NaN,
2789 * the invalid exception is raised.
2791 if (is_snan(a
.cls
) || is_snan(b
.cls
)) {
2792 return pick_nan(a
, b
, s
);
2793 } else if (is_nan(a
.cls
) && !is_nan(b
.cls
)) {
2795 } else if (is_nan(b
.cls
) && !is_nan(a
.cls
)) {
2799 return pick_nan(a
, b
, s
);
2804 case float_class_normal
:
2807 case float_class_inf
:
2810 case float_class_zero
:
2814 g_assert_not_reached();
2818 case float_class_normal
:
2821 case float_class_inf
:
2824 case float_class_zero
:
2828 g_assert_not_reached();
2832 if (ismag
&& (a_exp
!= b_exp
|| a
.frac
!= b
.frac
)) {
2833 bool a_less
= a_exp
< b_exp
;
2834 if (a_exp
== b_exp
) {
2835 a_less
= a
.frac
< b
.frac
;
2837 return a_less
^ ismin
? b
: a
;
2840 if (a
.sign
== b
.sign
) {
2841 bool a_less
= a_exp
< b_exp
;
2842 if (a_exp
== b_exp
) {
2843 a_less
= a
.frac
< b
.frac
;
2845 return a
.sign
^ a_less
^ ismin
? b
: a
;
2847 return a
.sign
^ ismin
? b
: a
;
2852 #define MINMAX(sz, name, ismin, isiee, ismag) \
2853 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \
2856 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
2857 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
2858 FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \
2860 return float ## sz ## _round_pack_canonical(pr, s); \
2863 MINMAX(16, min
, true, false, false)
2864 MINMAX(16, minnum
, true, true, false)
2865 MINMAX(16, minnummag
, true, true, true)
2866 MINMAX(16, max
, false, false, false)
2867 MINMAX(16, maxnum
, false, true, false)
2868 MINMAX(16, maxnummag
, false, true, true)
2870 MINMAX(32, min
, true, false, false)
2871 MINMAX(32, minnum
, true, true, false)
2872 MINMAX(32, minnummag
, true, true, true)
2873 MINMAX(32, max
, false, false, false)
2874 MINMAX(32, maxnum
, false, true, false)
2875 MINMAX(32, maxnummag
, false, true, true)
2877 MINMAX(64, min
, true, false, false)
2878 MINMAX(64, minnum
, true, true, false)
2879 MINMAX(64, minnummag
, true, true, true)
2880 MINMAX(64, max
, false, false, false)
2881 MINMAX(64, maxnum
, false, true, false)
2882 MINMAX(64, maxnummag
, false, true, true)
2886 /* Floating point compare */
2887 static int compare_floats(FloatParts a
, FloatParts b
, bool is_quiet
,
2890 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
2892 a
.cls
== float_class_snan
||
2893 b
.cls
== float_class_snan
) {
2894 s
->float_exception_flags
|= float_flag_invalid
;
2896 return float_relation_unordered
;
2899 if (a
.cls
== float_class_zero
) {
2900 if (b
.cls
== float_class_zero
) {
2901 return float_relation_equal
;
2903 return b
.sign
? float_relation_greater
: float_relation_less
;
2904 } else if (b
.cls
== float_class_zero
) {
2905 return a
.sign
? float_relation_less
: float_relation_greater
;
2908 /* The only really important thing about infinity is its sign. If
2909 * both are infinities the sign marks the smallest of the two.
2911 if (a
.cls
== float_class_inf
) {
2912 if ((b
.cls
== float_class_inf
) && (a
.sign
== b
.sign
)) {
2913 return float_relation_equal
;
2915 return a
.sign
? float_relation_less
: float_relation_greater
;
2916 } else if (b
.cls
== float_class_inf
) {
2917 return b
.sign
? float_relation_greater
: float_relation_less
;
2920 if (a
.sign
!= b
.sign
) {
2921 return a
.sign
? float_relation_less
: float_relation_greater
;
2924 if (a
.exp
== b
.exp
) {
2925 if (a
.frac
== b
.frac
) {
2926 return float_relation_equal
;
2929 return a
.frac
> b
.frac
?
2930 float_relation_less
: float_relation_greater
;
2932 return a
.frac
> b
.frac
?
2933 float_relation_greater
: float_relation_less
;
2937 return a
.exp
> b
.exp
? float_relation_less
: float_relation_greater
;
2939 return a
.exp
> b
.exp
? float_relation_greater
: float_relation_less
;
2944 #define COMPARE(name, attr, sz) \
2946 name(float ## sz a, float ## sz b, bool is_quiet, float_status *s) \
2948 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
2949 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
2950 return compare_floats(pa, pb, is_quiet, s); \
2953 COMPARE(soft_f16_compare
, QEMU_FLATTEN
, 16)
2954 COMPARE(soft_f32_compare
, QEMU_SOFTFLOAT_ATTR
, 32)
2955 COMPARE(soft_f64_compare
, QEMU_SOFTFLOAT_ATTR
, 64)
2959 int float16_compare(float16 a
, float16 b
, float_status
*s
)
2961 return soft_f16_compare(a
, b
, false, s
);
2964 int float16_compare_quiet(float16 a
, float16 b
, float_status
*s
)
2966 return soft_f16_compare(a
, b
, true, s
);
2969 static int QEMU_FLATTEN
2970 f32_compare(float32 xa
, float32 xb
, bool is_quiet
, float_status
*s
)
2972 union_float32 ua
, ub
;
2977 if (QEMU_NO_HARDFLOAT
) {
2981 float32_input_flush2(&ua
.s
, &ub
.s
, s
);
2982 if (isgreaterequal(ua
.h
, ub
.h
)) {
2983 if (isgreater(ua
.h
, ub
.h
)) {
2984 return float_relation_greater
;
2986 return float_relation_equal
;
2988 if (likely(isless(ua
.h
, ub
.h
))) {
2989 return float_relation_less
;
2991 /* The only condition remaining is unordered.
2992 * Fall through to set flags.
2995 return soft_f32_compare(ua
.s
, ub
.s
, is_quiet
, s
);
2998 int float32_compare(float32 a
, float32 b
, float_status
*s
)
3000 return f32_compare(a
, b
, false, s
);
3003 int float32_compare_quiet(float32 a
, float32 b
, float_status
*s
)
3005 return f32_compare(a
, b
, true, s
);
3008 static int QEMU_FLATTEN
3009 f64_compare(float64 xa
, float64 xb
, bool is_quiet
, float_status
*s
)
3011 union_float64 ua
, ub
;
3016 if (QEMU_NO_HARDFLOAT
) {
3020 float64_input_flush2(&ua
.s
, &ub
.s
, s
);
3021 if (isgreaterequal(ua
.h
, ub
.h
)) {
3022 if (isgreater(ua
.h
, ub
.h
)) {
3023 return float_relation_greater
;
3025 return float_relation_equal
;
3027 if (likely(isless(ua
.h
, ub
.h
))) {
3028 return float_relation_less
;
3030 /* The only condition remaining is unordered.
3031 * Fall through to set flags.
3034 return soft_f64_compare(ua
.s
, ub
.s
, is_quiet
, s
);
3037 int float64_compare(float64 a
, float64 b
, float_status
*s
)
3039 return f64_compare(a
, b
, false, s
);
3042 int float64_compare_quiet(float64 a
, float64 b
, float_status
*s
)
3044 return f64_compare(a
, b
, true, s
);
3047 /* Multiply A by 2 raised to the power N. */
3048 static FloatParts
scalbn_decomposed(FloatParts a
, int n
, float_status
*s
)
3050 if (unlikely(is_nan(a
.cls
))) {
3051 return return_nan(a
, s
);
3053 if (a
.cls
== float_class_normal
) {
3054 /* The largest float type (even though not supported by FloatParts)
3055 * is float128, which has a 15 bit exponent. Bounding N to 16 bits
3056 * still allows rounding to infinity, without allowing overflow
3057 * within the int32_t that backs FloatParts.exp.
3059 n
= MIN(MAX(n
, -0x10000), 0x10000);
3065 float16
float16_scalbn(float16 a
, int n
, float_status
*status
)
3067 FloatParts pa
= float16_unpack_canonical(a
, status
);
3068 FloatParts pr
= scalbn_decomposed(pa
, n
, status
);
3069 return float16_round_pack_canonical(pr
, status
);
3072 float32
float32_scalbn(float32 a
, int n
, float_status
*status
)
3074 FloatParts pa
= float32_unpack_canonical(a
, status
);
3075 FloatParts pr
= scalbn_decomposed(pa
, n
, status
);
3076 return float32_round_pack_canonical(pr
, status
);
3079 float64
float64_scalbn(float64 a
, int n
, float_status
*status
)
3081 FloatParts pa
= float64_unpack_canonical(a
, status
);
3082 FloatParts pr
= scalbn_decomposed(pa
, n
, status
);
3083 return float64_round_pack_canonical(pr
, status
);
3089 * The old softfloat code did an approximation step before zeroing in
3090 * on the final result. However for simpleness we just compute the
3091 * square root by iterating down from the implicit bit to enough extra
3092 * bits to ensure we get a correctly rounded result.
3094 * This does mean however the calculation is slower than before,
3095 * especially for 64 bit floats.
3098 static FloatParts
sqrt_float(FloatParts a
, float_status
*s
, const FloatFmt
*p
)
3100 uint64_t a_frac
, r_frac
, s_frac
;
3103 if (is_nan(a
.cls
)) {
3104 return return_nan(a
, s
);
3106 if (a
.cls
== float_class_zero
) {
3107 return a
; /* sqrt(+-0) = +-0 */
3110 s
->float_exception_flags
|= float_flag_invalid
;
3111 return parts_default_nan(s
);
3113 if (a
.cls
== float_class_inf
) {
3114 return a
; /* sqrt(+inf) = +inf */
3117 assert(a
.cls
== float_class_normal
);
3119 /* We need two overflow bits at the top. Adding room for that is a
3120 * right shift. If the exponent is odd, we can discard the low bit
3121 * by multiplying the fraction by 2; that's a left shift. Combine
3122 * those and we shift right if the exponent is even.
3130 /* Bit-by-bit computation of sqrt. */
3134 /* Iterate from implicit bit down to the 3 extra bits to compute a
3135 * properly rounded result. Remember we've inserted one more bit
3136 * at the top, so these positions are one less.
3138 bit
= DECOMPOSED_BINARY_POINT
- 1;
3139 last_bit
= MAX(p
->frac_shift
- 4, 0);
3141 uint64_t q
= 1ULL << bit
;
3142 uint64_t t_frac
= s_frac
+ q
;
3143 if (t_frac
<= a_frac
) {
3144 s_frac
= t_frac
+ q
;
3149 } while (--bit
>= last_bit
);
3151 /* Undo the right shift done above. If there is any remaining
3152 * fraction, the result is inexact. Set the sticky bit.
3154 a
.frac
= (r_frac
<< 1) + (a_frac
!= 0);
3159 float16 QEMU_FLATTEN
float16_sqrt(float16 a
, float_status
*status
)
3161 FloatParts pa
= float16_unpack_canonical(a
, status
);
3162 FloatParts pr
= sqrt_float(pa
, status
, &float16_params
);
3163 return float16_round_pack_canonical(pr
, status
);
3166 static float32 QEMU_SOFTFLOAT_ATTR
3167 soft_f32_sqrt(float32 a
, float_status
*status
)
3169 FloatParts pa
= float32_unpack_canonical(a
, status
);
3170 FloatParts pr
= sqrt_float(pa
, status
, &float32_params
);
3171 return float32_round_pack_canonical(pr
, status
);
3174 static float64 QEMU_SOFTFLOAT_ATTR
3175 soft_f64_sqrt(float64 a
, float_status
*status
)
3177 FloatParts pa
= float64_unpack_canonical(a
, status
);
3178 FloatParts pr
= sqrt_float(pa
, status
, &float64_params
);
3179 return float64_round_pack_canonical(pr
, status
);
3182 float32 QEMU_FLATTEN
float32_sqrt(float32 xa
, float_status
*s
)
3184 union_float32 ua
, ur
;
3187 if (unlikely(!can_use_fpu(s
))) {
3191 float32_input_flush1(&ua
.s
, s
);
3192 if (QEMU_HARDFLOAT_1F32_USE_FP
) {
3193 if (unlikely(!(fpclassify(ua
.h
) == FP_NORMAL
||
3194 fpclassify(ua
.h
) == FP_ZERO
) ||
3198 } else if (unlikely(!float32_is_zero_or_normal(ua
.s
) ||
3199 float32_is_neg(ua
.s
))) {
3206 return soft_f32_sqrt(ua
.s
, s
);
3209 float64 QEMU_FLATTEN
float64_sqrt(float64 xa
, float_status
*s
)
3211 union_float64 ua
, ur
;
3214 if (unlikely(!can_use_fpu(s
))) {
3218 float64_input_flush1(&ua
.s
, s
);
3219 if (QEMU_HARDFLOAT_1F64_USE_FP
) {
3220 if (unlikely(!(fpclassify(ua
.h
) == FP_NORMAL
||
3221 fpclassify(ua
.h
) == FP_ZERO
) ||
3225 } else if (unlikely(!float64_is_zero_or_normal(ua
.s
) ||
3226 float64_is_neg(ua
.s
))) {
3233 return soft_f64_sqrt(ua
.s
, s
);
3236 /*----------------------------------------------------------------------------
3237 | The pattern for a default generated NaN.
3238 *----------------------------------------------------------------------------*/
3240 float16
float16_default_nan(float_status
*status
)
3242 FloatParts p
= parts_default_nan(status
);
3243 p
.frac
>>= float16_params
.frac_shift
;
3244 return float16_pack_raw(p
);
3247 float32
float32_default_nan(float_status
*status
)
3249 FloatParts p
= parts_default_nan(status
);
3250 p
.frac
>>= float32_params
.frac_shift
;
3251 return float32_pack_raw(p
);
3254 float64
float64_default_nan(float_status
*status
)
3256 FloatParts p
= parts_default_nan(status
);
3257 p
.frac
>>= float64_params
.frac_shift
;
3258 return float64_pack_raw(p
);
3261 float128
float128_default_nan(float_status
*status
)
3263 FloatParts p
= parts_default_nan(status
);
3266 /* Extrapolate from the choices made by parts_default_nan to fill
3267 * in the quad-floating format. If the low bit is set, assume we
3268 * want to set all non-snan bits.
3270 r
.low
= -(p
.frac
& 1);
3271 r
.high
= p
.frac
>> (DECOMPOSED_BINARY_POINT
- 48);
3272 r
.high
|= LIT64(0x7FFF000000000000);
3273 r
.high
|= (uint64_t)p
.sign
<< 63;
3278 /*----------------------------------------------------------------------------
3279 | Returns a quiet NaN from a signalling NaN for the floating point value `a'.
3280 *----------------------------------------------------------------------------*/
3282 float16
float16_silence_nan(float16 a
, float_status
*status
)
3284 FloatParts p
= float16_unpack_raw(a
);
3285 p
.frac
<<= float16_params
.frac_shift
;
3286 p
= parts_silence_nan(p
, status
);
3287 p
.frac
>>= float16_params
.frac_shift
;
3288 return float16_pack_raw(p
);
3291 float32
float32_silence_nan(float32 a
, float_status
*status
)
3293 FloatParts p
= float32_unpack_raw(a
);
3294 p
.frac
<<= float32_params
.frac_shift
;
3295 p
= parts_silence_nan(p
, status
);
3296 p
.frac
>>= float32_params
.frac_shift
;
3297 return float32_pack_raw(p
);
3300 float64
float64_silence_nan(float64 a
, float_status
*status
)
3302 FloatParts p
= float64_unpack_raw(a
);
3303 p
.frac
<<= float64_params
.frac_shift
;
3304 p
= parts_silence_nan(p
, status
);
3305 p
.frac
>>= float64_params
.frac_shift
;
3306 return float64_pack_raw(p
);
3309 /*----------------------------------------------------------------------------
3310 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
3311 | and 7, and returns the properly rounded 32-bit integer corresponding to the
3312 | input. If `zSign' is 1, the input is negated before being converted to an
3313 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
3314 | is simply rounded to an integer, with the inexact exception raised if the
3315 | input cannot be represented exactly as an integer. However, if the fixed-
3316 | point input is too large, the invalid exception is raised and the largest
3317 | positive or negative integer is returned.
3318 *----------------------------------------------------------------------------*/
3320 static int32_t roundAndPackInt32(flag zSign
, uint64_t absZ
, float_status
*status
)
3322 int8_t roundingMode
;
3323 flag roundNearestEven
;
3324 int8_t roundIncrement
, roundBits
;
3327 roundingMode
= status
->float_rounding_mode
;
3328 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
3329 switch (roundingMode
) {
3330 case float_round_nearest_even
:
3331 case float_round_ties_away
:
3332 roundIncrement
= 0x40;
3334 case float_round_to_zero
:
3337 case float_round_up
:
3338 roundIncrement
= zSign
? 0 : 0x7f;
3340 case float_round_down
:
3341 roundIncrement
= zSign
? 0x7f : 0;
3343 case float_round_to_odd
:
3344 roundIncrement
= absZ
& 0x80 ? 0 : 0x7f;
3349 roundBits
= absZ
& 0x7F;
3350 absZ
= ( absZ
+ roundIncrement
)>>7;
3351 absZ
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
3353 if ( zSign
) z
= - z
;
3354 if ( ( absZ
>>32 ) || ( z
&& ( ( z
< 0 ) ^ zSign
) ) ) {
3355 float_raise(float_flag_invalid
, status
);
3356 return zSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
3359 status
->float_exception_flags
|= float_flag_inexact
;
3365 /*----------------------------------------------------------------------------
3366 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3367 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3368 | and returns the properly rounded 64-bit integer corresponding to the input.
3369 | If `zSign' is 1, the input is negated before being converted to an integer.
3370 | Ordinarily, the fixed-point input is simply rounded to an integer, with
3371 | the inexact exception raised if the input cannot be represented exactly as
3372 | an integer. However, if the fixed-point input is too large, the invalid
3373 | exception is raised and the largest positive or negative integer is
3375 *----------------------------------------------------------------------------*/
3377 static int64_t roundAndPackInt64(flag zSign
, uint64_t absZ0
, uint64_t absZ1
,
3378 float_status
*status
)
3380 int8_t roundingMode
;
3381 flag roundNearestEven
, increment
;
3384 roundingMode
= status
->float_rounding_mode
;
3385 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
3386 switch (roundingMode
) {
3387 case float_round_nearest_even
:
3388 case float_round_ties_away
:
3389 increment
= ((int64_t) absZ1
< 0);
3391 case float_round_to_zero
:
3394 case float_round_up
:
3395 increment
= !zSign
&& absZ1
;
3397 case float_round_down
:
3398 increment
= zSign
&& absZ1
;
3400 case float_round_to_odd
:
3401 increment
= !(absZ0
& 1) && absZ1
;
3408 if ( absZ0
== 0 ) goto overflow
;
3409 absZ0
&= ~ ( ( (uint64_t) ( absZ1
<<1 ) == 0 ) & roundNearestEven
);
3412 if ( zSign
) z
= - z
;
3413 if ( z
&& ( ( z
< 0 ) ^ zSign
) ) {
3415 float_raise(float_flag_invalid
, status
);
3417 zSign
? (int64_t) LIT64( 0x8000000000000000 )
3418 : LIT64( 0x7FFFFFFFFFFFFFFF );
3421 status
->float_exception_flags
|= float_flag_inexact
;
3427 /*----------------------------------------------------------------------------
3428 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3429 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3430 | and returns the properly rounded 64-bit unsigned integer corresponding to the
3431 | input. Ordinarily, the fixed-point input is simply rounded to an integer,
3432 | with the inexact exception raised if the input cannot be represented exactly
3433 | as an integer. However, if the fixed-point input is too large, the invalid
3434 | exception is raised and the largest unsigned integer is returned.
3435 *----------------------------------------------------------------------------*/
3437 static int64_t roundAndPackUint64(flag zSign
, uint64_t absZ0
,
3438 uint64_t absZ1
, float_status
*status
)
3440 int8_t roundingMode
;
3441 flag roundNearestEven
, increment
;
3443 roundingMode
= status
->float_rounding_mode
;
3444 roundNearestEven
= (roundingMode
== float_round_nearest_even
);
3445 switch (roundingMode
) {
3446 case float_round_nearest_even
:
3447 case float_round_ties_away
:
3448 increment
= ((int64_t)absZ1
< 0);
3450 case float_round_to_zero
:
3453 case float_round_up
:
3454 increment
= !zSign
&& absZ1
;
3456 case float_round_down
:
3457 increment
= zSign
&& absZ1
;
3459 case float_round_to_odd
:
3460 increment
= !(absZ0
& 1) && absZ1
;
3468 float_raise(float_flag_invalid
, status
);
3469 return LIT64(0xFFFFFFFFFFFFFFFF);
3471 absZ0
&= ~(((uint64_t)(absZ1
<<1) == 0) & roundNearestEven
);
3474 if (zSign
&& absZ0
) {
3475 float_raise(float_flag_invalid
, status
);
3480 status
->float_exception_flags
|= float_flag_inexact
;
3485 /*----------------------------------------------------------------------------
3486 | If `a' is denormal and we are in flush-to-zero mode then set the
3487 | input-denormal exception and return zero. Otherwise just return the value.
3488 *----------------------------------------------------------------------------*/
3489 float32
float32_squash_input_denormal(float32 a
, float_status
*status
)
3491 if (status
->flush_inputs_to_zero
) {
3492 if (extractFloat32Exp(a
) == 0 && extractFloat32Frac(a
) != 0) {
3493 float_raise(float_flag_input_denormal
, status
);
3494 return make_float32(float32_val(a
) & 0x80000000);
3500 /*----------------------------------------------------------------------------
3501 | Normalizes the subnormal single-precision floating-point value represented
3502 | by the denormalized significand `aSig'. The normalized exponent and
3503 | significand are stored at the locations pointed to by `zExpPtr' and
3504 | `zSigPtr', respectively.
3505 *----------------------------------------------------------------------------*/
3508 normalizeFloat32Subnormal(uint32_t aSig
, int *zExpPtr
, uint32_t *zSigPtr
)
3512 shiftCount
= clz32(aSig
) - 8;
3513 *zSigPtr
= aSig
<<shiftCount
;
3514 *zExpPtr
= 1 - shiftCount
;
3518 /*----------------------------------------------------------------------------
3519 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3520 | and significand `zSig', and returns the proper single-precision floating-
3521 | point value corresponding to the abstract input. Ordinarily, the abstract
3522 | value is simply rounded and packed into the single-precision format, with
3523 | the inexact exception raised if the abstract input cannot be represented
3524 | exactly. However, if the abstract value is too large, the overflow and
3525 | inexact exceptions are raised and an infinity or maximal finite value is
3526 | returned. If the abstract value is too small, the input value is rounded to
3527 | a subnormal number, and the underflow and inexact exceptions are raised if
3528 | the abstract input cannot be represented exactly as a subnormal single-
3529 | precision floating-point number.
3530 | The input significand `zSig' has its binary point between bits 30
3531 | and 29, which is 7 bits to the left of the usual location. This shifted
3532 | significand must be normalized or smaller. If `zSig' is not normalized,
3533 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3534 | and it must not require rounding. In the usual case that `zSig' is
3535 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3536 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3537 | Binary Floating-Point Arithmetic.
3538 *----------------------------------------------------------------------------*/
3540 static float32
roundAndPackFloat32(flag zSign
, int zExp
, uint32_t zSig
,
3541 float_status
*status
)
3543 int8_t roundingMode
;
3544 flag roundNearestEven
;
3545 int8_t roundIncrement
, roundBits
;
3548 roundingMode
= status
->float_rounding_mode
;
3549 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
3550 switch (roundingMode
) {
3551 case float_round_nearest_even
:
3552 case float_round_ties_away
:
3553 roundIncrement
= 0x40;
3555 case float_round_to_zero
:
3558 case float_round_up
:
3559 roundIncrement
= zSign
? 0 : 0x7f;
3561 case float_round_down
:
3562 roundIncrement
= zSign
? 0x7f : 0;
3564 case float_round_to_odd
:
3565 roundIncrement
= zSig
& 0x80 ? 0 : 0x7f;
3571 roundBits
= zSig
& 0x7F;
3572 if ( 0xFD <= (uint16_t) zExp
) {
3573 if ( ( 0xFD < zExp
)
3574 || ( ( zExp
== 0xFD )
3575 && ( (int32_t) ( zSig
+ roundIncrement
) < 0 ) )
3577 bool overflow_to_inf
= roundingMode
!= float_round_to_odd
&&
3578 roundIncrement
!= 0;
3579 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
3580 return packFloat32(zSign
, 0xFF, -!overflow_to_inf
);
3583 if (status
->flush_to_zero
) {
3584 float_raise(float_flag_output_denormal
, status
);
3585 return packFloat32(zSign
, 0, 0);
3588 (status
->float_detect_tininess
3589 == float_tininess_before_rounding
)
3591 || ( zSig
+ roundIncrement
< 0x80000000 );
3592 shift32RightJamming( zSig
, - zExp
, &zSig
);
3594 roundBits
= zSig
& 0x7F;
3595 if (isTiny
&& roundBits
) {
3596 float_raise(float_flag_underflow
, status
);
3598 if (roundingMode
== float_round_to_odd
) {
3600 * For round-to-odd case, the roundIncrement depends on
3601 * zSig which just changed.
3603 roundIncrement
= zSig
& 0x80 ? 0 : 0x7f;
3608 status
->float_exception_flags
|= float_flag_inexact
;
3610 zSig
= ( zSig
+ roundIncrement
)>>7;
3611 zSig
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
3612 if ( zSig
== 0 ) zExp
= 0;
3613 return packFloat32( zSign
, zExp
, zSig
);
3617 /*----------------------------------------------------------------------------
3618 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3619 | and significand `zSig', and returns the proper single-precision floating-
3620 | point value corresponding to the abstract input. This routine is just like
3621 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
3622 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
3623 | floating-point exponent.
3624 *----------------------------------------------------------------------------*/
3627 normalizeRoundAndPackFloat32(flag zSign
, int zExp
, uint32_t zSig
,
3628 float_status
*status
)
3632 shiftCount
= clz32(zSig
) - 1;
3633 return roundAndPackFloat32(zSign
, zExp
- shiftCount
, zSig
<<shiftCount
,
3638 /*----------------------------------------------------------------------------
3639 | If `a' is denormal and we are in flush-to-zero mode then set the
3640 | input-denormal exception and return zero. Otherwise just return the value.
3641 *----------------------------------------------------------------------------*/
3642 float64
float64_squash_input_denormal(float64 a
, float_status
*status
)
3644 if (status
->flush_inputs_to_zero
) {
3645 if (extractFloat64Exp(a
) == 0 && extractFloat64Frac(a
) != 0) {
3646 float_raise(float_flag_input_denormal
, status
);
3647 return make_float64(float64_val(a
) & (1ULL << 63));
3653 /*----------------------------------------------------------------------------
3654 | Normalizes the subnormal double-precision floating-point value represented
3655 | by the denormalized significand `aSig'. The normalized exponent and
3656 | significand are stored at the locations pointed to by `zExpPtr' and
3657 | `zSigPtr', respectively.
3658 *----------------------------------------------------------------------------*/
3661 normalizeFloat64Subnormal(uint64_t aSig
, int *zExpPtr
, uint64_t *zSigPtr
)
3665 shiftCount
= clz64(aSig
) - 11;
3666 *zSigPtr
= aSig
<<shiftCount
;
3667 *zExpPtr
= 1 - shiftCount
;
3671 /*----------------------------------------------------------------------------
3672 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3673 | double-precision floating-point value, returning the result. After being
3674 | shifted into the proper positions, the three fields are simply added
3675 | together to form the result. This means that any integer portion of `zSig'
3676 | will be added into the exponent. Since a properly normalized significand
3677 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3678 | than the desired result exponent whenever `zSig' is a complete, normalized
3680 *----------------------------------------------------------------------------*/
3682 static inline float64
packFloat64(flag zSign
, int zExp
, uint64_t zSig
)
3685 return make_float64(
3686 ( ( (uint64_t) zSign
)<<63 ) + ( ( (uint64_t) zExp
)<<52 ) + zSig
);
3690 /*----------------------------------------------------------------------------
3691 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3692 | and significand `zSig', and returns the proper double-precision floating-
3693 | point value corresponding to the abstract input. Ordinarily, the abstract
3694 | value is simply rounded and packed into the double-precision format, with
3695 | the inexact exception raised if the abstract input cannot be represented
3696 | exactly. However, if the abstract value is too large, the overflow and
3697 | inexact exceptions are raised and an infinity or maximal finite value is
3698 | returned. If the abstract value is too small, the input value is rounded to
3699 | a subnormal number, and the underflow and inexact exceptions are raised if
3700 | the abstract input cannot be represented exactly as a subnormal double-
3701 | precision floating-point number.
3702 | The input significand `zSig' has its binary point between bits 62
3703 | and 61, which is 10 bits to the left of the usual location. This shifted
3704 | significand must be normalized or smaller. If `zSig' is not normalized,
3705 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3706 | and it must not require rounding. In the usual case that `zSig' is
3707 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3708 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3709 | Binary Floating-Point Arithmetic.
3710 *----------------------------------------------------------------------------*/
3712 static float64
roundAndPackFloat64(flag zSign
, int zExp
, uint64_t zSig
,
3713 float_status
*status
)
3715 int8_t roundingMode
;
3716 flag roundNearestEven
;
3717 int roundIncrement
, roundBits
;
3720 roundingMode
= status
->float_rounding_mode
;
3721 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
3722 switch (roundingMode
) {
3723 case float_round_nearest_even
:
3724 case float_round_ties_away
:
3725 roundIncrement
= 0x200;
3727 case float_round_to_zero
:
3730 case float_round_up
:
3731 roundIncrement
= zSign
? 0 : 0x3ff;
3733 case float_round_down
:
3734 roundIncrement
= zSign
? 0x3ff : 0;
3736 case float_round_to_odd
:
3737 roundIncrement
= (zSig
& 0x400) ? 0 : 0x3ff;
3742 roundBits
= zSig
& 0x3FF;
3743 if ( 0x7FD <= (uint16_t) zExp
) {
3744 if ( ( 0x7FD < zExp
)
3745 || ( ( zExp
== 0x7FD )
3746 && ( (int64_t) ( zSig
+ roundIncrement
) < 0 ) )
3748 bool overflow_to_inf
= roundingMode
!= float_round_to_odd
&&
3749 roundIncrement
!= 0;
3750 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
3751 return packFloat64(zSign
, 0x7FF, -(!overflow_to_inf
));
3754 if (status
->flush_to_zero
) {
3755 float_raise(float_flag_output_denormal
, status
);
3756 return packFloat64(zSign
, 0, 0);
3759 (status
->float_detect_tininess
3760 == float_tininess_before_rounding
)
3762 || ( zSig
+ roundIncrement
< LIT64( 0x8000000000000000 ) );
3763 shift64RightJamming( zSig
, - zExp
, &zSig
);
3765 roundBits
= zSig
& 0x3FF;
3766 if (isTiny
&& roundBits
) {
3767 float_raise(float_flag_underflow
, status
);
3769 if (roundingMode
== float_round_to_odd
) {
3771 * For round-to-odd case, the roundIncrement depends on
3772 * zSig which just changed.
3774 roundIncrement
= (zSig
& 0x400) ? 0 : 0x3ff;
3779 status
->float_exception_flags
|= float_flag_inexact
;
3781 zSig
= ( zSig
+ roundIncrement
)>>10;
3782 zSig
&= ~ ( ( ( roundBits
^ 0x200 ) == 0 ) & roundNearestEven
);
3783 if ( zSig
== 0 ) zExp
= 0;
3784 return packFloat64( zSign
, zExp
, zSig
);
3788 /*----------------------------------------------------------------------------
3789 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3790 | and significand `zSig', and returns the proper double-precision floating-
3791 | point value corresponding to the abstract input. This routine is just like
3792 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
3793 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
3794 | floating-point exponent.
3795 *----------------------------------------------------------------------------*/
3798 normalizeRoundAndPackFloat64(flag zSign
, int zExp
, uint64_t zSig
,
3799 float_status
*status
)
3803 shiftCount
= clz64(zSig
) - 1;
3804 return roundAndPackFloat64(zSign
, zExp
- shiftCount
, zSig
<<shiftCount
,
3809 /*----------------------------------------------------------------------------
3810 | Normalizes the subnormal extended double-precision floating-point value
3811 | represented by the denormalized significand `aSig'. The normalized exponent
3812 | and significand are stored at the locations pointed to by `zExpPtr' and
3813 | `zSigPtr', respectively.
3814 *----------------------------------------------------------------------------*/
3816 void normalizeFloatx80Subnormal(uint64_t aSig
, int32_t *zExpPtr
,
3821 shiftCount
= clz64(aSig
);
3822 *zSigPtr
= aSig
<<shiftCount
;
3823 *zExpPtr
= 1 - shiftCount
;
3826 /*----------------------------------------------------------------------------
3827 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3828 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
3829 | and returns the proper extended double-precision floating-point value
3830 | corresponding to the abstract input. Ordinarily, the abstract value is
3831 | rounded and packed into the extended double-precision format, with the
3832 | inexact exception raised if the abstract input cannot be represented
3833 | exactly. However, if the abstract value is too large, the overflow and
3834 | inexact exceptions are raised and an infinity or maximal finite value is
3835 | returned. If the abstract value is too small, the input value is rounded to
3836 | a subnormal number, and the underflow and inexact exceptions are raised if
3837 | the abstract input cannot be represented exactly as a subnormal extended
3838 | double-precision floating-point number.
3839 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
3840 | number of bits as single or double precision, respectively. Otherwise, the
3841 | result is rounded to the full precision of the extended double-precision
3843 | The input significand must be normalized or smaller. If the input
3844 | significand is not normalized, `zExp' must be 0; in that case, the result
3845 | returned is a subnormal number, and it must not require rounding. The
3846 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
3847 | Floating-Point Arithmetic.
3848 *----------------------------------------------------------------------------*/
3850 floatx80
roundAndPackFloatx80(int8_t roundingPrecision
, flag zSign
,
3851 int32_t zExp
, uint64_t zSig0
, uint64_t zSig1
,
3852 float_status
*status
)
3854 int8_t roundingMode
;
3855 flag roundNearestEven
, increment
, isTiny
;
3856 int64_t roundIncrement
, roundMask
, roundBits
;
3858 roundingMode
= status
->float_rounding_mode
;
3859 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
3860 if ( roundingPrecision
== 80 ) goto precision80
;
3861 if ( roundingPrecision
== 64 ) {
3862 roundIncrement
= LIT64( 0x0000000000000400 );
3863 roundMask
= LIT64( 0x00000000000007FF );
3865 else if ( roundingPrecision
== 32 ) {
3866 roundIncrement
= LIT64( 0x0000008000000000 );
3867 roundMask
= LIT64( 0x000000FFFFFFFFFF );
3872 zSig0
|= ( zSig1
!= 0 );
3873 switch (roundingMode
) {
3874 case float_round_nearest_even
:
3875 case float_round_ties_away
:
3877 case float_round_to_zero
:
3880 case float_round_up
:
3881 roundIncrement
= zSign
? 0 : roundMask
;
3883 case float_round_down
:
3884 roundIncrement
= zSign
? roundMask
: 0;
3889 roundBits
= zSig0
& roundMask
;
3890 if ( 0x7FFD <= (uint32_t) ( zExp
- 1 ) ) {
3891 if ( ( 0x7FFE < zExp
)
3892 || ( ( zExp
== 0x7FFE ) && ( zSig0
+ roundIncrement
< zSig0
) )
3897 if (status
->flush_to_zero
) {
3898 float_raise(float_flag_output_denormal
, status
);
3899 return packFloatx80(zSign
, 0, 0);
3902 (status
->float_detect_tininess
3903 == float_tininess_before_rounding
)
3905 || ( zSig0
<= zSig0
+ roundIncrement
);
3906 shift64RightJamming( zSig0
, 1 - zExp
, &zSig0
);
3908 roundBits
= zSig0
& roundMask
;
3909 if (isTiny
&& roundBits
) {
3910 float_raise(float_flag_underflow
, status
);
3913 status
->float_exception_flags
|= float_flag_inexact
;
3915 zSig0
+= roundIncrement
;
3916 if ( (int64_t) zSig0
< 0 ) zExp
= 1;
3917 roundIncrement
= roundMask
+ 1;
3918 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
3919 roundMask
|= roundIncrement
;
3921 zSig0
&= ~ roundMask
;
3922 return packFloatx80( zSign
, zExp
, zSig0
);
3926 status
->float_exception_flags
|= float_flag_inexact
;
3928 zSig0
+= roundIncrement
;
3929 if ( zSig0
< roundIncrement
) {
3931 zSig0
= LIT64( 0x8000000000000000 );
3933 roundIncrement
= roundMask
+ 1;
3934 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
3935 roundMask
|= roundIncrement
;
3937 zSig0
&= ~ roundMask
;
3938 if ( zSig0
== 0 ) zExp
= 0;
3939 return packFloatx80( zSign
, zExp
, zSig0
);
3941 switch (roundingMode
) {
3942 case float_round_nearest_even
:
3943 case float_round_ties_away
:
3944 increment
= ((int64_t)zSig1
< 0);
3946 case float_round_to_zero
:
3949 case float_round_up
:
3950 increment
= !zSign
&& zSig1
;
3952 case float_round_down
:
3953 increment
= zSign
&& zSig1
;
3958 if ( 0x7FFD <= (uint32_t) ( zExp
- 1 ) ) {
3959 if ( ( 0x7FFE < zExp
)
3960 || ( ( zExp
== 0x7FFE )
3961 && ( zSig0
== LIT64( 0xFFFFFFFFFFFFFFFF ) )
3967 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
3968 if ( ( roundingMode
== float_round_to_zero
)
3969 || ( zSign
&& ( roundingMode
== float_round_up
) )
3970 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
3972 return packFloatx80( zSign
, 0x7FFE, ~ roundMask
);
3974 return packFloatx80(zSign
,
3975 floatx80_infinity_high
,
3976 floatx80_infinity_low
);
3980 (status
->float_detect_tininess
3981 == float_tininess_before_rounding
)
3984 || ( zSig0
< LIT64( 0xFFFFFFFFFFFFFFFF ) );
3985 shift64ExtraRightJamming( zSig0
, zSig1
, 1 - zExp
, &zSig0
, &zSig1
);
3987 if (isTiny
&& zSig1
) {
3988 float_raise(float_flag_underflow
, status
);
3991 status
->float_exception_flags
|= float_flag_inexact
;
3993 switch (roundingMode
) {
3994 case float_round_nearest_even
:
3995 case float_round_ties_away
:
3996 increment
= ((int64_t)zSig1
< 0);
3998 case float_round_to_zero
:
4001 case float_round_up
:
4002 increment
= !zSign
&& zSig1
;
4004 case float_round_down
:
4005 increment
= zSign
&& zSig1
;
4013 ~ ( ( (uint64_t) ( zSig1
<<1 ) == 0 ) & roundNearestEven
);
4014 if ( (int64_t) zSig0
< 0 ) zExp
= 1;
4016 return packFloatx80( zSign
, zExp
, zSig0
);
4020 status
->float_exception_flags
|= float_flag_inexact
;
4026 zSig0
= LIT64( 0x8000000000000000 );
4029 zSig0
&= ~ ( ( (uint64_t) ( zSig1
<<1 ) == 0 ) & roundNearestEven
);
4033 if ( zSig0
== 0 ) zExp
= 0;
4035 return packFloatx80( zSign
, zExp
, zSig0
);
4039 /*----------------------------------------------------------------------------
4040 | Takes an abstract floating-point value having sign `zSign', exponent
4041 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
4042 | and returns the proper extended double-precision floating-point value
4043 | corresponding to the abstract input. This routine is just like
4044 | `roundAndPackFloatx80' except that the input significand does not have to be
4046 *----------------------------------------------------------------------------*/
4048 floatx80
normalizeRoundAndPackFloatx80(int8_t roundingPrecision
,
4049 flag zSign
, int32_t zExp
,
4050 uint64_t zSig0
, uint64_t zSig1
,
4051 float_status
*status
)
4060 shiftCount
= clz64(zSig0
);
4061 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
4063 return roundAndPackFloatx80(roundingPrecision
, zSign
, zExp
,
4064 zSig0
, zSig1
, status
);
4068 /*----------------------------------------------------------------------------
4069 | Returns the least-significant 64 fraction bits of the quadruple-precision
4070 | floating-point value `a'.
4071 *----------------------------------------------------------------------------*/
4073 static inline uint64_t extractFloat128Frac1( float128 a
)
4080 /*----------------------------------------------------------------------------
4081 | Returns the most-significant 48 fraction bits of the quadruple-precision
4082 | floating-point value `a'.
4083 *----------------------------------------------------------------------------*/
4085 static inline uint64_t extractFloat128Frac0( float128 a
)
4088 return a
.high
& LIT64( 0x0000FFFFFFFFFFFF );
4092 /*----------------------------------------------------------------------------
4093 | Returns the exponent bits of the quadruple-precision floating-point value
4095 *----------------------------------------------------------------------------*/
4097 static inline int32_t extractFloat128Exp( float128 a
)
4100 return ( a
.high
>>48 ) & 0x7FFF;
4104 /*----------------------------------------------------------------------------
4105 | Returns the sign bit of the quadruple-precision floating-point value `a'.
4106 *----------------------------------------------------------------------------*/
4108 static inline flag
extractFloat128Sign( float128 a
)
4115 /*----------------------------------------------------------------------------
4116 | Normalizes the subnormal quadruple-precision floating-point value
4117 | represented by the denormalized significand formed by the concatenation of
4118 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
4119 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
4120 | significand are stored at the location pointed to by `zSig0Ptr', and the
4121 | least significant 64 bits of the normalized significand are stored at the
4122 | location pointed to by `zSig1Ptr'.
4123 *----------------------------------------------------------------------------*/
4126 normalizeFloat128Subnormal(
4137 shiftCount
= clz64(aSig1
) - 15;
4138 if ( shiftCount
< 0 ) {
4139 *zSig0Ptr
= aSig1
>>( - shiftCount
);
4140 *zSig1Ptr
= aSig1
<<( shiftCount
& 63 );
4143 *zSig0Ptr
= aSig1
<<shiftCount
;
4146 *zExpPtr
= - shiftCount
- 63;
4149 shiftCount
= clz64(aSig0
) - 15;
4150 shortShift128Left( aSig0
, aSig1
, shiftCount
, zSig0Ptr
, zSig1Ptr
);
4151 *zExpPtr
= 1 - shiftCount
;
4156 /*----------------------------------------------------------------------------
4157 | Packs the sign `zSign', the exponent `zExp', and the significand formed
4158 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
4159 | floating-point value, returning the result. After being shifted into the
4160 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
4161 | added together to form the most significant 32 bits of the result. This
4162 | means that any integer portion of `zSig0' will be added into the exponent.
4163 | Since a properly normalized significand will have an integer portion equal
4164 | to 1, the `zExp' input should be 1 less than the desired result exponent
4165 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
4167 *----------------------------------------------------------------------------*/
4169 static inline float128
4170 packFloat128( flag zSign
, int32_t zExp
, uint64_t zSig0
, uint64_t zSig1
)
4175 z
.high
= ( ( (uint64_t) zSign
)<<63 ) + ( ( (uint64_t) zExp
)<<48 ) + zSig0
;
4180 /*----------------------------------------------------------------------------
4181 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4182 | and extended significand formed by the concatenation of `zSig0', `zSig1',
4183 | and `zSig2', and returns the proper quadruple-precision floating-point value
4184 | corresponding to the abstract input. Ordinarily, the abstract value is
4185 | simply rounded and packed into the quadruple-precision format, with the
4186 | inexact exception raised if the abstract input cannot be represented
4187 | exactly. However, if the abstract value is too large, the overflow and
4188 | inexact exceptions are raised and an infinity or maximal finite value is
4189 | returned. If the abstract value is too small, the input value is rounded to
4190 | a subnormal number, and the underflow and inexact exceptions are raised if
4191 | the abstract input cannot be represented exactly as a subnormal quadruple-
4192 | precision floating-point number.
4193 | The input significand must be normalized or smaller. If the input
4194 | significand is not normalized, `zExp' must be 0; in that case, the result
4195 | returned is a subnormal number, and it must not require rounding. In the
4196 | usual case that the input significand is normalized, `zExp' must be 1 less
4197 | than the ``true'' floating-point exponent. The handling of underflow and
4198 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4199 *----------------------------------------------------------------------------*/
4201 static float128
roundAndPackFloat128(flag zSign
, int32_t zExp
,
4202 uint64_t zSig0
, uint64_t zSig1
,
4203 uint64_t zSig2
, float_status
*status
)
4205 int8_t roundingMode
;
4206 flag roundNearestEven
, increment
, isTiny
;
4208 roundingMode
= status
->float_rounding_mode
;
4209 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
4210 switch (roundingMode
) {
4211 case float_round_nearest_even
:
4212 case float_round_ties_away
:
4213 increment
= ((int64_t)zSig2
< 0);
4215 case float_round_to_zero
:
4218 case float_round_up
:
4219 increment
= !zSign
&& zSig2
;
4221 case float_round_down
:
4222 increment
= zSign
&& zSig2
;
4224 case float_round_to_odd
:
4225 increment
= !(zSig1
& 0x1) && zSig2
;
4230 if ( 0x7FFD <= (uint32_t) zExp
) {
4231 if ( ( 0x7FFD < zExp
)
4232 || ( ( zExp
== 0x7FFD )
4234 LIT64( 0x0001FFFFFFFFFFFF ),
4235 LIT64( 0xFFFFFFFFFFFFFFFF ),
4242 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
4243 if ( ( roundingMode
== float_round_to_zero
)
4244 || ( zSign
&& ( roundingMode
== float_round_up
) )
4245 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
4246 || (roundingMode
== float_round_to_odd
)
4252 LIT64( 0x0000FFFFFFFFFFFF ),
4253 LIT64( 0xFFFFFFFFFFFFFFFF )
4256 return packFloat128( zSign
, 0x7FFF, 0, 0 );
4259 if (status
->flush_to_zero
) {
4260 float_raise(float_flag_output_denormal
, status
);
4261 return packFloat128(zSign
, 0, 0, 0);
4264 (status
->float_detect_tininess
4265 == float_tininess_before_rounding
)
4271 LIT64( 0x0001FFFFFFFFFFFF ),
4272 LIT64( 0xFFFFFFFFFFFFFFFF )
4274 shift128ExtraRightJamming(
4275 zSig0
, zSig1
, zSig2
, - zExp
, &zSig0
, &zSig1
, &zSig2
);
4277 if (isTiny
&& zSig2
) {
4278 float_raise(float_flag_underflow
, status
);
4280 switch (roundingMode
) {
4281 case float_round_nearest_even
:
4282 case float_round_ties_away
:
4283 increment
= ((int64_t)zSig2
< 0);
4285 case float_round_to_zero
:
4288 case float_round_up
:
4289 increment
= !zSign
&& zSig2
;
4291 case float_round_down
:
4292 increment
= zSign
&& zSig2
;
4294 case float_round_to_odd
:
4295 increment
= !(zSig1
& 0x1) && zSig2
;
4303 status
->float_exception_flags
|= float_flag_inexact
;
4306 add128( zSig0
, zSig1
, 0, 1, &zSig0
, &zSig1
);
4307 zSig1
&= ~ ( ( zSig2
+ zSig2
== 0 ) & roundNearestEven
);
4310 if ( ( zSig0
| zSig1
) == 0 ) zExp
= 0;
4312 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
4316 /*----------------------------------------------------------------------------
4317 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4318 | and significand formed by the concatenation of `zSig0' and `zSig1', and
4319 | returns the proper quadruple-precision floating-point value corresponding
4320 | to the abstract input. This routine is just like `roundAndPackFloat128'
4321 | except that the input significand has fewer bits and does not have to be
4322 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
4324 *----------------------------------------------------------------------------*/
4326 static float128
normalizeRoundAndPackFloat128(flag zSign
, int32_t zExp
,
4327 uint64_t zSig0
, uint64_t zSig1
,
4328 float_status
*status
)
4338 shiftCount
= clz64(zSig0
) - 15;
4339 if ( 0 <= shiftCount
) {
4341 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
4344 shift128ExtraRightJamming(
4345 zSig0
, zSig1
, 0, - shiftCount
, &zSig0
, &zSig1
, &zSig2
);
4348 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
4353 /*----------------------------------------------------------------------------
4354 | Returns the result of converting the 32-bit two's complement integer `a'
4355 | to the extended double-precision floating-point format. The conversion
4356 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4358 *----------------------------------------------------------------------------*/
4360 floatx80
int32_to_floatx80(int32_t a
, float_status
*status
)
4367 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
4369 absA
= zSign
? - a
: a
;
4370 shiftCount
= clz32(absA
) + 32;
4372 return packFloatx80( zSign
, 0x403E - shiftCount
, zSig
<<shiftCount
);
4376 /*----------------------------------------------------------------------------
4377 | Returns the result of converting the 32-bit two's complement integer `a' to
4378 | the quadruple-precision floating-point format. The conversion is performed
4379 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4380 *----------------------------------------------------------------------------*/
4382 float128
int32_to_float128(int32_t a
, float_status
*status
)
4389 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
4391 absA
= zSign
? - a
: a
;
4392 shiftCount
= clz32(absA
) + 17;
4394 return packFloat128( zSign
, 0x402E - shiftCount
, zSig0
<<shiftCount
, 0 );
4398 /*----------------------------------------------------------------------------
4399 | Returns the result of converting the 64-bit two's complement integer `a'
4400 | to the extended double-precision floating-point format. The conversion
4401 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4403 *----------------------------------------------------------------------------*/
4405 floatx80
int64_to_floatx80(int64_t a
, float_status
*status
)
4411 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
4413 absA
= zSign
? - a
: a
;
4414 shiftCount
= clz64(absA
);
4415 return packFloatx80( zSign
, 0x403E - shiftCount
, absA
<<shiftCount
);
4419 /*----------------------------------------------------------------------------
4420 | Returns the result of converting the 64-bit two's complement integer `a' to
4421 | the quadruple-precision floating-point format. The conversion is performed
4422 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4423 *----------------------------------------------------------------------------*/
4425 float128
int64_to_float128(int64_t a
, float_status
*status
)
4431 uint64_t zSig0
, zSig1
;
4433 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
4435 absA
= zSign
? - a
: a
;
4436 shiftCount
= clz64(absA
) + 49;
4437 zExp
= 0x406E - shiftCount
;
4438 if ( 64 <= shiftCount
) {
4447 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
4448 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
4452 /*----------------------------------------------------------------------------
4453 | Returns the result of converting the 64-bit unsigned integer `a'
4454 | to the quadruple-precision floating-point format. The conversion is performed
4455 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4456 *----------------------------------------------------------------------------*/
4458 float128
uint64_to_float128(uint64_t a
, float_status
*status
)
4461 return float128_zero
;
4463 return normalizeRoundAndPackFloat128(0, 0x406E, 0, a
, status
);
4466 /*----------------------------------------------------------------------------
4467 | Returns the result of converting the single-precision floating-point value
4468 | `a' to the extended double-precision floating-point format. The conversion
4469 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4471 *----------------------------------------------------------------------------*/
4473 floatx80
float32_to_floatx80(float32 a
, float_status
*status
)
4479 a
= float32_squash_input_denormal(a
, status
);
4480 aSig
= extractFloat32Frac( a
);
4481 aExp
= extractFloat32Exp( a
);
4482 aSign
= extractFloat32Sign( a
);
4483 if ( aExp
== 0xFF ) {
4485 return commonNaNToFloatx80(float32ToCommonNaN(a
, status
), status
);
4487 return packFloatx80(aSign
,
4488 floatx80_infinity_high
,
4489 floatx80_infinity_low
);
4492 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
4493 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
4496 return packFloatx80( aSign
, aExp
+ 0x3F80, ( (uint64_t) aSig
)<<40 );
4500 /*----------------------------------------------------------------------------
4501 | Returns the result of converting the single-precision floating-point value
4502 | `a' to the double-precision floating-point format. The conversion is
4503 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4505 *----------------------------------------------------------------------------*/
4507 float128
float32_to_float128(float32 a
, float_status
*status
)
4513 a
= float32_squash_input_denormal(a
, status
);
4514 aSig
= extractFloat32Frac( a
);
4515 aExp
= extractFloat32Exp( a
);
4516 aSign
= extractFloat32Sign( a
);
4517 if ( aExp
== 0xFF ) {
4519 return commonNaNToFloat128(float32ToCommonNaN(a
, status
), status
);
4521 return packFloat128( aSign
, 0x7FFF, 0, 0 );
4524 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
4525 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
4528 return packFloat128( aSign
, aExp
+ 0x3F80, ( (uint64_t) aSig
)<<25, 0 );
4532 /*----------------------------------------------------------------------------
4533 | Returns the remainder of the single-precision floating-point value `a'
4534 | with respect to the corresponding value `b'. The operation is performed
4535 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4536 *----------------------------------------------------------------------------*/
4538 float32
float32_rem(float32 a
, float32 b
, float_status
*status
)
4541 int aExp
, bExp
, expDiff
;
4542 uint32_t aSig
, bSig
;
4544 uint64_t aSig64
, bSig64
, q64
;
4545 uint32_t alternateASig
;
4547 a
= float32_squash_input_denormal(a
, status
);
4548 b
= float32_squash_input_denormal(b
, status
);
4550 aSig
= extractFloat32Frac( a
);
4551 aExp
= extractFloat32Exp( a
);
4552 aSign
= extractFloat32Sign( a
);
4553 bSig
= extractFloat32Frac( b
);
4554 bExp
= extractFloat32Exp( b
);
4555 if ( aExp
== 0xFF ) {
4556 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
4557 return propagateFloat32NaN(a
, b
, status
);
4559 float_raise(float_flag_invalid
, status
);
4560 return float32_default_nan(status
);
4562 if ( bExp
== 0xFF ) {
4564 return propagateFloat32NaN(a
, b
, status
);
4570 float_raise(float_flag_invalid
, status
);
4571 return float32_default_nan(status
);
4573 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
4576 if ( aSig
== 0 ) return a
;
4577 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
4579 expDiff
= aExp
- bExp
;
4582 if ( expDiff
< 32 ) {
4585 if ( expDiff
< 0 ) {
4586 if ( expDiff
< -1 ) return a
;
4589 q
= ( bSig
<= aSig
);
4590 if ( q
) aSig
-= bSig
;
4591 if ( 0 < expDiff
) {
4592 q
= ( ( (uint64_t) aSig
)<<32 ) / bSig
;
4595 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
4603 if ( bSig
<= aSig
) aSig
-= bSig
;
4604 aSig64
= ( (uint64_t) aSig
)<<40;
4605 bSig64
= ( (uint64_t) bSig
)<<40;
4607 while ( 0 < expDiff
) {
4608 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
4609 q64
= ( 2 < q64
) ? q64
- 2 : 0;
4610 aSig64
= - ( ( bSig
* q64
)<<38 );
4614 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
4615 q64
= ( 2 < q64
) ? q64
- 2 : 0;
4616 q
= q64
>>( 64 - expDiff
);
4618 aSig
= ( ( aSig64
>>33 )<<( expDiff
- 1 ) ) - bSig
* q
;
4621 alternateASig
= aSig
;
4624 } while ( 0 <= (int32_t) aSig
);
4625 sigMean
= aSig
+ alternateASig
;
4626 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
4627 aSig
= alternateASig
;
4629 zSign
= ( (int32_t) aSig
< 0 );
4630 if ( zSign
) aSig
= - aSig
;
4631 return normalizeRoundAndPackFloat32(aSign
^ zSign
, bExp
, aSig
, status
);
4636 /*----------------------------------------------------------------------------
4637 | Returns the binary exponential of the single-precision floating-point value
4638 | `a'. The operation is performed according to the IEC/IEEE Standard for
4639 | Binary Floating-Point Arithmetic.
4641 | Uses the following identities:
4643 | 1. -------------------------------------------------------------------------
4647 | 2. -------------------------------------------------------------------------
4650 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
4652 *----------------------------------------------------------------------------*/
4654 static const float64 float32_exp2_coefficients
[15] =
4656 const_float64( 0x3ff0000000000000ll
), /* 1 */
4657 const_float64( 0x3fe0000000000000ll
), /* 2 */
4658 const_float64( 0x3fc5555555555555ll
), /* 3 */
4659 const_float64( 0x3fa5555555555555ll
), /* 4 */
4660 const_float64( 0x3f81111111111111ll
), /* 5 */
4661 const_float64( 0x3f56c16c16c16c17ll
), /* 6 */
4662 const_float64( 0x3f2a01a01a01a01all
), /* 7 */
4663 const_float64( 0x3efa01a01a01a01all
), /* 8 */
4664 const_float64( 0x3ec71de3a556c734ll
), /* 9 */
4665 const_float64( 0x3e927e4fb7789f5cll
), /* 10 */
4666 const_float64( 0x3e5ae64567f544e4ll
), /* 11 */
4667 const_float64( 0x3e21eed8eff8d898ll
), /* 12 */
4668 const_float64( 0x3de6124613a86d09ll
), /* 13 */
4669 const_float64( 0x3da93974a8c07c9dll
), /* 14 */
4670 const_float64( 0x3d6ae7f3e733b81fll
), /* 15 */
4673 float32
float32_exp2(float32 a
, float_status
*status
)
4680 a
= float32_squash_input_denormal(a
, status
);
4682 aSig
= extractFloat32Frac( a
);
4683 aExp
= extractFloat32Exp( a
);
4684 aSign
= extractFloat32Sign( a
);
4686 if ( aExp
== 0xFF) {
4688 return propagateFloat32NaN(a
, float32_zero
, status
);
4690 return (aSign
) ? float32_zero
: a
;
4693 if (aSig
== 0) return float32_one
;
4696 float_raise(float_flag_inexact
, status
);
4698 /* ******************************* */
4699 /* using float64 for approximation */
4700 /* ******************************* */
4701 x
= float32_to_float64(a
, status
);
4702 x
= float64_mul(x
, float64_ln2
, status
);
4706 for (i
= 0 ; i
< 15 ; i
++) {
4709 f
= float64_mul(xn
, float32_exp2_coefficients
[i
], status
);
4710 r
= float64_add(r
, f
, status
);
4712 xn
= float64_mul(xn
, x
, status
);
4715 return float64_to_float32(r
, status
);
4718 /*----------------------------------------------------------------------------
4719 | Returns the binary log of the single-precision floating-point value `a'.
4720 | The operation is performed according to the IEC/IEEE Standard for Binary
4721 | Floating-Point Arithmetic.
4722 *----------------------------------------------------------------------------*/
4723 float32
float32_log2(float32 a
, float_status
*status
)
4727 uint32_t aSig
, zSig
, i
;
4729 a
= float32_squash_input_denormal(a
, status
);
4730 aSig
= extractFloat32Frac( a
);
4731 aExp
= extractFloat32Exp( a
);
4732 aSign
= extractFloat32Sign( a
);
4735 if ( aSig
== 0 ) return packFloat32( 1, 0xFF, 0 );
4736 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
4739 float_raise(float_flag_invalid
, status
);
4740 return float32_default_nan(status
);
4742 if ( aExp
== 0xFF ) {
4744 return propagateFloat32NaN(a
, float32_zero
, status
);
4754 for (i
= 1 << 22; i
> 0; i
>>= 1) {
4755 aSig
= ( (uint64_t)aSig
* aSig
) >> 23;
4756 if ( aSig
& 0x01000000 ) {
4765 return normalizeRoundAndPackFloat32(zSign
, 0x85, zSig
, status
);
4768 /*----------------------------------------------------------------------------
4769 | Returns 1 if the single-precision floating-point value `a' is equal to
4770 | the corresponding value `b', and 0 otherwise. The invalid exception is
4771 | raised if either operand is a NaN. Otherwise, the comparison is performed
4772 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4773 *----------------------------------------------------------------------------*/
4775 int float32_eq(float32 a
, float32 b
, float_status
*status
)
4778 a
= float32_squash_input_denormal(a
, status
);
4779 b
= float32_squash_input_denormal(b
, status
);
4781 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
4782 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
4784 float_raise(float_flag_invalid
, status
);
4787 av
= float32_val(a
);
4788 bv
= float32_val(b
);
4789 return ( av
== bv
) || ( (uint32_t) ( ( av
| bv
)<<1 ) == 0 );
4792 /*----------------------------------------------------------------------------
4793 | Returns 1 if the single-precision floating-point value `a' is less than
4794 | or equal to the corresponding value `b', and 0 otherwise. The invalid
4795 | exception is raised if either operand is a NaN. The comparison is performed
4796 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4797 *----------------------------------------------------------------------------*/
4799 int float32_le(float32 a
, float32 b
, float_status
*status
)
4803 a
= float32_squash_input_denormal(a
, status
);
4804 b
= float32_squash_input_denormal(b
, status
);
4806 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
4807 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
4809 float_raise(float_flag_invalid
, status
);
4812 aSign
= extractFloat32Sign( a
);
4813 bSign
= extractFloat32Sign( b
);
4814 av
= float32_val(a
);
4815 bv
= float32_val(b
);
4816 if ( aSign
!= bSign
) return aSign
|| ( (uint32_t) ( ( av
| bv
)<<1 ) == 0 );
4817 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
4821 /*----------------------------------------------------------------------------
4822 | Returns 1 if the single-precision floating-point value `a' is less than
4823 | the corresponding value `b', and 0 otherwise. The invalid exception is
4824 | raised if either operand is a NaN. The comparison is performed according
4825 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4826 *----------------------------------------------------------------------------*/
4828 int float32_lt(float32 a
, float32 b
, float_status
*status
)
4832 a
= float32_squash_input_denormal(a
, status
);
4833 b
= float32_squash_input_denormal(b
, status
);
4835 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
4836 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
4838 float_raise(float_flag_invalid
, status
);
4841 aSign
= extractFloat32Sign( a
);
4842 bSign
= extractFloat32Sign( b
);
4843 av
= float32_val(a
);
4844 bv
= float32_val(b
);
4845 if ( aSign
!= bSign
) return aSign
&& ( (uint32_t) ( ( av
| bv
)<<1 ) != 0 );
4846 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
4850 /*----------------------------------------------------------------------------
4851 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
4852 | be compared, and 0 otherwise. The invalid exception is raised if either
4853 | operand is a NaN. The comparison is performed according to the IEC/IEEE
4854 | Standard for Binary Floating-Point Arithmetic.
4855 *----------------------------------------------------------------------------*/
4857 int float32_unordered(float32 a
, float32 b
, float_status
*status
)
4859 a
= float32_squash_input_denormal(a
, status
);
4860 b
= float32_squash_input_denormal(b
, status
);
4862 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
4863 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
4865 float_raise(float_flag_invalid
, status
);
4871 /*----------------------------------------------------------------------------
4872 | Returns 1 if the single-precision floating-point value `a' is equal to
4873 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4874 | exception. The comparison is performed according to the IEC/IEEE Standard
4875 | for Binary Floating-Point Arithmetic.
4876 *----------------------------------------------------------------------------*/
4878 int float32_eq_quiet(float32 a
, float32 b
, float_status
*status
)
4880 a
= float32_squash_input_denormal(a
, status
);
4881 b
= float32_squash_input_denormal(b
, status
);
4883 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
4884 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
4886 if (float32_is_signaling_nan(a
, status
)
4887 || float32_is_signaling_nan(b
, status
)) {
4888 float_raise(float_flag_invalid
, status
);
4892 return ( float32_val(a
) == float32_val(b
) ) ||
4893 ( (uint32_t) ( ( float32_val(a
) | float32_val(b
) )<<1 ) == 0 );
4896 /*----------------------------------------------------------------------------
4897 | Returns 1 if the single-precision floating-point value `a' is less than or
4898 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4899 | cause an exception. Otherwise, the comparison is performed according to the
4900 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4901 *----------------------------------------------------------------------------*/
4903 int float32_le_quiet(float32 a
, float32 b
, float_status
*status
)
4907 a
= float32_squash_input_denormal(a
, status
);
4908 b
= float32_squash_input_denormal(b
, status
);
4910 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
4911 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
4913 if (float32_is_signaling_nan(a
, status
)
4914 || float32_is_signaling_nan(b
, status
)) {
4915 float_raise(float_flag_invalid
, status
);
4919 aSign
= extractFloat32Sign( a
);
4920 bSign
= extractFloat32Sign( b
);
4921 av
= float32_val(a
);
4922 bv
= float32_val(b
);
4923 if ( aSign
!= bSign
) return aSign
|| ( (uint32_t) ( ( av
| bv
)<<1 ) == 0 );
4924 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
4928 /*----------------------------------------------------------------------------
4929 | Returns 1 if the single-precision floating-point value `a' is less than
4930 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4931 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
4932 | Standard for Binary Floating-Point Arithmetic.
4933 *----------------------------------------------------------------------------*/
4935 int float32_lt_quiet(float32 a
, float32 b
, float_status
*status
)
4939 a
= float32_squash_input_denormal(a
, status
);
4940 b
= float32_squash_input_denormal(b
, status
);
4942 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
4943 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
4945 if (float32_is_signaling_nan(a
, status
)
4946 || float32_is_signaling_nan(b
, status
)) {
4947 float_raise(float_flag_invalid
, status
);
4951 aSign
= extractFloat32Sign( a
);
4952 bSign
= extractFloat32Sign( b
);
4953 av
= float32_val(a
);
4954 bv
= float32_val(b
);
4955 if ( aSign
!= bSign
) return aSign
&& ( (uint32_t) ( ( av
| bv
)<<1 ) != 0 );
4956 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
4960 /*----------------------------------------------------------------------------
4961 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
4962 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
4963 | comparison is performed according to the IEC/IEEE Standard for Binary
4964 | Floating-Point Arithmetic.
4965 *----------------------------------------------------------------------------*/
4967 int float32_unordered_quiet(float32 a
, float32 b
, float_status
*status
)
4969 a
= float32_squash_input_denormal(a
, status
);
4970 b
= float32_squash_input_denormal(b
, status
);
4972 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
4973 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
4975 if (float32_is_signaling_nan(a
, status
)
4976 || float32_is_signaling_nan(b
, status
)) {
4977 float_raise(float_flag_invalid
, status
);
4984 /*----------------------------------------------------------------------------
4985 | If `a' is denormal and we are in flush-to-zero mode then set the
4986 | input-denormal exception and return zero. Otherwise just return the value.
4987 *----------------------------------------------------------------------------*/
4988 float16
float16_squash_input_denormal(float16 a
, float_status
*status
)
4990 if (status
->flush_inputs_to_zero
) {
4991 if (extractFloat16Exp(a
) == 0 && extractFloat16Frac(a
) != 0) {
4992 float_raise(float_flag_input_denormal
, status
);
4993 return make_float16(float16_val(a
) & 0x8000);
4999 /*----------------------------------------------------------------------------
5000 | Returns the result of converting the double-precision floating-point value
5001 | `a' to the extended double-precision floating-point format. The conversion
5002 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5004 *----------------------------------------------------------------------------*/
5006 floatx80
float64_to_floatx80(float64 a
, float_status
*status
)
5012 a
= float64_squash_input_denormal(a
, status
);
5013 aSig
= extractFloat64Frac( a
);
5014 aExp
= extractFloat64Exp( a
);
5015 aSign
= extractFloat64Sign( a
);
5016 if ( aExp
== 0x7FF ) {
5018 return commonNaNToFloatx80(float64ToCommonNaN(a
, status
), status
);
5020 return packFloatx80(aSign
,
5021 floatx80_infinity_high
,
5022 floatx80_infinity_low
);
5025 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
5026 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
5030 aSign
, aExp
+ 0x3C00, ( aSig
| LIT64( 0x0010000000000000 ) )<<11 );
5034 /*----------------------------------------------------------------------------
5035 | Returns the result of converting the double-precision floating-point value
5036 | `a' to the quadruple-precision floating-point format. The conversion is
5037 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5039 *----------------------------------------------------------------------------*/
5041 float128
float64_to_float128(float64 a
, float_status
*status
)
5045 uint64_t aSig
, zSig0
, zSig1
;
5047 a
= float64_squash_input_denormal(a
, status
);
5048 aSig
= extractFloat64Frac( a
);
5049 aExp
= extractFloat64Exp( a
);
5050 aSign
= extractFloat64Sign( a
);
5051 if ( aExp
== 0x7FF ) {
5053 return commonNaNToFloat128(float64ToCommonNaN(a
, status
), status
);
5055 return packFloat128( aSign
, 0x7FFF, 0, 0 );
5058 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
5059 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
5062 shift128Right( aSig
, 0, 4, &zSig0
, &zSig1
);
5063 return packFloat128( aSign
, aExp
+ 0x3C00, zSig0
, zSig1
);
5068 /*----------------------------------------------------------------------------
5069 | Returns the remainder of the double-precision floating-point value `a'
5070 | with respect to the corresponding value `b'. The operation is performed
5071 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5072 *----------------------------------------------------------------------------*/
5074 float64
float64_rem(float64 a
, float64 b
, float_status
*status
)
5077 int aExp
, bExp
, expDiff
;
5078 uint64_t aSig
, bSig
;
5079 uint64_t q
, alternateASig
;
5082 a
= float64_squash_input_denormal(a
, status
);
5083 b
= float64_squash_input_denormal(b
, status
);
5084 aSig
= extractFloat64Frac( a
);
5085 aExp
= extractFloat64Exp( a
);
5086 aSign
= extractFloat64Sign( a
);
5087 bSig
= extractFloat64Frac( b
);
5088 bExp
= extractFloat64Exp( b
);
5089 if ( aExp
== 0x7FF ) {
5090 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
5091 return propagateFloat64NaN(a
, b
, status
);
5093 float_raise(float_flag_invalid
, status
);
5094 return float64_default_nan(status
);
5096 if ( bExp
== 0x7FF ) {
5098 return propagateFloat64NaN(a
, b
, status
);
5104 float_raise(float_flag_invalid
, status
);
5105 return float64_default_nan(status
);
5107 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
5110 if ( aSig
== 0 ) return a
;
5111 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
5113 expDiff
= aExp
- bExp
;
5114 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<11;
5115 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
5116 if ( expDiff
< 0 ) {
5117 if ( expDiff
< -1 ) return a
;
5120 q
= ( bSig
<= aSig
);
5121 if ( q
) aSig
-= bSig
;
5123 while ( 0 < expDiff
) {
5124 q
= estimateDiv128To64( aSig
, 0, bSig
);
5125 q
= ( 2 < q
) ? q
- 2 : 0;
5126 aSig
= - ( ( bSig
>>2 ) * q
);
5130 if ( 0 < expDiff
) {
5131 q
= estimateDiv128To64( aSig
, 0, bSig
);
5132 q
= ( 2 < q
) ? q
- 2 : 0;
5135 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
5142 alternateASig
= aSig
;
5145 } while ( 0 <= (int64_t) aSig
);
5146 sigMean
= aSig
+ alternateASig
;
5147 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
5148 aSig
= alternateASig
;
5150 zSign
= ( (int64_t) aSig
< 0 );
5151 if ( zSign
) aSig
= - aSig
;
5152 return normalizeRoundAndPackFloat64(aSign
^ zSign
, bExp
, aSig
, status
);
5156 /*----------------------------------------------------------------------------
5157 | Returns the binary log of the double-precision floating-point value `a'.
5158 | The operation is performed according to the IEC/IEEE Standard for Binary
5159 | Floating-Point Arithmetic.
5160 *----------------------------------------------------------------------------*/
5161 float64
float64_log2(float64 a
, float_status
*status
)
5165 uint64_t aSig
, aSig0
, aSig1
, zSig
, i
;
5166 a
= float64_squash_input_denormal(a
, status
);
5168 aSig
= extractFloat64Frac( a
);
5169 aExp
= extractFloat64Exp( a
);
5170 aSign
= extractFloat64Sign( a
);
5173 if ( aSig
== 0 ) return packFloat64( 1, 0x7FF, 0 );
5174 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
5177 float_raise(float_flag_invalid
, status
);
5178 return float64_default_nan(status
);
5180 if ( aExp
== 0x7FF ) {
5182 return propagateFloat64NaN(a
, float64_zero
, status
);
5188 aSig
|= LIT64( 0x0010000000000000 );
5190 zSig
= (uint64_t)aExp
<< 52;
5191 for (i
= 1LL << 51; i
> 0; i
>>= 1) {
5192 mul64To128( aSig
, aSig
, &aSig0
, &aSig1
);
5193 aSig
= ( aSig0
<< 12 ) | ( aSig1
>> 52 );
5194 if ( aSig
& LIT64( 0x0020000000000000 ) ) {
5202 return normalizeRoundAndPackFloat64(zSign
, 0x408, zSig
, status
);
5205 /*----------------------------------------------------------------------------
5206 | Returns 1 if the double-precision floating-point value `a' is equal to the
5207 | corresponding value `b', and 0 otherwise. The invalid exception is raised
5208 | if either operand is a NaN. Otherwise, the comparison is performed
5209 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5210 *----------------------------------------------------------------------------*/
5212 int float64_eq(float64 a
, float64 b
, float_status
*status
)
5215 a
= float64_squash_input_denormal(a
, status
);
5216 b
= float64_squash_input_denormal(b
, status
);
5218 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
5219 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
5221 float_raise(float_flag_invalid
, status
);
5224 av
= float64_val(a
);
5225 bv
= float64_val(b
);
5226 return ( av
== bv
) || ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
5230 /*----------------------------------------------------------------------------
5231 | Returns 1 if the double-precision floating-point value `a' is less than or
5232 | equal to the corresponding value `b', and 0 otherwise. The invalid
5233 | exception is raised if either operand is a NaN. The comparison is performed
5234 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5235 *----------------------------------------------------------------------------*/
5237 int float64_le(float64 a
, float64 b
, float_status
*status
)
5241 a
= float64_squash_input_denormal(a
, status
);
5242 b
= float64_squash_input_denormal(b
, status
);
5244 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
5245 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
5247 float_raise(float_flag_invalid
, status
);
5250 aSign
= extractFloat64Sign( a
);
5251 bSign
= extractFloat64Sign( b
);
5252 av
= float64_val(a
);
5253 bv
= float64_val(b
);
5254 if ( aSign
!= bSign
) return aSign
|| ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
5255 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
5259 /*----------------------------------------------------------------------------
5260 | Returns 1 if the double-precision floating-point value `a' is less than
5261 | the corresponding value `b', and 0 otherwise. The invalid exception is
5262 | raised if either operand is a NaN. The comparison is performed according
5263 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5264 *----------------------------------------------------------------------------*/
5266 int float64_lt(float64 a
, float64 b
, float_status
*status
)
5271 a
= float64_squash_input_denormal(a
, status
);
5272 b
= float64_squash_input_denormal(b
, status
);
5273 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
5274 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
5276 float_raise(float_flag_invalid
, status
);
5279 aSign
= extractFloat64Sign( a
);
5280 bSign
= extractFloat64Sign( b
);
5281 av
= float64_val(a
);
5282 bv
= float64_val(b
);
5283 if ( aSign
!= bSign
) return aSign
&& ( (uint64_t) ( ( av
| bv
)<<1 ) != 0 );
5284 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
5288 /*----------------------------------------------------------------------------
5289 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
5290 | be compared, and 0 otherwise. The invalid exception is raised if either
5291 | operand is a NaN. The comparison is performed according to the IEC/IEEE
5292 | Standard for Binary Floating-Point Arithmetic.
5293 *----------------------------------------------------------------------------*/
5295 int float64_unordered(float64 a
, float64 b
, float_status
*status
)
5297 a
= float64_squash_input_denormal(a
, status
);
5298 b
= float64_squash_input_denormal(b
, status
);
5300 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
5301 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
5303 float_raise(float_flag_invalid
, status
);
5309 /*----------------------------------------------------------------------------
5310 | Returns 1 if the double-precision floating-point value `a' is equal to the
5311 | corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5312 | exception.The comparison is performed according to the IEC/IEEE Standard
5313 | for Binary Floating-Point Arithmetic.
5314 *----------------------------------------------------------------------------*/
5316 int float64_eq_quiet(float64 a
, float64 b
, float_status
*status
)
5319 a
= float64_squash_input_denormal(a
, status
);
5320 b
= float64_squash_input_denormal(b
, status
);
5322 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
5323 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
5325 if (float64_is_signaling_nan(a
, status
)
5326 || float64_is_signaling_nan(b
, status
)) {
5327 float_raise(float_flag_invalid
, status
);
5331 av
= float64_val(a
);
5332 bv
= float64_val(b
);
5333 return ( av
== bv
) || ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
5337 /*----------------------------------------------------------------------------
5338 | Returns 1 if the double-precision floating-point value `a' is less than or
5339 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5340 | cause an exception. Otherwise, the comparison is performed according to the
5341 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5342 *----------------------------------------------------------------------------*/
5344 int float64_le_quiet(float64 a
, float64 b
, float_status
*status
)
5348 a
= float64_squash_input_denormal(a
, status
);
5349 b
= float64_squash_input_denormal(b
, status
);
5351 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
5352 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
5354 if (float64_is_signaling_nan(a
, status
)
5355 || float64_is_signaling_nan(b
, status
)) {
5356 float_raise(float_flag_invalid
, status
);
5360 aSign
= extractFloat64Sign( a
);
5361 bSign
= extractFloat64Sign( b
);
5362 av
= float64_val(a
);
5363 bv
= float64_val(b
);
5364 if ( aSign
!= bSign
) return aSign
|| ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
5365 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
5369 /*----------------------------------------------------------------------------
5370 | Returns 1 if the double-precision floating-point value `a' is less than
5371 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5372 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
5373 | Standard for Binary Floating-Point Arithmetic.
5374 *----------------------------------------------------------------------------*/
5376 int float64_lt_quiet(float64 a
, float64 b
, float_status
*status
)
5380 a
= float64_squash_input_denormal(a
, status
);
5381 b
= float64_squash_input_denormal(b
, status
);
5383 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
5384 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
5386 if (float64_is_signaling_nan(a
, status
)
5387 || float64_is_signaling_nan(b
, status
)) {
5388 float_raise(float_flag_invalid
, status
);
5392 aSign
= extractFloat64Sign( a
);
5393 bSign
= extractFloat64Sign( b
);
5394 av
= float64_val(a
);
5395 bv
= float64_val(b
);
5396 if ( aSign
!= bSign
) return aSign
&& ( (uint64_t) ( ( av
| bv
)<<1 ) != 0 );
5397 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
5401 /*----------------------------------------------------------------------------
5402 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
5403 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
5404 | comparison is performed according to the IEC/IEEE Standard for Binary
5405 | Floating-Point Arithmetic.
5406 *----------------------------------------------------------------------------*/
5408 int float64_unordered_quiet(float64 a
, float64 b
, float_status
*status
)
5410 a
= float64_squash_input_denormal(a
, status
);
5411 b
= float64_squash_input_denormal(b
, status
);
5413 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
5414 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
5416 if (float64_is_signaling_nan(a
, status
)
5417 || float64_is_signaling_nan(b
, status
)) {
5418 float_raise(float_flag_invalid
, status
);
5425 /*----------------------------------------------------------------------------
5426 | Returns the result of converting the extended double-precision floating-
5427 | point value `a' to the 32-bit two's complement integer format. The
5428 | conversion is performed according to the IEC/IEEE Standard for Binary
5429 | Floating-Point Arithmetic---which means in particular that the conversion
5430 | is rounded according to the current rounding mode. If `a' is a NaN, the
5431 | largest positive integer is returned. Otherwise, if the conversion
5432 | overflows, the largest integer with the same sign as `a' is returned.
5433 *----------------------------------------------------------------------------*/
5435 int32_t floatx80_to_int32(floatx80 a
, float_status
*status
)
5438 int32_t aExp
, shiftCount
;
5441 if (floatx80_invalid_encoding(a
)) {
5442 float_raise(float_flag_invalid
, status
);
5445 aSig
= extractFloatx80Frac( a
);
5446 aExp
= extractFloatx80Exp( a
);
5447 aSign
= extractFloatx80Sign( a
);
5448 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) aSign
= 0;
5449 shiftCount
= 0x4037 - aExp
;
5450 if ( shiftCount
<= 0 ) shiftCount
= 1;
5451 shift64RightJamming( aSig
, shiftCount
, &aSig
);
5452 return roundAndPackInt32(aSign
, aSig
, status
);
5456 /*----------------------------------------------------------------------------
5457 | Returns the result of converting the extended double-precision floating-
5458 | point value `a' to the 32-bit two's complement integer format. The
5459 | conversion is performed according to the IEC/IEEE Standard for Binary
5460 | Floating-Point Arithmetic, except that the conversion is always rounded
5461 | toward zero. If `a' is a NaN, the largest positive integer is returned.
5462 | Otherwise, if the conversion overflows, the largest integer with the same
5463 | sign as `a' is returned.
5464 *----------------------------------------------------------------------------*/
5466 int32_t floatx80_to_int32_round_to_zero(floatx80 a
, float_status
*status
)
5469 int32_t aExp
, shiftCount
;
5470 uint64_t aSig
, savedASig
;
5473 if (floatx80_invalid_encoding(a
)) {
5474 float_raise(float_flag_invalid
, status
);
5477 aSig
= extractFloatx80Frac( a
);
5478 aExp
= extractFloatx80Exp( a
);
5479 aSign
= extractFloatx80Sign( a
);
5480 if ( 0x401E < aExp
) {
5481 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) aSign
= 0;
5484 else if ( aExp
< 0x3FFF ) {
5486 status
->float_exception_flags
|= float_flag_inexact
;
5490 shiftCount
= 0x403E - aExp
;
5492 aSig
>>= shiftCount
;
5494 if ( aSign
) z
= - z
;
5495 if ( ( z
< 0 ) ^ aSign
) {
5497 float_raise(float_flag_invalid
, status
);
5498 return aSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
5500 if ( ( aSig
<<shiftCount
) != savedASig
) {
5501 status
->float_exception_flags
|= float_flag_inexact
;
5507 /*----------------------------------------------------------------------------
5508 | Returns the result of converting the extended double-precision floating-
5509 | point value `a' to the 64-bit two's complement integer format. The
5510 | conversion is performed according to the IEC/IEEE Standard for Binary
5511 | Floating-Point Arithmetic---which means in particular that the conversion
5512 | is rounded according to the current rounding mode. If `a' is a NaN,
5513 | the largest positive integer is returned. Otherwise, if the conversion
5514 | overflows, the largest integer with the same sign as `a' is returned.
5515 *----------------------------------------------------------------------------*/
5517 int64_t floatx80_to_int64(floatx80 a
, float_status
*status
)
5520 int32_t aExp
, shiftCount
;
5521 uint64_t aSig
, aSigExtra
;
5523 if (floatx80_invalid_encoding(a
)) {
5524 float_raise(float_flag_invalid
, status
);
5527 aSig
= extractFloatx80Frac( a
);
5528 aExp
= extractFloatx80Exp( a
);
5529 aSign
= extractFloatx80Sign( a
);
5530 shiftCount
= 0x403E - aExp
;
5531 if ( shiftCount
<= 0 ) {
5533 float_raise(float_flag_invalid
, status
);
5534 if (!aSign
|| floatx80_is_any_nan(a
)) {
5535 return LIT64( 0x7FFFFFFFFFFFFFFF );
5537 return (int64_t) LIT64( 0x8000000000000000 );
5542 shift64ExtraRightJamming( aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
5544 return roundAndPackInt64(aSign
, aSig
, aSigExtra
, status
);
5548 /*----------------------------------------------------------------------------
5549 | Returns the result of converting the extended double-precision floating-
5550 | point value `a' to the 64-bit two's complement integer format. The
5551 | conversion is performed according to the IEC/IEEE Standard for Binary
5552 | Floating-Point Arithmetic, except that the conversion is always rounded
5553 | toward zero. If `a' is a NaN, the largest positive integer is returned.
5554 | Otherwise, if the conversion overflows, the largest integer with the same
5555 | sign as `a' is returned.
5556 *----------------------------------------------------------------------------*/
5558 int64_t floatx80_to_int64_round_to_zero(floatx80 a
, float_status
*status
)
5561 int32_t aExp
, shiftCount
;
5565 if (floatx80_invalid_encoding(a
)) {
5566 float_raise(float_flag_invalid
, status
);
5569 aSig
= extractFloatx80Frac( a
);
5570 aExp
= extractFloatx80Exp( a
);
5571 aSign
= extractFloatx80Sign( a
);
5572 shiftCount
= aExp
- 0x403E;
5573 if ( 0 <= shiftCount
) {
5574 aSig
&= LIT64( 0x7FFFFFFFFFFFFFFF );
5575 if ( ( a
.high
!= 0xC03E ) || aSig
) {
5576 float_raise(float_flag_invalid
, status
);
5577 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && aSig
) ) {
5578 return LIT64( 0x7FFFFFFFFFFFFFFF );
5581 return (int64_t) LIT64( 0x8000000000000000 );
5583 else if ( aExp
< 0x3FFF ) {
5585 status
->float_exception_flags
|= float_flag_inexact
;
5589 z
= aSig
>>( - shiftCount
);
5590 if ( (uint64_t) ( aSig
<<( shiftCount
& 63 ) ) ) {
5591 status
->float_exception_flags
|= float_flag_inexact
;
5593 if ( aSign
) z
= - z
;
5598 /*----------------------------------------------------------------------------
5599 | Returns the result of converting the extended double-precision floating-
5600 | point value `a' to the single-precision floating-point format. The
5601 | conversion is performed according to the IEC/IEEE Standard for Binary
5602 | Floating-Point Arithmetic.
5603 *----------------------------------------------------------------------------*/
5605 float32
floatx80_to_float32(floatx80 a
, float_status
*status
)
5611 if (floatx80_invalid_encoding(a
)) {
5612 float_raise(float_flag_invalid
, status
);
5613 return float32_default_nan(status
);
5615 aSig
= extractFloatx80Frac( a
);
5616 aExp
= extractFloatx80Exp( a
);
5617 aSign
= extractFloatx80Sign( a
);
5618 if ( aExp
== 0x7FFF ) {
5619 if ( (uint64_t) ( aSig
<<1 ) ) {
5620 return commonNaNToFloat32(floatx80ToCommonNaN(a
, status
), status
);
5622 return packFloat32( aSign
, 0xFF, 0 );
5624 shift64RightJamming( aSig
, 33, &aSig
);
5625 if ( aExp
|| aSig
) aExp
-= 0x3F81;
5626 return roundAndPackFloat32(aSign
, aExp
, aSig
, status
);
5630 /*----------------------------------------------------------------------------
5631 | Returns the result of converting the extended double-precision floating-
5632 | point value `a' to the double-precision floating-point format. The
5633 | conversion is performed according to the IEC/IEEE Standard for Binary
5634 | Floating-Point Arithmetic.
5635 *----------------------------------------------------------------------------*/
5637 float64
floatx80_to_float64(floatx80 a
, float_status
*status
)
5641 uint64_t aSig
, zSig
;
5643 if (floatx80_invalid_encoding(a
)) {
5644 float_raise(float_flag_invalid
, status
);
5645 return float64_default_nan(status
);
5647 aSig
= extractFloatx80Frac( a
);
5648 aExp
= extractFloatx80Exp( a
);
5649 aSign
= extractFloatx80Sign( a
);
5650 if ( aExp
== 0x7FFF ) {
5651 if ( (uint64_t) ( aSig
<<1 ) ) {
5652 return commonNaNToFloat64(floatx80ToCommonNaN(a
, status
), status
);
5654 return packFloat64( aSign
, 0x7FF, 0 );
5656 shift64RightJamming( aSig
, 1, &zSig
);
5657 if ( aExp
|| aSig
) aExp
-= 0x3C01;
5658 return roundAndPackFloat64(aSign
, aExp
, zSig
, status
);
5662 /*----------------------------------------------------------------------------
5663 | Returns the result of converting the extended double-precision floating-
5664 | point value `a' to the quadruple-precision floating-point format. The
5665 | conversion is performed according to the IEC/IEEE Standard for Binary
5666 | Floating-Point Arithmetic.
5667 *----------------------------------------------------------------------------*/
5669 float128
floatx80_to_float128(floatx80 a
, float_status
*status
)
5673 uint64_t aSig
, zSig0
, zSig1
;
5675 if (floatx80_invalid_encoding(a
)) {
5676 float_raise(float_flag_invalid
, status
);
5677 return float128_default_nan(status
);
5679 aSig
= extractFloatx80Frac( a
);
5680 aExp
= extractFloatx80Exp( a
);
5681 aSign
= extractFloatx80Sign( a
);
5682 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) {
5683 return commonNaNToFloat128(floatx80ToCommonNaN(a
, status
), status
);
5685 shift128Right( aSig
<<1, 0, 16, &zSig0
, &zSig1
);
5686 return packFloat128( aSign
, aExp
, zSig0
, zSig1
);
5690 /*----------------------------------------------------------------------------
5691 | Rounds the extended double-precision floating-point value `a'
5692 | to the precision provided by floatx80_rounding_precision and returns the
5693 | result as an extended double-precision floating-point value.
5694 | The operation is performed according to the IEC/IEEE Standard for Binary
5695 | Floating-Point Arithmetic.
5696 *----------------------------------------------------------------------------*/
5698 floatx80
floatx80_round(floatx80 a
, float_status
*status
)
5700 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5701 extractFloatx80Sign(a
),
5702 extractFloatx80Exp(a
),
5703 extractFloatx80Frac(a
), 0, status
);
5706 /*----------------------------------------------------------------------------
5707 | Rounds the extended double-precision floating-point value `a' to an integer,
5708 | and returns the result as an extended quadruple-precision floating-point
5709 | value. The operation is performed according to the IEC/IEEE Standard for
5710 | Binary Floating-Point Arithmetic.
5711 *----------------------------------------------------------------------------*/
5713 floatx80
floatx80_round_to_int(floatx80 a
, float_status
*status
)
5717 uint64_t lastBitMask
, roundBitsMask
;
5720 if (floatx80_invalid_encoding(a
)) {
5721 float_raise(float_flag_invalid
, status
);
5722 return floatx80_default_nan(status
);
5724 aExp
= extractFloatx80Exp( a
);
5725 if ( 0x403E <= aExp
) {
5726 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) ) {
5727 return propagateFloatx80NaN(a
, a
, status
);
5731 if ( aExp
< 0x3FFF ) {
5733 && ( (uint64_t) ( extractFloatx80Frac( a
)<<1 ) == 0 ) ) {
5736 status
->float_exception_flags
|= float_flag_inexact
;
5737 aSign
= extractFloatx80Sign( a
);
5738 switch (status
->float_rounding_mode
) {
5739 case float_round_nearest_even
:
5740 if ( ( aExp
== 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a
)<<1 )
5743 packFloatx80( aSign
, 0x3FFF, LIT64( 0x8000000000000000 ) );
5746 case float_round_ties_away
:
5747 if (aExp
== 0x3FFE) {
5748 return packFloatx80(aSign
, 0x3FFF, LIT64(0x8000000000000000));
5751 case float_round_down
:
5754 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
5755 : packFloatx80( 0, 0, 0 );
5756 case float_round_up
:
5758 aSign
? packFloatx80( 1, 0, 0 )
5759 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
5761 return packFloatx80( aSign
, 0, 0 );
5764 lastBitMask
<<= 0x403E - aExp
;
5765 roundBitsMask
= lastBitMask
- 1;
5767 switch (status
->float_rounding_mode
) {
5768 case float_round_nearest_even
:
5769 z
.low
+= lastBitMask
>>1;
5770 if ((z
.low
& roundBitsMask
) == 0) {
5771 z
.low
&= ~lastBitMask
;
5774 case float_round_ties_away
:
5775 z
.low
+= lastBitMask
>> 1;
5777 case float_round_to_zero
:
5779 case float_round_up
:
5780 if (!extractFloatx80Sign(z
)) {
5781 z
.low
+= roundBitsMask
;
5784 case float_round_down
:
5785 if (extractFloatx80Sign(z
)) {
5786 z
.low
+= roundBitsMask
;
5792 z
.low
&= ~ roundBitsMask
;
5795 z
.low
= LIT64( 0x8000000000000000 );
5797 if (z
.low
!= a
.low
) {
5798 status
->float_exception_flags
|= float_flag_inexact
;
5804 /*----------------------------------------------------------------------------
5805 | Returns the result of adding the absolute values of the extended double-
5806 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
5807 | negated before being returned. `zSign' is ignored if the result is a NaN.
5808 | The addition is performed according to the IEC/IEEE Standard for Binary
5809 | Floating-Point Arithmetic.
5810 *----------------------------------------------------------------------------*/
5812 static floatx80
addFloatx80Sigs(floatx80 a
, floatx80 b
, flag zSign
,
5813 float_status
*status
)
5815 int32_t aExp
, bExp
, zExp
;
5816 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5819 aSig
= extractFloatx80Frac( a
);
5820 aExp
= extractFloatx80Exp( a
);
5821 bSig
= extractFloatx80Frac( b
);
5822 bExp
= extractFloatx80Exp( b
);
5823 expDiff
= aExp
- bExp
;
5824 if ( 0 < expDiff
) {
5825 if ( aExp
== 0x7FFF ) {
5826 if ((uint64_t)(aSig
<< 1)) {
5827 return propagateFloatx80NaN(a
, b
, status
);
5831 if ( bExp
== 0 ) --expDiff
;
5832 shift64ExtraRightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
5835 else if ( expDiff
< 0 ) {
5836 if ( bExp
== 0x7FFF ) {
5837 if ((uint64_t)(bSig
<< 1)) {
5838 return propagateFloatx80NaN(a
, b
, status
);
5840 return packFloatx80(zSign
,
5841 floatx80_infinity_high
,
5842 floatx80_infinity_low
);
5844 if ( aExp
== 0 ) ++expDiff
;
5845 shift64ExtraRightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
5849 if ( aExp
== 0x7FFF ) {
5850 if ( (uint64_t) ( ( aSig
| bSig
)<<1 ) ) {
5851 return propagateFloatx80NaN(a
, b
, status
);
5856 zSig0
= aSig
+ bSig
;
5858 normalizeFloatx80Subnormal( zSig0
, &zExp
, &zSig0
);
5864 zSig0
= aSig
+ bSig
;
5865 if ( (int64_t) zSig0
< 0 ) goto roundAndPack
;
5867 shift64ExtraRightJamming( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
5868 zSig0
|= LIT64( 0x8000000000000000 );
5871 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5872 zSign
, zExp
, zSig0
, zSig1
, status
);
5875 /*----------------------------------------------------------------------------
5876 | Returns the result of subtracting the absolute values of the extended
5877 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
5878 | difference is negated before being returned. `zSign' is ignored if the
5879 | result is a NaN. The subtraction is performed according to the IEC/IEEE
5880 | Standard for Binary Floating-Point Arithmetic.
5881 *----------------------------------------------------------------------------*/
5883 static floatx80
subFloatx80Sigs(floatx80 a
, floatx80 b
, flag zSign
,
5884 float_status
*status
)
5886 int32_t aExp
, bExp
, zExp
;
5887 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5890 aSig
= extractFloatx80Frac( a
);
5891 aExp
= extractFloatx80Exp( a
);
5892 bSig
= extractFloatx80Frac( b
);
5893 bExp
= extractFloatx80Exp( b
);
5894 expDiff
= aExp
- bExp
;
5895 if ( 0 < expDiff
) goto aExpBigger
;
5896 if ( expDiff
< 0 ) goto bExpBigger
;
5897 if ( aExp
== 0x7FFF ) {
5898 if ( (uint64_t) ( ( aSig
| bSig
)<<1 ) ) {
5899 return propagateFloatx80NaN(a
, b
, status
);
5901 float_raise(float_flag_invalid
, status
);
5902 return floatx80_default_nan(status
);
5909 if ( bSig
< aSig
) goto aBigger
;
5910 if ( aSig
< bSig
) goto bBigger
;
5911 return packFloatx80(status
->float_rounding_mode
== float_round_down
, 0, 0);
5913 if ( bExp
== 0x7FFF ) {
5914 if ((uint64_t)(bSig
<< 1)) {
5915 return propagateFloatx80NaN(a
, b
, status
);
5917 return packFloatx80(zSign
^ 1, floatx80_infinity_high
,
5918 floatx80_infinity_low
);
5920 if ( aExp
== 0 ) ++expDiff
;
5921 shift128RightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
5923 sub128( bSig
, 0, aSig
, zSig1
, &zSig0
, &zSig1
);
5926 goto normalizeRoundAndPack
;
5928 if ( aExp
== 0x7FFF ) {
5929 if ((uint64_t)(aSig
<< 1)) {
5930 return propagateFloatx80NaN(a
, b
, status
);
5934 if ( bExp
== 0 ) --expDiff
;
5935 shift128RightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
5937 sub128( aSig
, 0, bSig
, zSig1
, &zSig0
, &zSig1
);
5939 normalizeRoundAndPack
:
5940 return normalizeRoundAndPackFloatx80(status
->floatx80_rounding_precision
,
5941 zSign
, zExp
, zSig0
, zSig1
, status
);
5944 /*----------------------------------------------------------------------------
5945 | Returns the result of adding the extended double-precision floating-point
5946 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5947 | Standard for Binary Floating-Point Arithmetic.
5948 *----------------------------------------------------------------------------*/
5950 floatx80
floatx80_add(floatx80 a
, floatx80 b
, float_status
*status
)
5954 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5955 float_raise(float_flag_invalid
, status
);
5956 return floatx80_default_nan(status
);
5958 aSign
= extractFloatx80Sign( a
);
5959 bSign
= extractFloatx80Sign( b
);
5960 if ( aSign
== bSign
) {
5961 return addFloatx80Sigs(a
, b
, aSign
, status
);
5964 return subFloatx80Sigs(a
, b
, aSign
, status
);
5969 /*----------------------------------------------------------------------------
5970 | Returns the result of subtracting the extended double-precision floating-
5971 | point values `a' and `b'. The operation is performed according to the
5972 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5973 *----------------------------------------------------------------------------*/
5975 floatx80
floatx80_sub(floatx80 a
, floatx80 b
, float_status
*status
)
5979 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5980 float_raise(float_flag_invalid
, status
);
5981 return floatx80_default_nan(status
);
5983 aSign
= extractFloatx80Sign( a
);
5984 bSign
= extractFloatx80Sign( b
);
5985 if ( aSign
== bSign
) {
5986 return subFloatx80Sigs(a
, b
, aSign
, status
);
5989 return addFloatx80Sigs(a
, b
, aSign
, status
);
5994 /*----------------------------------------------------------------------------
5995 | Returns the result of multiplying the extended double-precision floating-
5996 | point values `a' and `b'. The operation is performed according to the
5997 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5998 *----------------------------------------------------------------------------*/
6000 floatx80
floatx80_mul(floatx80 a
, floatx80 b
, float_status
*status
)
6002 flag aSign
, bSign
, zSign
;
6003 int32_t aExp
, bExp
, zExp
;
6004 uint64_t aSig
, bSig
, zSig0
, zSig1
;
6006 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6007 float_raise(float_flag_invalid
, status
);
6008 return floatx80_default_nan(status
);
6010 aSig
= extractFloatx80Frac( a
);
6011 aExp
= extractFloatx80Exp( a
);
6012 aSign
= extractFloatx80Sign( a
);
6013 bSig
= extractFloatx80Frac( b
);
6014 bExp
= extractFloatx80Exp( b
);
6015 bSign
= extractFloatx80Sign( b
);
6016 zSign
= aSign
^ bSign
;
6017 if ( aExp
== 0x7FFF ) {
6018 if ( (uint64_t) ( aSig
<<1 )
6019 || ( ( bExp
== 0x7FFF ) && (uint64_t) ( bSig
<<1 ) ) ) {
6020 return propagateFloatx80NaN(a
, b
, status
);
6022 if ( ( bExp
| bSig
) == 0 ) goto invalid
;
6023 return packFloatx80(zSign
, floatx80_infinity_high
,
6024 floatx80_infinity_low
);
6026 if ( bExp
== 0x7FFF ) {
6027 if ((uint64_t)(bSig
<< 1)) {
6028 return propagateFloatx80NaN(a
, b
, status
);
6030 if ( ( aExp
| aSig
) == 0 ) {
6032 float_raise(float_flag_invalid
, status
);
6033 return floatx80_default_nan(status
);
6035 return packFloatx80(zSign
, floatx80_infinity_high
,
6036 floatx80_infinity_low
);
6039 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
6040 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
6043 if ( bSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
6044 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
6046 zExp
= aExp
+ bExp
- 0x3FFE;
6047 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
6048 if ( 0 < (int64_t) zSig0
) {
6049 shortShift128Left( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
6052 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
6053 zSign
, zExp
, zSig0
, zSig1
, status
);
6056 /*----------------------------------------------------------------------------
6057 | Returns the result of dividing the extended double-precision floating-point
6058 | value `a' by the corresponding value `b'. The operation is performed
6059 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6060 *----------------------------------------------------------------------------*/
6062 floatx80
floatx80_div(floatx80 a
, floatx80 b
, float_status
*status
)
6064 flag aSign
, bSign
, zSign
;
6065 int32_t aExp
, bExp
, zExp
;
6066 uint64_t aSig
, bSig
, zSig0
, zSig1
;
6067 uint64_t rem0
, rem1
, rem2
, term0
, term1
, term2
;
6069 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6070 float_raise(float_flag_invalid
, status
);
6071 return floatx80_default_nan(status
);
6073 aSig
= extractFloatx80Frac( a
);
6074 aExp
= extractFloatx80Exp( a
);
6075 aSign
= extractFloatx80Sign( a
);
6076 bSig
= extractFloatx80Frac( b
);
6077 bExp
= extractFloatx80Exp( b
);
6078 bSign
= extractFloatx80Sign( b
);
6079 zSign
= aSign
^ bSign
;
6080 if ( aExp
== 0x7FFF ) {
6081 if ((uint64_t)(aSig
<< 1)) {
6082 return propagateFloatx80NaN(a
, b
, status
);
6084 if ( bExp
== 0x7FFF ) {
6085 if ((uint64_t)(bSig
<< 1)) {
6086 return propagateFloatx80NaN(a
, b
, status
);
6090 return packFloatx80(zSign
, floatx80_infinity_high
,
6091 floatx80_infinity_low
);
6093 if ( bExp
== 0x7FFF ) {
6094 if ((uint64_t)(bSig
<< 1)) {
6095 return propagateFloatx80NaN(a
, b
, status
);
6097 return packFloatx80( zSign
, 0, 0 );
6101 if ( ( aExp
| aSig
) == 0 ) {
6103 float_raise(float_flag_invalid
, status
);
6104 return floatx80_default_nan(status
);
6106 float_raise(float_flag_divbyzero
, status
);
6107 return packFloatx80(zSign
, floatx80_infinity_high
,
6108 floatx80_infinity_low
);
6110 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
6113 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
6114 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
6116 zExp
= aExp
- bExp
+ 0x3FFE;
6118 if ( bSig
<= aSig
) {
6119 shift128Right( aSig
, 0, 1, &aSig
, &rem1
);
6122 zSig0
= estimateDiv128To64( aSig
, rem1
, bSig
);
6123 mul64To128( bSig
, zSig0
, &term0
, &term1
);
6124 sub128( aSig
, rem1
, term0
, term1
, &rem0
, &rem1
);
6125 while ( (int64_t) rem0
< 0 ) {
6127 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
6129 zSig1
= estimateDiv128To64( rem1
, 0, bSig
);
6130 if ( (uint64_t) ( zSig1
<<1 ) <= 8 ) {
6131 mul64To128( bSig
, zSig1
, &term1
, &term2
);
6132 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
6133 while ( (int64_t) rem1
< 0 ) {
6135 add128( rem1
, rem2
, 0, bSig
, &rem1
, &rem2
);
6137 zSig1
|= ( ( rem1
| rem2
) != 0 );
6139 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
6140 zSign
, zExp
, zSig0
, zSig1
, status
);
6143 /*----------------------------------------------------------------------------
6144 | Returns the remainder of the extended double-precision floating-point value
6145 | `a' with respect to the corresponding value `b'. The operation is performed
6146 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6147 *----------------------------------------------------------------------------*/
6149 floatx80
floatx80_rem(floatx80 a
, floatx80 b
, float_status
*status
)
6152 int32_t aExp
, bExp
, expDiff
;
6153 uint64_t aSig0
, aSig1
, bSig
;
6154 uint64_t q
, term0
, term1
, alternateASig0
, alternateASig1
;
6156 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6157 float_raise(float_flag_invalid
, status
);
6158 return floatx80_default_nan(status
);
6160 aSig0
= extractFloatx80Frac( a
);
6161 aExp
= extractFloatx80Exp( a
);
6162 aSign
= extractFloatx80Sign( a
);
6163 bSig
= extractFloatx80Frac( b
);
6164 bExp
= extractFloatx80Exp( b
);
6165 if ( aExp
== 0x7FFF ) {
6166 if ( (uint64_t) ( aSig0
<<1 )
6167 || ( ( bExp
== 0x7FFF ) && (uint64_t) ( bSig
<<1 ) ) ) {
6168 return propagateFloatx80NaN(a
, b
, status
);
6172 if ( bExp
== 0x7FFF ) {
6173 if ((uint64_t)(bSig
<< 1)) {
6174 return propagateFloatx80NaN(a
, b
, status
);
6181 float_raise(float_flag_invalid
, status
);
6182 return floatx80_default_nan(status
);
6184 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
6187 if ( (uint64_t) ( aSig0
<<1 ) == 0 ) return a
;
6188 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
6190 bSig
|= LIT64( 0x8000000000000000 );
6192 expDiff
= aExp
- bExp
;
6194 if ( expDiff
< 0 ) {
6195 if ( expDiff
< -1 ) return a
;
6196 shift128Right( aSig0
, 0, 1, &aSig0
, &aSig1
);
6199 q
= ( bSig
<= aSig0
);
6200 if ( q
) aSig0
-= bSig
;
6202 while ( 0 < expDiff
) {
6203 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
6204 q
= ( 2 < q
) ? q
- 2 : 0;
6205 mul64To128( bSig
, q
, &term0
, &term1
);
6206 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
6207 shortShift128Left( aSig0
, aSig1
, 62, &aSig0
, &aSig1
);
6211 if ( 0 < expDiff
) {
6212 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
6213 q
= ( 2 < q
) ? q
- 2 : 0;
6215 mul64To128( bSig
, q
<<( 64 - expDiff
), &term0
, &term1
);
6216 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
6217 shortShift128Left( 0, bSig
, 64 - expDiff
, &term0
, &term1
);
6218 while ( le128( term0
, term1
, aSig0
, aSig1
) ) {
6220 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
6227 sub128( term0
, term1
, aSig0
, aSig1
, &alternateASig0
, &alternateASig1
);
6228 if ( lt128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
6229 || ( eq128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
6232 aSig0
= alternateASig0
;
6233 aSig1
= alternateASig1
;
6237 normalizeRoundAndPackFloatx80(
6238 80, zSign
, bExp
+ expDiff
, aSig0
, aSig1
, status
);
6242 /*----------------------------------------------------------------------------
6243 | Returns the square root of the extended double-precision floating-point
6244 | value `a'. The operation is performed according to the IEC/IEEE Standard
6245 | for Binary Floating-Point Arithmetic.
6246 *----------------------------------------------------------------------------*/
6248 floatx80
floatx80_sqrt(floatx80 a
, float_status
*status
)
6252 uint64_t aSig0
, aSig1
, zSig0
, zSig1
, doubleZSig0
;
6253 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
6255 if (floatx80_invalid_encoding(a
)) {
6256 float_raise(float_flag_invalid
, status
);
6257 return floatx80_default_nan(status
);
6259 aSig0
= extractFloatx80Frac( a
);
6260 aExp
= extractFloatx80Exp( a
);
6261 aSign
= extractFloatx80Sign( a
);
6262 if ( aExp
== 0x7FFF ) {
6263 if ((uint64_t)(aSig0
<< 1)) {
6264 return propagateFloatx80NaN(a
, a
, status
);
6266 if ( ! aSign
) return a
;
6270 if ( ( aExp
| aSig0
) == 0 ) return a
;
6272 float_raise(float_flag_invalid
, status
);
6273 return floatx80_default_nan(status
);
6276 if ( aSig0
== 0 ) return packFloatx80( 0, 0, 0 );
6277 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
6279 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFF;
6280 zSig0
= estimateSqrt32( aExp
, aSig0
>>32 );
6281 shift128Right( aSig0
, 0, 2 + ( aExp
& 1 ), &aSig0
, &aSig1
);
6282 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
6283 doubleZSig0
= zSig0
<<1;
6284 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
6285 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
6286 while ( (int64_t) rem0
< 0 ) {
6289 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
6291 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
6292 if ( ( zSig1
& LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
6293 if ( zSig1
== 0 ) zSig1
= 1;
6294 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
6295 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
6296 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
6297 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
6298 while ( (int64_t) rem1
< 0 ) {
6300 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
6302 term2
|= doubleZSig0
;
6303 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
6305 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
6307 shortShift128Left( 0, zSig1
, 1, &zSig0
, &zSig1
);
6308 zSig0
|= doubleZSig0
;
6309 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
6310 0, zExp
, zSig0
, zSig1
, status
);
6313 /*----------------------------------------------------------------------------
6314 | Returns 1 if the extended double-precision floating-point value `a' is equal
6315 | to the corresponding value `b', and 0 otherwise. The invalid exception is
6316 | raised if either operand is a NaN. Otherwise, the comparison is performed
6317 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6318 *----------------------------------------------------------------------------*/
6320 int floatx80_eq(floatx80 a
, floatx80 b
, float_status
*status
)
6323 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)
6324 || (extractFloatx80Exp(a
) == 0x7FFF
6325 && (uint64_t) (extractFloatx80Frac(a
) << 1))
6326 || (extractFloatx80Exp(b
) == 0x7FFF
6327 && (uint64_t) (extractFloatx80Frac(b
) << 1))
6329 float_raise(float_flag_invalid
, status
);
6334 && ( ( a
.high
== b
.high
)
6336 && ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
6341 /*----------------------------------------------------------------------------
6342 | Returns 1 if the extended double-precision floating-point value `a' is
6343 | less than or equal to the corresponding value `b', and 0 otherwise. The
6344 | invalid exception is raised if either operand is a NaN. The comparison is
6345 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6347 *----------------------------------------------------------------------------*/
6349 int floatx80_le(floatx80 a
, floatx80 b
, float_status
*status
)
6353 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)
6354 || (extractFloatx80Exp(a
) == 0x7FFF
6355 && (uint64_t) (extractFloatx80Frac(a
) << 1))
6356 || (extractFloatx80Exp(b
) == 0x7FFF
6357 && (uint64_t) (extractFloatx80Frac(b
) << 1))
6359 float_raise(float_flag_invalid
, status
);
6362 aSign
= extractFloatx80Sign( a
);
6363 bSign
= extractFloatx80Sign( b
);
6364 if ( aSign
!= bSign
) {
6367 || ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
6371 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
6372 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
6376 /*----------------------------------------------------------------------------
6377 | Returns 1 if the extended double-precision floating-point value `a' is
6378 | less than the corresponding value `b', and 0 otherwise. The invalid
6379 | exception is raised if either operand is a NaN. The comparison is performed
6380 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6381 *----------------------------------------------------------------------------*/
6383 int floatx80_lt(floatx80 a
, floatx80 b
, float_status
*status
)
6387 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)
6388 || (extractFloatx80Exp(a
) == 0x7FFF
6389 && (uint64_t) (extractFloatx80Frac(a
) << 1))
6390 || (extractFloatx80Exp(b
) == 0x7FFF
6391 && (uint64_t) (extractFloatx80Frac(b
) << 1))
6393 float_raise(float_flag_invalid
, status
);
6396 aSign
= extractFloatx80Sign( a
);
6397 bSign
= extractFloatx80Sign( b
);
6398 if ( aSign
!= bSign
) {
6401 && ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
6405 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
6406 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
6410 /*----------------------------------------------------------------------------
6411 | Returns 1 if the extended double-precision floating-point values `a' and `b'
6412 | cannot be compared, and 0 otherwise. The invalid exception is raised if
6413 | either operand is a NaN. The comparison is performed according to the
6414 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6415 *----------------------------------------------------------------------------*/
6416 int floatx80_unordered(floatx80 a
, floatx80 b
, float_status
*status
)
6418 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)
6419 || (extractFloatx80Exp(a
) == 0x7FFF
6420 && (uint64_t) (extractFloatx80Frac(a
) << 1))
6421 || (extractFloatx80Exp(b
) == 0x7FFF
6422 && (uint64_t) (extractFloatx80Frac(b
) << 1))
6424 float_raise(float_flag_invalid
, status
);
6430 /*----------------------------------------------------------------------------
6431 | Returns 1 if the extended double-precision floating-point value `a' is
6432 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
6433 | cause an exception. The comparison is performed according to the IEC/IEEE
6434 | Standard for Binary Floating-Point Arithmetic.
6435 *----------------------------------------------------------------------------*/
6437 int floatx80_eq_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
6440 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6441 float_raise(float_flag_invalid
, status
);
6444 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
6445 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
6446 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
6447 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
6449 if (floatx80_is_signaling_nan(a
, status
)
6450 || floatx80_is_signaling_nan(b
, status
)) {
6451 float_raise(float_flag_invalid
, status
);
6457 && ( ( a
.high
== b
.high
)
6459 && ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
6464 /*----------------------------------------------------------------------------
6465 | Returns 1 if the extended double-precision floating-point value `a' is less
6466 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
6467 | do not cause an exception. Otherwise, the comparison is performed according
6468 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6469 *----------------------------------------------------------------------------*/
6471 int floatx80_le_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
6475 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6476 float_raise(float_flag_invalid
, status
);
6479 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
6480 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
6481 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
6482 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
6484 if (floatx80_is_signaling_nan(a
, status
)
6485 || floatx80_is_signaling_nan(b
, status
)) {
6486 float_raise(float_flag_invalid
, status
);
6490 aSign
= extractFloatx80Sign( a
);
6491 bSign
= extractFloatx80Sign( b
);
6492 if ( aSign
!= bSign
) {
6495 || ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
6499 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
6500 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
6504 /*----------------------------------------------------------------------------
6505 | Returns 1 if the extended double-precision floating-point value `a' is less
6506 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
6507 | an exception. Otherwise, the comparison is performed according to the
6508 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6509 *----------------------------------------------------------------------------*/
6511 int floatx80_lt_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
6515 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6516 float_raise(float_flag_invalid
, status
);
6519 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
6520 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
6521 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
6522 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
6524 if (floatx80_is_signaling_nan(a
, status
)
6525 || floatx80_is_signaling_nan(b
, status
)) {
6526 float_raise(float_flag_invalid
, status
);
6530 aSign
= extractFloatx80Sign( a
);
6531 bSign
= extractFloatx80Sign( b
);
6532 if ( aSign
!= bSign
) {
6535 && ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
6539 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
6540 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
6544 /*----------------------------------------------------------------------------
6545 | Returns 1 if the extended double-precision floating-point values `a' and `b'
6546 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
6547 | The comparison is performed according to the IEC/IEEE Standard for Binary
6548 | Floating-Point Arithmetic.
6549 *----------------------------------------------------------------------------*/
6550 int floatx80_unordered_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
6552 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6553 float_raise(float_flag_invalid
, status
);
6556 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
6557 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
6558 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
6559 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
6561 if (floatx80_is_signaling_nan(a
, status
)
6562 || floatx80_is_signaling_nan(b
, status
)) {
6563 float_raise(float_flag_invalid
, status
);
6570 /*----------------------------------------------------------------------------
6571 | Returns the result of converting the quadruple-precision floating-point
6572 | value `a' to the 32-bit two's complement integer format. The conversion
6573 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6574 | Arithmetic---which means in particular that the conversion is rounded
6575 | according to the current rounding mode. If `a' is a NaN, the largest
6576 | positive integer is returned. Otherwise, if the conversion overflows, the
6577 | largest integer with the same sign as `a' is returned.
6578 *----------------------------------------------------------------------------*/
6580 int32_t float128_to_int32(float128 a
, float_status
*status
)
6583 int32_t aExp
, shiftCount
;
6584 uint64_t aSig0
, aSig1
;
6586 aSig1
= extractFloat128Frac1( a
);
6587 aSig0
= extractFloat128Frac0( a
);
6588 aExp
= extractFloat128Exp( a
);
6589 aSign
= extractFloat128Sign( a
);
6590 if ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) aSign
= 0;
6591 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
6592 aSig0
|= ( aSig1
!= 0 );
6593 shiftCount
= 0x4028 - aExp
;
6594 if ( 0 < shiftCount
) shift64RightJamming( aSig0
, shiftCount
, &aSig0
);
6595 return roundAndPackInt32(aSign
, aSig0
, status
);
6599 /*----------------------------------------------------------------------------
6600 | Returns the result of converting the quadruple-precision floating-point
6601 | value `a' to the 32-bit two's complement integer format. The conversion
6602 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6603 | Arithmetic, except that the conversion is always rounded toward zero. If
6604 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
6605 | conversion overflows, the largest integer with the same sign as `a' is
6607 *----------------------------------------------------------------------------*/
6609 int32_t float128_to_int32_round_to_zero(float128 a
, float_status
*status
)
6612 int32_t aExp
, shiftCount
;
6613 uint64_t aSig0
, aSig1
, savedASig
;
6616 aSig1
= extractFloat128Frac1( a
);
6617 aSig0
= extractFloat128Frac0( a
);
6618 aExp
= extractFloat128Exp( a
);
6619 aSign
= extractFloat128Sign( a
);
6620 aSig0
|= ( aSig1
!= 0 );
6621 if ( 0x401E < aExp
) {
6622 if ( ( aExp
== 0x7FFF ) && aSig0
) aSign
= 0;
6625 else if ( aExp
< 0x3FFF ) {
6626 if (aExp
|| aSig0
) {
6627 status
->float_exception_flags
|= float_flag_inexact
;
6631 aSig0
|= LIT64( 0x0001000000000000 );
6632 shiftCount
= 0x402F - aExp
;
6634 aSig0
>>= shiftCount
;
6636 if ( aSign
) z
= - z
;
6637 if ( ( z
< 0 ) ^ aSign
) {
6639 float_raise(float_flag_invalid
, status
);
6640 return aSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
6642 if ( ( aSig0
<<shiftCount
) != savedASig
) {
6643 status
->float_exception_flags
|= float_flag_inexact
;
6649 /*----------------------------------------------------------------------------
6650 | Returns the result of converting the quadruple-precision floating-point
6651 | value `a' to the 64-bit two's complement integer format. The conversion
6652 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6653 | Arithmetic---which means in particular that the conversion is rounded
6654 | according to the current rounding mode. If `a' is a NaN, the largest
6655 | positive integer is returned. Otherwise, if the conversion overflows, the
6656 | largest integer with the same sign as `a' is returned.
6657 *----------------------------------------------------------------------------*/
6659 int64_t float128_to_int64(float128 a
, float_status
*status
)
6662 int32_t aExp
, shiftCount
;
6663 uint64_t aSig0
, aSig1
;
6665 aSig1
= extractFloat128Frac1( a
);
6666 aSig0
= extractFloat128Frac0( a
);
6667 aExp
= extractFloat128Exp( a
);
6668 aSign
= extractFloat128Sign( a
);
6669 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
6670 shiftCount
= 0x402F - aExp
;
6671 if ( shiftCount
<= 0 ) {
6672 if ( 0x403E < aExp
) {
6673 float_raise(float_flag_invalid
, status
);
6675 || ( ( aExp
== 0x7FFF )
6676 && ( aSig1
|| ( aSig0
!= LIT64( 0x0001000000000000 ) ) )
6679 return LIT64( 0x7FFFFFFFFFFFFFFF );
6681 return (int64_t) LIT64( 0x8000000000000000 );
6683 shortShift128Left( aSig0
, aSig1
, - shiftCount
, &aSig0
, &aSig1
);
6686 shift64ExtraRightJamming( aSig0
, aSig1
, shiftCount
, &aSig0
, &aSig1
);
6688 return roundAndPackInt64(aSign
, aSig0
, aSig1
, status
);
6692 /*----------------------------------------------------------------------------
6693 | Returns the result of converting the quadruple-precision floating-point
6694 | value `a' to the 64-bit two's complement integer format. The conversion
6695 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6696 | Arithmetic, except that the conversion is always rounded toward zero.
6697 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
6698 | the conversion overflows, the largest integer with the same sign as `a' is
6700 *----------------------------------------------------------------------------*/
6702 int64_t float128_to_int64_round_to_zero(float128 a
, float_status
*status
)
6705 int32_t aExp
, shiftCount
;
6706 uint64_t aSig0
, aSig1
;
6709 aSig1
= extractFloat128Frac1( a
);
6710 aSig0
= extractFloat128Frac0( a
);
6711 aExp
= extractFloat128Exp( a
);
6712 aSign
= extractFloat128Sign( a
);
6713 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
6714 shiftCount
= aExp
- 0x402F;
6715 if ( 0 < shiftCount
) {
6716 if ( 0x403E <= aExp
) {
6717 aSig0
&= LIT64( 0x0000FFFFFFFFFFFF );
6718 if ( ( a
.high
== LIT64( 0xC03E000000000000 ) )
6719 && ( aSig1
< LIT64( 0x0002000000000000 ) ) ) {
6721 status
->float_exception_flags
|= float_flag_inexact
;
6725 float_raise(float_flag_invalid
, status
);
6726 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) ) {
6727 return LIT64( 0x7FFFFFFFFFFFFFFF );
6730 return (int64_t) LIT64( 0x8000000000000000 );
6732 z
= ( aSig0
<<shiftCount
) | ( aSig1
>>( ( - shiftCount
) & 63 ) );
6733 if ( (uint64_t) ( aSig1
<<shiftCount
) ) {
6734 status
->float_exception_flags
|= float_flag_inexact
;
6738 if ( aExp
< 0x3FFF ) {
6739 if ( aExp
| aSig0
| aSig1
) {
6740 status
->float_exception_flags
|= float_flag_inexact
;
6744 z
= aSig0
>>( - shiftCount
);
6746 || ( shiftCount
&& (uint64_t) ( aSig0
<<( shiftCount
& 63 ) ) ) ) {
6747 status
->float_exception_flags
|= float_flag_inexact
;
6750 if ( aSign
) z
= - z
;
6755 /*----------------------------------------------------------------------------
6756 | Returns the result of converting the quadruple-precision floating-point value
6757 | `a' to the 64-bit unsigned integer format. The conversion is
6758 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6759 | Arithmetic---which means in particular that the conversion is rounded
6760 | according to the current rounding mode. If `a' is a NaN, the largest
6761 | positive integer is returned. If the conversion overflows, the
6762 | largest unsigned integer is returned. If 'a' is negative, the value is
6763 | rounded and zero is returned; negative values that do not round to zero
6764 | will raise the inexact exception.
6765 *----------------------------------------------------------------------------*/
6767 uint64_t float128_to_uint64(float128 a
, float_status
*status
)
6772 uint64_t aSig0
, aSig1
;
6774 aSig0
= extractFloat128Frac0(a
);
6775 aSig1
= extractFloat128Frac1(a
);
6776 aExp
= extractFloat128Exp(a
);
6777 aSign
= extractFloat128Sign(a
);
6778 if (aSign
&& (aExp
> 0x3FFE)) {
6779 float_raise(float_flag_invalid
, status
);
6780 if (float128_is_any_nan(a
)) {
6781 return LIT64(0xFFFFFFFFFFFFFFFF);
6787 aSig0
|= LIT64(0x0001000000000000);
6789 shiftCount
= 0x402F - aExp
;
6790 if (shiftCount
<= 0) {
6791 if (0x403E < aExp
) {
6792 float_raise(float_flag_invalid
, status
);
6793 return LIT64(0xFFFFFFFFFFFFFFFF);
6795 shortShift128Left(aSig0
, aSig1
, -shiftCount
, &aSig0
, &aSig1
);
6797 shift64ExtraRightJamming(aSig0
, aSig1
, shiftCount
, &aSig0
, &aSig1
);
6799 return roundAndPackUint64(aSign
, aSig0
, aSig1
, status
);
6802 uint64_t float128_to_uint64_round_to_zero(float128 a
, float_status
*status
)
6805 signed char current_rounding_mode
= status
->float_rounding_mode
;
6807 set_float_rounding_mode(float_round_to_zero
, status
);
6808 v
= float128_to_uint64(a
, status
);
6809 set_float_rounding_mode(current_rounding_mode
, status
);
6814 /*----------------------------------------------------------------------------
6815 | Returns the result of converting the quadruple-precision floating-point
6816 | value `a' to the 32-bit unsigned integer format. The conversion
6817 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6818 | Arithmetic except that the conversion is always rounded toward zero.
6819 | If `a' is a NaN, the largest positive integer is returned. Otherwise,
6820 | if the conversion overflows, the largest unsigned integer is returned.
6821 | If 'a' is negative, the value is rounded and zero is returned; negative
6822 | values that do not round to zero will raise the inexact exception.
6823 *----------------------------------------------------------------------------*/
6825 uint32_t float128_to_uint32_round_to_zero(float128 a
, float_status
*status
)
6829 int old_exc_flags
= get_float_exception_flags(status
);
6831 v
= float128_to_uint64_round_to_zero(a
, status
);
6832 if (v
> 0xffffffff) {
6837 set_float_exception_flags(old_exc_flags
, status
);
6838 float_raise(float_flag_invalid
, status
);
6842 /*----------------------------------------------------------------------------
6843 | Returns the result of converting the quadruple-precision floating-point value
6844 | `a' to the 32-bit unsigned integer format. The conversion is
6845 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6846 | Arithmetic---which means in particular that the conversion is rounded
6847 | according to the current rounding mode. If `a' is a NaN, the largest
6848 | positive integer is returned. If the conversion overflows, the
6849 | largest unsigned integer is returned. If 'a' is negative, the value is
6850 | rounded and zero is returned; negative values that do not round to zero
6851 | will raise the inexact exception.
6852 *----------------------------------------------------------------------------*/
6854 uint32_t float128_to_uint32(float128 a
, float_status
*status
)
6858 int old_exc_flags
= get_float_exception_flags(status
);
6860 v
= float128_to_uint64(a
, status
);
6861 if (v
> 0xffffffff) {
6866 set_float_exception_flags(old_exc_flags
, status
);
6867 float_raise(float_flag_invalid
, status
);
6871 /*----------------------------------------------------------------------------
6872 | Returns the result of converting the quadruple-precision floating-point
6873 | value `a' to the single-precision floating-point format. The conversion
6874 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6876 *----------------------------------------------------------------------------*/
6878 float32
float128_to_float32(float128 a
, float_status
*status
)
6882 uint64_t aSig0
, aSig1
;
6885 aSig1
= extractFloat128Frac1( a
);
6886 aSig0
= extractFloat128Frac0( a
);
6887 aExp
= extractFloat128Exp( a
);
6888 aSign
= extractFloat128Sign( a
);
6889 if ( aExp
== 0x7FFF ) {
6890 if ( aSig0
| aSig1
) {
6891 return commonNaNToFloat32(float128ToCommonNaN(a
, status
), status
);
6893 return packFloat32( aSign
, 0xFF, 0 );
6895 aSig0
|= ( aSig1
!= 0 );
6896 shift64RightJamming( aSig0
, 18, &aSig0
);
6898 if ( aExp
|| zSig
) {
6902 return roundAndPackFloat32(aSign
, aExp
, zSig
, status
);
6906 /*----------------------------------------------------------------------------
6907 | Returns the result of converting the quadruple-precision floating-point
6908 | value `a' to the double-precision floating-point format. The conversion
6909 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6911 *----------------------------------------------------------------------------*/
6913 float64
float128_to_float64(float128 a
, float_status
*status
)
6917 uint64_t aSig0
, aSig1
;
6919 aSig1
= extractFloat128Frac1( a
);
6920 aSig0
= extractFloat128Frac0( a
);
6921 aExp
= extractFloat128Exp( a
);
6922 aSign
= extractFloat128Sign( a
);
6923 if ( aExp
== 0x7FFF ) {
6924 if ( aSig0
| aSig1
) {
6925 return commonNaNToFloat64(float128ToCommonNaN(a
, status
), status
);
6927 return packFloat64( aSign
, 0x7FF, 0 );
6929 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
6930 aSig0
|= ( aSig1
!= 0 );
6931 if ( aExp
|| aSig0
) {
6932 aSig0
|= LIT64( 0x4000000000000000 );
6935 return roundAndPackFloat64(aSign
, aExp
, aSig0
, status
);
6939 /*----------------------------------------------------------------------------
6940 | Returns the result of converting the quadruple-precision floating-point
6941 | value `a' to the extended double-precision floating-point format. The
6942 | conversion is performed according to the IEC/IEEE Standard for Binary
6943 | Floating-Point Arithmetic.
6944 *----------------------------------------------------------------------------*/
6946 floatx80
float128_to_floatx80(float128 a
, float_status
*status
)
6950 uint64_t aSig0
, aSig1
;
6952 aSig1
= extractFloat128Frac1( a
);
6953 aSig0
= extractFloat128Frac0( a
);
6954 aExp
= extractFloat128Exp( a
);
6955 aSign
= extractFloat128Sign( a
);
6956 if ( aExp
== 0x7FFF ) {
6957 if ( aSig0
| aSig1
) {
6958 return commonNaNToFloatx80(float128ToCommonNaN(a
, status
), status
);
6960 return packFloatx80(aSign
, floatx80_infinity_high
,
6961 floatx80_infinity_low
);
6964 if ( ( aSig0
| aSig1
) == 0 ) return packFloatx80( aSign
, 0, 0 );
6965 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6968 aSig0
|= LIT64( 0x0001000000000000 );
6970 shortShift128Left( aSig0
, aSig1
, 15, &aSig0
, &aSig1
);
6971 return roundAndPackFloatx80(80, aSign
, aExp
, aSig0
, aSig1
, status
);
6975 /*----------------------------------------------------------------------------
6976 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6977 | returns the result as a quadruple-precision floating-point value. The
6978 | operation is performed according to the IEC/IEEE Standard for Binary
6979 | Floating-Point Arithmetic.
6980 *----------------------------------------------------------------------------*/
6982 float128
float128_round_to_int(float128 a
, float_status
*status
)
6986 uint64_t lastBitMask
, roundBitsMask
;
6989 aExp
= extractFloat128Exp( a
);
6990 if ( 0x402F <= aExp
) {
6991 if ( 0x406F <= aExp
) {
6992 if ( ( aExp
== 0x7FFF )
6993 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) )
6995 return propagateFloat128NaN(a
, a
, status
);
7000 lastBitMask
= ( lastBitMask
<<( 0x406E - aExp
) )<<1;
7001 roundBitsMask
= lastBitMask
- 1;
7003 switch (status
->float_rounding_mode
) {
7004 case float_round_nearest_even
:
7005 if ( lastBitMask
) {
7006 add128( z
.high
, z
.low
, 0, lastBitMask
>>1, &z
.high
, &z
.low
);
7007 if ( ( z
.low
& roundBitsMask
) == 0 ) z
.low
&= ~ lastBitMask
;
7010 if ( (int64_t) z
.low
< 0 ) {
7012 if ( (uint64_t) ( z
.low
<<1 ) == 0 ) z
.high
&= ~1;
7016 case float_round_ties_away
:
7018 add128(z
.high
, z
.low
, 0, lastBitMask
>> 1, &z
.high
, &z
.low
);
7020 if ((int64_t) z
.low
< 0) {
7025 case float_round_to_zero
:
7027 case float_round_up
:
7028 if (!extractFloat128Sign(z
)) {
7029 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
7032 case float_round_down
:
7033 if (extractFloat128Sign(z
)) {
7034 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
7037 case float_round_to_odd
:
7039 * Note that if lastBitMask == 0, the last bit is the lsb
7040 * of high, and roundBitsMask == -1.
7042 if ((lastBitMask
? z
.low
& lastBitMask
: z
.high
& 1) == 0) {
7043 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
7049 z
.low
&= ~ roundBitsMask
;
7052 if ( aExp
< 0x3FFF ) {
7053 if ( ( ( (uint64_t) ( a
.high
<<1 ) ) | a
.low
) == 0 ) return a
;
7054 status
->float_exception_flags
|= float_flag_inexact
;
7055 aSign
= extractFloat128Sign( a
);
7056 switch (status
->float_rounding_mode
) {
7057 case float_round_nearest_even
:
7058 if ( ( aExp
== 0x3FFE )
7059 && ( extractFloat128Frac0( a
)
7060 | extractFloat128Frac1( a
) )
7062 return packFloat128( aSign
, 0x3FFF, 0, 0 );
7065 case float_round_ties_away
:
7066 if (aExp
== 0x3FFE) {
7067 return packFloat128(aSign
, 0x3FFF, 0, 0);
7070 case float_round_down
:
7072 aSign
? packFloat128( 1, 0x3FFF, 0, 0 )
7073 : packFloat128( 0, 0, 0, 0 );
7074 case float_round_up
:
7076 aSign
? packFloat128( 1, 0, 0, 0 )
7077 : packFloat128( 0, 0x3FFF, 0, 0 );
7079 case float_round_to_odd
:
7080 return packFloat128(aSign
, 0x3FFF, 0, 0);
7082 return packFloat128( aSign
, 0, 0, 0 );
7085 lastBitMask
<<= 0x402F - aExp
;
7086 roundBitsMask
= lastBitMask
- 1;
7089 switch (status
->float_rounding_mode
) {
7090 case float_round_nearest_even
:
7091 z
.high
+= lastBitMask
>>1;
7092 if ( ( ( z
.high
& roundBitsMask
) | a
.low
) == 0 ) {
7093 z
.high
&= ~ lastBitMask
;
7096 case float_round_ties_away
:
7097 z
.high
+= lastBitMask
>>1;
7099 case float_round_to_zero
:
7101 case float_round_up
:
7102 if (!extractFloat128Sign(z
)) {
7103 z
.high
|= ( a
.low
!= 0 );
7104 z
.high
+= roundBitsMask
;
7107 case float_round_down
:
7108 if (extractFloat128Sign(z
)) {
7109 z
.high
|= (a
.low
!= 0);
7110 z
.high
+= roundBitsMask
;
7113 case float_round_to_odd
:
7114 if ((z
.high
& lastBitMask
) == 0) {
7115 z
.high
|= (a
.low
!= 0);
7116 z
.high
+= roundBitsMask
;
7122 z
.high
&= ~ roundBitsMask
;
7124 if ( ( z
.low
!= a
.low
) || ( z
.high
!= a
.high
) ) {
7125 status
->float_exception_flags
|= float_flag_inexact
;
7131 /*----------------------------------------------------------------------------
7132 | Returns the result of adding the absolute values of the quadruple-precision
7133 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
7134 | before being returned. `zSign' is ignored if the result is a NaN.
7135 | The addition is performed according to the IEC/IEEE Standard for Binary
7136 | Floating-Point Arithmetic.
7137 *----------------------------------------------------------------------------*/
7139 static float128
addFloat128Sigs(float128 a
, float128 b
, flag zSign
,
7140 float_status
*status
)
7142 int32_t aExp
, bExp
, zExp
;
7143 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
7146 aSig1
= extractFloat128Frac1( a
);
7147 aSig0
= extractFloat128Frac0( a
);
7148 aExp
= extractFloat128Exp( a
);
7149 bSig1
= extractFloat128Frac1( b
);
7150 bSig0
= extractFloat128Frac0( b
);
7151 bExp
= extractFloat128Exp( b
);
7152 expDiff
= aExp
- bExp
;
7153 if ( 0 < expDiff
) {
7154 if ( aExp
== 0x7FFF ) {
7155 if (aSig0
| aSig1
) {
7156 return propagateFloat128NaN(a
, b
, status
);
7164 bSig0
|= LIT64( 0x0001000000000000 );
7166 shift128ExtraRightJamming(
7167 bSig0
, bSig1
, 0, expDiff
, &bSig0
, &bSig1
, &zSig2
);
7170 else if ( expDiff
< 0 ) {
7171 if ( bExp
== 0x7FFF ) {
7172 if (bSig0
| bSig1
) {
7173 return propagateFloat128NaN(a
, b
, status
);
7175 return packFloat128( zSign
, 0x7FFF, 0, 0 );
7181 aSig0
|= LIT64( 0x0001000000000000 );
7183 shift128ExtraRightJamming(
7184 aSig0
, aSig1
, 0, - expDiff
, &aSig0
, &aSig1
, &zSig2
);
7188 if ( aExp
== 0x7FFF ) {
7189 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
7190 return propagateFloat128NaN(a
, b
, status
);
7194 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
7196 if (status
->flush_to_zero
) {
7197 if (zSig0
| zSig1
) {
7198 float_raise(float_flag_output_denormal
, status
);
7200 return packFloat128(zSign
, 0, 0, 0);
7202 return packFloat128( zSign
, 0, zSig0
, zSig1
);
7205 zSig0
|= LIT64( 0x0002000000000000 );
7209 aSig0
|= LIT64( 0x0001000000000000 );
7210 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
7212 if ( zSig0
< LIT64( 0x0002000000000000 ) ) goto roundAndPack
;
7215 shift128ExtraRightJamming(
7216 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
7218 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
7222 /*----------------------------------------------------------------------------
7223 | Returns the result of subtracting the absolute values of the quadruple-
7224 | precision floating-point values `a' and `b'. If `zSign' is 1, the
7225 | difference is negated before being returned. `zSign' is ignored if the
7226 | result is a NaN. The subtraction is performed according to the IEC/IEEE
7227 | Standard for Binary Floating-Point Arithmetic.
7228 *----------------------------------------------------------------------------*/
7230 static float128
subFloat128Sigs(float128 a
, float128 b
, flag zSign
,
7231 float_status
*status
)
7233 int32_t aExp
, bExp
, zExp
;
7234 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
;
7237 aSig1
= extractFloat128Frac1( a
);
7238 aSig0
= extractFloat128Frac0( a
);
7239 aExp
= extractFloat128Exp( a
);
7240 bSig1
= extractFloat128Frac1( b
);
7241 bSig0
= extractFloat128Frac0( b
);
7242 bExp
= extractFloat128Exp( b
);
7243 expDiff
= aExp
- bExp
;
7244 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
7245 shortShift128Left( bSig0
, bSig1
, 14, &bSig0
, &bSig1
);
7246 if ( 0 < expDiff
) goto aExpBigger
;
7247 if ( expDiff
< 0 ) goto bExpBigger
;
7248 if ( aExp
== 0x7FFF ) {
7249 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
7250 return propagateFloat128NaN(a
, b
, status
);
7252 float_raise(float_flag_invalid
, status
);
7253 return float128_default_nan(status
);
7259 if ( bSig0
< aSig0
) goto aBigger
;
7260 if ( aSig0
< bSig0
) goto bBigger
;
7261 if ( bSig1
< aSig1
) goto aBigger
;
7262 if ( aSig1
< bSig1
) goto bBigger
;
7263 return packFloat128(status
->float_rounding_mode
== float_round_down
,
7266 if ( bExp
== 0x7FFF ) {
7267 if (bSig0
| bSig1
) {
7268 return propagateFloat128NaN(a
, b
, status
);
7270 return packFloat128( zSign
^ 1, 0x7FFF, 0, 0 );
7276 aSig0
|= LIT64( 0x4000000000000000 );
7278 shift128RightJamming( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
7279 bSig0
|= LIT64( 0x4000000000000000 );
7281 sub128( bSig0
, bSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
7284 goto normalizeRoundAndPack
;
7286 if ( aExp
== 0x7FFF ) {
7287 if (aSig0
| aSig1
) {
7288 return propagateFloat128NaN(a
, b
, status
);
7296 bSig0
|= LIT64( 0x4000000000000000 );
7298 shift128RightJamming( bSig0
, bSig1
, expDiff
, &bSig0
, &bSig1
);
7299 aSig0
|= LIT64( 0x4000000000000000 );
7301 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
7303 normalizeRoundAndPack
:
7305 return normalizeRoundAndPackFloat128(zSign
, zExp
- 14, zSig0
, zSig1
,
7310 /*----------------------------------------------------------------------------
7311 | Returns the result of adding the quadruple-precision floating-point values
7312 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
7313 | for Binary Floating-Point Arithmetic.
7314 *----------------------------------------------------------------------------*/
7316 float128
float128_add(float128 a
, float128 b
, float_status
*status
)
7320 aSign
= extractFloat128Sign( a
);
7321 bSign
= extractFloat128Sign( b
);
7322 if ( aSign
== bSign
) {
7323 return addFloat128Sigs(a
, b
, aSign
, status
);
7326 return subFloat128Sigs(a
, b
, aSign
, status
);
7331 /*----------------------------------------------------------------------------
7332 | Returns the result of subtracting the quadruple-precision floating-point
7333 | values `a' and `b'. The operation is performed according to the IEC/IEEE
7334 | Standard for Binary Floating-Point Arithmetic.
7335 *----------------------------------------------------------------------------*/
7337 float128
float128_sub(float128 a
, float128 b
, float_status
*status
)
7341 aSign
= extractFloat128Sign( a
);
7342 bSign
= extractFloat128Sign( b
);
7343 if ( aSign
== bSign
) {
7344 return subFloat128Sigs(a
, b
, aSign
, status
);
7347 return addFloat128Sigs(a
, b
, aSign
, status
);
7352 /*----------------------------------------------------------------------------
7353 | Returns the result of multiplying the quadruple-precision floating-point
7354 | values `a' and `b'. The operation is performed according to the IEC/IEEE
7355 | Standard for Binary Floating-Point Arithmetic.
7356 *----------------------------------------------------------------------------*/
7358 float128
float128_mul(float128 a
, float128 b
, float_status
*status
)
7360 flag aSign
, bSign
, zSign
;
7361 int32_t aExp
, bExp
, zExp
;
7362 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
, zSig3
;
7364 aSig1
= extractFloat128Frac1( a
);
7365 aSig0
= extractFloat128Frac0( a
);
7366 aExp
= extractFloat128Exp( a
);
7367 aSign
= extractFloat128Sign( a
);
7368 bSig1
= extractFloat128Frac1( b
);
7369 bSig0
= extractFloat128Frac0( b
);
7370 bExp
= extractFloat128Exp( b
);
7371 bSign
= extractFloat128Sign( b
);
7372 zSign
= aSign
^ bSign
;
7373 if ( aExp
== 0x7FFF ) {
7374 if ( ( aSig0
| aSig1
)
7375 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
7376 return propagateFloat128NaN(a
, b
, status
);
7378 if ( ( bExp
| bSig0
| bSig1
) == 0 ) goto invalid
;
7379 return packFloat128( zSign
, 0x7FFF, 0, 0 );
7381 if ( bExp
== 0x7FFF ) {
7382 if (bSig0
| bSig1
) {
7383 return propagateFloat128NaN(a
, b
, status
);
7385 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
7387 float_raise(float_flag_invalid
, status
);
7388 return float128_default_nan(status
);
7390 return packFloat128( zSign
, 0x7FFF, 0, 0 );
7393 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
7394 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
7397 if ( ( bSig0
| bSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
7398 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
7400 zExp
= aExp
+ bExp
- 0x4000;
7401 aSig0
|= LIT64( 0x0001000000000000 );
7402 shortShift128Left( bSig0
, bSig1
, 16, &bSig0
, &bSig1
);
7403 mul128To256( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
, &zSig2
, &zSig3
);
7404 add128( zSig0
, zSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
7405 zSig2
|= ( zSig3
!= 0 );
7406 if ( LIT64( 0x0002000000000000 ) <= zSig0
) {
7407 shift128ExtraRightJamming(
7408 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
7411 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
7415 /*----------------------------------------------------------------------------
7416 | Returns the result of dividing the quadruple-precision floating-point value
7417 | `a' by the corresponding value `b'. The operation is performed according to
7418 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7419 *----------------------------------------------------------------------------*/
7421 float128
float128_div(float128 a
, float128 b
, float_status
*status
)
7423 flag aSign
, bSign
, zSign
;
7424 int32_t aExp
, bExp
, zExp
;
7425 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
7426 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
7428 aSig1
= extractFloat128Frac1( a
);
7429 aSig0
= extractFloat128Frac0( a
);
7430 aExp
= extractFloat128Exp( a
);
7431 aSign
= extractFloat128Sign( a
);
7432 bSig1
= extractFloat128Frac1( b
);
7433 bSig0
= extractFloat128Frac0( b
);
7434 bExp
= extractFloat128Exp( b
);
7435 bSign
= extractFloat128Sign( b
);
7436 zSign
= aSign
^ bSign
;
7437 if ( aExp
== 0x7FFF ) {
7438 if (aSig0
| aSig1
) {
7439 return propagateFloat128NaN(a
, b
, status
);
7441 if ( bExp
== 0x7FFF ) {
7442 if (bSig0
| bSig1
) {
7443 return propagateFloat128NaN(a
, b
, status
);
7447 return packFloat128( zSign
, 0x7FFF, 0, 0 );
7449 if ( bExp
== 0x7FFF ) {
7450 if (bSig0
| bSig1
) {
7451 return propagateFloat128NaN(a
, b
, status
);
7453 return packFloat128( zSign
, 0, 0, 0 );
7456 if ( ( bSig0
| bSig1
) == 0 ) {
7457 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
7459 float_raise(float_flag_invalid
, status
);
7460 return float128_default_nan(status
);
7462 float_raise(float_flag_divbyzero
, status
);
7463 return packFloat128( zSign
, 0x7FFF, 0, 0 );
7465 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
7468 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
7469 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
7471 zExp
= aExp
- bExp
+ 0x3FFD;
7473 aSig0
| LIT64( 0x0001000000000000 ), aSig1
, 15, &aSig0
, &aSig1
);
7475 bSig0
| LIT64( 0x0001000000000000 ), bSig1
, 15, &bSig0
, &bSig1
);
7476 if ( le128( bSig0
, bSig1
, aSig0
, aSig1
) ) {
7477 shift128Right( aSig0
, aSig1
, 1, &aSig0
, &aSig1
);
7480 zSig0
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
7481 mul128By64To192( bSig0
, bSig1
, zSig0
, &term0
, &term1
, &term2
);
7482 sub192( aSig0
, aSig1
, 0, term0
, term1
, term2
, &rem0
, &rem1
, &rem2
);
7483 while ( (int64_t) rem0
< 0 ) {
7485 add192( rem0
, rem1
, rem2
, 0, bSig0
, bSig1
, &rem0
, &rem1
, &rem2
);
7487 zSig1
= estimateDiv128To64( rem1
, rem2
, bSig0
);
7488 if ( ( zSig1
& 0x3FFF ) <= 4 ) {
7489 mul128By64To192( bSig0
, bSig1
, zSig1
, &term1
, &term2
, &term3
);
7490 sub192( rem1
, rem2
, 0, term1
, term2
, term3
, &rem1
, &rem2
, &rem3
);
7491 while ( (int64_t) rem1
< 0 ) {
7493 add192( rem1
, rem2
, rem3
, 0, bSig0
, bSig1
, &rem1
, &rem2
, &rem3
);
7495 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
7497 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 15, &zSig0
, &zSig1
, &zSig2
);
7498 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
7502 /*----------------------------------------------------------------------------
7503 | Returns the remainder of the quadruple-precision floating-point value `a'
7504 | with respect to the corresponding value `b'. The operation is performed
7505 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7506 *----------------------------------------------------------------------------*/
7508 float128
float128_rem(float128 a
, float128 b
, float_status
*status
)
7511 int32_t aExp
, bExp
, expDiff
;
7512 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, q
, term0
, term1
, term2
;
7513 uint64_t allZero
, alternateASig0
, alternateASig1
, sigMean1
;
7516 aSig1
= extractFloat128Frac1( a
);
7517 aSig0
= extractFloat128Frac0( a
);
7518 aExp
= extractFloat128Exp( a
);
7519 aSign
= extractFloat128Sign( a
);
7520 bSig1
= extractFloat128Frac1( b
);
7521 bSig0
= extractFloat128Frac0( b
);
7522 bExp
= extractFloat128Exp( b
);
7523 if ( aExp
== 0x7FFF ) {
7524 if ( ( aSig0
| aSig1
)
7525 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
7526 return propagateFloat128NaN(a
, b
, status
);
7530 if ( bExp
== 0x7FFF ) {
7531 if (bSig0
| bSig1
) {
7532 return propagateFloat128NaN(a
, b
, status
);
7537 if ( ( bSig0
| bSig1
) == 0 ) {
7539 float_raise(float_flag_invalid
, status
);
7540 return float128_default_nan(status
);
7542 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
7545 if ( ( aSig0
| aSig1
) == 0 ) return a
;
7546 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
7548 expDiff
= aExp
- bExp
;
7549 if ( expDiff
< -1 ) return a
;
7551 aSig0
| LIT64( 0x0001000000000000 ),
7553 15 - ( expDiff
< 0 ),
7558 bSig0
| LIT64( 0x0001000000000000 ), bSig1
, 15, &bSig0
, &bSig1
);
7559 q
= le128( bSig0
, bSig1
, aSig0
, aSig1
);
7560 if ( q
) sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
7562 while ( 0 < expDiff
) {
7563 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
7564 q
= ( 4 < q
) ? q
- 4 : 0;
7565 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
7566 shortShift192Left( term0
, term1
, term2
, 61, &term1
, &term2
, &allZero
);
7567 shortShift128Left( aSig0
, aSig1
, 61, &aSig0
, &allZero
);
7568 sub128( aSig0
, 0, term1
, term2
, &aSig0
, &aSig1
);
7571 if ( -64 < expDiff
) {
7572 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
7573 q
= ( 4 < q
) ? q
- 4 : 0;
7575 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
7577 if ( expDiff
< 0 ) {
7578 shift128Right( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
7581 shortShift128Left( aSig0
, aSig1
, expDiff
, &aSig0
, &aSig1
);
7583 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
7584 sub128( aSig0
, aSig1
, term1
, term2
, &aSig0
, &aSig1
);
7587 shift128Right( aSig0
, aSig1
, 12, &aSig0
, &aSig1
);
7588 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
7591 alternateASig0
= aSig0
;
7592 alternateASig1
= aSig1
;
7594 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
7595 } while ( 0 <= (int64_t) aSig0
);
7597 aSig0
, aSig1
, alternateASig0
, alternateASig1
, (uint64_t *)&sigMean0
, &sigMean1
);
7598 if ( ( sigMean0
< 0 )
7599 || ( ( ( sigMean0
| sigMean1
) == 0 ) && ( q
& 1 ) ) ) {
7600 aSig0
= alternateASig0
;
7601 aSig1
= alternateASig1
;
7603 zSign
= ( (int64_t) aSig0
< 0 );
7604 if ( zSign
) sub128( 0, 0, aSig0
, aSig1
, &aSig0
, &aSig1
);
7605 return normalizeRoundAndPackFloat128(aSign
^ zSign
, bExp
- 4, aSig0
, aSig1
,
7609 /*----------------------------------------------------------------------------
7610 | Returns the square root of the quadruple-precision floating-point value `a'.
7611 | The operation is performed according to the IEC/IEEE Standard for Binary
7612 | Floating-Point Arithmetic.
7613 *----------------------------------------------------------------------------*/
7615 float128
float128_sqrt(float128 a
, float_status
*status
)
7619 uint64_t aSig0
, aSig1
, zSig0
, zSig1
, zSig2
, doubleZSig0
;
7620 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
7622 aSig1
= extractFloat128Frac1( a
);
7623 aSig0
= extractFloat128Frac0( a
);
7624 aExp
= extractFloat128Exp( a
);
7625 aSign
= extractFloat128Sign( a
);
7626 if ( aExp
== 0x7FFF ) {
7627 if (aSig0
| aSig1
) {
7628 return propagateFloat128NaN(a
, a
, status
);
7630 if ( ! aSign
) return a
;
7634 if ( ( aExp
| aSig0
| aSig1
) == 0 ) return a
;
7636 float_raise(float_flag_invalid
, status
);
7637 return float128_default_nan(status
);
7640 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( 0, 0, 0, 0 );
7641 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
7643 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFE;
7644 aSig0
|= LIT64( 0x0001000000000000 );
7645 zSig0
= estimateSqrt32( aExp
, aSig0
>>17 );
7646 shortShift128Left( aSig0
, aSig1
, 13 - ( aExp
& 1 ), &aSig0
, &aSig1
);
7647 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
7648 doubleZSig0
= zSig0
<<1;
7649 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
7650 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
7651 while ( (int64_t) rem0
< 0 ) {
7654 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
7656 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
7657 if ( ( zSig1
& 0x1FFF ) <= 5 ) {
7658 if ( zSig1
== 0 ) zSig1
= 1;
7659 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
7660 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
7661 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
7662 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
7663 while ( (int64_t) rem1
< 0 ) {
7665 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
7667 term2
|= doubleZSig0
;
7668 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
7670 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
7672 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 14, &zSig0
, &zSig1
, &zSig2
);
7673 return roundAndPackFloat128(0, zExp
, zSig0
, zSig1
, zSig2
, status
);
7677 /*----------------------------------------------------------------------------
7678 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
7679 | the corresponding value `b', and 0 otherwise. The invalid exception is
7680 | raised if either operand is a NaN. Otherwise, the comparison is performed
7681 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7682 *----------------------------------------------------------------------------*/
7684 int float128_eq(float128 a
, float128 b
, float_status
*status
)
7687 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
7688 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
7689 || ( ( extractFloat128Exp( b
) == 0x7FFF )
7690 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
7692 float_raise(float_flag_invalid
, status
);
7697 && ( ( a
.high
== b
.high
)
7699 && ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
7704 /*----------------------------------------------------------------------------
7705 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7706 | or equal to the corresponding value `b', and 0 otherwise. The invalid
7707 | exception is raised if either operand is a NaN. The comparison is performed
7708 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7709 *----------------------------------------------------------------------------*/
7711 int float128_le(float128 a
, float128 b
, float_status
*status
)
7715 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
7716 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
7717 || ( ( extractFloat128Exp( b
) == 0x7FFF )
7718 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
7720 float_raise(float_flag_invalid
, status
);
7723 aSign
= extractFloat128Sign( a
);
7724 bSign
= extractFloat128Sign( b
);
7725 if ( aSign
!= bSign
) {
7728 || ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
7732 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
7733 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
7737 /*----------------------------------------------------------------------------
7738 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7739 | the corresponding value `b', and 0 otherwise. The invalid exception is
7740 | raised if either operand is a NaN. The comparison is performed according
7741 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7742 *----------------------------------------------------------------------------*/
7744 int float128_lt(float128 a
, float128 b
, float_status
*status
)
7748 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
7749 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
7750 || ( ( extractFloat128Exp( b
) == 0x7FFF )
7751 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
7753 float_raise(float_flag_invalid
, status
);
7756 aSign
= extractFloat128Sign( a
);
7757 bSign
= extractFloat128Sign( b
);
7758 if ( aSign
!= bSign
) {
7761 && ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
7765 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
7766 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
7770 /*----------------------------------------------------------------------------
7771 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7772 | be compared, and 0 otherwise. The invalid exception is raised if either
7773 | operand is a NaN. The comparison is performed according to the IEC/IEEE
7774 | Standard for Binary Floating-Point Arithmetic.
7775 *----------------------------------------------------------------------------*/
7777 int float128_unordered(float128 a
, float128 b
, float_status
*status
)
7779 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
7780 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
7781 || ( ( extractFloat128Exp( b
) == 0x7FFF )
7782 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
7784 float_raise(float_flag_invalid
, status
);
7790 /*----------------------------------------------------------------------------
7791 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
7792 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
7793 | exception. The comparison is performed according to the IEC/IEEE Standard
7794 | for Binary Floating-Point Arithmetic.
7795 *----------------------------------------------------------------------------*/
7797 int float128_eq_quiet(float128 a
, float128 b
, float_status
*status
)
7800 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
7801 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
7802 || ( ( extractFloat128Exp( b
) == 0x7FFF )
7803 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
7805 if (float128_is_signaling_nan(a
, status
)
7806 || float128_is_signaling_nan(b
, status
)) {
7807 float_raise(float_flag_invalid
, status
);
7813 && ( ( a
.high
== b
.high
)
7815 && ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
7820 /*----------------------------------------------------------------------------
7821 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7822 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
7823 | cause an exception. Otherwise, the comparison is performed according to the
7824 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7825 *----------------------------------------------------------------------------*/
7827 int float128_le_quiet(float128 a
, float128 b
, float_status
*status
)
7831 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
7832 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
7833 || ( ( extractFloat128Exp( b
) == 0x7FFF )
7834 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
7836 if (float128_is_signaling_nan(a
, status
)
7837 || float128_is_signaling_nan(b
, status
)) {
7838 float_raise(float_flag_invalid
, status
);
7842 aSign
= extractFloat128Sign( a
);
7843 bSign
= extractFloat128Sign( b
);
7844 if ( aSign
!= bSign
) {
7847 || ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
7851 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
7852 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
7856 /*----------------------------------------------------------------------------
7857 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7858 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
7859 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
7860 | Standard for Binary Floating-Point Arithmetic.
7861 *----------------------------------------------------------------------------*/
7863 int float128_lt_quiet(float128 a
, float128 b
, float_status
*status
)
7867 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
7868 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
7869 || ( ( extractFloat128Exp( b
) == 0x7FFF )
7870 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
7872 if (float128_is_signaling_nan(a
, status
)
7873 || float128_is_signaling_nan(b
, status
)) {
7874 float_raise(float_flag_invalid
, status
);
7878 aSign
= extractFloat128Sign( a
);
7879 bSign
= extractFloat128Sign( b
);
7880 if ( aSign
!= bSign
) {
7883 && ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
7887 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
7888 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
7892 /*----------------------------------------------------------------------------
7893 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7894 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
7895 | comparison is performed according to the IEC/IEEE Standard for Binary
7896 | Floating-Point Arithmetic.
7897 *----------------------------------------------------------------------------*/
7899 int float128_unordered_quiet(float128 a
, float128 b
, float_status
*status
)
7901 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
7902 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
7903 || ( ( extractFloat128Exp( b
) == 0x7FFF )
7904 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
7906 if (float128_is_signaling_nan(a
, status
)
7907 || float128_is_signaling_nan(b
, status
)) {
7908 float_raise(float_flag_invalid
, status
);
7915 static inline int floatx80_compare_internal(floatx80 a
, floatx80 b
,
7916 int is_quiet
, float_status
*status
)
7920 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
7921 float_raise(float_flag_invalid
, status
);
7922 return float_relation_unordered
;
7924 if (( ( extractFloatx80Exp( a
) == 0x7fff ) &&
7925 ( extractFloatx80Frac( a
)<<1 ) ) ||
7926 ( ( extractFloatx80Exp( b
) == 0x7fff ) &&
7927 ( extractFloatx80Frac( b
)<<1 ) )) {
7929 floatx80_is_signaling_nan(a
, status
) ||
7930 floatx80_is_signaling_nan(b
, status
)) {
7931 float_raise(float_flag_invalid
, status
);
7933 return float_relation_unordered
;
7935 aSign
= extractFloatx80Sign( a
);
7936 bSign
= extractFloatx80Sign( b
);
7937 if ( aSign
!= bSign
) {
7939 if ( ( ( (uint16_t) ( ( a
.high
| b
.high
) << 1 ) ) == 0) &&
7940 ( ( a
.low
| b
.low
) == 0 ) ) {
7942 return float_relation_equal
;
7944 return 1 - (2 * aSign
);
7947 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
7948 return float_relation_equal
;
7950 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
7955 int floatx80_compare(floatx80 a
, floatx80 b
, float_status
*status
)
7957 return floatx80_compare_internal(a
, b
, 0, status
);
7960 int floatx80_compare_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
7962 return floatx80_compare_internal(a
, b
, 1, status
);
7965 static inline int float128_compare_internal(float128 a
, float128 b
,
7966 int is_quiet
, float_status
*status
)
7970 if (( ( extractFloat128Exp( a
) == 0x7fff ) &&
7971 ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) ) ||
7972 ( ( extractFloat128Exp( b
) == 0x7fff ) &&
7973 ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )) {
7975 float128_is_signaling_nan(a
, status
) ||
7976 float128_is_signaling_nan(b
, status
)) {
7977 float_raise(float_flag_invalid
, status
);
7979 return float_relation_unordered
;
7981 aSign
= extractFloat128Sign( a
);
7982 bSign
= extractFloat128Sign( b
);
7983 if ( aSign
!= bSign
) {
7984 if ( ( ( ( a
.high
| b
.high
)<<1 ) | a
.low
| b
.low
) == 0 ) {
7986 return float_relation_equal
;
7988 return 1 - (2 * aSign
);
7991 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
7992 return float_relation_equal
;
7994 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
7999 int float128_compare(float128 a
, float128 b
, float_status
*status
)
8001 return float128_compare_internal(a
, b
, 0, status
);
8004 int float128_compare_quiet(float128 a
, float128 b
, float_status
*status
)
8006 return float128_compare_internal(a
, b
, 1, status
);
8009 floatx80
floatx80_scalbn(floatx80 a
, int n
, float_status
*status
)
8015 if (floatx80_invalid_encoding(a
)) {
8016 float_raise(float_flag_invalid
, status
);
8017 return floatx80_default_nan(status
);
8019 aSig
= extractFloatx80Frac( a
);
8020 aExp
= extractFloatx80Exp( a
);
8021 aSign
= extractFloatx80Sign( a
);
8023 if ( aExp
== 0x7FFF ) {
8025 return propagateFloatx80NaN(a
, a
, status
);
8039 } else if (n
< -0x10000) {
8044 return normalizeRoundAndPackFloatx80(status
->floatx80_rounding_precision
,
8045 aSign
, aExp
, aSig
, 0, status
);
8048 float128
float128_scalbn(float128 a
, int n
, float_status
*status
)
8052 uint64_t aSig0
, aSig1
;
8054 aSig1
= extractFloat128Frac1( a
);
8055 aSig0
= extractFloat128Frac0( a
);
8056 aExp
= extractFloat128Exp( a
);
8057 aSign
= extractFloat128Sign( a
);
8058 if ( aExp
== 0x7FFF ) {
8059 if ( aSig0
| aSig1
) {
8060 return propagateFloat128NaN(a
, a
, status
);
8065 aSig0
|= LIT64( 0x0001000000000000 );
8066 } else if (aSig0
== 0 && aSig1
== 0) {
8074 } else if (n
< -0x10000) {
8079 return normalizeRoundAndPackFloat128( aSign
, aExp
, aSig0
, aSig1
8084 static void __attribute__((constructor
)) softfloat_init(void)
8086 union_float64 ua
, ub
, uc
, ur
;
8088 if (QEMU_NO_HARDFLOAT
) {
8092 * Test that the host's FMA is not obviously broken. For example,
8093 * glibc < 2.23 can perform an incorrect FMA on certain hosts; see
8094 * https://sourceware.org/bugzilla/show_bug.cgi?id=13304
8096 ua
.s
= 0x0020000000000001ULL
;
8097 ub
.s
= 0x3ca0000000000000ULL
;
8098 uc
.s
= 0x0020000000000000ULL
;
8099 ur
.h
= fma(ua
.h
, ub
.h
, uc
.h
);
8100 if (ur
.s
!= 0x0020000000000001ULL
) {
8101 force_soft_fma
= true;