4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
44 ===============================================================================
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
85 #include "qemu/osdep.h"
87 #include "qemu/bitops.h"
88 #include "fpu/softfloat.h"
90 /* We only need stdlib for abort() */
92 /*----------------------------------------------------------------------------
93 | Primitive arithmetic functions, including multi-word arithmetic, and
94 | division and square root approximations. (Can be specialized to target if
96 *----------------------------------------------------------------------------*/
97 #include "fpu/softfloat-macros.h"
102 * Fast emulation of guest FP instructions is challenging for two reasons.
103 * First, FP instruction semantics are similar but not identical, particularly
104 * when handling NaNs. Second, emulating at reasonable speed the guest FP
105 * exception flags is not trivial: reading the host's flags register with a
106 * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp],
107 * and trapping on every FP exception is not fast nor pleasant to work with.
109 * We address these challenges by leveraging the host FPU for a subset of the
110 * operations. To do this we expand on the idea presented in this paper:
112 * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a
113 * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615.
115 * The idea is thus to leverage the host FPU to (1) compute FP operations
116 * and (2) identify whether FP exceptions occurred while avoiding
117 * expensive exception flag register accesses.
119 * An important optimization shown in the paper is that given that exception
120 * flags are rarely cleared by the guest, we can avoid recomputing some flags.
121 * This is particularly useful for the inexact flag, which is very frequently
122 * raised in floating-point workloads.
124 * We optimize the code further by deferring to soft-fp whenever FP exception
125 * detection might get hairy. Two examples: (1) when at least one operand is
126 * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result
127 * and the result is < the minimum normal.
129 #define GEN_INPUT_FLUSH__NOCHECK(name, soft_t) \
130 static inline void name(soft_t *a, float_status *s) \
132 if (unlikely(soft_t ## _is_denormal(*a))) { \
133 *a = soft_t ## _set_sign(soft_t ## _zero, \
134 soft_t ## _is_neg(*a)); \
135 s->float_exception_flags |= float_flag_input_denormal; \
139 GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck
, float32
)
140 GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck
, float64
)
141 #undef GEN_INPUT_FLUSH__NOCHECK
143 #define GEN_INPUT_FLUSH1(name, soft_t) \
144 static inline void name(soft_t *a, float_status *s) \
146 if (likely(!s->flush_inputs_to_zero)) { \
149 soft_t ## _input_flush__nocheck(a, s); \
152 GEN_INPUT_FLUSH1(float32_input_flush1
, float32
)
153 GEN_INPUT_FLUSH1(float64_input_flush1
, float64
)
154 #undef GEN_INPUT_FLUSH1
156 #define GEN_INPUT_FLUSH2(name, soft_t) \
157 static inline void name(soft_t *a, soft_t *b, float_status *s) \
159 if (likely(!s->flush_inputs_to_zero)) { \
162 soft_t ## _input_flush__nocheck(a, s); \
163 soft_t ## _input_flush__nocheck(b, s); \
166 GEN_INPUT_FLUSH2(float32_input_flush2
, float32
)
167 GEN_INPUT_FLUSH2(float64_input_flush2
, float64
)
168 #undef GEN_INPUT_FLUSH2
170 #define GEN_INPUT_FLUSH3(name, soft_t) \
171 static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
173 if (likely(!s->flush_inputs_to_zero)) { \
176 soft_t ## _input_flush__nocheck(a, s); \
177 soft_t ## _input_flush__nocheck(b, s); \
178 soft_t ## _input_flush__nocheck(c, s); \
181 GEN_INPUT_FLUSH3(float32_input_flush3
, float32
)
182 GEN_INPUT_FLUSH3(float64_input_flush3
, float64
)
183 #undef GEN_INPUT_FLUSH3
186 * Choose whether to use fpclassify or float32/64_* primitives in the generated
187 * hardfloat functions. Each combination of number of inputs and float size
188 * gets its own value.
190 #if defined(__x86_64__)
191 # define QEMU_HARDFLOAT_1F32_USE_FP 0
192 # define QEMU_HARDFLOAT_1F64_USE_FP 1
193 # define QEMU_HARDFLOAT_2F32_USE_FP 0
194 # define QEMU_HARDFLOAT_2F64_USE_FP 1
195 # define QEMU_HARDFLOAT_3F32_USE_FP 0
196 # define QEMU_HARDFLOAT_3F64_USE_FP 1
198 # define QEMU_HARDFLOAT_1F32_USE_FP 0
199 # define QEMU_HARDFLOAT_1F64_USE_FP 0
200 # define QEMU_HARDFLOAT_2F32_USE_FP 0
201 # define QEMU_HARDFLOAT_2F64_USE_FP 0
202 # define QEMU_HARDFLOAT_3F32_USE_FP 0
203 # define QEMU_HARDFLOAT_3F64_USE_FP 0
207 * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over
208 * float{32,64}_is_infinity when !USE_FP.
209 * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup.
210 * On power64 however, using isinf() reduces fp-bench performance by up to 50%.
212 #if defined(__x86_64__) || defined(__aarch64__)
213 # define QEMU_HARDFLOAT_USE_ISINF 1
215 # define QEMU_HARDFLOAT_USE_ISINF 0
219 * Some targets clear the FP flags before most FP operations. This prevents
220 * the use of hardfloat, since hardfloat relies on the inexact flag being
223 #if defined(TARGET_PPC) || defined(__FAST_MATH__)
224 # if defined(__FAST_MATH__)
225 # warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
228 # define QEMU_NO_HARDFLOAT 1
229 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
231 # define QEMU_NO_HARDFLOAT 0
232 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
235 static inline bool can_use_fpu(const float_status
*s
)
237 if (QEMU_NO_HARDFLOAT
) {
240 return likely(s
->float_exception_flags
& float_flag_inexact
&&
241 s
->float_rounding_mode
== float_round_nearest_even
);
245 * Hardfloat generation functions. Each operation can have two flavors:
246 * either using softfloat primitives (e.g. float32_is_zero_or_normal) for
247 * most condition checks, or native ones (e.g. fpclassify).
249 * The flavor is chosen by the callers. Instead of using macros, we rely on the
250 * compiler to propagate constants and inline everything into the callers.
252 * We only generate functions for operations with two inputs, since only
253 * these are common enough to justify consolidating them into common code.
266 typedef bool (*f32_check_fn
)(union_float32 a
, union_float32 b
);
267 typedef bool (*f64_check_fn
)(union_float64 a
, union_float64 b
);
269 typedef float32 (*soft_f32_op2_fn
)(float32 a
, float32 b
, float_status
*s
);
270 typedef float64 (*soft_f64_op2_fn
)(float64 a
, float64 b
, float_status
*s
);
271 typedef float (*hard_f32_op2_fn
)(float a
, float b
);
272 typedef double (*hard_f64_op2_fn
)(double a
, double b
);
274 /* 2-input is-zero-or-normal */
275 static inline bool f32_is_zon2(union_float32 a
, union_float32 b
)
277 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
279 * Not using a temp variable for consecutive fpclassify calls ends up
280 * generating faster code.
282 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
283 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
);
285 return float32_is_zero_or_normal(a
.s
) &&
286 float32_is_zero_or_normal(b
.s
);
289 static inline bool f64_is_zon2(union_float64 a
, union_float64 b
)
291 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
292 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
293 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
);
295 return float64_is_zero_or_normal(a
.s
) &&
296 float64_is_zero_or_normal(b
.s
);
299 /* 3-input is-zero-or-normal */
301 bool f32_is_zon3(union_float32 a
, union_float32 b
, union_float32 c
)
303 if (QEMU_HARDFLOAT_3F32_USE_FP
) {
304 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
305 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
) &&
306 (fpclassify(c
.h
) == FP_NORMAL
|| fpclassify(c
.h
) == FP_ZERO
);
308 return float32_is_zero_or_normal(a
.s
) &&
309 float32_is_zero_or_normal(b
.s
) &&
310 float32_is_zero_or_normal(c
.s
);
314 bool f64_is_zon3(union_float64 a
, union_float64 b
, union_float64 c
)
316 if (QEMU_HARDFLOAT_3F64_USE_FP
) {
317 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
318 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
) &&
319 (fpclassify(c
.h
) == FP_NORMAL
|| fpclassify(c
.h
) == FP_ZERO
);
321 return float64_is_zero_or_normal(a
.s
) &&
322 float64_is_zero_or_normal(b
.s
) &&
323 float64_is_zero_or_normal(c
.s
);
326 static inline bool f32_is_inf(union_float32 a
)
328 if (QEMU_HARDFLOAT_USE_ISINF
) {
331 return float32_is_infinity(a
.s
);
334 static inline bool f64_is_inf(union_float64 a
)
336 if (QEMU_HARDFLOAT_USE_ISINF
) {
339 return float64_is_infinity(a
.s
);
342 /* Note: @fast_test and @post can be NULL */
343 static inline float32
344 float32_gen2(float32 xa
, float32 xb
, float_status
*s
,
345 hard_f32_op2_fn hard
, soft_f32_op2_fn soft
,
346 f32_check_fn pre
, f32_check_fn post
,
347 f32_check_fn fast_test
, soft_f32_op2_fn fast_op
)
349 union_float32 ua
, ub
, ur
;
354 if (unlikely(!can_use_fpu(s
))) {
358 float32_input_flush2(&ua
.s
, &ub
.s
, s
);
359 if (unlikely(!pre(ua
, ub
))) {
362 if (fast_test
&& fast_test(ua
, ub
)) {
363 return fast_op(ua
.s
, ub
.s
, s
);
366 ur
.h
= hard(ua
.h
, ub
.h
);
367 if (unlikely(f32_is_inf(ur
))) {
368 s
->float_exception_flags
|= float_flag_overflow
;
369 } else if (unlikely(fabsf(ur
.h
) <= FLT_MIN
)) {
370 if (post
== NULL
|| post(ua
, ub
)) {
377 return soft(ua
.s
, ub
.s
, s
);
380 static inline float64
381 float64_gen2(float64 xa
, float64 xb
, float_status
*s
,
382 hard_f64_op2_fn hard
, soft_f64_op2_fn soft
,
383 f64_check_fn pre
, f64_check_fn post
,
384 f64_check_fn fast_test
, soft_f64_op2_fn fast_op
)
386 union_float64 ua
, ub
, ur
;
391 if (unlikely(!can_use_fpu(s
))) {
395 float64_input_flush2(&ua
.s
, &ub
.s
, s
);
396 if (unlikely(!pre(ua
, ub
))) {
399 if (fast_test
&& fast_test(ua
, ub
)) {
400 return fast_op(ua
.s
, ub
.s
, s
);
403 ur
.h
= hard(ua
.h
, ub
.h
);
404 if (unlikely(f64_is_inf(ur
))) {
405 s
->float_exception_flags
|= float_flag_overflow
;
406 } else if (unlikely(fabs(ur
.h
) <= DBL_MIN
)) {
407 if (post
== NULL
|| post(ua
, ub
)) {
414 return soft(ua
.s
, ub
.s
, s
);
417 /*----------------------------------------------------------------------------
418 | Returns the fraction bits of the half-precision floating-point value `a'.
419 *----------------------------------------------------------------------------*/
421 static inline uint32_t extractFloat16Frac(float16 a
)
423 return float16_val(a
) & 0x3ff;
426 /*----------------------------------------------------------------------------
427 | Returns the exponent bits of the half-precision floating-point value `a'.
428 *----------------------------------------------------------------------------*/
430 static inline int extractFloat16Exp(float16 a
)
432 return (float16_val(a
) >> 10) & 0x1f;
435 /*----------------------------------------------------------------------------
436 | Returns the fraction bits of the single-precision floating-point value `a'.
437 *----------------------------------------------------------------------------*/
439 static inline uint32_t extractFloat32Frac(float32 a
)
441 return float32_val(a
) & 0x007FFFFF;
444 /*----------------------------------------------------------------------------
445 | Returns the exponent bits of the single-precision floating-point value `a'.
446 *----------------------------------------------------------------------------*/
448 static inline int extractFloat32Exp(float32 a
)
450 return (float32_val(a
) >> 23) & 0xFF;
453 /*----------------------------------------------------------------------------
454 | Returns the sign bit of the single-precision floating-point value `a'.
455 *----------------------------------------------------------------------------*/
457 static inline flag
extractFloat32Sign(float32 a
)
459 return float32_val(a
) >> 31;
462 /*----------------------------------------------------------------------------
463 | Returns the fraction bits of the double-precision floating-point value `a'.
464 *----------------------------------------------------------------------------*/
466 static inline uint64_t extractFloat64Frac(float64 a
)
468 return float64_val(a
) & LIT64(0x000FFFFFFFFFFFFF);
471 /*----------------------------------------------------------------------------
472 | Returns the exponent bits of the double-precision floating-point value `a'.
473 *----------------------------------------------------------------------------*/
475 static inline int extractFloat64Exp(float64 a
)
477 return (float64_val(a
) >> 52) & 0x7FF;
480 /*----------------------------------------------------------------------------
481 | Returns the sign bit of the double-precision floating-point value `a'.
482 *----------------------------------------------------------------------------*/
484 static inline flag
extractFloat64Sign(float64 a
)
486 return float64_val(a
) >> 63;
490 * Classify a floating point number. Everything above float_class_qnan
491 * is a NaN so cls >= float_class_qnan is any NaN.
494 typedef enum __attribute__ ((__packed__
)) {
495 float_class_unclassified
,
499 float_class_qnan
, /* all NaNs from here */
503 /* Simple helpers for checking if, or what kind of, NaN we have */
504 static inline __attribute__((unused
)) bool is_nan(FloatClass c
)
506 return unlikely(c
>= float_class_qnan
);
509 static inline __attribute__((unused
)) bool is_snan(FloatClass c
)
511 return c
== float_class_snan
;
514 static inline __attribute__((unused
)) bool is_qnan(FloatClass c
)
516 return c
== float_class_qnan
;
520 * Structure holding all of the decomposed parts of a float. The
521 * exponent is unbiased and the fraction is normalized. All
522 * calculations are done with a 64 bit fraction and then rounded as
523 * appropriate for the final format.
525 * Thanks to the packed FloatClass a decent compiler should be able to
526 * fit the whole structure into registers and avoid using the stack
527 * for parameter passing.
537 #define DECOMPOSED_BINARY_POINT (64 - 2)
538 #define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT)
539 #define DECOMPOSED_OVERFLOW_BIT (DECOMPOSED_IMPLICIT_BIT << 1)
541 /* Structure holding all of the relevant parameters for a format.
542 * exp_size: the size of the exponent field
543 * exp_bias: the offset applied to the exponent field
544 * exp_max: the maximum normalised exponent
545 * frac_size: the size of the fraction field
546 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
547 * The following are computed based the size of fraction
548 * frac_lsb: least significant bit of fraction
549 * frac_lsbm1: the bit below the least significant bit (for rounding)
550 * round_mask/roundeven_mask: masks used for rounding
551 * The following optional modifiers are available:
552 * arm_althp: handle ARM Alternative Half Precision
563 uint64_t roundeven_mask
;
567 /* Expand fields based on the size of exponent and fraction */
568 #define FLOAT_PARAMS(E, F) \
570 .exp_bias = ((1 << E) - 1) >> 1, \
571 .exp_max = (1 << E) - 1, \
573 .frac_shift = DECOMPOSED_BINARY_POINT - F, \
574 .frac_lsb = 1ull << (DECOMPOSED_BINARY_POINT - F), \
575 .frac_lsbm1 = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1), \
576 .round_mask = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1, \
577 .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1
579 static const FloatFmt float16_params
= {
583 static const FloatFmt float16_params_ahp
= {
588 static const FloatFmt float32_params
= {
592 static const FloatFmt float64_params
= {
596 /* Unpack a float to parts, but do not canonicalize. */
597 static inline FloatParts
unpack_raw(FloatFmt fmt
, uint64_t raw
)
599 const int sign_pos
= fmt
.frac_size
+ fmt
.exp_size
;
601 return (FloatParts
) {
602 .cls
= float_class_unclassified
,
603 .sign
= extract64(raw
, sign_pos
, 1),
604 .exp
= extract64(raw
, fmt
.frac_size
, fmt
.exp_size
),
605 .frac
= extract64(raw
, 0, fmt
.frac_size
),
609 static inline FloatParts
float16_unpack_raw(float16 f
)
611 return unpack_raw(float16_params
, f
);
614 static inline FloatParts
float32_unpack_raw(float32 f
)
616 return unpack_raw(float32_params
, f
);
619 static inline FloatParts
float64_unpack_raw(float64 f
)
621 return unpack_raw(float64_params
, f
);
624 /* Pack a float from parts, but do not canonicalize. */
625 static inline uint64_t pack_raw(FloatFmt fmt
, FloatParts p
)
627 const int sign_pos
= fmt
.frac_size
+ fmt
.exp_size
;
628 uint64_t ret
= deposit64(p
.frac
, fmt
.frac_size
, fmt
.exp_size
, p
.exp
);
629 return deposit64(ret
, sign_pos
, 1, p
.sign
);
632 static inline float16
float16_pack_raw(FloatParts p
)
634 return make_float16(pack_raw(float16_params
, p
));
637 static inline float32
float32_pack_raw(FloatParts p
)
639 return make_float32(pack_raw(float32_params
, p
));
642 static inline float64
float64_pack_raw(FloatParts p
)
644 return make_float64(pack_raw(float64_params
, p
));
647 /*----------------------------------------------------------------------------
648 | Functions and definitions to determine: (1) whether tininess for underflow
649 | is detected before or after rounding by default, (2) what (if anything)
650 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
651 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
652 | are propagated from function inputs to output. These details are target-
654 *----------------------------------------------------------------------------*/
655 #include "softfloat-specialize.h"
657 /* Canonicalize EXP and FRAC, setting CLS. */
658 static FloatParts
sf_canonicalize(FloatParts part
, const FloatFmt
*parm
,
659 float_status
*status
)
661 if (part
.exp
== parm
->exp_max
&& !parm
->arm_althp
) {
662 if (part
.frac
== 0) {
663 part
.cls
= float_class_inf
;
665 part
.frac
<<= parm
->frac_shift
;
666 part
.cls
= (parts_is_snan_frac(part
.frac
, status
)
667 ? float_class_snan
: float_class_qnan
);
669 } else if (part
.exp
== 0) {
670 if (likely(part
.frac
== 0)) {
671 part
.cls
= float_class_zero
;
672 } else if (status
->flush_inputs_to_zero
) {
673 float_raise(float_flag_input_denormal
, status
);
674 part
.cls
= float_class_zero
;
677 int shift
= clz64(part
.frac
) - 1;
678 part
.cls
= float_class_normal
;
679 part
.exp
= parm
->frac_shift
- parm
->exp_bias
- shift
+ 1;
683 part
.cls
= float_class_normal
;
684 part
.exp
-= parm
->exp_bias
;
685 part
.frac
= DECOMPOSED_IMPLICIT_BIT
+ (part
.frac
<< parm
->frac_shift
);
690 /* Round and uncanonicalize a floating-point number by parts. There
691 * are FRAC_SHIFT bits that may require rounding at the bottom of the
692 * fraction; these bits will be removed. The exponent will be biased
693 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
696 static FloatParts
round_canonical(FloatParts p
, float_status
*s
,
697 const FloatFmt
*parm
)
699 const uint64_t frac_lsbm1
= parm
->frac_lsbm1
;
700 const uint64_t round_mask
= parm
->round_mask
;
701 const uint64_t roundeven_mask
= parm
->roundeven_mask
;
702 const int exp_max
= parm
->exp_max
;
703 const int frac_shift
= parm
->frac_shift
;
712 case float_class_normal
:
713 switch (s
->float_rounding_mode
) {
714 case float_round_nearest_even
:
715 overflow_norm
= false;
716 inc
= ((frac
& roundeven_mask
) != frac_lsbm1
? frac_lsbm1
: 0);
718 case float_round_ties_away
:
719 overflow_norm
= false;
722 case float_round_to_zero
:
723 overflow_norm
= true;
727 inc
= p
.sign
? 0 : round_mask
;
728 overflow_norm
= p
.sign
;
730 case float_round_down
:
731 inc
= p
.sign
? round_mask
: 0;
732 overflow_norm
= !p
.sign
;
735 g_assert_not_reached();
738 exp
+= parm
->exp_bias
;
739 if (likely(exp
> 0)) {
740 if (frac
& round_mask
) {
741 flags
|= float_flag_inexact
;
743 if (frac
& DECOMPOSED_OVERFLOW_BIT
) {
750 if (parm
->arm_althp
) {
751 /* ARM Alt HP eschews Inf and NaN for a wider exponent. */
752 if (unlikely(exp
> exp_max
)) {
753 /* Overflow. Return the maximum normal. */
754 flags
= float_flag_invalid
;
758 } else if (unlikely(exp
>= exp_max
)) {
759 flags
|= float_flag_overflow
| float_flag_inexact
;
764 p
.cls
= float_class_inf
;
768 } else if (s
->flush_to_zero
) {
769 flags
|= float_flag_output_denormal
;
770 p
.cls
= float_class_zero
;
773 bool is_tiny
= (s
->float_detect_tininess
774 == float_tininess_before_rounding
)
776 || !((frac
+ inc
) & DECOMPOSED_OVERFLOW_BIT
);
778 shift64RightJamming(frac
, 1 - exp
, &frac
);
779 if (frac
& round_mask
) {
780 /* Need to recompute round-to-even. */
781 if (s
->float_rounding_mode
== float_round_nearest_even
) {
782 inc
= ((frac
& roundeven_mask
) != frac_lsbm1
785 flags
|= float_flag_inexact
;
789 exp
= (frac
& DECOMPOSED_IMPLICIT_BIT
? 1 : 0);
792 if (is_tiny
&& (flags
& float_flag_inexact
)) {
793 flags
|= float_flag_underflow
;
795 if (exp
== 0 && frac
== 0) {
796 p
.cls
= float_class_zero
;
801 case float_class_zero
:
807 case float_class_inf
:
809 assert(!parm
->arm_althp
);
814 case float_class_qnan
:
815 case float_class_snan
:
816 assert(!parm
->arm_althp
);
818 frac
>>= parm
->frac_shift
;
822 g_assert_not_reached();
825 float_raise(flags
, s
);
831 /* Explicit FloatFmt version */
832 static FloatParts
float16a_unpack_canonical(float16 f
, float_status
*s
,
833 const FloatFmt
*params
)
835 return sf_canonicalize(float16_unpack_raw(f
), params
, s
);
838 static FloatParts
float16_unpack_canonical(float16 f
, float_status
*s
)
840 return float16a_unpack_canonical(f
, s
, &float16_params
);
843 static float16
float16a_round_pack_canonical(FloatParts p
, float_status
*s
,
844 const FloatFmt
*params
)
846 return float16_pack_raw(round_canonical(p
, s
, params
));
849 static float16
float16_round_pack_canonical(FloatParts p
, float_status
*s
)
851 return float16a_round_pack_canonical(p
, s
, &float16_params
);
854 static FloatParts
float32_unpack_canonical(float32 f
, float_status
*s
)
856 return sf_canonicalize(float32_unpack_raw(f
), &float32_params
, s
);
859 static float32
float32_round_pack_canonical(FloatParts p
, float_status
*s
)
861 return float32_pack_raw(round_canonical(p
, s
, &float32_params
));
864 static FloatParts
float64_unpack_canonical(float64 f
, float_status
*s
)
866 return sf_canonicalize(float64_unpack_raw(f
), &float64_params
, s
);
869 static float64
float64_round_pack_canonical(FloatParts p
, float_status
*s
)
871 return float64_pack_raw(round_canonical(p
, s
, &float64_params
));
874 static FloatParts
return_nan(FloatParts a
, float_status
*s
)
877 case float_class_snan
:
878 s
->float_exception_flags
|= float_flag_invalid
;
879 a
= parts_silence_nan(a
, s
);
881 case float_class_qnan
:
882 if (s
->default_nan_mode
) {
883 return parts_default_nan(s
);
888 g_assert_not_reached();
893 static FloatParts
pick_nan(FloatParts a
, FloatParts b
, float_status
*s
)
895 if (is_snan(a
.cls
) || is_snan(b
.cls
)) {
896 s
->float_exception_flags
|= float_flag_invalid
;
899 if (s
->default_nan_mode
) {
900 return parts_default_nan(s
);
902 if (pickNaN(a
.cls
, b
.cls
,
904 (a
.frac
== b
.frac
&& a
.sign
< b
.sign
))) {
907 if (is_snan(a
.cls
)) {
908 return parts_silence_nan(a
, s
);
914 static FloatParts
pick_nan_muladd(FloatParts a
, FloatParts b
, FloatParts c
,
915 bool inf_zero
, float_status
*s
)
919 if (is_snan(a
.cls
) || is_snan(b
.cls
) || is_snan(c
.cls
)) {
920 s
->float_exception_flags
|= float_flag_invalid
;
923 which
= pickNaNMulAdd(a
.cls
, b
.cls
, c
.cls
, inf_zero
, s
);
925 if (s
->default_nan_mode
) {
926 /* Note that this check is after pickNaNMulAdd so that function
927 * has an opportunity to set the Invalid flag.
942 return parts_default_nan(s
);
944 g_assert_not_reached();
947 if (is_snan(a
.cls
)) {
948 return parts_silence_nan(a
, s
);
954 * Returns the result of adding or subtracting the values of the
955 * floating-point values `a' and `b'. The operation is performed
956 * according to the IEC/IEEE Standard for Binary Floating-Point
960 static FloatParts
addsub_floats(FloatParts a
, FloatParts b
, bool subtract
,
963 bool a_sign
= a
.sign
;
964 bool b_sign
= b
.sign
^ subtract
;
966 if (a_sign
!= b_sign
) {
969 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
970 if (a
.exp
> b
.exp
|| (a
.exp
== b
.exp
&& a
.frac
>= b
.frac
)) {
971 shift64RightJamming(b
.frac
, a
.exp
- b
.exp
, &b
.frac
);
972 a
.frac
= a
.frac
- b
.frac
;
974 shift64RightJamming(a
.frac
, b
.exp
- a
.exp
, &a
.frac
);
975 a
.frac
= b
.frac
- a
.frac
;
981 a
.cls
= float_class_zero
;
982 a
.sign
= s
->float_rounding_mode
== float_round_down
;
984 int shift
= clz64(a
.frac
) - 1;
985 a
.frac
= a
.frac
<< shift
;
986 a
.exp
= a
.exp
- shift
;
991 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
992 return pick_nan(a
, b
, s
);
994 if (a
.cls
== float_class_inf
) {
995 if (b
.cls
== float_class_inf
) {
996 float_raise(float_flag_invalid
, s
);
997 return parts_default_nan(s
);
1001 if (a
.cls
== float_class_zero
&& b
.cls
== float_class_zero
) {
1002 a
.sign
= s
->float_rounding_mode
== float_round_down
;
1005 if (a
.cls
== float_class_zero
|| b
.cls
== float_class_inf
) {
1006 b
.sign
= a_sign
^ 1;
1009 if (b
.cls
== float_class_zero
) {
1014 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
1015 if (a
.exp
> b
.exp
) {
1016 shift64RightJamming(b
.frac
, a
.exp
- b
.exp
, &b
.frac
);
1017 } else if (a
.exp
< b
.exp
) {
1018 shift64RightJamming(a
.frac
, b
.exp
- a
.exp
, &a
.frac
);
1022 if (a
.frac
& DECOMPOSED_OVERFLOW_BIT
) {
1023 shift64RightJamming(a
.frac
, 1, &a
.frac
);
1028 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
1029 return pick_nan(a
, b
, s
);
1031 if (a
.cls
== float_class_inf
|| b
.cls
== float_class_zero
) {
1034 if (b
.cls
== float_class_inf
|| a
.cls
== float_class_zero
) {
1039 g_assert_not_reached();
1043 * Returns the result of adding or subtracting the floating-point
1044 * values `a' and `b'. The operation is performed according to the
1045 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1048 float16 QEMU_FLATTEN
float16_add(float16 a
, float16 b
, float_status
*status
)
1050 FloatParts pa
= float16_unpack_canonical(a
, status
);
1051 FloatParts pb
= float16_unpack_canonical(b
, status
);
1052 FloatParts pr
= addsub_floats(pa
, pb
, false, status
);
1054 return float16_round_pack_canonical(pr
, status
);
1057 float16 QEMU_FLATTEN
float16_sub(float16 a
, float16 b
, float_status
*status
)
1059 FloatParts pa
= float16_unpack_canonical(a
, status
);
1060 FloatParts pb
= float16_unpack_canonical(b
, status
);
1061 FloatParts pr
= addsub_floats(pa
, pb
, true, status
);
1063 return float16_round_pack_canonical(pr
, status
);
1066 static float32 QEMU_SOFTFLOAT_ATTR
1067 soft_f32_addsub(float32 a
, float32 b
, bool subtract
, float_status
*status
)
1069 FloatParts pa
= float32_unpack_canonical(a
, status
);
1070 FloatParts pb
= float32_unpack_canonical(b
, status
);
1071 FloatParts pr
= addsub_floats(pa
, pb
, subtract
, status
);
1073 return float32_round_pack_canonical(pr
, status
);
1076 static inline float32
soft_f32_add(float32 a
, float32 b
, float_status
*status
)
1078 return soft_f32_addsub(a
, b
, false, status
);
1081 static inline float32
soft_f32_sub(float32 a
, float32 b
, float_status
*status
)
1083 return soft_f32_addsub(a
, b
, true, status
);
1086 static float64 QEMU_SOFTFLOAT_ATTR
1087 soft_f64_addsub(float64 a
, float64 b
, bool subtract
, float_status
*status
)
1089 FloatParts pa
= float64_unpack_canonical(a
, status
);
1090 FloatParts pb
= float64_unpack_canonical(b
, status
);
1091 FloatParts pr
= addsub_floats(pa
, pb
, subtract
, status
);
1093 return float64_round_pack_canonical(pr
, status
);
1096 static inline float64
soft_f64_add(float64 a
, float64 b
, float_status
*status
)
1098 return soft_f64_addsub(a
, b
, false, status
);
1101 static inline float64
soft_f64_sub(float64 a
, float64 b
, float_status
*status
)
1103 return soft_f64_addsub(a
, b
, true, status
);
1106 static float hard_f32_add(float a
, float b
)
1111 static float hard_f32_sub(float a
, float b
)
1116 static double hard_f64_add(double a
, double b
)
1121 static double hard_f64_sub(double a
, double b
)
1126 static bool f32_addsub_post(union_float32 a
, union_float32 b
)
1128 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
1129 return !(fpclassify(a
.h
) == FP_ZERO
&& fpclassify(b
.h
) == FP_ZERO
);
1131 return !(float32_is_zero(a
.s
) && float32_is_zero(b
.s
));
1134 static bool f64_addsub_post(union_float64 a
, union_float64 b
)
1136 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
1137 return !(fpclassify(a
.h
) == FP_ZERO
&& fpclassify(b
.h
) == FP_ZERO
);
1139 return !(float64_is_zero(a
.s
) && float64_is_zero(b
.s
));
1143 static float32
float32_addsub(float32 a
, float32 b
, float_status
*s
,
1144 hard_f32_op2_fn hard
, soft_f32_op2_fn soft
)
1146 return float32_gen2(a
, b
, s
, hard
, soft
,
1147 f32_is_zon2
, f32_addsub_post
, NULL
, NULL
);
1150 static float64
float64_addsub(float64 a
, float64 b
, float_status
*s
,
1151 hard_f64_op2_fn hard
, soft_f64_op2_fn soft
)
1153 return float64_gen2(a
, b
, s
, hard
, soft
,
1154 f64_is_zon2
, f64_addsub_post
, NULL
, NULL
);
1157 float32 QEMU_FLATTEN
1158 float32_add(float32 a
, float32 b
, float_status
*s
)
1160 return float32_addsub(a
, b
, s
, hard_f32_add
, soft_f32_add
);
1163 float32 QEMU_FLATTEN
1164 float32_sub(float32 a
, float32 b
, float_status
*s
)
1166 return float32_addsub(a
, b
, s
, hard_f32_sub
, soft_f32_sub
);
1169 float64 QEMU_FLATTEN
1170 float64_add(float64 a
, float64 b
, float_status
*s
)
1172 return float64_addsub(a
, b
, s
, hard_f64_add
, soft_f64_add
);
1175 float64 QEMU_FLATTEN
1176 float64_sub(float64 a
, float64 b
, float_status
*s
)
1178 return float64_addsub(a
, b
, s
, hard_f64_sub
, soft_f64_sub
);
1182 * Returns the result of multiplying the floating-point values `a' and
1183 * `b'. The operation is performed according to the IEC/IEEE Standard
1184 * for Binary Floating-Point Arithmetic.
1187 static FloatParts
mul_floats(FloatParts a
, FloatParts b
, float_status
*s
)
1189 bool sign
= a
.sign
^ b
.sign
;
1191 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
1193 int exp
= a
.exp
+ b
.exp
;
1195 mul64To128(a
.frac
, b
.frac
, &hi
, &lo
);
1196 shift128RightJamming(hi
, lo
, DECOMPOSED_BINARY_POINT
, &hi
, &lo
);
1197 if (lo
& DECOMPOSED_OVERFLOW_BIT
) {
1198 shift64RightJamming(lo
, 1, &lo
);
1208 /* handle all the NaN cases */
1209 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
1210 return pick_nan(a
, b
, s
);
1212 /* Inf * Zero == NaN */
1213 if ((a
.cls
== float_class_inf
&& b
.cls
== float_class_zero
) ||
1214 (a
.cls
== float_class_zero
&& b
.cls
== float_class_inf
)) {
1215 s
->float_exception_flags
|= float_flag_invalid
;
1216 return parts_default_nan(s
);
1218 /* Multiply by 0 or Inf */
1219 if (a
.cls
== float_class_inf
|| a
.cls
== float_class_zero
) {
1223 if (b
.cls
== float_class_inf
|| b
.cls
== float_class_zero
) {
1227 g_assert_not_reached();
1230 float16 QEMU_FLATTEN
float16_mul(float16 a
, float16 b
, float_status
*status
)
1232 FloatParts pa
= float16_unpack_canonical(a
, status
);
1233 FloatParts pb
= float16_unpack_canonical(b
, status
);
1234 FloatParts pr
= mul_floats(pa
, pb
, status
);
1236 return float16_round_pack_canonical(pr
, status
);
1239 static float32 QEMU_SOFTFLOAT_ATTR
1240 soft_f32_mul(float32 a
, float32 b
, float_status
*status
)
1242 FloatParts pa
= float32_unpack_canonical(a
, status
);
1243 FloatParts pb
= float32_unpack_canonical(b
, status
);
1244 FloatParts pr
= mul_floats(pa
, pb
, status
);
1246 return float32_round_pack_canonical(pr
, status
);
1249 static float64 QEMU_SOFTFLOAT_ATTR
1250 soft_f64_mul(float64 a
, float64 b
, float_status
*status
)
1252 FloatParts pa
= float64_unpack_canonical(a
, status
);
1253 FloatParts pb
= float64_unpack_canonical(b
, status
);
1254 FloatParts pr
= mul_floats(pa
, pb
, status
);
1256 return float64_round_pack_canonical(pr
, status
);
1259 static float hard_f32_mul(float a
, float b
)
1264 static double hard_f64_mul(double a
, double b
)
1269 static bool f32_mul_fast_test(union_float32 a
, union_float32 b
)
1271 return float32_is_zero(a
.s
) || float32_is_zero(b
.s
);
1274 static bool f64_mul_fast_test(union_float64 a
, union_float64 b
)
1276 return float64_is_zero(a
.s
) || float64_is_zero(b
.s
);
1279 static float32
f32_mul_fast_op(float32 a
, float32 b
, float_status
*s
)
1281 bool signbit
= float32_is_neg(a
) ^ float32_is_neg(b
);
1283 return float32_set_sign(float32_zero
, signbit
);
1286 static float64
f64_mul_fast_op(float64 a
, float64 b
, float_status
*s
)
1288 bool signbit
= float64_is_neg(a
) ^ float64_is_neg(b
);
1290 return float64_set_sign(float64_zero
, signbit
);
1293 float32 QEMU_FLATTEN
1294 float32_mul(float32 a
, float32 b
, float_status
*s
)
1296 return float32_gen2(a
, b
, s
, hard_f32_mul
, soft_f32_mul
,
1297 f32_is_zon2
, NULL
, f32_mul_fast_test
, f32_mul_fast_op
);
1300 float64 QEMU_FLATTEN
1301 float64_mul(float64 a
, float64 b
, float_status
*s
)
1303 return float64_gen2(a
, b
, s
, hard_f64_mul
, soft_f64_mul
,
1304 f64_is_zon2
, NULL
, f64_mul_fast_test
, f64_mul_fast_op
);
1308 * Returns the result of multiplying the floating-point values `a' and
1309 * `b' then adding 'c', with no intermediate rounding step after the
1310 * multiplication. The operation is performed according to the
1311 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
1312 * The flags argument allows the caller to select negation of the
1313 * addend, the intermediate product, or the final result. (The
1314 * difference between this and having the caller do a separate
1315 * negation is that negating externally will flip the sign bit on
1319 static FloatParts
muladd_floats(FloatParts a
, FloatParts b
, FloatParts c
,
1320 int flags
, float_status
*s
)
1322 bool inf_zero
= ((1 << a
.cls
) | (1 << b
.cls
)) ==
1323 ((1 << float_class_inf
) | (1 << float_class_zero
));
1325 bool sign_flip
= flags
& float_muladd_negate_result
;
1330 /* It is implementation-defined whether the cases of (0,inf,qnan)
1331 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
1332 * they return if they do), so we have to hand this information
1333 * off to the target-specific pick-a-NaN routine.
1335 if (is_nan(a
.cls
) || is_nan(b
.cls
) || is_nan(c
.cls
)) {
1336 return pick_nan_muladd(a
, b
, c
, inf_zero
, s
);
1340 s
->float_exception_flags
|= float_flag_invalid
;
1341 return parts_default_nan(s
);
1344 if (flags
& float_muladd_negate_c
) {
1348 p_sign
= a
.sign
^ b
.sign
;
1350 if (flags
& float_muladd_negate_product
) {
1354 if (a
.cls
== float_class_inf
|| b
.cls
== float_class_inf
) {
1355 p_class
= float_class_inf
;
1356 } else if (a
.cls
== float_class_zero
|| b
.cls
== float_class_zero
) {
1357 p_class
= float_class_zero
;
1359 p_class
= float_class_normal
;
1362 if (c
.cls
== float_class_inf
) {
1363 if (p_class
== float_class_inf
&& p_sign
!= c
.sign
) {
1364 s
->float_exception_flags
|= float_flag_invalid
;
1365 return parts_default_nan(s
);
1367 a
.cls
= float_class_inf
;
1368 a
.sign
= c
.sign
^ sign_flip
;
1373 if (p_class
== float_class_inf
) {
1374 a
.cls
= float_class_inf
;
1375 a
.sign
= p_sign
^ sign_flip
;
1379 if (p_class
== float_class_zero
) {
1380 if (c
.cls
== float_class_zero
) {
1381 if (p_sign
!= c
.sign
) {
1382 p_sign
= s
->float_rounding_mode
== float_round_down
;
1385 } else if (flags
& float_muladd_halve_result
) {
1388 c
.sign
^= sign_flip
;
1392 /* a & b should be normals now... */
1393 assert(a
.cls
== float_class_normal
&&
1394 b
.cls
== float_class_normal
);
1396 p_exp
= a
.exp
+ b
.exp
;
1398 /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
1401 mul64To128(a
.frac
, b
.frac
, &hi
, &lo
);
1402 /* binary point now at bit 124 */
1404 /* check for overflow */
1405 if (hi
& (1ULL << (DECOMPOSED_BINARY_POINT
* 2 + 1 - 64))) {
1406 shift128RightJamming(hi
, lo
, 1, &hi
, &lo
);
1411 if (c
.cls
== float_class_zero
) {
1412 /* move binary point back to 62 */
1413 shift128RightJamming(hi
, lo
, DECOMPOSED_BINARY_POINT
, &hi
, &lo
);
1415 int exp_diff
= p_exp
- c
.exp
;
1416 if (p_sign
== c
.sign
) {
1418 if (exp_diff
<= 0) {
1419 shift128RightJamming(hi
, lo
,
1420 DECOMPOSED_BINARY_POINT
- exp_diff
,
1425 uint64_t c_hi
, c_lo
;
1426 /* shift c to the same binary point as the product (124) */
1429 shift128RightJamming(c_hi
, c_lo
,
1432 add128(hi
, lo
, c_hi
, c_lo
, &hi
, &lo
);
1433 /* move binary point back to 62 */
1434 shift128RightJamming(hi
, lo
, DECOMPOSED_BINARY_POINT
, &hi
, &lo
);
1437 if (lo
& DECOMPOSED_OVERFLOW_BIT
) {
1438 shift64RightJamming(lo
, 1, &lo
);
1444 uint64_t c_hi
, c_lo
;
1445 /* make C binary point match product at bit 124 */
1449 if (exp_diff
<= 0) {
1450 shift128RightJamming(hi
, lo
, -exp_diff
, &hi
, &lo
);
1453 (hi
> c_hi
|| (hi
== c_hi
&& lo
>= c_lo
))) {
1454 sub128(hi
, lo
, c_hi
, c_lo
, &hi
, &lo
);
1456 sub128(c_hi
, c_lo
, hi
, lo
, &hi
, &lo
);
1461 shift128RightJamming(c_hi
, c_lo
,
1464 sub128(hi
, lo
, c_hi
, c_lo
, &hi
, &lo
);
1467 if (hi
== 0 && lo
== 0) {
1468 a
.cls
= float_class_zero
;
1469 a
.sign
= s
->float_rounding_mode
== float_round_down
;
1470 a
.sign
^= sign_flip
;
1477 shift
= clz64(lo
) + 64;
1479 /* Normalizing to a binary point of 124 is the
1480 correct adjust for the exponent. However since we're
1481 shifting, we might as well put the binary point back
1482 at 62 where we really want it. Therefore shift as
1483 if we're leaving 1 bit at the top of the word, but
1484 adjust the exponent as if we're leaving 3 bits. */
1487 lo
= lo
<< (shift
- 64);
1489 hi
= (hi
<< shift
) | (lo
>> (64 - shift
));
1490 lo
= hi
| ((lo
<< shift
) != 0);
1497 if (flags
& float_muladd_halve_result
) {
1501 /* finally prepare our result */
1502 a
.cls
= float_class_normal
;
1503 a
.sign
= p_sign
^ sign_flip
;
1510 float16 QEMU_FLATTEN
float16_muladd(float16 a
, float16 b
, float16 c
,
1511 int flags
, float_status
*status
)
1513 FloatParts pa
= float16_unpack_canonical(a
, status
);
1514 FloatParts pb
= float16_unpack_canonical(b
, status
);
1515 FloatParts pc
= float16_unpack_canonical(c
, status
);
1516 FloatParts pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1518 return float16_round_pack_canonical(pr
, status
);
1521 static float32 QEMU_SOFTFLOAT_ATTR
1522 soft_f32_muladd(float32 a
, float32 b
, float32 c
, int flags
,
1523 float_status
*status
)
1525 FloatParts pa
= float32_unpack_canonical(a
, status
);
1526 FloatParts pb
= float32_unpack_canonical(b
, status
);
1527 FloatParts pc
= float32_unpack_canonical(c
, status
);
1528 FloatParts pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1530 return float32_round_pack_canonical(pr
, status
);
1533 static float64 QEMU_SOFTFLOAT_ATTR
1534 soft_f64_muladd(float64 a
, float64 b
, float64 c
, int flags
,
1535 float_status
*status
)
1537 FloatParts pa
= float64_unpack_canonical(a
, status
);
1538 FloatParts pb
= float64_unpack_canonical(b
, status
);
1539 FloatParts pc
= float64_unpack_canonical(c
, status
);
1540 FloatParts pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1542 return float64_round_pack_canonical(pr
, status
);
1545 float32 QEMU_FLATTEN
1546 float32_muladd(float32 xa
, float32 xb
, float32 xc
, int flags
, float_status
*s
)
1548 union_float32 ua
, ub
, uc
, ur
;
1554 if (unlikely(!can_use_fpu(s
))) {
1557 if (unlikely(flags
& float_muladd_halve_result
)) {
1561 float32_input_flush3(&ua
.s
, &ub
.s
, &uc
.s
, s
);
1562 if (unlikely(!f32_is_zon3(ua
, ub
, uc
))) {
1566 * When (a || b) == 0, there's no need to check for under/over flow,
1567 * since we know the addend is (normal || 0) and the product is 0.
1569 if (float32_is_zero(ua
.s
) || float32_is_zero(ub
.s
)) {
1573 prod_sign
= float32_is_neg(ua
.s
) ^ float32_is_neg(ub
.s
);
1574 prod_sign
^= !!(flags
& float_muladd_negate_product
);
1575 up
.s
= float32_set_sign(float32_zero
, prod_sign
);
1577 if (flags
& float_muladd_negate_c
) {
1582 if (flags
& float_muladd_negate_product
) {
1585 if (flags
& float_muladd_negate_c
) {
1589 ur
.h
= fmaf(ua
.h
, ub
.h
, uc
.h
);
1591 if (unlikely(f32_is_inf(ur
))) {
1592 s
->float_exception_flags
|= float_flag_overflow
;
1593 } else if (unlikely(fabsf(ur
.h
) <= FLT_MIN
)) {
1597 if (flags
& float_muladd_negate_result
) {
1598 return float32_chs(ur
.s
);
1603 return soft_f32_muladd(ua
.s
, ub
.s
, uc
.s
, flags
, s
);
1606 float64 QEMU_FLATTEN
1607 float64_muladd(float64 xa
, float64 xb
, float64 xc
, int flags
, float_status
*s
)
1609 union_float64 ua
, ub
, uc
, ur
;
1615 if (unlikely(!can_use_fpu(s
))) {
1618 if (unlikely(flags
& float_muladd_halve_result
)) {
1622 float64_input_flush3(&ua
.s
, &ub
.s
, &uc
.s
, s
);
1623 if (unlikely(!f64_is_zon3(ua
, ub
, uc
))) {
1627 * When (a || b) == 0, there's no need to check for under/over flow,
1628 * since we know the addend is (normal || 0) and the product is 0.
1630 if (float64_is_zero(ua
.s
) || float64_is_zero(ub
.s
)) {
1634 prod_sign
= float64_is_neg(ua
.s
) ^ float64_is_neg(ub
.s
);
1635 prod_sign
^= !!(flags
& float_muladd_negate_product
);
1636 up
.s
= float64_set_sign(float64_zero
, prod_sign
);
1638 if (flags
& float_muladd_negate_c
) {
1643 if (flags
& float_muladd_negate_product
) {
1646 if (flags
& float_muladd_negate_c
) {
1650 ur
.h
= fma(ua
.h
, ub
.h
, uc
.h
);
1652 if (unlikely(f64_is_inf(ur
))) {
1653 s
->float_exception_flags
|= float_flag_overflow
;
1654 } else if (unlikely(fabs(ur
.h
) <= FLT_MIN
)) {
1658 if (flags
& float_muladd_negate_result
) {
1659 return float64_chs(ur
.s
);
1664 return soft_f64_muladd(ua
.s
, ub
.s
, uc
.s
, flags
, s
);
1668 * Returns the result of dividing the floating-point value `a' by the
1669 * corresponding value `b'. The operation is performed according to
1670 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1673 static FloatParts
div_floats(FloatParts a
, FloatParts b
, float_status
*s
)
1675 bool sign
= a
.sign
^ b
.sign
;
1677 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
1678 uint64_t n0
, n1
, q
, r
;
1679 int exp
= a
.exp
- b
.exp
;
1682 * We want a 2*N / N-bit division to produce exactly an N-bit
1683 * result, so that we do not lose any precision and so that we
1684 * do not have to renormalize afterward. If A.frac < B.frac,
1685 * then division would produce an (N-1)-bit result; shift A left
1686 * by one to produce the an N-bit result, and decrement the
1687 * exponent to match.
1689 * The udiv_qrnnd algorithm that we're using requires normalization,
1690 * i.e. the msb of the denominator must be set. Since we know that
1691 * DECOMPOSED_BINARY_POINT is msb-1, the inputs must be shifted left
1692 * by one (more), and the remainder must be shifted right by one.
1694 if (a
.frac
< b
.frac
) {
1696 shift128Left(0, a
.frac
, DECOMPOSED_BINARY_POINT
+ 2, &n1
, &n0
);
1698 shift128Left(0, a
.frac
, DECOMPOSED_BINARY_POINT
+ 1, &n1
, &n0
);
1700 q
= udiv_qrnnd(&r
, n1
, n0
, b
.frac
<< 1);
1703 * Set lsb if there is a remainder, to set inexact.
1704 * As mentioned above, to find the actual value of the remainder we
1705 * would need to shift right, but (1) we are only concerned about
1706 * non-zero-ness, and (2) the remainder will always be even because
1707 * both inputs to the division primitive are even.
1709 a
.frac
= q
| (r
!= 0);
1714 /* handle all the NaN cases */
1715 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
1716 return pick_nan(a
, b
, s
);
1718 /* 0/0 or Inf/Inf */
1721 (a
.cls
== float_class_inf
|| a
.cls
== float_class_zero
)) {
1722 s
->float_exception_flags
|= float_flag_invalid
;
1723 return parts_default_nan(s
);
1725 /* Inf / x or 0 / x */
1726 if (a
.cls
== float_class_inf
|| a
.cls
== float_class_zero
) {
1731 if (b
.cls
== float_class_zero
) {
1732 s
->float_exception_flags
|= float_flag_divbyzero
;
1733 a
.cls
= float_class_inf
;
1738 if (b
.cls
== float_class_inf
) {
1739 a
.cls
= float_class_zero
;
1743 g_assert_not_reached();
1746 float16
float16_div(float16 a
, float16 b
, float_status
*status
)
1748 FloatParts pa
= float16_unpack_canonical(a
, status
);
1749 FloatParts pb
= float16_unpack_canonical(b
, status
);
1750 FloatParts pr
= div_floats(pa
, pb
, status
);
1752 return float16_round_pack_canonical(pr
, status
);
1755 static float32 QEMU_SOFTFLOAT_ATTR
1756 soft_f32_div(float32 a
, float32 b
, float_status
*status
)
1758 FloatParts pa
= float32_unpack_canonical(a
, status
);
1759 FloatParts pb
= float32_unpack_canonical(b
, status
);
1760 FloatParts pr
= div_floats(pa
, pb
, status
);
1762 return float32_round_pack_canonical(pr
, status
);
1765 static float64 QEMU_SOFTFLOAT_ATTR
1766 soft_f64_div(float64 a
, float64 b
, float_status
*status
)
1768 FloatParts pa
= float64_unpack_canonical(a
, status
);
1769 FloatParts pb
= float64_unpack_canonical(b
, status
);
1770 FloatParts pr
= div_floats(pa
, pb
, status
);
1772 return float64_round_pack_canonical(pr
, status
);
1775 static float hard_f32_div(float a
, float b
)
1780 static double hard_f64_div(double a
, double b
)
1785 static bool f32_div_pre(union_float32 a
, union_float32 b
)
1787 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
1788 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
1789 fpclassify(b
.h
) == FP_NORMAL
;
1791 return float32_is_zero_or_normal(a
.s
) && float32_is_normal(b
.s
);
1794 static bool f64_div_pre(union_float64 a
, union_float64 b
)
1796 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
1797 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
1798 fpclassify(b
.h
) == FP_NORMAL
;
1800 return float64_is_zero_or_normal(a
.s
) && float64_is_normal(b
.s
);
1803 static bool f32_div_post(union_float32 a
, union_float32 b
)
1805 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
1806 return fpclassify(a
.h
) != FP_ZERO
;
1808 return !float32_is_zero(a
.s
);
1811 static bool f64_div_post(union_float64 a
, union_float64 b
)
1813 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
1814 return fpclassify(a
.h
) != FP_ZERO
;
1816 return !float64_is_zero(a
.s
);
1819 float32 QEMU_FLATTEN
1820 float32_div(float32 a
, float32 b
, float_status
*s
)
1822 return float32_gen2(a
, b
, s
, hard_f32_div
, soft_f32_div
,
1823 f32_div_pre
, f32_div_post
, NULL
, NULL
);
1826 float64 QEMU_FLATTEN
1827 float64_div(float64 a
, float64 b
, float_status
*s
)
1829 return float64_gen2(a
, b
, s
, hard_f64_div
, soft_f64_div
,
1830 f64_div_pre
, f64_div_post
, NULL
, NULL
);
1834 * Float to Float conversions
1836 * Returns the result of converting one float format to another. The
1837 * conversion is performed according to the IEC/IEEE Standard for
1838 * Binary Floating-Point Arithmetic.
1840 * The float_to_float helper only needs to take care of raising
1841 * invalid exceptions and handling the conversion on NaNs.
1844 static FloatParts
float_to_float(FloatParts a
, const FloatFmt
*dstf
,
1847 if (dstf
->arm_althp
) {
1849 case float_class_qnan
:
1850 case float_class_snan
:
1851 /* There is no NaN in the destination format. Raise Invalid
1852 * and return a zero with the sign of the input NaN.
1854 s
->float_exception_flags
|= float_flag_invalid
;
1855 a
.cls
= float_class_zero
;
1860 case float_class_inf
:
1861 /* There is no Inf in the destination format. Raise Invalid
1862 * and return the maximum normal with the correct sign.
1864 s
->float_exception_flags
|= float_flag_invalid
;
1865 a
.cls
= float_class_normal
;
1866 a
.exp
= dstf
->exp_max
;
1867 a
.frac
= ((1ull << dstf
->frac_size
) - 1) << dstf
->frac_shift
;
1873 } else if (is_nan(a
.cls
)) {
1874 if (is_snan(a
.cls
)) {
1875 s
->float_exception_flags
|= float_flag_invalid
;
1876 a
= parts_silence_nan(a
, s
);
1878 if (s
->default_nan_mode
) {
1879 return parts_default_nan(s
);
1885 float32
float16_to_float32(float16 a
, bool ieee
, float_status
*s
)
1887 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
1888 FloatParts p
= float16a_unpack_canonical(a
, s
, fmt16
);
1889 FloatParts pr
= float_to_float(p
, &float32_params
, s
);
1890 return float32_round_pack_canonical(pr
, s
);
1893 float64
float16_to_float64(float16 a
, bool ieee
, float_status
*s
)
1895 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
1896 FloatParts p
= float16a_unpack_canonical(a
, s
, fmt16
);
1897 FloatParts pr
= float_to_float(p
, &float64_params
, s
);
1898 return float64_round_pack_canonical(pr
, s
);
1901 float16
float32_to_float16(float32 a
, bool ieee
, float_status
*s
)
1903 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
1904 FloatParts p
= float32_unpack_canonical(a
, s
);
1905 FloatParts pr
= float_to_float(p
, fmt16
, s
);
1906 return float16a_round_pack_canonical(pr
, s
, fmt16
);
1909 float64
float32_to_float64(float32 a
, float_status
*s
)
1911 FloatParts p
= float32_unpack_canonical(a
, s
);
1912 FloatParts pr
= float_to_float(p
, &float64_params
, s
);
1913 return float64_round_pack_canonical(pr
, s
);
1916 float16
float64_to_float16(float64 a
, bool ieee
, float_status
*s
)
1918 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
1919 FloatParts p
= float64_unpack_canonical(a
, s
);
1920 FloatParts pr
= float_to_float(p
, fmt16
, s
);
1921 return float16a_round_pack_canonical(pr
, s
, fmt16
);
1924 float32
float64_to_float32(float64 a
, float_status
*s
)
1926 FloatParts p
= float64_unpack_canonical(a
, s
);
1927 FloatParts pr
= float_to_float(p
, &float32_params
, s
);
1928 return float32_round_pack_canonical(pr
, s
);
1932 * Rounds the floating-point value `a' to an integer, and returns the
1933 * result as a floating-point value. The operation is performed
1934 * according to the IEC/IEEE Standard for Binary Floating-Point
1938 static FloatParts
round_to_int(FloatParts a
, int rmode
,
1939 int scale
, float_status
*s
)
1942 case float_class_qnan
:
1943 case float_class_snan
:
1944 return return_nan(a
, s
);
1946 case float_class_zero
:
1947 case float_class_inf
:
1948 /* already "integral" */
1951 case float_class_normal
:
1952 scale
= MIN(MAX(scale
, -0x10000), 0x10000);
1955 if (a
.exp
>= DECOMPOSED_BINARY_POINT
) {
1956 /* already integral */
1961 /* all fractional */
1962 s
->float_exception_flags
|= float_flag_inexact
;
1964 case float_round_nearest_even
:
1965 one
= a
.exp
== -1 && a
.frac
> DECOMPOSED_IMPLICIT_BIT
;
1967 case float_round_ties_away
:
1968 one
= a
.exp
== -1 && a
.frac
>= DECOMPOSED_IMPLICIT_BIT
;
1970 case float_round_to_zero
:
1973 case float_round_up
:
1976 case float_round_down
:
1980 g_assert_not_reached();
1984 a
.frac
= DECOMPOSED_IMPLICIT_BIT
;
1987 a
.cls
= float_class_zero
;
1990 uint64_t frac_lsb
= DECOMPOSED_IMPLICIT_BIT
>> a
.exp
;
1991 uint64_t frac_lsbm1
= frac_lsb
>> 1;
1992 uint64_t rnd_even_mask
= (frac_lsb
- 1) | frac_lsb
;
1993 uint64_t rnd_mask
= rnd_even_mask
>> 1;
1997 case float_round_nearest_even
:
1998 inc
= ((a
.frac
& rnd_even_mask
) != frac_lsbm1
? frac_lsbm1
: 0);
2000 case float_round_ties_away
:
2003 case float_round_to_zero
:
2006 case float_round_up
:
2007 inc
= a
.sign
? 0 : rnd_mask
;
2009 case float_round_down
:
2010 inc
= a
.sign
? rnd_mask
: 0;
2013 g_assert_not_reached();
2016 if (a
.frac
& rnd_mask
) {
2017 s
->float_exception_flags
|= float_flag_inexact
;
2019 a
.frac
&= ~rnd_mask
;
2020 if (a
.frac
& DECOMPOSED_OVERFLOW_BIT
) {
2028 g_assert_not_reached();
2033 float16
float16_round_to_int(float16 a
, float_status
*s
)
2035 FloatParts pa
= float16_unpack_canonical(a
, s
);
2036 FloatParts pr
= round_to_int(pa
, s
->float_rounding_mode
, 0, s
);
2037 return float16_round_pack_canonical(pr
, s
);
2040 float32
float32_round_to_int(float32 a
, float_status
*s
)
2042 FloatParts pa
= float32_unpack_canonical(a
, s
);
2043 FloatParts pr
= round_to_int(pa
, s
->float_rounding_mode
, 0, s
);
2044 return float32_round_pack_canonical(pr
, s
);
2047 float64
float64_round_to_int(float64 a
, float_status
*s
)
2049 FloatParts pa
= float64_unpack_canonical(a
, s
);
2050 FloatParts pr
= round_to_int(pa
, s
->float_rounding_mode
, 0, s
);
2051 return float64_round_pack_canonical(pr
, s
);
2055 * Returns the result of converting the floating-point value `a' to
2056 * the two's complement integer format. The conversion is performed
2057 * according to the IEC/IEEE Standard for Binary Floating-Point
2058 * Arithmetic---which means in particular that the conversion is
2059 * rounded according to the current rounding mode. If `a' is a NaN,
2060 * the largest positive integer is returned. Otherwise, if the
2061 * conversion overflows, the largest integer with the same sign as `a'
2065 static int64_t round_to_int_and_pack(FloatParts in
, int rmode
, int scale
,
2066 int64_t min
, int64_t max
,
2070 int orig_flags
= get_float_exception_flags(s
);
2071 FloatParts p
= round_to_int(in
, rmode
, scale
, s
);
2074 case float_class_snan
:
2075 case float_class_qnan
:
2076 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2078 case float_class_inf
:
2079 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2080 return p
.sign
? min
: max
;
2081 case float_class_zero
:
2083 case float_class_normal
:
2084 if (p
.exp
< DECOMPOSED_BINARY_POINT
) {
2085 r
= p
.frac
>> (DECOMPOSED_BINARY_POINT
- p
.exp
);
2086 } else if (p
.exp
- DECOMPOSED_BINARY_POINT
< 2) {
2087 r
= p
.frac
<< (p
.exp
- DECOMPOSED_BINARY_POINT
);
2092 if (r
<= -(uint64_t) min
) {
2095 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2102 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2107 g_assert_not_reached();
2111 int16_t float16_to_int16_scalbn(float16 a
, int rmode
, int scale
,
2114 return round_to_int_and_pack(float16_unpack_canonical(a
, s
),
2115 rmode
, scale
, INT16_MIN
, INT16_MAX
, s
);
2118 int32_t float16_to_int32_scalbn(float16 a
, int rmode
, int scale
,
2121 return round_to_int_and_pack(float16_unpack_canonical(a
, s
),
2122 rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
2125 int64_t float16_to_int64_scalbn(float16 a
, int rmode
, int scale
,
2128 return round_to_int_and_pack(float16_unpack_canonical(a
, s
),
2129 rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
2132 int16_t float32_to_int16_scalbn(float32 a
, int rmode
, int scale
,
2135 return round_to_int_and_pack(float32_unpack_canonical(a
, s
),
2136 rmode
, scale
, INT16_MIN
, INT16_MAX
, s
);
2139 int32_t float32_to_int32_scalbn(float32 a
, int rmode
, int scale
,
2142 return round_to_int_and_pack(float32_unpack_canonical(a
, s
),
2143 rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
2146 int64_t float32_to_int64_scalbn(float32 a
, int rmode
, int scale
,
2149 return round_to_int_and_pack(float32_unpack_canonical(a
, s
),
2150 rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
2153 int16_t float64_to_int16_scalbn(float64 a
, int rmode
, int scale
,
2156 return round_to_int_and_pack(float64_unpack_canonical(a
, s
),
2157 rmode
, scale
, INT16_MIN
, INT16_MAX
, s
);
2160 int32_t float64_to_int32_scalbn(float64 a
, int rmode
, int scale
,
2163 return round_to_int_and_pack(float64_unpack_canonical(a
, s
),
2164 rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
2167 int64_t float64_to_int64_scalbn(float64 a
, int rmode
, int scale
,
2170 return round_to_int_and_pack(float64_unpack_canonical(a
, s
),
2171 rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
2174 int16_t float16_to_int16(float16 a
, float_status
*s
)
2176 return float16_to_int16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2179 int32_t float16_to_int32(float16 a
, float_status
*s
)
2181 return float16_to_int32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2184 int64_t float16_to_int64(float16 a
, float_status
*s
)
2186 return float16_to_int64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2189 int16_t float32_to_int16(float32 a
, float_status
*s
)
2191 return float32_to_int16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2194 int32_t float32_to_int32(float32 a
, float_status
*s
)
2196 return float32_to_int32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2199 int64_t float32_to_int64(float32 a
, float_status
*s
)
2201 return float32_to_int64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2204 int16_t float64_to_int16(float64 a
, float_status
*s
)
2206 return float64_to_int16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2209 int32_t float64_to_int32(float64 a
, float_status
*s
)
2211 return float64_to_int32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2214 int64_t float64_to_int64(float64 a
, float_status
*s
)
2216 return float64_to_int64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2219 int16_t float16_to_int16_round_to_zero(float16 a
, float_status
*s
)
2221 return float16_to_int16_scalbn(a
, float_round_to_zero
, 0, s
);
2224 int32_t float16_to_int32_round_to_zero(float16 a
, float_status
*s
)
2226 return float16_to_int32_scalbn(a
, float_round_to_zero
, 0, s
);
2229 int64_t float16_to_int64_round_to_zero(float16 a
, float_status
*s
)
2231 return float16_to_int64_scalbn(a
, float_round_to_zero
, 0, s
);
2234 int16_t float32_to_int16_round_to_zero(float32 a
, float_status
*s
)
2236 return float32_to_int16_scalbn(a
, float_round_to_zero
, 0, s
);
2239 int32_t float32_to_int32_round_to_zero(float32 a
, float_status
*s
)
2241 return float32_to_int32_scalbn(a
, float_round_to_zero
, 0, s
);
2244 int64_t float32_to_int64_round_to_zero(float32 a
, float_status
*s
)
2246 return float32_to_int64_scalbn(a
, float_round_to_zero
, 0, s
);
2249 int16_t float64_to_int16_round_to_zero(float64 a
, float_status
*s
)
2251 return float64_to_int16_scalbn(a
, float_round_to_zero
, 0, s
);
2254 int32_t float64_to_int32_round_to_zero(float64 a
, float_status
*s
)
2256 return float64_to_int32_scalbn(a
, float_round_to_zero
, 0, s
);
2259 int64_t float64_to_int64_round_to_zero(float64 a
, float_status
*s
)
2261 return float64_to_int64_scalbn(a
, float_round_to_zero
, 0, s
);
2265 * Returns the result of converting the floating-point value `a' to
2266 * the unsigned integer format. The conversion is performed according
2267 * to the IEC/IEEE Standard for Binary Floating-Point
2268 * Arithmetic---which means in particular that the conversion is
2269 * rounded according to the current rounding mode. If `a' is a NaN,
2270 * the largest unsigned integer is returned. Otherwise, if the
2271 * conversion overflows, the largest unsigned integer is returned. If
2272 * the 'a' is negative, the result is rounded and zero is returned;
2273 * values that do not round to zero will raise the inexact exception
2277 static uint64_t round_to_uint_and_pack(FloatParts in
, int rmode
, int scale
,
2278 uint64_t max
, float_status
*s
)
2280 int orig_flags
= get_float_exception_flags(s
);
2281 FloatParts p
= round_to_int(in
, rmode
, scale
, s
);
2285 case float_class_snan
:
2286 case float_class_qnan
:
2287 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2289 case float_class_inf
:
2290 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2291 return p
.sign
? 0 : max
;
2292 case float_class_zero
:
2294 case float_class_normal
:
2296 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2300 if (p
.exp
< DECOMPOSED_BINARY_POINT
) {
2301 r
= p
.frac
>> (DECOMPOSED_BINARY_POINT
- p
.exp
);
2302 } else if (p
.exp
- DECOMPOSED_BINARY_POINT
< 2) {
2303 r
= p
.frac
<< (p
.exp
- DECOMPOSED_BINARY_POINT
);
2305 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2309 /* For uint64 this will never trip, but if p.exp is too large
2310 * to shift a decomposed fraction we shall have exited via the
2314 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2319 g_assert_not_reached();
2323 uint16_t float16_to_uint16_scalbn(float16 a
, int rmode
, int scale
,
2326 return round_to_uint_and_pack(float16_unpack_canonical(a
, s
),
2327 rmode
, scale
, UINT16_MAX
, s
);
2330 uint32_t float16_to_uint32_scalbn(float16 a
, int rmode
, int scale
,
2333 return round_to_uint_and_pack(float16_unpack_canonical(a
, s
),
2334 rmode
, scale
, UINT32_MAX
, s
);
2337 uint64_t float16_to_uint64_scalbn(float16 a
, int rmode
, int scale
,
2340 return round_to_uint_and_pack(float16_unpack_canonical(a
, s
),
2341 rmode
, scale
, UINT64_MAX
, s
);
2344 uint16_t float32_to_uint16_scalbn(float32 a
, int rmode
, int scale
,
2347 return round_to_uint_and_pack(float32_unpack_canonical(a
, s
),
2348 rmode
, scale
, UINT16_MAX
, s
);
2351 uint32_t float32_to_uint32_scalbn(float32 a
, int rmode
, int scale
,
2354 return round_to_uint_and_pack(float32_unpack_canonical(a
, s
),
2355 rmode
, scale
, UINT32_MAX
, s
);
2358 uint64_t float32_to_uint64_scalbn(float32 a
, int rmode
, int scale
,
2361 return round_to_uint_and_pack(float32_unpack_canonical(a
, s
),
2362 rmode
, scale
, UINT64_MAX
, s
);
2365 uint16_t float64_to_uint16_scalbn(float64 a
, int rmode
, int scale
,
2368 return round_to_uint_and_pack(float64_unpack_canonical(a
, s
),
2369 rmode
, scale
, UINT16_MAX
, s
);
2372 uint32_t float64_to_uint32_scalbn(float64 a
, int rmode
, int scale
,
2375 return round_to_uint_and_pack(float64_unpack_canonical(a
, s
),
2376 rmode
, scale
, UINT32_MAX
, s
);
2379 uint64_t float64_to_uint64_scalbn(float64 a
, int rmode
, int scale
,
2382 return round_to_uint_and_pack(float64_unpack_canonical(a
, s
),
2383 rmode
, scale
, UINT64_MAX
, s
);
2386 uint16_t float16_to_uint16(float16 a
, float_status
*s
)
2388 return float16_to_uint16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2391 uint32_t float16_to_uint32(float16 a
, float_status
*s
)
2393 return float16_to_uint32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2396 uint64_t float16_to_uint64(float16 a
, float_status
*s
)
2398 return float16_to_uint64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2401 uint16_t float32_to_uint16(float32 a
, float_status
*s
)
2403 return float32_to_uint16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2406 uint32_t float32_to_uint32(float32 a
, float_status
*s
)
2408 return float32_to_uint32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2411 uint64_t float32_to_uint64(float32 a
, float_status
*s
)
2413 return float32_to_uint64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2416 uint16_t float64_to_uint16(float64 a
, float_status
*s
)
2418 return float64_to_uint16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2421 uint32_t float64_to_uint32(float64 a
, float_status
*s
)
2423 return float64_to_uint32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2426 uint64_t float64_to_uint64(float64 a
, float_status
*s
)
2428 return float64_to_uint64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2431 uint16_t float16_to_uint16_round_to_zero(float16 a
, float_status
*s
)
2433 return float16_to_uint16_scalbn(a
, float_round_to_zero
, 0, s
);
2436 uint32_t float16_to_uint32_round_to_zero(float16 a
, float_status
*s
)
2438 return float16_to_uint32_scalbn(a
, float_round_to_zero
, 0, s
);
2441 uint64_t float16_to_uint64_round_to_zero(float16 a
, float_status
*s
)
2443 return float16_to_uint64_scalbn(a
, float_round_to_zero
, 0, s
);
2446 uint16_t float32_to_uint16_round_to_zero(float32 a
, float_status
*s
)
2448 return float32_to_uint16_scalbn(a
, float_round_to_zero
, 0, s
);
2451 uint32_t float32_to_uint32_round_to_zero(float32 a
, float_status
*s
)
2453 return float32_to_uint32_scalbn(a
, float_round_to_zero
, 0, s
);
2456 uint64_t float32_to_uint64_round_to_zero(float32 a
, float_status
*s
)
2458 return float32_to_uint64_scalbn(a
, float_round_to_zero
, 0, s
);
2461 uint16_t float64_to_uint16_round_to_zero(float64 a
, float_status
*s
)
2463 return float64_to_uint16_scalbn(a
, float_round_to_zero
, 0, s
);
2466 uint32_t float64_to_uint32_round_to_zero(float64 a
, float_status
*s
)
2468 return float64_to_uint32_scalbn(a
, float_round_to_zero
, 0, s
);
2471 uint64_t float64_to_uint64_round_to_zero(float64 a
, float_status
*s
)
2473 return float64_to_uint64_scalbn(a
, float_round_to_zero
, 0, s
);
2477 * Integer to float conversions
2479 * Returns the result of converting the two's complement integer `a'
2480 * to the floating-point format. The conversion is performed according
2481 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2484 static FloatParts
int_to_float(int64_t a
, int scale
, float_status
*status
)
2486 FloatParts r
= { .sign
= false };
2489 r
.cls
= float_class_zero
;
2494 r
.cls
= float_class_normal
;
2499 shift
= clz64(f
) - 1;
2500 scale
= MIN(MAX(scale
, -0x10000), 0x10000);
2502 r
.exp
= DECOMPOSED_BINARY_POINT
- shift
+ scale
;
2503 r
.frac
= (shift
< 0 ? DECOMPOSED_IMPLICIT_BIT
: f
<< shift
);
2509 float16
int64_to_float16_scalbn(int64_t a
, int scale
, float_status
*status
)
2511 FloatParts pa
= int_to_float(a
, scale
, status
);
2512 return float16_round_pack_canonical(pa
, status
);
2515 float16
int32_to_float16_scalbn(int32_t a
, int scale
, float_status
*status
)
2517 return int64_to_float16_scalbn(a
, scale
, status
);
2520 float16
int16_to_float16_scalbn(int16_t a
, int scale
, float_status
*status
)
2522 return int64_to_float16_scalbn(a
, scale
, status
);
2525 float16
int64_to_float16(int64_t a
, float_status
*status
)
2527 return int64_to_float16_scalbn(a
, 0, status
);
2530 float16
int32_to_float16(int32_t a
, float_status
*status
)
2532 return int64_to_float16_scalbn(a
, 0, status
);
2535 float16
int16_to_float16(int16_t a
, float_status
*status
)
2537 return int64_to_float16_scalbn(a
, 0, status
);
2540 float32
int64_to_float32_scalbn(int64_t a
, int scale
, float_status
*status
)
2542 FloatParts pa
= int_to_float(a
, scale
, status
);
2543 return float32_round_pack_canonical(pa
, status
);
2546 float32
int32_to_float32_scalbn(int32_t a
, int scale
, float_status
*status
)
2548 return int64_to_float32_scalbn(a
, scale
, status
);
2551 float32
int16_to_float32_scalbn(int16_t a
, int scale
, float_status
*status
)
2553 return int64_to_float32_scalbn(a
, scale
, status
);
2556 float32
int64_to_float32(int64_t a
, float_status
*status
)
2558 return int64_to_float32_scalbn(a
, 0, status
);
2561 float32
int32_to_float32(int32_t a
, float_status
*status
)
2563 return int64_to_float32_scalbn(a
, 0, status
);
2566 float32
int16_to_float32(int16_t a
, float_status
*status
)
2568 return int64_to_float32_scalbn(a
, 0, status
);
2571 float64
int64_to_float64_scalbn(int64_t a
, int scale
, float_status
*status
)
2573 FloatParts pa
= int_to_float(a
, scale
, status
);
2574 return float64_round_pack_canonical(pa
, status
);
2577 float64
int32_to_float64_scalbn(int32_t a
, int scale
, float_status
*status
)
2579 return int64_to_float64_scalbn(a
, scale
, status
);
2582 float64
int16_to_float64_scalbn(int16_t a
, int scale
, float_status
*status
)
2584 return int64_to_float64_scalbn(a
, scale
, status
);
2587 float64
int64_to_float64(int64_t a
, float_status
*status
)
2589 return int64_to_float64_scalbn(a
, 0, status
);
2592 float64
int32_to_float64(int32_t a
, float_status
*status
)
2594 return int64_to_float64_scalbn(a
, 0, status
);
2597 float64
int16_to_float64(int16_t a
, float_status
*status
)
2599 return int64_to_float64_scalbn(a
, 0, status
);
2604 * Unsigned Integer to float conversions
2606 * Returns the result of converting the unsigned integer `a' to the
2607 * floating-point format. The conversion is performed according to the
2608 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2611 static FloatParts
uint_to_float(uint64_t a
, int scale
, float_status
*status
)
2613 FloatParts r
= { .sign
= false };
2616 r
.cls
= float_class_zero
;
2618 scale
= MIN(MAX(scale
, -0x10000), 0x10000);
2619 r
.cls
= float_class_normal
;
2620 if ((int64_t)a
< 0) {
2621 r
.exp
= DECOMPOSED_BINARY_POINT
+ 1 + scale
;
2622 shift64RightJamming(a
, 1, &a
);
2625 int shift
= clz64(a
) - 1;
2626 r
.exp
= DECOMPOSED_BINARY_POINT
- shift
+ scale
;
2627 r
.frac
= a
<< shift
;
2634 float16
uint64_to_float16_scalbn(uint64_t a
, int scale
, float_status
*status
)
2636 FloatParts pa
= uint_to_float(a
, scale
, status
);
2637 return float16_round_pack_canonical(pa
, status
);
2640 float16
uint32_to_float16_scalbn(uint32_t a
, int scale
, float_status
*status
)
2642 return uint64_to_float16_scalbn(a
, scale
, status
);
2645 float16
uint16_to_float16_scalbn(uint16_t a
, int scale
, float_status
*status
)
2647 return uint64_to_float16_scalbn(a
, scale
, status
);
2650 float16
uint64_to_float16(uint64_t a
, float_status
*status
)
2652 return uint64_to_float16_scalbn(a
, 0, status
);
2655 float16
uint32_to_float16(uint32_t a
, float_status
*status
)
2657 return uint64_to_float16_scalbn(a
, 0, status
);
2660 float16
uint16_to_float16(uint16_t a
, float_status
*status
)
2662 return uint64_to_float16_scalbn(a
, 0, status
);
2665 float32
uint64_to_float32_scalbn(uint64_t a
, int scale
, float_status
*status
)
2667 FloatParts pa
= uint_to_float(a
, scale
, status
);
2668 return float32_round_pack_canonical(pa
, status
);
2671 float32
uint32_to_float32_scalbn(uint32_t a
, int scale
, float_status
*status
)
2673 return uint64_to_float32_scalbn(a
, scale
, status
);
2676 float32
uint16_to_float32_scalbn(uint16_t a
, int scale
, float_status
*status
)
2678 return uint64_to_float32_scalbn(a
, scale
, status
);
2681 float32
uint64_to_float32(uint64_t a
, float_status
*status
)
2683 return uint64_to_float32_scalbn(a
, 0, status
);
2686 float32
uint32_to_float32(uint32_t a
, float_status
*status
)
2688 return uint64_to_float32_scalbn(a
, 0, status
);
2691 float32
uint16_to_float32(uint16_t a
, float_status
*status
)
2693 return uint64_to_float32_scalbn(a
, 0, status
);
2696 float64
uint64_to_float64_scalbn(uint64_t a
, int scale
, float_status
*status
)
2698 FloatParts pa
= uint_to_float(a
, scale
, status
);
2699 return float64_round_pack_canonical(pa
, status
);
2702 float64
uint32_to_float64_scalbn(uint32_t a
, int scale
, float_status
*status
)
2704 return uint64_to_float64_scalbn(a
, scale
, status
);
2707 float64
uint16_to_float64_scalbn(uint16_t a
, int scale
, float_status
*status
)
2709 return uint64_to_float64_scalbn(a
, scale
, status
);
2712 float64
uint64_to_float64(uint64_t a
, float_status
*status
)
2714 return uint64_to_float64_scalbn(a
, 0, status
);
2717 float64
uint32_to_float64(uint32_t a
, float_status
*status
)
2719 return uint64_to_float64_scalbn(a
, 0, status
);
2722 float64
uint16_to_float64(uint16_t a
, float_status
*status
)
2724 return uint64_to_float64_scalbn(a
, 0, status
);
2728 /* min() and max() functions. These can't be implemented as
2729 * 'compare and pick one input' because that would mishandle
2730 * NaNs and +0 vs -0.
2732 * minnum() and maxnum() functions. These are similar to the min()
2733 * and max() functions but if one of the arguments is a QNaN and
2734 * the other is numerical then the numerical argument is returned.
2735 * SNaNs will get quietened before being returned.
2736 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
2737 * and maxNum() operations. min() and max() are the typical min/max
2738 * semantics provided by many CPUs which predate that specification.
2740 * minnummag() and maxnummag() functions correspond to minNumMag()
2741 * and minNumMag() from the IEEE-754 2008.
2743 static FloatParts
minmax_floats(FloatParts a
, FloatParts b
, bool ismin
,
2744 bool ieee
, bool ismag
, float_status
*s
)
2746 if (unlikely(is_nan(a
.cls
) || is_nan(b
.cls
))) {
2748 /* Takes two floating-point values `a' and `b', one of
2749 * which is a NaN, and returns the appropriate NaN
2750 * result. If either `a' or `b' is a signaling NaN,
2751 * the invalid exception is raised.
2753 if (is_snan(a
.cls
) || is_snan(b
.cls
)) {
2754 return pick_nan(a
, b
, s
);
2755 } else if (is_nan(a
.cls
) && !is_nan(b
.cls
)) {
2757 } else if (is_nan(b
.cls
) && !is_nan(a
.cls
)) {
2761 return pick_nan(a
, b
, s
);
2766 case float_class_normal
:
2769 case float_class_inf
:
2772 case float_class_zero
:
2776 g_assert_not_reached();
2780 case float_class_normal
:
2783 case float_class_inf
:
2786 case float_class_zero
:
2790 g_assert_not_reached();
2794 if (ismag
&& (a_exp
!= b_exp
|| a
.frac
!= b
.frac
)) {
2795 bool a_less
= a_exp
< b_exp
;
2796 if (a_exp
== b_exp
) {
2797 a_less
= a
.frac
< b
.frac
;
2799 return a_less
^ ismin
? b
: a
;
2802 if (a
.sign
== b
.sign
) {
2803 bool a_less
= a_exp
< b_exp
;
2804 if (a_exp
== b_exp
) {
2805 a_less
= a
.frac
< b
.frac
;
2807 return a
.sign
^ a_less
^ ismin
? b
: a
;
2809 return a
.sign
^ ismin
? b
: a
;
2814 #define MINMAX(sz, name, ismin, isiee, ismag) \
2815 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \
2818 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
2819 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
2820 FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \
2822 return float ## sz ## _round_pack_canonical(pr, s); \
2825 MINMAX(16, min
, true, false, false)
2826 MINMAX(16, minnum
, true, true, false)
2827 MINMAX(16, minnummag
, true, true, true)
2828 MINMAX(16, max
, false, false, false)
2829 MINMAX(16, maxnum
, false, true, false)
2830 MINMAX(16, maxnummag
, false, true, true)
2832 MINMAX(32, min
, true, false, false)
2833 MINMAX(32, minnum
, true, true, false)
2834 MINMAX(32, minnummag
, true, true, true)
2835 MINMAX(32, max
, false, false, false)
2836 MINMAX(32, maxnum
, false, true, false)
2837 MINMAX(32, maxnummag
, false, true, true)
2839 MINMAX(64, min
, true, false, false)
2840 MINMAX(64, minnum
, true, true, false)
2841 MINMAX(64, minnummag
, true, true, true)
2842 MINMAX(64, max
, false, false, false)
2843 MINMAX(64, maxnum
, false, true, false)
2844 MINMAX(64, maxnummag
, false, true, true)
2848 /* Floating point compare */
2849 static int compare_floats(FloatParts a
, FloatParts b
, bool is_quiet
,
2852 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
2854 a
.cls
== float_class_snan
||
2855 b
.cls
== float_class_snan
) {
2856 s
->float_exception_flags
|= float_flag_invalid
;
2858 return float_relation_unordered
;
2861 if (a
.cls
== float_class_zero
) {
2862 if (b
.cls
== float_class_zero
) {
2863 return float_relation_equal
;
2865 return b
.sign
? float_relation_greater
: float_relation_less
;
2866 } else if (b
.cls
== float_class_zero
) {
2867 return a
.sign
? float_relation_less
: float_relation_greater
;
2870 /* The only really important thing about infinity is its sign. If
2871 * both are infinities the sign marks the smallest of the two.
2873 if (a
.cls
== float_class_inf
) {
2874 if ((b
.cls
== float_class_inf
) && (a
.sign
== b
.sign
)) {
2875 return float_relation_equal
;
2877 return a
.sign
? float_relation_less
: float_relation_greater
;
2878 } else if (b
.cls
== float_class_inf
) {
2879 return b
.sign
? float_relation_greater
: float_relation_less
;
2882 if (a
.sign
!= b
.sign
) {
2883 return a
.sign
? float_relation_less
: float_relation_greater
;
2886 if (a
.exp
== b
.exp
) {
2887 if (a
.frac
== b
.frac
) {
2888 return float_relation_equal
;
2891 return a
.frac
> b
.frac
?
2892 float_relation_less
: float_relation_greater
;
2894 return a
.frac
> b
.frac
?
2895 float_relation_greater
: float_relation_less
;
2899 return a
.exp
> b
.exp
? float_relation_less
: float_relation_greater
;
2901 return a
.exp
> b
.exp
? float_relation_greater
: float_relation_less
;
2906 #define COMPARE(name, attr, sz) \
2908 name(float ## sz a, float ## sz b, bool is_quiet, float_status *s) \
2910 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
2911 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
2912 return compare_floats(pa, pb, is_quiet, s); \
2915 COMPARE(soft_f16_compare
, QEMU_FLATTEN
, 16)
2916 COMPARE(soft_f32_compare
, QEMU_SOFTFLOAT_ATTR
, 32)
2917 COMPARE(soft_f64_compare
, QEMU_SOFTFLOAT_ATTR
, 64)
2921 int float16_compare(float16 a
, float16 b
, float_status
*s
)
2923 return soft_f16_compare(a
, b
, false, s
);
2926 int float16_compare_quiet(float16 a
, float16 b
, float_status
*s
)
2928 return soft_f16_compare(a
, b
, true, s
);
2931 static int QEMU_FLATTEN
2932 f32_compare(float32 xa
, float32 xb
, bool is_quiet
, float_status
*s
)
2934 union_float32 ua
, ub
;
2939 if (QEMU_NO_HARDFLOAT
) {
2943 float32_input_flush2(&ua
.s
, &ub
.s
, s
);
2944 if (isgreaterequal(ua
.h
, ub
.h
)) {
2945 if (isgreater(ua
.h
, ub
.h
)) {
2946 return float_relation_greater
;
2948 return float_relation_equal
;
2950 if (likely(isless(ua
.h
, ub
.h
))) {
2951 return float_relation_less
;
2953 /* The only condition remaining is unordered.
2954 * Fall through to set flags.
2957 return soft_f32_compare(ua
.s
, ub
.s
, is_quiet
, s
);
2960 int float32_compare(float32 a
, float32 b
, float_status
*s
)
2962 return f32_compare(a
, b
, false, s
);
2965 int float32_compare_quiet(float32 a
, float32 b
, float_status
*s
)
2967 return f32_compare(a
, b
, true, s
);
2970 static int QEMU_FLATTEN
2971 f64_compare(float64 xa
, float64 xb
, bool is_quiet
, float_status
*s
)
2973 union_float64 ua
, ub
;
2978 if (QEMU_NO_HARDFLOAT
) {
2982 float64_input_flush2(&ua
.s
, &ub
.s
, s
);
2983 if (isgreaterequal(ua
.h
, ub
.h
)) {
2984 if (isgreater(ua
.h
, ub
.h
)) {
2985 return float_relation_greater
;
2987 return float_relation_equal
;
2989 if (likely(isless(ua
.h
, ub
.h
))) {
2990 return float_relation_less
;
2992 /* The only condition remaining is unordered.
2993 * Fall through to set flags.
2996 return soft_f64_compare(ua
.s
, ub
.s
, is_quiet
, s
);
2999 int float64_compare(float64 a
, float64 b
, float_status
*s
)
3001 return f64_compare(a
, b
, false, s
);
3004 int float64_compare_quiet(float64 a
, float64 b
, float_status
*s
)
3006 return f64_compare(a
, b
, true, s
);
3009 /* Multiply A by 2 raised to the power N. */
3010 static FloatParts
scalbn_decomposed(FloatParts a
, int n
, float_status
*s
)
3012 if (unlikely(is_nan(a
.cls
))) {
3013 return return_nan(a
, s
);
3015 if (a
.cls
== float_class_normal
) {
3016 /* The largest float type (even though not supported by FloatParts)
3017 * is float128, which has a 15 bit exponent. Bounding N to 16 bits
3018 * still allows rounding to infinity, without allowing overflow
3019 * within the int32_t that backs FloatParts.exp.
3021 n
= MIN(MAX(n
, -0x10000), 0x10000);
3027 float16
float16_scalbn(float16 a
, int n
, float_status
*status
)
3029 FloatParts pa
= float16_unpack_canonical(a
, status
);
3030 FloatParts pr
= scalbn_decomposed(pa
, n
, status
);
3031 return float16_round_pack_canonical(pr
, status
);
3034 float32
float32_scalbn(float32 a
, int n
, float_status
*status
)
3036 FloatParts pa
= float32_unpack_canonical(a
, status
);
3037 FloatParts pr
= scalbn_decomposed(pa
, n
, status
);
3038 return float32_round_pack_canonical(pr
, status
);
3041 float64
float64_scalbn(float64 a
, int n
, float_status
*status
)
3043 FloatParts pa
= float64_unpack_canonical(a
, status
);
3044 FloatParts pr
= scalbn_decomposed(pa
, n
, status
);
3045 return float64_round_pack_canonical(pr
, status
);
3051 * The old softfloat code did an approximation step before zeroing in
3052 * on the final result. However for simpleness we just compute the
3053 * square root by iterating down from the implicit bit to enough extra
3054 * bits to ensure we get a correctly rounded result.
3056 * This does mean however the calculation is slower than before,
3057 * especially for 64 bit floats.
3060 static FloatParts
sqrt_float(FloatParts a
, float_status
*s
, const FloatFmt
*p
)
3062 uint64_t a_frac
, r_frac
, s_frac
;
3065 if (is_nan(a
.cls
)) {
3066 return return_nan(a
, s
);
3068 if (a
.cls
== float_class_zero
) {
3069 return a
; /* sqrt(+-0) = +-0 */
3072 s
->float_exception_flags
|= float_flag_invalid
;
3073 return parts_default_nan(s
);
3075 if (a
.cls
== float_class_inf
) {
3076 return a
; /* sqrt(+inf) = +inf */
3079 assert(a
.cls
== float_class_normal
);
3081 /* We need two overflow bits at the top. Adding room for that is a
3082 * right shift. If the exponent is odd, we can discard the low bit
3083 * by multiplying the fraction by 2; that's a left shift. Combine
3084 * those and we shift right if the exponent is even.
3092 /* Bit-by-bit computation of sqrt. */
3096 /* Iterate from implicit bit down to the 3 extra bits to compute a
3097 * properly rounded result. Remember we've inserted one more bit
3098 * at the top, so these positions are one less.
3100 bit
= DECOMPOSED_BINARY_POINT
- 1;
3101 last_bit
= MAX(p
->frac_shift
- 4, 0);
3103 uint64_t q
= 1ULL << bit
;
3104 uint64_t t_frac
= s_frac
+ q
;
3105 if (t_frac
<= a_frac
) {
3106 s_frac
= t_frac
+ q
;
3111 } while (--bit
>= last_bit
);
3113 /* Undo the right shift done above. If there is any remaining
3114 * fraction, the result is inexact. Set the sticky bit.
3116 a
.frac
= (r_frac
<< 1) + (a_frac
!= 0);
3121 float16 QEMU_FLATTEN
float16_sqrt(float16 a
, float_status
*status
)
3123 FloatParts pa
= float16_unpack_canonical(a
, status
);
3124 FloatParts pr
= sqrt_float(pa
, status
, &float16_params
);
3125 return float16_round_pack_canonical(pr
, status
);
3128 static float32 QEMU_SOFTFLOAT_ATTR
3129 soft_f32_sqrt(float32 a
, float_status
*status
)
3131 FloatParts pa
= float32_unpack_canonical(a
, status
);
3132 FloatParts pr
= sqrt_float(pa
, status
, &float32_params
);
3133 return float32_round_pack_canonical(pr
, status
);
3136 static float64 QEMU_SOFTFLOAT_ATTR
3137 soft_f64_sqrt(float64 a
, float_status
*status
)
3139 FloatParts pa
= float64_unpack_canonical(a
, status
);
3140 FloatParts pr
= sqrt_float(pa
, status
, &float64_params
);
3141 return float64_round_pack_canonical(pr
, status
);
3144 float32 QEMU_FLATTEN
float32_sqrt(float32 xa
, float_status
*s
)
3146 union_float32 ua
, ur
;
3149 if (unlikely(!can_use_fpu(s
))) {
3153 float32_input_flush1(&ua
.s
, s
);
3154 if (QEMU_HARDFLOAT_1F32_USE_FP
) {
3155 if (unlikely(!(fpclassify(ua
.h
) == FP_NORMAL
||
3156 fpclassify(ua
.h
) == FP_ZERO
) ||
3160 } else if (unlikely(!float32_is_zero_or_normal(ua
.s
) ||
3161 float32_is_neg(ua
.s
))) {
3168 return soft_f32_sqrt(ua
.s
, s
);
3171 float64 QEMU_FLATTEN
float64_sqrt(float64 xa
, float_status
*s
)
3173 union_float64 ua
, ur
;
3176 if (unlikely(!can_use_fpu(s
))) {
3180 float64_input_flush1(&ua
.s
, s
);
3181 if (QEMU_HARDFLOAT_1F64_USE_FP
) {
3182 if (unlikely(!(fpclassify(ua
.h
) == FP_NORMAL
||
3183 fpclassify(ua
.h
) == FP_ZERO
) ||
3187 } else if (unlikely(!float64_is_zero_or_normal(ua
.s
) ||
3188 float64_is_neg(ua
.s
))) {
3195 return soft_f64_sqrt(ua
.s
, s
);
3198 /*----------------------------------------------------------------------------
3199 | The pattern for a default generated NaN.
3200 *----------------------------------------------------------------------------*/
3202 float16
float16_default_nan(float_status
*status
)
3204 FloatParts p
= parts_default_nan(status
);
3205 p
.frac
>>= float16_params
.frac_shift
;
3206 return float16_pack_raw(p
);
3209 float32
float32_default_nan(float_status
*status
)
3211 FloatParts p
= parts_default_nan(status
);
3212 p
.frac
>>= float32_params
.frac_shift
;
3213 return float32_pack_raw(p
);
3216 float64
float64_default_nan(float_status
*status
)
3218 FloatParts p
= parts_default_nan(status
);
3219 p
.frac
>>= float64_params
.frac_shift
;
3220 return float64_pack_raw(p
);
3223 float128
float128_default_nan(float_status
*status
)
3225 FloatParts p
= parts_default_nan(status
);
3228 /* Extrapolate from the choices made by parts_default_nan to fill
3229 * in the quad-floating format. If the low bit is set, assume we
3230 * want to set all non-snan bits.
3232 r
.low
= -(p
.frac
& 1);
3233 r
.high
= p
.frac
>> (DECOMPOSED_BINARY_POINT
- 48);
3234 r
.high
|= LIT64(0x7FFF000000000000);
3235 r
.high
|= (uint64_t)p
.sign
<< 63;
3240 /*----------------------------------------------------------------------------
3241 | Returns a quiet NaN from a signalling NaN for the floating point value `a'.
3242 *----------------------------------------------------------------------------*/
3244 float16
float16_silence_nan(float16 a
, float_status
*status
)
3246 FloatParts p
= float16_unpack_raw(a
);
3247 p
.frac
<<= float16_params
.frac_shift
;
3248 p
= parts_silence_nan(p
, status
);
3249 p
.frac
>>= float16_params
.frac_shift
;
3250 return float16_pack_raw(p
);
3253 float32
float32_silence_nan(float32 a
, float_status
*status
)
3255 FloatParts p
= float32_unpack_raw(a
);
3256 p
.frac
<<= float32_params
.frac_shift
;
3257 p
= parts_silence_nan(p
, status
);
3258 p
.frac
>>= float32_params
.frac_shift
;
3259 return float32_pack_raw(p
);
3262 float64
float64_silence_nan(float64 a
, float_status
*status
)
3264 FloatParts p
= float64_unpack_raw(a
);
3265 p
.frac
<<= float64_params
.frac_shift
;
3266 p
= parts_silence_nan(p
, status
);
3267 p
.frac
>>= float64_params
.frac_shift
;
3268 return float64_pack_raw(p
);
3271 /*----------------------------------------------------------------------------
3272 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
3273 | and 7, and returns the properly rounded 32-bit integer corresponding to the
3274 | input. If `zSign' is 1, the input is negated before being converted to an
3275 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
3276 | is simply rounded to an integer, with the inexact exception raised if the
3277 | input cannot be represented exactly as an integer. However, if the fixed-
3278 | point input is too large, the invalid exception is raised and the largest
3279 | positive or negative integer is returned.
3280 *----------------------------------------------------------------------------*/
3282 static int32_t roundAndPackInt32(flag zSign
, uint64_t absZ
, float_status
*status
)
3284 int8_t roundingMode
;
3285 flag roundNearestEven
;
3286 int8_t roundIncrement
, roundBits
;
3289 roundingMode
= status
->float_rounding_mode
;
3290 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
3291 switch (roundingMode
) {
3292 case float_round_nearest_even
:
3293 case float_round_ties_away
:
3294 roundIncrement
= 0x40;
3296 case float_round_to_zero
:
3299 case float_round_up
:
3300 roundIncrement
= zSign
? 0 : 0x7f;
3302 case float_round_down
:
3303 roundIncrement
= zSign
? 0x7f : 0;
3308 roundBits
= absZ
& 0x7F;
3309 absZ
= ( absZ
+ roundIncrement
)>>7;
3310 absZ
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
3312 if ( zSign
) z
= - z
;
3313 if ( ( absZ
>>32 ) || ( z
&& ( ( z
< 0 ) ^ zSign
) ) ) {
3314 float_raise(float_flag_invalid
, status
);
3315 return zSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
3318 status
->float_exception_flags
|= float_flag_inexact
;
3324 /*----------------------------------------------------------------------------
3325 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3326 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3327 | and returns the properly rounded 64-bit integer corresponding to the input.
3328 | If `zSign' is 1, the input is negated before being converted to an integer.
3329 | Ordinarily, the fixed-point input is simply rounded to an integer, with
3330 | the inexact exception raised if the input cannot be represented exactly as
3331 | an integer. However, if the fixed-point input is too large, the invalid
3332 | exception is raised and the largest positive or negative integer is
3334 *----------------------------------------------------------------------------*/
3336 static int64_t roundAndPackInt64(flag zSign
, uint64_t absZ0
, uint64_t absZ1
,
3337 float_status
*status
)
3339 int8_t roundingMode
;
3340 flag roundNearestEven
, increment
;
3343 roundingMode
= status
->float_rounding_mode
;
3344 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
3345 switch (roundingMode
) {
3346 case float_round_nearest_even
:
3347 case float_round_ties_away
:
3348 increment
= ((int64_t) absZ1
< 0);
3350 case float_round_to_zero
:
3353 case float_round_up
:
3354 increment
= !zSign
&& absZ1
;
3356 case float_round_down
:
3357 increment
= zSign
&& absZ1
;
3364 if ( absZ0
== 0 ) goto overflow
;
3365 absZ0
&= ~ ( ( (uint64_t) ( absZ1
<<1 ) == 0 ) & roundNearestEven
);
3368 if ( zSign
) z
= - z
;
3369 if ( z
&& ( ( z
< 0 ) ^ zSign
) ) {
3371 float_raise(float_flag_invalid
, status
);
3373 zSign
? (int64_t) LIT64( 0x8000000000000000 )
3374 : LIT64( 0x7FFFFFFFFFFFFFFF );
3377 status
->float_exception_flags
|= float_flag_inexact
;
3383 /*----------------------------------------------------------------------------
3384 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3385 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3386 | and returns the properly rounded 64-bit unsigned integer corresponding to the
3387 | input. Ordinarily, the fixed-point input is simply rounded to an integer,
3388 | with the inexact exception raised if the input cannot be represented exactly
3389 | as an integer. However, if the fixed-point input is too large, the invalid
3390 | exception is raised and the largest unsigned integer is returned.
3391 *----------------------------------------------------------------------------*/
3393 static int64_t roundAndPackUint64(flag zSign
, uint64_t absZ0
,
3394 uint64_t absZ1
, float_status
*status
)
3396 int8_t roundingMode
;
3397 flag roundNearestEven
, increment
;
3399 roundingMode
= status
->float_rounding_mode
;
3400 roundNearestEven
= (roundingMode
== float_round_nearest_even
);
3401 switch (roundingMode
) {
3402 case float_round_nearest_even
:
3403 case float_round_ties_away
:
3404 increment
= ((int64_t)absZ1
< 0);
3406 case float_round_to_zero
:
3409 case float_round_up
:
3410 increment
= !zSign
&& absZ1
;
3412 case float_round_down
:
3413 increment
= zSign
&& absZ1
;
3421 float_raise(float_flag_invalid
, status
);
3422 return LIT64(0xFFFFFFFFFFFFFFFF);
3424 absZ0
&= ~(((uint64_t)(absZ1
<<1) == 0) & roundNearestEven
);
3427 if (zSign
&& absZ0
) {
3428 float_raise(float_flag_invalid
, status
);
3433 status
->float_exception_flags
|= float_flag_inexact
;
3438 /*----------------------------------------------------------------------------
3439 | If `a' is denormal and we are in flush-to-zero mode then set the
3440 | input-denormal exception and return zero. Otherwise just return the value.
3441 *----------------------------------------------------------------------------*/
3442 float32
float32_squash_input_denormal(float32 a
, float_status
*status
)
3444 if (status
->flush_inputs_to_zero
) {
3445 if (extractFloat32Exp(a
) == 0 && extractFloat32Frac(a
) != 0) {
3446 float_raise(float_flag_input_denormal
, status
);
3447 return make_float32(float32_val(a
) & 0x80000000);
3453 /*----------------------------------------------------------------------------
3454 | Normalizes the subnormal single-precision floating-point value represented
3455 | by the denormalized significand `aSig'. The normalized exponent and
3456 | significand are stored at the locations pointed to by `zExpPtr' and
3457 | `zSigPtr', respectively.
3458 *----------------------------------------------------------------------------*/
3461 normalizeFloat32Subnormal(uint32_t aSig
, int *zExpPtr
, uint32_t *zSigPtr
)
3465 shiftCount
= clz32(aSig
) - 8;
3466 *zSigPtr
= aSig
<<shiftCount
;
3467 *zExpPtr
= 1 - shiftCount
;
3471 /*----------------------------------------------------------------------------
3472 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3473 | and significand `zSig', and returns the proper single-precision floating-
3474 | point value corresponding to the abstract input. Ordinarily, the abstract
3475 | value is simply rounded and packed into the single-precision format, with
3476 | the inexact exception raised if the abstract input cannot be represented
3477 | exactly. However, if the abstract value is too large, the overflow and
3478 | inexact exceptions are raised and an infinity or maximal finite value is
3479 | returned. If the abstract value is too small, the input value is rounded to
3480 | a subnormal number, and the underflow and inexact exceptions are raised if
3481 | the abstract input cannot be represented exactly as a subnormal single-
3482 | precision floating-point number.
3483 | The input significand `zSig' has its binary point between bits 30
3484 | and 29, which is 7 bits to the left of the usual location. This shifted
3485 | significand must be normalized or smaller. If `zSig' is not normalized,
3486 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3487 | and it must not require rounding. In the usual case that `zSig' is
3488 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3489 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3490 | Binary Floating-Point Arithmetic.
3491 *----------------------------------------------------------------------------*/
3493 static float32
roundAndPackFloat32(flag zSign
, int zExp
, uint32_t zSig
,
3494 float_status
*status
)
3496 int8_t roundingMode
;
3497 flag roundNearestEven
;
3498 int8_t roundIncrement
, roundBits
;
3501 roundingMode
= status
->float_rounding_mode
;
3502 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
3503 switch (roundingMode
) {
3504 case float_round_nearest_even
:
3505 case float_round_ties_away
:
3506 roundIncrement
= 0x40;
3508 case float_round_to_zero
:
3511 case float_round_up
:
3512 roundIncrement
= zSign
? 0 : 0x7f;
3514 case float_round_down
:
3515 roundIncrement
= zSign
? 0x7f : 0;
3521 roundBits
= zSig
& 0x7F;
3522 if ( 0xFD <= (uint16_t) zExp
) {
3523 if ( ( 0xFD < zExp
)
3524 || ( ( zExp
== 0xFD )
3525 && ( (int32_t) ( zSig
+ roundIncrement
) < 0 ) )
3527 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
3528 return packFloat32( zSign
, 0xFF, - ( roundIncrement
== 0 ));
3531 if (status
->flush_to_zero
) {
3532 float_raise(float_flag_output_denormal
, status
);
3533 return packFloat32(zSign
, 0, 0);
3536 (status
->float_detect_tininess
3537 == float_tininess_before_rounding
)
3539 || ( zSig
+ roundIncrement
< 0x80000000 );
3540 shift32RightJamming( zSig
, - zExp
, &zSig
);
3542 roundBits
= zSig
& 0x7F;
3543 if (isTiny
&& roundBits
) {
3544 float_raise(float_flag_underflow
, status
);
3549 status
->float_exception_flags
|= float_flag_inexact
;
3551 zSig
= ( zSig
+ roundIncrement
)>>7;
3552 zSig
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
3553 if ( zSig
== 0 ) zExp
= 0;
3554 return packFloat32( zSign
, zExp
, zSig
);
3558 /*----------------------------------------------------------------------------
3559 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3560 | and significand `zSig', and returns the proper single-precision floating-
3561 | point value corresponding to the abstract input. This routine is just like
3562 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
3563 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
3564 | floating-point exponent.
3565 *----------------------------------------------------------------------------*/
3568 normalizeRoundAndPackFloat32(flag zSign
, int zExp
, uint32_t zSig
,
3569 float_status
*status
)
3573 shiftCount
= clz32(zSig
) - 1;
3574 return roundAndPackFloat32(zSign
, zExp
- shiftCount
, zSig
<<shiftCount
,
3579 /*----------------------------------------------------------------------------
3580 | If `a' is denormal and we are in flush-to-zero mode then set the
3581 | input-denormal exception and return zero. Otherwise just return the value.
3582 *----------------------------------------------------------------------------*/
3583 float64
float64_squash_input_denormal(float64 a
, float_status
*status
)
3585 if (status
->flush_inputs_to_zero
) {
3586 if (extractFloat64Exp(a
) == 0 && extractFloat64Frac(a
) != 0) {
3587 float_raise(float_flag_input_denormal
, status
);
3588 return make_float64(float64_val(a
) & (1ULL << 63));
3594 /*----------------------------------------------------------------------------
3595 | Normalizes the subnormal double-precision floating-point value represented
3596 | by the denormalized significand `aSig'. The normalized exponent and
3597 | significand are stored at the locations pointed to by `zExpPtr' and
3598 | `zSigPtr', respectively.
3599 *----------------------------------------------------------------------------*/
3602 normalizeFloat64Subnormal(uint64_t aSig
, int *zExpPtr
, uint64_t *zSigPtr
)
3606 shiftCount
= clz64(aSig
) - 11;
3607 *zSigPtr
= aSig
<<shiftCount
;
3608 *zExpPtr
= 1 - shiftCount
;
3612 /*----------------------------------------------------------------------------
3613 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3614 | double-precision floating-point value, returning the result. After being
3615 | shifted into the proper positions, the three fields are simply added
3616 | together to form the result. This means that any integer portion of `zSig'
3617 | will be added into the exponent. Since a properly normalized significand
3618 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3619 | than the desired result exponent whenever `zSig' is a complete, normalized
3621 *----------------------------------------------------------------------------*/
3623 static inline float64
packFloat64(flag zSign
, int zExp
, uint64_t zSig
)
3626 return make_float64(
3627 ( ( (uint64_t) zSign
)<<63 ) + ( ( (uint64_t) zExp
)<<52 ) + zSig
);
3631 /*----------------------------------------------------------------------------
3632 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3633 | and significand `zSig', and returns the proper double-precision floating-
3634 | point value corresponding to the abstract input. Ordinarily, the abstract
3635 | value is simply rounded and packed into the double-precision format, with
3636 | the inexact exception raised if the abstract input cannot be represented
3637 | exactly. However, if the abstract value is too large, the overflow and
3638 | inexact exceptions are raised and an infinity or maximal finite value is
3639 | returned. If the abstract value is too small, the input value is rounded to
3640 | a subnormal number, and the underflow and inexact exceptions are raised if
3641 | the abstract input cannot be represented exactly as a subnormal double-
3642 | precision floating-point number.
3643 | The input significand `zSig' has its binary point between bits 62
3644 | and 61, which is 10 bits to the left of the usual location. This shifted
3645 | significand must be normalized or smaller. If `zSig' is not normalized,
3646 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3647 | and it must not require rounding. In the usual case that `zSig' is
3648 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3649 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3650 | Binary Floating-Point Arithmetic.
3651 *----------------------------------------------------------------------------*/
3653 static float64
roundAndPackFloat64(flag zSign
, int zExp
, uint64_t zSig
,
3654 float_status
*status
)
3656 int8_t roundingMode
;
3657 flag roundNearestEven
;
3658 int roundIncrement
, roundBits
;
3661 roundingMode
= status
->float_rounding_mode
;
3662 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
3663 switch (roundingMode
) {
3664 case float_round_nearest_even
:
3665 case float_round_ties_away
:
3666 roundIncrement
= 0x200;
3668 case float_round_to_zero
:
3671 case float_round_up
:
3672 roundIncrement
= zSign
? 0 : 0x3ff;
3674 case float_round_down
:
3675 roundIncrement
= zSign
? 0x3ff : 0;
3677 case float_round_to_odd
:
3678 roundIncrement
= (zSig
& 0x400) ? 0 : 0x3ff;
3683 roundBits
= zSig
& 0x3FF;
3684 if ( 0x7FD <= (uint16_t) zExp
) {
3685 if ( ( 0x7FD < zExp
)
3686 || ( ( zExp
== 0x7FD )
3687 && ( (int64_t) ( zSig
+ roundIncrement
) < 0 ) )
3689 bool overflow_to_inf
= roundingMode
!= float_round_to_odd
&&
3690 roundIncrement
!= 0;
3691 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
3692 return packFloat64(zSign
, 0x7FF, -(!overflow_to_inf
));
3695 if (status
->flush_to_zero
) {
3696 float_raise(float_flag_output_denormal
, status
);
3697 return packFloat64(zSign
, 0, 0);
3700 (status
->float_detect_tininess
3701 == float_tininess_before_rounding
)
3703 || ( zSig
+ roundIncrement
< LIT64( 0x8000000000000000 ) );
3704 shift64RightJamming( zSig
, - zExp
, &zSig
);
3706 roundBits
= zSig
& 0x3FF;
3707 if (isTiny
&& roundBits
) {
3708 float_raise(float_flag_underflow
, status
);
3710 if (roundingMode
== float_round_to_odd
) {
3712 * For round-to-odd case, the roundIncrement depends on
3713 * zSig which just changed.
3715 roundIncrement
= (zSig
& 0x400) ? 0 : 0x3ff;
3720 status
->float_exception_flags
|= float_flag_inexact
;
3722 zSig
= ( zSig
+ roundIncrement
)>>10;
3723 zSig
&= ~ ( ( ( roundBits
^ 0x200 ) == 0 ) & roundNearestEven
);
3724 if ( zSig
== 0 ) zExp
= 0;
3725 return packFloat64( zSign
, zExp
, zSig
);
3729 /*----------------------------------------------------------------------------
3730 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3731 | and significand `zSig', and returns the proper double-precision floating-
3732 | point value corresponding to the abstract input. This routine is just like
3733 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
3734 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
3735 | floating-point exponent.
3736 *----------------------------------------------------------------------------*/
3739 normalizeRoundAndPackFloat64(flag zSign
, int zExp
, uint64_t zSig
,
3740 float_status
*status
)
3744 shiftCount
= clz64(zSig
) - 1;
3745 return roundAndPackFloat64(zSign
, zExp
- shiftCount
, zSig
<<shiftCount
,
3750 /*----------------------------------------------------------------------------
3751 | Normalizes the subnormal extended double-precision floating-point value
3752 | represented by the denormalized significand `aSig'. The normalized exponent
3753 | and significand are stored at the locations pointed to by `zExpPtr' and
3754 | `zSigPtr', respectively.
3755 *----------------------------------------------------------------------------*/
3757 void normalizeFloatx80Subnormal(uint64_t aSig
, int32_t *zExpPtr
,
3762 shiftCount
= clz64(aSig
);
3763 *zSigPtr
= aSig
<<shiftCount
;
3764 *zExpPtr
= 1 - shiftCount
;
3767 /*----------------------------------------------------------------------------
3768 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3769 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
3770 | and returns the proper extended double-precision floating-point value
3771 | corresponding to the abstract input. Ordinarily, the abstract value is
3772 | rounded and packed into the extended double-precision format, with the
3773 | inexact exception raised if the abstract input cannot be represented
3774 | exactly. However, if the abstract value is too large, the overflow and
3775 | inexact exceptions are raised and an infinity or maximal finite value is
3776 | returned. If the abstract value is too small, the input value is rounded to
3777 | a subnormal number, and the underflow and inexact exceptions are raised if
3778 | the abstract input cannot be represented exactly as a subnormal extended
3779 | double-precision floating-point number.
3780 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
3781 | number of bits as single or double precision, respectively. Otherwise, the
3782 | result is rounded to the full precision of the extended double-precision
3784 | The input significand must be normalized or smaller. If the input
3785 | significand is not normalized, `zExp' must be 0; in that case, the result
3786 | returned is a subnormal number, and it must not require rounding. The
3787 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
3788 | Floating-Point Arithmetic.
3789 *----------------------------------------------------------------------------*/
3791 floatx80
roundAndPackFloatx80(int8_t roundingPrecision
, flag zSign
,
3792 int32_t zExp
, uint64_t zSig0
, uint64_t zSig1
,
3793 float_status
*status
)
3795 int8_t roundingMode
;
3796 flag roundNearestEven
, increment
, isTiny
;
3797 int64_t roundIncrement
, roundMask
, roundBits
;
3799 roundingMode
= status
->float_rounding_mode
;
3800 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
3801 if ( roundingPrecision
== 80 ) goto precision80
;
3802 if ( roundingPrecision
== 64 ) {
3803 roundIncrement
= LIT64( 0x0000000000000400 );
3804 roundMask
= LIT64( 0x00000000000007FF );
3806 else if ( roundingPrecision
== 32 ) {
3807 roundIncrement
= LIT64( 0x0000008000000000 );
3808 roundMask
= LIT64( 0x000000FFFFFFFFFF );
3813 zSig0
|= ( zSig1
!= 0 );
3814 switch (roundingMode
) {
3815 case float_round_nearest_even
:
3816 case float_round_ties_away
:
3818 case float_round_to_zero
:
3821 case float_round_up
:
3822 roundIncrement
= zSign
? 0 : roundMask
;
3824 case float_round_down
:
3825 roundIncrement
= zSign
? roundMask
: 0;
3830 roundBits
= zSig0
& roundMask
;
3831 if ( 0x7FFD <= (uint32_t) ( zExp
- 1 ) ) {
3832 if ( ( 0x7FFE < zExp
)
3833 || ( ( zExp
== 0x7FFE ) && ( zSig0
+ roundIncrement
< zSig0
) )
3838 if (status
->flush_to_zero
) {
3839 float_raise(float_flag_output_denormal
, status
);
3840 return packFloatx80(zSign
, 0, 0);
3843 (status
->float_detect_tininess
3844 == float_tininess_before_rounding
)
3846 || ( zSig0
<= zSig0
+ roundIncrement
);
3847 shift64RightJamming( zSig0
, 1 - zExp
, &zSig0
);
3849 roundBits
= zSig0
& roundMask
;
3850 if (isTiny
&& roundBits
) {
3851 float_raise(float_flag_underflow
, status
);
3854 status
->float_exception_flags
|= float_flag_inexact
;
3856 zSig0
+= roundIncrement
;
3857 if ( (int64_t) zSig0
< 0 ) zExp
= 1;
3858 roundIncrement
= roundMask
+ 1;
3859 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
3860 roundMask
|= roundIncrement
;
3862 zSig0
&= ~ roundMask
;
3863 return packFloatx80( zSign
, zExp
, zSig0
);
3867 status
->float_exception_flags
|= float_flag_inexact
;
3869 zSig0
+= roundIncrement
;
3870 if ( zSig0
< roundIncrement
) {
3872 zSig0
= LIT64( 0x8000000000000000 );
3874 roundIncrement
= roundMask
+ 1;
3875 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
3876 roundMask
|= roundIncrement
;
3878 zSig0
&= ~ roundMask
;
3879 if ( zSig0
== 0 ) zExp
= 0;
3880 return packFloatx80( zSign
, zExp
, zSig0
);
3882 switch (roundingMode
) {
3883 case float_round_nearest_even
:
3884 case float_round_ties_away
:
3885 increment
= ((int64_t)zSig1
< 0);
3887 case float_round_to_zero
:
3890 case float_round_up
:
3891 increment
= !zSign
&& zSig1
;
3893 case float_round_down
:
3894 increment
= zSign
&& zSig1
;
3899 if ( 0x7FFD <= (uint32_t) ( zExp
- 1 ) ) {
3900 if ( ( 0x7FFE < zExp
)
3901 || ( ( zExp
== 0x7FFE )
3902 && ( zSig0
== LIT64( 0xFFFFFFFFFFFFFFFF ) )
3908 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
3909 if ( ( roundingMode
== float_round_to_zero
)
3910 || ( zSign
&& ( roundingMode
== float_round_up
) )
3911 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
3913 return packFloatx80( zSign
, 0x7FFE, ~ roundMask
);
3915 return packFloatx80(zSign
,
3916 floatx80_infinity_high
,
3917 floatx80_infinity_low
);
3921 (status
->float_detect_tininess
3922 == float_tininess_before_rounding
)
3925 || ( zSig0
< LIT64( 0xFFFFFFFFFFFFFFFF ) );
3926 shift64ExtraRightJamming( zSig0
, zSig1
, 1 - zExp
, &zSig0
, &zSig1
);
3928 if (isTiny
&& zSig1
) {
3929 float_raise(float_flag_underflow
, status
);
3932 status
->float_exception_flags
|= float_flag_inexact
;
3934 switch (roundingMode
) {
3935 case float_round_nearest_even
:
3936 case float_round_ties_away
:
3937 increment
= ((int64_t)zSig1
< 0);
3939 case float_round_to_zero
:
3942 case float_round_up
:
3943 increment
= !zSign
&& zSig1
;
3945 case float_round_down
:
3946 increment
= zSign
&& zSig1
;
3954 ~ ( ( (uint64_t) ( zSig1
<<1 ) == 0 ) & roundNearestEven
);
3955 if ( (int64_t) zSig0
< 0 ) zExp
= 1;
3957 return packFloatx80( zSign
, zExp
, zSig0
);
3961 status
->float_exception_flags
|= float_flag_inexact
;
3967 zSig0
= LIT64( 0x8000000000000000 );
3970 zSig0
&= ~ ( ( (uint64_t) ( zSig1
<<1 ) == 0 ) & roundNearestEven
);
3974 if ( zSig0
== 0 ) zExp
= 0;
3976 return packFloatx80( zSign
, zExp
, zSig0
);
3980 /*----------------------------------------------------------------------------
3981 | Takes an abstract floating-point value having sign `zSign', exponent
3982 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
3983 | and returns the proper extended double-precision floating-point value
3984 | corresponding to the abstract input. This routine is just like
3985 | `roundAndPackFloatx80' except that the input significand does not have to be
3987 *----------------------------------------------------------------------------*/
3989 floatx80
normalizeRoundAndPackFloatx80(int8_t roundingPrecision
,
3990 flag zSign
, int32_t zExp
,
3991 uint64_t zSig0
, uint64_t zSig1
,
3992 float_status
*status
)
4001 shiftCount
= clz64(zSig0
);
4002 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
4004 return roundAndPackFloatx80(roundingPrecision
, zSign
, zExp
,
4005 zSig0
, zSig1
, status
);
4009 /*----------------------------------------------------------------------------
4010 | Returns the least-significant 64 fraction bits of the quadruple-precision
4011 | floating-point value `a'.
4012 *----------------------------------------------------------------------------*/
4014 static inline uint64_t extractFloat128Frac1( float128 a
)
4021 /*----------------------------------------------------------------------------
4022 | Returns the most-significant 48 fraction bits of the quadruple-precision
4023 | floating-point value `a'.
4024 *----------------------------------------------------------------------------*/
4026 static inline uint64_t extractFloat128Frac0( float128 a
)
4029 return a
.high
& LIT64( 0x0000FFFFFFFFFFFF );
4033 /*----------------------------------------------------------------------------
4034 | Returns the exponent bits of the quadruple-precision floating-point value
4036 *----------------------------------------------------------------------------*/
4038 static inline int32_t extractFloat128Exp( float128 a
)
4041 return ( a
.high
>>48 ) & 0x7FFF;
4045 /*----------------------------------------------------------------------------
4046 | Returns the sign bit of the quadruple-precision floating-point value `a'.
4047 *----------------------------------------------------------------------------*/
4049 static inline flag
extractFloat128Sign( float128 a
)
4056 /*----------------------------------------------------------------------------
4057 | Normalizes the subnormal quadruple-precision floating-point value
4058 | represented by the denormalized significand formed by the concatenation of
4059 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
4060 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
4061 | significand are stored at the location pointed to by `zSig0Ptr', and the
4062 | least significant 64 bits of the normalized significand are stored at the
4063 | location pointed to by `zSig1Ptr'.
4064 *----------------------------------------------------------------------------*/
4067 normalizeFloat128Subnormal(
4078 shiftCount
= clz64(aSig1
) - 15;
4079 if ( shiftCount
< 0 ) {
4080 *zSig0Ptr
= aSig1
>>( - shiftCount
);
4081 *zSig1Ptr
= aSig1
<<( shiftCount
& 63 );
4084 *zSig0Ptr
= aSig1
<<shiftCount
;
4087 *zExpPtr
= - shiftCount
- 63;
4090 shiftCount
= clz64(aSig0
) - 15;
4091 shortShift128Left( aSig0
, aSig1
, shiftCount
, zSig0Ptr
, zSig1Ptr
);
4092 *zExpPtr
= 1 - shiftCount
;
4097 /*----------------------------------------------------------------------------
4098 | Packs the sign `zSign', the exponent `zExp', and the significand formed
4099 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
4100 | floating-point value, returning the result. After being shifted into the
4101 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
4102 | added together to form the most significant 32 bits of the result. This
4103 | means that any integer portion of `zSig0' will be added into the exponent.
4104 | Since a properly normalized significand will have an integer portion equal
4105 | to 1, the `zExp' input should be 1 less than the desired result exponent
4106 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
4108 *----------------------------------------------------------------------------*/
4110 static inline float128
4111 packFloat128( flag zSign
, int32_t zExp
, uint64_t zSig0
, uint64_t zSig1
)
4116 z
.high
= ( ( (uint64_t) zSign
)<<63 ) + ( ( (uint64_t) zExp
)<<48 ) + zSig0
;
4121 /*----------------------------------------------------------------------------
4122 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4123 | and extended significand formed by the concatenation of `zSig0', `zSig1',
4124 | and `zSig2', and returns the proper quadruple-precision floating-point value
4125 | corresponding to the abstract input. Ordinarily, the abstract value is
4126 | simply rounded and packed into the quadruple-precision format, with the
4127 | inexact exception raised if the abstract input cannot be represented
4128 | exactly. However, if the abstract value is too large, the overflow and
4129 | inexact exceptions are raised and an infinity or maximal finite value is
4130 | returned. If the abstract value is too small, the input value is rounded to
4131 | a subnormal number, and the underflow and inexact exceptions are raised if
4132 | the abstract input cannot be represented exactly as a subnormal quadruple-
4133 | precision floating-point number.
4134 | The input significand must be normalized or smaller. If the input
4135 | significand is not normalized, `zExp' must be 0; in that case, the result
4136 | returned is a subnormal number, and it must not require rounding. In the
4137 | usual case that the input significand is normalized, `zExp' must be 1 less
4138 | than the ``true'' floating-point exponent. The handling of underflow and
4139 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4140 *----------------------------------------------------------------------------*/
4142 static float128
roundAndPackFloat128(flag zSign
, int32_t zExp
,
4143 uint64_t zSig0
, uint64_t zSig1
,
4144 uint64_t zSig2
, float_status
*status
)
4146 int8_t roundingMode
;
4147 flag roundNearestEven
, increment
, isTiny
;
4149 roundingMode
= status
->float_rounding_mode
;
4150 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
4151 switch (roundingMode
) {
4152 case float_round_nearest_even
:
4153 case float_round_ties_away
:
4154 increment
= ((int64_t)zSig2
< 0);
4156 case float_round_to_zero
:
4159 case float_round_up
:
4160 increment
= !zSign
&& zSig2
;
4162 case float_round_down
:
4163 increment
= zSign
&& zSig2
;
4165 case float_round_to_odd
:
4166 increment
= !(zSig1
& 0x1) && zSig2
;
4171 if ( 0x7FFD <= (uint32_t) zExp
) {
4172 if ( ( 0x7FFD < zExp
)
4173 || ( ( zExp
== 0x7FFD )
4175 LIT64( 0x0001FFFFFFFFFFFF ),
4176 LIT64( 0xFFFFFFFFFFFFFFFF ),
4183 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
4184 if ( ( roundingMode
== float_round_to_zero
)
4185 || ( zSign
&& ( roundingMode
== float_round_up
) )
4186 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
4187 || (roundingMode
== float_round_to_odd
)
4193 LIT64( 0x0000FFFFFFFFFFFF ),
4194 LIT64( 0xFFFFFFFFFFFFFFFF )
4197 return packFloat128( zSign
, 0x7FFF, 0, 0 );
4200 if (status
->flush_to_zero
) {
4201 float_raise(float_flag_output_denormal
, status
);
4202 return packFloat128(zSign
, 0, 0, 0);
4205 (status
->float_detect_tininess
4206 == float_tininess_before_rounding
)
4212 LIT64( 0x0001FFFFFFFFFFFF ),
4213 LIT64( 0xFFFFFFFFFFFFFFFF )
4215 shift128ExtraRightJamming(
4216 zSig0
, zSig1
, zSig2
, - zExp
, &zSig0
, &zSig1
, &zSig2
);
4218 if (isTiny
&& zSig2
) {
4219 float_raise(float_flag_underflow
, status
);
4221 switch (roundingMode
) {
4222 case float_round_nearest_even
:
4223 case float_round_ties_away
:
4224 increment
= ((int64_t)zSig2
< 0);
4226 case float_round_to_zero
:
4229 case float_round_up
:
4230 increment
= !zSign
&& zSig2
;
4232 case float_round_down
:
4233 increment
= zSign
&& zSig2
;
4235 case float_round_to_odd
:
4236 increment
= !(zSig1
& 0x1) && zSig2
;
4244 status
->float_exception_flags
|= float_flag_inexact
;
4247 add128( zSig0
, zSig1
, 0, 1, &zSig0
, &zSig1
);
4248 zSig1
&= ~ ( ( zSig2
+ zSig2
== 0 ) & roundNearestEven
);
4251 if ( ( zSig0
| zSig1
) == 0 ) zExp
= 0;
4253 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
4257 /*----------------------------------------------------------------------------
4258 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4259 | and significand formed by the concatenation of `zSig0' and `zSig1', and
4260 | returns the proper quadruple-precision floating-point value corresponding
4261 | to the abstract input. This routine is just like `roundAndPackFloat128'
4262 | except that the input significand has fewer bits and does not have to be
4263 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
4265 *----------------------------------------------------------------------------*/
4267 static float128
normalizeRoundAndPackFloat128(flag zSign
, int32_t zExp
,
4268 uint64_t zSig0
, uint64_t zSig1
,
4269 float_status
*status
)
4279 shiftCount
= clz64(zSig0
) - 15;
4280 if ( 0 <= shiftCount
) {
4282 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
4285 shift128ExtraRightJamming(
4286 zSig0
, zSig1
, 0, - shiftCount
, &zSig0
, &zSig1
, &zSig2
);
4289 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
4294 /*----------------------------------------------------------------------------
4295 | Returns the result of converting the 32-bit two's complement integer `a'
4296 | to the extended double-precision floating-point format. The conversion
4297 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4299 *----------------------------------------------------------------------------*/
4301 floatx80
int32_to_floatx80(int32_t a
, float_status
*status
)
4308 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
4310 absA
= zSign
? - a
: a
;
4311 shiftCount
= clz32(absA
) + 32;
4313 return packFloatx80( zSign
, 0x403E - shiftCount
, zSig
<<shiftCount
);
4317 /*----------------------------------------------------------------------------
4318 | Returns the result of converting the 32-bit two's complement integer `a' to
4319 | the quadruple-precision floating-point format. The conversion is performed
4320 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4321 *----------------------------------------------------------------------------*/
4323 float128
int32_to_float128(int32_t a
, float_status
*status
)
4330 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
4332 absA
= zSign
? - a
: a
;
4333 shiftCount
= clz32(absA
) + 17;
4335 return packFloat128( zSign
, 0x402E - shiftCount
, zSig0
<<shiftCount
, 0 );
4339 /*----------------------------------------------------------------------------
4340 | Returns the result of converting the 64-bit two's complement integer `a'
4341 | to the extended double-precision floating-point format. The conversion
4342 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4344 *----------------------------------------------------------------------------*/
4346 floatx80
int64_to_floatx80(int64_t a
, float_status
*status
)
4352 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
4354 absA
= zSign
? - a
: a
;
4355 shiftCount
= clz64(absA
);
4356 return packFloatx80( zSign
, 0x403E - shiftCount
, absA
<<shiftCount
);
4360 /*----------------------------------------------------------------------------
4361 | Returns the result of converting the 64-bit two's complement integer `a' to
4362 | the quadruple-precision floating-point format. The conversion is performed
4363 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4364 *----------------------------------------------------------------------------*/
4366 float128
int64_to_float128(int64_t a
, float_status
*status
)
4372 uint64_t zSig0
, zSig1
;
4374 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
4376 absA
= zSign
? - a
: a
;
4377 shiftCount
= clz64(absA
) + 49;
4378 zExp
= 0x406E - shiftCount
;
4379 if ( 64 <= shiftCount
) {
4388 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
4389 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
4393 /*----------------------------------------------------------------------------
4394 | Returns the result of converting the 64-bit unsigned integer `a'
4395 | to the quadruple-precision floating-point format. The conversion is performed
4396 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4397 *----------------------------------------------------------------------------*/
4399 float128
uint64_to_float128(uint64_t a
, float_status
*status
)
4402 return float128_zero
;
4404 return normalizeRoundAndPackFloat128(0, 0x406E, 0, a
, status
);
4407 /*----------------------------------------------------------------------------
4408 | Returns the result of converting the single-precision floating-point value
4409 | `a' to the extended double-precision floating-point format. The conversion
4410 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4412 *----------------------------------------------------------------------------*/
4414 floatx80
float32_to_floatx80(float32 a
, float_status
*status
)
4420 a
= float32_squash_input_denormal(a
, status
);
4421 aSig
= extractFloat32Frac( a
);
4422 aExp
= extractFloat32Exp( a
);
4423 aSign
= extractFloat32Sign( a
);
4424 if ( aExp
== 0xFF ) {
4426 return commonNaNToFloatx80(float32ToCommonNaN(a
, status
), status
);
4428 return packFloatx80(aSign
,
4429 floatx80_infinity_high
,
4430 floatx80_infinity_low
);
4433 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
4434 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
4437 return packFloatx80( aSign
, aExp
+ 0x3F80, ( (uint64_t) aSig
)<<40 );
4441 /*----------------------------------------------------------------------------
4442 | Returns the result of converting the single-precision floating-point value
4443 | `a' to the double-precision floating-point format. The conversion is
4444 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4446 *----------------------------------------------------------------------------*/
4448 float128
float32_to_float128(float32 a
, float_status
*status
)
4454 a
= float32_squash_input_denormal(a
, status
);
4455 aSig
= extractFloat32Frac( a
);
4456 aExp
= extractFloat32Exp( a
);
4457 aSign
= extractFloat32Sign( a
);
4458 if ( aExp
== 0xFF ) {
4460 return commonNaNToFloat128(float32ToCommonNaN(a
, status
), status
);
4462 return packFloat128( aSign
, 0x7FFF, 0, 0 );
4465 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
4466 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
4469 return packFloat128( aSign
, aExp
+ 0x3F80, ( (uint64_t) aSig
)<<25, 0 );
4473 /*----------------------------------------------------------------------------
4474 | Returns the remainder of the single-precision floating-point value `a'
4475 | with respect to the corresponding value `b'. The operation is performed
4476 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4477 *----------------------------------------------------------------------------*/
4479 float32
float32_rem(float32 a
, float32 b
, float_status
*status
)
4482 int aExp
, bExp
, expDiff
;
4483 uint32_t aSig
, bSig
;
4485 uint64_t aSig64
, bSig64
, q64
;
4486 uint32_t alternateASig
;
4488 a
= float32_squash_input_denormal(a
, status
);
4489 b
= float32_squash_input_denormal(b
, status
);
4491 aSig
= extractFloat32Frac( a
);
4492 aExp
= extractFloat32Exp( a
);
4493 aSign
= extractFloat32Sign( a
);
4494 bSig
= extractFloat32Frac( b
);
4495 bExp
= extractFloat32Exp( b
);
4496 if ( aExp
== 0xFF ) {
4497 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
4498 return propagateFloat32NaN(a
, b
, status
);
4500 float_raise(float_flag_invalid
, status
);
4501 return float32_default_nan(status
);
4503 if ( bExp
== 0xFF ) {
4505 return propagateFloat32NaN(a
, b
, status
);
4511 float_raise(float_flag_invalid
, status
);
4512 return float32_default_nan(status
);
4514 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
4517 if ( aSig
== 0 ) return a
;
4518 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
4520 expDiff
= aExp
- bExp
;
4523 if ( expDiff
< 32 ) {
4526 if ( expDiff
< 0 ) {
4527 if ( expDiff
< -1 ) return a
;
4530 q
= ( bSig
<= aSig
);
4531 if ( q
) aSig
-= bSig
;
4532 if ( 0 < expDiff
) {
4533 q
= ( ( (uint64_t) aSig
)<<32 ) / bSig
;
4536 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
4544 if ( bSig
<= aSig
) aSig
-= bSig
;
4545 aSig64
= ( (uint64_t) aSig
)<<40;
4546 bSig64
= ( (uint64_t) bSig
)<<40;
4548 while ( 0 < expDiff
) {
4549 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
4550 q64
= ( 2 < q64
) ? q64
- 2 : 0;
4551 aSig64
= - ( ( bSig
* q64
)<<38 );
4555 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
4556 q64
= ( 2 < q64
) ? q64
- 2 : 0;
4557 q
= q64
>>( 64 - expDiff
);
4559 aSig
= ( ( aSig64
>>33 )<<( expDiff
- 1 ) ) - bSig
* q
;
4562 alternateASig
= aSig
;
4565 } while ( 0 <= (int32_t) aSig
);
4566 sigMean
= aSig
+ alternateASig
;
4567 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
4568 aSig
= alternateASig
;
4570 zSign
= ( (int32_t) aSig
< 0 );
4571 if ( zSign
) aSig
= - aSig
;
4572 return normalizeRoundAndPackFloat32(aSign
^ zSign
, bExp
, aSig
, status
);
4577 /*----------------------------------------------------------------------------
4578 | Returns the binary exponential of the single-precision floating-point value
4579 | `a'. The operation is performed according to the IEC/IEEE Standard for
4580 | Binary Floating-Point Arithmetic.
4582 | Uses the following identities:
4584 | 1. -------------------------------------------------------------------------
4588 | 2. -------------------------------------------------------------------------
4591 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
4593 *----------------------------------------------------------------------------*/
4595 static const float64 float32_exp2_coefficients
[15] =
4597 const_float64( 0x3ff0000000000000ll
), /* 1 */
4598 const_float64( 0x3fe0000000000000ll
), /* 2 */
4599 const_float64( 0x3fc5555555555555ll
), /* 3 */
4600 const_float64( 0x3fa5555555555555ll
), /* 4 */
4601 const_float64( 0x3f81111111111111ll
), /* 5 */
4602 const_float64( 0x3f56c16c16c16c17ll
), /* 6 */
4603 const_float64( 0x3f2a01a01a01a01all
), /* 7 */
4604 const_float64( 0x3efa01a01a01a01all
), /* 8 */
4605 const_float64( 0x3ec71de3a556c734ll
), /* 9 */
4606 const_float64( 0x3e927e4fb7789f5cll
), /* 10 */
4607 const_float64( 0x3e5ae64567f544e4ll
), /* 11 */
4608 const_float64( 0x3e21eed8eff8d898ll
), /* 12 */
4609 const_float64( 0x3de6124613a86d09ll
), /* 13 */
4610 const_float64( 0x3da93974a8c07c9dll
), /* 14 */
4611 const_float64( 0x3d6ae7f3e733b81fll
), /* 15 */
4614 float32
float32_exp2(float32 a
, float_status
*status
)
4621 a
= float32_squash_input_denormal(a
, status
);
4623 aSig
= extractFloat32Frac( a
);
4624 aExp
= extractFloat32Exp( a
);
4625 aSign
= extractFloat32Sign( a
);
4627 if ( aExp
== 0xFF) {
4629 return propagateFloat32NaN(a
, float32_zero
, status
);
4631 return (aSign
) ? float32_zero
: a
;
4634 if (aSig
== 0) return float32_one
;
4637 float_raise(float_flag_inexact
, status
);
4639 /* ******************************* */
4640 /* using float64 for approximation */
4641 /* ******************************* */
4642 x
= float32_to_float64(a
, status
);
4643 x
= float64_mul(x
, float64_ln2
, status
);
4647 for (i
= 0 ; i
< 15 ; i
++) {
4650 f
= float64_mul(xn
, float32_exp2_coefficients
[i
], status
);
4651 r
= float64_add(r
, f
, status
);
4653 xn
= float64_mul(xn
, x
, status
);
4656 return float64_to_float32(r
, status
);
4659 /*----------------------------------------------------------------------------
4660 | Returns the binary log of the single-precision floating-point value `a'.
4661 | The operation is performed according to the IEC/IEEE Standard for Binary
4662 | Floating-Point Arithmetic.
4663 *----------------------------------------------------------------------------*/
4664 float32
float32_log2(float32 a
, float_status
*status
)
4668 uint32_t aSig
, zSig
, i
;
4670 a
= float32_squash_input_denormal(a
, status
);
4671 aSig
= extractFloat32Frac( a
);
4672 aExp
= extractFloat32Exp( a
);
4673 aSign
= extractFloat32Sign( a
);
4676 if ( aSig
== 0 ) return packFloat32( 1, 0xFF, 0 );
4677 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
4680 float_raise(float_flag_invalid
, status
);
4681 return float32_default_nan(status
);
4683 if ( aExp
== 0xFF ) {
4685 return propagateFloat32NaN(a
, float32_zero
, status
);
4695 for (i
= 1 << 22; i
> 0; i
>>= 1) {
4696 aSig
= ( (uint64_t)aSig
* aSig
) >> 23;
4697 if ( aSig
& 0x01000000 ) {
4706 return normalizeRoundAndPackFloat32(zSign
, 0x85, zSig
, status
);
4709 /*----------------------------------------------------------------------------
4710 | Returns 1 if the single-precision floating-point value `a' is equal to
4711 | the corresponding value `b', and 0 otherwise. The invalid exception is
4712 | raised if either operand is a NaN. Otherwise, the comparison is performed
4713 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4714 *----------------------------------------------------------------------------*/
4716 int float32_eq(float32 a
, float32 b
, float_status
*status
)
4719 a
= float32_squash_input_denormal(a
, status
);
4720 b
= float32_squash_input_denormal(b
, status
);
4722 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
4723 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
4725 float_raise(float_flag_invalid
, status
);
4728 av
= float32_val(a
);
4729 bv
= float32_val(b
);
4730 return ( av
== bv
) || ( (uint32_t) ( ( av
| bv
)<<1 ) == 0 );
4733 /*----------------------------------------------------------------------------
4734 | Returns 1 if the single-precision floating-point value `a' is less than
4735 | or equal to the corresponding value `b', and 0 otherwise. The invalid
4736 | exception is raised if either operand is a NaN. The comparison is performed
4737 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4738 *----------------------------------------------------------------------------*/
4740 int float32_le(float32 a
, float32 b
, float_status
*status
)
4744 a
= float32_squash_input_denormal(a
, status
);
4745 b
= float32_squash_input_denormal(b
, status
);
4747 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
4748 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
4750 float_raise(float_flag_invalid
, status
);
4753 aSign
= extractFloat32Sign( a
);
4754 bSign
= extractFloat32Sign( b
);
4755 av
= float32_val(a
);
4756 bv
= float32_val(b
);
4757 if ( aSign
!= bSign
) return aSign
|| ( (uint32_t) ( ( av
| bv
)<<1 ) == 0 );
4758 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
4762 /*----------------------------------------------------------------------------
4763 | Returns 1 if the single-precision floating-point value `a' is less than
4764 | the corresponding value `b', and 0 otherwise. The invalid exception is
4765 | raised if either operand is a NaN. The comparison is performed according
4766 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4767 *----------------------------------------------------------------------------*/
4769 int float32_lt(float32 a
, float32 b
, float_status
*status
)
4773 a
= float32_squash_input_denormal(a
, status
);
4774 b
= float32_squash_input_denormal(b
, status
);
4776 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
4777 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
4779 float_raise(float_flag_invalid
, status
);
4782 aSign
= extractFloat32Sign( a
);
4783 bSign
= extractFloat32Sign( b
);
4784 av
= float32_val(a
);
4785 bv
= float32_val(b
);
4786 if ( aSign
!= bSign
) return aSign
&& ( (uint32_t) ( ( av
| bv
)<<1 ) != 0 );
4787 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
4791 /*----------------------------------------------------------------------------
4792 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
4793 | be compared, and 0 otherwise. The invalid exception is raised if either
4794 | operand is a NaN. The comparison is performed according to the IEC/IEEE
4795 | Standard for Binary Floating-Point Arithmetic.
4796 *----------------------------------------------------------------------------*/
4798 int float32_unordered(float32 a
, float32 b
, float_status
*status
)
4800 a
= float32_squash_input_denormal(a
, status
);
4801 b
= float32_squash_input_denormal(b
, status
);
4803 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
4804 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
4806 float_raise(float_flag_invalid
, status
);
4812 /*----------------------------------------------------------------------------
4813 | Returns 1 if the single-precision floating-point value `a' is equal to
4814 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4815 | exception. The comparison is performed according to the IEC/IEEE Standard
4816 | for Binary Floating-Point Arithmetic.
4817 *----------------------------------------------------------------------------*/
4819 int float32_eq_quiet(float32 a
, float32 b
, float_status
*status
)
4821 a
= float32_squash_input_denormal(a
, status
);
4822 b
= float32_squash_input_denormal(b
, status
);
4824 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
4825 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
4827 if (float32_is_signaling_nan(a
, status
)
4828 || float32_is_signaling_nan(b
, status
)) {
4829 float_raise(float_flag_invalid
, status
);
4833 return ( float32_val(a
) == float32_val(b
) ) ||
4834 ( (uint32_t) ( ( float32_val(a
) | float32_val(b
) )<<1 ) == 0 );
4837 /*----------------------------------------------------------------------------
4838 | Returns 1 if the single-precision floating-point value `a' is less than or
4839 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4840 | cause an exception. Otherwise, the comparison is performed according to the
4841 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4842 *----------------------------------------------------------------------------*/
4844 int float32_le_quiet(float32 a
, float32 b
, float_status
*status
)
4848 a
= float32_squash_input_denormal(a
, status
);
4849 b
= float32_squash_input_denormal(b
, status
);
4851 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
4852 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
4854 if (float32_is_signaling_nan(a
, status
)
4855 || float32_is_signaling_nan(b
, status
)) {
4856 float_raise(float_flag_invalid
, status
);
4860 aSign
= extractFloat32Sign( a
);
4861 bSign
= extractFloat32Sign( b
);
4862 av
= float32_val(a
);
4863 bv
= float32_val(b
);
4864 if ( aSign
!= bSign
) return aSign
|| ( (uint32_t) ( ( av
| bv
)<<1 ) == 0 );
4865 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
4869 /*----------------------------------------------------------------------------
4870 | Returns 1 if the single-precision floating-point value `a' is less than
4871 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4872 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
4873 | Standard for Binary Floating-Point Arithmetic.
4874 *----------------------------------------------------------------------------*/
4876 int float32_lt_quiet(float32 a
, float32 b
, float_status
*status
)
4880 a
= float32_squash_input_denormal(a
, status
);
4881 b
= float32_squash_input_denormal(b
, status
);
4883 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
4884 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
4886 if (float32_is_signaling_nan(a
, status
)
4887 || float32_is_signaling_nan(b
, status
)) {
4888 float_raise(float_flag_invalid
, status
);
4892 aSign
= extractFloat32Sign( a
);
4893 bSign
= extractFloat32Sign( b
);
4894 av
= float32_val(a
);
4895 bv
= float32_val(b
);
4896 if ( aSign
!= bSign
) return aSign
&& ( (uint32_t) ( ( av
| bv
)<<1 ) != 0 );
4897 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
4901 /*----------------------------------------------------------------------------
4902 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
4903 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
4904 | comparison is performed according to the IEC/IEEE Standard for Binary
4905 | Floating-Point Arithmetic.
4906 *----------------------------------------------------------------------------*/
4908 int float32_unordered_quiet(float32 a
, float32 b
, float_status
*status
)
4910 a
= float32_squash_input_denormal(a
, status
);
4911 b
= float32_squash_input_denormal(b
, status
);
4913 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
4914 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
4916 if (float32_is_signaling_nan(a
, status
)
4917 || float32_is_signaling_nan(b
, status
)) {
4918 float_raise(float_flag_invalid
, status
);
4925 /*----------------------------------------------------------------------------
4926 | If `a' is denormal and we are in flush-to-zero mode then set the
4927 | input-denormal exception and return zero. Otherwise just return the value.
4928 *----------------------------------------------------------------------------*/
4929 float16
float16_squash_input_denormal(float16 a
, float_status
*status
)
4931 if (status
->flush_inputs_to_zero
) {
4932 if (extractFloat16Exp(a
) == 0 && extractFloat16Frac(a
) != 0) {
4933 float_raise(float_flag_input_denormal
, status
);
4934 return make_float16(float16_val(a
) & 0x8000);
4940 /*----------------------------------------------------------------------------
4941 | Returns the result of converting the double-precision floating-point value
4942 | `a' to the extended double-precision floating-point format. The conversion
4943 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4945 *----------------------------------------------------------------------------*/
4947 floatx80
float64_to_floatx80(float64 a
, float_status
*status
)
4953 a
= float64_squash_input_denormal(a
, status
);
4954 aSig
= extractFloat64Frac( a
);
4955 aExp
= extractFloat64Exp( a
);
4956 aSign
= extractFloat64Sign( a
);
4957 if ( aExp
== 0x7FF ) {
4959 return commonNaNToFloatx80(float64ToCommonNaN(a
, status
), status
);
4961 return packFloatx80(aSign
,
4962 floatx80_infinity_high
,
4963 floatx80_infinity_low
);
4966 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
4967 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
4971 aSign
, aExp
+ 0x3C00, ( aSig
| LIT64( 0x0010000000000000 ) )<<11 );
4975 /*----------------------------------------------------------------------------
4976 | Returns the result of converting the double-precision floating-point value
4977 | `a' to the quadruple-precision floating-point format. The conversion is
4978 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4980 *----------------------------------------------------------------------------*/
4982 float128
float64_to_float128(float64 a
, float_status
*status
)
4986 uint64_t aSig
, zSig0
, zSig1
;
4988 a
= float64_squash_input_denormal(a
, status
);
4989 aSig
= extractFloat64Frac( a
);
4990 aExp
= extractFloat64Exp( a
);
4991 aSign
= extractFloat64Sign( a
);
4992 if ( aExp
== 0x7FF ) {
4994 return commonNaNToFloat128(float64ToCommonNaN(a
, status
), status
);
4996 return packFloat128( aSign
, 0x7FFF, 0, 0 );
4999 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
5000 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
5003 shift128Right( aSig
, 0, 4, &zSig0
, &zSig1
);
5004 return packFloat128( aSign
, aExp
+ 0x3C00, zSig0
, zSig1
);
5009 /*----------------------------------------------------------------------------
5010 | Returns the remainder of the double-precision floating-point value `a'
5011 | with respect to the corresponding value `b'. The operation is performed
5012 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5013 *----------------------------------------------------------------------------*/
5015 float64
float64_rem(float64 a
, float64 b
, float_status
*status
)
5018 int aExp
, bExp
, expDiff
;
5019 uint64_t aSig
, bSig
;
5020 uint64_t q
, alternateASig
;
5023 a
= float64_squash_input_denormal(a
, status
);
5024 b
= float64_squash_input_denormal(b
, status
);
5025 aSig
= extractFloat64Frac( a
);
5026 aExp
= extractFloat64Exp( a
);
5027 aSign
= extractFloat64Sign( a
);
5028 bSig
= extractFloat64Frac( b
);
5029 bExp
= extractFloat64Exp( b
);
5030 if ( aExp
== 0x7FF ) {
5031 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
5032 return propagateFloat64NaN(a
, b
, status
);
5034 float_raise(float_flag_invalid
, status
);
5035 return float64_default_nan(status
);
5037 if ( bExp
== 0x7FF ) {
5039 return propagateFloat64NaN(a
, b
, status
);
5045 float_raise(float_flag_invalid
, status
);
5046 return float64_default_nan(status
);
5048 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
5051 if ( aSig
== 0 ) return a
;
5052 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
5054 expDiff
= aExp
- bExp
;
5055 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<11;
5056 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
5057 if ( expDiff
< 0 ) {
5058 if ( expDiff
< -1 ) return a
;
5061 q
= ( bSig
<= aSig
);
5062 if ( q
) aSig
-= bSig
;
5064 while ( 0 < expDiff
) {
5065 q
= estimateDiv128To64( aSig
, 0, bSig
);
5066 q
= ( 2 < q
) ? q
- 2 : 0;
5067 aSig
= - ( ( bSig
>>2 ) * q
);
5071 if ( 0 < expDiff
) {
5072 q
= estimateDiv128To64( aSig
, 0, bSig
);
5073 q
= ( 2 < q
) ? q
- 2 : 0;
5076 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
5083 alternateASig
= aSig
;
5086 } while ( 0 <= (int64_t) aSig
);
5087 sigMean
= aSig
+ alternateASig
;
5088 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
5089 aSig
= alternateASig
;
5091 zSign
= ( (int64_t) aSig
< 0 );
5092 if ( zSign
) aSig
= - aSig
;
5093 return normalizeRoundAndPackFloat64(aSign
^ zSign
, bExp
, aSig
, status
);
5097 /*----------------------------------------------------------------------------
5098 | Returns the binary log of the double-precision floating-point value `a'.
5099 | The operation is performed according to the IEC/IEEE Standard for Binary
5100 | Floating-Point Arithmetic.
5101 *----------------------------------------------------------------------------*/
5102 float64
float64_log2(float64 a
, float_status
*status
)
5106 uint64_t aSig
, aSig0
, aSig1
, zSig
, i
;
5107 a
= float64_squash_input_denormal(a
, status
);
5109 aSig
= extractFloat64Frac( a
);
5110 aExp
= extractFloat64Exp( a
);
5111 aSign
= extractFloat64Sign( a
);
5114 if ( aSig
== 0 ) return packFloat64( 1, 0x7FF, 0 );
5115 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
5118 float_raise(float_flag_invalid
, status
);
5119 return float64_default_nan(status
);
5121 if ( aExp
== 0x7FF ) {
5123 return propagateFloat64NaN(a
, float64_zero
, status
);
5129 aSig
|= LIT64( 0x0010000000000000 );
5131 zSig
= (uint64_t)aExp
<< 52;
5132 for (i
= 1LL << 51; i
> 0; i
>>= 1) {
5133 mul64To128( aSig
, aSig
, &aSig0
, &aSig1
);
5134 aSig
= ( aSig0
<< 12 ) | ( aSig1
>> 52 );
5135 if ( aSig
& LIT64( 0x0020000000000000 ) ) {
5143 return normalizeRoundAndPackFloat64(zSign
, 0x408, zSig
, status
);
5146 /*----------------------------------------------------------------------------
5147 | Returns 1 if the double-precision floating-point value `a' is equal to the
5148 | corresponding value `b', and 0 otherwise. The invalid exception is raised
5149 | if either operand is a NaN. Otherwise, the comparison is performed
5150 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5151 *----------------------------------------------------------------------------*/
5153 int float64_eq(float64 a
, float64 b
, float_status
*status
)
5156 a
= float64_squash_input_denormal(a
, status
);
5157 b
= float64_squash_input_denormal(b
, status
);
5159 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
5160 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
5162 float_raise(float_flag_invalid
, status
);
5165 av
= float64_val(a
);
5166 bv
= float64_val(b
);
5167 return ( av
== bv
) || ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
5171 /*----------------------------------------------------------------------------
5172 | Returns 1 if the double-precision floating-point value `a' is less than or
5173 | equal to the corresponding value `b', and 0 otherwise. The invalid
5174 | exception is raised if either operand is a NaN. The comparison is performed
5175 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5176 *----------------------------------------------------------------------------*/
5178 int float64_le(float64 a
, float64 b
, float_status
*status
)
5182 a
= float64_squash_input_denormal(a
, status
);
5183 b
= float64_squash_input_denormal(b
, status
);
5185 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
5186 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
5188 float_raise(float_flag_invalid
, status
);
5191 aSign
= extractFloat64Sign( a
);
5192 bSign
= extractFloat64Sign( b
);
5193 av
= float64_val(a
);
5194 bv
= float64_val(b
);
5195 if ( aSign
!= bSign
) return aSign
|| ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
5196 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
5200 /*----------------------------------------------------------------------------
5201 | Returns 1 if the double-precision floating-point value `a' is less than
5202 | the corresponding value `b', and 0 otherwise. The invalid exception is
5203 | raised if either operand is a NaN. The comparison is performed according
5204 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5205 *----------------------------------------------------------------------------*/
5207 int float64_lt(float64 a
, float64 b
, float_status
*status
)
5212 a
= float64_squash_input_denormal(a
, status
);
5213 b
= float64_squash_input_denormal(b
, status
);
5214 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
5215 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
5217 float_raise(float_flag_invalid
, status
);
5220 aSign
= extractFloat64Sign( a
);
5221 bSign
= extractFloat64Sign( b
);
5222 av
= float64_val(a
);
5223 bv
= float64_val(b
);
5224 if ( aSign
!= bSign
) return aSign
&& ( (uint64_t) ( ( av
| bv
)<<1 ) != 0 );
5225 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
5229 /*----------------------------------------------------------------------------
5230 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
5231 | be compared, and 0 otherwise. The invalid exception is raised if either
5232 | operand is a NaN. The comparison is performed according to the IEC/IEEE
5233 | Standard for Binary Floating-Point Arithmetic.
5234 *----------------------------------------------------------------------------*/
5236 int float64_unordered(float64 a
, float64 b
, float_status
*status
)
5238 a
= float64_squash_input_denormal(a
, status
);
5239 b
= float64_squash_input_denormal(b
, status
);
5241 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
5242 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
5244 float_raise(float_flag_invalid
, status
);
5250 /*----------------------------------------------------------------------------
5251 | Returns 1 if the double-precision floating-point value `a' is equal to the
5252 | corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5253 | exception.The comparison is performed according to the IEC/IEEE Standard
5254 | for Binary Floating-Point Arithmetic.
5255 *----------------------------------------------------------------------------*/
5257 int float64_eq_quiet(float64 a
, float64 b
, float_status
*status
)
5260 a
= float64_squash_input_denormal(a
, status
);
5261 b
= float64_squash_input_denormal(b
, status
);
5263 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
5264 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
5266 if (float64_is_signaling_nan(a
, status
)
5267 || float64_is_signaling_nan(b
, status
)) {
5268 float_raise(float_flag_invalid
, status
);
5272 av
= float64_val(a
);
5273 bv
= float64_val(b
);
5274 return ( av
== bv
) || ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
5278 /*----------------------------------------------------------------------------
5279 | Returns 1 if the double-precision floating-point value `a' is less than or
5280 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5281 | cause an exception. Otherwise, the comparison is performed according to the
5282 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5283 *----------------------------------------------------------------------------*/
5285 int float64_le_quiet(float64 a
, float64 b
, float_status
*status
)
5289 a
= float64_squash_input_denormal(a
, status
);
5290 b
= float64_squash_input_denormal(b
, status
);
5292 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
5293 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
5295 if (float64_is_signaling_nan(a
, status
)
5296 || float64_is_signaling_nan(b
, status
)) {
5297 float_raise(float_flag_invalid
, status
);
5301 aSign
= extractFloat64Sign( a
);
5302 bSign
= extractFloat64Sign( b
);
5303 av
= float64_val(a
);
5304 bv
= float64_val(b
);
5305 if ( aSign
!= bSign
) return aSign
|| ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
5306 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
5310 /*----------------------------------------------------------------------------
5311 | Returns 1 if the double-precision floating-point value `a' is less than
5312 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5313 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
5314 | Standard for Binary Floating-Point Arithmetic.
5315 *----------------------------------------------------------------------------*/
5317 int float64_lt_quiet(float64 a
, float64 b
, float_status
*status
)
5321 a
= float64_squash_input_denormal(a
, status
);
5322 b
= float64_squash_input_denormal(b
, status
);
5324 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
5325 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
5327 if (float64_is_signaling_nan(a
, status
)
5328 || float64_is_signaling_nan(b
, status
)) {
5329 float_raise(float_flag_invalid
, status
);
5333 aSign
= extractFloat64Sign( a
);
5334 bSign
= extractFloat64Sign( b
);
5335 av
= float64_val(a
);
5336 bv
= float64_val(b
);
5337 if ( aSign
!= bSign
) return aSign
&& ( (uint64_t) ( ( av
| bv
)<<1 ) != 0 );
5338 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
5342 /*----------------------------------------------------------------------------
5343 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
5344 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
5345 | comparison is performed according to the IEC/IEEE Standard for Binary
5346 | Floating-Point Arithmetic.
5347 *----------------------------------------------------------------------------*/
5349 int float64_unordered_quiet(float64 a
, float64 b
, float_status
*status
)
5351 a
= float64_squash_input_denormal(a
, status
);
5352 b
= float64_squash_input_denormal(b
, status
);
5354 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
5355 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
5357 if (float64_is_signaling_nan(a
, status
)
5358 || float64_is_signaling_nan(b
, status
)) {
5359 float_raise(float_flag_invalid
, status
);
5366 /*----------------------------------------------------------------------------
5367 | Returns the result of converting the extended double-precision floating-
5368 | point value `a' to the 32-bit two's complement integer format. The
5369 | conversion is performed according to the IEC/IEEE Standard for Binary
5370 | Floating-Point Arithmetic---which means in particular that the conversion
5371 | is rounded according to the current rounding mode. If `a' is a NaN, the
5372 | largest positive integer is returned. Otherwise, if the conversion
5373 | overflows, the largest integer with the same sign as `a' is returned.
5374 *----------------------------------------------------------------------------*/
5376 int32_t floatx80_to_int32(floatx80 a
, float_status
*status
)
5379 int32_t aExp
, shiftCount
;
5382 if (floatx80_invalid_encoding(a
)) {
5383 float_raise(float_flag_invalid
, status
);
5386 aSig
= extractFloatx80Frac( a
);
5387 aExp
= extractFloatx80Exp( a
);
5388 aSign
= extractFloatx80Sign( a
);
5389 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) aSign
= 0;
5390 shiftCount
= 0x4037 - aExp
;
5391 if ( shiftCount
<= 0 ) shiftCount
= 1;
5392 shift64RightJamming( aSig
, shiftCount
, &aSig
);
5393 return roundAndPackInt32(aSign
, aSig
, status
);
5397 /*----------------------------------------------------------------------------
5398 | Returns the result of converting the extended double-precision floating-
5399 | point value `a' to the 32-bit two's complement integer format. The
5400 | conversion is performed according to the IEC/IEEE Standard for Binary
5401 | Floating-Point Arithmetic, except that the conversion is always rounded
5402 | toward zero. If `a' is a NaN, the largest positive integer is returned.
5403 | Otherwise, if the conversion overflows, the largest integer with the same
5404 | sign as `a' is returned.
5405 *----------------------------------------------------------------------------*/
5407 int32_t floatx80_to_int32_round_to_zero(floatx80 a
, float_status
*status
)
5410 int32_t aExp
, shiftCount
;
5411 uint64_t aSig
, savedASig
;
5414 if (floatx80_invalid_encoding(a
)) {
5415 float_raise(float_flag_invalid
, status
);
5418 aSig
= extractFloatx80Frac( a
);
5419 aExp
= extractFloatx80Exp( a
);
5420 aSign
= extractFloatx80Sign( a
);
5421 if ( 0x401E < aExp
) {
5422 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) aSign
= 0;
5425 else if ( aExp
< 0x3FFF ) {
5427 status
->float_exception_flags
|= float_flag_inexact
;
5431 shiftCount
= 0x403E - aExp
;
5433 aSig
>>= shiftCount
;
5435 if ( aSign
) z
= - z
;
5436 if ( ( z
< 0 ) ^ aSign
) {
5438 float_raise(float_flag_invalid
, status
);
5439 return aSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
5441 if ( ( aSig
<<shiftCount
) != savedASig
) {
5442 status
->float_exception_flags
|= float_flag_inexact
;
5448 /*----------------------------------------------------------------------------
5449 | Returns the result of converting the extended double-precision floating-
5450 | point value `a' to the 64-bit two's complement integer format. The
5451 | conversion is performed according to the IEC/IEEE Standard for Binary
5452 | Floating-Point Arithmetic---which means in particular that the conversion
5453 | is rounded according to the current rounding mode. If `a' is a NaN,
5454 | the largest positive integer is returned. Otherwise, if the conversion
5455 | overflows, the largest integer with the same sign as `a' is returned.
5456 *----------------------------------------------------------------------------*/
5458 int64_t floatx80_to_int64(floatx80 a
, float_status
*status
)
5461 int32_t aExp
, shiftCount
;
5462 uint64_t aSig
, aSigExtra
;
5464 if (floatx80_invalid_encoding(a
)) {
5465 float_raise(float_flag_invalid
, status
);
5468 aSig
= extractFloatx80Frac( a
);
5469 aExp
= extractFloatx80Exp( a
);
5470 aSign
= extractFloatx80Sign( a
);
5471 shiftCount
= 0x403E - aExp
;
5472 if ( shiftCount
<= 0 ) {
5474 float_raise(float_flag_invalid
, status
);
5475 if (!aSign
|| floatx80_is_any_nan(a
)) {
5476 return LIT64( 0x7FFFFFFFFFFFFFFF );
5478 return (int64_t) LIT64( 0x8000000000000000 );
5483 shift64ExtraRightJamming( aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
5485 return roundAndPackInt64(aSign
, aSig
, aSigExtra
, status
);
5489 /*----------------------------------------------------------------------------
5490 | Returns the result of converting the extended double-precision floating-
5491 | point value `a' to the 64-bit two's complement integer format. The
5492 | conversion is performed according to the IEC/IEEE Standard for Binary
5493 | Floating-Point Arithmetic, except that the conversion is always rounded
5494 | toward zero. If `a' is a NaN, the largest positive integer is returned.
5495 | Otherwise, if the conversion overflows, the largest integer with the same
5496 | sign as `a' is returned.
5497 *----------------------------------------------------------------------------*/
5499 int64_t floatx80_to_int64_round_to_zero(floatx80 a
, float_status
*status
)
5502 int32_t aExp
, shiftCount
;
5506 if (floatx80_invalid_encoding(a
)) {
5507 float_raise(float_flag_invalid
, status
);
5510 aSig
= extractFloatx80Frac( a
);
5511 aExp
= extractFloatx80Exp( a
);
5512 aSign
= extractFloatx80Sign( a
);
5513 shiftCount
= aExp
- 0x403E;
5514 if ( 0 <= shiftCount
) {
5515 aSig
&= LIT64( 0x7FFFFFFFFFFFFFFF );
5516 if ( ( a
.high
!= 0xC03E ) || aSig
) {
5517 float_raise(float_flag_invalid
, status
);
5518 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && aSig
) ) {
5519 return LIT64( 0x7FFFFFFFFFFFFFFF );
5522 return (int64_t) LIT64( 0x8000000000000000 );
5524 else if ( aExp
< 0x3FFF ) {
5526 status
->float_exception_flags
|= float_flag_inexact
;
5530 z
= aSig
>>( - shiftCount
);
5531 if ( (uint64_t) ( aSig
<<( shiftCount
& 63 ) ) ) {
5532 status
->float_exception_flags
|= float_flag_inexact
;
5534 if ( aSign
) z
= - z
;
5539 /*----------------------------------------------------------------------------
5540 | Returns the result of converting the extended double-precision floating-
5541 | point value `a' to the single-precision floating-point format. The
5542 | conversion is performed according to the IEC/IEEE Standard for Binary
5543 | Floating-Point Arithmetic.
5544 *----------------------------------------------------------------------------*/
5546 float32
floatx80_to_float32(floatx80 a
, float_status
*status
)
5552 if (floatx80_invalid_encoding(a
)) {
5553 float_raise(float_flag_invalid
, status
);
5554 return float32_default_nan(status
);
5556 aSig
= extractFloatx80Frac( a
);
5557 aExp
= extractFloatx80Exp( a
);
5558 aSign
= extractFloatx80Sign( a
);
5559 if ( aExp
== 0x7FFF ) {
5560 if ( (uint64_t) ( aSig
<<1 ) ) {
5561 return commonNaNToFloat32(floatx80ToCommonNaN(a
, status
), status
);
5563 return packFloat32( aSign
, 0xFF, 0 );
5565 shift64RightJamming( aSig
, 33, &aSig
);
5566 if ( aExp
|| aSig
) aExp
-= 0x3F81;
5567 return roundAndPackFloat32(aSign
, aExp
, aSig
, status
);
5571 /*----------------------------------------------------------------------------
5572 | Returns the result of converting the extended double-precision floating-
5573 | point value `a' to the double-precision floating-point format. The
5574 | conversion is performed according to the IEC/IEEE Standard for Binary
5575 | Floating-Point Arithmetic.
5576 *----------------------------------------------------------------------------*/
5578 float64
floatx80_to_float64(floatx80 a
, float_status
*status
)
5582 uint64_t aSig
, zSig
;
5584 if (floatx80_invalid_encoding(a
)) {
5585 float_raise(float_flag_invalid
, status
);
5586 return float64_default_nan(status
);
5588 aSig
= extractFloatx80Frac( a
);
5589 aExp
= extractFloatx80Exp( a
);
5590 aSign
= extractFloatx80Sign( a
);
5591 if ( aExp
== 0x7FFF ) {
5592 if ( (uint64_t) ( aSig
<<1 ) ) {
5593 return commonNaNToFloat64(floatx80ToCommonNaN(a
, status
), status
);
5595 return packFloat64( aSign
, 0x7FF, 0 );
5597 shift64RightJamming( aSig
, 1, &zSig
);
5598 if ( aExp
|| aSig
) aExp
-= 0x3C01;
5599 return roundAndPackFloat64(aSign
, aExp
, zSig
, status
);
5603 /*----------------------------------------------------------------------------
5604 | Returns the result of converting the extended double-precision floating-
5605 | point value `a' to the quadruple-precision floating-point format. The
5606 | conversion is performed according to the IEC/IEEE Standard for Binary
5607 | Floating-Point Arithmetic.
5608 *----------------------------------------------------------------------------*/
5610 float128
floatx80_to_float128(floatx80 a
, float_status
*status
)
5614 uint64_t aSig
, zSig0
, zSig1
;
5616 if (floatx80_invalid_encoding(a
)) {
5617 float_raise(float_flag_invalid
, status
);
5618 return float128_default_nan(status
);
5620 aSig
= extractFloatx80Frac( a
);
5621 aExp
= extractFloatx80Exp( a
);
5622 aSign
= extractFloatx80Sign( a
);
5623 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) {
5624 return commonNaNToFloat128(floatx80ToCommonNaN(a
, status
), status
);
5626 shift128Right( aSig
<<1, 0, 16, &zSig0
, &zSig1
);
5627 return packFloat128( aSign
, aExp
, zSig0
, zSig1
);
5631 /*----------------------------------------------------------------------------
5632 | Rounds the extended double-precision floating-point value `a'
5633 | to the precision provided by floatx80_rounding_precision and returns the
5634 | result as an extended double-precision floating-point value.
5635 | The operation is performed according to the IEC/IEEE Standard for Binary
5636 | Floating-Point Arithmetic.
5637 *----------------------------------------------------------------------------*/
5639 floatx80
floatx80_round(floatx80 a
, float_status
*status
)
5641 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5642 extractFloatx80Sign(a
),
5643 extractFloatx80Exp(a
),
5644 extractFloatx80Frac(a
), 0, status
);
5647 /*----------------------------------------------------------------------------
5648 | Rounds the extended double-precision floating-point value `a' to an integer,
5649 | and returns the result as an extended quadruple-precision floating-point
5650 | value. The operation is performed according to the IEC/IEEE Standard for
5651 | Binary Floating-Point Arithmetic.
5652 *----------------------------------------------------------------------------*/
5654 floatx80
floatx80_round_to_int(floatx80 a
, float_status
*status
)
5658 uint64_t lastBitMask
, roundBitsMask
;
5661 if (floatx80_invalid_encoding(a
)) {
5662 float_raise(float_flag_invalid
, status
);
5663 return floatx80_default_nan(status
);
5665 aExp
= extractFloatx80Exp( a
);
5666 if ( 0x403E <= aExp
) {
5667 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) ) {
5668 return propagateFloatx80NaN(a
, a
, status
);
5672 if ( aExp
< 0x3FFF ) {
5674 && ( (uint64_t) ( extractFloatx80Frac( a
)<<1 ) == 0 ) ) {
5677 status
->float_exception_flags
|= float_flag_inexact
;
5678 aSign
= extractFloatx80Sign( a
);
5679 switch (status
->float_rounding_mode
) {
5680 case float_round_nearest_even
:
5681 if ( ( aExp
== 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a
)<<1 )
5684 packFloatx80( aSign
, 0x3FFF, LIT64( 0x8000000000000000 ) );
5687 case float_round_ties_away
:
5688 if (aExp
== 0x3FFE) {
5689 return packFloatx80(aSign
, 0x3FFF, LIT64(0x8000000000000000));
5692 case float_round_down
:
5695 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
5696 : packFloatx80( 0, 0, 0 );
5697 case float_round_up
:
5699 aSign
? packFloatx80( 1, 0, 0 )
5700 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
5702 return packFloatx80( aSign
, 0, 0 );
5705 lastBitMask
<<= 0x403E - aExp
;
5706 roundBitsMask
= lastBitMask
- 1;
5708 switch (status
->float_rounding_mode
) {
5709 case float_round_nearest_even
:
5710 z
.low
+= lastBitMask
>>1;
5711 if ((z
.low
& roundBitsMask
) == 0) {
5712 z
.low
&= ~lastBitMask
;
5715 case float_round_ties_away
:
5716 z
.low
+= lastBitMask
>> 1;
5718 case float_round_to_zero
:
5720 case float_round_up
:
5721 if (!extractFloatx80Sign(z
)) {
5722 z
.low
+= roundBitsMask
;
5725 case float_round_down
:
5726 if (extractFloatx80Sign(z
)) {
5727 z
.low
+= roundBitsMask
;
5733 z
.low
&= ~ roundBitsMask
;
5736 z
.low
= LIT64( 0x8000000000000000 );
5738 if (z
.low
!= a
.low
) {
5739 status
->float_exception_flags
|= float_flag_inexact
;
5745 /*----------------------------------------------------------------------------
5746 | Returns the result of adding the absolute values of the extended double-
5747 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
5748 | negated before being returned. `zSign' is ignored if the result is a NaN.
5749 | The addition is performed according to the IEC/IEEE Standard for Binary
5750 | Floating-Point Arithmetic.
5751 *----------------------------------------------------------------------------*/
5753 static floatx80
addFloatx80Sigs(floatx80 a
, floatx80 b
, flag zSign
,
5754 float_status
*status
)
5756 int32_t aExp
, bExp
, zExp
;
5757 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5760 aSig
= extractFloatx80Frac( a
);
5761 aExp
= extractFloatx80Exp( a
);
5762 bSig
= extractFloatx80Frac( b
);
5763 bExp
= extractFloatx80Exp( b
);
5764 expDiff
= aExp
- bExp
;
5765 if ( 0 < expDiff
) {
5766 if ( aExp
== 0x7FFF ) {
5767 if ((uint64_t)(aSig
<< 1)) {
5768 return propagateFloatx80NaN(a
, b
, status
);
5772 if ( bExp
== 0 ) --expDiff
;
5773 shift64ExtraRightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
5776 else if ( expDiff
< 0 ) {
5777 if ( bExp
== 0x7FFF ) {
5778 if ((uint64_t)(bSig
<< 1)) {
5779 return propagateFloatx80NaN(a
, b
, status
);
5781 return packFloatx80(zSign
,
5782 floatx80_infinity_high
,
5783 floatx80_infinity_low
);
5785 if ( aExp
== 0 ) ++expDiff
;
5786 shift64ExtraRightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
5790 if ( aExp
== 0x7FFF ) {
5791 if ( (uint64_t) ( ( aSig
| bSig
)<<1 ) ) {
5792 return propagateFloatx80NaN(a
, b
, status
);
5797 zSig0
= aSig
+ bSig
;
5799 normalizeFloatx80Subnormal( zSig0
, &zExp
, &zSig0
);
5805 zSig0
= aSig
+ bSig
;
5806 if ( (int64_t) zSig0
< 0 ) goto roundAndPack
;
5808 shift64ExtraRightJamming( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
5809 zSig0
|= LIT64( 0x8000000000000000 );
5812 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5813 zSign
, zExp
, zSig0
, zSig1
, status
);
5816 /*----------------------------------------------------------------------------
5817 | Returns the result of subtracting the absolute values of the extended
5818 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
5819 | difference is negated before being returned. `zSign' is ignored if the
5820 | result is a NaN. The subtraction is performed according to the IEC/IEEE
5821 | Standard for Binary Floating-Point Arithmetic.
5822 *----------------------------------------------------------------------------*/
5824 static floatx80
subFloatx80Sigs(floatx80 a
, floatx80 b
, flag zSign
,
5825 float_status
*status
)
5827 int32_t aExp
, bExp
, zExp
;
5828 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5831 aSig
= extractFloatx80Frac( a
);
5832 aExp
= extractFloatx80Exp( a
);
5833 bSig
= extractFloatx80Frac( b
);
5834 bExp
= extractFloatx80Exp( b
);
5835 expDiff
= aExp
- bExp
;
5836 if ( 0 < expDiff
) goto aExpBigger
;
5837 if ( expDiff
< 0 ) goto bExpBigger
;
5838 if ( aExp
== 0x7FFF ) {
5839 if ( (uint64_t) ( ( aSig
| bSig
)<<1 ) ) {
5840 return propagateFloatx80NaN(a
, b
, status
);
5842 float_raise(float_flag_invalid
, status
);
5843 return floatx80_default_nan(status
);
5850 if ( bSig
< aSig
) goto aBigger
;
5851 if ( aSig
< bSig
) goto bBigger
;
5852 return packFloatx80(status
->float_rounding_mode
== float_round_down
, 0, 0);
5854 if ( bExp
== 0x7FFF ) {
5855 if ((uint64_t)(bSig
<< 1)) {
5856 return propagateFloatx80NaN(a
, b
, status
);
5858 return packFloatx80(zSign
^ 1, floatx80_infinity_high
,
5859 floatx80_infinity_low
);
5861 if ( aExp
== 0 ) ++expDiff
;
5862 shift128RightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
5864 sub128( bSig
, 0, aSig
, zSig1
, &zSig0
, &zSig1
);
5867 goto normalizeRoundAndPack
;
5869 if ( aExp
== 0x7FFF ) {
5870 if ((uint64_t)(aSig
<< 1)) {
5871 return propagateFloatx80NaN(a
, b
, status
);
5875 if ( bExp
== 0 ) --expDiff
;
5876 shift128RightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
5878 sub128( aSig
, 0, bSig
, zSig1
, &zSig0
, &zSig1
);
5880 normalizeRoundAndPack
:
5881 return normalizeRoundAndPackFloatx80(status
->floatx80_rounding_precision
,
5882 zSign
, zExp
, zSig0
, zSig1
, status
);
5885 /*----------------------------------------------------------------------------
5886 | Returns the result of adding the extended double-precision floating-point
5887 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5888 | Standard for Binary Floating-Point Arithmetic.
5889 *----------------------------------------------------------------------------*/
5891 floatx80
floatx80_add(floatx80 a
, floatx80 b
, float_status
*status
)
5895 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5896 float_raise(float_flag_invalid
, status
);
5897 return floatx80_default_nan(status
);
5899 aSign
= extractFloatx80Sign( a
);
5900 bSign
= extractFloatx80Sign( b
);
5901 if ( aSign
== bSign
) {
5902 return addFloatx80Sigs(a
, b
, aSign
, status
);
5905 return subFloatx80Sigs(a
, b
, aSign
, status
);
5910 /*----------------------------------------------------------------------------
5911 | Returns the result of subtracting the extended double-precision floating-
5912 | point values `a' and `b'. The operation is performed according to the
5913 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5914 *----------------------------------------------------------------------------*/
5916 floatx80
floatx80_sub(floatx80 a
, floatx80 b
, float_status
*status
)
5920 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5921 float_raise(float_flag_invalid
, status
);
5922 return floatx80_default_nan(status
);
5924 aSign
= extractFloatx80Sign( a
);
5925 bSign
= extractFloatx80Sign( b
);
5926 if ( aSign
== bSign
) {
5927 return subFloatx80Sigs(a
, b
, aSign
, status
);
5930 return addFloatx80Sigs(a
, b
, aSign
, status
);
5935 /*----------------------------------------------------------------------------
5936 | Returns the result of multiplying the extended double-precision floating-
5937 | point values `a' and `b'. The operation is performed according to the
5938 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5939 *----------------------------------------------------------------------------*/
5941 floatx80
floatx80_mul(floatx80 a
, floatx80 b
, float_status
*status
)
5943 flag aSign
, bSign
, zSign
;
5944 int32_t aExp
, bExp
, zExp
;
5945 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5947 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5948 float_raise(float_flag_invalid
, status
);
5949 return floatx80_default_nan(status
);
5951 aSig
= extractFloatx80Frac( a
);
5952 aExp
= extractFloatx80Exp( a
);
5953 aSign
= extractFloatx80Sign( a
);
5954 bSig
= extractFloatx80Frac( b
);
5955 bExp
= extractFloatx80Exp( b
);
5956 bSign
= extractFloatx80Sign( b
);
5957 zSign
= aSign
^ bSign
;
5958 if ( aExp
== 0x7FFF ) {
5959 if ( (uint64_t) ( aSig
<<1 )
5960 || ( ( bExp
== 0x7FFF ) && (uint64_t) ( bSig
<<1 ) ) ) {
5961 return propagateFloatx80NaN(a
, b
, status
);
5963 if ( ( bExp
| bSig
) == 0 ) goto invalid
;
5964 return packFloatx80(zSign
, floatx80_infinity_high
,
5965 floatx80_infinity_low
);
5967 if ( bExp
== 0x7FFF ) {
5968 if ((uint64_t)(bSig
<< 1)) {
5969 return propagateFloatx80NaN(a
, b
, status
);
5971 if ( ( aExp
| aSig
) == 0 ) {
5973 float_raise(float_flag_invalid
, status
);
5974 return floatx80_default_nan(status
);
5976 return packFloatx80(zSign
, floatx80_infinity_high
,
5977 floatx80_infinity_low
);
5980 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
5981 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
5984 if ( bSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
5985 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
5987 zExp
= aExp
+ bExp
- 0x3FFE;
5988 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
5989 if ( 0 < (int64_t) zSig0
) {
5990 shortShift128Left( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
5993 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5994 zSign
, zExp
, zSig0
, zSig1
, status
);
5997 /*----------------------------------------------------------------------------
5998 | Returns the result of dividing the extended double-precision floating-point
5999 | value `a' by the corresponding value `b'. The operation is performed
6000 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6001 *----------------------------------------------------------------------------*/
6003 floatx80
floatx80_div(floatx80 a
, floatx80 b
, float_status
*status
)
6005 flag aSign
, bSign
, zSign
;
6006 int32_t aExp
, bExp
, zExp
;
6007 uint64_t aSig
, bSig
, zSig0
, zSig1
;
6008 uint64_t rem0
, rem1
, rem2
, term0
, term1
, term2
;
6010 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6011 float_raise(float_flag_invalid
, status
);
6012 return floatx80_default_nan(status
);
6014 aSig
= extractFloatx80Frac( a
);
6015 aExp
= extractFloatx80Exp( a
);
6016 aSign
= extractFloatx80Sign( a
);
6017 bSig
= extractFloatx80Frac( b
);
6018 bExp
= extractFloatx80Exp( b
);
6019 bSign
= extractFloatx80Sign( b
);
6020 zSign
= aSign
^ bSign
;
6021 if ( aExp
== 0x7FFF ) {
6022 if ((uint64_t)(aSig
<< 1)) {
6023 return propagateFloatx80NaN(a
, b
, status
);
6025 if ( bExp
== 0x7FFF ) {
6026 if ((uint64_t)(bSig
<< 1)) {
6027 return propagateFloatx80NaN(a
, b
, status
);
6031 return packFloatx80(zSign
, floatx80_infinity_high
,
6032 floatx80_infinity_low
);
6034 if ( bExp
== 0x7FFF ) {
6035 if ((uint64_t)(bSig
<< 1)) {
6036 return propagateFloatx80NaN(a
, b
, status
);
6038 return packFloatx80( zSign
, 0, 0 );
6042 if ( ( aExp
| aSig
) == 0 ) {
6044 float_raise(float_flag_invalid
, status
);
6045 return floatx80_default_nan(status
);
6047 float_raise(float_flag_divbyzero
, status
);
6048 return packFloatx80(zSign
, floatx80_infinity_high
,
6049 floatx80_infinity_low
);
6051 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
6054 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
6055 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
6057 zExp
= aExp
- bExp
+ 0x3FFE;
6059 if ( bSig
<= aSig
) {
6060 shift128Right( aSig
, 0, 1, &aSig
, &rem1
);
6063 zSig0
= estimateDiv128To64( aSig
, rem1
, bSig
);
6064 mul64To128( bSig
, zSig0
, &term0
, &term1
);
6065 sub128( aSig
, rem1
, term0
, term1
, &rem0
, &rem1
);
6066 while ( (int64_t) rem0
< 0 ) {
6068 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
6070 zSig1
= estimateDiv128To64( rem1
, 0, bSig
);
6071 if ( (uint64_t) ( zSig1
<<1 ) <= 8 ) {
6072 mul64To128( bSig
, zSig1
, &term1
, &term2
);
6073 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
6074 while ( (int64_t) rem1
< 0 ) {
6076 add128( rem1
, rem2
, 0, bSig
, &rem1
, &rem2
);
6078 zSig1
|= ( ( rem1
| rem2
) != 0 );
6080 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
6081 zSign
, zExp
, zSig0
, zSig1
, status
);
6084 /*----------------------------------------------------------------------------
6085 | Returns the remainder of the extended double-precision floating-point value
6086 | `a' with respect to the corresponding value `b'. The operation is performed
6087 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6088 *----------------------------------------------------------------------------*/
6090 floatx80
floatx80_rem(floatx80 a
, floatx80 b
, float_status
*status
)
6093 int32_t aExp
, bExp
, expDiff
;
6094 uint64_t aSig0
, aSig1
, bSig
;
6095 uint64_t q
, term0
, term1
, alternateASig0
, alternateASig1
;
6097 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6098 float_raise(float_flag_invalid
, status
);
6099 return floatx80_default_nan(status
);
6101 aSig0
= extractFloatx80Frac( a
);
6102 aExp
= extractFloatx80Exp( a
);
6103 aSign
= extractFloatx80Sign( a
);
6104 bSig
= extractFloatx80Frac( b
);
6105 bExp
= extractFloatx80Exp( b
);
6106 if ( aExp
== 0x7FFF ) {
6107 if ( (uint64_t) ( aSig0
<<1 )
6108 || ( ( bExp
== 0x7FFF ) && (uint64_t) ( bSig
<<1 ) ) ) {
6109 return propagateFloatx80NaN(a
, b
, status
);
6113 if ( bExp
== 0x7FFF ) {
6114 if ((uint64_t)(bSig
<< 1)) {
6115 return propagateFloatx80NaN(a
, b
, status
);
6122 float_raise(float_flag_invalid
, status
);
6123 return floatx80_default_nan(status
);
6125 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
6128 if ( (uint64_t) ( aSig0
<<1 ) == 0 ) return a
;
6129 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
6131 bSig
|= LIT64( 0x8000000000000000 );
6133 expDiff
= aExp
- bExp
;
6135 if ( expDiff
< 0 ) {
6136 if ( expDiff
< -1 ) return a
;
6137 shift128Right( aSig0
, 0, 1, &aSig0
, &aSig1
);
6140 q
= ( bSig
<= aSig0
);
6141 if ( q
) aSig0
-= bSig
;
6143 while ( 0 < expDiff
) {
6144 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
6145 q
= ( 2 < q
) ? q
- 2 : 0;
6146 mul64To128( bSig
, q
, &term0
, &term1
);
6147 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
6148 shortShift128Left( aSig0
, aSig1
, 62, &aSig0
, &aSig1
);
6152 if ( 0 < expDiff
) {
6153 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
6154 q
= ( 2 < q
) ? q
- 2 : 0;
6156 mul64To128( bSig
, q
<<( 64 - expDiff
), &term0
, &term1
);
6157 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
6158 shortShift128Left( 0, bSig
, 64 - expDiff
, &term0
, &term1
);
6159 while ( le128( term0
, term1
, aSig0
, aSig1
) ) {
6161 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
6168 sub128( term0
, term1
, aSig0
, aSig1
, &alternateASig0
, &alternateASig1
);
6169 if ( lt128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
6170 || ( eq128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
6173 aSig0
= alternateASig0
;
6174 aSig1
= alternateASig1
;
6178 normalizeRoundAndPackFloatx80(
6179 80, zSign
, bExp
+ expDiff
, aSig0
, aSig1
, status
);
6183 /*----------------------------------------------------------------------------
6184 | Returns the square root of the extended double-precision floating-point
6185 | value `a'. The operation is performed according to the IEC/IEEE Standard
6186 | for Binary Floating-Point Arithmetic.
6187 *----------------------------------------------------------------------------*/
6189 floatx80
floatx80_sqrt(floatx80 a
, float_status
*status
)
6193 uint64_t aSig0
, aSig1
, zSig0
, zSig1
, doubleZSig0
;
6194 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
6196 if (floatx80_invalid_encoding(a
)) {
6197 float_raise(float_flag_invalid
, status
);
6198 return floatx80_default_nan(status
);
6200 aSig0
= extractFloatx80Frac( a
);
6201 aExp
= extractFloatx80Exp( a
);
6202 aSign
= extractFloatx80Sign( a
);
6203 if ( aExp
== 0x7FFF ) {
6204 if ((uint64_t)(aSig0
<< 1)) {
6205 return propagateFloatx80NaN(a
, a
, status
);
6207 if ( ! aSign
) return a
;
6211 if ( ( aExp
| aSig0
) == 0 ) return a
;
6213 float_raise(float_flag_invalid
, status
);
6214 return floatx80_default_nan(status
);
6217 if ( aSig0
== 0 ) return packFloatx80( 0, 0, 0 );
6218 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
6220 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFF;
6221 zSig0
= estimateSqrt32( aExp
, aSig0
>>32 );
6222 shift128Right( aSig0
, 0, 2 + ( aExp
& 1 ), &aSig0
, &aSig1
);
6223 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
6224 doubleZSig0
= zSig0
<<1;
6225 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
6226 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
6227 while ( (int64_t) rem0
< 0 ) {
6230 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
6232 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
6233 if ( ( zSig1
& LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
6234 if ( zSig1
== 0 ) zSig1
= 1;
6235 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
6236 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
6237 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
6238 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
6239 while ( (int64_t) rem1
< 0 ) {
6241 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
6243 term2
|= doubleZSig0
;
6244 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
6246 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
6248 shortShift128Left( 0, zSig1
, 1, &zSig0
, &zSig1
);
6249 zSig0
|= doubleZSig0
;
6250 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
6251 0, zExp
, zSig0
, zSig1
, status
);
6254 /*----------------------------------------------------------------------------
6255 | Returns 1 if the extended double-precision floating-point value `a' is equal
6256 | to the corresponding value `b', and 0 otherwise. The invalid exception is
6257 | raised if either operand is a NaN. Otherwise, the comparison is performed
6258 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6259 *----------------------------------------------------------------------------*/
6261 int floatx80_eq(floatx80 a
, floatx80 b
, float_status
*status
)
6264 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)
6265 || (extractFloatx80Exp(a
) == 0x7FFF
6266 && (uint64_t) (extractFloatx80Frac(a
) << 1))
6267 || (extractFloatx80Exp(b
) == 0x7FFF
6268 && (uint64_t) (extractFloatx80Frac(b
) << 1))
6270 float_raise(float_flag_invalid
, status
);
6275 && ( ( a
.high
== b
.high
)
6277 && ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
6282 /*----------------------------------------------------------------------------
6283 | Returns 1 if the extended double-precision floating-point value `a' is
6284 | less than or equal to the corresponding value `b', and 0 otherwise. The
6285 | invalid exception is raised if either operand is a NaN. The comparison is
6286 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6288 *----------------------------------------------------------------------------*/
6290 int floatx80_le(floatx80 a
, floatx80 b
, float_status
*status
)
6294 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)
6295 || (extractFloatx80Exp(a
) == 0x7FFF
6296 && (uint64_t) (extractFloatx80Frac(a
) << 1))
6297 || (extractFloatx80Exp(b
) == 0x7FFF
6298 && (uint64_t) (extractFloatx80Frac(b
) << 1))
6300 float_raise(float_flag_invalid
, status
);
6303 aSign
= extractFloatx80Sign( a
);
6304 bSign
= extractFloatx80Sign( b
);
6305 if ( aSign
!= bSign
) {
6308 || ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
6312 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
6313 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
6317 /*----------------------------------------------------------------------------
6318 | Returns 1 if the extended double-precision floating-point value `a' is
6319 | less than the corresponding value `b', and 0 otherwise. The invalid
6320 | exception is raised if either operand is a NaN. The comparison is performed
6321 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6322 *----------------------------------------------------------------------------*/
6324 int floatx80_lt(floatx80 a
, floatx80 b
, float_status
*status
)
6328 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)
6329 || (extractFloatx80Exp(a
) == 0x7FFF
6330 && (uint64_t) (extractFloatx80Frac(a
) << 1))
6331 || (extractFloatx80Exp(b
) == 0x7FFF
6332 && (uint64_t) (extractFloatx80Frac(b
) << 1))
6334 float_raise(float_flag_invalid
, status
);
6337 aSign
= extractFloatx80Sign( a
);
6338 bSign
= extractFloatx80Sign( b
);
6339 if ( aSign
!= bSign
) {
6342 && ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
6346 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
6347 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
6351 /*----------------------------------------------------------------------------
6352 | Returns 1 if the extended double-precision floating-point values `a' and `b'
6353 | cannot be compared, and 0 otherwise. The invalid exception is raised if
6354 | either operand is a NaN. The comparison is performed according to the
6355 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6356 *----------------------------------------------------------------------------*/
6357 int floatx80_unordered(floatx80 a
, floatx80 b
, float_status
*status
)
6359 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)
6360 || (extractFloatx80Exp(a
) == 0x7FFF
6361 && (uint64_t) (extractFloatx80Frac(a
) << 1))
6362 || (extractFloatx80Exp(b
) == 0x7FFF
6363 && (uint64_t) (extractFloatx80Frac(b
) << 1))
6365 float_raise(float_flag_invalid
, status
);
6371 /*----------------------------------------------------------------------------
6372 | Returns 1 if the extended double-precision floating-point value `a' is
6373 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
6374 | cause an exception. The comparison is performed according to the IEC/IEEE
6375 | Standard for Binary Floating-Point Arithmetic.
6376 *----------------------------------------------------------------------------*/
6378 int floatx80_eq_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
6381 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6382 float_raise(float_flag_invalid
, status
);
6385 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
6386 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
6387 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
6388 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
6390 if (floatx80_is_signaling_nan(a
, status
)
6391 || floatx80_is_signaling_nan(b
, status
)) {
6392 float_raise(float_flag_invalid
, status
);
6398 && ( ( a
.high
== b
.high
)
6400 && ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
6405 /*----------------------------------------------------------------------------
6406 | Returns 1 if the extended double-precision floating-point value `a' is less
6407 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
6408 | do not cause an exception. Otherwise, the comparison is performed according
6409 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6410 *----------------------------------------------------------------------------*/
6412 int floatx80_le_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
6416 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6417 float_raise(float_flag_invalid
, status
);
6420 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
6421 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
6422 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
6423 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
6425 if (floatx80_is_signaling_nan(a
, status
)
6426 || floatx80_is_signaling_nan(b
, status
)) {
6427 float_raise(float_flag_invalid
, status
);
6431 aSign
= extractFloatx80Sign( a
);
6432 bSign
= extractFloatx80Sign( b
);
6433 if ( aSign
!= bSign
) {
6436 || ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
6440 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
6441 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
6445 /*----------------------------------------------------------------------------
6446 | Returns 1 if the extended double-precision floating-point value `a' is less
6447 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
6448 | an exception. Otherwise, the comparison is performed according to the
6449 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6450 *----------------------------------------------------------------------------*/
6452 int floatx80_lt_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
6456 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6457 float_raise(float_flag_invalid
, status
);
6460 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
6461 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
6462 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
6463 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
6465 if (floatx80_is_signaling_nan(a
, status
)
6466 || floatx80_is_signaling_nan(b
, status
)) {
6467 float_raise(float_flag_invalid
, status
);
6471 aSign
= extractFloatx80Sign( a
);
6472 bSign
= extractFloatx80Sign( b
);
6473 if ( aSign
!= bSign
) {
6476 && ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
6480 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
6481 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
6485 /*----------------------------------------------------------------------------
6486 | Returns 1 if the extended double-precision floating-point values `a' and `b'
6487 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
6488 | The comparison is performed according to the IEC/IEEE Standard for Binary
6489 | Floating-Point Arithmetic.
6490 *----------------------------------------------------------------------------*/
6491 int floatx80_unordered_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
6493 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6494 float_raise(float_flag_invalid
, status
);
6497 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
6498 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
6499 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
6500 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
6502 if (floatx80_is_signaling_nan(a
, status
)
6503 || floatx80_is_signaling_nan(b
, status
)) {
6504 float_raise(float_flag_invalid
, status
);
6511 /*----------------------------------------------------------------------------
6512 | Returns the result of converting the quadruple-precision floating-point
6513 | value `a' to the 32-bit two's complement integer format. The conversion
6514 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6515 | Arithmetic---which means in particular that the conversion is rounded
6516 | according to the current rounding mode. If `a' is a NaN, the largest
6517 | positive integer is returned. Otherwise, if the conversion overflows, the
6518 | largest integer with the same sign as `a' is returned.
6519 *----------------------------------------------------------------------------*/
6521 int32_t float128_to_int32(float128 a
, float_status
*status
)
6524 int32_t aExp
, shiftCount
;
6525 uint64_t aSig0
, aSig1
;
6527 aSig1
= extractFloat128Frac1( a
);
6528 aSig0
= extractFloat128Frac0( a
);
6529 aExp
= extractFloat128Exp( a
);
6530 aSign
= extractFloat128Sign( a
);
6531 if ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) aSign
= 0;
6532 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
6533 aSig0
|= ( aSig1
!= 0 );
6534 shiftCount
= 0x4028 - aExp
;
6535 if ( 0 < shiftCount
) shift64RightJamming( aSig0
, shiftCount
, &aSig0
);
6536 return roundAndPackInt32(aSign
, aSig0
, status
);
6540 /*----------------------------------------------------------------------------
6541 | Returns the result of converting the quadruple-precision floating-point
6542 | value `a' to the 32-bit two's complement integer format. The conversion
6543 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6544 | Arithmetic, except that the conversion is always rounded toward zero. If
6545 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
6546 | conversion overflows, the largest integer with the same sign as `a' is
6548 *----------------------------------------------------------------------------*/
6550 int32_t float128_to_int32_round_to_zero(float128 a
, float_status
*status
)
6553 int32_t aExp
, shiftCount
;
6554 uint64_t aSig0
, aSig1
, savedASig
;
6557 aSig1
= extractFloat128Frac1( a
);
6558 aSig0
= extractFloat128Frac0( a
);
6559 aExp
= extractFloat128Exp( a
);
6560 aSign
= extractFloat128Sign( a
);
6561 aSig0
|= ( aSig1
!= 0 );
6562 if ( 0x401E < aExp
) {
6563 if ( ( aExp
== 0x7FFF ) && aSig0
) aSign
= 0;
6566 else if ( aExp
< 0x3FFF ) {
6567 if (aExp
|| aSig0
) {
6568 status
->float_exception_flags
|= float_flag_inexact
;
6572 aSig0
|= LIT64( 0x0001000000000000 );
6573 shiftCount
= 0x402F - aExp
;
6575 aSig0
>>= shiftCount
;
6577 if ( aSign
) z
= - z
;
6578 if ( ( z
< 0 ) ^ aSign
) {
6580 float_raise(float_flag_invalid
, status
);
6581 return aSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
6583 if ( ( aSig0
<<shiftCount
) != savedASig
) {
6584 status
->float_exception_flags
|= float_flag_inexact
;
6590 /*----------------------------------------------------------------------------
6591 | Returns the result of converting the quadruple-precision floating-point
6592 | value `a' to the 64-bit two's complement integer format. The conversion
6593 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6594 | Arithmetic---which means in particular that the conversion is rounded
6595 | according to the current rounding mode. If `a' is a NaN, the largest
6596 | positive integer is returned. Otherwise, if the conversion overflows, the
6597 | largest integer with the same sign as `a' is returned.
6598 *----------------------------------------------------------------------------*/
6600 int64_t float128_to_int64(float128 a
, float_status
*status
)
6603 int32_t aExp
, shiftCount
;
6604 uint64_t aSig0
, aSig1
;
6606 aSig1
= extractFloat128Frac1( a
);
6607 aSig0
= extractFloat128Frac0( a
);
6608 aExp
= extractFloat128Exp( a
);
6609 aSign
= extractFloat128Sign( a
);
6610 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
6611 shiftCount
= 0x402F - aExp
;
6612 if ( shiftCount
<= 0 ) {
6613 if ( 0x403E < aExp
) {
6614 float_raise(float_flag_invalid
, status
);
6616 || ( ( aExp
== 0x7FFF )
6617 && ( aSig1
|| ( aSig0
!= LIT64( 0x0001000000000000 ) ) )
6620 return LIT64( 0x7FFFFFFFFFFFFFFF );
6622 return (int64_t) LIT64( 0x8000000000000000 );
6624 shortShift128Left( aSig0
, aSig1
, - shiftCount
, &aSig0
, &aSig1
);
6627 shift64ExtraRightJamming( aSig0
, aSig1
, shiftCount
, &aSig0
, &aSig1
);
6629 return roundAndPackInt64(aSign
, aSig0
, aSig1
, status
);
6633 /*----------------------------------------------------------------------------
6634 | Returns the result of converting the quadruple-precision floating-point
6635 | value `a' to the 64-bit two's complement integer format. The conversion
6636 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6637 | Arithmetic, except that the conversion is always rounded toward zero.
6638 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
6639 | the conversion overflows, the largest integer with the same sign as `a' is
6641 *----------------------------------------------------------------------------*/
6643 int64_t float128_to_int64_round_to_zero(float128 a
, float_status
*status
)
6646 int32_t aExp
, shiftCount
;
6647 uint64_t aSig0
, aSig1
;
6650 aSig1
= extractFloat128Frac1( a
);
6651 aSig0
= extractFloat128Frac0( a
);
6652 aExp
= extractFloat128Exp( a
);
6653 aSign
= extractFloat128Sign( a
);
6654 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
6655 shiftCount
= aExp
- 0x402F;
6656 if ( 0 < shiftCount
) {
6657 if ( 0x403E <= aExp
) {
6658 aSig0
&= LIT64( 0x0000FFFFFFFFFFFF );
6659 if ( ( a
.high
== LIT64( 0xC03E000000000000 ) )
6660 && ( aSig1
< LIT64( 0x0002000000000000 ) ) ) {
6662 status
->float_exception_flags
|= float_flag_inexact
;
6666 float_raise(float_flag_invalid
, status
);
6667 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) ) {
6668 return LIT64( 0x7FFFFFFFFFFFFFFF );
6671 return (int64_t) LIT64( 0x8000000000000000 );
6673 z
= ( aSig0
<<shiftCount
) | ( aSig1
>>( ( - shiftCount
) & 63 ) );
6674 if ( (uint64_t) ( aSig1
<<shiftCount
) ) {
6675 status
->float_exception_flags
|= float_flag_inexact
;
6679 if ( aExp
< 0x3FFF ) {
6680 if ( aExp
| aSig0
| aSig1
) {
6681 status
->float_exception_flags
|= float_flag_inexact
;
6685 z
= aSig0
>>( - shiftCount
);
6687 || ( shiftCount
&& (uint64_t) ( aSig0
<<( shiftCount
& 63 ) ) ) ) {
6688 status
->float_exception_flags
|= float_flag_inexact
;
6691 if ( aSign
) z
= - z
;
6696 /*----------------------------------------------------------------------------
6697 | Returns the result of converting the quadruple-precision floating-point value
6698 | `a' to the 64-bit unsigned integer format. The conversion is
6699 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6700 | Arithmetic---which means in particular that the conversion is rounded
6701 | according to the current rounding mode. If `a' is a NaN, the largest
6702 | positive integer is returned. If the conversion overflows, the
6703 | largest unsigned integer is returned. If 'a' is negative, the value is
6704 | rounded and zero is returned; negative values that do not round to zero
6705 | will raise the inexact exception.
6706 *----------------------------------------------------------------------------*/
6708 uint64_t float128_to_uint64(float128 a
, float_status
*status
)
6713 uint64_t aSig0
, aSig1
;
6715 aSig0
= extractFloat128Frac0(a
);
6716 aSig1
= extractFloat128Frac1(a
);
6717 aExp
= extractFloat128Exp(a
);
6718 aSign
= extractFloat128Sign(a
);
6719 if (aSign
&& (aExp
> 0x3FFE)) {
6720 float_raise(float_flag_invalid
, status
);
6721 if (float128_is_any_nan(a
)) {
6722 return LIT64(0xFFFFFFFFFFFFFFFF);
6728 aSig0
|= LIT64(0x0001000000000000);
6730 shiftCount
= 0x402F - aExp
;
6731 if (shiftCount
<= 0) {
6732 if (0x403E < aExp
) {
6733 float_raise(float_flag_invalid
, status
);
6734 return LIT64(0xFFFFFFFFFFFFFFFF);
6736 shortShift128Left(aSig0
, aSig1
, -shiftCount
, &aSig0
, &aSig1
);
6738 shift64ExtraRightJamming(aSig0
, aSig1
, shiftCount
, &aSig0
, &aSig1
);
6740 return roundAndPackUint64(aSign
, aSig0
, aSig1
, status
);
6743 uint64_t float128_to_uint64_round_to_zero(float128 a
, float_status
*status
)
6746 signed char current_rounding_mode
= status
->float_rounding_mode
;
6748 set_float_rounding_mode(float_round_to_zero
, status
);
6749 v
= float128_to_uint64(a
, status
);
6750 set_float_rounding_mode(current_rounding_mode
, status
);
6755 /*----------------------------------------------------------------------------
6756 | Returns the result of converting the quadruple-precision floating-point
6757 | value `a' to the 32-bit unsigned integer format. The conversion
6758 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6759 | Arithmetic except that the conversion is always rounded toward zero.
6760 | If `a' is a NaN, the largest positive integer is returned. Otherwise,
6761 | if the conversion overflows, the largest unsigned integer is returned.
6762 | If 'a' is negative, the value is rounded and zero is returned; negative
6763 | values that do not round to zero will raise the inexact exception.
6764 *----------------------------------------------------------------------------*/
6766 uint32_t float128_to_uint32_round_to_zero(float128 a
, float_status
*status
)
6770 int old_exc_flags
= get_float_exception_flags(status
);
6772 v
= float128_to_uint64_round_to_zero(a
, status
);
6773 if (v
> 0xffffffff) {
6778 set_float_exception_flags(old_exc_flags
, status
);
6779 float_raise(float_flag_invalid
, status
);
6783 /*----------------------------------------------------------------------------
6784 | Returns the result of converting the quadruple-precision floating-point
6785 | value `a' to the single-precision floating-point format. The conversion
6786 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6788 *----------------------------------------------------------------------------*/
6790 float32
float128_to_float32(float128 a
, float_status
*status
)
6794 uint64_t aSig0
, aSig1
;
6797 aSig1
= extractFloat128Frac1( a
);
6798 aSig0
= extractFloat128Frac0( a
);
6799 aExp
= extractFloat128Exp( a
);
6800 aSign
= extractFloat128Sign( a
);
6801 if ( aExp
== 0x7FFF ) {
6802 if ( aSig0
| aSig1
) {
6803 return commonNaNToFloat32(float128ToCommonNaN(a
, status
), status
);
6805 return packFloat32( aSign
, 0xFF, 0 );
6807 aSig0
|= ( aSig1
!= 0 );
6808 shift64RightJamming( aSig0
, 18, &aSig0
);
6810 if ( aExp
|| zSig
) {
6814 return roundAndPackFloat32(aSign
, aExp
, zSig
, status
);
6818 /*----------------------------------------------------------------------------
6819 | Returns the result of converting the quadruple-precision floating-point
6820 | value `a' to the double-precision floating-point format. The conversion
6821 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6823 *----------------------------------------------------------------------------*/
6825 float64
float128_to_float64(float128 a
, float_status
*status
)
6829 uint64_t aSig0
, aSig1
;
6831 aSig1
= extractFloat128Frac1( a
);
6832 aSig0
= extractFloat128Frac0( a
);
6833 aExp
= extractFloat128Exp( a
);
6834 aSign
= extractFloat128Sign( a
);
6835 if ( aExp
== 0x7FFF ) {
6836 if ( aSig0
| aSig1
) {
6837 return commonNaNToFloat64(float128ToCommonNaN(a
, status
), status
);
6839 return packFloat64( aSign
, 0x7FF, 0 );
6841 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
6842 aSig0
|= ( aSig1
!= 0 );
6843 if ( aExp
|| aSig0
) {
6844 aSig0
|= LIT64( 0x4000000000000000 );
6847 return roundAndPackFloat64(aSign
, aExp
, aSig0
, status
);
6851 /*----------------------------------------------------------------------------
6852 | Returns the result of converting the quadruple-precision floating-point
6853 | value `a' to the extended double-precision floating-point format. The
6854 | conversion is performed according to the IEC/IEEE Standard for Binary
6855 | Floating-Point Arithmetic.
6856 *----------------------------------------------------------------------------*/
6858 floatx80
float128_to_floatx80(float128 a
, float_status
*status
)
6862 uint64_t aSig0
, aSig1
;
6864 aSig1
= extractFloat128Frac1( a
);
6865 aSig0
= extractFloat128Frac0( a
);
6866 aExp
= extractFloat128Exp( a
);
6867 aSign
= extractFloat128Sign( a
);
6868 if ( aExp
== 0x7FFF ) {
6869 if ( aSig0
| aSig1
) {
6870 return commonNaNToFloatx80(float128ToCommonNaN(a
, status
), status
);
6872 return packFloatx80(aSign
, floatx80_infinity_high
,
6873 floatx80_infinity_low
);
6876 if ( ( aSig0
| aSig1
) == 0 ) return packFloatx80( aSign
, 0, 0 );
6877 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6880 aSig0
|= LIT64( 0x0001000000000000 );
6882 shortShift128Left( aSig0
, aSig1
, 15, &aSig0
, &aSig1
);
6883 return roundAndPackFloatx80(80, aSign
, aExp
, aSig0
, aSig1
, status
);
6887 /*----------------------------------------------------------------------------
6888 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6889 | returns the result as a quadruple-precision floating-point value. The
6890 | operation is performed according to the IEC/IEEE Standard for Binary
6891 | Floating-Point Arithmetic.
6892 *----------------------------------------------------------------------------*/
6894 float128
float128_round_to_int(float128 a
, float_status
*status
)
6898 uint64_t lastBitMask
, roundBitsMask
;
6901 aExp
= extractFloat128Exp( a
);
6902 if ( 0x402F <= aExp
) {
6903 if ( 0x406F <= aExp
) {
6904 if ( ( aExp
== 0x7FFF )
6905 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) )
6907 return propagateFloat128NaN(a
, a
, status
);
6912 lastBitMask
= ( lastBitMask
<<( 0x406E - aExp
) )<<1;
6913 roundBitsMask
= lastBitMask
- 1;
6915 switch (status
->float_rounding_mode
) {
6916 case float_round_nearest_even
:
6917 if ( lastBitMask
) {
6918 add128( z
.high
, z
.low
, 0, lastBitMask
>>1, &z
.high
, &z
.low
);
6919 if ( ( z
.low
& roundBitsMask
) == 0 ) z
.low
&= ~ lastBitMask
;
6922 if ( (int64_t) z
.low
< 0 ) {
6924 if ( (uint64_t) ( z
.low
<<1 ) == 0 ) z
.high
&= ~1;
6928 case float_round_ties_away
:
6930 add128(z
.high
, z
.low
, 0, lastBitMask
>> 1, &z
.high
, &z
.low
);
6932 if ((int64_t) z
.low
< 0) {
6937 case float_round_to_zero
:
6939 case float_round_up
:
6940 if (!extractFloat128Sign(z
)) {
6941 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
6944 case float_round_down
:
6945 if (extractFloat128Sign(z
)) {
6946 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
6952 z
.low
&= ~ roundBitsMask
;
6955 if ( aExp
< 0x3FFF ) {
6956 if ( ( ( (uint64_t) ( a
.high
<<1 ) ) | a
.low
) == 0 ) return a
;
6957 status
->float_exception_flags
|= float_flag_inexact
;
6958 aSign
= extractFloat128Sign( a
);
6959 switch (status
->float_rounding_mode
) {
6960 case float_round_nearest_even
:
6961 if ( ( aExp
== 0x3FFE )
6962 && ( extractFloat128Frac0( a
)
6963 | extractFloat128Frac1( a
) )
6965 return packFloat128( aSign
, 0x3FFF, 0, 0 );
6968 case float_round_ties_away
:
6969 if (aExp
== 0x3FFE) {
6970 return packFloat128(aSign
, 0x3FFF, 0, 0);
6973 case float_round_down
:
6975 aSign
? packFloat128( 1, 0x3FFF, 0, 0 )
6976 : packFloat128( 0, 0, 0, 0 );
6977 case float_round_up
:
6979 aSign
? packFloat128( 1, 0, 0, 0 )
6980 : packFloat128( 0, 0x3FFF, 0, 0 );
6982 return packFloat128( aSign
, 0, 0, 0 );
6985 lastBitMask
<<= 0x402F - aExp
;
6986 roundBitsMask
= lastBitMask
- 1;
6989 switch (status
->float_rounding_mode
) {
6990 case float_round_nearest_even
:
6991 z
.high
+= lastBitMask
>>1;
6992 if ( ( ( z
.high
& roundBitsMask
) | a
.low
) == 0 ) {
6993 z
.high
&= ~ lastBitMask
;
6996 case float_round_ties_away
:
6997 z
.high
+= lastBitMask
>>1;
6999 case float_round_to_zero
:
7001 case float_round_up
:
7002 if (!extractFloat128Sign(z
)) {
7003 z
.high
|= ( a
.low
!= 0 );
7004 z
.high
+= roundBitsMask
;
7007 case float_round_down
:
7008 if (extractFloat128Sign(z
)) {
7009 z
.high
|= (a
.low
!= 0);
7010 z
.high
+= roundBitsMask
;
7016 z
.high
&= ~ roundBitsMask
;
7018 if ( ( z
.low
!= a
.low
) || ( z
.high
!= a
.high
) ) {
7019 status
->float_exception_flags
|= float_flag_inexact
;
7025 /*----------------------------------------------------------------------------
7026 | Returns the result of adding the absolute values of the quadruple-precision
7027 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
7028 | before being returned. `zSign' is ignored if the result is a NaN.
7029 | The addition is performed according to the IEC/IEEE Standard for Binary
7030 | Floating-Point Arithmetic.
7031 *----------------------------------------------------------------------------*/
7033 static float128
addFloat128Sigs(float128 a
, float128 b
, flag zSign
,
7034 float_status
*status
)
7036 int32_t aExp
, bExp
, zExp
;
7037 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
7040 aSig1
= extractFloat128Frac1( a
);
7041 aSig0
= extractFloat128Frac0( a
);
7042 aExp
= extractFloat128Exp( a
);
7043 bSig1
= extractFloat128Frac1( b
);
7044 bSig0
= extractFloat128Frac0( b
);
7045 bExp
= extractFloat128Exp( b
);
7046 expDiff
= aExp
- bExp
;
7047 if ( 0 < expDiff
) {
7048 if ( aExp
== 0x7FFF ) {
7049 if (aSig0
| aSig1
) {
7050 return propagateFloat128NaN(a
, b
, status
);
7058 bSig0
|= LIT64( 0x0001000000000000 );
7060 shift128ExtraRightJamming(
7061 bSig0
, bSig1
, 0, expDiff
, &bSig0
, &bSig1
, &zSig2
);
7064 else if ( expDiff
< 0 ) {
7065 if ( bExp
== 0x7FFF ) {
7066 if (bSig0
| bSig1
) {
7067 return propagateFloat128NaN(a
, b
, status
);
7069 return packFloat128( zSign
, 0x7FFF, 0, 0 );
7075 aSig0
|= LIT64( 0x0001000000000000 );
7077 shift128ExtraRightJamming(
7078 aSig0
, aSig1
, 0, - expDiff
, &aSig0
, &aSig1
, &zSig2
);
7082 if ( aExp
== 0x7FFF ) {
7083 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
7084 return propagateFloat128NaN(a
, b
, status
);
7088 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
7090 if (status
->flush_to_zero
) {
7091 if (zSig0
| zSig1
) {
7092 float_raise(float_flag_output_denormal
, status
);
7094 return packFloat128(zSign
, 0, 0, 0);
7096 return packFloat128( zSign
, 0, zSig0
, zSig1
);
7099 zSig0
|= LIT64( 0x0002000000000000 );
7103 aSig0
|= LIT64( 0x0001000000000000 );
7104 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
7106 if ( zSig0
< LIT64( 0x0002000000000000 ) ) goto roundAndPack
;
7109 shift128ExtraRightJamming(
7110 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
7112 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
7116 /*----------------------------------------------------------------------------
7117 | Returns the result of subtracting the absolute values of the quadruple-
7118 | precision floating-point values `a' and `b'. If `zSign' is 1, the
7119 | difference is negated before being returned. `zSign' is ignored if the
7120 | result is a NaN. The subtraction is performed according to the IEC/IEEE
7121 | Standard for Binary Floating-Point Arithmetic.
7122 *----------------------------------------------------------------------------*/
7124 static float128
subFloat128Sigs(float128 a
, float128 b
, flag zSign
,
7125 float_status
*status
)
7127 int32_t aExp
, bExp
, zExp
;
7128 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
;
7131 aSig1
= extractFloat128Frac1( a
);
7132 aSig0
= extractFloat128Frac0( a
);
7133 aExp
= extractFloat128Exp( a
);
7134 bSig1
= extractFloat128Frac1( b
);
7135 bSig0
= extractFloat128Frac0( b
);
7136 bExp
= extractFloat128Exp( b
);
7137 expDiff
= aExp
- bExp
;
7138 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
7139 shortShift128Left( bSig0
, bSig1
, 14, &bSig0
, &bSig1
);
7140 if ( 0 < expDiff
) goto aExpBigger
;
7141 if ( expDiff
< 0 ) goto bExpBigger
;
7142 if ( aExp
== 0x7FFF ) {
7143 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
7144 return propagateFloat128NaN(a
, b
, status
);
7146 float_raise(float_flag_invalid
, status
);
7147 return float128_default_nan(status
);
7153 if ( bSig0
< aSig0
) goto aBigger
;
7154 if ( aSig0
< bSig0
) goto bBigger
;
7155 if ( bSig1
< aSig1
) goto aBigger
;
7156 if ( aSig1
< bSig1
) goto bBigger
;
7157 return packFloat128(status
->float_rounding_mode
== float_round_down
,
7160 if ( bExp
== 0x7FFF ) {
7161 if (bSig0
| bSig1
) {
7162 return propagateFloat128NaN(a
, b
, status
);
7164 return packFloat128( zSign
^ 1, 0x7FFF, 0, 0 );
7170 aSig0
|= LIT64( 0x4000000000000000 );
7172 shift128RightJamming( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
7173 bSig0
|= LIT64( 0x4000000000000000 );
7175 sub128( bSig0
, bSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
7178 goto normalizeRoundAndPack
;
7180 if ( aExp
== 0x7FFF ) {
7181 if (aSig0
| aSig1
) {
7182 return propagateFloat128NaN(a
, b
, status
);
7190 bSig0
|= LIT64( 0x4000000000000000 );
7192 shift128RightJamming( bSig0
, bSig1
, expDiff
, &bSig0
, &bSig1
);
7193 aSig0
|= LIT64( 0x4000000000000000 );
7195 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
7197 normalizeRoundAndPack
:
7199 return normalizeRoundAndPackFloat128(zSign
, zExp
- 14, zSig0
, zSig1
,
7204 /*----------------------------------------------------------------------------
7205 | Returns the result of adding the quadruple-precision floating-point values
7206 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
7207 | for Binary Floating-Point Arithmetic.
7208 *----------------------------------------------------------------------------*/
7210 float128
float128_add(float128 a
, float128 b
, float_status
*status
)
7214 aSign
= extractFloat128Sign( a
);
7215 bSign
= extractFloat128Sign( b
);
7216 if ( aSign
== bSign
) {
7217 return addFloat128Sigs(a
, b
, aSign
, status
);
7220 return subFloat128Sigs(a
, b
, aSign
, status
);
7225 /*----------------------------------------------------------------------------
7226 | Returns the result of subtracting the quadruple-precision floating-point
7227 | values `a' and `b'. The operation is performed according to the IEC/IEEE
7228 | Standard for Binary Floating-Point Arithmetic.
7229 *----------------------------------------------------------------------------*/
7231 float128
float128_sub(float128 a
, float128 b
, float_status
*status
)
7235 aSign
= extractFloat128Sign( a
);
7236 bSign
= extractFloat128Sign( b
);
7237 if ( aSign
== bSign
) {
7238 return subFloat128Sigs(a
, b
, aSign
, status
);
7241 return addFloat128Sigs(a
, b
, aSign
, status
);
7246 /*----------------------------------------------------------------------------
7247 | Returns the result of multiplying the quadruple-precision floating-point
7248 | values `a' and `b'. The operation is performed according to the IEC/IEEE
7249 | Standard for Binary Floating-Point Arithmetic.
7250 *----------------------------------------------------------------------------*/
7252 float128
float128_mul(float128 a
, float128 b
, float_status
*status
)
7254 flag aSign
, bSign
, zSign
;
7255 int32_t aExp
, bExp
, zExp
;
7256 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
, zSig3
;
7258 aSig1
= extractFloat128Frac1( a
);
7259 aSig0
= extractFloat128Frac0( a
);
7260 aExp
= extractFloat128Exp( a
);
7261 aSign
= extractFloat128Sign( a
);
7262 bSig1
= extractFloat128Frac1( b
);
7263 bSig0
= extractFloat128Frac0( b
);
7264 bExp
= extractFloat128Exp( b
);
7265 bSign
= extractFloat128Sign( b
);
7266 zSign
= aSign
^ bSign
;
7267 if ( aExp
== 0x7FFF ) {
7268 if ( ( aSig0
| aSig1
)
7269 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
7270 return propagateFloat128NaN(a
, b
, status
);
7272 if ( ( bExp
| bSig0
| bSig1
) == 0 ) goto invalid
;
7273 return packFloat128( zSign
, 0x7FFF, 0, 0 );
7275 if ( bExp
== 0x7FFF ) {
7276 if (bSig0
| bSig1
) {
7277 return propagateFloat128NaN(a
, b
, status
);
7279 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
7281 float_raise(float_flag_invalid
, status
);
7282 return float128_default_nan(status
);
7284 return packFloat128( zSign
, 0x7FFF, 0, 0 );
7287 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
7288 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
7291 if ( ( bSig0
| bSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
7292 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
7294 zExp
= aExp
+ bExp
- 0x4000;
7295 aSig0
|= LIT64( 0x0001000000000000 );
7296 shortShift128Left( bSig0
, bSig1
, 16, &bSig0
, &bSig1
);
7297 mul128To256( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
, &zSig2
, &zSig3
);
7298 add128( zSig0
, zSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
7299 zSig2
|= ( zSig3
!= 0 );
7300 if ( LIT64( 0x0002000000000000 ) <= zSig0
) {
7301 shift128ExtraRightJamming(
7302 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
7305 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
7309 /*----------------------------------------------------------------------------
7310 | Returns the result of dividing the quadruple-precision floating-point value
7311 | `a' by the corresponding value `b'. The operation is performed according to
7312 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7313 *----------------------------------------------------------------------------*/
7315 float128
float128_div(float128 a
, float128 b
, float_status
*status
)
7317 flag aSign
, bSign
, zSign
;
7318 int32_t aExp
, bExp
, zExp
;
7319 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
7320 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
7322 aSig1
= extractFloat128Frac1( a
);
7323 aSig0
= extractFloat128Frac0( a
);
7324 aExp
= extractFloat128Exp( a
);
7325 aSign
= extractFloat128Sign( a
);
7326 bSig1
= extractFloat128Frac1( b
);
7327 bSig0
= extractFloat128Frac0( b
);
7328 bExp
= extractFloat128Exp( b
);
7329 bSign
= extractFloat128Sign( b
);
7330 zSign
= aSign
^ bSign
;
7331 if ( aExp
== 0x7FFF ) {
7332 if (aSig0
| aSig1
) {
7333 return propagateFloat128NaN(a
, b
, status
);
7335 if ( bExp
== 0x7FFF ) {
7336 if (bSig0
| bSig1
) {
7337 return propagateFloat128NaN(a
, b
, status
);
7341 return packFloat128( zSign
, 0x7FFF, 0, 0 );
7343 if ( bExp
== 0x7FFF ) {
7344 if (bSig0
| bSig1
) {
7345 return propagateFloat128NaN(a
, b
, status
);
7347 return packFloat128( zSign
, 0, 0, 0 );
7350 if ( ( bSig0
| bSig1
) == 0 ) {
7351 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
7353 float_raise(float_flag_invalid
, status
);
7354 return float128_default_nan(status
);
7356 float_raise(float_flag_divbyzero
, status
);
7357 return packFloat128( zSign
, 0x7FFF, 0, 0 );
7359 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
7362 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
7363 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
7365 zExp
= aExp
- bExp
+ 0x3FFD;
7367 aSig0
| LIT64( 0x0001000000000000 ), aSig1
, 15, &aSig0
, &aSig1
);
7369 bSig0
| LIT64( 0x0001000000000000 ), bSig1
, 15, &bSig0
, &bSig1
);
7370 if ( le128( bSig0
, bSig1
, aSig0
, aSig1
) ) {
7371 shift128Right( aSig0
, aSig1
, 1, &aSig0
, &aSig1
);
7374 zSig0
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
7375 mul128By64To192( bSig0
, bSig1
, zSig0
, &term0
, &term1
, &term2
);
7376 sub192( aSig0
, aSig1
, 0, term0
, term1
, term2
, &rem0
, &rem1
, &rem2
);
7377 while ( (int64_t) rem0
< 0 ) {
7379 add192( rem0
, rem1
, rem2
, 0, bSig0
, bSig1
, &rem0
, &rem1
, &rem2
);
7381 zSig1
= estimateDiv128To64( rem1
, rem2
, bSig0
);
7382 if ( ( zSig1
& 0x3FFF ) <= 4 ) {
7383 mul128By64To192( bSig0
, bSig1
, zSig1
, &term1
, &term2
, &term3
);
7384 sub192( rem1
, rem2
, 0, term1
, term2
, term3
, &rem1
, &rem2
, &rem3
);
7385 while ( (int64_t) rem1
< 0 ) {
7387 add192( rem1
, rem2
, rem3
, 0, bSig0
, bSig1
, &rem1
, &rem2
, &rem3
);
7389 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
7391 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 15, &zSig0
, &zSig1
, &zSig2
);
7392 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
7396 /*----------------------------------------------------------------------------
7397 | Returns the remainder of the quadruple-precision floating-point value `a'
7398 | with respect to the corresponding value `b'. The operation is performed
7399 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7400 *----------------------------------------------------------------------------*/
7402 float128
float128_rem(float128 a
, float128 b
, float_status
*status
)
7405 int32_t aExp
, bExp
, expDiff
;
7406 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, q
, term0
, term1
, term2
;
7407 uint64_t allZero
, alternateASig0
, alternateASig1
, sigMean1
;
7410 aSig1
= extractFloat128Frac1( a
);
7411 aSig0
= extractFloat128Frac0( a
);
7412 aExp
= extractFloat128Exp( a
);
7413 aSign
= extractFloat128Sign( a
);
7414 bSig1
= extractFloat128Frac1( b
);
7415 bSig0
= extractFloat128Frac0( b
);
7416 bExp
= extractFloat128Exp( b
);
7417 if ( aExp
== 0x7FFF ) {
7418 if ( ( aSig0
| aSig1
)
7419 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
7420 return propagateFloat128NaN(a
, b
, status
);
7424 if ( bExp
== 0x7FFF ) {
7425 if (bSig0
| bSig1
) {
7426 return propagateFloat128NaN(a
, b
, status
);
7431 if ( ( bSig0
| bSig1
) == 0 ) {
7433 float_raise(float_flag_invalid
, status
);
7434 return float128_default_nan(status
);
7436 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
7439 if ( ( aSig0
| aSig1
) == 0 ) return a
;
7440 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
7442 expDiff
= aExp
- bExp
;
7443 if ( expDiff
< -1 ) return a
;
7445 aSig0
| LIT64( 0x0001000000000000 ),
7447 15 - ( expDiff
< 0 ),
7452 bSig0
| LIT64( 0x0001000000000000 ), bSig1
, 15, &bSig0
, &bSig1
);
7453 q
= le128( bSig0
, bSig1
, aSig0
, aSig1
);
7454 if ( q
) sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
7456 while ( 0 < expDiff
) {
7457 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
7458 q
= ( 4 < q
) ? q
- 4 : 0;
7459 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
7460 shortShift192Left( term0
, term1
, term2
, 61, &term1
, &term2
, &allZero
);
7461 shortShift128Left( aSig0
, aSig1
, 61, &aSig0
, &allZero
);
7462 sub128( aSig0
, 0, term1
, term2
, &aSig0
, &aSig1
);
7465 if ( -64 < expDiff
) {
7466 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
7467 q
= ( 4 < q
) ? q
- 4 : 0;
7469 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
7471 if ( expDiff
< 0 ) {
7472 shift128Right( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
7475 shortShift128Left( aSig0
, aSig1
, expDiff
, &aSig0
, &aSig1
);
7477 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
7478 sub128( aSig0
, aSig1
, term1
, term2
, &aSig0
, &aSig1
);
7481 shift128Right( aSig0
, aSig1
, 12, &aSig0
, &aSig1
);
7482 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
7485 alternateASig0
= aSig0
;
7486 alternateASig1
= aSig1
;
7488 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
7489 } while ( 0 <= (int64_t) aSig0
);
7491 aSig0
, aSig1
, alternateASig0
, alternateASig1
, (uint64_t *)&sigMean0
, &sigMean1
);
7492 if ( ( sigMean0
< 0 )
7493 || ( ( ( sigMean0
| sigMean1
) == 0 ) && ( q
& 1 ) ) ) {
7494 aSig0
= alternateASig0
;
7495 aSig1
= alternateASig1
;
7497 zSign
= ( (int64_t) aSig0
< 0 );
7498 if ( zSign
) sub128( 0, 0, aSig0
, aSig1
, &aSig0
, &aSig1
);
7499 return normalizeRoundAndPackFloat128(aSign
^ zSign
, bExp
- 4, aSig0
, aSig1
,
7503 /*----------------------------------------------------------------------------
7504 | Returns the square root of the quadruple-precision floating-point value `a'.
7505 | The operation is performed according to the IEC/IEEE Standard for Binary
7506 | Floating-Point Arithmetic.
7507 *----------------------------------------------------------------------------*/
7509 float128
float128_sqrt(float128 a
, float_status
*status
)
7513 uint64_t aSig0
, aSig1
, zSig0
, zSig1
, zSig2
, doubleZSig0
;
7514 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
7516 aSig1
= extractFloat128Frac1( a
);
7517 aSig0
= extractFloat128Frac0( a
);
7518 aExp
= extractFloat128Exp( a
);
7519 aSign
= extractFloat128Sign( a
);
7520 if ( aExp
== 0x7FFF ) {
7521 if (aSig0
| aSig1
) {
7522 return propagateFloat128NaN(a
, a
, status
);
7524 if ( ! aSign
) return a
;
7528 if ( ( aExp
| aSig0
| aSig1
) == 0 ) return a
;
7530 float_raise(float_flag_invalid
, status
);
7531 return float128_default_nan(status
);
7534 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( 0, 0, 0, 0 );
7535 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
7537 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFE;
7538 aSig0
|= LIT64( 0x0001000000000000 );
7539 zSig0
= estimateSqrt32( aExp
, aSig0
>>17 );
7540 shortShift128Left( aSig0
, aSig1
, 13 - ( aExp
& 1 ), &aSig0
, &aSig1
);
7541 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
7542 doubleZSig0
= zSig0
<<1;
7543 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
7544 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
7545 while ( (int64_t) rem0
< 0 ) {
7548 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
7550 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
7551 if ( ( zSig1
& 0x1FFF ) <= 5 ) {
7552 if ( zSig1
== 0 ) zSig1
= 1;
7553 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
7554 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
7555 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
7556 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
7557 while ( (int64_t) rem1
< 0 ) {
7559 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
7561 term2
|= doubleZSig0
;
7562 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
7564 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
7566 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 14, &zSig0
, &zSig1
, &zSig2
);
7567 return roundAndPackFloat128(0, zExp
, zSig0
, zSig1
, zSig2
, status
);
7571 /*----------------------------------------------------------------------------
7572 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
7573 | the corresponding value `b', and 0 otherwise. The invalid exception is
7574 | raised if either operand is a NaN. Otherwise, the comparison is performed
7575 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7576 *----------------------------------------------------------------------------*/
7578 int float128_eq(float128 a
, float128 b
, float_status
*status
)
7581 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
7582 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
7583 || ( ( extractFloat128Exp( b
) == 0x7FFF )
7584 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
7586 float_raise(float_flag_invalid
, status
);
7591 && ( ( a
.high
== b
.high
)
7593 && ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
7598 /*----------------------------------------------------------------------------
7599 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7600 | or equal to the corresponding value `b', and 0 otherwise. The invalid
7601 | exception is raised if either operand is a NaN. The comparison is performed
7602 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7603 *----------------------------------------------------------------------------*/
7605 int float128_le(float128 a
, float128 b
, float_status
*status
)
7609 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
7610 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
7611 || ( ( extractFloat128Exp( b
) == 0x7FFF )
7612 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
7614 float_raise(float_flag_invalid
, status
);
7617 aSign
= extractFloat128Sign( a
);
7618 bSign
= extractFloat128Sign( b
);
7619 if ( aSign
!= bSign
) {
7622 || ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
7626 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
7627 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
7631 /*----------------------------------------------------------------------------
7632 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7633 | the corresponding value `b', and 0 otherwise. The invalid exception is
7634 | raised if either operand is a NaN. The comparison is performed according
7635 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7636 *----------------------------------------------------------------------------*/
7638 int float128_lt(float128 a
, float128 b
, float_status
*status
)
7642 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
7643 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
7644 || ( ( extractFloat128Exp( b
) == 0x7FFF )
7645 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
7647 float_raise(float_flag_invalid
, status
);
7650 aSign
= extractFloat128Sign( a
);
7651 bSign
= extractFloat128Sign( b
);
7652 if ( aSign
!= bSign
) {
7655 && ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
7659 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
7660 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
7664 /*----------------------------------------------------------------------------
7665 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7666 | be compared, and 0 otherwise. The invalid exception is raised if either
7667 | operand is a NaN. The comparison is performed according to the IEC/IEEE
7668 | Standard for Binary Floating-Point Arithmetic.
7669 *----------------------------------------------------------------------------*/
7671 int float128_unordered(float128 a
, float128 b
, float_status
*status
)
7673 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
7674 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
7675 || ( ( extractFloat128Exp( b
) == 0x7FFF )
7676 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
7678 float_raise(float_flag_invalid
, status
);
7684 /*----------------------------------------------------------------------------
7685 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
7686 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
7687 | exception. The comparison is performed according to the IEC/IEEE Standard
7688 | for Binary Floating-Point Arithmetic.
7689 *----------------------------------------------------------------------------*/
7691 int float128_eq_quiet(float128 a
, float128 b
, float_status
*status
)
7694 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
7695 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
7696 || ( ( extractFloat128Exp( b
) == 0x7FFF )
7697 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
7699 if (float128_is_signaling_nan(a
, status
)
7700 || float128_is_signaling_nan(b
, status
)) {
7701 float_raise(float_flag_invalid
, status
);
7707 && ( ( a
.high
== b
.high
)
7709 && ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
7714 /*----------------------------------------------------------------------------
7715 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7716 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
7717 | cause an exception. Otherwise, the comparison is performed according to the
7718 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7719 *----------------------------------------------------------------------------*/
7721 int float128_le_quiet(float128 a
, float128 b
, float_status
*status
)
7725 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
7726 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
7727 || ( ( extractFloat128Exp( b
) == 0x7FFF )
7728 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
7730 if (float128_is_signaling_nan(a
, status
)
7731 || float128_is_signaling_nan(b
, status
)) {
7732 float_raise(float_flag_invalid
, status
);
7736 aSign
= extractFloat128Sign( a
);
7737 bSign
= extractFloat128Sign( b
);
7738 if ( aSign
!= bSign
) {
7741 || ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
7745 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
7746 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
7750 /*----------------------------------------------------------------------------
7751 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7752 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
7753 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
7754 | Standard for Binary Floating-Point Arithmetic.
7755 *----------------------------------------------------------------------------*/
7757 int float128_lt_quiet(float128 a
, float128 b
, float_status
*status
)
7761 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
7762 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
7763 || ( ( extractFloat128Exp( b
) == 0x7FFF )
7764 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
7766 if (float128_is_signaling_nan(a
, status
)
7767 || float128_is_signaling_nan(b
, status
)) {
7768 float_raise(float_flag_invalid
, status
);
7772 aSign
= extractFloat128Sign( a
);
7773 bSign
= extractFloat128Sign( b
);
7774 if ( aSign
!= bSign
) {
7777 && ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
7781 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
7782 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
7786 /*----------------------------------------------------------------------------
7787 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7788 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
7789 | comparison is performed according to the IEC/IEEE Standard for Binary
7790 | Floating-Point Arithmetic.
7791 *----------------------------------------------------------------------------*/
7793 int float128_unordered_quiet(float128 a
, float128 b
, float_status
*status
)
7795 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
7796 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
7797 || ( ( extractFloat128Exp( b
) == 0x7FFF )
7798 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
7800 if (float128_is_signaling_nan(a
, status
)
7801 || float128_is_signaling_nan(b
, status
)) {
7802 float_raise(float_flag_invalid
, status
);
7809 static inline int floatx80_compare_internal(floatx80 a
, floatx80 b
,
7810 int is_quiet
, float_status
*status
)
7814 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
7815 float_raise(float_flag_invalid
, status
);
7816 return float_relation_unordered
;
7818 if (( ( extractFloatx80Exp( a
) == 0x7fff ) &&
7819 ( extractFloatx80Frac( a
)<<1 ) ) ||
7820 ( ( extractFloatx80Exp( b
) == 0x7fff ) &&
7821 ( extractFloatx80Frac( b
)<<1 ) )) {
7823 floatx80_is_signaling_nan(a
, status
) ||
7824 floatx80_is_signaling_nan(b
, status
)) {
7825 float_raise(float_flag_invalid
, status
);
7827 return float_relation_unordered
;
7829 aSign
= extractFloatx80Sign( a
);
7830 bSign
= extractFloatx80Sign( b
);
7831 if ( aSign
!= bSign
) {
7833 if ( ( ( (uint16_t) ( ( a
.high
| b
.high
) << 1 ) ) == 0) &&
7834 ( ( a
.low
| b
.low
) == 0 ) ) {
7836 return float_relation_equal
;
7838 return 1 - (2 * aSign
);
7841 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
7842 return float_relation_equal
;
7844 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
7849 int floatx80_compare(floatx80 a
, floatx80 b
, float_status
*status
)
7851 return floatx80_compare_internal(a
, b
, 0, status
);
7854 int floatx80_compare_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
7856 return floatx80_compare_internal(a
, b
, 1, status
);
7859 static inline int float128_compare_internal(float128 a
, float128 b
,
7860 int is_quiet
, float_status
*status
)
7864 if (( ( extractFloat128Exp( a
) == 0x7fff ) &&
7865 ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) ) ||
7866 ( ( extractFloat128Exp( b
) == 0x7fff ) &&
7867 ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )) {
7869 float128_is_signaling_nan(a
, status
) ||
7870 float128_is_signaling_nan(b
, status
)) {
7871 float_raise(float_flag_invalid
, status
);
7873 return float_relation_unordered
;
7875 aSign
= extractFloat128Sign( a
);
7876 bSign
= extractFloat128Sign( b
);
7877 if ( aSign
!= bSign
) {
7878 if ( ( ( ( a
.high
| b
.high
)<<1 ) | a
.low
| b
.low
) == 0 ) {
7880 return float_relation_equal
;
7882 return 1 - (2 * aSign
);
7885 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
7886 return float_relation_equal
;
7888 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
7893 int float128_compare(float128 a
, float128 b
, float_status
*status
)
7895 return float128_compare_internal(a
, b
, 0, status
);
7898 int float128_compare_quiet(float128 a
, float128 b
, float_status
*status
)
7900 return float128_compare_internal(a
, b
, 1, status
);
7903 floatx80
floatx80_scalbn(floatx80 a
, int n
, float_status
*status
)
7909 if (floatx80_invalid_encoding(a
)) {
7910 float_raise(float_flag_invalid
, status
);
7911 return floatx80_default_nan(status
);
7913 aSig
= extractFloatx80Frac( a
);
7914 aExp
= extractFloatx80Exp( a
);
7915 aSign
= extractFloatx80Sign( a
);
7917 if ( aExp
== 0x7FFF ) {
7919 return propagateFloatx80NaN(a
, a
, status
);
7933 } else if (n
< -0x10000) {
7938 return normalizeRoundAndPackFloatx80(status
->floatx80_rounding_precision
,
7939 aSign
, aExp
, aSig
, 0, status
);
7942 float128
float128_scalbn(float128 a
, int n
, float_status
*status
)
7946 uint64_t aSig0
, aSig1
;
7948 aSig1
= extractFloat128Frac1( a
);
7949 aSig0
= extractFloat128Frac0( a
);
7950 aExp
= extractFloat128Exp( a
);
7951 aSign
= extractFloat128Sign( a
);
7952 if ( aExp
== 0x7FFF ) {
7953 if ( aSig0
| aSig1
) {
7954 return propagateFloat128NaN(a
, a
, status
);
7959 aSig0
|= LIT64( 0x0001000000000000 );
7960 } else if (aSig0
== 0 && aSig1
== 0) {
7968 } else if (n
< -0x10000) {
7973 return normalizeRoundAndPackFloat128( aSign
, aExp
, aSig0
, aSig1