s390x: upgrade status of KVM cores to "supported"
[qemu/ar7.git] / fpu / softfloat.c
blob9132d7a0b0f8c361b6be514a79e46a31e7b59841
1 /*
2 * QEMU float support
4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
10 * the BSD license
11 * GPL-v2-or-later
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
44 ===============================================================================
47 /* BSD licensing:
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
85 #include "qemu/osdep.h"
86 #include <math.h>
87 #include "qemu/bitops.h"
88 #include "fpu/softfloat.h"
90 /* We only need stdlib for abort() */
92 /*----------------------------------------------------------------------------
93 | Primitive arithmetic functions, including multi-word arithmetic, and
94 | division and square root approximations. (Can be specialized to target if
95 | desired.)
96 *----------------------------------------------------------------------------*/
97 #include "fpu/softfloat-macros.h"
/*
 * Hardfloat
 *
 * Fast emulation of guest FP instructions is challenging for two reasons.
 * First, FP instruction semantics are similar but not identical, particularly
 * when handling NaNs. Second, emulating at reasonable speed the guest FP
 * exception flags is not trivial: reading the host's flags register with a
 * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp],
 * and trapping on every FP exception is neither fast nor pleasant to work
 * with.
 *
 * We address these challenges by leveraging the host FPU for a subset of the
 * operations. To do this we expand on the idea presented in this paper:
 *
 * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a
 * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615.
 *
 * The idea is thus to leverage the host FPU to (1) compute FP operations
 * and (2) identify whether FP exceptions occurred while avoiding
 * expensive exception flag register accesses.
 *
 * An important optimization shown in the paper is that given that exception
 * flags are rarely cleared by the guest, we can avoid recomputing some flags.
 * This is particularly useful for the inexact flag, which is very frequently
 * raised in floating-point workloads.
 *
 * We optimize the code further by deferring to soft-fp whenever FP exception
 * detection might get hairy. Two examples: (1) when at least one operand is
 * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result
 * and the result is < the minimum normal.
 */
129 #define GEN_INPUT_FLUSH__NOCHECK(name, soft_t) \
130 static inline void name(soft_t *a, float_status *s) \
132 if (unlikely(soft_t ## _is_denormal(*a))) { \
133 *a = soft_t ## _set_sign(soft_t ## _zero, \
134 soft_t ## _is_neg(*a)); \
135 s->float_exception_flags |= float_flag_input_denormal; \
139 GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32)
140 GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64)
141 #undef GEN_INPUT_FLUSH__NOCHECK
143 #define GEN_INPUT_FLUSH1(name, soft_t) \
144 static inline void name(soft_t *a, float_status *s) \
146 if (likely(!s->flush_inputs_to_zero)) { \
147 return; \
149 soft_t ## _input_flush__nocheck(a, s); \
152 GEN_INPUT_FLUSH1(float32_input_flush1, float32)
153 GEN_INPUT_FLUSH1(float64_input_flush1, float64)
154 #undef GEN_INPUT_FLUSH1
156 #define GEN_INPUT_FLUSH2(name, soft_t) \
157 static inline void name(soft_t *a, soft_t *b, float_status *s) \
159 if (likely(!s->flush_inputs_to_zero)) { \
160 return; \
162 soft_t ## _input_flush__nocheck(a, s); \
163 soft_t ## _input_flush__nocheck(b, s); \
166 GEN_INPUT_FLUSH2(float32_input_flush2, float32)
167 GEN_INPUT_FLUSH2(float64_input_flush2, float64)
168 #undef GEN_INPUT_FLUSH2
170 #define GEN_INPUT_FLUSH3(name, soft_t) \
171 static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
173 if (likely(!s->flush_inputs_to_zero)) { \
174 return; \
176 soft_t ## _input_flush__nocheck(a, s); \
177 soft_t ## _input_flush__nocheck(b, s); \
178 soft_t ## _input_flush__nocheck(c, s); \
181 GEN_INPUT_FLUSH3(float32_input_flush3, float32)
182 GEN_INPUT_FLUSH3(float64_input_flush3, float64)
183 #undef GEN_INPUT_FLUSH3
/*
 * Choose whether to use fpclassify or float32/64_* primitives in the generated
 * hardfloat functions. Each combination of number of inputs and float size
 * gets its own value.
 */
#if defined(__x86_64__)
# define QEMU_HARDFLOAT_1F32_USE_FP 0
# define QEMU_HARDFLOAT_1F64_USE_FP 1
# define QEMU_HARDFLOAT_2F32_USE_FP 0
# define QEMU_HARDFLOAT_2F64_USE_FP 1
# define QEMU_HARDFLOAT_3F32_USE_FP 0
# define QEMU_HARDFLOAT_3F64_USE_FP 1
#else
# define QEMU_HARDFLOAT_1F32_USE_FP 0
# define QEMU_HARDFLOAT_1F64_USE_FP 0
# define QEMU_HARDFLOAT_2F32_USE_FP 0
# define QEMU_HARDFLOAT_2F64_USE_FP 0
# define QEMU_HARDFLOAT_3F32_USE_FP 0
# define QEMU_HARDFLOAT_3F64_USE_FP 0
#endif
/*
 * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over
 * float{32,64}_is_infinity when !USE_FP.
 * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup.
 * On power64 however, using isinf() reduces fp-bench performance by up to 50%.
 */
#if defined(__x86_64__) || defined(__aarch64__)
# define QEMU_HARDFLOAT_USE_ISINF   1
#else
# define QEMU_HARDFLOAT_USE_ISINF   0
#endif
/*
 * Some targets clear the FP flags before most FP operations. This prevents
 * the use of hardfloat, since hardfloat relies on the inexact flag being
 * already set.
 *
 * QEMU_NO_HARDFLOAT is 1 when hardfloat is disabled (TARGET_PPC, or a build
 * with -ffast-math) and 0 otherwise.  QEMU_SOFTFLOAT_ATTR is the attribute
 * set applied to the soft-float fallback functions; it additionally carries
 * noinline when hardfloat is enabled.
 */
#if defined(TARGET_PPC) || defined(__FAST_MATH__)
# if defined(__FAST_MATH__)
#  warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
    IEEE implementation
# endif
# define QEMU_NO_HARDFLOAT 1
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
#else
# define QEMU_NO_HARDFLOAT 0
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
#endif
235 static inline bool can_use_fpu(const float_status *s)
237 if (QEMU_NO_HARDFLOAT) {
238 return false;
240 return likely(s->float_exception_flags & float_flag_inexact &&
241 s->float_rounding_mode == float_round_nearest_even);
245 * Hardfloat generation functions. Each operation can have two flavors:
246 * either using softfloat primitives (e.g. float32_is_zero_or_normal) for
247 * most condition checks, or native ones (e.g. fpclassify).
249 * The flavor is chosen by the callers. Instead of using macros, we rely on the
250 * compiler to propagate constants and inline everything into the callers.
252 * We only generate functions for operations with two inputs, since only
253 * these are common enough to justify consolidating them into common code.
256 typedef union {
257 float32 s;
258 float h;
259 } union_float32;
261 typedef union {
262 float64 s;
263 double h;
264 } union_float64;
266 typedef bool (*f32_check_fn)(union_float32 a, union_float32 b);
267 typedef bool (*f64_check_fn)(union_float64 a, union_float64 b);
269 typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s);
270 typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s);
271 typedef float (*hard_f32_op2_fn)(float a, float b);
272 typedef double (*hard_f64_op2_fn)(double a, double b);
274 /* 2-input is-zero-or-normal */
275 static inline bool f32_is_zon2(union_float32 a, union_float32 b)
277 if (QEMU_HARDFLOAT_2F32_USE_FP) {
279 * Not using a temp variable for consecutive fpclassify calls ends up
280 * generating faster code.
282 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
283 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
285 return float32_is_zero_or_normal(a.s) &&
286 float32_is_zero_or_normal(b.s);
289 static inline bool f64_is_zon2(union_float64 a, union_float64 b)
291 if (QEMU_HARDFLOAT_2F64_USE_FP) {
292 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
293 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
295 return float64_is_zero_or_normal(a.s) &&
296 float64_is_zero_or_normal(b.s);
299 /* 3-input is-zero-or-normal */
300 static inline
301 bool f32_is_zon3(union_float32 a, union_float32 b, union_float32 c)
303 if (QEMU_HARDFLOAT_3F32_USE_FP) {
304 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
305 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
306 (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
308 return float32_is_zero_or_normal(a.s) &&
309 float32_is_zero_or_normal(b.s) &&
310 float32_is_zero_or_normal(c.s);
313 static inline
314 bool f64_is_zon3(union_float64 a, union_float64 b, union_float64 c)
316 if (QEMU_HARDFLOAT_3F64_USE_FP) {
317 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
318 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
319 (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
321 return float64_is_zero_or_normal(a.s) &&
322 float64_is_zero_or_normal(b.s) &&
323 float64_is_zero_or_normal(c.s);
326 static inline bool f32_is_inf(union_float32 a)
328 if (QEMU_HARDFLOAT_USE_ISINF) {
329 return isinf(a.h);
331 return float32_is_infinity(a.s);
334 static inline bool f64_is_inf(union_float64 a)
336 if (QEMU_HARDFLOAT_USE_ISINF) {
337 return isinf(a.h);
339 return float64_is_infinity(a.s);
342 /* Note: @fast_test and @post can be NULL */
343 static inline float32
344 float32_gen2(float32 xa, float32 xb, float_status *s,
345 hard_f32_op2_fn hard, soft_f32_op2_fn soft,
346 f32_check_fn pre, f32_check_fn post,
347 f32_check_fn fast_test, soft_f32_op2_fn fast_op)
349 union_float32 ua, ub, ur;
351 ua.s = xa;
352 ub.s = xb;
354 if (unlikely(!can_use_fpu(s))) {
355 goto soft;
358 float32_input_flush2(&ua.s, &ub.s, s);
359 if (unlikely(!pre(ua, ub))) {
360 goto soft;
362 if (fast_test && fast_test(ua, ub)) {
363 return fast_op(ua.s, ub.s, s);
366 ur.h = hard(ua.h, ub.h);
367 if (unlikely(f32_is_inf(ur))) {
368 s->float_exception_flags |= float_flag_overflow;
369 } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
370 if (post == NULL || post(ua, ub)) {
371 goto soft;
374 return ur.s;
376 soft:
377 return soft(ua.s, ub.s, s);
380 static inline float64
381 float64_gen2(float64 xa, float64 xb, float_status *s,
382 hard_f64_op2_fn hard, soft_f64_op2_fn soft,
383 f64_check_fn pre, f64_check_fn post,
384 f64_check_fn fast_test, soft_f64_op2_fn fast_op)
386 union_float64 ua, ub, ur;
388 ua.s = xa;
389 ub.s = xb;
391 if (unlikely(!can_use_fpu(s))) {
392 goto soft;
395 float64_input_flush2(&ua.s, &ub.s, s);
396 if (unlikely(!pre(ua, ub))) {
397 goto soft;
399 if (fast_test && fast_test(ua, ub)) {
400 return fast_op(ua.s, ub.s, s);
403 ur.h = hard(ua.h, ub.h);
404 if (unlikely(f64_is_inf(ur))) {
405 s->float_exception_flags |= float_flag_overflow;
406 } else if (unlikely(fabs(ur.h) <= DBL_MIN)) {
407 if (post == NULL || post(ua, ub)) {
408 goto soft;
411 return ur.s;
413 soft:
414 return soft(ua.s, ub.s, s);
417 /*----------------------------------------------------------------------------
418 | Returns the fraction bits of the half-precision floating-point value `a'.
419 *----------------------------------------------------------------------------*/
421 static inline uint32_t extractFloat16Frac(float16 a)
423 return float16_val(a) & 0x3ff;
426 /*----------------------------------------------------------------------------
427 | Returns the exponent bits of the half-precision floating-point value `a'.
428 *----------------------------------------------------------------------------*/
430 static inline int extractFloat16Exp(float16 a)
432 return (float16_val(a) >> 10) & 0x1f;
435 /*----------------------------------------------------------------------------
436 | Returns the fraction bits of the single-precision floating-point value `a'.
437 *----------------------------------------------------------------------------*/
439 static inline uint32_t extractFloat32Frac(float32 a)
441 return float32_val(a) & 0x007FFFFF;
444 /*----------------------------------------------------------------------------
445 | Returns the exponent bits of the single-precision floating-point value `a'.
446 *----------------------------------------------------------------------------*/
448 static inline int extractFloat32Exp(float32 a)
450 return (float32_val(a) >> 23) & 0xFF;
453 /*----------------------------------------------------------------------------
454 | Returns the sign bit of the single-precision floating-point value `a'.
455 *----------------------------------------------------------------------------*/
457 static inline flag extractFloat32Sign(float32 a)
459 return float32_val(a) >> 31;
462 /*----------------------------------------------------------------------------
463 | Returns the fraction bits of the double-precision floating-point value `a'.
464 *----------------------------------------------------------------------------*/
466 static inline uint64_t extractFloat64Frac(float64 a)
468 return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF);
471 /*----------------------------------------------------------------------------
472 | Returns the exponent bits of the double-precision floating-point value `a'.
473 *----------------------------------------------------------------------------*/
475 static inline int extractFloat64Exp(float64 a)
477 return (float64_val(a) >> 52) & 0x7FF;
480 /*----------------------------------------------------------------------------
481 | Returns the sign bit of the double-precision floating-point value `a'.
482 *----------------------------------------------------------------------------*/
484 static inline flag extractFloat64Sign(float64 a)
486 return float64_val(a) >> 63;
/*
 * Classify a floating point number. Everything above float_class_qnan
 * is a NaN so cls >= float_class_qnan is any NaN.
 *
 * The enumerator order is significant: NaN tests rely on the NaN classes
 * being last.  The type is packed so that it occupies as little space as
 * possible inside structures that embed it.
 */

typedef enum __attribute__ ((__packed__)) {
    float_class_unclassified,
    float_class_zero,
    float_class_normal,
    float_class_inf,
    float_class_qnan,  /* all NaNs from here */
    float_class_snan,
} FloatClass;
503 /* Simple helpers for checking if, or what kind of, NaN we have */
504 static inline __attribute__((unused)) bool is_nan(FloatClass c)
506 return unlikely(c >= float_class_qnan);
509 static inline __attribute__((unused)) bool is_snan(FloatClass c)
511 return c == float_class_snan;
514 static inline __attribute__((unused)) bool is_qnan(FloatClass c)
516 return c == float_class_qnan;
520 * Structure holding all of the decomposed parts of a float. The
521 * exponent is unbiased and the fraction is normalized. All
522 * calculations are done with a 64 bit fraction and then rounded as
523 * appropriate for the final format.
525 * Thanks to the packed FloatClass a decent compiler should be able to
526 * fit the whole structure into registers and avoid using the stack
527 * for parameter passing.
530 typedef struct {
531 uint64_t frac;
532 int32_t exp;
533 FloatClass cls;
534 bool sign;
535 } FloatParts;
/*
 * Layout of the 64-bit decomposed fraction: the binary point sits at bit 62,
 * the implicit (integer) bit is bit 62 and bit 63 is used to detect a carry
 * out of the fraction.
 */
#define DECOMPOSED_BINARY_POINT (64 - 2)
#define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT)
#define DECOMPOSED_OVERFLOW_BIT (DECOMPOSED_IMPLICIT_BIT << 1)
/* Structure holding all of the relevant parameters for a format.
 *   exp_size: the size of the exponent field
 *   exp_bias: the offset applied to the exponent field
 *   exp_max: the maximum normalised exponent
 *   frac_size: the size of the fraction field
 *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
 * The following are computed based on the size of the fraction
 *   frac_lsb: least significant bit of fraction
 *   frac_lsbm1: the bit below the least significant bit (for rounding)
 *   round_mask/roundeven_mask: masks used for rounding
 * The following optional modifiers are available:
 *   arm_althp: handle ARM Alternative Half Precision
 */
typedef struct {
    int exp_size;
    int exp_bias;
    int exp_max;
    int frac_size;
    int frac_shift;
    uint64_t frac_lsb;
    uint64_t frac_lsbm1;
    uint64_t round_mask;
    uint64_t roundeven_mask;
    bool arm_althp;
} FloatFmt;
/*
 * Expand fields based on the size of exponent and fraction.
 *
 * Produces the designated initializers for every FloatFmt field except
 * arm_althp, for a format with E exponent bits and F fraction bits.
 * The arguments are parenthesized so that expressions (not just literals)
 * expand correctly.
 */
#define FLOAT_PARAMS(E, F)                                                 \
    .exp_size       = (E),                                                 \
    .exp_bias       = ((1 << (E)) - 1) >> 1,                               \
    .exp_max        = (1 << (E)) - 1,                                      \
    .frac_size      = (F),                                                 \
    .frac_shift     = DECOMPOSED_BINARY_POINT - (F),                       \
    .frac_lsb       = 1ull << (DECOMPOSED_BINARY_POINT - (F)),             \
    .frac_lsbm1     = 1ull << ((DECOMPOSED_BINARY_POINT - (F)) - 1),       \
    .round_mask     = (1ull << (DECOMPOSED_BINARY_POINT - (F))) - 1,       \
    .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - (F))) - 1
579 static const FloatFmt float16_params = {
580 FLOAT_PARAMS(5, 10)
583 static const FloatFmt float16_params_ahp = {
584 FLOAT_PARAMS(5, 10),
585 .arm_althp = true
588 static const FloatFmt float32_params = {
589 FLOAT_PARAMS(8, 23)
592 static const FloatFmt float64_params = {
593 FLOAT_PARAMS(11, 52)
596 /* Unpack a float to parts, but do not canonicalize. */
597 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
599 const int sign_pos = fmt.frac_size + fmt.exp_size;
601 return (FloatParts) {
602 .cls = float_class_unclassified,
603 .sign = extract64(raw, sign_pos, 1),
604 .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
605 .frac = extract64(raw, 0, fmt.frac_size),
609 static inline FloatParts float16_unpack_raw(float16 f)
611 return unpack_raw(float16_params, f);
614 static inline FloatParts float32_unpack_raw(float32 f)
616 return unpack_raw(float32_params, f);
619 static inline FloatParts float64_unpack_raw(float64 f)
621 return unpack_raw(float64_params, f);
624 /* Pack a float from parts, but do not canonicalize. */
625 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
627 const int sign_pos = fmt.frac_size + fmt.exp_size;
628 uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
629 return deposit64(ret, sign_pos, 1, p.sign);
632 static inline float16 float16_pack_raw(FloatParts p)
634 return make_float16(pack_raw(float16_params, p));
637 static inline float32 float32_pack_raw(FloatParts p)
639 return make_float32(pack_raw(float32_params, p));
642 static inline float64 float64_pack_raw(FloatParts p)
644 return make_float64(pack_raw(float64_params, p));
647 /*----------------------------------------------------------------------------
648 | Functions and definitions to determine: (1) whether tininess for underflow
649 | is detected before or after rounding by default, (2) what (if anything)
650 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
651 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
652 | are propagated from function inputs to output. These details are target-
653 | specific.
654 *----------------------------------------------------------------------------*/
655 #include "softfloat-specialize.h"
657 /* Canonicalize EXP and FRAC, setting CLS. */
658 static FloatParts sf_canonicalize(FloatParts part, const FloatFmt *parm,
659 float_status *status)
661 if (part.exp == parm->exp_max && !parm->arm_althp) {
662 if (part.frac == 0) {
663 part.cls = float_class_inf;
664 } else {
665 part.frac <<= parm->frac_shift;
666 part.cls = (parts_is_snan_frac(part.frac, status)
667 ? float_class_snan : float_class_qnan);
669 } else if (part.exp == 0) {
670 if (likely(part.frac == 0)) {
671 part.cls = float_class_zero;
672 } else if (status->flush_inputs_to_zero) {
673 float_raise(float_flag_input_denormal, status);
674 part.cls = float_class_zero;
675 part.frac = 0;
676 } else {
677 int shift = clz64(part.frac) - 1;
678 part.cls = float_class_normal;
679 part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
680 part.frac <<= shift;
682 } else {
683 part.cls = float_class_normal;
684 part.exp -= parm->exp_bias;
685 part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
687 return part;
690 /* Round and uncanonicalize a floating-point number by parts. There
691 * are FRAC_SHIFT bits that may require rounding at the bottom of the
692 * fraction; these bits will be removed. The exponent will be biased
693 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
696 static FloatParts round_canonical(FloatParts p, float_status *s,
697 const FloatFmt *parm)
699 const uint64_t frac_lsbm1 = parm->frac_lsbm1;
700 const uint64_t round_mask = parm->round_mask;
701 const uint64_t roundeven_mask = parm->roundeven_mask;
702 const int exp_max = parm->exp_max;
703 const int frac_shift = parm->frac_shift;
704 uint64_t frac, inc;
705 int exp, flags = 0;
706 bool overflow_norm;
708 frac = p.frac;
709 exp = p.exp;
711 switch (p.cls) {
712 case float_class_normal:
713 switch (s->float_rounding_mode) {
714 case float_round_nearest_even:
715 overflow_norm = false;
716 inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
717 break;
718 case float_round_ties_away:
719 overflow_norm = false;
720 inc = frac_lsbm1;
721 break;
722 case float_round_to_zero:
723 overflow_norm = true;
724 inc = 0;
725 break;
726 case float_round_up:
727 inc = p.sign ? 0 : round_mask;
728 overflow_norm = p.sign;
729 break;
730 case float_round_down:
731 inc = p.sign ? round_mask : 0;
732 overflow_norm = !p.sign;
733 break;
734 default:
735 g_assert_not_reached();
738 exp += parm->exp_bias;
739 if (likely(exp > 0)) {
740 if (frac & round_mask) {
741 flags |= float_flag_inexact;
742 frac += inc;
743 if (frac & DECOMPOSED_OVERFLOW_BIT) {
744 frac >>= 1;
745 exp++;
748 frac >>= frac_shift;
750 if (parm->arm_althp) {
751 /* ARM Alt HP eschews Inf and NaN for a wider exponent. */
752 if (unlikely(exp > exp_max)) {
753 /* Overflow. Return the maximum normal. */
754 flags = float_flag_invalid;
755 exp = exp_max;
756 frac = -1;
758 } else if (unlikely(exp >= exp_max)) {
759 flags |= float_flag_overflow | float_flag_inexact;
760 if (overflow_norm) {
761 exp = exp_max - 1;
762 frac = -1;
763 } else {
764 p.cls = float_class_inf;
765 goto do_inf;
768 } else if (s->flush_to_zero) {
769 flags |= float_flag_output_denormal;
770 p.cls = float_class_zero;
771 goto do_zero;
772 } else {
773 bool is_tiny = (s->float_detect_tininess
774 == float_tininess_before_rounding)
775 || (exp < 0)
776 || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);
778 shift64RightJamming(frac, 1 - exp, &frac);
779 if (frac & round_mask) {
780 /* Need to recompute round-to-even. */
781 if (s->float_rounding_mode == float_round_nearest_even) {
782 inc = ((frac & roundeven_mask) != frac_lsbm1
783 ? frac_lsbm1 : 0);
785 flags |= float_flag_inexact;
786 frac += inc;
789 exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
790 frac >>= frac_shift;
792 if (is_tiny && (flags & float_flag_inexact)) {
793 flags |= float_flag_underflow;
795 if (exp == 0 && frac == 0) {
796 p.cls = float_class_zero;
799 break;
801 case float_class_zero:
802 do_zero:
803 exp = 0;
804 frac = 0;
805 break;
807 case float_class_inf:
808 do_inf:
809 assert(!parm->arm_althp);
810 exp = exp_max;
811 frac = 0;
812 break;
814 case float_class_qnan:
815 case float_class_snan:
816 assert(!parm->arm_althp);
817 exp = exp_max;
818 frac >>= parm->frac_shift;
819 break;
821 default:
822 g_assert_not_reached();
825 float_raise(flags, s);
826 p.exp = exp;
827 p.frac = frac;
828 return p;
831 /* Explicit FloatFmt version */
832 static FloatParts float16a_unpack_canonical(float16 f, float_status *s,
833 const FloatFmt *params)
835 return sf_canonicalize(float16_unpack_raw(f), params, s);
838 static FloatParts float16_unpack_canonical(float16 f, float_status *s)
840 return float16a_unpack_canonical(f, s, &float16_params);
843 static float16 float16a_round_pack_canonical(FloatParts p, float_status *s,
844 const FloatFmt *params)
846 return float16_pack_raw(round_canonical(p, s, params));
849 static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
851 return float16a_round_pack_canonical(p, s, &float16_params);
854 static FloatParts float32_unpack_canonical(float32 f, float_status *s)
856 return sf_canonicalize(float32_unpack_raw(f), &float32_params, s);
859 static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
861 return float32_pack_raw(round_canonical(p, s, &float32_params));
864 static FloatParts float64_unpack_canonical(float64 f, float_status *s)
866 return sf_canonicalize(float64_unpack_raw(f), &float64_params, s);
869 static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
871 return float64_pack_raw(round_canonical(p, s, &float64_params));
874 static FloatParts return_nan(FloatParts a, float_status *s)
876 switch (a.cls) {
877 case float_class_snan:
878 s->float_exception_flags |= float_flag_invalid;
879 a = parts_silence_nan(a, s);
880 /* fall through */
881 case float_class_qnan:
882 if (s->default_nan_mode) {
883 return parts_default_nan(s);
885 break;
887 default:
888 g_assert_not_reached();
890 return a;
893 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
895 if (is_snan(a.cls) || is_snan(b.cls)) {
896 s->float_exception_flags |= float_flag_invalid;
899 if (s->default_nan_mode) {
900 return parts_default_nan(s);
901 } else {
902 if (pickNaN(a.cls, b.cls,
903 a.frac > b.frac ||
904 (a.frac == b.frac && a.sign < b.sign))) {
905 a = b;
907 if (is_snan(a.cls)) {
908 return parts_silence_nan(a, s);
911 return a;
914 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
915 bool inf_zero, float_status *s)
917 int which;
919 if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
920 s->float_exception_flags |= float_flag_invalid;
923 which = pickNaNMulAdd(a.cls, b.cls, c.cls, inf_zero, s);
925 if (s->default_nan_mode) {
926 /* Note that this check is after pickNaNMulAdd so that function
927 * has an opportunity to set the Invalid flag.
929 which = 3;
932 switch (which) {
933 case 0:
934 break;
935 case 1:
936 a = b;
937 break;
938 case 2:
939 a = c;
940 break;
941 case 3:
942 return parts_default_nan(s);
943 default:
944 g_assert_not_reached();
947 if (is_snan(a.cls)) {
948 return parts_silence_nan(a, s);
950 return a;
954 * Returns the result of adding or subtracting the values of the
955 * floating-point values `a' and `b'. The operation is performed
956 * according to the IEC/IEEE Standard for Binary Floating-Point
957 * Arithmetic.
960 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
961 float_status *s)
963 bool a_sign = a.sign;
964 bool b_sign = b.sign ^ subtract;
966 if (a_sign != b_sign) {
967 /* Subtraction */
969 if (a.cls == float_class_normal && b.cls == float_class_normal) {
970 if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
971 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
972 a.frac = a.frac - b.frac;
973 } else {
974 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
975 a.frac = b.frac - a.frac;
976 a.exp = b.exp;
977 a_sign ^= 1;
980 if (a.frac == 0) {
981 a.cls = float_class_zero;
982 a.sign = s->float_rounding_mode == float_round_down;
983 } else {
984 int shift = clz64(a.frac) - 1;
985 a.frac = a.frac << shift;
986 a.exp = a.exp - shift;
987 a.sign = a_sign;
989 return a;
991 if (is_nan(a.cls) || is_nan(b.cls)) {
992 return pick_nan(a, b, s);
994 if (a.cls == float_class_inf) {
995 if (b.cls == float_class_inf) {
996 float_raise(float_flag_invalid, s);
997 return parts_default_nan(s);
999 return a;
1001 if (a.cls == float_class_zero && b.cls == float_class_zero) {
1002 a.sign = s->float_rounding_mode == float_round_down;
1003 return a;
1005 if (a.cls == float_class_zero || b.cls == float_class_inf) {
1006 b.sign = a_sign ^ 1;
1007 return b;
1009 if (b.cls == float_class_zero) {
1010 return a;
1012 } else {
1013 /* Addition */
1014 if (a.cls == float_class_normal && b.cls == float_class_normal) {
1015 if (a.exp > b.exp) {
1016 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
1017 } else if (a.exp < b.exp) {
1018 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
1019 a.exp = b.exp;
1021 a.frac += b.frac;
1022 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
1023 shift64RightJamming(a.frac, 1, &a.frac);
1024 a.exp += 1;
1026 return a;
1028 if (is_nan(a.cls) || is_nan(b.cls)) {
1029 return pick_nan(a, b, s);
1031 if (a.cls == float_class_inf || b.cls == float_class_zero) {
1032 return a;
1034 if (b.cls == float_class_inf || a.cls == float_class_zero) {
1035 b.sign = b_sign;
1036 return b;
1039 g_assert_not_reached();
1043 * Returns the result of adding or subtracting the floating-point
1044 * values `a' and `b'. The operation is performed according to the
1045 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1048 float16 QEMU_FLATTEN float16_add(float16 a, float16 b, float_status *status)
1050 FloatParts pa = float16_unpack_canonical(a, status);
1051 FloatParts pb = float16_unpack_canonical(b, status);
1052 FloatParts pr = addsub_floats(pa, pb, false, status);
1054 return float16_round_pack_canonical(pr, status);
1057 float16 QEMU_FLATTEN float16_sub(float16 a, float16 b, float_status *status)
1059 FloatParts pa = float16_unpack_canonical(a, status);
1060 FloatParts pb = float16_unpack_canonical(b, status);
1061 FloatParts pr = addsub_floats(pa, pb, true, status);
1063 return float16_round_pack_canonical(pr, status);
1066 static float32 QEMU_SOFTFLOAT_ATTR
1067 soft_f32_addsub(float32 a, float32 b, bool subtract, float_status *status)
1069 FloatParts pa = float32_unpack_canonical(a, status);
1070 FloatParts pb = float32_unpack_canonical(b, status);
1071 FloatParts pr = addsub_floats(pa, pb, subtract, status);
1073 return float32_round_pack_canonical(pr, status);
1076 static inline float32 soft_f32_add(float32 a, float32 b, float_status *status)
1078 return soft_f32_addsub(a, b, false, status);
1081 static inline float32 soft_f32_sub(float32 a, float32 b, float_status *status)
1083 return soft_f32_addsub(a, b, true, status);
1086 static float64 QEMU_SOFTFLOAT_ATTR
1087 soft_f64_addsub(float64 a, float64 b, bool subtract, float_status *status)
1089 FloatParts pa = float64_unpack_canonical(a, status);
1090 FloatParts pb = float64_unpack_canonical(b, status);
1091 FloatParts pr = addsub_floats(pa, pb, subtract, status);
1093 return float64_round_pack_canonical(pr, status);
1096 static inline float64 soft_f64_add(float64 a, float64 b, float_status *status)
1098 return soft_f64_addsub(a, b, false, status);
1101 static inline float64 soft_f64_sub(float64 a, float64 b, float_status *status)
1103 return soft_f64_addsub(a, b, true, status);
/* Host-FPU single-precision addition. */
static float hard_f32_add(float a, float b)
{
    const float sum = a + b;

    return sum;
}
/* Host-FPU single-precision subtraction. */
static float hard_f32_sub(float a, float b)
{
    const float diff = a - b;

    return diff;
}
/* Host-FPU double-precision addition. */
static double hard_f64_add(double a, double b)
{
    const double sum = a + b;

    return sum;
}
/* Host-FPU double-precision subtraction. */
static double hard_f64_sub(double a, double b)
{
    const double diff = a - b;

    return diff;
}
1126 static bool f32_addsub_post(union_float32 a, union_float32 b)
1128 if (QEMU_HARDFLOAT_2F32_USE_FP) {
1129 return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1131 return !(float32_is_zero(a.s) && float32_is_zero(b.s));
1134 static bool f64_addsub_post(union_float64 a, union_float64 b)
1136 if (QEMU_HARDFLOAT_2F64_USE_FP) {
1137 return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1138 } else {
1139 return !(float64_is_zero(a.s) && float64_is_zero(b.s));
1143 static float32 float32_addsub(float32 a, float32 b, float_status *s,
1144 hard_f32_op2_fn hard, soft_f32_op2_fn soft)
1146 return float32_gen2(a, b, s, hard, soft,
1147 f32_is_zon2, f32_addsub_post, NULL, NULL);
1150 static float64 float64_addsub(float64 a, float64 b, float_status *s,
1151 hard_f64_op2_fn hard, soft_f64_op2_fn soft)
1153 return float64_gen2(a, b, s, hard, soft,
1154 f64_is_zon2, f64_addsub_post, NULL, NULL);
1157 float32 QEMU_FLATTEN
1158 float32_add(float32 a, float32 b, float_status *s)
1160 return float32_addsub(a, b, s, hard_f32_add, soft_f32_add);
1163 float32 QEMU_FLATTEN
1164 float32_sub(float32 a, float32 b, float_status *s)
1166 return float32_addsub(a, b, s, hard_f32_sub, soft_f32_sub);
1169 float64 QEMU_FLATTEN
1170 float64_add(float64 a, float64 b, float_status *s)
1172 return float64_addsub(a, b, s, hard_f64_add, soft_f64_add);
1175 float64 QEMU_FLATTEN
1176 float64_sub(float64 a, float64 b, float_status *s)
1178 return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub);
1182 * Returns the result of multiplying the floating-point values `a' and
1183 * `b'. The operation is performed according to the IEC/IEEE Standard
1184 * for Binary Floating-Point Arithmetic.
1187 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
1189 bool sign = a.sign ^ b.sign;
1191 if (a.cls == float_class_normal && b.cls == float_class_normal) {
1192 uint64_t hi, lo;
1193 int exp = a.exp + b.exp;
1195 mul64To128(a.frac, b.frac, &hi, &lo);
1196 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1197 if (lo & DECOMPOSED_OVERFLOW_BIT) {
1198 shift64RightJamming(lo, 1, &lo);
1199 exp += 1;
1202 /* Re-use a */
1203 a.exp = exp;
1204 a.sign = sign;
1205 a.frac = lo;
1206 return a;
1208 /* handle all the NaN cases */
1209 if (is_nan(a.cls) || is_nan(b.cls)) {
1210 return pick_nan(a, b, s);
1212 /* Inf * Zero == NaN */
1213 if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
1214 (a.cls == float_class_zero && b.cls == float_class_inf)) {
1215 s->float_exception_flags |= float_flag_invalid;
1216 return parts_default_nan(s);
1218 /* Multiply by 0 or Inf */
1219 if (a.cls == float_class_inf || a.cls == float_class_zero) {
1220 a.sign = sign;
1221 return a;
1223 if (b.cls == float_class_inf || b.cls == float_class_zero) {
1224 b.sign = sign;
1225 return b;
1227 g_assert_not_reached();
1230 float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status)
1232 FloatParts pa = float16_unpack_canonical(a, status);
1233 FloatParts pb = float16_unpack_canonical(b, status);
1234 FloatParts pr = mul_floats(pa, pb, status);
1236 return float16_round_pack_canonical(pr, status);
1239 static float32 QEMU_SOFTFLOAT_ATTR
1240 soft_f32_mul(float32 a, float32 b, float_status *status)
1242 FloatParts pa = float32_unpack_canonical(a, status);
1243 FloatParts pb = float32_unpack_canonical(b, status);
1244 FloatParts pr = mul_floats(pa, pb, status);
1246 return float32_round_pack_canonical(pr, status);
1249 static float64 QEMU_SOFTFLOAT_ATTR
1250 soft_f64_mul(float64 a, float64 b, float_status *status)
1252 FloatParts pa = float64_unpack_canonical(a, status);
1253 FloatParts pb = float64_unpack_canonical(b, status);
1254 FloatParts pr = mul_floats(pa, pb, status);
1256 return float64_round_pack_canonical(pr, status);
/* Host-FPU single-precision multiplication. */
static float hard_f32_mul(float a, float b)
{
    const float product = a * b;

    return product;
}
/* Host-FPU double-precision multiplication. */
static double hard_f64_mul(double a, double b)
{
    const double product = a * b;

    return product;
}
1269 static bool f32_mul_fast_test(union_float32 a, union_float32 b)
1271 return float32_is_zero(a.s) || float32_is_zero(b.s);
1274 static bool f64_mul_fast_test(union_float64 a, union_float64 b)
1276 return float64_is_zero(a.s) || float64_is_zero(b.s);
1279 static float32 f32_mul_fast_op(float32 a, float32 b, float_status *s)
1281 bool signbit = float32_is_neg(a) ^ float32_is_neg(b);
1283 return float32_set_sign(float32_zero, signbit);
1286 static float64 f64_mul_fast_op(float64 a, float64 b, float_status *s)
1288 bool signbit = float64_is_neg(a) ^ float64_is_neg(b);
1290 return float64_set_sign(float64_zero, signbit);
1293 float32 QEMU_FLATTEN
1294 float32_mul(float32 a, float32 b, float_status *s)
1296 return float32_gen2(a, b, s, hard_f32_mul, soft_f32_mul,
1297 f32_is_zon2, NULL, f32_mul_fast_test, f32_mul_fast_op);
1300 float64 QEMU_FLATTEN
1301 float64_mul(float64 a, float64 b, float_status *s)
1303 return float64_gen2(a, b, s, hard_f64_mul, soft_f64_mul,
1304 f64_is_zon2, NULL, f64_mul_fast_test, f64_mul_fast_op);
1308 * Returns the result of multiplying the floating-point values `a' and
1309 * `b' then adding 'c', with no intermediate rounding step after the
1310 * multiplication. The operation is performed according to the
1311 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
1312 * The flags argument allows the caller to select negation of the
1313 * addend, the intermediate product, or the final result. (The
1314 * difference between this and having the caller do a separate
1315 * negation is that negating externally will flip the sign bit on
1316 * NaNs.)
1319 static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
1320 int flags, float_status *s)
1322 bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
1323 ((1 << float_class_inf) | (1 << float_class_zero));
1324 bool p_sign;
1325 bool sign_flip = flags & float_muladd_negate_result;
1326 FloatClass p_class;
1327 uint64_t hi, lo;
1328 int p_exp;
1330 /* It is implementation-defined whether the cases of (0,inf,qnan)
1331 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
1332 * they return if they do), so we have to hand this information
1333 * off to the target-specific pick-a-NaN routine.
1335 if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
1336 return pick_nan_muladd(a, b, c, inf_zero, s);
1339 if (inf_zero) {
1340 s->float_exception_flags |= float_flag_invalid;
1341 return parts_default_nan(s);
1344 if (flags & float_muladd_negate_c) {
1345 c.sign ^= 1;
1348 p_sign = a.sign ^ b.sign;
1350 if (flags & float_muladd_negate_product) {
1351 p_sign ^= 1;
1354 if (a.cls == float_class_inf || b.cls == float_class_inf) {
1355 p_class = float_class_inf;
1356 } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
1357 p_class = float_class_zero;
1358 } else {
1359 p_class = float_class_normal;
1362 if (c.cls == float_class_inf) {
1363 if (p_class == float_class_inf && p_sign != c.sign) {
1364 s->float_exception_flags |= float_flag_invalid;
1365 return parts_default_nan(s);
1366 } else {
1367 a.cls = float_class_inf;
1368 a.sign = c.sign ^ sign_flip;
1369 return a;
1373 if (p_class == float_class_inf) {
1374 a.cls = float_class_inf;
1375 a.sign = p_sign ^ sign_flip;
1376 return a;
1379 if (p_class == float_class_zero) {
1380 if (c.cls == float_class_zero) {
1381 if (p_sign != c.sign) {
1382 p_sign = s->float_rounding_mode == float_round_down;
1384 c.sign = p_sign;
1385 } else if (flags & float_muladd_halve_result) {
1386 c.exp -= 1;
1388 c.sign ^= sign_flip;
1389 return c;
1392 /* a & b should be normals now... */
1393 assert(a.cls == float_class_normal &&
1394 b.cls == float_class_normal);
1396 p_exp = a.exp + b.exp;
1398 /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
1399 * result.
1401 mul64To128(a.frac, b.frac, &hi, &lo);
1402 /* binary point now at bit 124 */
1404 /* check for overflow */
1405 if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
1406 shift128RightJamming(hi, lo, 1, &hi, &lo);
1407 p_exp += 1;
1410 /* + add/sub */
1411 if (c.cls == float_class_zero) {
1412 /* move binary point back to 62 */
1413 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1414 } else {
1415 int exp_diff = p_exp - c.exp;
1416 if (p_sign == c.sign) {
1417 /* Addition */
1418 if (exp_diff <= 0) {
1419 shift128RightJamming(hi, lo,
1420 DECOMPOSED_BINARY_POINT - exp_diff,
1421 &hi, &lo);
1422 lo += c.frac;
1423 p_exp = c.exp;
1424 } else {
1425 uint64_t c_hi, c_lo;
1426 /* shift c to the same binary point as the product (124) */
1427 c_hi = c.frac >> 2;
1428 c_lo = 0;
1429 shift128RightJamming(c_hi, c_lo,
1430 exp_diff,
1431 &c_hi, &c_lo);
1432 add128(hi, lo, c_hi, c_lo, &hi, &lo);
1433 /* move binary point back to 62 */
1434 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1437 if (lo & DECOMPOSED_OVERFLOW_BIT) {
1438 shift64RightJamming(lo, 1, &lo);
1439 p_exp += 1;
1442 } else {
1443 /* Subtraction */
1444 uint64_t c_hi, c_lo;
1445 /* make C binary point match product at bit 124 */
1446 c_hi = c.frac >> 2;
1447 c_lo = 0;
1449 if (exp_diff <= 0) {
1450 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
1451 if (exp_diff == 0
1453 (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
1454 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1455 } else {
1456 sub128(c_hi, c_lo, hi, lo, &hi, &lo);
1457 p_sign ^= 1;
1458 p_exp = c.exp;
1460 } else {
1461 shift128RightJamming(c_hi, c_lo,
1462 exp_diff,
1463 &c_hi, &c_lo);
1464 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1467 if (hi == 0 && lo == 0) {
1468 a.cls = float_class_zero;
1469 a.sign = s->float_rounding_mode == float_round_down;
1470 a.sign ^= sign_flip;
1471 return a;
1472 } else {
1473 int shift;
1474 if (hi != 0) {
1475 shift = clz64(hi);
1476 } else {
1477 shift = clz64(lo) + 64;
1479 /* Normalizing to a binary point of 124 is the
1480 correct adjust for the exponent. However since we're
1481 shifting, we might as well put the binary point back
1482 at 62 where we really want it. Therefore shift as
1483 if we're leaving 1 bit at the top of the word, but
1484 adjust the exponent as if we're leaving 3 bits. */
1485 shift -= 1;
1486 if (shift >= 64) {
1487 lo = lo << (shift - 64);
1488 } else {
1489 hi = (hi << shift) | (lo >> (64 - shift));
1490 lo = hi | ((lo << shift) != 0);
1492 p_exp -= shift - 2;
1497 if (flags & float_muladd_halve_result) {
1498 p_exp -= 1;
1501 /* finally prepare our result */
1502 a.cls = float_class_normal;
1503 a.sign = p_sign ^ sign_flip;
1504 a.exp = p_exp;
1505 a.frac = lo;
1507 return a;
1510 float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c,
1511 int flags, float_status *status)
1513 FloatParts pa = float16_unpack_canonical(a, status);
1514 FloatParts pb = float16_unpack_canonical(b, status);
1515 FloatParts pc = float16_unpack_canonical(c, status);
1516 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1518 return float16_round_pack_canonical(pr, status);
1521 static float32 QEMU_SOFTFLOAT_ATTR
1522 soft_f32_muladd(float32 a, float32 b, float32 c, int flags,
1523 float_status *status)
1525 FloatParts pa = float32_unpack_canonical(a, status);
1526 FloatParts pb = float32_unpack_canonical(b, status);
1527 FloatParts pc = float32_unpack_canonical(c, status);
1528 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1530 return float32_round_pack_canonical(pr, status);
1533 static float64 QEMU_SOFTFLOAT_ATTR
1534 soft_f64_muladd(float64 a, float64 b, float64 c, int flags,
1535 float_status *status)
1537 FloatParts pa = float64_unpack_canonical(a, status);
1538 FloatParts pb = float64_unpack_canonical(b, status);
1539 FloatParts pc = float64_unpack_canonical(c, status);
1540 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1542 return float64_round_pack_canonical(pr, status);
/*
 * When true, float32_muladd() and float64_muladd() skip the host fma path
 * and use the software implementation.  NOTE(review): the code that sets
 * this flag is outside this part of the file.
 */
static bool force_soft_fma;
1547 float32 QEMU_FLATTEN
1548 float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s)
1550 union_float32 ua, ub, uc, ur;
1552 ua.s = xa;
1553 ub.s = xb;
1554 uc.s = xc;
1556 if (unlikely(!can_use_fpu(s))) {
1557 goto soft;
1559 if (unlikely(flags & float_muladd_halve_result)) {
1560 goto soft;
1563 float32_input_flush3(&ua.s, &ub.s, &uc.s, s);
1564 if (unlikely(!f32_is_zon3(ua, ub, uc))) {
1565 goto soft;
1568 if (unlikely(force_soft_fma)) {
1569 goto soft;
1573 * When (a || b) == 0, there's no need to check for under/over flow,
1574 * since we know the addend is (normal || 0) and the product is 0.
1576 if (float32_is_zero(ua.s) || float32_is_zero(ub.s)) {
1577 union_float32 up;
1578 bool prod_sign;
1580 prod_sign = float32_is_neg(ua.s) ^ float32_is_neg(ub.s);
1581 prod_sign ^= !!(flags & float_muladd_negate_product);
1582 up.s = float32_set_sign(float32_zero, prod_sign);
1584 if (flags & float_muladd_negate_c) {
1585 uc.h = -uc.h;
1587 ur.h = up.h + uc.h;
1588 } else {
1589 if (flags & float_muladd_negate_product) {
1590 ua.h = -ua.h;
1592 if (flags & float_muladd_negate_c) {
1593 uc.h = -uc.h;
1596 ur.h = fmaf(ua.h, ub.h, uc.h);
1598 if (unlikely(f32_is_inf(ur))) {
1599 s->float_exception_flags |= float_flag_overflow;
1600 } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
1601 goto soft;
1604 if (flags & float_muladd_negate_result) {
1605 return float32_chs(ur.s);
1607 return ur.s;
1609 soft:
1610 return soft_f32_muladd(ua.s, ub.s, uc.s, flags, s);
1613 float64 QEMU_FLATTEN
1614 float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s)
1616 union_float64 ua, ub, uc, ur;
1618 ua.s = xa;
1619 ub.s = xb;
1620 uc.s = xc;
1622 if (unlikely(!can_use_fpu(s))) {
1623 goto soft;
1625 if (unlikely(flags & float_muladd_halve_result)) {
1626 goto soft;
1629 float64_input_flush3(&ua.s, &ub.s, &uc.s, s);
1630 if (unlikely(!f64_is_zon3(ua, ub, uc))) {
1631 goto soft;
1634 if (unlikely(force_soft_fma)) {
1635 goto soft;
1639 * When (a || b) == 0, there's no need to check for under/over flow,
1640 * since we know the addend is (normal || 0) and the product is 0.
1642 if (float64_is_zero(ua.s) || float64_is_zero(ub.s)) {
1643 union_float64 up;
1644 bool prod_sign;
1646 prod_sign = float64_is_neg(ua.s) ^ float64_is_neg(ub.s);
1647 prod_sign ^= !!(flags & float_muladd_negate_product);
1648 up.s = float64_set_sign(float64_zero, prod_sign);
1650 if (flags & float_muladd_negate_c) {
1651 uc.h = -uc.h;
1653 ur.h = up.h + uc.h;
1654 } else {
1655 if (flags & float_muladd_negate_product) {
1656 ua.h = -ua.h;
1658 if (flags & float_muladd_negate_c) {
1659 uc.h = -uc.h;
1662 ur.h = fma(ua.h, ub.h, uc.h);
1664 if (unlikely(f64_is_inf(ur))) {
1665 s->float_exception_flags |= float_flag_overflow;
1666 } else if (unlikely(fabs(ur.h) <= FLT_MIN)) {
1667 goto soft;
1670 if (flags & float_muladd_negate_result) {
1671 return float64_chs(ur.s);
1673 return ur.s;
1675 soft:
1676 return soft_f64_muladd(ua.s, ub.s, uc.s, flags, s);
1680 * Returns the result of dividing the floating-point value `a' by the
1681 * corresponding value `b'. The operation is performed according to
1682 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1685 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
1687 bool sign = a.sign ^ b.sign;
1689 if (a.cls == float_class_normal && b.cls == float_class_normal) {
1690 uint64_t n0, n1, q, r;
1691 int exp = a.exp - b.exp;
1694 * We want a 2*N / N-bit division to produce exactly an N-bit
1695 * result, so that we do not lose any precision and so that we
1696 * do not have to renormalize afterward. If A.frac < B.frac,
1697 * then division would produce an (N-1)-bit result; shift A left
1698 * by one to produce the an N-bit result, and decrement the
1699 * exponent to match.
1701 * The udiv_qrnnd algorithm that we're using requires normalization,
1702 * i.e. the msb of the denominator must be set. Since we know that
1703 * DECOMPOSED_BINARY_POINT is msb-1, the inputs must be shifted left
1704 * by one (more), and the remainder must be shifted right by one.
1706 if (a.frac < b.frac) {
1707 exp -= 1;
1708 shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 2, &n1, &n0);
1709 } else {
1710 shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, &n1, &n0);
1712 q = udiv_qrnnd(&r, n1, n0, b.frac << 1);
1715 * Set lsb if there is a remainder, to set inexact.
1716 * As mentioned above, to find the actual value of the remainder we
1717 * would need to shift right, but (1) we are only concerned about
1718 * non-zero-ness, and (2) the remainder will always be even because
1719 * both inputs to the division primitive are even.
1721 a.frac = q | (r != 0);
1722 a.sign = sign;
1723 a.exp = exp;
1724 return a;
1726 /* handle all the NaN cases */
1727 if (is_nan(a.cls) || is_nan(b.cls)) {
1728 return pick_nan(a, b, s);
1730 /* 0/0 or Inf/Inf */
1731 if (a.cls == b.cls
1733 (a.cls == float_class_inf || a.cls == float_class_zero)) {
1734 s->float_exception_flags |= float_flag_invalid;
1735 return parts_default_nan(s);
1737 /* Inf / x or 0 / x */
1738 if (a.cls == float_class_inf || a.cls == float_class_zero) {
1739 a.sign = sign;
1740 return a;
1742 /* Div 0 => Inf */
1743 if (b.cls == float_class_zero) {
1744 s->float_exception_flags |= float_flag_divbyzero;
1745 a.cls = float_class_inf;
1746 a.sign = sign;
1747 return a;
1749 /* Div by Inf */
1750 if (b.cls == float_class_inf) {
1751 a.cls = float_class_zero;
1752 a.sign = sign;
1753 return a;
1755 g_assert_not_reached();
1758 float16 float16_div(float16 a, float16 b, float_status *status)
1760 FloatParts pa = float16_unpack_canonical(a, status);
1761 FloatParts pb = float16_unpack_canonical(b, status);
1762 FloatParts pr = div_floats(pa, pb, status);
1764 return float16_round_pack_canonical(pr, status);
1767 static float32 QEMU_SOFTFLOAT_ATTR
1768 soft_f32_div(float32 a, float32 b, float_status *status)
1770 FloatParts pa = float32_unpack_canonical(a, status);
1771 FloatParts pb = float32_unpack_canonical(b, status);
1772 FloatParts pr = div_floats(pa, pb, status);
1774 return float32_round_pack_canonical(pr, status);
1777 static float64 QEMU_SOFTFLOAT_ATTR
1778 soft_f64_div(float64 a, float64 b, float_status *status)
1780 FloatParts pa = float64_unpack_canonical(a, status);
1781 FloatParts pb = float64_unpack_canonical(b, status);
1782 FloatParts pr = div_floats(pa, pb, status);
1784 return float64_round_pack_canonical(pr, status);
/* Host-FPU single-precision division. */
static float hard_f32_div(float a, float b)
{
    const float quotient = a / b;

    return quotient;
}
/* Host-FPU double-precision division. */
static double hard_f64_div(double a, double b)
{
    const double quotient = a / b;

    return quotient;
}
1797 static bool f32_div_pre(union_float32 a, union_float32 b)
1799 if (QEMU_HARDFLOAT_2F32_USE_FP) {
1800 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
1801 fpclassify(b.h) == FP_NORMAL;
1803 return float32_is_zero_or_normal(a.s) && float32_is_normal(b.s);
1806 static bool f64_div_pre(union_float64 a, union_float64 b)
1808 if (QEMU_HARDFLOAT_2F64_USE_FP) {
1809 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
1810 fpclassify(b.h) == FP_NORMAL;
1812 return float64_is_zero_or_normal(a.s) && float64_is_normal(b.s);
1815 static bool f32_div_post(union_float32 a, union_float32 b)
1817 if (QEMU_HARDFLOAT_2F32_USE_FP) {
1818 return fpclassify(a.h) != FP_ZERO;
1820 return !float32_is_zero(a.s);
1823 static bool f64_div_post(union_float64 a, union_float64 b)
1825 if (QEMU_HARDFLOAT_2F64_USE_FP) {
1826 return fpclassify(a.h) != FP_ZERO;
1828 return !float64_is_zero(a.s);
1831 float32 QEMU_FLATTEN
1832 float32_div(float32 a, float32 b, float_status *s)
1834 return float32_gen2(a, b, s, hard_f32_div, soft_f32_div,
1835 f32_div_pre, f32_div_post, NULL, NULL);
1838 float64 QEMU_FLATTEN
1839 float64_div(float64 a, float64 b, float_status *s)
1841 return float64_gen2(a, b, s, hard_f64_div, soft_f64_div,
1842 f64_div_pre, f64_div_post, NULL, NULL);
1846 * Float to Float conversions
1848 * Returns the result of converting one float format to another. The
1849 * conversion is performed according to the IEC/IEEE Standard for
1850 * Binary Floating-Point Arithmetic.
1852 * The float_to_float helper only needs to take care of raising
1853 * invalid exceptions and handling the conversion on NaNs.
1856 static FloatParts float_to_float(FloatParts a, const FloatFmt *dstf,
1857 float_status *s)
1859 if (dstf->arm_althp) {
1860 switch (a.cls) {
1861 case float_class_qnan:
1862 case float_class_snan:
1863 /* There is no NaN in the destination format. Raise Invalid
1864 * and return a zero with the sign of the input NaN.
1866 s->float_exception_flags |= float_flag_invalid;
1867 a.cls = float_class_zero;
1868 a.frac = 0;
1869 a.exp = 0;
1870 break;
1872 case float_class_inf:
1873 /* There is no Inf in the destination format. Raise Invalid
1874 * and return the maximum normal with the correct sign.
1876 s->float_exception_flags |= float_flag_invalid;
1877 a.cls = float_class_normal;
1878 a.exp = dstf->exp_max;
1879 a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift;
1880 break;
1882 default:
1883 break;
1885 } else if (is_nan(a.cls)) {
1886 if (is_snan(a.cls)) {
1887 s->float_exception_flags |= float_flag_invalid;
1888 a = parts_silence_nan(a, s);
1890 if (s->default_nan_mode) {
1891 return parts_default_nan(s);
1894 return a;
1897 float32 float16_to_float32(float16 a, bool ieee, float_status *s)
1899 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1900 FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1901 FloatParts pr = float_to_float(p, &float32_params, s);
1902 return float32_round_pack_canonical(pr, s);
1905 float64 float16_to_float64(float16 a, bool ieee, float_status *s)
1907 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1908 FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1909 FloatParts pr = float_to_float(p, &float64_params, s);
1910 return float64_round_pack_canonical(pr, s);
1913 float16 float32_to_float16(float32 a, bool ieee, float_status *s)
1915 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1916 FloatParts p = float32_unpack_canonical(a, s);
1917 FloatParts pr = float_to_float(p, fmt16, s);
1918 return float16a_round_pack_canonical(pr, s, fmt16);
1921 float64 float32_to_float64(float32 a, float_status *s)
1923 FloatParts p = float32_unpack_canonical(a, s);
1924 FloatParts pr = float_to_float(p, &float64_params, s);
1925 return float64_round_pack_canonical(pr, s);
1928 float16 float64_to_float16(float64 a, bool ieee, float_status *s)
1930 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1931 FloatParts p = float64_unpack_canonical(a, s);
1932 FloatParts pr = float_to_float(p, fmt16, s);
1933 return float16a_round_pack_canonical(pr, s, fmt16);
1936 float32 float64_to_float32(float64 a, float_status *s)
1938 FloatParts p = float64_unpack_canonical(a, s);
1939 FloatParts pr = float_to_float(p, &float32_params, s);
1940 return float32_round_pack_canonical(pr, s);
1944 * Rounds the floating-point value `a' to an integer, and returns the
1945 * result as a floating-point value. The operation is performed
1946 * according to the IEC/IEEE Standard for Binary Floating-Point
1947 * Arithmetic.
1950 static FloatParts round_to_int(FloatParts a, int rmode,
1951 int scale, float_status *s)
1953 switch (a.cls) {
1954 case float_class_qnan:
1955 case float_class_snan:
1956 return return_nan(a, s);
1958 case float_class_zero:
1959 case float_class_inf:
1960 /* already "integral" */
1961 break;
1963 case float_class_normal:
1964 scale = MIN(MAX(scale, -0x10000), 0x10000);
1965 a.exp += scale;
1967 if (a.exp >= DECOMPOSED_BINARY_POINT) {
1968 /* already integral */
1969 break;
1971 if (a.exp < 0) {
1972 bool one;
1973 /* all fractional */
1974 s->float_exception_flags |= float_flag_inexact;
1975 switch (rmode) {
1976 case float_round_nearest_even:
1977 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
1978 break;
1979 case float_round_ties_away:
1980 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
1981 break;
1982 case float_round_to_zero:
1983 one = false;
1984 break;
1985 case float_round_up:
1986 one = !a.sign;
1987 break;
1988 case float_round_down:
1989 one = a.sign;
1990 break;
1991 default:
1992 g_assert_not_reached();
1995 if (one) {
1996 a.frac = DECOMPOSED_IMPLICIT_BIT;
1997 a.exp = 0;
1998 } else {
1999 a.cls = float_class_zero;
2001 } else {
2002 uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
2003 uint64_t frac_lsbm1 = frac_lsb >> 1;
2004 uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
2005 uint64_t rnd_mask = rnd_even_mask >> 1;
2006 uint64_t inc;
2008 switch (rmode) {
2009 case float_round_nearest_even:
2010 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
2011 break;
2012 case float_round_ties_away:
2013 inc = frac_lsbm1;
2014 break;
2015 case float_round_to_zero:
2016 inc = 0;
2017 break;
2018 case float_round_up:
2019 inc = a.sign ? 0 : rnd_mask;
2020 break;
2021 case float_round_down:
2022 inc = a.sign ? rnd_mask : 0;
2023 break;
2024 default:
2025 g_assert_not_reached();
2028 if (a.frac & rnd_mask) {
2029 s->float_exception_flags |= float_flag_inexact;
2030 a.frac += inc;
2031 a.frac &= ~rnd_mask;
2032 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
2033 a.frac >>= 1;
2034 a.exp++;
2038 break;
2039 default:
2040 g_assert_not_reached();
2042 return a;
2045 float16 float16_round_to_int(float16 a, float_status *s)
2047 FloatParts pa = float16_unpack_canonical(a, s);
2048 FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2049 return float16_round_pack_canonical(pr, s);
2052 float32 float32_round_to_int(float32 a, float_status *s)
2054 FloatParts pa = float32_unpack_canonical(a, s);
2055 FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2056 return float32_round_pack_canonical(pr, s);
2059 float64 float64_round_to_int(float64 a, float_status *s)
2061 FloatParts pa = float64_unpack_canonical(a, s);
2062 FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2063 return float64_round_pack_canonical(pr, s);
2067 * Returns the result of converting the floating-point value `a' to
2068 * the two's complement integer format. The conversion is performed
2069 * according to the IEC/IEEE Standard for Binary Floating-Point
2070 * Arithmetic---which means in particular that the conversion is
2071 * rounded according to the current rounding mode. If `a' is a NaN,
2072 * the largest positive integer is returned. Otherwise, if the
2073 * conversion overflows, the largest integer with the same sign as `a'
2074 * is returned.
2077 static int64_t round_to_int_and_pack(FloatParts in, int rmode, int scale,
2078 int64_t min, int64_t max,
2079 float_status *s)
2081 uint64_t r;
2082 int orig_flags = get_float_exception_flags(s);
2083 FloatParts p = round_to_int(in, rmode, scale, s);
2085 switch (p.cls) {
2086 case float_class_snan:
2087 case float_class_qnan:
2088 s->float_exception_flags = orig_flags | float_flag_invalid;
2089 return max;
2090 case float_class_inf:
2091 s->float_exception_flags = orig_flags | float_flag_invalid;
2092 return p.sign ? min : max;
2093 case float_class_zero:
2094 return 0;
2095 case float_class_normal:
2096 if (p.exp < DECOMPOSED_BINARY_POINT) {
2097 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
2098 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
2099 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
2100 } else {
2101 r = UINT64_MAX;
2103 if (p.sign) {
2104 if (r <= -(uint64_t) min) {
2105 return -r;
2106 } else {
2107 s->float_exception_flags = orig_flags | float_flag_invalid;
2108 return min;
2110 } else {
2111 if (r <= max) {
2112 return r;
2113 } else {
2114 s->float_exception_flags = orig_flags | float_flag_invalid;
2115 return max;
2118 default:
2119 g_assert_not_reached();
2123 int16_t float16_to_int16_scalbn(float16 a, int rmode, int scale,
2124 float_status *s)
2126 return round_to_int_and_pack(float16_unpack_canonical(a, s),
2127 rmode, scale, INT16_MIN, INT16_MAX, s);
2130 int32_t float16_to_int32_scalbn(float16 a, int rmode, int scale,
2131 float_status *s)
2133 return round_to_int_and_pack(float16_unpack_canonical(a, s),
2134 rmode, scale, INT32_MIN, INT32_MAX, s);
2137 int64_t float16_to_int64_scalbn(float16 a, int rmode, int scale,
2138 float_status *s)
2140 return round_to_int_and_pack(float16_unpack_canonical(a, s),
2141 rmode, scale, INT64_MIN, INT64_MAX, s);
2144 int16_t float32_to_int16_scalbn(float32 a, int rmode, int scale,
2145 float_status *s)
2147 return round_to_int_and_pack(float32_unpack_canonical(a, s),
2148 rmode, scale, INT16_MIN, INT16_MAX, s);
2151 int32_t float32_to_int32_scalbn(float32 a, int rmode, int scale,
2152 float_status *s)
2154 return round_to_int_and_pack(float32_unpack_canonical(a, s),
2155 rmode, scale, INT32_MIN, INT32_MAX, s);
2158 int64_t float32_to_int64_scalbn(float32 a, int rmode, int scale,
2159 float_status *s)
2161 return round_to_int_and_pack(float32_unpack_canonical(a, s),
2162 rmode, scale, INT64_MIN, INT64_MAX, s);
2165 int16_t float64_to_int16_scalbn(float64 a, int rmode, int scale,
2166 float_status *s)
2168 return round_to_int_and_pack(float64_unpack_canonical(a, s),
2169 rmode, scale, INT16_MIN, INT16_MAX, s);
2172 int32_t float64_to_int32_scalbn(float64 a, int rmode, int scale,
2173 float_status *s)
2175 return round_to_int_and_pack(float64_unpack_canonical(a, s),
2176 rmode, scale, INT32_MIN, INT32_MAX, s);
2179 int64_t float64_to_int64_scalbn(float64 a, int rmode, int scale,
2180 float_status *s)
2182 return round_to_int_and_pack(float64_unpack_canonical(a, s),
2183 rmode, scale, INT64_MIN, INT64_MAX, s);
2186 int16_t float16_to_int16(float16 a, float_status *s)
2188 return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2191 int32_t float16_to_int32(float16 a, float_status *s)
2193 return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2196 int64_t float16_to_int64(float16 a, float_status *s)
2198 return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2201 int16_t float32_to_int16(float32 a, float_status *s)
2203 return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2206 int32_t float32_to_int32(float32 a, float_status *s)
2208 return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2211 int64_t float32_to_int64(float32 a, float_status *s)
2213 return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2216 int16_t float64_to_int16(float64 a, float_status *s)
2218 return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2221 int32_t float64_to_int32(float64 a, float_status *s)
2223 return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2226 int64_t float64_to_int64(float64 a, float_status *s)
2228 return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2231 int16_t float16_to_int16_round_to_zero(float16 a, float_status *s)
2233 return float16_to_int16_scalbn(a, float_round_to_zero, 0, s);
2236 int32_t float16_to_int32_round_to_zero(float16 a, float_status *s)
2238 return float16_to_int32_scalbn(a, float_round_to_zero, 0, s);
2241 int64_t float16_to_int64_round_to_zero(float16 a, float_status *s)
2243 return float16_to_int64_scalbn(a, float_round_to_zero, 0, s);
2246 int16_t float32_to_int16_round_to_zero(float32 a, float_status *s)
2248 return float32_to_int16_scalbn(a, float_round_to_zero, 0, s);
2251 int32_t float32_to_int32_round_to_zero(float32 a, float_status *s)
2253 return float32_to_int32_scalbn(a, float_round_to_zero, 0, s);
2256 int64_t float32_to_int64_round_to_zero(float32 a, float_status *s)
2258 return float32_to_int64_scalbn(a, float_round_to_zero, 0, s);
2261 int16_t float64_to_int16_round_to_zero(float64 a, float_status *s)
2263 return float64_to_int16_scalbn(a, float_round_to_zero, 0, s);
2266 int32_t float64_to_int32_round_to_zero(float64 a, float_status *s)
2268 return float64_to_int32_scalbn(a, float_round_to_zero, 0, s);
2271 int64_t float64_to_int64_round_to_zero(float64 a, float_status *s)
2273 return float64_to_int64_scalbn(a, float_round_to_zero, 0, s);
/*
 *  Returns the result of converting the floating-point value `a' to
 *  the unsigned integer format. The conversion is performed according
 *  to the IEC/IEEE Standard for Binary Floating-Point
 *  Arithmetic---which means in particular that the conversion is
 *  rounded according to the current rounding mode. If `a' is a NaN,
 *  the largest unsigned integer is returned. Otherwise, if the
 *  conversion overflows, the largest unsigned integer is returned. If
 *  `a' is negative, the value is rounded first and zero is returned;
 *  negative values that do not round to zero raise the invalid
 *  exception flag.
 */
2289 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, int scale,
2290 uint64_t max, float_status *s)
2292 int orig_flags = get_float_exception_flags(s);
2293 FloatParts p = round_to_int(in, rmode, scale, s);
2294 uint64_t r;
2296 switch (p.cls) {
2297 case float_class_snan:
2298 case float_class_qnan:
2299 s->float_exception_flags = orig_flags | float_flag_invalid;
2300 return max;
2301 case float_class_inf:
2302 s->float_exception_flags = orig_flags | float_flag_invalid;
2303 return p.sign ? 0 : max;
2304 case float_class_zero:
2305 return 0;
2306 case float_class_normal:
2307 if (p.sign) {
2308 s->float_exception_flags = orig_flags | float_flag_invalid;
2309 return 0;
2312 if (p.exp < DECOMPOSED_BINARY_POINT) {
2313 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
2314 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
2315 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
2316 } else {
2317 s->float_exception_flags = orig_flags | float_flag_invalid;
2318 return max;
2321 /* For uint64 this will never trip, but if p.exp is too large
2322 * to shift a decomposed fraction we shall have exited via the
2323 * 3rd leg above.
2325 if (r > max) {
2326 s->float_exception_flags = orig_flags | float_flag_invalid;
2327 return max;
2329 return r;
2330 default:
2331 g_assert_not_reached();
2335 uint16_t float16_to_uint16_scalbn(float16 a, int rmode, int scale,
2336 float_status *s)
2338 return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2339 rmode, scale, UINT16_MAX, s);
2342 uint32_t float16_to_uint32_scalbn(float16 a, int rmode, int scale,
2343 float_status *s)
2345 return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2346 rmode, scale, UINT32_MAX, s);
2349 uint64_t float16_to_uint64_scalbn(float16 a, int rmode, int scale,
2350 float_status *s)
2352 return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2353 rmode, scale, UINT64_MAX, s);
2356 uint16_t float32_to_uint16_scalbn(float32 a, int rmode, int scale,
2357 float_status *s)
2359 return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2360 rmode, scale, UINT16_MAX, s);
2363 uint32_t float32_to_uint32_scalbn(float32 a, int rmode, int scale,
2364 float_status *s)
2366 return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2367 rmode, scale, UINT32_MAX, s);
2370 uint64_t float32_to_uint64_scalbn(float32 a, int rmode, int scale,
2371 float_status *s)
2373 return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2374 rmode, scale, UINT64_MAX, s);
2377 uint16_t float64_to_uint16_scalbn(float64 a, int rmode, int scale,
2378 float_status *s)
2380 return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2381 rmode, scale, UINT16_MAX, s);
2384 uint32_t float64_to_uint32_scalbn(float64 a, int rmode, int scale,
2385 float_status *s)
2387 return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2388 rmode, scale, UINT32_MAX, s);
2391 uint64_t float64_to_uint64_scalbn(float64 a, int rmode, int scale,
2392 float_status *s)
2394 return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2395 rmode, scale, UINT64_MAX, s);
2398 uint16_t float16_to_uint16(float16 a, float_status *s)
2400 return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2403 uint32_t float16_to_uint32(float16 a, float_status *s)
2405 return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2408 uint64_t float16_to_uint64(float16 a, float_status *s)
2410 return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2413 uint16_t float32_to_uint16(float32 a, float_status *s)
2415 return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2418 uint32_t float32_to_uint32(float32 a, float_status *s)
2420 return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2423 uint64_t float32_to_uint64(float32 a, float_status *s)
2425 return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2428 uint16_t float64_to_uint16(float64 a, float_status *s)
2430 return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2433 uint32_t float64_to_uint32(float64 a, float_status *s)
2435 return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2438 uint64_t float64_to_uint64(float64 a, float_status *s)
2440 return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2443 uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s)
2445 return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2448 uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s)
2450 return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2453 uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s)
2455 return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2458 uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s)
2460 return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2463 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s)
2465 return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2468 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s)
2470 return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2473 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s)
2475 return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2478 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s)
2480 return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2483 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s)
2485 return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2489 * Integer to float conversions
2491 * Returns the result of converting the two's complement integer `a'
2492 * to the floating-point format. The conversion is performed according
2493 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2496 static FloatParts int_to_float(int64_t a, int scale, float_status *status)
2498 FloatParts r = { .sign = false };
2500 if (a == 0) {
2501 r.cls = float_class_zero;
2502 } else {
2503 uint64_t f = a;
2504 int shift;
2506 r.cls = float_class_normal;
2507 if (a < 0) {
2508 f = -f;
2509 r.sign = true;
2511 shift = clz64(f) - 1;
2512 scale = MIN(MAX(scale, -0x10000), 0x10000);
2514 r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2515 r.frac = (shift < 0 ? DECOMPOSED_IMPLICIT_BIT : f << shift);
2518 return r;
2521 float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status)
2523 FloatParts pa = int_to_float(a, scale, status);
2524 return float16_round_pack_canonical(pa, status);
2527 float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status)
2529 return int64_to_float16_scalbn(a, scale, status);
2532 float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status)
2534 return int64_to_float16_scalbn(a, scale, status);
2537 float16 int64_to_float16(int64_t a, float_status *status)
2539 return int64_to_float16_scalbn(a, 0, status);
2542 float16 int32_to_float16(int32_t a, float_status *status)
2544 return int64_to_float16_scalbn(a, 0, status);
2547 float16 int16_to_float16(int16_t a, float_status *status)
2549 return int64_to_float16_scalbn(a, 0, status);
2552 float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status)
2554 FloatParts pa = int_to_float(a, scale, status);
2555 return float32_round_pack_canonical(pa, status);
2558 float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status)
2560 return int64_to_float32_scalbn(a, scale, status);
2563 float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status)
2565 return int64_to_float32_scalbn(a, scale, status);
2568 float32 int64_to_float32(int64_t a, float_status *status)
2570 return int64_to_float32_scalbn(a, 0, status);
2573 float32 int32_to_float32(int32_t a, float_status *status)
2575 return int64_to_float32_scalbn(a, 0, status);
2578 float32 int16_to_float32(int16_t a, float_status *status)
2580 return int64_to_float32_scalbn(a, 0, status);
2583 float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status)
2585 FloatParts pa = int_to_float(a, scale, status);
2586 return float64_round_pack_canonical(pa, status);
2589 float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status)
2591 return int64_to_float64_scalbn(a, scale, status);
2594 float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status)
2596 return int64_to_float64_scalbn(a, scale, status);
2599 float64 int64_to_float64(int64_t a, float_status *status)
2601 return int64_to_float64_scalbn(a, 0, status);
2604 float64 int32_to_float64(int32_t a, float_status *status)
2606 return int64_to_float64_scalbn(a, 0, status);
2609 float64 int16_to_float64(int16_t a, float_status *status)
2611 return int64_to_float64_scalbn(a, 0, status);
2616 * Unsigned Integer to float conversions
2618 * Returns the result of converting the unsigned integer `a' to the
2619 * floating-point format. The conversion is performed according to the
2620 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2623 static FloatParts uint_to_float(uint64_t a, int scale, float_status *status)
2625 FloatParts r = { .sign = false };
2627 if (a == 0) {
2628 r.cls = float_class_zero;
2629 } else {
2630 scale = MIN(MAX(scale, -0x10000), 0x10000);
2631 r.cls = float_class_normal;
2632 if ((int64_t)a < 0) {
2633 r.exp = DECOMPOSED_BINARY_POINT + 1 + scale;
2634 shift64RightJamming(a, 1, &a);
2635 r.frac = a;
2636 } else {
2637 int shift = clz64(a) - 1;
2638 r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2639 r.frac = a << shift;
2643 return r;
2646 float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status)
2648 FloatParts pa = uint_to_float(a, scale, status);
2649 return float16_round_pack_canonical(pa, status);
2652 float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status)
2654 return uint64_to_float16_scalbn(a, scale, status);
2657 float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status)
2659 return uint64_to_float16_scalbn(a, scale, status);
2662 float16 uint64_to_float16(uint64_t a, float_status *status)
2664 return uint64_to_float16_scalbn(a, 0, status);
2667 float16 uint32_to_float16(uint32_t a, float_status *status)
2669 return uint64_to_float16_scalbn(a, 0, status);
2672 float16 uint16_to_float16(uint16_t a, float_status *status)
2674 return uint64_to_float16_scalbn(a, 0, status);
2677 float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status)
2679 FloatParts pa = uint_to_float(a, scale, status);
2680 return float32_round_pack_canonical(pa, status);
2683 float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status)
2685 return uint64_to_float32_scalbn(a, scale, status);
2688 float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status)
2690 return uint64_to_float32_scalbn(a, scale, status);
2693 float32 uint64_to_float32(uint64_t a, float_status *status)
2695 return uint64_to_float32_scalbn(a, 0, status);
2698 float32 uint32_to_float32(uint32_t a, float_status *status)
2700 return uint64_to_float32_scalbn(a, 0, status);
2703 float32 uint16_to_float32(uint16_t a, float_status *status)
2705 return uint64_to_float32_scalbn(a, 0, status);
2708 float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status)
2710 FloatParts pa = uint_to_float(a, scale, status);
2711 return float64_round_pack_canonical(pa, status);
2714 float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status)
2716 return uint64_to_float64_scalbn(a, scale, status);
2719 float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status)
2721 return uint64_to_float64_scalbn(a, scale, status);
2724 float64 uint64_to_float64(uint64_t a, float_status *status)
2726 return uint64_to_float64_scalbn(a, 0, status);
2729 float64 uint32_to_float64(uint32_t a, float_status *status)
2731 return uint64_to_float64_scalbn(a, 0, status);
2734 float64 uint16_to_float64(uint16_t a, float_status *status)
2736 return uint64_to_float64_scalbn(a, 0, status);
2739 /* Float Min/Max */
/* min() and max() functions. These can't be implemented as
 * 'compare and pick one input' because that would mishandle
 * NaNs and +0 vs -0.
 *
 * minnum() and maxnum() functions. These are similar to the min()
 * and max() functions but if one of the arguments is a QNaN and
 * the other is numerical then the numerical argument is returned.
 * SNaNs will get quietened before being returned.
 * minnum() and maxnum() correspond to the IEEE 754-2008 minNum()
 * and maxNum() operations. min() and max() are the typical min/max
 * semantics provided by many CPUs which predate that specification.
 *
 * minnummag() and maxnummag() functions correspond to minNumMag()
 * and maxNumMag() from the IEEE-754 2008.
 */
2755 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin,
2756 bool ieee, bool ismag, float_status *s)
2758 if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
2759 if (ieee) {
2760 /* Takes two floating-point values `a' and `b', one of
2761 * which is a NaN, and returns the appropriate NaN
2762 * result. If either `a' or `b' is a signaling NaN,
2763 * the invalid exception is raised.
2765 if (is_snan(a.cls) || is_snan(b.cls)) {
2766 return pick_nan(a, b, s);
2767 } else if (is_nan(a.cls) && !is_nan(b.cls)) {
2768 return b;
2769 } else if (is_nan(b.cls) && !is_nan(a.cls)) {
2770 return a;
2773 return pick_nan(a, b, s);
2774 } else {
2775 int a_exp, b_exp;
2777 switch (a.cls) {
2778 case float_class_normal:
2779 a_exp = a.exp;
2780 break;
2781 case float_class_inf:
2782 a_exp = INT_MAX;
2783 break;
2784 case float_class_zero:
2785 a_exp = INT_MIN;
2786 break;
2787 default:
2788 g_assert_not_reached();
2789 break;
2791 switch (b.cls) {
2792 case float_class_normal:
2793 b_exp = b.exp;
2794 break;
2795 case float_class_inf:
2796 b_exp = INT_MAX;
2797 break;
2798 case float_class_zero:
2799 b_exp = INT_MIN;
2800 break;
2801 default:
2802 g_assert_not_reached();
2803 break;
2806 if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
2807 bool a_less = a_exp < b_exp;
2808 if (a_exp == b_exp) {
2809 a_less = a.frac < b.frac;
2811 return a_less ^ ismin ? b : a;
2814 if (a.sign == b.sign) {
2815 bool a_less = a_exp < b_exp;
2816 if (a_exp == b_exp) {
2817 a_less = a.frac < b.frac;
2819 return a.sign ^ a_less ^ ismin ? b : a;
2820 } else {
2821 return a.sign ^ ismin ? b : a;
2826 #define MINMAX(sz, name, ismin, isiee, ismag) \
2827 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \
2828 float_status *s) \
2830 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
2831 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
2832 FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \
2834 return float ## sz ## _round_pack_canonical(pr, s); \
2837 MINMAX(16, min, true, false, false)
2838 MINMAX(16, minnum, true, true, false)
2839 MINMAX(16, minnummag, true, true, true)
2840 MINMAX(16, max, false, false, false)
2841 MINMAX(16, maxnum, false, true, false)
2842 MINMAX(16, maxnummag, false, true, true)
2844 MINMAX(32, min, true, false, false)
2845 MINMAX(32, minnum, true, true, false)
2846 MINMAX(32, minnummag, true, true, true)
2847 MINMAX(32, max, false, false, false)
2848 MINMAX(32, maxnum, false, true, false)
2849 MINMAX(32, maxnummag, false, true, true)
2851 MINMAX(64, min, true, false, false)
2852 MINMAX(64, minnum, true, true, false)
2853 MINMAX(64, minnummag, true, true, true)
2854 MINMAX(64, max, false, false, false)
2855 MINMAX(64, maxnum, false, true, false)
2856 MINMAX(64, maxnummag, false, true, true)
2858 #undef MINMAX
2860 /* Floating point compare */
2861 static int compare_floats(FloatParts a, FloatParts b, bool is_quiet,
2862 float_status *s)
2864 if (is_nan(a.cls) || is_nan(b.cls)) {
2865 if (!is_quiet ||
2866 a.cls == float_class_snan ||
2867 b.cls == float_class_snan) {
2868 s->float_exception_flags |= float_flag_invalid;
2870 return float_relation_unordered;
2873 if (a.cls == float_class_zero) {
2874 if (b.cls == float_class_zero) {
2875 return float_relation_equal;
2877 return b.sign ? float_relation_greater : float_relation_less;
2878 } else if (b.cls == float_class_zero) {
2879 return a.sign ? float_relation_less : float_relation_greater;
2882 /* The only really important thing about infinity is its sign. If
2883 * both are infinities the sign marks the smallest of the two.
2885 if (a.cls == float_class_inf) {
2886 if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
2887 return float_relation_equal;
2889 return a.sign ? float_relation_less : float_relation_greater;
2890 } else if (b.cls == float_class_inf) {
2891 return b.sign ? float_relation_greater : float_relation_less;
2894 if (a.sign != b.sign) {
2895 return a.sign ? float_relation_less : float_relation_greater;
2898 if (a.exp == b.exp) {
2899 if (a.frac == b.frac) {
2900 return float_relation_equal;
2902 if (a.sign) {
2903 return a.frac > b.frac ?
2904 float_relation_less : float_relation_greater;
2905 } else {
2906 return a.frac > b.frac ?
2907 float_relation_greater : float_relation_less;
2909 } else {
2910 if (a.sign) {
2911 return a.exp > b.exp ? float_relation_less : float_relation_greater;
2912 } else {
2913 return a.exp > b.exp ? float_relation_greater : float_relation_less;
2918 #define COMPARE(name, attr, sz) \
2919 static int attr \
2920 name(float ## sz a, float ## sz b, bool is_quiet, float_status *s) \
2922 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
2923 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
2924 return compare_floats(pa, pb, is_quiet, s); \
2927 COMPARE(soft_f16_compare, QEMU_FLATTEN, 16)
2928 COMPARE(soft_f32_compare, QEMU_SOFTFLOAT_ATTR, 32)
2929 COMPARE(soft_f64_compare, QEMU_SOFTFLOAT_ATTR, 64)
2931 #undef COMPARE
2933 int float16_compare(float16 a, float16 b, float_status *s)
2935 return soft_f16_compare(a, b, false, s);
2938 int float16_compare_quiet(float16 a, float16 b, float_status *s)
2940 return soft_f16_compare(a, b, true, s);
2943 static int QEMU_FLATTEN
2944 f32_compare(float32 xa, float32 xb, bool is_quiet, float_status *s)
2946 union_float32 ua, ub;
2948 ua.s = xa;
2949 ub.s = xb;
2951 if (QEMU_NO_HARDFLOAT) {
2952 goto soft;
2955 float32_input_flush2(&ua.s, &ub.s, s);
2956 if (isgreaterequal(ua.h, ub.h)) {
2957 if (isgreater(ua.h, ub.h)) {
2958 return float_relation_greater;
2960 return float_relation_equal;
2962 if (likely(isless(ua.h, ub.h))) {
2963 return float_relation_less;
2965 /* The only condition remaining is unordered.
2966 * Fall through to set flags.
2968 soft:
2969 return soft_f32_compare(ua.s, ub.s, is_quiet, s);
2972 int float32_compare(float32 a, float32 b, float_status *s)
2974 return f32_compare(a, b, false, s);
2977 int float32_compare_quiet(float32 a, float32 b, float_status *s)
2979 return f32_compare(a, b, true, s);
2982 static int QEMU_FLATTEN
2983 f64_compare(float64 xa, float64 xb, bool is_quiet, float_status *s)
2985 union_float64 ua, ub;
2987 ua.s = xa;
2988 ub.s = xb;
2990 if (QEMU_NO_HARDFLOAT) {
2991 goto soft;
2994 float64_input_flush2(&ua.s, &ub.s, s);
2995 if (isgreaterequal(ua.h, ub.h)) {
2996 if (isgreater(ua.h, ub.h)) {
2997 return float_relation_greater;
2999 return float_relation_equal;
3001 if (likely(isless(ua.h, ub.h))) {
3002 return float_relation_less;
3004 /* The only condition remaining is unordered.
3005 * Fall through to set flags.
3007 soft:
3008 return soft_f64_compare(ua.s, ub.s, is_quiet, s);
3011 int float64_compare(float64 a, float64 b, float_status *s)
3013 return f64_compare(a, b, false, s);
3016 int float64_compare_quiet(float64 a, float64 b, float_status *s)
3018 return f64_compare(a, b, true, s);
3021 /* Multiply A by 2 raised to the power N. */
3022 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s)
3024 if (unlikely(is_nan(a.cls))) {
3025 return return_nan(a, s);
3027 if (a.cls == float_class_normal) {
3028 /* The largest float type (even though not supported by FloatParts)
3029 * is float128, which has a 15 bit exponent. Bounding N to 16 bits
3030 * still allows rounding to infinity, without allowing overflow
3031 * within the int32_t that backs FloatParts.exp.
3033 n = MIN(MAX(n, -0x10000), 0x10000);
3034 a.exp += n;
3036 return a;
3039 float16 float16_scalbn(float16 a, int n, float_status *status)
3041 FloatParts pa = float16_unpack_canonical(a, status);
3042 FloatParts pr = scalbn_decomposed(pa, n, status);
3043 return float16_round_pack_canonical(pr, status);
3046 float32 float32_scalbn(float32 a, int n, float_status *status)
3048 FloatParts pa = float32_unpack_canonical(a, status);
3049 FloatParts pr = scalbn_decomposed(pa, n, status);
3050 return float32_round_pack_canonical(pr, status);
3053 float64 float64_scalbn(float64 a, int n, float_status *status)
3055 FloatParts pa = float64_unpack_canonical(a, status);
3056 FloatParts pr = scalbn_decomposed(pa, n, status);
3057 return float64_round_pack_canonical(pr, status);
/*
 * Square Root
 *
 * The old softfloat code did an approximation step before zeroing in
 * on the final result. However, for simplicity we just compute the
 * square root by iterating down from the implicit bit to enough extra
 * bits to ensure we get a correctly rounded result.
 *
 * This does mean, however, that the calculation is slower than before,
 * especially for 64 bit floats.
 */
3072 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p)
3074 uint64_t a_frac, r_frac, s_frac;
3075 int bit, last_bit;
3077 if (is_nan(a.cls)) {
3078 return return_nan(a, s);
3080 if (a.cls == float_class_zero) {
3081 return a; /* sqrt(+-0) = +-0 */
3083 if (a.sign) {
3084 s->float_exception_flags |= float_flag_invalid;
3085 return parts_default_nan(s);
3087 if (a.cls == float_class_inf) {
3088 return a; /* sqrt(+inf) = +inf */
3091 assert(a.cls == float_class_normal);
3093 /* We need two overflow bits at the top. Adding room for that is a
3094 * right shift. If the exponent is odd, we can discard the low bit
3095 * by multiplying the fraction by 2; that's a left shift. Combine
3096 * those and we shift right if the exponent is even.
3098 a_frac = a.frac;
3099 if (!(a.exp & 1)) {
3100 a_frac >>= 1;
3102 a.exp >>= 1;
3104 /* Bit-by-bit computation of sqrt. */
3105 r_frac = 0;
3106 s_frac = 0;
3108 /* Iterate from implicit bit down to the 3 extra bits to compute a
3109 * properly rounded result. Remember we've inserted one more bit
3110 * at the top, so these positions are one less.
3112 bit = DECOMPOSED_BINARY_POINT - 1;
3113 last_bit = MAX(p->frac_shift - 4, 0);
3114 do {
3115 uint64_t q = 1ULL << bit;
3116 uint64_t t_frac = s_frac + q;
3117 if (t_frac <= a_frac) {
3118 s_frac = t_frac + q;
3119 a_frac -= t_frac;
3120 r_frac += q;
3122 a_frac <<= 1;
3123 } while (--bit >= last_bit);
3125 /* Undo the right shift done above. If there is any remaining
3126 * fraction, the result is inexact. Set the sticky bit.
3128 a.frac = (r_frac << 1) + (a_frac != 0);
3130 return a;
3133 float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status)
3135 FloatParts pa = float16_unpack_canonical(a, status);
3136 FloatParts pr = sqrt_float(pa, status, &float16_params);
3137 return float16_round_pack_canonical(pr, status);
3140 static float32 QEMU_SOFTFLOAT_ATTR
3141 soft_f32_sqrt(float32 a, float_status *status)
3143 FloatParts pa = float32_unpack_canonical(a, status);
3144 FloatParts pr = sqrt_float(pa, status, &float32_params);
3145 return float32_round_pack_canonical(pr, status);
3148 static float64 QEMU_SOFTFLOAT_ATTR
3149 soft_f64_sqrt(float64 a, float_status *status)
3151 FloatParts pa = float64_unpack_canonical(a, status);
3152 FloatParts pr = sqrt_float(pa, status, &float64_params);
3153 return float64_round_pack_canonical(pr, status);
3156 float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s)
3158 union_float32 ua, ur;
3160 ua.s = xa;
3161 if (unlikely(!can_use_fpu(s))) {
3162 goto soft;
3165 float32_input_flush1(&ua.s, s);
3166 if (QEMU_HARDFLOAT_1F32_USE_FP) {
3167 if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
3168 fpclassify(ua.h) == FP_ZERO) ||
3169 signbit(ua.h))) {
3170 goto soft;
3172 } else if (unlikely(!float32_is_zero_or_normal(ua.s) ||
3173 float32_is_neg(ua.s))) {
3174 goto soft;
3176 ur.h = sqrtf(ua.h);
3177 return ur.s;
3179 soft:
3180 return soft_f32_sqrt(ua.s, s);
3183 float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s)
3185 union_float64 ua, ur;
3187 ua.s = xa;
3188 if (unlikely(!can_use_fpu(s))) {
3189 goto soft;
3192 float64_input_flush1(&ua.s, s);
3193 if (QEMU_HARDFLOAT_1F64_USE_FP) {
3194 if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
3195 fpclassify(ua.h) == FP_ZERO) ||
3196 signbit(ua.h))) {
3197 goto soft;
3199 } else if (unlikely(!float64_is_zero_or_normal(ua.s) ||
3200 float64_is_neg(ua.s))) {
3201 goto soft;
3203 ur.h = sqrt(ua.h);
3204 return ur.s;
3206 soft:
3207 return soft_f64_sqrt(ua.s, s);
3210 /*----------------------------------------------------------------------------
3211 | The pattern for a default generated NaN.
3212 *----------------------------------------------------------------------------*/
3214 float16 float16_default_nan(float_status *status)
3216 FloatParts p = parts_default_nan(status);
3217 p.frac >>= float16_params.frac_shift;
3218 return float16_pack_raw(p);
3221 float32 float32_default_nan(float_status *status)
3223 FloatParts p = parts_default_nan(status);
3224 p.frac >>= float32_params.frac_shift;
3225 return float32_pack_raw(p);
3228 float64 float64_default_nan(float_status *status)
3230 FloatParts p = parts_default_nan(status);
3231 p.frac >>= float64_params.frac_shift;
3232 return float64_pack_raw(p);
3235 float128 float128_default_nan(float_status *status)
3237 FloatParts p = parts_default_nan(status);
3238 float128 r;
3240 /* Extrapolate from the choices made by parts_default_nan to fill
3241 * in the quad-floating format. If the low bit is set, assume we
3242 * want to set all non-snan bits.
3244 r.low = -(p.frac & 1);
3245 r.high = p.frac >> (DECOMPOSED_BINARY_POINT - 48);
3246 r.high |= LIT64(0x7FFF000000000000);
3247 r.high |= (uint64_t)p.sign << 63;
3249 return r;
3252 /*----------------------------------------------------------------------------
3253 | Returns a quiet NaN from a signalling NaN for the floating point value `a'.
3254 *----------------------------------------------------------------------------*/
3256 float16 float16_silence_nan(float16 a, float_status *status)
3258 FloatParts p = float16_unpack_raw(a);
3259 p.frac <<= float16_params.frac_shift;
3260 p = parts_silence_nan(p, status);
3261 p.frac >>= float16_params.frac_shift;
3262 return float16_pack_raw(p);
3265 float32 float32_silence_nan(float32 a, float_status *status)
3267 FloatParts p = float32_unpack_raw(a);
3268 p.frac <<= float32_params.frac_shift;
3269 p = parts_silence_nan(p, status);
3270 p.frac >>= float32_params.frac_shift;
3271 return float32_pack_raw(p);
3274 float64 float64_silence_nan(float64 a, float_status *status)
3276 FloatParts p = float64_unpack_raw(a);
3277 p.frac <<= float64_params.frac_shift;
3278 p = parts_silence_nan(p, status);
3279 p.frac >>= float64_params.frac_shift;
3280 return float64_pack_raw(p);
3283 /*----------------------------------------------------------------------------
3284 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
3285 | and 7, and returns the properly rounded 32-bit integer corresponding to the
3286 | input. If `zSign' is 1, the input is negated before being converted to an
3287 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
3288 | is simply rounded to an integer, with the inexact exception raised if the
3289 | input cannot be represented exactly as an integer. However, if the fixed-
3290 | point input is too large, the invalid exception is raised and the largest
3291 | positive or negative integer is returned.
3292 *----------------------------------------------------------------------------*/
3294 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
3296 int8_t roundingMode;
3297 flag roundNearestEven;
3298 int8_t roundIncrement, roundBits;
3299 int32_t z;
3301 roundingMode = status->float_rounding_mode;
3302 roundNearestEven = ( roundingMode == float_round_nearest_even );
3303 switch (roundingMode) {
3304 case float_round_nearest_even:
3305 case float_round_ties_away:
3306 roundIncrement = 0x40;
3307 break;
3308 case float_round_to_zero:
3309 roundIncrement = 0;
3310 break;
3311 case float_round_up:
3312 roundIncrement = zSign ? 0 : 0x7f;
3313 break;
3314 case float_round_down:
3315 roundIncrement = zSign ? 0x7f : 0;
3316 break;
3317 default:
3318 abort();
3320 roundBits = absZ & 0x7F;
3321 absZ = ( absZ + roundIncrement )>>7;
3322 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
3323 z = absZ;
3324 if ( zSign ) z = - z;
3325 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
3326 float_raise(float_flag_invalid, status);
3327 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
3329 if (roundBits) {
3330 status->float_exception_flags |= float_flag_inexact;
3332 return z;
3336 /*----------------------------------------------------------------------------
3337 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3338 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3339 | and returns the properly rounded 64-bit integer corresponding to the input.
3340 | If `zSign' is 1, the input is negated before being converted to an integer.
3341 | Ordinarily, the fixed-point input is simply rounded to an integer, with
3342 | the inexact exception raised if the input cannot be represented exactly as
3343 | an integer. However, if the fixed-point input is too large, the invalid
3344 | exception is raised and the largest positive or negative integer is
3345 | returned.
3346 *----------------------------------------------------------------------------*/
3348 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
3349 float_status *status)
3351 int8_t roundingMode;
3352 flag roundNearestEven, increment;
3353 int64_t z;
3355 roundingMode = status->float_rounding_mode;
3356 roundNearestEven = ( roundingMode == float_round_nearest_even );
3357 switch (roundingMode) {
3358 case float_round_nearest_even:
3359 case float_round_ties_away:
3360 increment = ((int64_t) absZ1 < 0);
3361 break;
3362 case float_round_to_zero:
3363 increment = 0;
3364 break;
3365 case float_round_up:
3366 increment = !zSign && absZ1;
3367 break;
3368 case float_round_down:
3369 increment = zSign && absZ1;
3370 break;
3371 default:
3372 abort();
3374 if ( increment ) {
3375 ++absZ0;
3376 if ( absZ0 == 0 ) goto overflow;
3377 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
3379 z = absZ0;
3380 if ( zSign ) z = - z;
3381 if ( z && ( ( z < 0 ) ^ zSign ) ) {
3382 overflow:
3383 float_raise(float_flag_invalid, status);
3384 return
3385 zSign ? (int64_t) LIT64( 0x8000000000000000 )
3386 : LIT64( 0x7FFFFFFFFFFFFFFF );
3388 if (absZ1) {
3389 status->float_exception_flags |= float_flag_inexact;
3391 return z;
3395 /*----------------------------------------------------------------------------
3396 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3397 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3398 | and returns the properly rounded 64-bit unsigned integer corresponding to the
3399 | input. Ordinarily, the fixed-point input is simply rounded to an integer,
3400 | with the inexact exception raised if the input cannot be represented exactly
3401 | as an integer. However, if the fixed-point input is too large, the invalid
3402 | exception is raised and the largest unsigned integer is returned.
3403 *----------------------------------------------------------------------------*/
3405 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
3406 uint64_t absZ1, float_status *status)
3408 int8_t roundingMode;
3409 flag roundNearestEven, increment;
3411 roundingMode = status->float_rounding_mode;
3412 roundNearestEven = (roundingMode == float_round_nearest_even);
3413 switch (roundingMode) {
3414 case float_round_nearest_even:
3415 case float_round_ties_away:
3416 increment = ((int64_t)absZ1 < 0);
3417 break;
3418 case float_round_to_zero:
3419 increment = 0;
3420 break;
3421 case float_round_up:
3422 increment = !zSign && absZ1;
3423 break;
3424 case float_round_down:
3425 increment = zSign && absZ1;
3426 break;
3427 default:
3428 abort();
3430 if (increment) {
3431 ++absZ0;
3432 if (absZ0 == 0) {
3433 float_raise(float_flag_invalid, status);
3434 return LIT64(0xFFFFFFFFFFFFFFFF);
3436 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
3439 if (zSign && absZ0) {
3440 float_raise(float_flag_invalid, status);
3441 return 0;
3444 if (absZ1) {
3445 status->float_exception_flags |= float_flag_inexact;
3447 return absZ0;
3450 /*----------------------------------------------------------------------------
3451 | If `a' is denormal and we are in flush-to-zero mode then set the
3452 | input-denormal exception and return zero. Otherwise just return the value.
3453 *----------------------------------------------------------------------------*/
3454 float32 float32_squash_input_denormal(float32 a, float_status *status)
3456 if (status->flush_inputs_to_zero) {
3457 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
3458 float_raise(float_flag_input_denormal, status);
3459 return make_float32(float32_val(a) & 0x80000000);
3462 return a;
/*----------------------------------------------------------------------------
| Normalizes the denormalized single-precision significand `aSig'.  The
| significand is shifted left by clz32(aSig) - 8 (assuming clz32 returns the
| number of leading zero bits, this moves the leading 1 to bit 23) and stored
| through `zSigPtr'; the matching exponent, 1 minus the shift distance, is
| stored through `zExpPtr'.
*----------------------------------------------------------------------------*/

static void
 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
{
    const int8_t leftShift = clz32(aSig) - 8;

    *zSigPtr = aSig << leftShift;
    *zExpPtr = 1 - leftShift;
}
3483 /*----------------------------------------------------------------------------
3484 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3485 | and significand `zSig', and returns the proper single-precision floating-
3486 | point value corresponding to the abstract input. Ordinarily, the abstract
3487 | value is simply rounded and packed into the single-precision format, with
3488 | the inexact exception raised if the abstract input cannot be represented
3489 | exactly. However, if the abstract value is too large, the overflow and
3490 | inexact exceptions are raised and an infinity or maximal finite value is
3491 | returned. If the abstract value is too small, the input value is rounded to
3492 | a subnormal number, and the underflow and inexact exceptions are raised if
3493 | the abstract input cannot be represented exactly as a subnormal single-
3494 | precision floating-point number.
3495 | The input significand `zSig' has its binary point between bits 30
3496 | and 29, which is 7 bits to the left of the usual location. This shifted
3497 | significand must be normalized or smaller. If `zSig' is not normalized,
3498 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3499 | and it must not require rounding. In the usual case that `zSig' is
3500 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3501 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3502 | Binary Floating-Point Arithmetic.
3503 *----------------------------------------------------------------------------*/
3505 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
3506 float_status *status)
3508 int8_t roundingMode;
3509 flag roundNearestEven;
3510 int8_t roundIncrement, roundBits;
3511 flag isTiny;
3513 roundingMode = status->float_rounding_mode;
3514 roundNearestEven = ( roundingMode == float_round_nearest_even );
3515 switch (roundingMode) {
3516 case float_round_nearest_even:
3517 case float_round_ties_away:
3518 roundIncrement = 0x40;
3519 break;
3520 case float_round_to_zero:
3521 roundIncrement = 0;
3522 break;
3523 case float_round_up:
3524 roundIncrement = zSign ? 0 : 0x7f;
3525 break;
3526 case float_round_down:
3527 roundIncrement = zSign ? 0x7f : 0;
3528 break;
3529 default:
3530 abort();
3531 break;
3533 roundBits = zSig & 0x7F;
3534 if ( 0xFD <= (uint16_t) zExp ) {
3535 if ( ( 0xFD < zExp )
3536 || ( ( zExp == 0xFD )
3537 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
3539 float_raise(float_flag_overflow | float_flag_inexact, status);
3540 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
3542 if ( zExp < 0 ) {
3543 if (status->flush_to_zero) {
3544 float_raise(float_flag_output_denormal, status);
3545 return packFloat32(zSign, 0, 0);
3547 isTiny =
3548 (status->float_detect_tininess
3549 == float_tininess_before_rounding)
3550 || ( zExp < -1 )
3551 || ( zSig + roundIncrement < 0x80000000 );
3552 shift32RightJamming( zSig, - zExp, &zSig );
3553 zExp = 0;
3554 roundBits = zSig & 0x7F;
3555 if (isTiny && roundBits) {
3556 float_raise(float_flag_underflow, status);
3560 if (roundBits) {
3561 status->float_exception_flags |= float_flag_inexact;
3563 zSig = ( zSig + roundIncrement )>>7;
3564 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
3565 if ( zSig == 0 ) zExp = 0;
3566 return packFloat32( zSign, zExp, zSig );
3570 /*----------------------------------------------------------------------------
3571 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3572 | and significand `zSig', and returns the proper single-precision floating-
3573 | point value corresponding to the abstract input. This routine is just like
3574 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
3575 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
3576 | floating-point exponent.
3577 *----------------------------------------------------------------------------*/
3579 static float32
3580 normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
3581 float_status *status)
3583 int8_t shiftCount;
3585 shiftCount = clz32(zSig) - 1;
3586 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
3587 status);
3591 /*----------------------------------------------------------------------------
3592 | If `a' is denormal and we are in flush-to-zero mode then set the
3593 | input-denormal exception and return zero. Otherwise just return the value.
3594 *----------------------------------------------------------------------------*/
3595 float64 float64_squash_input_denormal(float64 a, float_status *status)
3597 if (status->flush_inputs_to_zero) {
3598 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
3599 float_raise(float_flag_input_denormal, status);
3600 return make_float64(float64_val(a) & (1ULL << 63));
3603 return a;
/*----------------------------------------------------------------------------
| Normalizes the denormalized double-precision significand `aSig'.  The
| significand is shifted left by clz64(aSig) - 11 (assuming clz64 returns the
| number of leading zero bits, this moves the leading 1 to bit 52) and stored
| through `zSigPtr'; the matching exponent, 1 minus the shift distance, is
| stored through `zExpPtr'.
*----------------------------------------------------------------------------*/

static void
 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
{
    const int8_t leftShift = clz64(aSig) - 11;

    *zSigPtr = aSig << leftShift;
    *zExpPtr = 1 - leftShift;
}
3624 /*----------------------------------------------------------------------------
3625 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3626 | double-precision floating-point value, returning the result. After being
3627 | shifted into the proper positions, the three fields are simply added
3628 | together to form the result. This means that any integer portion of `zSig'
3629 | will be added into the exponent. Since a properly normalized significand
3630 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3631 | than the desired result exponent whenever `zSig' is a complete, normalized
3632 | significand.
3633 *----------------------------------------------------------------------------*/
3635 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
3638 return make_float64(
3639 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
3643 /*----------------------------------------------------------------------------
3644 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3645 | and significand `zSig', and returns the proper double-precision floating-
3646 | point value corresponding to the abstract input. Ordinarily, the abstract
3647 | value is simply rounded and packed into the double-precision format, with
3648 | the inexact exception raised if the abstract input cannot be represented
3649 | exactly. However, if the abstract value is too large, the overflow and
3650 | inexact exceptions are raised and an infinity or maximal finite value is
3651 | returned. If the abstract value is too small, the input value is rounded to
3652 | a subnormal number, and the underflow and inexact exceptions are raised if
3653 | the abstract input cannot be represented exactly as a subnormal double-
3654 | precision floating-point number.
3655 | The input significand `zSig' has its binary point between bits 62
3656 | and 61, which is 10 bits to the left of the usual location. This shifted
3657 | significand must be normalized or smaller. If `zSig' is not normalized,
3658 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3659 | and it must not require rounding. In the usual case that `zSig' is
3660 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3661 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3662 | Binary Floating-Point Arithmetic.
3663 *----------------------------------------------------------------------------*/
3665 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
3666 float_status *status)
3668 int8_t roundingMode;
3669 flag roundNearestEven;
3670 int roundIncrement, roundBits;
3671 flag isTiny;
3673 roundingMode = status->float_rounding_mode;
3674 roundNearestEven = ( roundingMode == float_round_nearest_even );
3675 switch (roundingMode) {
3676 case float_round_nearest_even:
3677 case float_round_ties_away:
3678 roundIncrement = 0x200;
3679 break;
3680 case float_round_to_zero:
3681 roundIncrement = 0;
3682 break;
3683 case float_round_up:
3684 roundIncrement = zSign ? 0 : 0x3ff;
3685 break;
3686 case float_round_down:
3687 roundIncrement = zSign ? 0x3ff : 0;
3688 break;
3689 case float_round_to_odd:
3690 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
3691 break;
3692 default:
3693 abort();
3695 roundBits = zSig & 0x3FF;
3696 if ( 0x7FD <= (uint16_t) zExp ) {
3697 if ( ( 0x7FD < zExp )
3698 || ( ( zExp == 0x7FD )
3699 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
3701 bool overflow_to_inf = roundingMode != float_round_to_odd &&
3702 roundIncrement != 0;
3703 float_raise(float_flag_overflow | float_flag_inexact, status);
3704 return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
3706 if ( zExp < 0 ) {
3707 if (status->flush_to_zero) {
3708 float_raise(float_flag_output_denormal, status);
3709 return packFloat64(zSign, 0, 0);
3711 isTiny =
3712 (status->float_detect_tininess
3713 == float_tininess_before_rounding)
3714 || ( zExp < -1 )
3715 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
3716 shift64RightJamming( zSig, - zExp, &zSig );
3717 zExp = 0;
3718 roundBits = zSig & 0x3FF;
3719 if (isTiny && roundBits) {
3720 float_raise(float_flag_underflow, status);
3722 if (roundingMode == float_round_to_odd) {
3724 * For round-to-odd case, the roundIncrement depends on
3725 * zSig which just changed.
3727 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
3731 if (roundBits) {
3732 status->float_exception_flags |= float_flag_inexact;
3734 zSig = ( zSig + roundIncrement )>>10;
3735 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
3736 if ( zSig == 0 ) zExp = 0;
3737 return packFloat64( zSign, zExp, zSig );
3741 /*----------------------------------------------------------------------------
3742 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3743 | and significand `zSig', and returns the proper double-precision floating-
3744 | point value corresponding to the abstract input. This routine is just like
3745 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
3746 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
3747 | floating-point exponent.
3748 *----------------------------------------------------------------------------*/
3750 static float64
3751 normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
3752 float_status *status)
3754 int8_t shiftCount;
3756 shiftCount = clz64(zSig) - 1;
3757 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
3758 status);
/*----------------------------------------------------------------------------
| Normalizes the denormalized extended double-precision significand `aSig'.
| The significand is shifted left by clz64(aSig) (assuming clz64 returns the
| number of leading zero bits, this moves the leading 1 to bit 63) and stored
| through `zSigPtr'; the matching exponent, 1 minus the shift distance, is
| stored through `zExpPtr'.
*----------------------------------------------------------------------------*/

void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
                                uint64_t *zSigPtr)
{
    const int8_t leftShift = clz64(aSig);

    *zSigPtr = aSig << leftShift;
    *zExpPtr = 1 - leftShift;
}
3779 /*----------------------------------------------------------------------------
3780 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3781 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
3782 | and returns the proper extended double-precision floating-point value
3783 | corresponding to the abstract input. Ordinarily, the abstract value is
3784 | rounded and packed into the extended double-precision format, with the
3785 | inexact exception raised if the abstract input cannot be represented
3786 | exactly. However, if the abstract value is too large, the overflow and
3787 | inexact exceptions are raised and an infinity or maximal finite value is
3788 | returned. If the abstract value is too small, the input value is rounded to
3789 | a subnormal number, and the underflow and inexact exceptions are raised if
3790 | the abstract input cannot be represented exactly as a subnormal extended
3791 | double-precision floating-point number.
3792 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
3793 | number of bits as single or double precision, respectively. Otherwise, the
3794 | result is rounded to the full precision of the extended double-precision
3795 | format.
3796 | The input significand must be normalized or smaller. If the input
3797 | significand is not normalized, `zExp' must be 0; in that case, the result
3798 | returned is a subnormal number, and it must not require rounding. The
3799 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
3800 | Floating-Point Arithmetic.
3801 *----------------------------------------------------------------------------*/
3803 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
3804 int32_t zExp, uint64_t zSig0, uint64_t zSig1,
3805 float_status *status)
3807 int8_t roundingMode;
3808 flag roundNearestEven, increment, isTiny;
3809 int64_t roundIncrement, roundMask, roundBits;
3811 roundingMode = status->float_rounding_mode;
3812 roundNearestEven = ( roundingMode == float_round_nearest_even );
3813 if ( roundingPrecision == 80 ) goto precision80;
3814 if ( roundingPrecision == 64 ) {
3815 roundIncrement = LIT64( 0x0000000000000400 );
3816 roundMask = LIT64( 0x00000000000007FF );
3818 else if ( roundingPrecision == 32 ) {
3819 roundIncrement = LIT64( 0x0000008000000000 );
3820 roundMask = LIT64( 0x000000FFFFFFFFFF );
3822 else {
3823 goto precision80;
3825 zSig0 |= ( zSig1 != 0 );
3826 switch (roundingMode) {
3827 case float_round_nearest_even:
3828 case float_round_ties_away:
3829 break;
3830 case float_round_to_zero:
3831 roundIncrement = 0;
3832 break;
3833 case float_round_up:
3834 roundIncrement = zSign ? 0 : roundMask;
3835 break;
3836 case float_round_down:
3837 roundIncrement = zSign ? roundMask : 0;
3838 break;
3839 default:
3840 abort();
3842 roundBits = zSig0 & roundMask;
3843 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
3844 if ( ( 0x7FFE < zExp )
3845 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
3847 goto overflow;
3849 if ( zExp <= 0 ) {
3850 if (status->flush_to_zero) {
3851 float_raise(float_flag_output_denormal, status);
3852 return packFloatx80(zSign, 0, 0);
3854 isTiny =
3855 (status->float_detect_tininess
3856 == float_tininess_before_rounding)
3857 || ( zExp < 0 )
3858 || ( zSig0 <= zSig0 + roundIncrement );
3859 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
3860 zExp = 0;
3861 roundBits = zSig0 & roundMask;
3862 if (isTiny && roundBits) {
3863 float_raise(float_flag_underflow, status);
3865 if (roundBits) {
3866 status->float_exception_flags |= float_flag_inexact;
3868 zSig0 += roundIncrement;
3869 if ( (int64_t) zSig0 < 0 ) zExp = 1;
3870 roundIncrement = roundMask + 1;
3871 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
3872 roundMask |= roundIncrement;
3874 zSig0 &= ~ roundMask;
3875 return packFloatx80( zSign, zExp, zSig0 );
3878 if (roundBits) {
3879 status->float_exception_flags |= float_flag_inexact;
3881 zSig0 += roundIncrement;
3882 if ( zSig0 < roundIncrement ) {
3883 ++zExp;
3884 zSig0 = LIT64( 0x8000000000000000 );
3886 roundIncrement = roundMask + 1;
3887 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
3888 roundMask |= roundIncrement;
3890 zSig0 &= ~ roundMask;
3891 if ( zSig0 == 0 ) zExp = 0;
3892 return packFloatx80( zSign, zExp, zSig0 );
3893 precision80:
3894 switch (roundingMode) {
3895 case float_round_nearest_even:
3896 case float_round_ties_away:
3897 increment = ((int64_t)zSig1 < 0);
3898 break;
3899 case float_round_to_zero:
3900 increment = 0;
3901 break;
3902 case float_round_up:
3903 increment = !zSign && zSig1;
3904 break;
3905 case float_round_down:
3906 increment = zSign && zSig1;
3907 break;
3908 default:
3909 abort();
3911 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
3912 if ( ( 0x7FFE < zExp )
3913 || ( ( zExp == 0x7FFE )
3914 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
3915 && increment
3918 roundMask = 0;
3919 overflow:
3920 float_raise(float_flag_overflow | float_flag_inexact, status);
3921 if ( ( roundingMode == float_round_to_zero )
3922 || ( zSign && ( roundingMode == float_round_up ) )
3923 || ( ! zSign && ( roundingMode == float_round_down ) )
3925 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
3927 return packFloatx80(zSign,
3928 floatx80_infinity_high,
3929 floatx80_infinity_low);
3931 if ( zExp <= 0 ) {
3932 isTiny =
3933 (status->float_detect_tininess
3934 == float_tininess_before_rounding)
3935 || ( zExp < 0 )
3936 || ! increment
3937 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
3938 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
3939 zExp = 0;
3940 if (isTiny && zSig1) {
3941 float_raise(float_flag_underflow, status);
3943 if (zSig1) {
3944 status->float_exception_flags |= float_flag_inexact;
3946 switch (roundingMode) {
3947 case float_round_nearest_even:
3948 case float_round_ties_away:
3949 increment = ((int64_t)zSig1 < 0);
3950 break;
3951 case float_round_to_zero:
3952 increment = 0;
3953 break;
3954 case float_round_up:
3955 increment = !zSign && zSig1;
3956 break;
3957 case float_round_down:
3958 increment = zSign && zSig1;
3959 break;
3960 default:
3961 abort();
3963 if ( increment ) {
3964 ++zSig0;
3965 zSig0 &=
3966 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
3967 if ( (int64_t) zSig0 < 0 ) zExp = 1;
3969 return packFloatx80( zSign, zExp, zSig0 );
3972 if (zSig1) {
3973 status->float_exception_flags |= float_flag_inexact;
3975 if ( increment ) {
3976 ++zSig0;
3977 if ( zSig0 == 0 ) {
3978 ++zExp;
3979 zSig0 = LIT64( 0x8000000000000000 );
3981 else {
3982 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
3985 else {
3986 if ( zSig0 == 0 ) zExp = 0;
3988 return packFloatx80( zSign, zExp, zSig0 );
3992 /*----------------------------------------------------------------------------
3993 | Takes an abstract floating-point value having sign `zSign', exponent
3994 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
3995 | and returns the proper extended double-precision floating-point value
3996 | corresponding to the abstract input. This routine is just like
3997 | `roundAndPackFloatx80' except that the input significand does not have to be
3998 | normalized.
3999 *----------------------------------------------------------------------------*/
4001 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
4002 flag zSign, int32_t zExp,
4003 uint64_t zSig0, uint64_t zSig1,
4004 float_status *status)
4006 int8_t shiftCount;
4008 if ( zSig0 == 0 ) {
4009 zSig0 = zSig1;
4010 zSig1 = 0;
4011 zExp -= 64;
4013 shiftCount = clz64(zSig0);
4014 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4015 zExp -= shiftCount;
4016 return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
4017 zSig0, zSig1, status);
4021 /*----------------------------------------------------------------------------
4022 | Returns the least-significant 64 fraction bits of the quadruple-precision
4023 | floating-point value `a'.
4024 *----------------------------------------------------------------------------*/
4026 static inline uint64_t extractFloat128Frac1( float128 a )
4029 return a.low;
4033 /*----------------------------------------------------------------------------
4034 | Returns the most-significant 48 fraction bits of the quadruple-precision
4035 | floating-point value `a'.
4036 *----------------------------------------------------------------------------*/
4038 static inline uint64_t extractFloat128Frac0( float128 a )
4041 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
4045 /*----------------------------------------------------------------------------
4046 | Returns the exponent bits of the quadruple-precision floating-point value
4047 | `a'.
4048 *----------------------------------------------------------------------------*/
4050 static inline int32_t extractFloat128Exp( float128 a )
4053 return ( a.high>>48 ) & 0x7FFF;
4057 /*----------------------------------------------------------------------------
4058 | Returns the sign bit of the quadruple-precision floating-point value `a'.
4059 *----------------------------------------------------------------------------*/
4061 static inline flag extractFloat128Sign( float128 a )
4064 return a.high>>63;
/*----------------------------------------------------------------------------
| Normalizes the denormalized quadruple-precision significand formed by
| `aSig0' (high word) and `aSig1' (low word).  The normalized exponent is
| stored through `zExpPtr'; the upper 49 bits of the normalized significand
| go to `zSig0Ptr' and the lower 64 bits to `zSig1Ptr'.
|
| The shift distance is clz64(word) - 15, computed on the high word when it
| is nonzero and on the low word otherwise.
*----------------------------------------------------------------------------*/

static void
 normalizeFloat128Subnormal(
     uint64_t aSig0,
     uint64_t aSig1,
     int32_t *zExpPtr,
     uint64_t *zSig0Ptr,
     uint64_t *zSig1Ptr
 )
{
    int8_t shift;

    if (aSig0 != 0) {
        /* The leading 1 is in the high word: a plain 128-bit left shift. */
        shift = clz64(aSig0) - 15;
        shortShift128Left(aSig0, aSig1, shift, zSig0Ptr, zSig1Ptr);
        *zExpPtr = 1 - shift;
        return;
    }

    /* High word empty: the low word supplies every significant bit. */
    shift = clz64(aSig1) - 15;
    if (shift < 0) {
        /* The low word straddles the boundary between the output words. */
        *zSig0Ptr = aSig1 >> (- shift);
        *zSig1Ptr = aSig1 << (shift & 63);
    } else {
        *zSig0Ptr = aSig1 << shift;
        *zSig1Ptr = 0;
    }
    *zExpPtr = - shift - 63;
}
4109 /*----------------------------------------------------------------------------
4110 | Packs the sign `zSign', the exponent `zExp', and the significand formed
4111 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
4112 | floating-point value, returning the result. After being shifted into the
4113 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
4114 | added together to form the most significant 32 bits of the result. This
4115 | means that any integer portion of `zSig0' will be added into the exponent.
4116 | Since a properly normalized significand will have an integer portion equal
4117 | to 1, the `zExp' input should be 1 less than the desired result exponent
4118 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
4119 | significand.
4120 *----------------------------------------------------------------------------*/
4122 static inline float128
4123 packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
4125 float128 z;
4127 z.low = zSig1;
4128 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
4129 return z;
4133 /*----------------------------------------------------------------------------
4134 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4135 | and extended significand formed by the concatenation of `zSig0', `zSig1',
4136 | and `zSig2', and returns the proper quadruple-precision floating-point value
4137 | corresponding to the abstract input. Ordinarily, the abstract value is
4138 | simply rounded and packed into the quadruple-precision format, with the
4139 | inexact exception raised if the abstract input cannot be represented
4140 | exactly. However, if the abstract value is too large, the overflow and
4141 | inexact exceptions are raised and an infinity or maximal finite value is
4142 | returned. If the abstract value is too small, the input value is rounded to
4143 | a subnormal number, and the underflow and inexact exceptions are raised if
4144 | the abstract input cannot be represented exactly as a subnormal quadruple-
4145 | precision floating-point number.
4146 | The input significand must be normalized or smaller. If the input
4147 | significand is not normalized, `zExp' must be 0; in that case, the result
4148 | returned is a subnormal number, and it must not require rounding. In the
4149 | usual case that the input significand is normalized, `zExp' must be 1 less
4150 | than the ``true'' floating-point exponent. The handling of underflow and
4151 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4152 *----------------------------------------------------------------------------*/
4154 static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
4155 uint64_t zSig0, uint64_t zSig1,
4156 uint64_t zSig2, float_status *status)
4158 int8_t roundingMode;
4159 flag roundNearestEven, increment, isTiny;
4161 roundingMode = status->float_rounding_mode;
4162 roundNearestEven = ( roundingMode == float_round_nearest_even );
4163 switch (roundingMode) {
4164 case float_round_nearest_even:
4165 case float_round_ties_away:
4166 increment = ((int64_t)zSig2 < 0);
4167 break;
4168 case float_round_to_zero:
4169 increment = 0;
4170 break;
4171 case float_round_up:
4172 increment = !zSign && zSig2;
4173 break;
4174 case float_round_down:
4175 increment = zSign && zSig2;
4176 break;
4177 case float_round_to_odd:
4178 increment = !(zSig1 & 0x1) && zSig2;
4179 break;
4180 default:
4181 abort();
4183 if ( 0x7FFD <= (uint32_t) zExp ) {
4184 if ( ( 0x7FFD < zExp )
4185 || ( ( zExp == 0x7FFD )
4186 && eq128(
4187 LIT64( 0x0001FFFFFFFFFFFF ),
4188 LIT64( 0xFFFFFFFFFFFFFFFF ),
4189 zSig0,
4190 zSig1
4192 && increment
4195 float_raise(float_flag_overflow | float_flag_inexact, status);
4196 if ( ( roundingMode == float_round_to_zero )
4197 || ( zSign && ( roundingMode == float_round_up ) )
4198 || ( ! zSign && ( roundingMode == float_round_down ) )
4199 || (roundingMode == float_round_to_odd)
4201 return
4202 packFloat128(
4203 zSign,
4204 0x7FFE,
4205 LIT64( 0x0000FFFFFFFFFFFF ),
4206 LIT64( 0xFFFFFFFFFFFFFFFF )
4209 return packFloat128( zSign, 0x7FFF, 0, 0 );
4211 if ( zExp < 0 ) {
4212 if (status->flush_to_zero) {
4213 float_raise(float_flag_output_denormal, status);
4214 return packFloat128(zSign, 0, 0, 0);
4216 isTiny =
4217 (status->float_detect_tininess
4218 == float_tininess_before_rounding)
4219 || ( zExp < -1 )
4220 || ! increment
4221 || lt128(
4222 zSig0,
4223 zSig1,
4224 LIT64( 0x0001FFFFFFFFFFFF ),
4225 LIT64( 0xFFFFFFFFFFFFFFFF )
4227 shift128ExtraRightJamming(
4228 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
4229 zExp = 0;
4230 if (isTiny && zSig2) {
4231 float_raise(float_flag_underflow, status);
4233 switch (roundingMode) {
4234 case float_round_nearest_even:
4235 case float_round_ties_away:
4236 increment = ((int64_t)zSig2 < 0);
4237 break;
4238 case float_round_to_zero:
4239 increment = 0;
4240 break;
4241 case float_round_up:
4242 increment = !zSign && zSig2;
4243 break;
4244 case float_round_down:
4245 increment = zSign && zSig2;
4246 break;
4247 case float_round_to_odd:
4248 increment = !(zSig1 & 0x1) && zSig2;
4249 break;
4250 default:
4251 abort();
4255 if (zSig2) {
4256 status->float_exception_flags |= float_flag_inexact;
4258 if ( increment ) {
4259 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
4260 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
4262 else {
4263 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
4265 return packFloat128( zSign, zExp, zSig0, zSig1 );
4269 /*----------------------------------------------------------------------------
4270 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4271 | and significand formed by the concatenation of `zSig0' and `zSig1', and
4272 | returns the proper quadruple-precision floating-point value corresponding
4273 | to the abstract input. This routine is just like `roundAndPackFloat128'
4274 | except that the input significand has fewer bits and does not have to be
4275 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
4276 | point exponent.
4277 *----------------------------------------------------------------------------*/
4279 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
4280 uint64_t zSig0, uint64_t zSig1,
4281 float_status *status)
4283 int8_t shiftCount;
4284 uint64_t zSig2;
4286 if ( zSig0 == 0 ) {
4287 zSig0 = zSig1;
4288 zSig1 = 0;
4289 zExp -= 64;
4291 shiftCount = clz64(zSig0) - 15;
4292 if ( 0 <= shiftCount ) {
4293 zSig2 = 0;
4294 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4296 else {
4297 shift128ExtraRightJamming(
4298 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
4300 zExp -= shiftCount;
4301 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
4306 /*----------------------------------------------------------------------------
4307 | Returns the result of converting the 32-bit two's complement integer `a'
4308 | to the extended double-precision floating-point format. The conversion
4309 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4310 | Arithmetic.
4311 *----------------------------------------------------------------------------*/
4313 floatx80 int32_to_floatx80(int32_t a, float_status *status)
4315 flag zSign;
4316 uint32_t absA;
4317 int8_t shiftCount;
4318 uint64_t zSig;
4320 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
4321 zSign = ( a < 0 );
4322 absA = zSign ? - a : a;
4323 shiftCount = clz32(absA) + 32;
4324 zSig = absA;
4325 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
4329 /*----------------------------------------------------------------------------
4330 | Returns the result of converting the 32-bit two's complement integer `a' to
4331 | the quadruple-precision floating-point format. The conversion is performed
4332 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4333 *----------------------------------------------------------------------------*/
4335 float128 int32_to_float128(int32_t a, float_status *status)
4337 flag zSign;
4338 uint32_t absA;
4339 int8_t shiftCount;
4340 uint64_t zSig0;
4342 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
4343 zSign = ( a < 0 );
4344 absA = zSign ? - a : a;
4345 shiftCount = clz32(absA) + 17;
4346 zSig0 = absA;
4347 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
4351 /*----------------------------------------------------------------------------
4352 | Returns the result of converting the 64-bit two's complement integer `a'
4353 | to the extended double-precision floating-point format. The conversion
4354 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4355 | Arithmetic.
4356 *----------------------------------------------------------------------------*/
4358 floatx80 int64_to_floatx80(int64_t a, float_status *status)
4360 flag zSign;
4361 uint64_t absA;
4362 int8_t shiftCount;
4364 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
4365 zSign = ( a < 0 );
4366 absA = zSign ? - a : a;
4367 shiftCount = clz64(absA);
4368 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
4372 /*----------------------------------------------------------------------------
4373 | Returns the result of converting the 64-bit two's complement integer `a' to
4374 | the quadruple-precision floating-point format. The conversion is performed
4375 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4376 *----------------------------------------------------------------------------*/
4378 float128 int64_to_float128(int64_t a, float_status *status)
4380 flag zSign;
4381 uint64_t absA;
4382 int8_t shiftCount;
4383 int32_t zExp;
4384 uint64_t zSig0, zSig1;
4386 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
4387 zSign = ( a < 0 );
4388 absA = zSign ? - a : a;
4389 shiftCount = clz64(absA) + 49;
4390 zExp = 0x406E - shiftCount;
4391 if ( 64 <= shiftCount ) {
4392 zSig1 = 0;
4393 zSig0 = absA;
4394 shiftCount -= 64;
4396 else {
4397 zSig1 = absA;
4398 zSig0 = 0;
4400 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4401 return packFloat128( zSign, zExp, zSig0, zSig1 );
4405 /*----------------------------------------------------------------------------
4406 | Returns the result of converting the 64-bit unsigned integer `a'
4407 | to the quadruple-precision floating-point format. The conversion is performed
4408 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4409 *----------------------------------------------------------------------------*/
4411 float128 uint64_to_float128(uint64_t a, float_status *status)
4413 if (a == 0) {
4414 return float128_zero;
4416 return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status);
4419 /*----------------------------------------------------------------------------
4420 | Returns the result of converting the single-precision floating-point value
4421 | `a' to the extended double-precision floating-point format. The conversion
4422 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4423 | Arithmetic.
4424 *----------------------------------------------------------------------------*/
4426 floatx80 float32_to_floatx80(float32 a, float_status *status)
4428 flag aSign;
4429 int aExp;
4430 uint32_t aSig;
4432 a = float32_squash_input_denormal(a, status);
4433 aSig = extractFloat32Frac( a );
4434 aExp = extractFloat32Exp( a );
4435 aSign = extractFloat32Sign( a );
4436 if ( aExp == 0xFF ) {
4437 if (aSig) {
4438 return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
4440 return packFloatx80(aSign,
4441 floatx80_infinity_high,
4442 floatx80_infinity_low);
4444 if ( aExp == 0 ) {
4445 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4446 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4448 aSig |= 0x00800000;
4449 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
4453 /*----------------------------------------------------------------------------
4454 | Returns the result of converting the single-precision floating-point value
4455 | `a' to the double-precision floating-point format. The conversion is
4456 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4457 | Arithmetic.
4458 *----------------------------------------------------------------------------*/
4460 float128 float32_to_float128(float32 a, float_status *status)
4462 flag aSign;
4463 int aExp;
4464 uint32_t aSig;
4466 a = float32_squash_input_denormal(a, status);
4467 aSig = extractFloat32Frac( a );
4468 aExp = extractFloat32Exp( a );
4469 aSign = extractFloat32Sign( a );
4470 if ( aExp == 0xFF ) {
4471 if (aSig) {
4472 return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
4474 return packFloat128( aSign, 0x7FFF, 0, 0 );
4476 if ( aExp == 0 ) {
4477 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4478 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4479 --aExp;
4481 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
4485 /*----------------------------------------------------------------------------
4486 | Returns the remainder of the single-precision floating-point value `a'
4487 | with respect to the corresponding value `b'. The operation is performed
4488 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4489 *----------------------------------------------------------------------------*/
4491 float32 float32_rem(float32 a, float32 b, float_status *status)
4493 flag aSign, zSign;
4494 int aExp, bExp, expDiff;
4495 uint32_t aSig, bSig;
4496 uint32_t q;
4497 uint64_t aSig64, bSig64, q64;
4498 uint32_t alternateASig;
4499 int32_t sigMean;
4500 a = float32_squash_input_denormal(a, status);
4501 b = float32_squash_input_denormal(b, status);
4503 aSig = extractFloat32Frac( a );
4504 aExp = extractFloat32Exp( a );
4505 aSign = extractFloat32Sign( a );
4506 bSig = extractFloat32Frac( b );
4507 bExp = extractFloat32Exp( b );
4508 if ( aExp == 0xFF ) {
4509 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
4510 return propagateFloat32NaN(a, b, status);
4512 float_raise(float_flag_invalid, status);
4513 return float32_default_nan(status);
4515 if ( bExp == 0xFF ) {
4516 if (bSig) {
4517 return propagateFloat32NaN(a, b, status);
4519 return a;
4521 if ( bExp == 0 ) {
4522 if ( bSig == 0 ) {
4523 float_raise(float_flag_invalid, status);
4524 return float32_default_nan(status);
4526 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
4528 if ( aExp == 0 ) {
4529 if ( aSig == 0 ) return a;
4530 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4532 expDiff = aExp - bExp;
4533 aSig |= 0x00800000;
4534 bSig |= 0x00800000;
4535 if ( expDiff < 32 ) {
4536 aSig <<= 8;
4537 bSig <<= 8;
4538 if ( expDiff < 0 ) {
4539 if ( expDiff < -1 ) return a;
4540 aSig >>= 1;
4542 q = ( bSig <= aSig );
4543 if ( q ) aSig -= bSig;
4544 if ( 0 < expDiff ) {
4545 q = ( ( (uint64_t) aSig )<<32 ) / bSig;
4546 q >>= 32 - expDiff;
4547 bSig >>= 2;
4548 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4550 else {
4551 aSig >>= 2;
4552 bSig >>= 2;
4555 else {
4556 if ( bSig <= aSig ) aSig -= bSig;
4557 aSig64 = ( (uint64_t) aSig )<<40;
4558 bSig64 = ( (uint64_t) bSig )<<40;
4559 expDiff -= 64;
4560 while ( 0 < expDiff ) {
4561 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
4562 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
4563 aSig64 = - ( ( bSig * q64 )<<38 );
4564 expDiff -= 62;
4566 expDiff += 64;
4567 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
4568 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
4569 q = q64>>( 64 - expDiff );
4570 bSig <<= 6;
4571 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
4573 do {
4574 alternateASig = aSig;
4575 ++q;
4576 aSig -= bSig;
4577 } while ( 0 <= (int32_t) aSig );
4578 sigMean = aSig + alternateASig;
4579 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4580 aSig = alternateASig;
4582 zSign = ( (int32_t) aSig < 0 );
4583 if ( zSign ) aSig = - aSig;
4584 return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
/*----------------------------------------------------------------------------
| Returns the binary exponential of the single-precision floating-point value
| `a'. The operation is performed according to the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
|
| Uses the following identities:
|
| 1. -------------------------------------------------------------------------
|      x    x*ln(2)
|     2  = e
|
| 2. -------------------------------------------------------------------------
|                      2     3     4     5           n
|      x        x     x     x     x     x           x
|     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
|               1!    2!    3!    4!    5!          n!
*----------------------------------------------------------------------------*/
4607 static const float64 float32_exp2_coefficients[15] =
4609 const_float64( 0x3ff0000000000000ll ), /* 1 */
4610 const_float64( 0x3fe0000000000000ll ), /* 2 */
4611 const_float64( 0x3fc5555555555555ll ), /* 3 */
4612 const_float64( 0x3fa5555555555555ll ), /* 4 */
4613 const_float64( 0x3f81111111111111ll ), /* 5 */
4614 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */
4615 const_float64( 0x3f2a01a01a01a01all ), /* 7 */
4616 const_float64( 0x3efa01a01a01a01all ), /* 8 */
4617 const_float64( 0x3ec71de3a556c734ll ), /* 9 */
4618 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
4619 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
4620 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
4621 const_float64( 0x3de6124613a86d09ll ), /* 13 */
4622 const_float64( 0x3da93974a8c07c9dll ), /* 14 */
4623 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
4626 float32 float32_exp2(float32 a, float_status *status)
4628 flag aSign;
4629 int aExp;
4630 uint32_t aSig;
4631 float64 r, x, xn;
4632 int i;
4633 a = float32_squash_input_denormal(a, status);
4635 aSig = extractFloat32Frac( a );
4636 aExp = extractFloat32Exp( a );
4637 aSign = extractFloat32Sign( a );
4639 if ( aExp == 0xFF) {
4640 if (aSig) {
4641 return propagateFloat32NaN(a, float32_zero, status);
4643 return (aSign) ? float32_zero : a;
4645 if (aExp == 0) {
4646 if (aSig == 0) return float32_one;
4649 float_raise(float_flag_inexact, status);
4651 /* ******************************* */
4652 /* using float64 for approximation */
4653 /* ******************************* */
4654 x = float32_to_float64(a, status);
4655 x = float64_mul(x, float64_ln2, status);
4657 xn = x;
4658 r = float64_one;
4659 for (i = 0 ; i < 15 ; i++) {
4660 float64 f;
4662 f = float64_mul(xn, float32_exp2_coefficients[i], status);
4663 r = float64_add(r, f, status);
4665 xn = float64_mul(xn, x, status);
4668 return float64_to_float32(r, status);
4671 /*----------------------------------------------------------------------------
4672 | Returns the binary log of the single-precision floating-point value `a'.
4673 | The operation is performed according to the IEC/IEEE Standard for Binary
4674 | Floating-Point Arithmetic.
4675 *----------------------------------------------------------------------------*/
4676 float32 float32_log2(float32 a, float_status *status)
4678 flag aSign, zSign;
4679 int aExp;
4680 uint32_t aSig, zSig, i;
4682 a = float32_squash_input_denormal(a, status);
4683 aSig = extractFloat32Frac( a );
4684 aExp = extractFloat32Exp( a );
4685 aSign = extractFloat32Sign( a );
4687 if ( aExp == 0 ) {
4688 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
4689 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4691 if ( aSign ) {
4692 float_raise(float_flag_invalid, status);
4693 return float32_default_nan(status);
4695 if ( aExp == 0xFF ) {
4696 if (aSig) {
4697 return propagateFloat32NaN(a, float32_zero, status);
4699 return a;
4702 aExp -= 0x7F;
4703 aSig |= 0x00800000;
4704 zSign = aExp < 0;
4705 zSig = aExp << 23;
4707 for (i = 1 << 22; i > 0; i >>= 1) {
4708 aSig = ( (uint64_t)aSig * aSig ) >> 23;
4709 if ( aSig & 0x01000000 ) {
4710 aSig >>= 1;
4711 zSig |= i;
4715 if ( zSign )
4716 zSig = -zSig;
4718 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
4721 /*----------------------------------------------------------------------------
4722 | Returns 1 if the single-precision floating-point value `a' is equal to
4723 | the corresponding value `b', and 0 otherwise. The invalid exception is
4724 | raised if either operand is a NaN. Otherwise, the comparison is performed
4725 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4726 *----------------------------------------------------------------------------*/
4728 int float32_eq(float32 a, float32 b, float_status *status)
4730 uint32_t av, bv;
4731 a = float32_squash_input_denormal(a, status);
4732 b = float32_squash_input_denormal(b, status);
4734 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4735 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4737 float_raise(float_flag_invalid, status);
4738 return 0;
4740 av = float32_val(a);
4741 bv = float32_val(b);
4742 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
4745 /*----------------------------------------------------------------------------
4746 | Returns 1 if the single-precision floating-point value `a' is less than
4747 | or equal to the corresponding value `b', and 0 otherwise. The invalid
4748 | exception is raised if either operand is a NaN. The comparison is performed
4749 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4750 *----------------------------------------------------------------------------*/
4752 int float32_le(float32 a, float32 b, float_status *status)
4754 flag aSign, bSign;
4755 uint32_t av, bv;
4756 a = float32_squash_input_denormal(a, status);
4757 b = float32_squash_input_denormal(b, status);
4759 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4760 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4762 float_raise(float_flag_invalid, status);
4763 return 0;
4765 aSign = extractFloat32Sign( a );
4766 bSign = extractFloat32Sign( b );
4767 av = float32_val(a);
4768 bv = float32_val(b);
4769 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
4770 return ( av == bv ) || ( aSign ^ ( av < bv ) );
4774 /*----------------------------------------------------------------------------
4775 | Returns 1 if the single-precision floating-point value `a' is less than
4776 | the corresponding value `b', and 0 otherwise. The invalid exception is
4777 | raised if either operand is a NaN. The comparison is performed according
4778 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4779 *----------------------------------------------------------------------------*/
4781 int float32_lt(float32 a, float32 b, float_status *status)
4783 flag aSign, bSign;
4784 uint32_t av, bv;
4785 a = float32_squash_input_denormal(a, status);
4786 b = float32_squash_input_denormal(b, status);
4788 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4789 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4791 float_raise(float_flag_invalid, status);
4792 return 0;
4794 aSign = extractFloat32Sign( a );
4795 bSign = extractFloat32Sign( b );
4796 av = float32_val(a);
4797 bv = float32_val(b);
4798 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
4799 return ( av != bv ) && ( aSign ^ ( av < bv ) );
4803 /*----------------------------------------------------------------------------
4804 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
4805 | be compared, and 0 otherwise. The invalid exception is raised if either
4806 | operand is a NaN. The comparison is performed according to the IEC/IEEE
4807 | Standard for Binary Floating-Point Arithmetic.
4808 *----------------------------------------------------------------------------*/
4810 int float32_unordered(float32 a, float32 b, float_status *status)
4812 a = float32_squash_input_denormal(a, status);
4813 b = float32_squash_input_denormal(b, status);
4815 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4816 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4818 float_raise(float_flag_invalid, status);
4819 return 1;
4821 return 0;
4824 /*----------------------------------------------------------------------------
4825 | Returns 1 if the single-precision floating-point value `a' is equal to
4826 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4827 | exception. The comparison is performed according to the IEC/IEEE Standard
4828 | for Binary Floating-Point Arithmetic.
4829 *----------------------------------------------------------------------------*/
4831 int float32_eq_quiet(float32 a, float32 b, float_status *status)
4833 a = float32_squash_input_denormal(a, status);
4834 b = float32_squash_input_denormal(b, status);
4836 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4837 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4839 if (float32_is_signaling_nan(a, status)
4840 || float32_is_signaling_nan(b, status)) {
4841 float_raise(float_flag_invalid, status);
4843 return 0;
4845 return ( float32_val(a) == float32_val(b) ) ||
4846 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
4849 /*----------------------------------------------------------------------------
4850 | Returns 1 if the single-precision floating-point value `a' is less than or
4851 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4852 | cause an exception. Otherwise, the comparison is performed according to the
4853 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4854 *----------------------------------------------------------------------------*/
4856 int float32_le_quiet(float32 a, float32 b, float_status *status)
4858 flag aSign, bSign;
4859 uint32_t av, bv;
4860 a = float32_squash_input_denormal(a, status);
4861 b = float32_squash_input_denormal(b, status);
4863 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4864 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4866 if (float32_is_signaling_nan(a, status)
4867 || float32_is_signaling_nan(b, status)) {
4868 float_raise(float_flag_invalid, status);
4870 return 0;
4872 aSign = extractFloat32Sign( a );
4873 bSign = extractFloat32Sign( b );
4874 av = float32_val(a);
4875 bv = float32_val(b);
4876 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
4877 return ( av == bv ) || ( aSign ^ ( av < bv ) );
4881 /*----------------------------------------------------------------------------
4882 | Returns 1 if the single-precision floating-point value `a' is less than
4883 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4884 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
4885 | Standard for Binary Floating-Point Arithmetic.
4886 *----------------------------------------------------------------------------*/
4888 int float32_lt_quiet(float32 a, float32 b, float_status *status)
4890 flag aSign, bSign;
4891 uint32_t av, bv;
4892 a = float32_squash_input_denormal(a, status);
4893 b = float32_squash_input_denormal(b, status);
4895 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4896 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4898 if (float32_is_signaling_nan(a, status)
4899 || float32_is_signaling_nan(b, status)) {
4900 float_raise(float_flag_invalid, status);
4902 return 0;
4904 aSign = extractFloat32Sign( a );
4905 bSign = extractFloat32Sign( b );
4906 av = float32_val(a);
4907 bv = float32_val(b);
4908 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
4909 return ( av != bv ) && ( aSign ^ ( av < bv ) );
4913 /*----------------------------------------------------------------------------
4914 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
4915 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
4916 | comparison is performed according to the IEC/IEEE Standard for Binary
4917 | Floating-Point Arithmetic.
4918 *----------------------------------------------------------------------------*/
4920 int float32_unordered_quiet(float32 a, float32 b, float_status *status)
4922 a = float32_squash_input_denormal(a, status);
4923 b = float32_squash_input_denormal(b, status);
4925 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4926 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4928 if (float32_is_signaling_nan(a, status)
4929 || float32_is_signaling_nan(b, status)) {
4930 float_raise(float_flag_invalid, status);
4932 return 1;
4934 return 0;
4937 /*----------------------------------------------------------------------------
4938 | If `a' is denormal and we are in flush-to-zero mode then set the
4939 | input-denormal exception and return zero. Otherwise just return the value.
4940 *----------------------------------------------------------------------------*/
4941 float16 float16_squash_input_denormal(float16 a, float_status *status)
4943 if (status->flush_inputs_to_zero) {
4944 if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) {
4945 float_raise(float_flag_input_denormal, status);
4946 return make_float16(float16_val(a) & 0x8000);
4949 return a;
4952 /*----------------------------------------------------------------------------
4953 | Returns the result of converting the double-precision floating-point value
4954 | `a' to the extended double-precision floating-point format. The conversion
4955 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4956 | Arithmetic.
4957 *----------------------------------------------------------------------------*/
4959 floatx80 float64_to_floatx80(float64 a, float_status *status)
4961 flag aSign;
4962 int aExp;
4963 uint64_t aSig;
4965 a = float64_squash_input_denormal(a, status);
4966 aSig = extractFloat64Frac( a );
4967 aExp = extractFloat64Exp( a );
4968 aSign = extractFloat64Sign( a );
4969 if ( aExp == 0x7FF ) {
4970 if (aSig) {
4971 return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
4973 return packFloatx80(aSign,
4974 floatx80_infinity_high,
4975 floatx80_infinity_low);
4977 if ( aExp == 0 ) {
4978 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4979 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4981 return
4982 packFloatx80(
4983 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
4987 /*----------------------------------------------------------------------------
4988 | Returns the result of converting the double-precision floating-point value
4989 | `a' to the quadruple-precision floating-point format. The conversion is
4990 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4991 | Arithmetic.
4992 *----------------------------------------------------------------------------*/
4994 float128 float64_to_float128(float64 a, float_status *status)
4996 flag aSign;
4997 int aExp;
4998 uint64_t aSig, zSig0, zSig1;
5000 a = float64_squash_input_denormal(a, status);
5001 aSig = extractFloat64Frac( a );
5002 aExp = extractFloat64Exp( a );
5003 aSign = extractFloat64Sign( a );
5004 if ( aExp == 0x7FF ) {
5005 if (aSig) {
5006 return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
5008 return packFloat128( aSign, 0x7FFF, 0, 0 );
5010 if ( aExp == 0 ) {
5011 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
5012 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5013 --aExp;
5015 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
5016 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
5021 /*----------------------------------------------------------------------------
5022 | Returns the remainder of the double-precision floating-point value `a'
5023 | with respect to the corresponding value `b'. The operation is performed
5024 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5025 *----------------------------------------------------------------------------*/
5027 float64 float64_rem(float64 a, float64 b, float_status *status)
5029 flag aSign, zSign;
5030 int aExp, bExp, expDiff;
5031 uint64_t aSig, bSig;
5032 uint64_t q, alternateASig;
5033 int64_t sigMean;
5035 a = float64_squash_input_denormal(a, status);
5036 b = float64_squash_input_denormal(b, status);
5037 aSig = extractFloat64Frac( a );
5038 aExp = extractFloat64Exp( a );
5039 aSign = extractFloat64Sign( a );
5040 bSig = extractFloat64Frac( b );
5041 bExp = extractFloat64Exp( b );
5042 if ( aExp == 0x7FF ) {
5043 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
5044 return propagateFloat64NaN(a, b, status);
5046 float_raise(float_flag_invalid, status);
5047 return float64_default_nan(status);
5049 if ( bExp == 0x7FF ) {
5050 if (bSig) {
5051 return propagateFloat64NaN(a, b, status);
5053 return a;
5055 if ( bExp == 0 ) {
5056 if ( bSig == 0 ) {
5057 float_raise(float_flag_invalid, status);
5058 return float64_default_nan(status);
5060 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
5062 if ( aExp == 0 ) {
5063 if ( aSig == 0 ) return a;
5064 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5066 expDiff = aExp - bExp;
5067 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
5068 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
5069 if ( expDiff < 0 ) {
5070 if ( expDiff < -1 ) return a;
5071 aSig >>= 1;
5073 q = ( bSig <= aSig );
5074 if ( q ) aSig -= bSig;
5075 expDiff -= 64;
5076 while ( 0 < expDiff ) {
5077 q = estimateDiv128To64( aSig, 0, bSig );
5078 q = ( 2 < q ) ? q - 2 : 0;
5079 aSig = - ( ( bSig>>2 ) * q );
5080 expDiff -= 62;
5082 expDiff += 64;
5083 if ( 0 < expDiff ) {
5084 q = estimateDiv128To64( aSig, 0, bSig );
5085 q = ( 2 < q ) ? q - 2 : 0;
5086 q >>= 64 - expDiff;
5087 bSig >>= 2;
5088 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
5090 else {
5091 aSig >>= 2;
5092 bSig >>= 2;
5094 do {
5095 alternateASig = aSig;
5096 ++q;
5097 aSig -= bSig;
5098 } while ( 0 <= (int64_t) aSig );
5099 sigMean = aSig + alternateASig;
5100 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
5101 aSig = alternateASig;
5103 zSign = ( (int64_t) aSig < 0 );
5104 if ( zSign ) aSig = - aSig;
5105 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
5109 /*----------------------------------------------------------------------------
5110 | Returns the binary log of the double-precision floating-point value `a'.
5111 | The operation is performed according to the IEC/IEEE Standard for Binary
5112 | Floating-Point Arithmetic.
5113 *----------------------------------------------------------------------------*/
5114 float64 float64_log2(float64 a, float_status *status)
5116 flag aSign, zSign;
5117 int aExp;
5118 uint64_t aSig, aSig0, aSig1, zSig, i;
5119 a = float64_squash_input_denormal(a, status);
5121 aSig = extractFloat64Frac( a );
5122 aExp = extractFloat64Exp( a );
5123 aSign = extractFloat64Sign( a );
5125 if ( aExp == 0 ) {
5126 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
5127 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5129 if ( aSign ) {
5130 float_raise(float_flag_invalid, status);
5131 return float64_default_nan(status);
5133 if ( aExp == 0x7FF ) {
5134 if (aSig) {
5135 return propagateFloat64NaN(a, float64_zero, status);
5137 return a;
5140 aExp -= 0x3FF;
5141 aSig |= LIT64( 0x0010000000000000 );
5142 zSign = aExp < 0;
5143 zSig = (uint64_t)aExp << 52;
5144 for (i = 1LL << 51; i > 0; i >>= 1) {
5145 mul64To128( aSig, aSig, &aSig0, &aSig1 );
5146 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
5147 if ( aSig & LIT64( 0x0020000000000000 ) ) {
5148 aSig >>= 1;
5149 zSig |= i;
5153 if ( zSign )
5154 zSig = -zSig;
5155 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
5158 /*----------------------------------------------------------------------------
5159 | Returns 1 if the double-precision floating-point value `a' is equal to the
5160 | corresponding value `b', and 0 otherwise. The invalid exception is raised
5161 | if either operand is a NaN. Otherwise, the comparison is performed
5162 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5163 *----------------------------------------------------------------------------*/
5165 int float64_eq(float64 a, float64 b, float_status *status)
5167 uint64_t av, bv;
5168 a = float64_squash_input_denormal(a, status);
5169 b = float64_squash_input_denormal(b, status);
5171 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5172 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5174 float_raise(float_flag_invalid, status);
5175 return 0;
5177 av = float64_val(a);
5178 bv = float64_val(b);
5179 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
5183 /*----------------------------------------------------------------------------
5184 | Returns 1 if the double-precision floating-point value `a' is less than or
5185 | equal to the corresponding value `b', and 0 otherwise. The invalid
5186 | exception is raised if either operand is a NaN. The comparison is performed
5187 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5188 *----------------------------------------------------------------------------*/
5190 int float64_le(float64 a, float64 b, float_status *status)
5192 flag aSign, bSign;
5193 uint64_t av, bv;
5194 a = float64_squash_input_denormal(a, status);
5195 b = float64_squash_input_denormal(b, status);
5197 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5198 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5200 float_raise(float_flag_invalid, status);
5201 return 0;
5203 aSign = extractFloat64Sign( a );
5204 bSign = extractFloat64Sign( b );
5205 av = float64_val(a);
5206 bv = float64_val(b);
5207 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
5208 return ( av == bv ) || ( aSign ^ ( av < bv ) );
5212 /*----------------------------------------------------------------------------
5213 | Returns 1 if the double-precision floating-point value `a' is less than
5214 | the corresponding value `b', and 0 otherwise. The invalid exception is
5215 | raised if either operand is a NaN. The comparison is performed according
5216 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5217 *----------------------------------------------------------------------------*/
5219 int float64_lt(float64 a, float64 b, float_status *status)
5221 flag aSign, bSign;
5222 uint64_t av, bv;
5224 a = float64_squash_input_denormal(a, status);
5225 b = float64_squash_input_denormal(b, status);
5226 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5227 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5229 float_raise(float_flag_invalid, status);
5230 return 0;
5232 aSign = extractFloat64Sign( a );
5233 bSign = extractFloat64Sign( b );
5234 av = float64_val(a);
5235 bv = float64_val(b);
5236 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
5237 return ( av != bv ) && ( aSign ^ ( av < bv ) );
5241 /*----------------------------------------------------------------------------
5242 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
5243 | be compared, and 0 otherwise. The invalid exception is raised if either
5244 | operand is a NaN. The comparison is performed according to the IEC/IEEE
5245 | Standard for Binary Floating-Point Arithmetic.
5246 *----------------------------------------------------------------------------*/
5248 int float64_unordered(float64 a, float64 b, float_status *status)
5250 a = float64_squash_input_denormal(a, status);
5251 b = float64_squash_input_denormal(b, status);
5253 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5254 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5256 float_raise(float_flag_invalid, status);
5257 return 1;
5259 return 0;
5262 /*----------------------------------------------------------------------------
5263 | Returns 1 if the double-precision floating-point value `a' is equal to the
5264 | corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5265 | exception. The comparison is performed according to the IEC/IEEE Standard
5266 | for Binary Floating-Point Arithmetic.
5267 *----------------------------------------------------------------------------*/
5269 int float64_eq_quiet(float64 a, float64 b, float_status *status)
5271 uint64_t av, bv;
5272 a = float64_squash_input_denormal(a, status);
5273 b = float64_squash_input_denormal(b, status);
5275 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5276 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5278 if (float64_is_signaling_nan(a, status)
5279 || float64_is_signaling_nan(b, status)) {
5280 float_raise(float_flag_invalid, status);
5282 return 0;
5284 av = float64_val(a);
5285 bv = float64_val(b);
5286 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
5290 /*----------------------------------------------------------------------------
5291 | Returns 1 if the double-precision floating-point value `a' is less than or
5292 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5293 | cause an exception. Otherwise, the comparison is performed according to the
5294 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5295 *----------------------------------------------------------------------------*/
5297 int float64_le_quiet(float64 a, float64 b, float_status *status)
5299 flag aSign, bSign;
5300 uint64_t av, bv;
5301 a = float64_squash_input_denormal(a, status);
5302 b = float64_squash_input_denormal(b, status);
5304 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5305 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5307 if (float64_is_signaling_nan(a, status)
5308 || float64_is_signaling_nan(b, status)) {
5309 float_raise(float_flag_invalid, status);
5311 return 0;
5313 aSign = extractFloat64Sign( a );
5314 bSign = extractFloat64Sign( b );
5315 av = float64_val(a);
5316 bv = float64_val(b);
5317 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
5318 return ( av == bv ) || ( aSign ^ ( av < bv ) );
5322 /*----------------------------------------------------------------------------
5323 | Returns 1 if the double-precision floating-point value `a' is less than
5324 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5325 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
5326 | Standard for Binary Floating-Point Arithmetic.
5327 *----------------------------------------------------------------------------*/
5329 int float64_lt_quiet(float64 a, float64 b, float_status *status)
5331 flag aSign, bSign;
5332 uint64_t av, bv;
5333 a = float64_squash_input_denormal(a, status);
5334 b = float64_squash_input_denormal(b, status);
5336 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5337 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5339 if (float64_is_signaling_nan(a, status)
5340 || float64_is_signaling_nan(b, status)) {
5341 float_raise(float_flag_invalid, status);
5343 return 0;
5345 aSign = extractFloat64Sign( a );
5346 bSign = extractFloat64Sign( b );
5347 av = float64_val(a);
5348 bv = float64_val(b);
5349 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
5350 return ( av != bv ) && ( aSign ^ ( av < bv ) );
5354 /*----------------------------------------------------------------------------
5355 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
5356 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
5357 | comparison is performed according to the IEC/IEEE Standard for Binary
5358 | Floating-Point Arithmetic.
5359 *----------------------------------------------------------------------------*/
5361 int float64_unordered_quiet(float64 a, float64 b, float_status *status)
5363 a = float64_squash_input_denormal(a, status);
5364 b = float64_squash_input_denormal(b, status);
5366 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5367 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5369 if (float64_is_signaling_nan(a, status)
5370 || float64_is_signaling_nan(b, status)) {
5371 float_raise(float_flag_invalid, status);
5373 return 1;
5375 return 0;
5378 /*----------------------------------------------------------------------------
5379 | Returns the result of converting the extended double-precision floating-
5380 | point value `a' to the 32-bit two's complement integer format. The
5381 | conversion is performed according to the IEC/IEEE Standard for Binary
5382 | Floating-Point Arithmetic---which means in particular that the conversion
5383 | is rounded according to the current rounding mode. If `a' is a NaN, the
5384 | largest positive integer is returned. Otherwise, if the conversion
5385 | overflows, the largest integer with the same sign as `a' is returned.
5386 *----------------------------------------------------------------------------*/
5388 int32_t floatx80_to_int32(floatx80 a, float_status *status)
5390 flag aSign;
5391 int32_t aExp, shiftCount;
5392 uint64_t aSig;
5394 if (floatx80_invalid_encoding(a)) {
5395 float_raise(float_flag_invalid, status);
5396 return 1 << 31;
5398 aSig = extractFloatx80Frac( a );
5399 aExp = extractFloatx80Exp( a );
5400 aSign = extractFloatx80Sign( a );
5401 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5402 shiftCount = 0x4037 - aExp;
5403 if ( shiftCount <= 0 ) shiftCount = 1;
5404 shift64RightJamming( aSig, shiftCount, &aSig );
5405 return roundAndPackInt32(aSign, aSig, status);
5409 /*----------------------------------------------------------------------------
5410 | Returns the result of converting the extended double-precision floating-
5411 | point value `a' to the 32-bit two's complement integer format. The
5412 | conversion is performed according to the IEC/IEEE Standard for Binary
5413 | Floating-Point Arithmetic, except that the conversion is always rounded
5414 | toward zero. If `a' is a NaN, the largest positive integer is returned.
5415 | Otherwise, if the conversion overflows, the largest integer with the same
5416 | sign as `a' is returned.
5417 *----------------------------------------------------------------------------*/
5419 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
5421 flag aSign;
5422 int32_t aExp, shiftCount;
5423 uint64_t aSig, savedASig;
5424 int32_t z;
5426 if (floatx80_invalid_encoding(a)) {
5427 float_raise(float_flag_invalid, status);
5428 return 1 << 31;
5430 aSig = extractFloatx80Frac( a );
5431 aExp = extractFloatx80Exp( a );
5432 aSign = extractFloatx80Sign( a );
5433 if ( 0x401E < aExp ) {
5434 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5435 goto invalid;
5437 else if ( aExp < 0x3FFF ) {
5438 if (aExp || aSig) {
5439 status->float_exception_flags |= float_flag_inexact;
5441 return 0;
5443 shiftCount = 0x403E - aExp;
5444 savedASig = aSig;
5445 aSig >>= shiftCount;
5446 z = aSig;
5447 if ( aSign ) z = - z;
5448 if ( ( z < 0 ) ^ aSign ) {
5449 invalid:
5450 float_raise(float_flag_invalid, status);
5451 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5453 if ( ( aSig<<shiftCount ) != savedASig ) {
5454 status->float_exception_flags |= float_flag_inexact;
5456 return z;
5460 /*----------------------------------------------------------------------------
5461 | Returns the result of converting the extended double-precision floating-
5462 | point value `a' to the 64-bit two's complement integer format. The
5463 | conversion is performed according to the IEC/IEEE Standard for Binary
5464 | Floating-Point Arithmetic---which means in particular that the conversion
5465 | is rounded according to the current rounding mode. If `a' is a NaN,
5466 | the largest positive integer is returned. Otherwise, if the conversion
5467 | overflows, the largest integer with the same sign as `a' is returned.
5468 *----------------------------------------------------------------------------*/
5470 int64_t floatx80_to_int64(floatx80 a, float_status *status)
5472 flag aSign;
5473 int32_t aExp, shiftCount;
5474 uint64_t aSig, aSigExtra;
5476 if (floatx80_invalid_encoding(a)) {
5477 float_raise(float_flag_invalid, status);
5478 return 1ULL << 63;
5480 aSig = extractFloatx80Frac( a );
5481 aExp = extractFloatx80Exp( a );
5482 aSign = extractFloatx80Sign( a );
5483 shiftCount = 0x403E - aExp;
5484 if ( shiftCount <= 0 ) {
5485 if ( shiftCount ) {
5486 float_raise(float_flag_invalid, status);
5487 if (!aSign || floatx80_is_any_nan(a)) {
5488 return LIT64( 0x7FFFFFFFFFFFFFFF );
5490 return (int64_t) LIT64( 0x8000000000000000 );
5492 aSigExtra = 0;
5494 else {
5495 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
5497 return roundAndPackInt64(aSign, aSig, aSigExtra, status);
5501 /*----------------------------------------------------------------------------
5502 | Returns the result of converting the extended double-precision floating-
5503 | point value `a' to the 64-bit two's complement integer format. The
5504 | conversion is performed according to the IEC/IEEE Standard for Binary
5505 | Floating-Point Arithmetic, except that the conversion is always rounded
5506 | toward zero. If `a' is a NaN, the largest positive integer is returned.
5507 | Otherwise, if the conversion overflows, the largest integer with the same
5508 | sign as `a' is returned.
5509 *----------------------------------------------------------------------------*/
5511 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
5513 flag aSign;
5514 int32_t aExp, shiftCount;
5515 uint64_t aSig;
5516 int64_t z;
5518 if (floatx80_invalid_encoding(a)) {
5519 float_raise(float_flag_invalid, status);
5520 return 1ULL << 63;
5522 aSig = extractFloatx80Frac( a );
5523 aExp = extractFloatx80Exp( a );
5524 aSign = extractFloatx80Sign( a );
5525 shiftCount = aExp - 0x403E;
5526 if ( 0 <= shiftCount ) {
5527 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
5528 if ( ( a.high != 0xC03E ) || aSig ) {
5529 float_raise(float_flag_invalid, status);
5530 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
5531 return LIT64( 0x7FFFFFFFFFFFFFFF );
5534 return (int64_t) LIT64( 0x8000000000000000 );
5536 else if ( aExp < 0x3FFF ) {
5537 if (aExp | aSig) {
5538 status->float_exception_flags |= float_flag_inexact;
5540 return 0;
5542 z = aSig>>( - shiftCount );
5543 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
5544 status->float_exception_flags |= float_flag_inexact;
5546 if ( aSign ) z = - z;
5547 return z;
5551 /*----------------------------------------------------------------------------
5552 | Returns the result of converting the extended double-precision floating-
5553 | point value `a' to the single-precision floating-point format. The
5554 | conversion is performed according to the IEC/IEEE Standard for Binary
5555 | Floating-Point Arithmetic.
5556 *----------------------------------------------------------------------------*/
5558 float32 floatx80_to_float32(floatx80 a, float_status *status)
5560 flag aSign;
5561 int32_t aExp;
5562 uint64_t aSig;
5564 if (floatx80_invalid_encoding(a)) {
5565 float_raise(float_flag_invalid, status);
5566 return float32_default_nan(status);
5568 aSig = extractFloatx80Frac( a );
5569 aExp = extractFloatx80Exp( a );
5570 aSign = extractFloatx80Sign( a );
5571 if ( aExp == 0x7FFF ) {
5572 if ( (uint64_t) ( aSig<<1 ) ) {
5573 return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
5575 return packFloat32( aSign, 0xFF, 0 );
5577 shift64RightJamming( aSig, 33, &aSig );
5578 if ( aExp || aSig ) aExp -= 0x3F81;
5579 return roundAndPackFloat32(aSign, aExp, aSig, status);
5583 /*----------------------------------------------------------------------------
5584 | Returns the result of converting the extended double-precision floating-
5585 | point value `a' to the double-precision floating-point format. The
5586 | conversion is performed according to the IEC/IEEE Standard for Binary
5587 | Floating-Point Arithmetic.
5588 *----------------------------------------------------------------------------*/
5590 float64 floatx80_to_float64(floatx80 a, float_status *status)
5592 flag aSign;
5593 int32_t aExp;
5594 uint64_t aSig, zSig;
5596 if (floatx80_invalid_encoding(a)) {
5597 float_raise(float_flag_invalid, status);
5598 return float64_default_nan(status);
5600 aSig = extractFloatx80Frac( a );
5601 aExp = extractFloatx80Exp( a );
5602 aSign = extractFloatx80Sign( a );
5603 if ( aExp == 0x7FFF ) {
5604 if ( (uint64_t) ( aSig<<1 ) ) {
5605 return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
5607 return packFloat64( aSign, 0x7FF, 0 );
5609 shift64RightJamming( aSig, 1, &zSig );
5610 if ( aExp || aSig ) aExp -= 0x3C01;
5611 return roundAndPackFloat64(aSign, aExp, zSig, status);
5615 /*----------------------------------------------------------------------------
5616 | Returns the result of converting the extended double-precision floating-
5617 | point value `a' to the quadruple-precision floating-point format. The
5618 | conversion is performed according to the IEC/IEEE Standard for Binary
5619 | Floating-Point Arithmetic.
5620 *----------------------------------------------------------------------------*/
5622 float128 floatx80_to_float128(floatx80 a, float_status *status)
5624 flag aSign;
5625 int aExp;
5626 uint64_t aSig, zSig0, zSig1;
5628 if (floatx80_invalid_encoding(a)) {
5629 float_raise(float_flag_invalid, status);
5630 return float128_default_nan(status);
5632 aSig = extractFloatx80Frac( a );
5633 aExp = extractFloatx80Exp( a );
5634 aSign = extractFloatx80Sign( a );
5635 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
5636 return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
5638 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
5639 return packFloat128( aSign, aExp, zSig0, zSig1 );
5643 /*----------------------------------------------------------------------------
5644 | Rounds the extended double-precision floating-point value `a'
5645 | to the precision provided by floatx80_rounding_precision and returns the
5646 | result as an extended double-precision floating-point value.
5647 | The operation is performed according to the IEC/IEEE Standard for Binary
5648 | Floating-Point Arithmetic.
5649 *----------------------------------------------------------------------------*/
5651 floatx80 floatx80_round(floatx80 a, float_status *status)
5653 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5654 extractFloatx80Sign(a),
5655 extractFloatx80Exp(a),
5656 extractFloatx80Frac(a), 0, status);
5659 /*----------------------------------------------------------------------------
5660 | Rounds the extended double-precision floating-point value `a' to an integer,
5661 | and returns the result as an extended double-precision floating-point
5662 | value. The operation is performed according to the IEC/IEEE Standard for
5663 | Binary Floating-Point Arithmetic.
5664 *----------------------------------------------------------------------------*/
5666 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
5668 flag aSign;
5669 int32_t aExp;
5670 uint64_t lastBitMask, roundBitsMask;
5671 floatx80 z;
5673 if (floatx80_invalid_encoding(a)) {
5674 float_raise(float_flag_invalid, status);
5675 return floatx80_default_nan(status);
5677 aExp = extractFloatx80Exp( a );
5678 if ( 0x403E <= aExp ) {
5679 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
5680 return propagateFloatx80NaN(a, a, status);
5682 return a;
5684 if ( aExp < 0x3FFF ) {
5685 if ( ( aExp == 0 )
5686 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
5687 return a;
5689 status->float_exception_flags |= float_flag_inexact;
5690 aSign = extractFloatx80Sign( a );
5691 switch (status->float_rounding_mode) {
5692 case float_round_nearest_even:
5693 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
5695 return
5696 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
5698 break;
5699 case float_round_ties_away:
5700 if (aExp == 0x3FFE) {
5701 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
5703 break;
5704 case float_round_down:
5705 return
5706 aSign ?
5707 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
5708 : packFloatx80( 0, 0, 0 );
5709 case float_round_up:
5710 return
5711 aSign ? packFloatx80( 1, 0, 0 )
5712 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
5714 return packFloatx80( aSign, 0, 0 );
5716 lastBitMask = 1;
5717 lastBitMask <<= 0x403E - aExp;
5718 roundBitsMask = lastBitMask - 1;
5719 z = a;
5720 switch (status->float_rounding_mode) {
5721 case float_round_nearest_even:
5722 z.low += lastBitMask>>1;
5723 if ((z.low & roundBitsMask) == 0) {
5724 z.low &= ~lastBitMask;
5726 break;
5727 case float_round_ties_away:
5728 z.low += lastBitMask >> 1;
5729 break;
5730 case float_round_to_zero:
5731 break;
5732 case float_round_up:
5733 if (!extractFloatx80Sign(z)) {
5734 z.low += roundBitsMask;
5736 break;
5737 case float_round_down:
5738 if (extractFloatx80Sign(z)) {
5739 z.low += roundBitsMask;
5741 break;
5742 default:
5743 abort();
5745 z.low &= ~ roundBitsMask;
5746 if ( z.low == 0 ) {
5747 ++z.high;
5748 z.low = LIT64( 0x8000000000000000 );
5750 if (z.low != a.low) {
5751 status->float_exception_flags |= float_flag_inexact;
5753 return z;
5757 /*----------------------------------------------------------------------------
5758 | Returns the result of adding the absolute values of the extended double-
5759 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
5760 | negated before being returned. `zSign' is ignored if the result is a NaN.
5761 | The addition is performed according to the IEC/IEEE Standard for Binary
5762 | Floating-Point Arithmetic.
5763 *----------------------------------------------------------------------------*/
5765 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5766 float_status *status)
5768 int32_t aExp, bExp, zExp;
5769 uint64_t aSig, bSig, zSig0, zSig1;
5770 int32_t expDiff;
5772 aSig = extractFloatx80Frac( a );
5773 aExp = extractFloatx80Exp( a );
5774 bSig = extractFloatx80Frac( b );
5775 bExp = extractFloatx80Exp( b );
5776 expDiff = aExp - bExp;
5777 if ( 0 < expDiff ) {
5778 if ( aExp == 0x7FFF ) {
5779 if ((uint64_t)(aSig << 1)) {
5780 return propagateFloatx80NaN(a, b, status);
5782 return a;
5784 if ( bExp == 0 ) --expDiff;
5785 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5786 zExp = aExp;
5788 else if ( expDiff < 0 ) {
5789 if ( bExp == 0x7FFF ) {
5790 if ((uint64_t)(bSig << 1)) {
5791 return propagateFloatx80NaN(a, b, status);
5793 return packFloatx80(zSign,
5794 floatx80_infinity_high,
5795 floatx80_infinity_low);
5797 if ( aExp == 0 ) ++expDiff;
5798 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5799 zExp = bExp;
5801 else {
5802 if ( aExp == 0x7FFF ) {
5803 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5804 return propagateFloatx80NaN(a, b, status);
5806 return a;
5808 zSig1 = 0;
5809 zSig0 = aSig + bSig;
5810 if ( aExp == 0 ) {
5811 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
5812 goto roundAndPack;
5814 zExp = aExp;
5815 goto shiftRight1;
5817 zSig0 = aSig + bSig;
5818 if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
5819 shiftRight1:
5820 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
5821 zSig0 |= LIT64( 0x8000000000000000 );
5822 ++zExp;
5823 roundAndPack:
5824 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5825 zSign, zExp, zSig0, zSig1, status);
5828 /*----------------------------------------------------------------------------
5829 | Returns the result of subtracting the absolute values of the extended
5830 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
5831 | difference is negated before being returned. `zSign' is ignored if the
5832 | result is a NaN. The subtraction is performed according to the IEC/IEEE
5833 | Standard for Binary Floating-Point Arithmetic.
5834 *----------------------------------------------------------------------------*/
5836 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5837 float_status *status)
5839 int32_t aExp, bExp, zExp;
5840 uint64_t aSig, bSig, zSig0, zSig1;
5841 int32_t expDiff;
5843 aSig = extractFloatx80Frac( a );
5844 aExp = extractFloatx80Exp( a );
5845 bSig = extractFloatx80Frac( b );
5846 bExp = extractFloatx80Exp( b );
5847 expDiff = aExp - bExp;
5848 if ( 0 < expDiff ) goto aExpBigger;
5849 if ( expDiff < 0 ) goto bExpBigger;
5850 if ( aExp == 0x7FFF ) {
5851 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5852 return propagateFloatx80NaN(a, b, status);
5854 float_raise(float_flag_invalid, status);
5855 return floatx80_default_nan(status);
5857 if ( aExp == 0 ) {
5858 aExp = 1;
5859 bExp = 1;
5861 zSig1 = 0;
5862 if ( bSig < aSig ) goto aBigger;
5863 if ( aSig < bSig ) goto bBigger;
5864 return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
5865 bExpBigger:
5866 if ( bExp == 0x7FFF ) {
5867 if ((uint64_t)(bSig << 1)) {
5868 return propagateFloatx80NaN(a, b, status);
5870 return packFloatx80(zSign ^ 1, floatx80_infinity_high,
5871 floatx80_infinity_low);
5873 if ( aExp == 0 ) ++expDiff;
5874 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5875 bBigger:
5876 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
5877 zExp = bExp;
5878 zSign ^= 1;
5879 goto normalizeRoundAndPack;
5880 aExpBigger:
5881 if ( aExp == 0x7FFF ) {
5882 if ((uint64_t)(aSig << 1)) {
5883 return propagateFloatx80NaN(a, b, status);
5885 return a;
5887 if ( bExp == 0 ) --expDiff;
5888 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5889 aBigger:
5890 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
5891 zExp = aExp;
5892 normalizeRoundAndPack:
5893 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
5894 zSign, zExp, zSig0, zSig1, status);
5897 /*----------------------------------------------------------------------------
5898 | Returns the result of adding the extended double-precision floating-point
5899 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5900 | Standard for Binary Floating-Point Arithmetic.
5901 *----------------------------------------------------------------------------*/
5903 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
5905 flag aSign, bSign;
5907 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5908 float_raise(float_flag_invalid, status);
5909 return floatx80_default_nan(status);
5911 aSign = extractFloatx80Sign( a );
5912 bSign = extractFloatx80Sign( b );
5913 if ( aSign == bSign ) {
5914 return addFloatx80Sigs(a, b, aSign, status);
5916 else {
5917 return subFloatx80Sigs(a, b, aSign, status);
5922 /*----------------------------------------------------------------------------
5923 | Returns the result of subtracting the extended double-precision floating-
5924 | point values `a' and `b'. The operation is performed according to the
5925 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5926 *----------------------------------------------------------------------------*/
5928 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5930 flag aSign, bSign;
5932 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5933 float_raise(float_flag_invalid, status);
5934 return floatx80_default_nan(status);
5936 aSign = extractFloatx80Sign( a );
5937 bSign = extractFloatx80Sign( b );
5938 if ( aSign == bSign ) {
5939 return subFloatx80Sigs(a, b, aSign, status);
5941 else {
5942 return addFloatx80Sigs(a, b, aSign, status);
5947 /*----------------------------------------------------------------------------
5948 | Returns the result of multiplying the extended double-precision floating-
5949 | point values `a' and `b'. The operation is performed according to the
5950 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5951 *----------------------------------------------------------------------------*/
5953 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
5955 flag aSign, bSign, zSign;
5956 int32_t aExp, bExp, zExp;
5957 uint64_t aSig, bSig, zSig0, zSig1;
5959 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5960 float_raise(float_flag_invalid, status);
5961 return floatx80_default_nan(status);
5963 aSig = extractFloatx80Frac( a );
5964 aExp = extractFloatx80Exp( a );
5965 aSign = extractFloatx80Sign( a );
5966 bSig = extractFloatx80Frac( b );
5967 bExp = extractFloatx80Exp( b );
5968 bSign = extractFloatx80Sign( b );
5969 zSign = aSign ^ bSign;
5970 if ( aExp == 0x7FFF ) {
5971 if ( (uint64_t) ( aSig<<1 )
5972 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5973 return propagateFloatx80NaN(a, b, status);
5975 if ( ( bExp | bSig ) == 0 ) goto invalid;
5976 return packFloatx80(zSign, floatx80_infinity_high,
5977 floatx80_infinity_low);
5979 if ( bExp == 0x7FFF ) {
5980 if ((uint64_t)(bSig << 1)) {
5981 return propagateFloatx80NaN(a, b, status);
5983 if ( ( aExp | aSig ) == 0 ) {
5984 invalid:
5985 float_raise(float_flag_invalid, status);
5986 return floatx80_default_nan(status);
5988 return packFloatx80(zSign, floatx80_infinity_high,
5989 floatx80_infinity_low);
5991 if ( aExp == 0 ) {
5992 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5993 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5995 if ( bExp == 0 ) {
5996 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5997 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5999 zExp = aExp + bExp - 0x3FFE;
6000 mul64To128( aSig, bSig, &zSig0, &zSig1 );
6001 if ( 0 < (int64_t) zSig0 ) {
6002 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
6003 --zExp;
6005 return roundAndPackFloatx80(status->floatx80_rounding_precision,
6006 zSign, zExp, zSig0, zSig1, status);
6009 /*----------------------------------------------------------------------------
6010 | Returns the result of dividing the extended double-precision floating-point
6011 | value `a' by the corresponding value `b'. The operation is performed
6012 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6013 *----------------------------------------------------------------------------*/
6015 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
6017 flag aSign, bSign, zSign;
6018 int32_t aExp, bExp, zExp;
6019 uint64_t aSig, bSig, zSig0, zSig1;
6020 uint64_t rem0, rem1, rem2, term0, term1, term2;
6022 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6023 float_raise(float_flag_invalid, status);
6024 return floatx80_default_nan(status);
6026 aSig = extractFloatx80Frac( a );
6027 aExp = extractFloatx80Exp( a );
6028 aSign = extractFloatx80Sign( a );
6029 bSig = extractFloatx80Frac( b );
6030 bExp = extractFloatx80Exp( b );
6031 bSign = extractFloatx80Sign( b );
6032 zSign = aSign ^ bSign;
6033 if ( aExp == 0x7FFF ) {
6034 if ((uint64_t)(aSig << 1)) {
6035 return propagateFloatx80NaN(a, b, status);
6037 if ( bExp == 0x7FFF ) {
6038 if ((uint64_t)(bSig << 1)) {
6039 return propagateFloatx80NaN(a, b, status);
6041 goto invalid;
6043 return packFloatx80(zSign, floatx80_infinity_high,
6044 floatx80_infinity_low);
6046 if ( bExp == 0x7FFF ) {
6047 if ((uint64_t)(bSig << 1)) {
6048 return propagateFloatx80NaN(a, b, status);
6050 return packFloatx80( zSign, 0, 0 );
6052 if ( bExp == 0 ) {
6053 if ( bSig == 0 ) {
6054 if ( ( aExp | aSig ) == 0 ) {
6055 invalid:
6056 float_raise(float_flag_invalid, status);
6057 return floatx80_default_nan(status);
6059 float_raise(float_flag_divbyzero, status);
6060 return packFloatx80(zSign, floatx80_infinity_high,
6061 floatx80_infinity_low);
6063 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
6065 if ( aExp == 0 ) {
6066 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
6067 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
6069 zExp = aExp - bExp + 0x3FFE;
6070 rem1 = 0;
6071 if ( bSig <= aSig ) {
6072 shift128Right( aSig, 0, 1, &aSig, &rem1 );
6073 ++zExp;
6075 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
6076 mul64To128( bSig, zSig0, &term0, &term1 );
6077 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
6078 while ( (int64_t) rem0 < 0 ) {
6079 --zSig0;
6080 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
6082 zSig1 = estimateDiv128To64( rem1, 0, bSig );
6083 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
6084 mul64To128( bSig, zSig1, &term1, &term2 );
6085 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6086 while ( (int64_t) rem1 < 0 ) {
6087 --zSig1;
6088 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
6090 zSig1 |= ( ( rem1 | rem2 ) != 0 );
6092 return roundAndPackFloatx80(status->floatx80_rounding_precision,
6093 zSign, zExp, zSig0, zSig1, status);
6096 /*----------------------------------------------------------------------------
6097 | Returns the remainder of the extended double-precision floating-point value
6098 | `a' with respect to the corresponding value `b'. The operation is performed
6099 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6100 *----------------------------------------------------------------------------*/
6102 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
6104 flag aSign, zSign;
6105 int32_t aExp, bExp, expDiff;
6106 uint64_t aSig0, aSig1, bSig;
6107 uint64_t q, term0, term1, alternateASig0, alternateASig1;
6109 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6110 float_raise(float_flag_invalid, status);
6111 return floatx80_default_nan(status);
6113 aSig0 = extractFloatx80Frac( a );
6114 aExp = extractFloatx80Exp( a );
6115 aSign = extractFloatx80Sign( a );
6116 bSig = extractFloatx80Frac( b );
6117 bExp = extractFloatx80Exp( b );
6118 if ( aExp == 0x7FFF ) {
6119 if ( (uint64_t) ( aSig0<<1 )
6120 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
6121 return propagateFloatx80NaN(a, b, status);
6123 goto invalid;
6125 if ( bExp == 0x7FFF ) {
6126 if ((uint64_t)(bSig << 1)) {
6127 return propagateFloatx80NaN(a, b, status);
6129 return a;
6131 if ( bExp == 0 ) {
6132 if ( bSig == 0 ) {
6133 invalid:
6134 float_raise(float_flag_invalid, status);
6135 return floatx80_default_nan(status);
6137 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
6139 if ( aExp == 0 ) {
6140 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
6141 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
6143 bSig |= LIT64( 0x8000000000000000 );
6144 zSign = aSign;
6145 expDiff = aExp - bExp;
6146 aSig1 = 0;
6147 if ( expDiff < 0 ) {
6148 if ( expDiff < -1 ) return a;
6149 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
6150 expDiff = 0;
6152 q = ( bSig <= aSig0 );
6153 if ( q ) aSig0 -= bSig;
6154 expDiff -= 64;
6155 while ( 0 < expDiff ) {
6156 q = estimateDiv128To64( aSig0, aSig1, bSig );
6157 q = ( 2 < q ) ? q - 2 : 0;
6158 mul64To128( bSig, q, &term0, &term1 );
6159 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6160 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
6161 expDiff -= 62;
6163 expDiff += 64;
6164 if ( 0 < expDiff ) {
6165 q = estimateDiv128To64( aSig0, aSig1, bSig );
6166 q = ( 2 < q ) ? q - 2 : 0;
6167 q >>= 64 - expDiff;
6168 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
6169 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6170 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
6171 while ( le128( term0, term1, aSig0, aSig1 ) ) {
6172 ++q;
6173 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6176 else {
6177 term1 = 0;
6178 term0 = bSig;
6180 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
6181 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
6182 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
6183 && ( q & 1 ) )
6185 aSig0 = alternateASig0;
6186 aSig1 = alternateASig1;
6187 zSign = ! zSign;
6189 return
6190 normalizeRoundAndPackFloatx80(
6191 80, zSign, bExp + expDiff, aSig0, aSig1, status);
6195 /*----------------------------------------------------------------------------
6196 | Returns the square root of the extended double-precision floating-point
6197 | value `a'. The operation is performed according to the IEC/IEEE Standard
6198 | for Binary Floating-Point Arithmetic.
6199 *----------------------------------------------------------------------------*/
6201 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
6203 flag aSign;
6204 int32_t aExp, zExp;
6205 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
6206 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6208 if (floatx80_invalid_encoding(a)) {
6209 float_raise(float_flag_invalid, status);
6210 return floatx80_default_nan(status);
6212 aSig0 = extractFloatx80Frac( a );
6213 aExp = extractFloatx80Exp( a );
6214 aSign = extractFloatx80Sign( a );
6215 if ( aExp == 0x7FFF ) {
6216 if ((uint64_t)(aSig0 << 1)) {
6217 return propagateFloatx80NaN(a, a, status);
6219 if ( ! aSign ) return a;
6220 goto invalid;
6222 if ( aSign ) {
6223 if ( ( aExp | aSig0 ) == 0 ) return a;
6224 invalid:
6225 float_raise(float_flag_invalid, status);
6226 return floatx80_default_nan(status);
6228 if ( aExp == 0 ) {
6229 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
6230 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
6232 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
6233 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
6234 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
6235 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6236 doubleZSig0 = zSig0<<1;
6237 mul64To128( zSig0, zSig0, &term0, &term1 );
6238 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6239 while ( (int64_t) rem0 < 0 ) {
6240 --zSig0;
6241 doubleZSig0 -= 2;
6242 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6244 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6245 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
6246 if ( zSig1 == 0 ) zSig1 = 1;
6247 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6248 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6249 mul64To128( zSig1, zSig1, &term2, &term3 );
6250 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6251 while ( (int64_t) rem1 < 0 ) {
6252 --zSig1;
6253 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6254 term3 |= 1;
6255 term2 |= doubleZSig0;
6256 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6258 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6260 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
6261 zSig0 |= doubleZSig0;
6262 return roundAndPackFloatx80(status->floatx80_rounding_precision,
6263 0, zExp, zSig0, zSig1, status);
6266 /*----------------------------------------------------------------------------
6267 | Returns 1 if the extended double-precision floating-point value `a' is equal
6268 | to the corresponding value `b', and 0 otherwise. The invalid exception is
6269 | raised if either operand is a NaN. Otherwise, the comparison is performed
6270 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6271 *----------------------------------------------------------------------------*/
6273 int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
6276 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
6277 || (extractFloatx80Exp(a) == 0x7FFF
6278 && (uint64_t) (extractFloatx80Frac(a) << 1))
6279 || (extractFloatx80Exp(b) == 0x7FFF
6280 && (uint64_t) (extractFloatx80Frac(b) << 1))
6282 float_raise(float_flag_invalid, status);
6283 return 0;
6285 return
6286 ( a.low == b.low )
6287 && ( ( a.high == b.high )
6288 || ( ( a.low == 0 )
6289 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6294 /*----------------------------------------------------------------------------
6295 | Returns 1 if the extended double-precision floating-point value `a' is
6296 | less than or equal to the corresponding value `b', and 0 otherwise. The
6297 | invalid exception is raised if either operand is a NaN. The comparison is
6298 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6299 | Arithmetic.
6300 *----------------------------------------------------------------------------*/
6302 int floatx80_le(floatx80 a, floatx80 b, float_status *status)
6304 flag aSign, bSign;
6306 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
6307 || (extractFloatx80Exp(a) == 0x7FFF
6308 && (uint64_t) (extractFloatx80Frac(a) << 1))
6309 || (extractFloatx80Exp(b) == 0x7FFF
6310 && (uint64_t) (extractFloatx80Frac(b) << 1))
6312 float_raise(float_flag_invalid, status);
6313 return 0;
6315 aSign = extractFloatx80Sign( a );
6316 bSign = extractFloatx80Sign( b );
6317 if ( aSign != bSign ) {
6318 return
6319 aSign
6320 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6321 == 0 );
6323 return
6324 aSign ? le128( b.high, b.low, a.high, a.low )
6325 : le128( a.high, a.low, b.high, b.low );
6329 /*----------------------------------------------------------------------------
6330 | Returns 1 if the extended double-precision floating-point value `a' is
6331 | less than the corresponding value `b', and 0 otherwise. The invalid
6332 | exception is raised if either operand is a NaN. The comparison is performed
6333 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6334 *----------------------------------------------------------------------------*/
6336 int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
6338 flag aSign, bSign;
6340 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
6341 || (extractFloatx80Exp(a) == 0x7FFF
6342 && (uint64_t) (extractFloatx80Frac(a) << 1))
6343 || (extractFloatx80Exp(b) == 0x7FFF
6344 && (uint64_t) (extractFloatx80Frac(b) << 1))
6346 float_raise(float_flag_invalid, status);
6347 return 0;
6349 aSign = extractFloatx80Sign( a );
6350 bSign = extractFloatx80Sign( b );
6351 if ( aSign != bSign ) {
6352 return
6353 aSign
6354 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6355 != 0 );
6357 return
6358 aSign ? lt128( b.high, b.low, a.high, a.low )
6359 : lt128( a.high, a.low, b.high, b.low );
6363 /*----------------------------------------------------------------------------
6364 | Returns 1 if the extended double-precision floating-point values `a' and `b'
6365 | cannot be compared, and 0 otherwise. The invalid exception is raised if
6366 | either operand is a NaN. The comparison is performed according to the
6367 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6368 *----------------------------------------------------------------------------*/
6369 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
6371 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
6372 || (extractFloatx80Exp(a) == 0x7FFF
6373 && (uint64_t) (extractFloatx80Frac(a) << 1))
6374 || (extractFloatx80Exp(b) == 0x7FFF
6375 && (uint64_t) (extractFloatx80Frac(b) << 1))
6377 float_raise(float_flag_invalid, status);
6378 return 1;
6380 return 0;
6383 /*----------------------------------------------------------------------------
6384 | Returns 1 if the extended double-precision floating-point value `a' is
6385 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
6386 | cause an exception. The comparison is performed according to the IEC/IEEE
6387 | Standard for Binary Floating-Point Arithmetic.
6388 *----------------------------------------------------------------------------*/
6390 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
6393 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6394 float_raise(float_flag_invalid, status);
6395 return 0;
6397 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
6398 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6399 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
6400 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6402 if (floatx80_is_signaling_nan(a, status)
6403 || floatx80_is_signaling_nan(b, status)) {
6404 float_raise(float_flag_invalid, status);
6406 return 0;
6408 return
6409 ( a.low == b.low )
6410 && ( ( a.high == b.high )
6411 || ( ( a.low == 0 )
6412 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6417 /*----------------------------------------------------------------------------
6418 | Returns 1 if the extended double-precision floating-point value `a' is less
6419 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
6420 | do not cause an exception. Otherwise, the comparison is performed according
6421 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6422 *----------------------------------------------------------------------------*/
6424 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
6426 flag aSign, bSign;
6428 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6429 float_raise(float_flag_invalid, status);
6430 return 0;
6432 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
6433 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6434 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
6435 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6437 if (floatx80_is_signaling_nan(a, status)
6438 || floatx80_is_signaling_nan(b, status)) {
6439 float_raise(float_flag_invalid, status);
6441 return 0;
6443 aSign = extractFloatx80Sign( a );
6444 bSign = extractFloatx80Sign( b );
6445 if ( aSign != bSign ) {
6446 return
6447 aSign
6448 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6449 == 0 );
6451 return
6452 aSign ? le128( b.high, b.low, a.high, a.low )
6453 : le128( a.high, a.low, b.high, b.low );
6457 /*----------------------------------------------------------------------------
6458 | Returns 1 if the extended double-precision floating-point value `a' is less
6459 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
6460 | an exception. Otherwise, the comparison is performed according to the
6461 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6462 *----------------------------------------------------------------------------*/
6464 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
6466 flag aSign, bSign;
6468 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6469 float_raise(float_flag_invalid, status);
6470 return 0;
6472 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
6473 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6474 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
6475 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6477 if (floatx80_is_signaling_nan(a, status)
6478 || floatx80_is_signaling_nan(b, status)) {
6479 float_raise(float_flag_invalid, status);
6481 return 0;
6483 aSign = extractFloatx80Sign( a );
6484 bSign = extractFloatx80Sign( b );
6485 if ( aSign != bSign ) {
6486 return
6487 aSign
6488 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6489 != 0 );
6491 return
6492 aSign ? lt128( b.high, b.low, a.high, a.low )
6493 : lt128( a.high, a.low, b.high, b.low );
6497 /*----------------------------------------------------------------------------
6498 | Returns 1 if the extended double-precision floating-point values `a' and `b'
6499 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
6500 | The comparison is performed according to the IEC/IEEE Standard for Binary
6501 | Floating-Point Arithmetic.
6502 *----------------------------------------------------------------------------*/
6503 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
6505 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6506 float_raise(float_flag_invalid, status);
6507 return 1;
6509 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
6510 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6511 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
6512 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6514 if (floatx80_is_signaling_nan(a, status)
6515 || floatx80_is_signaling_nan(b, status)) {
6516 float_raise(float_flag_invalid, status);
6518 return 1;
6520 return 0;
6523 /*----------------------------------------------------------------------------
6524 | Returns the result of converting the quadruple-precision floating-point
6525 | value `a' to the 32-bit two's complement integer format. The conversion
6526 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6527 | Arithmetic---which means in particular that the conversion is rounded
6528 | according to the current rounding mode. If `a' is a NaN, the largest
6529 | positive integer is returned. Otherwise, if the conversion overflows, the
6530 | largest integer with the same sign as `a' is returned.
6531 *----------------------------------------------------------------------------*/
6533 int32_t float128_to_int32(float128 a, float_status *status)
6535 flag aSign;
6536 int32_t aExp, shiftCount;
6537 uint64_t aSig0, aSig1;
6539 aSig1 = extractFloat128Frac1( a );
6540 aSig0 = extractFloat128Frac0( a );
6541 aExp = extractFloat128Exp( a );
6542 aSign = extractFloat128Sign( a );
6543 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
6544 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
6545 aSig0 |= ( aSig1 != 0 );
6546 shiftCount = 0x4028 - aExp;
6547 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
6548 return roundAndPackInt32(aSign, aSig0, status);
6552 /*----------------------------------------------------------------------------
6553 | Returns the result of converting the quadruple-precision floating-point
6554 | value `a' to the 32-bit two's complement integer format. The conversion
6555 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6556 | Arithmetic, except that the conversion is always rounded toward zero. If
6557 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
6558 | conversion overflows, the largest integer with the same sign as `a' is
6559 | returned.
6560 *----------------------------------------------------------------------------*/
6562 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
6564 flag aSign;
6565 int32_t aExp, shiftCount;
6566 uint64_t aSig0, aSig1, savedASig;
6567 int32_t z;
6569 aSig1 = extractFloat128Frac1( a );
6570 aSig0 = extractFloat128Frac0( a );
6571 aExp = extractFloat128Exp( a );
6572 aSign = extractFloat128Sign( a );
6573 aSig0 |= ( aSig1 != 0 );
6574 if ( 0x401E < aExp ) {
6575 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
6576 goto invalid;
6578 else if ( aExp < 0x3FFF ) {
6579 if (aExp || aSig0) {
6580 status->float_exception_flags |= float_flag_inexact;
6582 return 0;
6584 aSig0 |= LIT64( 0x0001000000000000 );
6585 shiftCount = 0x402F - aExp;
6586 savedASig = aSig0;
6587 aSig0 >>= shiftCount;
6588 z = aSig0;
6589 if ( aSign ) z = - z;
6590 if ( ( z < 0 ) ^ aSign ) {
6591 invalid:
6592 float_raise(float_flag_invalid, status);
6593 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
6595 if ( ( aSig0<<shiftCount ) != savedASig ) {
6596 status->float_exception_flags |= float_flag_inexact;
6598 return z;
6602 /*----------------------------------------------------------------------------
6603 | Returns the result of converting the quadruple-precision floating-point
6604 | value `a' to the 64-bit two's complement integer format. The conversion
6605 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6606 | Arithmetic---which means in particular that the conversion is rounded
6607 | according to the current rounding mode. If `a' is a NaN, the largest
6608 | positive integer is returned. Otherwise, if the conversion overflows, the
6609 | largest integer with the same sign as `a' is returned.
6610 *----------------------------------------------------------------------------*/
6612 int64_t float128_to_int64(float128 a, float_status *status)
6614 flag aSign;
6615 int32_t aExp, shiftCount;
6616 uint64_t aSig0, aSig1;
6618 aSig1 = extractFloat128Frac1( a );
6619 aSig0 = extractFloat128Frac0( a );
6620 aExp = extractFloat128Exp( a );
6621 aSign = extractFloat128Sign( a );
6622 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
6623 shiftCount = 0x402F - aExp;
6624 if ( shiftCount <= 0 ) {
6625 if ( 0x403E < aExp ) {
6626 float_raise(float_flag_invalid, status);
6627 if ( ! aSign
6628 || ( ( aExp == 0x7FFF )
6629 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
6632 return LIT64( 0x7FFFFFFFFFFFFFFF );
6634 return (int64_t) LIT64( 0x8000000000000000 );
6636 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
6638 else {
6639 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
6641 return roundAndPackInt64(aSign, aSig0, aSig1, status);
6645 /*----------------------------------------------------------------------------
6646 | Returns the result of converting the quadruple-precision floating-point
6647 | value `a' to the 64-bit two's complement integer format. The conversion
6648 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6649 | Arithmetic, except that the conversion is always rounded toward zero.
6650 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
6651 | the conversion overflows, the largest integer with the same sign as `a' is
6652 | returned.
6653 *----------------------------------------------------------------------------*/
6655 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
6657 flag aSign;
6658 int32_t aExp, shiftCount;
6659 uint64_t aSig0, aSig1;
6660 int64_t z;
6662 aSig1 = extractFloat128Frac1( a );
6663 aSig0 = extractFloat128Frac0( a );
6664 aExp = extractFloat128Exp( a );
6665 aSign = extractFloat128Sign( a );
6666 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
6667 shiftCount = aExp - 0x402F;
6668 if ( 0 < shiftCount ) {
6669 if ( 0x403E <= aExp ) {
6670 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
6671 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
6672 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
6673 if (aSig1) {
6674 status->float_exception_flags |= float_flag_inexact;
6677 else {
6678 float_raise(float_flag_invalid, status);
6679 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
6680 return LIT64( 0x7FFFFFFFFFFFFFFF );
6683 return (int64_t) LIT64( 0x8000000000000000 );
6685 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
6686 if ( (uint64_t) ( aSig1<<shiftCount ) ) {
6687 status->float_exception_flags |= float_flag_inexact;
6690 else {
6691 if ( aExp < 0x3FFF ) {
6692 if ( aExp | aSig0 | aSig1 ) {
6693 status->float_exception_flags |= float_flag_inexact;
6695 return 0;
6697 z = aSig0>>( - shiftCount );
6698 if ( aSig1
6699 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
6700 status->float_exception_flags |= float_flag_inexact;
6703 if ( aSign ) z = - z;
6704 return z;
6708 /*----------------------------------------------------------------------------
6709 | Returns the result of converting the quadruple-precision floating-point value
6710 | `a' to the 64-bit unsigned integer format. The conversion is
6711 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6712 | Arithmetic---which means in particular that the conversion is rounded
6713 | according to the current rounding mode. If `a' is a NaN, the largest
6714 | positive integer is returned. If the conversion overflows, the
6715 | largest unsigned integer is returned. If 'a' is negative, the value is
6716 | rounded and zero is returned; negative values that do not round to zero
6717 | will raise the inexact exception.
6718 *----------------------------------------------------------------------------*/
6720 uint64_t float128_to_uint64(float128 a, float_status *status)
6722 flag aSign;
6723 int aExp;
6724 int shiftCount;
6725 uint64_t aSig0, aSig1;
6727 aSig0 = extractFloat128Frac0(a);
6728 aSig1 = extractFloat128Frac1(a);
6729 aExp = extractFloat128Exp(a);
6730 aSign = extractFloat128Sign(a);
6731 if (aSign && (aExp > 0x3FFE)) {
6732 float_raise(float_flag_invalid, status);
6733 if (float128_is_any_nan(a)) {
6734 return LIT64(0xFFFFFFFFFFFFFFFF);
6735 } else {
6736 return 0;
6739 if (aExp) {
6740 aSig0 |= LIT64(0x0001000000000000);
6742 shiftCount = 0x402F - aExp;
6743 if (shiftCount <= 0) {
6744 if (0x403E < aExp) {
6745 float_raise(float_flag_invalid, status);
6746 return LIT64(0xFFFFFFFFFFFFFFFF);
6748 shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
6749 } else {
6750 shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
6752 return roundAndPackUint64(aSign, aSig0, aSig1, status);
6755 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
6757 uint64_t v;
6758 signed char current_rounding_mode = status->float_rounding_mode;
6760 set_float_rounding_mode(float_round_to_zero, status);
6761 v = float128_to_uint64(a, status);
6762 set_float_rounding_mode(current_rounding_mode, status);
6764 return v;
6767 /*----------------------------------------------------------------------------
6768 | Returns the result of converting the quadruple-precision floating-point
6769 | value `a' to the 32-bit unsigned integer format. The conversion
6770 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6771 | Arithmetic except that the conversion is always rounded toward zero.
6772 | If `a' is a NaN, the largest positive integer is returned. Otherwise,
6773 | if the conversion overflows, the largest unsigned integer is returned.
6774 | If 'a' is negative, the value is rounded and zero is returned; negative
6775 | values that do not round to zero will raise the inexact exception.
6776 *----------------------------------------------------------------------------*/
6778 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
6780 uint64_t v;
6781 uint32_t res;
6782 int old_exc_flags = get_float_exception_flags(status);
6784 v = float128_to_uint64_round_to_zero(a, status);
6785 if (v > 0xffffffff) {
6786 res = 0xffffffff;
6787 } else {
6788 return v;
6790 set_float_exception_flags(old_exc_flags, status);
6791 float_raise(float_flag_invalid, status);
6792 return res;
6795 /*----------------------------------------------------------------------------
6796 | Returns the result of converting the quadruple-precision floating-point
6797 | value `a' to the single-precision floating-point format. The conversion
6798 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6799 | Arithmetic.
6800 *----------------------------------------------------------------------------*/
6802 float32 float128_to_float32(float128 a, float_status *status)
6804 flag aSign;
6805 int32_t aExp;
6806 uint64_t aSig0, aSig1;
6807 uint32_t zSig;
6809 aSig1 = extractFloat128Frac1( a );
6810 aSig0 = extractFloat128Frac0( a );
6811 aExp = extractFloat128Exp( a );
6812 aSign = extractFloat128Sign( a );
6813 if ( aExp == 0x7FFF ) {
6814 if ( aSig0 | aSig1 ) {
6815 return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
6817 return packFloat32( aSign, 0xFF, 0 );
6819 aSig0 |= ( aSig1 != 0 );
6820 shift64RightJamming( aSig0, 18, &aSig0 );
6821 zSig = aSig0;
6822 if ( aExp || zSig ) {
6823 zSig |= 0x40000000;
6824 aExp -= 0x3F81;
6826 return roundAndPackFloat32(aSign, aExp, zSig, status);
6830 /*----------------------------------------------------------------------------
6831 | Returns the result of converting the quadruple-precision floating-point
6832 | value `a' to the double-precision floating-point format. The conversion
6833 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6834 | Arithmetic.
6835 *----------------------------------------------------------------------------*/
6837 float64 float128_to_float64(float128 a, float_status *status)
6839 flag aSign;
6840 int32_t aExp;
6841 uint64_t aSig0, aSig1;
6843 aSig1 = extractFloat128Frac1( a );
6844 aSig0 = extractFloat128Frac0( a );
6845 aExp = extractFloat128Exp( a );
6846 aSign = extractFloat128Sign( a );
6847 if ( aExp == 0x7FFF ) {
6848 if ( aSig0 | aSig1 ) {
6849 return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
6851 return packFloat64( aSign, 0x7FF, 0 );
6853 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6854 aSig0 |= ( aSig1 != 0 );
6855 if ( aExp || aSig0 ) {
6856 aSig0 |= LIT64( 0x4000000000000000 );
6857 aExp -= 0x3C01;
6859 return roundAndPackFloat64(aSign, aExp, aSig0, status);
6863 /*----------------------------------------------------------------------------
6864 | Returns the result of converting the quadruple-precision floating-point
6865 | value `a' to the extended double-precision floating-point format. The
6866 | conversion is performed according to the IEC/IEEE Standard for Binary
6867 | Floating-Point Arithmetic.
6868 *----------------------------------------------------------------------------*/
6870 floatx80 float128_to_floatx80(float128 a, float_status *status)
6872 flag aSign;
6873 int32_t aExp;
6874 uint64_t aSig0, aSig1;
6876 aSig1 = extractFloat128Frac1( a );
6877 aSig0 = extractFloat128Frac0( a );
6878 aExp = extractFloat128Exp( a );
6879 aSign = extractFloat128Sign( a );
6880 if ( aExp == 0x7FFF ) {
6881 if ( aSig0 | aSig1 ) {
6882 return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
6884 return packFloatx80(aSign, floatx80_infinity_high,
6885 floatx80_infinity_low);
6887 if ( aExp == 0 ) {
6888 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
6889 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6891 else {
6892 aSig0 |= LIT64( 0x0001000000000000 );
6894 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
6895 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
6899 /*----------------------------------------------------------------------------
6900 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6901 | returns the result as a quadruple-precision floating-point value. The
6902 | operation is performed according to the IEC/IEEE Standard for Binary
6903 | Floating-Point Arithmetic.
6904 *----------------------------------------------------------------------------*/
6906 float128 float128_round_to_int(float128 a, float_status *status)
6908 flag aSign;
6909 int32_t aExp;
6910 uint64_t lastBitMask, roundBitsMask;
6911 float128 z;
6913 aExp = extractFloat128Exp( a );
6914 if ( 0x402F <= aExp ) {
6915 if ( 0x406F <= aExp ) {
6916 if ( ( aExp == 0x7FFF )
6917 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6919 return propagateFloat128NaN(a, a, status);
6921 return a;
6923 lastBitMask = 1;
6924 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
6925 roundBitsMask = lastBitMask - 1;
6926 z = a;
6927 switch (status->float_rounding_mode) {
6928 case float_round_nearest_even:
6929 if ( lastBitMask ) {
6930 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6931 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6933 else {
6934 if ( (int64_t) z.low < 0 ) {
6935 ++z.high;
6936 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
6939 break;
6940 case float_round_ties_away:
6941 if (lastBitMask) {
6942 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6943 } else {
6944 if ((int64_t) z.low < 0) {
6945 ++z.high;
6948 break;
6949 case float_round_to_zero:
6950 break;
6951 case float_round_up:
6952 if (!extractFloat128Sign(z)) {
6953 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6955 break;
6956 case float_round_down:
6957 if (extractFloat128Sign(z)) {
6958 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6960 break;
6961 default:
6962 abort();
6964 z.low &= ~ roundBitsMask;
6966 else {
6967 if ( aExp < 0x3FFF ) {
6968 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
6969 status->float_exception_flags |= float_flag_inexact;
6970 aSign = extractFloat128Sign( a );
6971 switch (status->float_rounding_mode) {
6972 case float_round_nearest_even:
6973 if ( ( aExp == 0x3FFE )
6974 && ( extractFloat128Frac0( a )
6975 | extractFloat128Frac1( a ) )
6977 return packFloat128( aSign, 0x3FFF, 0, 0 );
6979 break;
6980 case float_round_ties_away:
6981 if (aExp == 0x3FFE) {
6982 return packFloat128(aSign, 0x3FFF, 0, 0);
6984 break;
6985 case float_round_down:
6986 return
6987 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6988 : packFloat128( 0, 0, 0, 0 );
6989 case float_round_up:
6990 return
6991 aSign ? packFloat128( 1, 0, 0, 0 )
6992 : packFloat128( 0, 0x3FFF, 0, 0 );
6994 return packFloat128( aSign, 0, 0, 0 );
6996 lastBitMask = 1;
6997 lastBitMask <<= 0x402F - aExp;
6998 roundBitsMask = lastBitMask - 1;
6999 z.low = 0;
7000 z.high = a.high;
7001 switch (status->float_rounding_mode) {
7002 case float_round_nearest_even:
7003 z.high += lastBitMask>>1;
7004 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
7005 z.high &= ~ lastBitMask;
7007 break;
7008 case float_round_ties_away:
7009 z.high += lastBitMask>>1;
7010 break;
7011 case float_round_to_zero:
7012 break;
7013 case float_round_up:
7014 if (!extractFloat128Sign(z)) {
7015 z.high |= ( a.low != 0 );
7016 z.high += roundBitsMask;
7018 break;
7019 case float_round_down:
7020 if (extractFloat128Sign(z)) {
7021 z.high |= (a.low != 0);
7022 z.high += roundBitsMask;
7024 break;
7025 default:
7026 abort();
7028 z.high &= ~ roundBitsMask;
7030 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
7031 status->float_exception_flags |= float_flag_inexact;
7033 return z;
7037 /*----------------------------------------------------------------------------
7038 | Returns the result of adding the absolute values of the quadruple-precision
7039 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
7040 | before being returned. `zSign' is ignored if the result is a NaN.
7041 | The addition is performed according to the IEC/IEEE Standard for Binary
7042 | Floating-Point Arithmetic.
7043 *----------------------------------------------------------------------------*/
/*
 * Implementation outline: the two operands are split into fraction words and
 * exponent, the operand with the smaller exponent is shifted right so both
 * line up, the fractions are added, and the sum is handed to
 * roundAndPackFloat128() with sign `zSign'.  zSig2 receives the third output
 * word of shift128ExtraRightJamming() (presumably the shifted-out bits kept
 * for rounding -- confirm against that helper).
 */
7045 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
7046 float_status *status)
7048 int32_t aExp, bExp, zExp;
7049 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
7050 int32_t expDiff;
7052 aSig1 = extractFloat128Frac1( a );
7053 aSig0 = extractFloat128Frac0( a );
7054 aExp = extractFloat128Exp( a );
7055 bSig1 = extractFloat128Frac1( b );
7056 bSig0 = extractFloat128Frac0( b );
7057 bExp = extractFloat128Exp( b );
7058 expDiff = aExp - bExp;
/* Case 1: a has the larger exponent, so b is the operand that gets shifted. */
7059 if ( 0 < expDiff ) {
/* Exponent 0x7FFF with a non-zero fraction goes through NaN propagation;
   with a zero fraction a is returned unchanged. */
7060 if ( aExp == 0x7FFF ) {
7061 if (aSig0 | aSig1) {
7062 return propagateFloat128NaN(a, b, status);
7064 return a;
/* b with a zero exponent field: shift one place less and do not set bit 48. */
7066 if ( bExp == 0 ) {
7067 --expDiff;
/* Otherwise bit 48 is set in b's high word (presumably the implicit leading
   one of a normal number) before shifting. */
7069 else {
7070 bSig0 |= LIT64( 0x0001000000000000 );
7072 shift128ExtraRightJamming(
7073 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
7074 zExp = aExp;
/* Case 2: b has the larger exponent; mirror image of case 1 with a shifted. */
7076 else if ( expDiff < 0 ) {
/* Here a zero-fraction b with exponent 0x7FFF yields a freshly packed value
   with exponent 0x7FFF, zero fraction and sign zSign. */
7077 if ( bExp == 0x7FFF ) {
7078 if (bSig0 | bSig1) {
7079 return propagateFloat128NaN(a, b, status);
7081 return packFloat128( zSign, 0x7FFF, 0, 0 );
7083 if ( aExp == 0 ) {
7084 ++expDiff;
7086 else {
7087 aSig0 |= LIT64( 0x0001000000000000 );
7089 shift128ExtraRightJamming(
7090 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
7091 zExp = bExp;
/* Case 3: equal exponents, no alignment shift needed. */
7093 else {
7094 if ( aExp == 0x7FFF ) {
7095 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
7096 return propagateFloat128NaN(a, b, status);
7098 return a;
7100 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
/* Both exponent fields are zero: the raw fraction sum is packed with
   exponent 0 and no rounding step.  With flush_to_zero set, a signed zero is
   returned instead and float_flag_output_denormal is raised when the sum was
   non-zero. */
7101 if ( aExp == 0 ) {
7102 if (status->flush_to_zero) {
7103 if (zSig0 | zSig1) {
7104 float_raise(float_flag_output_denormal, status);
7106 return packFloat128(zSign, 0, 0, 0);
7108 return packFloat128( zSign, 0, zSig0, zSig1 );
/* Equal non-zero exponents: bit 49 is set in the sum (presumably standing in
   for the two bit-48 leading ones that were never ORed into the operands),
   then control jumps to the one-place right shift. */
7110 zSig2 = 0;
7111 zSig0 |= LIT64( 0x0002000000000000 );
7112 zExp = aExp;
7113 goto shiftRight1;
/* Cases 1 and 2 continue here.  Bit 48 is ORed into aSig0 whichever operand
   had the larger exponent; the two fractions are summed immediately after. */
7115 aSig0 |= LIT64( 0x0001000000000000 );
7116 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
/* The exponent is tentatively decremented.  If the sum stayed below bit 49
   it is rounded as is with the decremented exponent; otherwise the exponent
   is restored and the sum is shifted right one place first. */
7117 --zExp;
7118 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
7119 ++zExp;
7120 shiftRight1:
7121 shift128ExtraRightJamming(
7122 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
7123 roundAndPack:
7124 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
7128 /*----------------------------------------------------------------------------
7129 | Returns the result of subtracting the absolute values of the quadruple-
7130 | precision floating-point values `a' and `b'. If `zSign' is 1, the
7131 | difference is negated before being returned. `zSign' is ignored if the
7132 | result is a NaN. The subtraction is performed according to the IEC/IEEE
7133 | Standard for Binary Floating-Point Arithmetic.
7134 *----------------------------------------------------------------------------*/
/*
 * Implementation outline: both fractions are shifted left by 14 bits, the
 * operand with the smaller exponent is aligned by a jamming right shift, the
 * smaller magnitude is subtracted from the larger, and the difference goes
 * to normalizeRoundAndPackFloat128().  When b is the larger magnitude the
 * result sign is flipped (zSign ^= 1).  Labels aBigger/bBigger are entered
 * both by fall-through from the unequal-exponent paths and by goto from the
 * equal-exponent comparison.
 */
7136 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
7137 float_status *status)
7139 int32_t aExp, bExp, zExp;
7140 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
7141 int32_t expDiff;
7143 aSig1 = extractFloat128Frac1( a );
7144 aSig0 = extractFloat128Frac0( a );
7145 aExp = extractFloat128Exp( a );
7146 bSig1 = extractFloat128Frac1( b );
7147 bSig0 = extractFloat128Frac0( b );
7148 bExp = extractFloat128Exp( b );
7149 expDiff = aExp - bExp;
/* Shift both fractions left by 14; the final call passes zExp - 14
   (presumably compensating for this shift). */
7150 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
7151 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
7152 if ( 0 < expDiff ) goto aExpBigger;
7153 if ( expDiff < 0 ) goto bExpBigger;
/* Equal exponents from here.  With exponent 0x7FFF: any non-zero fraction
   goes through NaN propagation; two zero fractions raise the invalid flag
   and return the default NaN. */
7154 if ( aExp == 0x7FFF ) {
7155 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
7156 return propagateFloat128NaN(a, b, status);
7158 float_raise(float_flag_invalid, status);
7159 return float128_default_nan(status);
/* Zero exponent fields are treated as exponent 1 for the rest of the
   computation. */
7161 if ( aExp == 0 ) {
7162 aExp = 1;
7163 bExp = 1;
/* Compare the fractions, high word first, to find the larger magnitude. */
7165 if ( bSig0 < aSig0 ) goto aBigger;
7166 if ( aSig0 < bSig0 ) goto bBigger;
7167 if ( bSig1 < aSig1 ) goto aBigger;
7168 if ( aSig1 < bSig1 ) goto bBigger;
/* Identical magnitudes: the result is a zero whose sign bit is set only when
   the rounding mode is float_round_down. */
7169 return packFloat128(status->float_rounding_mode == float_round_down,
7170 0, 0, 0);
7171 bExpBigger:
/* b has the larger exponent.  Exponent 0x7FFF with zero fraction returns a
   value with exponent 0x7FFF, zero fraction and the opposite of zSign. */
7172 if ( bExp == 0x7FFF ) {
7173 if (bSig0 | bSig1) {
7174 return propagateFloat128NaN(a, b, status);
7176 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
/* a with a zero exponent field: shift one place less and leave bit 62 clear;
   otherwise set bit 62 (bit 48 moved up by the 14-bit shift above). */
7178 if ( aExp == 0 ) {
7179 ++expDiff;
7181 else {
7182 aSig0 |= LIT64( 0x4000000000000000 );
7184 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
7185 bSig0 |= LIT64( 0x4000000000000000 );
7186 bBigger:
/* b - a, using b's exponent, with the sign inverted. */
7187 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
7188 zExp = bExp;
7189 zSign ^= 1;
7190 goto normalizeRoundAndPack;
7191 aExpBigger:
/* a has the larger exponent.  Exponent 0x7FFF with zero fraction returns a
   itself. */
7192 if ( aExp == 0x7FFF ) {
7193 if (aSig0 | aSig1) {
7194 return propagateFloat128NaN(a, b, status);
7196 return a;
7198 if ( bExp == 0 ) {
7199 --expDiff;
7201 else {
7202 bSig0 |= LIT64( 0x4000000000000000 );
7204 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
7205 aSig0 |= LIT64( 0x4000000000000000 );
7206 aBigger:
/* a - b, using a's exponent; zSign is kept. */
7207 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
7208 zExp = aExp;
7209 normalizeRoundAndPack:
/* Both subtraction paths end here: exponent lowered by one, then by a
   further 14 in the call. */
7210 --zExp;
7211 return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
7212 status);
7216 /*----------------------------------------------------------------------------
7217 | Returns the result of adding the quadruple-precision floating-point values
7218 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
7219 | for Binary Floating-Point Arithmetic.
7220 *----------------------------------------------------------------------------*/
7222 float128 float128_add(float128 a, float128 b, float_status *status)
7224 flag aSign, bSign;
7226 aSign = extractFloat128Sign( a );
7227 bSign = extractFloat128Sign( b );
7228 if ( aSign == bSign ) {
7229 return addFloat128Sigs(a, b, aSign, status);
7231 else {
7232 return subFloat128Sigs(a, b, aSign, status);
7237 /*----------------------------------------------------------------------------
7238 | Returns the result of subtracting the quadruple-precision floating-point
7239 | values `a' and `b'. The operation is performed according to the IEC/IEEE
7240 | Standard for Binary Floating-Point Arithmetic.
7241 *----------------------------------------------------------------------------*/
7243 float128 float128_sub(float128 a, float128 b, float_status *status)
7245 flag aSign, bSign;
7247 aSign = extractFloat128Sign( a );
7248 bSign = extractFloat128Sign( b );
7249 if ( aSign == bSign ) {
7250 return subFloat128Sigs(a, b, aSign, status);
7252 else {
7253 return addFloat128Sigs(a, b, aSign, status);
7258 /*----------------------------------------------------------------------------
7259 | Returns the result of multiplying the quadruple-precision floating-point
7260 | values `a' and `b'. The operation is performed according to the IEC/IEEE
7261 | Standard for Binary Floating-Point Arithmetic.
7262 *----------------------------------------------------------------------------*/
/*
 * Implementation outline: special operands (exponent 0x7FFF, zeros) are
 * resolved first, operands with a zero exponent field are normalized, and
 * the two fractions are multiplied into a 256-bit product whose low words
 * are folded into zSig2 before rounding.  The result sign is the XOR of the
 * operand signs.
 */
7264 float128 float128_mul(float128 a, float128 b, float_status *status)
7266 flag aSign, bSign, zSign;
7267 int32_t aExp, bExp, zExp;
7268 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
7270 aSig1 = extractFloat128Frac1( a );
7271 aSig0 = extractFloat128Frac0( a );
7272 aExp = extractFloat128Exp( a );
7273 aSign = extractFloat128Sign( a );
7274 bSig1 = extractFloat128Frac1( b );
7275 bSig0 = extractFloat128Frac0( b );
7276 bExp = extractFloat128Exp( b );
7277 bSign = extractFloat128Sign( b );
7278 zSign = aSign ^ bSign;
/* a has exponent 0x7FFF: NaN propagation if a has a non-zero fraction or b
   is itself exponent 0x7FFF with a non-zero fraction.  Otherwise, if b is
   all-zero (exponent and fraction) the operation is invalid; any other b
   gives exponent 0x7FFF, zero fraction, sign zSign. */
7279 if ( aExp == 0x7FFF ) {
7280 if ( ( aSig0 | aSig1 )
7281 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
7282 return propagateFloat128NaN(a, b, status);
7284 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
7285 return packFloat128( zSign, 0x7FFF, 0, 0 );
/* Same treatment with the roles of a and b exchanged. */
7287 if ( bExp == 0x7FFF ) {
7288 if (bSig0 | bSig1) {
7289 return propagateFloat128NaN(a, b, status);
7291 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
/* Shared exit for both invalid combinations: raise the invalid flag and
   return the default NaN. */
7292 invalid:
7293 float_raise(float_flag_invalid, status);
7294 return float128_default_nan(status);
7296 return packFloat128( zSign, 0x7FFF, 0, 0 );
/* Zero exponent field: a zero fraction returns a signed zero, anything else
   is normalized (exponent and fraction updated in place). */
7298 if ( aExp == 0 ) {
7299 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
7300 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7302 if ( bExp == 0 ) {
7303 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
7304 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
7306 zExp = aExp + bExp - 0x4000;
/* Bit 48 is set in a only; b's fraction is shifted left by 16 without it. */
7307 aSig0 |= LIT64( 0x0001000000000000 );
7308 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
7309 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
/* a's significand is added to the upper 128 bits of the product (presumably
   accounting for the leading one that was not set in b before the multiply
   -- confirm). */
7310 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
/* Any non-zero bit in the lowest product word is folded into bit 0 of
   zSig2. */
7311 zSig2 |= ( zSig3 != 0 );
/* If the product reached bit 49, shift right one place and bump the
   exponent. */
7312 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
7313 shift128ExtraRightJamming(
7314 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
7315 ++zExp;
7317 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
7321 /*----------------------------------------------------------------------------
7322 | Returns the result of dividing the quadruple-precision floating-point value
7323 | `a' by the corresponding value `b'. The operation is performed according to
7324 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7325 *----------------------------------------------------------------------------*/
/*
 * Implementation outline: after the special cases, the quotient is built as
 * two 64-bit words.  Each word starts from estimateDiv128To64() and is then
 * corrected downward while the corresponding remainder is negative.  The
 * result sign is the XOR of the operand signs.
 */
7327 float128 float128_div(float128 a, float128 b, float_status *status)
7329 flag aSign, bSign, zSign;
7330 int32_t aExp, bExp, zExp;
7331 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
7332 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
7334 aSig1 = extractFloat128Frac1( a );
7335 aSig0 = extractFloat128Frac0( a );
7336 aExp = extractFloat128Exp( a );
7337 aSign = extractFloat128Sign( a );
7338 bSig1 = extractFloat128Frac1( b );
7339 bSig0 = extractFloat128Frac0( b );
7340 bExp = extractFloat128Exp( b );
7341 bSign = extractFloat128Sign( b );
7342 zSign = aSign ^ bSign;
/* a has exponent 0x7FFF: a non-zero fraction in a (or in b when b also has
   exponent 0x7FFF) goes through NaN propagation; both with zero fractions
   is invalid; otherwise the result has exponent 0x7FFF and zero fraction. */
7343 if ( aExp == 0x7FFF ) {
7344 if (aSig0 | aSig1) {
7345 return propagateFloat128NaN(a, b, status);
7347 if ( bExp == 0x7FFF ) {
7348 if (bSig0 | bSig1) {
7349 return propagateFloat128NaN(a, b, status);
7351 goto invalid;
7353 return packFloat128( zSign, 0x7FFF, 0, 0 );
/* Only b has exponent 0x7FFF: NaN propagation for a non-zero fraction,
   signed zero otherwise. */
7355 if ( bExp == 0x7FFF ) {
7356 if (bSig0 | bSig1) {
7357 return propagateFloat128NaN(a, b, status);
7359 return packFloat128( zSign, 0, 0, 0 );
/* b has a zero exponent field.  If b is zero: a zero dividend is invalid
   (default NaN); any other dividend raises float_flag_divbyzero and returns
   exponent 0x7FFF with zero fraction.  A non-zero b is normalized. */
7361 if ( bExp == 0 ) {
7362 if ( ( bSig0 | bSig1 ) == 0 ) {
7363 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
7364 invalid:
7365 float_raise(float_flag_invalid, status);
7366 return float128_default_nan(status);
7368 float_raise(float_flag_divbyzero, status);
7369 return packFloat128( zSign, 0x7FFF, 0, 0 );
7371 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
/* a has a zero exponent field: zero dividend gives signed zero, otherwise
   normalize. */
7373 if ( aExp == 0 ) {
7374 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
7375 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7377 zExp = aExp - bExp + 0x3FFD;
/* Set bit 48 in both significands and shift them left by 15, which moves
   that bit to bit 63 of the high word. */
7378 shortShift128Left(
7379 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
7380 shortShift128Left(
7381 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
/* When b <= a the dividend is halved and the exponent raised by one. */
7382 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
7383 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
7384 ++zExp;
/* High quotient word: estimate, compute remainder = a - b * estimate, and
   step the estimate down (adding b back) while the remainder is negative. */
7386 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
7387 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
7388 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
7389 while ( (int64_t) rem0 < 0 ) {
7390 --zSig0;
7391 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
/* Low quotient word: estimated from the remainder.  The exact correction
   below runs only when the low 14 bits of the estimate are <= 4 (presumably
   the only situation where the estimate's error could influence rounding --
   confirm against estimateDiv128To64's error bound). */
7393 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
7394 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
7395 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
7396 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
7397 while ( (int64_t) rem1 < 0 ) {
7398 --zSig1;
7399 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
/* A non-zero final remainder sets bit 0 of zSig1. */
7401 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
/* Shift the 128-bit quotient right by 15 (undoing the earlier left shift),
   with the third output word going to zSig2, then round and pack. */
7403 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
7404 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
7408 /*----------------------------------------------------------------------------
7409 | Returns the remainder of the quadruple-precision floating-point value `a'
7410 | with respect to the corresponding value `b'. The operation is performed
7411 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7412 *----------------------------------------------------------------------------*/
/*
 * Implementation outline: after the special cases, a's significand is
 * reduced against b's in chunks (a loop consuming 61 bits of exponent
 * difference per pass, then one final partial chunk).  The last step picks
 * between the final non-negative partial remainder and the first negative
 * one.  The result sign is a's sign XORed with the sign of the chosen
 * remainder.
 */
7414 float128 float128_rem(float128 a, float128 b, float_status *status)
7416 flag aSign, zSign;
7417 int32_t aExp, bExp, expDiff;
7418 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
7419 uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
7420 int64_t sigMean0;
7422 aSig1 = extractFloat128Frac1( a );
7423 aSig0 = extractFloat128Frac0( a );
7424 aExp = extractFloat128Exp( a );
7425 aSign = extractFloat128Sign( a );
7426 bSig1 = extractFloat128Frac1( b );
7427 bSig0 = extractFloat128Frac0( b );
7428 bExp = extractFloat128Exp( b );
/* a has exponent 0x7FFF: NaN propagation if either operand has exponent
   0x7FFF with a non-zero fraction, otherwise the operation is invalid. */
7429 if ( aExp == 0x7FFF ) {
7430 if ( ( aSig0 | aSig1 )
7431 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
7432 return propagateFloat128NaN(a, b, status);
7434 goto invalid;
/* b has exponent 0x7FFF: NaN propagation for a non-zero fraction, otherwise
   a is returned unchanged. */
7436 if ( bExp == 0x7FFF ) {
7437 if (bSig0 | bSig1) {
7438 return propagateFloat128NaN(a, b, status);
7440 return a;
/* b with a zero exponent field: a zero b is invalid (flag raised, default
   NaN returned); a non-zero b is normalized. */
7442 if ( bExp == 0 ) {
7443 if ( ( bSig0 | bSig1 ) == 0 ) {
7444 invalid:
7445 float_raise(float_flag_invalid, status);
7446 return float128_default_nan(status);
7448 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
/* a with a zero exponent field: a zero a is returned as is; otherwise
   normalize. */
7450 if ( aExp == 0 ) {
7451 if ( ( aSig0 | aSig1 ) == 0 ) return a;
7452 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7454 expDiff = aExp - bExp;
/* a's exponent is more than one below b's: a is returned unchanged. */
7455 if ( expDiff < -1 ) return a;
/* Set bit 48 and shift a left by 15, or by 14 when expDiff is -1. */
7456 shortShift128Left(
7457 aSig0 | LIT64( 0x0001000000000000 ),
7458 aSig1,
7459 15 - ( expDiff < 0 ),
7460 &aSig0,
7461 &aSig1
7463 shortShift128Left(
7464 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
/* First quotient bit: 1 when b <= a, in which case b is subtracted once. */
7465 q = le128( bSig0, bSig1, aSig0, aSig1 );
7466 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
7467 expDiff -= 64;
/* Main reduction loop.  Each pass lowers the quotient estimate by 4 (floored
   at 0), subtracts estimate * b from the shifted remainder, and consumes 61
   bits of the exponent difference.  The lowered estimate presumably
   guarantees it never exceeds the true quotient -- confirm against
   estimateDiv128To64. */
7468 while ( 0 < expDiff ) {
7469 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
7470 q = ( 4 < q ) ? q - 4 : 0;
7471 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
7472 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
7473 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
7474 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
7475 expDiff -= 61;
/* Final partial chunk (expDiff is now in the range -63..0): the estimate is
   shifted right by -expDiff, b is shifted right by 12, and a is shifted
   right or left by the adjusted expDiff before estimate * b is
   subtracted. */
7477 if ( -64 < expDiff ) {
7478 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
7479 q = ( 4 < q ) ? q - 4 : 0;
7480 q >>= - expDiff;
7481 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
7482 expDiff += 52;
7483 if ( expDiff < 0 ) {
7484 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
7486 else {
7487 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
7489 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
7490 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
/* No partial chunk: both significands are simply shifted right by 12; q
   keeps the value it had before. */
7492 else {
7493 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
7494 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
/* Keep subtracting b, counting in q, until the remainder turns negative.
   alternateASig holds the last value seen before that subtraction. */
7496 do {
7497 alternateASig0 = aSig0;
7498 alternateASig1 = aSig1;
7499 ++q;
7500 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
7501 } while ( 0 <= (int64_t) aSig0 );
/* sigMean = negative remainder + last non-negative remainder.  The
   non-negative one is selected when the sum is negative, or when the sum is
   exactly zero and q is odd; otherwise the negative one is kept. */
7502 add128(
7503 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
7504 if ( ( sigMean0 < 0 )
7505 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
7506 aSig0 = alternateASig0;
7507 aSig1 = alternateASig1;
/* A negative chosen remainder is negated and flips the result sign. */
7509 zSign = ( (int64_t) aSig0 < 0 );
7510 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
7511 return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
7512 status);
7515 /*----------------------------------------------------------------------------
7516 | Returns the square root of the quadruple-precision floating-point value `a'.
7517 | The operation is performed according to the IEC/IEEE Standard for Binary
7518 | Floating-Point Arithmetic.
7519 *----------------------------------------------------------------------------*/
/*
 * Implementation outline: after the special cases, the root is built as two
 * 64-bit words.  The high word starts from estimateSqrt32() refined by one
 * estimateDiv128To64() step, the low word from a further division estimate.
 * Each is corrected downward while its remainder is negative.  The result
 * is always packed with sign 0.
 */
7521 float128 float128_sqrt(float128 a, float_status *status)
7523 flag aSign;
7524 int32_t aExp, zExp;
7525 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
7526 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
7528 aSig1 = extractFloat128Frac1( a );
7529 aSig0 = extractFloat128Frac0( a );
7530 aExp = extractFloat128Exp( a );
7531 aSign = extractFloat128Sign( a );
/* Exponent 0x7FFF: NaN propagation for a non-zero fraction; with a zero
   fraction a is returned when its sign is clear, otherwise the operation is
   invalid. */
7532 if ( aExp == 0x7FFF ) {
7533 if (aSig0 | aSig1) {
7534 return propagateFloat128NaN(a, a, status);
7536 if ( ! aSign ) return a;
7537 goto invalid;
/* Sign set: an all-zero exponent and fraction is returned unchanged; every
   other negative input raises the invalid flag and yields the default
   NaN. */
7539 if ( aSign ) {
7540 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
7541 invalid:
7542 float_raise(float_flag_invalid, status);
7543 return float128_default_nan(status);
/* Zero exponent field: zero fraction returns +0, anything else is
   normalized. */
7545 if ( aExp == 0 ) {
7546 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
7547 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
/* Result exponent: half of (aExp - 0x3FFF), re-offset by 0x3FFE. */
7549 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
7550 aSig0 |= LIT64( 0x0001000000000000 );
7551 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
/* The significand is shifted left by 13, or by 12 when aExp is odd. */
7552 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
7553 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
7554 doubleZSig0 = zSig0<<1;
/* Remainder = a - zSig0 * zSig0; while it is negative, zSig0 is stepped
   down by one and the remainder is adjusted to match. */
7555 mul64To128( zSig0, zSig0, &term0, &term1 );
7556 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
7557 while ( (int64_t) rem0 < 0 ) {
7558 --zSig0;
7559 doubleZSig0 -= 2;
7560 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
/* Low word estimate.  The exact correction below runs only when the low 13
   bits of the estimate are <= 5 (presumably the only situation where the
   estimate's error could influence rounding -- confirm). */
7562 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
7563 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
/* An estimate of 0 is replaced by 1 before the correction. */
7564 if ( zSig1 == 0 ) zSig1 = 1;
7565 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
7566 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
7567 mul64To128( zSig1, zSig1, &term2, &term3 );
7568 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
7569 while ( (int64_t) rem1 < 0 ) {
7570 --zSig1;
7571 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
7572 term3 |= 1;
7573 term2 |= doubleZSig0;
7574 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
/* A non-zero final remainder sets bit 0 of zSig1. */
7576 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
/* Shift the 128-bit root right by 14, third output word to zSig2, then
   round and pack with sign 0. */
7578 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
7579 return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
7583 /*----------------------------------------------------------------------------
7584 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
7585 | the corresponding value `b', and 0 otherwise. The invalid exception is
7586 | raised if either operand is a NaN. Otherwise, the comparison is performed
7587 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7588 *----------------------------------------------------------------------------*/
7590 int float128_eq(float128 a, float128 b, float_status *status)
7593 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
7594 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7595 || ( ( extractFloat128Exp( b ) == 0x7FFF )
7596 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7598 float_raise(float_flag_invalid, status);
7599 return 0;
7601 return
7602 ( a.low == b.low )
7603 && ( ( a.high == b.high )
7604 || ( ( a.low == 0 )
7605 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
7610 /*----------------------------------------------------------------------------
7611 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7612 | or equal to the corresponding value `b', and 0 otherwise. The invalid
7613 | exception is raised if either operand is a NaN. The comparison is performed
7614 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7615 *----------------------------------------------------------------------------*/
7617 int float128_le(float128 a, float128 b, float_status *status)
7619 flag aSign, bSign;
7621 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
7622 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7623 || ( ( extractFloat128Exp( b ) == 0x7FFF )
7624 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7626 float_raise(float_flag_invalid, status);
7627 return 0;
7629 aSign = extractFloat128Sign( a );
7630 bSign = extractFloat128Sign( b );
7631 if ( aSign != bSign ) {
7632 return
7633 aSign
7634 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7635 == 0 );
7637 return
7638 aSign ? le128( b.high, b.low, a.high, a.low )
7639 : le128( a.high, a.low, b.high, b.low );
7643 /*----------------------------------------------------------------------------
7644 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7645 | the corresponding value `b', and 0 otherwise. The invalid exception is
7646 | raised if either operand is a NaN. The comparison is performed according
7647 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7648 *----------------------------------------------------------------------------*/
7650 int float128_lt(float128 a, float128 b, float_status *status)
7652 flag aSign, bSign;
7654 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
7655 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7656 || ( ( extractFloat128Exp( b ) == 0x7FFF )
7657 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7659 float_raise(float_flag_invalid, status);
7660 return 0;
7662 aSign = extractFloat128Sign( a );
7663 bSign = extractFloat128Sign( b );
7664 if ( aSign != bSign ) {
7665 return
7666 aSign
7667 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7668 != 0 );
7670 return
7671 aSign ? lt128( b.high, b.low, a.high, a.low )
7672 : lt128( a.high, a.low, b.high, b.low );
7676 /*----------------------------------------------------------------------------
7677 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7678 | be compared, and 0 otherwise. The invalid exception is raised if either
7679 | operand is a NaN. The comparison is performed according to the IEC/IEEE
7680 | Standard for Binary Floating-Point Arithmetic.
7681 *----------------------------------------------------------------------------*/
7683 int float128_unordered(float128 a, float128 b, float_status *status)
7685 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
7686 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7687 || ( ( extractFloat128Exp( b ) == 0x7FFF )
7688 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7690 float_raise(float_flag_invalid, status);
7691 return 1;
7693 return 0;
7696 /*----------------------------------------------------------------------------
7697 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
7698 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
7699 | exception. The comparison is performed according to the IEC/IEEE Standard
7700 | for Binary Floating-Point Arithmetic.
7701 *----------------------------------------------------------------------------*/
7703 int float128_eq_quiet(float128 a, float128 b, float_status *status)
7706 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
7707 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7708 || ( ( extractFloat128Exp( b ) == 0x7FFF )
7709 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7711 if (float128_is_signaling_nan(a, status)
7712 || float128_is_signaling_nan(b, status)) {
7713 float_raise(float_flag_invalid, status);
7715 return 0;
7717 return
7718 ( a.low == b.low )
7719 && ( ( a.high == b.high )
7720 || ( ( a.low == 0 )
7721 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
7726 /*----------------------------------------------------------------------------
7727 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7728 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
7729 | cause an exception. Otherwise, the comparison is performed according to the
7730 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7731 *----------------------------------------------------------------------------*/
7733 int float128_le_quiet(float128 a, float128 b, float_status *status)
7735 flag aSign, bSign;
7737 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
7738 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7739 || ( ( extractFloat128Exp( b ) == 0x7FFF )
7740 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7742 if (float128_is_signaling_nan(a, status)
7743 || float128_is_signaling_nan(b, status)) {
7744 float_raise(float_flag_invalid, status);
7746 return 0;
7748 aSign = extractFloat128Sign( a );
7749 bSign = extractFloat128Sign( b );
7750 if ( aSign != bSign ) {
7751 return
7752 aSign
7753 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7754 == 0 );
7756 return
7757 aSign ? le128( b.high, b.low, a.high, a.low )
7758 : le128( a.high, a.low, b.high, b.low );
7762 /*----------------------------------------------------------------------------
7763 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7764 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
7765 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
7766 | Standard for Binary Floating-Point Arithmetic.
7767 *----------------------------------------------------------------------------*/
7769 int float128_lt_quiet(float128 a, float128 b, float_status *status)
7771 flag aSign, bSign;
7773 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
7774 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7775 || ( ( extractFloat128Exp( b ) == 0x7FFF )
7776 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7778 if (float128_is_signaling_nan(a, status)
7779 || float128_is_signaling_nan(b, status)) {
7780 float_raise(float_flag_invalid, status);
7782 return 0;
7784 aSign = extractFloat128Sign( a );
7785 bSign = extractFloat128Sign( b );
7786 if ( aSign != bSign ) {
7787 return
7788 aSign
7789 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7790 != 0 );
7792 return
7793 aSign ? lt128( b.high, b.low, a.high, a.low )
7794 : lt128( a.high, a.low, b.high, b.low );
7798 /*----------------------------------------------------------------------------
7799 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7800 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
7801 | comparison is performed according to the IEC/IEEE Standard for Binary
7802 | Floating-Point Arithmetic.
7803 *----------------------------------------------------------------------------*/
7805 int float128_unordered_quiet(float128 a, float128 b, float_status *status)
7807 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
7808 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7809 || ( ( extractFloat128Exp( b ) == 0x7FFF )
7810 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7812 if (float128_is_signaling_nan(a, status)
7813 || float128_is_signaling_nan(b, status)) {
7814 float_raise(float_flag_invalid, status);
7816 return 1;
7818 return 0;
7821 static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
7822 int is_quiet, float_status *status)
7824 flag aSign, bSign;
7826 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
7827 float_raise(float_flag_invalid, status);
7828 return float_relation_unordered;
7830 if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7831 ( extractFloatx80Frac( a )<<1 ) ) ||
7832 ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7833 ( extractFloatx80Frac( b )<<1 ) )) {
7834 if (!is_quiet ||
7835 floatx80_is_signaling_nan(a, status) ||
7836 floatx80_is_signaling_nan(b, status)) {
7837 float_raise(float_flag_invalid, status);
7839 return float_relation_unordered;
7841 aSign = extractFloatx80Sign( a );
7842 bSign = extractFloatx80Sign( b );
7843 if ( aSign != bSign ) {
7845 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7846 ( ( a.low | b.low ) == 0 ) ) {
7847 /* zero case */
7848 return float_relation_equal;
7849 } else {
7850 return 1 - (2 * aSign);
7852 } else {
7853 if (a.low == b.low && a.high == b.high) {
7854 return float_relation_equal;
7855 } else {
7856 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7861 int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
7863 return floatx80_compare_internal(a, b, 0, status);
7866 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
7868 return floatx80_compare_internal(a, b, 1, status);
7871 static inline int float128_compare_internal(float128 a, float128 b,
7872 int is_quiet, float_status *status)
7874 flag aSign, bSign;
7876 if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7877 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7878 ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7879 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7880 if (!is_quiet ||
7881 float128_is_signaling_nan(a, status) ||
7882 float128_is_signaling_nan(b, status)) {
7883 float_raise(float_flag_invalid, status);
7885 return float_relation_unordered;
7887 aSign = extractFloat128Sign( a );
7888 bSign = extractFloat128Sign( b );
7889 if ( aSign != bSign ) {
7890 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7891 /* zero case */
7892 return float_relation_equal;
7893 } else {
7894 return 1 - (2 * aSign);
7896 } else {
7897 if (a.low == b.low && a.high == b.high) {
7898 return float_relation_equal;
7899 } else {
7900 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7905 int float128_compare(float128 a, float128 b, float_status *status)
7907 return float128_compare_internal(a, b, 0, status);
7910 int float128_compare_quiet(float128 a, float128 b, float_status *status)
7912 return float128_compare_internal(a, b, 1, status);
7915 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7917 flag aSign;
7918 int32_t aExp;
7919 uint64_t aSig;
7921 if (floatx80_invalid_encoding(a)) {
7922 float_raise(float_flag_invalid, status);
7923 return floatx80_default_nan(status);
7925 aSig = extractFloatx80Frac( a );
7926 aExp = extractFloatx80Exp( a );
7927 aSign = extractFloatx80Sign( a );
7929 if ( aExp == 0x7FFF ) {
7930 if ( aSig<<1 ) {
7931 return propagateFloatx80NaN(a, a, status);
7933 return a;
7936 if (aExp == 0) {
7937 if (aSig == 0) {
7938 return a;
7940 aExp++;
7943 if (n > 0x10000) {
7944 n = 0x10000;
7945 } else if (n < -0x10000) {
7946 n = -0x10000;
7949 aExp += n;
7950 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7951 aSign, aExp, aSig, 0, status);
7954 float128 float128_scalbn(float128 a, int n, float_status *status)
7956 flag aSign;
7957 int32_t aExp;
7958 uint64_t aSig0, aSig1;
7960 aSig1 = extractFloat128Frac1( a );
7961 aSig0 = extractFloat128Frac0( a );
7962 aExp = extractFloat128Exp( a );
7963 aSign = extractFloat128Sign( a );
7964 if ( aExp == 0x7FFF ) {
7965 if ( aSig0 | aSig1 ) {
7966 return propagateFloat128NaN(a, a, status);
7968 return a;
7970 if (aExp != 0) {
7971 aSig0 |= LIT64( 0x0001000000000000 );
7972 } else if (aSig0 == 0 && aSig1 == 0) {
7973 return a;
7974 } else {
7975 aExp++;
7978 if (n > 0x10000) {
7979 n = 0x10000;
7980 } else if (n < -0x10000) {
7981 n = -0x10000;
7984 aExp += n - 1;
7985 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7986 , status);
7990 static void __attribute__((constructor)) softfloat_init(void)
7992 union_float64 ua, ub, uc, ur;
7994 if (QEMU_NO_HARDFLOAT) {
7995 return;
7998 * Test that the host's FMA is not obviously broken. For example,
7999 * glibc < 2.23 can perform an incorrect FMA on certain hosts; see
8000 * https://sourceware.org/bugzilla/show_bug.cgi?id=13304
8002 ua.s = 0x0020000000000001ULL;
8003 ub.s = 0x3ca0000000000000ULL;
8004 uc.s = 0x0020000000000000ULL;
8005 ur.h = fma(ua.h, ub.h, uc.h);
8006 if (ur.s != 0x0020000000000001ULL) {
8007 force_soft_fma = true;