target/mips: Style improvements in mips_malta.c
[qemu/ar7.git] / fpu / softfloat.c
blob0638c9f4e0304309c235489acdb2c69fd33cb482
1 /*
2 * QEMU float support
4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
10 * the BSD license
11 * GPL-v2-or-later
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
44 ===============================================================================
47 /* BSD licensing:
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
85 #include "qemu/osdep.h"
86 #include <math.h>
87 #include "qemu/bitops.h"
88 #include "fpu/softfloat.h"
90 /* We only need stdlib for abort() */
92 /*----------------------------------------------------------------------------
93 | Primitive arithmetic functions, including multi-word arithmetic, and
94 | division and square root approximations. (Can be specialized to target if
95 | desired.)
96 *----------------------------------------------------------------------------*/
97 #include "fpu/softfloat-macros.h"
100 * Hardfloat
102 * Fast emulation of guest FP instructions is challenging for two reasons.
103 * First, FP instruction semantics are similar but not identical, particularly
104 * when handling NaNs. Second, emulating at reasonable speed the guest FP
105 * exception flags is not trivial: reading the host's flags register with a
106 * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp],
107 * and trapping on every FP exception is not fast nor pleasant to work with.
109 * We address these challenges by leveraging the host FPU for a subset of the
110 * operations. To do this we expand on the idea presented in this paper:
112 * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a
113 * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615.
115 * The idea is thus to leverage the host FPU to (1) compute FP operations
116 * and (2) identify whether FP exceptions occurred while avoiding
117 * expensive exception flag register accesses.
119 * An important optimization shown in the paper is that given that exception
120 * flags are rarely cleared by the guest, we can avoid recomputing some flags.
121 * This is particularly useful for the inexact flag, which is very frequently
122 * raised in floating-point workloads.
124 * We optimize the code further by deferring to soft-fp whenever FP exception
125 * detection might get hairy. Two examples: (1) when at least one operand is
126 * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result
127 * and the result is < the minimum normal.
129 #define GEN_INPUT_FLUSH__NOCHECK(name, soft_t) \
130 static inline void name(soft_t *a, float_status *s) \
132 if (unlikely(soft_t ## _is_denormal(*a))) { \
133 *a = soft_t ## _set_sign(soft_t ## _zero, \
134 soft_t ## _is_neg(*a)); \
135 s->float_exception_flags |= float_flag_input_denormal; \
139 GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32)
140 GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64)
141 #undef GEN_INPUT_FLUSH__NOCHECK
143 #define GEN_INPUT_FLUSH1(name, soft_t) \
144 static inline void name(soft_t *a, float_status *s) \
146 if (likely(!s->flush_inputs_to_zero)) { \
147 return; \
149 soft_t ## _input_flush__nocheck(a, s); \
152 GEN_INPUT_FLUSH1(float32_input_flush1, float32)
153 GEN_INPUT_FLUSH1(float64_input_flush1, float64)
154 #undef GEN_INPUT_FLUSH1
156 #define GEN_INPUT_FLUSH2(name, soft_t) \
157 static inline void name(soft_t *a, soft_t *b, float_status *s) \
159 if (likely(!s->flush_inputs_to_zero)) { \
160 return; \
162 soft_t ## _input_flush__nocheck(a, s); \
163 soft_t ## _input_flush__nocheck(b, s); \
166 GEN_INPUT_FLUSH2(float32_input_flush2, float32)
167 GEN_INPUT_FLUSH2(float64_input_flush2, float64)
168 #undef GEN_INPUT_FLUSH2
170 #define GEN_INPUT_FLUSH3(name, soft_t) \
171 static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
173 if (likely(!s->flush_inputs_to_zero)) { \
174 return; \
176 soft_t ## _input_flush__nocheck(a, s); \
177 soft_t ## _input_flush__nocheck(b, s); \
178 soft_t ## _input_flush__nocheck(c, s); \
181 GEN_INPUT_FLUSH3(float32_input_flush3, float32)
182 GEN_INPUT_FLUSH3(float64_input_flush3, float64)
183 #undef GEN_INPUT_FLUSH3
186 * Choose whether to use fpclassify or float32/64_* primitives in the generated
187 * hardfloat functions. Each combination of number of inputs and float size
188 * gets its own value.
190 #if defined(__x86_64__)
191 # define QEMU_HARDFLOAT_1F32_USE_FP 0
192 # define QEMU_HARDFLOAT_1F64_USE_FP 1
193 # define QEMU_HARDFLOAT_2F32_USE_FP 0
194 # define QEMU_HARDFLOAT_2F64_USE_FP 1
195 # define QEMU_HARDFLOAT_3F32_USE_FP 0
196 # define QEMU_HARDFLOAT_3F64_USE_FP 1
197 #else
198 # define QEMU_HARDFLOAT_1F32_USE_FP 0
199 # define QEMU_HARDFLOAT_1F64_USE_FP 0
200 # define QEMU_HARDFLOAT_2F32_USE_FP 0
201 # define QEMU_HARDFLOAT_2F64_USE_FP 0
202 # define QEMU_HARDFLOAT_3F32_USE_FP 0
203 # define QEMU_HARDFLOAT_3F64_USE_FP 0
204 #endif
207 * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over
208 * float{32,64}_is_infinity when !USE_FP.
209 * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup.
210 * On power64 however, using isinf() reduces fp-bench performance by up to 50%.
212 #if defined(__x86_64__) || defined(__aarch64__)
213 # define QEMU_HARDFLOAT_USE_ISINF 1
214 #else
215 # define QEMU_HARDFLOAT_USE_ISINF 0
216 #endif
219 * Some targets clear the FP flags before most FP operations. This prevents
220 * the use of hardfloat, since hardfloat relies on the inexact flag being
221 * already set.
223 #if defined(TARGET_PPC) || defined(__FAST_MATH__)
224 # if defined(__FAST_MATH__)
225 # warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
226 IEEE implementation
227 # endif
228 # define QEMU_NO_HARDFLOAT 1
229 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
230 #else
231 # define QEMU_NO_HARDFLOAT 0
232 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
233 #endif
235 static inline bool can_use_fpu(const float_status *s)
237 if (QEMU_NO_HARDFLOAT) {
238 return false;
240 return likely(s->float_exception_flags & float_flag_inexact &&
241 s->float_rounding_mode == float_round_nearest_even);
245 * Hardfloat generation functions. Each operation can have two flavors:
246 * either using softfloat primitives (e.g. float32_is_zero_or_normal) for
247 * most condition checks, or native ones (e.g. fpclassify).
249 * The flavor is chosen by the callers. Instead of using macros, we rely on the
250 * compiler to propagate constants and inline everything into the callers.
252 * We only generate functions for operations with two inputs, since only
253 * these are common enough to justify consolidating them into common code.
256 typedef union {
257 float32 s;
258 float h;
259 } union_float32;
261 typedef union {
262 float64 s;
263 double h;
264 } union_float64;
266 typedef bool (*f32_check_fn)(union_float32 a, union_float32 b);
267 typedef bool (*f64_check_fn)(union_float64 a, union_float64 b);
269 typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s);
270 typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s);
271 typedef float (*hard_f32_op2_fn)(float a, float b);
272 typedef double (*hard_f64_op2_fn)(double a, double b);
274 /* 2-input is-zero-or-normal */
275 static inline bool f32_is_zon2(union_float32 a, union_float32 b)
277 if (QEMU_HARDFLOAT_2F32_USE_FP) {
279 * Not using a temp variable for consecutive fpclassify calls ends up
280 * generating faster code.
282 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
283 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
285 return float32_is_zero_or_normal(a.s) &&
286 float32_is_zero_or_normal(b.s);
289 static inline bool f64_is_zon2(union_float64 a, union_float64 b)
291 if (QEMU_HARDFLOAT_2F64_USE_FP) {
292 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
293 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
295 return float64_is_zero_or_normal(a.s) &&
296 float64_is_zero_or_normal(b.s);
299 /* 3-input is-zero-or-normal */
300 static inline
301 bool f32_is_zon3(union_float32 a, union_float32 b, union_float32 c)
303 if (QEMU_HARDFLOAT_3F32_USE_FP) {
304 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
305 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
306 (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
308 return float32_is_zero_or_normal(a.s) &&
309 float32_is_zero_or_normal(b.s) &&
310 float32_is_zero_or_normal(c.s);
313 static inline
314 bool f64_is_zon3(union_float64 a, union_float64 b, union_float64 c)
316 if (QEMU_HARDFLOAT_3F64_USE_FP) {
317 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
318 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
319 (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
321 return float64_is_zero_or_normal(a.s) &&
322 float64_is_zero_or_normal(b.s) &&
323 float64_is_zero_or_normal(c.s);
326 static inline bool f32_is_inf(union_float32 a)
328 if (QEMU_HARDFLOAT_USE_ISINF) {
329 return isinf(a.h);
331 return float32_is_infinity(a.s);
334 static inline bool f64_is_inf(union_float64 a)
336 if (QEMU_HARDFLOAT_USE_ISINF) {
337 return isinf(a.h);
339 return float64_is_infinity(a.s);
342 /* Note: @fast_test and @post can be NULL */
343 static inline float32
344 float32_gen2(float32 xa, float32 xb, float_status *s,
345 hard_f32_op2_fn hard, soft_f32_op2_fn soft,
346 f32_check_fn pre, f32_check_fn post,
347 f32_check_fn fast_test, soft_f32_op2_fn fast_op)
349 union_float32 ua, ub, ur;
351 ua.s = xa;
352 ub.s = xb;
354 if (unlikely(!can_use_fpu(s))) {
355 goto soft;
358 float32_input_flush2(&ua.s, &ub.s, s);
359 if (unlikely(!pre(ua, ub))) {
360 goto soft;
362 if (fast_test && fast_test(ua, ub)) {
363 return fast_op(ua.s, ub.s, s);
366 ur.h = hard(ua.h, ub.h);
367 if (unlikely(f32_is_inf(ur))) {
368 s->float_exception_flags |= float_flag_overflow;
369 } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
370 if (post == NULL || post(ua, ub)) {
371 goto soft;
374 return ur.s;
376 soft:
377 return soft(ua.s, ub.s, s);
380 static inline float64
381 float64_gen2(float64 xa, float64 xb, float_status *s,
382 hard_f64_op2_fn hard, soft_f64_op2_fn soft,
383 f64_check_fn pre, f64_check_fn post,
384 f64_check_fn fast_test, soft_f64_op2_fn fast_op)
386 union_float64 ua, ub, ur;
388 ua.s = xa;
389 ub.s = xb;
391 if (unlikely(!can_use_fpu(s))) {
392 goto soft;
395 float64_input_flush2(&ua.s, &ub.s, s);
396 if (unlikely(!pre(ua, ub))) {
397 goto soft;
399 if (fast_test && fast_test(ua, ub)) {
400 return fast_op(ua.s, ub.s, s);
403 ur.h = hard(ua.h, ub.h);
404 if (unlikely(f64_is_inf(ur))) {
405 s->float_exception_flags |= float_flag_overflow;
406 } else if (unlikely(fabs(ur.h) <= DBL_MIN)) {
407 if (post == NULL || post(ua, ub)) {
408 goto soft;
411 return ur.s;
413 soft:
414 return soft(ua.s, ub.s, s);
417 /*----------------------------------------------------------------------------
418 | Returns the fraction bits of the single-precision floating-point value `a'.
419 *----------------------------------------------------------------------------*/
421 static inline uint32_t extractFloat32Frac(float32 a)
423 return float32_val(a) & 0x007FFFFF;
426 /*----------------------------------------------------------------------------
427 | Returns the exponent bits of the single-precision floating-point value `a'.
428 *----------------------------------------------------------------------------*/
430 static inline int extractFloat32Exp(float32 a)
432 return (float32_val(a) >> 23) & 0xFF;
435 /*----------------------------------------------------------------------------
436 | Returns the sign bit of the single-precision floating-point value `a'.
437 *----------------------------------------------------------------------------*/
439 static inline flag extractFloat32Sign(float32 a)
441 return float32_val(a) >> 31;
444 /*----------------------------------------------------------------------------
445 | Returns the fraction bits of the double-precision floating-point value `a'.
446 *----------------------------------------------------------------------------*/
448 static inline uint64_t extractFloat64Frac(float64 a)
450 return float64_val(a) & UINT64_C(0x000FFFFFFFFFFFFF);
453 /*----------------------------------------------------------------------------
454 | Returns the exponent bits of the double-precision floating-point value `a'.
455 *----------------------------------------------------------------------------*/
457 static inline int extractFloat64Exp(float64 a)
459 return (float64_val(a) >> 52) & 0x7FF;
462 /*----------------------------------------------------------------------------
463 | Returns the sign bit of the double-precision floating-point value `a'.
464 *----------------------------------------------------------------------------*/
466 static inline flag extractFloat64Sign(float64 a)
468 return float64_val(a) >> 63;
472 * Classify a floating point number. Everything above float_class_qnan
473 * is a NaN so cls >= float_class_qnan is any NaN.
476 typedef enum __attribute__ ((__packed__)) {
477 float_class_unclassified,
478 float_class_zero,
479 float_class_normal,
480 float_class_inf,
481 float_class_qnan, /* all NaNs from here */
482 float_class_snan,
483 } FloatClass;
485 /* Simple helpers for checking if, or what kind of, NaN we have */
486 static inline __attribute__((unused)) bool is_nan(FloatClass c)
488 return unlikely(c >= float_class_qnan);
491 static inline __attribute__((unused)) bool is_snan(FloatClass c)
493 return c == float_class_snan;
496 static inline __attribute__((unused)) bool is_qnan(FloatClass c)
498 return c == float_class_qnan;
502 * Structure holding all of the decomposed parts of a float. The
503 * exponent is unbiased and the fraction is normalized. All
504 * calculations are done with a 64 bit fraction and then rounded as
505 * appropriate for the final format.
507 * Thanks to the packed FloatClass a decent compiler should be able to
508 * fit the whole structure into registers and avoid using the stack
509 * for parameter passing.
512 typedef struct {
513 uint64_t frac;
514 int32_t exp;
515 FloatClass cls;
516 bool sign;
517 } FloatParts;
519 #define DECOMPOSED_BINARY_POINT (64 - 2)
520 #define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT)
521 #define DECOMPOSED_OVERFLOW_BIT (DECOMPOSED_IMPLICIT_BIT << 1)
523 /* Structure holding all of the relevant parameters for a format.
524 * exp_size: the size of the exponent field
525 * exp_bias: the offset applied to the exponent field
526 * exp_max: the maximum normalised exponent
527 * frac_size: the size of the fraction field
528 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
529 * The following are computed based the size of fraction
530 * frac_lsb: least significant bit of fraction
531 * frac_lsbm1: the bit below the least significant bit (for rounding)
532 * round_mask/roundeven_mask: masks used for rounding
533 * The following optional modifiers are available:
534 * arm_althp: handle ARM Alternative Half Precision
536 typedef struct {
537 int exp_size;
538 int exp_bias;
539 int exp_max;
540 int frac_size;
541 int frac_shift;
542 uint64_t frac_lsb;
543 uint64_t frac_lsbm1;
544 uint64_t round_mask;
545 uint64_t roundeven_mask;
546 bool arm_althp;
547 } FloatFmt;
549 /* Expand fields based on the size of exponent and fraction */
550 #define FLOAT_PARAMS(E, F) \
551 .exp_size = E, \
552 .exp_bias = ((1 << E) - 1) >> 1, \
553 .exp_max = (1 << E) - 1, \
554 .frac_size = F, \
555 .frac_shift = DECOMPOSED_BINARY_POINT - F, \
556 .frac_lsb = 1ull << (DECOMPOSED_BINARY_POINT - F), \
557 .frac_lsbm1 = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1), \
558 .round_mask = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1, \
559 .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1
561 static const FloatFmt float16_params = {
562 FLOAT_PARAMS(5, 10)
565 static const FloatFmt float16_params_ahp = {
566 FLOAT_PARAMS(5, 10),
567 .arm_althp = true
570 static const FloatFmt float32_params = {
571 FLOAT_PARAMS(8, 23)
574 static const FloatFmt float64_params = {
575 FLOAT_PARAMS(11, 52)
578 /* Unpack a float to parts, but do not canonicalize. */
579 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
581 const int sign_pos = fmt.frac_size + fmt.exp_size;
583 return (FloatParts) {
584 .cls = float_class_unclassified,
585 .sign = extract64(raw, sign_pos, 1),
586 .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
587 .frac = extract64(raw, 0, fmt.frac_size),
591 static inline FloatParts float16_unpack_raw(float16 f)
593 return unpack_raw(float16_params, f);
596 static inline FloatParts float32_unpack_raw(float32 f)
598 return unpack_raw(float32_params, f);
601 static inline FloatParts float64_unpack_raw(float64 f)
603 return unpack_raw(float64_params, f);
606 /* Pack a float from parts, but do not canonicalize. */
607 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
609 const int sign_pos = fmt.frac_size + fmt.exp_size;
610 uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
611 return deposit64(ret, sign_pos, 1, p.sign);
614 static inline float16 float16_pack_raw(FloatParts p)
616 return make_float16(pack_raw(float16_params, p));
619 static inline float32 float32_pack_raw(FloatParts p)
621 return make_float32(pack_raw(float32_params, p));
624 static inline float64 float64_pack_raw(FloatParts p)
626 return make_float64(pack_raw(float64_params, p));
629 /*----------------------------------------------------------------------------
630 | Functions and definitions to determine: (1) whether tininess for underflow
631 | is detected before or after rounding by default, (2) what (if anything)
632 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
633 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
634 | are propagated from function inputs to output. These details are target-
635 | specific.
636 *----------------------------------------------------------------------------*/
637 #include "softfloat-specialize.inc.c"
639 /* Canonicalize EXP and FRAC, setting CLS. */
640 static FloatParts sf_canonicalize(FloatParts part, const FloatFmt *parm,
641 float_status *status)
643 if (part.exp == parm->exp_max && !parm->arm_althp) {
644 if (part.frac == 0) {
645 part.cls = float_class_inf;
646 } else {
647 part.frac <<= parm->frac_shift;
648 part.cls = (parts_is_snan_frac(part.frac, status)
649 ? float_class_snan : float_class_qnan);
651 } else if (part.exp == 0) {
652 if (likely(part.frac == 0)) {
653 part.cls = float_class_zero;
654 } else if (status->flush_inputs_to_zero) {
655 float_raise(float_flag_input_denormal, status);
656 part.cls = float_class_zero;
657 part.frac = 0;
658 } else {
659 int shift = clz64(part.frac) - 1;
660 part.cls = float_class_normal;
661 part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
662 part.frac <<= shift;
664 } else {
665 part.cls = float_class_normal;
666 part.exp -= parm->exp_bias;
667 part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
669 return part;
672 /* Round and uncanonicalize a floating-point number by parts. There
673 * are FRAC_SHIFT bits that may require rounding at the bottom of the
674 * fraction; these bits will be removed. The exponent will be biased
675 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
678 static FloatParts round_canonical(FloatParts p, float_status *s,
679 const FloatFmt *parm)
681 const uint64_t frac_lsb = parm->frac_lsb;
682 const uint64_t frac_lsbm1 = parm->frac_lsbm1;
683 const uint64_t round_mask = parm->round_mask;
684 const uint64_t roundeven_mask = parm->roundeven_mask;
685 const int exp_max = parm->exp_max;
686 const int frac_shift = parm->frac_shift;
687 uint64_t frac, inc;
688 int exp, flags = 0;
689 bool overflow_norm;
691 frac = p.frac;
692 exp = p.exp;
694 switch (p.cls) {
695 case float_class_normal:
696 switch (s->float_rounding_mode) {
697 case float_round_nearest_even:
698 overflow_norm = false;
699 inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
700 break;
701 case float_round_ties_away:
702 overflow_norm = false;
703 inc = frac_lsbm1;
704 break;
705 case float_round_to_zero:
706 overflow_norm = true;
707 inc = 0;
708 break;
709 case float_round_up:
710 inc = p.sign ? 0 : round_mask;
711 overflow_norm = p.sign;
712 break;
713 case float_round_down:
714 inc = p.sign ? round_mask : 0;
715 overflow_norm = !p.sign;
716 break;
717 case float_round_to_odd:
718 overflow_norm = true;
719 inc = frac & frac_lsb ? 0 : round_mask;
720 break;
721 default:
722 g_assert_not_reached();
725 exp += parm->exp_bias;
726 if (likely(exp > 0)) {
727 if (frac & round_mask) {
728 flags |= float_flag_inexact;
729 frac += inc;
730 if (frac & DECOMPOSED_OVERFLOW_BIT) {
731 frac >>= 1;
732 exp++;
735 frac >>= frac_shift;
737 if (parm->arm_althp) {
738 /* ARM Alt HP eschews Inf and NaN for a wider exponent. */
739 if (unlikely(exp > exp_max)) {
740 /* Overflow. Return the maximum normal. */
741 flags = float_flag_invalid;
742 exp = exp_max;
743 frac = -1;
745 } else if (unlikely(exp >= exp_max)) {
746 flags |= float_flag_overflow | float_flag_inexact;
747 if (overflow_norm) {
748 exp = exp_max - 1;
749 frac = -1;
750 } else {
751 p.cls = float_class_inf;
752 goto do_inf;
755 } else if (s->flush_to_zero) {
756 flags |= float_flag_output_denormal;
757 p.cls = float_class_zero;
758 goto do_zero;
759 } else {
760 bool is_tiny = (s->float_detect_tininess
761 == float_tininess_before_rounding)
762 || (exp < 0)
763 || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);
765 shift64RightJamming(frac, 1 - exp, &frac);
766 if (frac & round_mask) {
767 /* Need to recompute round-to-even. */
768 switch (s->float_rounding_mode) {
769 case float_round_nearest_even:
770 inc = ((frac & roundeven_mask) != frac_lsbm1
771 ? frac_lsbm1 : 0);
772 break;
773 case float_round_to_odd:
774 inc = frac & frac_lsb ? 0 : round_mask;
775 break;
777 flags |= float_flag_inexact;
778 frac += inc;
781 exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
782 frac >>= frac_shift;
784 if (is_tiny && (flags & float_flag_inexact)) {
785 flags |= float_flag_underflow;
787 if (exp == 0 && frac == 0) {
788 p.cls = float_class_zero;
791 break;
793 case float_class_zero:
794 do_zero:
795 exp = 0;
796 frac = 0;
797 break;
799 case float_class_inf:
800 do_inf:
801 assert(!parm->arm_althp);
802 exp = exp_max;
803 frac = 0;
804 break;
806 case float_class_qnan:
807 case float_class_snan:
808 assert(!parm->arm_althp);
809 exp = exp_max;
810 frac >>= parm->frac_shift;
811 break;
813 default:
814 g_assert_not_reached();
817 float_raise(flags, s);
818 p.exp = exp;
819 p.frac = frac;
820 return p;
823 /* Explicit FloatFmt version */
824 static FloatParts float16a_unpack_canonical(float16 f, float_status *s,
825 const FloatFmt *params)
827 return sf_canonicalize(float16_unpack_raw(f), params, s);
830 static FloatParts float16_unpack_canonical(float16 f, float_status *s)
832 return float16a_unpack_canonical(f, s, &float16_params);
835 static float16 float16a_round_pack_canonical(FloatParts p, float_status *s,
836 const FloatFmt *params)
838 return float16_pack_raw(round_canonical(p, s, params));
841 static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
843 return float16a_round_pack_canonical(p, s, &float16_params);
846 static FloatParts float32_unpack_canonical(float32 f, float_status *s)
848 return sf_canonicalize(float32_unpack_raw(f), &float32_params, s);
851 static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
853 return float32_pack_raw(round_canonical(p, s, &float32_params));
856 static FloatParts float64_unpack_canonical(float64 f, float_status *s)
858 return sf_canonicalize(float64_unpack_raw(f), &float64_params, s);
861 static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
863 return float64_pack_raw(round_canonical(p, s, &float64_params));
866 static FloatParts return_nan(FloatParts a, float_status *s)
868 switch (a.cls) {
869 case float_class_snan:
870 s->float_exception_flags |= float_flag_invalid;
871 a = parts_silence_nan(a, s);
872 /* fall through */
873 case float_class_qnan:
874 if (s->default_nan_mode) {
875 return parts_default_nan(s);
877 break;
879 default:
880 g_assert_not_reached();
882 return a;
885 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
887 if (is_snan(a.cls) || is_snan(b.cls)) {
888 s->float_exception_flags |= float_flag_invalid;
891 if (s->default_nan_mode) {
892 return parts_default_nan(s);
893 } else {
894 if (pickNaN(a.cls, b.cls,
895 a.frac > b.frac ||
896 (a.frac == b.frac && a.sign < b.sign))) {
897 a = b;
899 if (is_snan(a.cls)) {
900 return parts_silence_nan(a, s);
903 return a;
906 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
907 bool inf_zero, float_status *s)
909 int which;
911 if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
912 s->float_exception_flags |= float_flag_invalid;
915 which = pickNaNMulAdd(a.cls, b.cls, c.cls, inf_zero, s);
917 if (s->default_nan_mode) {
918 /* Note that this check is after pickNaNMulAdd so that function
919 * has an opportunity to set the Invalid flag.
921 which = 3;
924 switch (which) {
925 case 0:
926 break;
927 case 1:
928 a = b;
929 break;
930 case 2:
931 a = c;
932 break;
933 case 3:
934 return parts_default_nan(s);
935 default:
936 g_assert_not_reached();
939 if (is_snan(a.cls)) {
940 return parts_silence_nan(a, s);
942 return a;
946 * Returns the result of adding or subtracting the values of the
947 * floating-point values `a' and `b'. The operation is performed
948 * according to the IEC/IEEE Standard for Binary Floating-Point
949 * Arithmetic.
952 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
953 float_status *s)
955 bool a_sign = a.sign;
956 bool b_sign = b.sign ^ subtract;
958 if (a_sign != b_sign) {
959 /* Subtraction */
961 if (a.cls == float_class_normal && b.cls == float_class_normal) {
962 if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
963 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
964 a.frac = a.frac - b.frac;
965 } else {
966 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
967 a.frac = b.frac - a.frac;
968 a.exp = b.exp;
969 a_sign ^= 1;
972 if (a.frac == 0) {
973 a.cls = float_class_zero;
974 a.sign = s->float_rounding_mode == float_round_down;
975 } else {
976 int shift = clz64(a.frac) - 1;
977 a.frac = a.frac << shift;
978 a.exp = a.exp - shift;
979 a.sign = a_sign;
981 return a;
983 if (is_nan(a.cls) || is_nan(b.cls)) {
984 return pick_nan(a, b, s);
986 if (a.cls == float_class_inf) {
987 if (b.cls == float_class_inf) {
988 float_raise(float_flag_invalid, s);
989 return parts_default_nan(s);
991 return a;
993 if (a.cls == float_class_zero && b.cls == float_class_zero) {
994 a.sign = s->float_rounding_mode == float_round_down;
995 return a;
997 if (a.cls == float_class_zero || b.cls == float_class_inf) {
998 b.sign = a_sign ^ 1;
999 return b;
1001 if (b.cls == float_class_zero) {
1002 return a;
1004 } else {
1005 /* Addition */
1006 if (a.cls == float_class_normal && b.cls == float_class_normal) {
1007 if (a.exp > b.exp) {
1008 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
1009 } else if (a.exp < b.exp) {
1010 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
1011 a.exp = b.exp;
1013 a.frac += b.frac;
1014 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
1015 shift64RightJamming(a.frac, 1, &a.frac);
1016 a.exp += 1;
1018 return a;
1020 if (is_nan(a.cls) || is_nan(b.cls)) {
1021 return pick_nan(a, b, s);
1023 if (a.cls == float_class_inf || b.cls == float_class_zero) {
1024 return a;
1026 if (b.cls == float_class_inf || a.cls == float_class_zero) {
1027 b.sign = b_sign;
1028 return b;
1031 g_assert_not_reached();
1035 * Returns the result of adding or subtracting the floating-point
1036 * values `a' and `b'. The operation is performed according to the
1037 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1040 float16 QEMU_FLATTEN float16_add(float16 a, float16 b, float_status *status)
1042 FloatParts pa = float16_unpack_canonical(a, status);
1043 FloatParts pb = float16_unpack_canonical(b, status);
1044 FloatParts pr = addsub_floats(pa, pb, false, status);
1046 return float16_round_pack_canonical(pr, status);
1049 float16 QEMU_FLATTEN float16_sub(float16 a, float16 b, float_status *status)
1051 FloatParts pa = float16_unpack_canonical(a, status);
1052 FloatParts pb = float16_unpack_canonical(b, status);
1053 FloatParts pr = addsub_floats(pa, pb, true, status);
1055 return float16_round_pack_canonical(pr, status);
1058 static float32 QEMU_SOFTFLOAT_ATTR
1059 soft_f32_addsub(float32 a, float32 b, bool subtract, float_status *status)
1061 FloatParts pa = float32_unpack_canonical(a, status);
1062 FloatParts pb = float32_unpack_canonical(b, status);
1063 FloatParts pr = addsub_floats(pa, pb, subtract, status);
1065 return float32_round_pack_canonical(pr, status);
1068 static inline float32 soft_f32_add(float32 a, float32 b, float_status *status)
1070 return soft_f32_addsub(a, b, false, status);
1073 static inline float32 soft_f32_sub(float32 a, float32 b, float_status *status)
1075 return soft_f32_addsub(a, b, true, status);
1078 static float64 QEMU_SOFTFLOAT_ATTR
1079 soft_f64_addsub(float64 a, float64 b, bool subtract, float_status *status)
1081 FloatParts pa = float64_unpack_canonical(a, status);
1082 FloatParts pb = float64_unpack_canonical(b, status);
1083 FloatParts pr = addsub_floats(pa, pb, subtract, status);
1085 return float64_round_pack_canonical(pr, status);
1088 static inline float64 soft_f64_add(float64 a, float64 b, float_status *status)
1090 return soft_f64_addsub(a, b, false, status);
1093 static inline float64 soft_f64_sub(float64 a, float64 b, float_status *status)
1095 return soft_f64_addsub(a, b, true, status);
1098 static float hard_f32_add(float a, float b)
1100 return a + b;
1103 static float hard_f32_sub(float a, float b)
1105 return a - b;
1108 static double hard_f64_add(double a, double b)
1110 return a + b;
1113 static double hard_f64_sub(double a, double b)
1115 return a - b;
1118 static bool f32_addsub_post(union_float32 a, union_float32 b)
1120 if (QEMU_HARDFLOAT_2F32_USE_FP) {
1121 return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1123 return !(float32_is_zero(a.s) && float32_is_zero(b.s));
1126 static bool f64_addsub_post(union_float64 a, union_float64 b)
1128 if (QEMU_HARDFLOAT_2F64_USE_FP) {
1129 return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1130 } else {
1131 return !(float64_is_zero(a.s) && float64_is_zero(b.s));
1135 static float32 float32_addsub(float32 a, float32 b, float_status *s,
1136 hard_f32_op2_fn hard, soft_f32_op2_fn soft)
1138 return float32_gen2(a, b, s, hard, soft,
1139 f32_is_zon2, f32_addsub_post, NULL, NULL);
1142 static float64 float64_addsub(float64 a, float64 b, float_status *s,
1143 hard_f64_op2_fn hard, soft_f64_op2_fn soft)
1145 return float64_gen2(a, b, s, hard, soft,
1146 f64_is_zon2, f64_addsub_post, NULL, NULL);
1149 float32 QEMU_FLATTEN
1150 float32_add(float32 a, float32 b, float_status *s)
1152 return float32_addsub(a, b, s, hard_f32_add, soft_f32_add);
1155 float32 QEMU_FLATTEN
1156 float32_sub(float32 a, float32 b, float_status *s)
1158 return float32_addsub(a, b, s, hard_f32_sub, soft_f32_sub);
1161 float64 QEMU_FLATTEN
1162 float64_add(float64 a, float64 b, float_status *s)
1164 return float64_addsub(a, b, s, hard_f64_add, soft_f64_add);
1167 float64 QEMU_FLATTEN
1168 float64_sub(float64 a, float64 b, float_status *s)
1170 return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub);
1174 * Returns the result of multiplying the floating-point values `a' and
1175 * `b'. The operation is performed according to the IEC/IEEE Standard
1176 * for Binary Floating-Point Arithmetic.
1179 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
1181 bool sign = a.sign ^ b.sign;
1183 if (a.cls == float_class_normal && b.cls == float_class_normal) {
1184 uint64_t hi, lo;
1185 int exp = a.exp + b.exp;
1187 mul64To128(a.frac, b.frac, &hi, &lo);
1188 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1189 if (lo & DECOMPOSED_OVERFLOW_BIT) {
1190 shift64RightJamming(lo, 1, &lo);
1191 exp += 1;
1194 /* Re-use a */
1195 a.exp = exp;
1196 a.sign = sign;
1197 a.frac = lo;
1198 return a;
1200 /* handle all the NaN cases */
1201 if (is_nan(a.cls) || is_nan(b.cls)) {
1202 return pick_nan(a, b, s);
1204 /* Inf * Zero == NaN */
1205 if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
1206 (a.cls == float_class_zero && b.cls == float_class_inf)) {
1207 s->float_exception_flags |= float_flag_invalid;
1208 return parts_default_nan(s);
1210 /* Multiply by 0 or Inf */
1211 if (a.cls == float_class_inf || a.cls == float_class_zero) {
1212 a.sign = sign;
1213 return a;
1215 if (b.cls == float_class_inf || b.cls == float_class_zero) {
1216 b.sign = sign;
1217 return b;
1219 g_assert_not_reached();
1222 float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status)
1224 FloatParts pa = float16_unpack_canonical(a, status);
1225 FloatParts pb = float16_unpack_canonical(b, status);
1226 FloatParts pr = mul_floats(pa, pb, status);
1228 return float16_round_pack_canonical(pr, status);
1231 static float32 QEMU_SOFTFLOAT_ATTR
1232 soft_f32_mul(float32 a, float32 b, float_status *status)
1234 FloatParts pa = float32_unpack_canonical(a, status);
1235 FloatParts pb = float32_unpack_canonical(b, status);
1236 FloatParts pr = mul_floats(pa, pb, status);
1238 return float32_round_pack_canonical(pr, status);
1241 static float64 QEMU_SOFTFLOAT_ATTR
1242 soft_f64_mul(float64 a, float64 b, float_status *status)
1244 FloatParts pa = float64_unpack_canonical(a, status);
1245 FloatParts pb = float64_unpack_canonical(b, status);
1246 FloatParts pr = mul_floats(pa, pb, status);
1248 return float64_round_pack_canonical(pr, status);
1251 static float hard_f32_mul(float a, float b)
1253 return a * b;
1256 static double hard_f64_mul(double a, double b)
1258 return a * b;
1261 static bool f32_mul_fast_test(union_float32 a, union_float32 b)
1263 return float32_is_zero(a.s) || float32_is_zero(b.s);
1266 static bool f64_mul_fast_test(union_float64 a, union_float64 b)
1268 return float64_is_zero(a.s) || float64_is_zero(b.s);
1271 static float32 f32_mul_fast_op(float32 a, float32 b, float_status *s)
1273 bool signbit = float32_is_neg(a) ^ float32_is_neg(b);
1275 return float32_set_sign(float32_zero, signbit);
1278 static float64 f64_mul_fast_op(float64 a, float64 b, float_status *s)
1280 bool signbit = float64_is_neg(a) ^ float64_is_neg(b);
1282 return float64_set_sign(float64_zero, signbit);
1285 float32 QEMU_FLATTEN
1286 float32_mul(float32 a, float32 b, float_status *s)
1288 return float32_gen2(a, b, s, hard_f32_mul, soft_f32_mul,
1289 f32_is_zon2, NULL, f32_mul_fast_test, f32_mul_fast_op);
1292 float64 QEMU_FLATTEN
1293 float64_mul(float64 a, float64 b, float_status *s)
1295 return float64_gen2(a, b, s, hard_f64_mul, soft_f64_mul,
1296 f64_is_zon2, NULL, f64_mul_fast_test, f64_mul_fast_op);
1300 * Returns the result of multiplying the floating-point values `a' and
1301 * `b' then adding 'c', with no intermediate rounding step after the
1302 * multiplication. The operation is performed according to the
1303 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
1304 * The flags argument allows the caller to select negation of the
1305 * addend, the intermediate product, or the final result. (The
1306 * difference between this and having the caller do a separate
1307 * negation is that negating externally will flip the sign bit on
1308 * NaNs.)
1311 static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
1312 int flags, float_status *s)
1314 bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
1315 ((1 << float_class_inf) | (1 << float_class_zero));
1316 bool p_sign;
1317 bool sign_flip = flags & float_muladd_negate_result;
1318 FloatClass p_class;
1319 uint64_t hi, lo;
1320 int p_exp;
1322 /* It is implementation-defined whether the cases of (0,inf,qnan)
1323 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
1324 * they return if they do), so we have to hand this information
1325 * off to the target-specific pick-a-NaN routine.
1327 if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
1328 return pick_nan_muladd(a, b, c, inf_zero, s);
1331 if (inf_zero) {
1332 s->float_exception_flags |= float_flag_invalid;
1333 return parts_default_nan(s);
1336 if (flags & float_muladd_negate_c) {
1337 c.sign ^= 1;
1340 p_sign = a.sign ^ b.sign;
1342 if (flags & float_muladd_negate_product) {
1343 p_sign ^= 1;
1346 if (a.cls == float_class_inf || b.cls == float_class_inf) {
1347 p_class = float_class_inf;
1348 } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
1349 p_class = float_class_zero;
1350 } else {
1351 p_class = float_class_normal;
1354 if (c.cls == float_class_inf) {
1355 if (p_class == float_class_inf && p_sign != c.sign) {
1356 s->float_exception_flags |= float_flag_invalid;
1357 return parts_default_nan(s);
1358 } else {
1359 a.cls = float_class_inf;
1360 a.sign = c.sign ^ sign_flip;
1361 return a;
1365 if (p_class == float_class_inf) {
1366 a.cls = float_class_inf;
1367 a.sign = p_sign ^ sign_flip;
1368 return a;
1371 if (p_class == float_class_zero) {
1372 if (c.cls == float_class_zero) {
1373 if (p_sign != c.sign) {
1374 p_sign = s->float_rounding_mode == float_round_down;
1376 c.sign = p_sign;
1377 } else if (flags & float_muladd_halve_result) {
1378 c.exp -= 1;
1380 c.sign ^= sign_flip;
1381 return c;
1384 /* a & b should be normals now... */
1385 assert(a.cls == float_class_normal &&
1386 b.cls == float_class_normal);
1388 p_exp = a.exp + b.exp;
1390 /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
1391 * result.
1393 mul64To128(a.frac, b.frac, &hi, &lo);
1394 /* binary point now at bit 124 */
1396 /* check for overflow */
1397 if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
1398 shift128RightJamming(hi, lo, 1, &hi, &lo);
1399 p_exp += 1;
1402 /* + add/sub */
1403 if (c.cls == float_class_zero) {
1404 /* move binary point back to 62 */
1405 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1406 } else {
1407 int exp_diff = p_exp - c.exp;
1408 if (p_sign == c.sign) {
1409 /* Addition */
1410 if (exp_diff <= 0) {
1411 shift128RightJamming(hi, lo,
1412 DECOMPOSED_BINARY_POINT - exp_diff,
1413 &hi, &lo);
1414 lo += c.frac;
1415 p_exp = c.exp;
1416 } else {
1417 uint64_t c_hi, c_lo;
1418 /* shift c to the same binary point as the product (124) */
1419 c_hi = c.frac >> 2;
1420 c_lo = 0;
1421 shift128RightJamming(c_hi, c_lo,
1422 exp_diff,
1423 &c_hi, &c_lo);
1424 add128(hi, lo, c_hi, c_lo, &hi, &lo);
1425 /* move binary point back to 62 */
1426 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1429 if (lo & DECOMPOSED_OVERFLOW_BIT) {
1430 shift64RightJamming(lo, 1, &lo);
1431 p_exp += 1;
1434 } else {
1435 /* Subtraction */
1436 uint64_t c_hi, c_lo;
1437 /* make C binary point match product at bit 124 */
1438 c_hi = c.frac >> 2;
1439 c_lo = 0;
1441 if (exp_diff <= 0) {
1442 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
1443 if (exp_diff == 0
1445 (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
1446 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1447 } else {
1448 sub128(c_hi, c_lo, hi, lo, &hi, &lo);
1449 p_sign ^= 1;
1450 p_exp = c.exp;
1452 } else {
1453 shift128RightJamming(c_hi, c_lo,
1454 exp_diff,
1455 &c_hi, &c_lo);
1456 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1459 if (hi == 0 && lo == 0) {
1460 a.cls = float_class_zero;
1461 a.sign = s->float_rounding_mode == float_round_down;
1462 a.sign ^= sign_flip;
1463 return a;
1464 } else {
1465 int shift;
1466 if (hi != 0) {
1467 shift = clz64(hi);
1468 } else {
1469 shift = clz64(lo) + 64;
1471 /* Normalizing to a binary point of 124 is the
1472 correct adjust for the exponent. However since we're
1473 shifting, we might as well put the binary point back
1474 at 62 where we really want it. Therefore shift as
1475 if we're leaving 1 bit at the top of the word, but
1476 adjust the exponent as if we're leaving 3 bits. */
1477 shift -= 1;
1478 if (shift >= 64) {
1479 lo = lo << (shift - 64);
1480 } else {
1481 hi = (hi << shift) | (lo >> (64 - shift));
1482 lo = hi | ((lo << shift) != 0);
1484 p_exp -= shift - 2;
1489 if (flags & float_muladd_halve_result) {
1490 p_exp -= 1;
1493 /* finally prepare our result */
1494 a.cls = float_class_normal;
1495 a.sign = p_sign ^ sign_flip;
1496 a.exp = p_exp;
1497 a.frac = lo;
1499 return a;
1502 float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c,
1503 int flags, float_status *status)
1505 FloatParts pa = float16_unpack_canonical(a, status);
1506 FloatParts pb = float16_unpack_canonical(b, status);
1507 FloatParts pc = float16_unpack_canonical(c, status);
1508 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1510 return float16_round_pack_canonical(pr, status);
1513 static float32 QEMU_SOFTFLOAT_ATTR
1514 soft_f32_muladd(float32 a, float32 b, float32 c, int flags,
1515 float_status *status)
1517 FloatParts pa = float32_unpack_canonical(a, status);
1518 FloatParts pb = float32_unpack_canonical(b, status);
1519 FloatParts pc = float32_unpack_canonical(c, status);
1520 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1522 return float32_round_pack_canonical(pr, status);
1525 static float64 QEMU_SOFTFLOAT_ATTR
1526 soft_f64_muladd(float64 a, float64 b, float64 c, int flags,
1527 float_status *status)
1529 FloatParts pa = float64_unpack_canonical(a, status);
1530 FloatParts pb = float64_unpack_canonical(b, status);
1531 FloatParts pc = float64_unpack_canonical(c, status);
1532 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1534 return float64_round_pack_canonical(pr, status);
1537 static bool force_soft_fma;
1539 float32 QEMU_FLATTEN
1540 float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s)
1542 union_float32 ua, ub, uc, ur;
1544 ua.s = xa;
1545 ub.s = xb;
1546 uc.s = xc;
1548 if (unlikely(!can_use_fpu(s))) {
1549 goto soft;
1551 if (unlikely(flags & float_muladd_halve_result)) {
1552 goto soft;
1555 float32_input_flush3(&ua.s, &ub.s, &uc.s, s);
1556 if (unlikely(!f32_is_zon3(ua, ub, uc))) {
1557 goto soft;
1560 if (unlikely(force_soft_fma)) {
1561 goto soft;
1565 * When (a || b) == 0, there's no need to check for under/over flow,
1566 * since we know the addend is (normal || 0) and the product is 0.
1568 if (float32_is_zero(ua.s) || float32_is_zero(ub.s)) {
1569 union_float32 up;
1570 bool prod_sign;
1572 prod_sign = float32_is_neg(ua.s) ^ float32_is_neg(ub.s);
1573 prod_sign ^= !!(flags & float_muladd_negate_product);
1574 up.s = float32_set_sign(float32_zero, prod_sign);
1576 if (flags & float_muladd_negate_c) {
1577 uc.h = -uc.h;
1579 ur.h = up.h + uc.h;
1580 } else {
1581 union_float32 ua_orig = ua;
1582 union_float32 uc_orig = uc;
1584 if (flags & float_muladd_negate_product) {
1585 ua.h = -ua.h;
1587 if (flags & float_muladd_negate_c) {
1588 uc.h = -uc.h;
1591 ur.h = fmaf(ua.h, ub.h, uc.h);
1593 if (unlikely(f32_is_inf(ur))) {
1594 s->float_exception_flags |= float_flag_overflow;
1595 } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
1596 ua = ua_orig;
1597 uc = uc_orig;
1598 goto soft;
1601 if (flags & float_muladd_negate_result) {
1602 return float32_chs(ur.s);
1604 return ur.s;
1606 soft:
1607 return soft_f32_muladd(ua.s, ub.s, uc.s, flags, s);
1610 float64 QEMU_FLATTEN
1611 float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s)
1613 union_float64 ua, ub, uc, ur;
1615 ua.s = xa;
1616 ub.s = xb;
1617 uc.s = xc;
1619 if (unlikely(!can_use_fpu(s))) {
1620 goto soft;
1622 if (unlikely(flags & float_muladd_halve_result)) {
1623 goto soft;
1626 float64_input_flush3(&ua.s, &ub.s, &uc.s, s);
1627 if (unlikely(!f64_is_zon3(ua, ub, uc))) {
1628 goto soft;
1631 if (unlikely(force_soft_fma)) {
1632 goto soft;
1636 * When (a || b) == 0, there's no need to check for under/over flow,
1637 * since we know the addend is (normal || 0) and the product is 0.
1639 if (float64_is_zero(ua.s) || float64_is_zero(ub.s)) {
1640 union_float64 up;
1641 bool prod_sign;
1643 prod_sign = float64_is_neg(ua.s) ^ float64_is_neg(ub.s);
1644 prod_sign ^= !!(flags & float_muladd_negate_product);
1645 up.s = float64_set_sign(float64_zero, prod_sign);
1647 if (flags & float_muladd_negate_c) {
1648 uc.h = -uc.h;
1650 ur.h = up.h + uc.h;
1651 } else {
1652 union_float64 ua_orig = ua;
1653 union_float64 uc_orig = uc;
1655 if (flags & float_muladd_negate_product) {
1656 ua.h = -ua.h;
1658 if (flags & float_muladd_negate_c) {
1659 uc.h = -uc.h;
1662 ur.h = fma(ua.h, ub.h, uc.h);
1664 if (unlikely(f64_is_inf(ur))) {
1665 s->float_exception_flags |= float_flag_overflow;
1666 } else if (unlikely(fabs(ur.h) <= FLT_MIN)) {
1667 ua = ua_orig;
1668 uc = uc_orig;
1669 goto soft;
1672 if (flags & float_muladd_negate_result) {
1673 return float64_chs(ur.s);
1675 return ur.s;
1677 soft:
1678 return soft_f64_muladd(ua.s, ub.s, uc.s, flags, s);
1682 * Returns the result of dividing the floating-point value `a' by the
1683 * corresponding value `b'. The operation is performed according to
1684 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1687 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
1689 bool sign = a.sign ^ b.sign;
1691 if (a.cls == float_class_normal && b.cls == float_class_normal) {
1692 uint64_t n0, n1, q, r;
1693 int exp = a.exp - b.exp;
1696 * We want a 2*N / N-bit division to produce exactly an N-bit
1697 * result, so that we do not lose any precision and so that we
1698 * do not have to renormalize afterward. If A.frac < B.frac,
1699 * then division would produce an (N-1)-bit result; shift A left
1700 * by one to produce the an N-bit result, and decrement the
1701 * exponent to match.
1703 * The udiv_qrnnd algorithm that we're using requires normalization,
1704 * i.e. the msb of the denominator must be set. Since we know that
1705 * DECOMPOSED_BINARY_POINT is msb-1, the inputs must be shifted left
1706 * by one (more), and the remainder must be shifted right by one.
1708 if (a.frac < b.frac) {
1709 exp -= 1;
1710 shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 2, &n1, &n0);
1711 } else {
1712 shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, &n1, &n0);
1714 q = udiv_qrnnd(&r, n1, n0, b.frac << 1);
1717 * Set lsb if there is a remainder, to set inexact.
1718 * As mentioned above, to find the actual value of the remainder we
1719 * would need to shift right, but (1) we are only concerned about
1720 * non-zero-ness, and (2) the remainder will always be even because
1721 * both inputs to the division primitive are even.
1723 a.frac = q | (r != 0);
1724 a.sign = sign;
1725 a.exp = exp;
1726 return a;
1728 /* handle all the NaN cases */
1729 if (is_nan(a.cls) || is_nan(b.cls)) {
1730 return pick_nan(a, b, s);
1732 /* 0/0 or Inf/Inf */
1733 if (a.cls == b.cls
1735 (a.cls == float_class_inf || a.cls == float_class_zero)) {
1736 s->float_exception_flags |= float_flag_invalid;
1737 return parts_default_nan(s);
1739 /* Inf / x or 0 / x */
1740 if (a.cls == float_class_inf || a.cls == float_class_zero) {
1741 a.sign = sign;
1742 return a;
1744 /* Div 0 => Inf */
1745 if (b.cls == float_class_zero) {
1746 s->float_exception_flags |= float_flag_divbyzero;
1747 a.cls = float_class_inf;
1748 a.sign = sign;
1749 return a;
1751 /* Div by Inf */
1752 if (b.cls == float_class_inf) {
1753 a.cls = float_class_zero;
1754 a.sign = sign;
1755 return a;
1757 g_assert_not_reached();
1760 float16 float16_div(float16 a, float16 b, float_status *status)
1762 FloatParts pa = float16_unpack_canonical(a, status);
1763 FloatParts pb = float16_unpack_canonical(b, status);
1764 FloatParts pr = div_floats(pa, pb, status);
1766 return float16_round_pack_canonical(pr, status);
1769 static float32 QEMU_SOFTFLOAT_ATTR
1770 soft_f32_div(float32 a, float32 b, float_status *status)
1772 FloatParts pa = float32_unpack_canonical(a, status);
1773 FloatParts pb = float32_unpack_canonical(b, status);
1774 FloatParts pr = div_floats(pa, pb, status);
1776 return float32_round_pack_canonical(pr, status);
1779 static float64 QEMU_SOFTFLOAT_ATTR
1780 soft_f64_div(float64 a, float64 b, float_status *status)
1782 FloatParts pa = float64_unpack_canonical(a, status);
1783 FloatParts pb = float64_unpack_canonical(b, status);
1784 FloatParts pr = div_floats(pa, pb, status);
1786 return float64_round_pack_canonical(pr, status);
1789 static float hard_f32_div(float a, float b)
1791 return a / b;
1794 static double hard_f64_div(double a, double b)
1796 return a / b;
1799 static bool f32_div_pre(union_float32 a, union_float32 b)
1801 if (QEMU_HARDFLOAT_2F32_USE_FP) {
1802 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
1803 fpclassify(b.h) == FP_NORMAL;
1805 return float32_is_zero_or_normal(a.s) && float32_is_normal(b.s);
1808 static bool f64_div_pre(union_float64 a, union_float64 b)
1810 if (QEMU_HARDFLOAT_2F64_USE_FP) {
1811 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
1812 fpclassify(b.h) == FP_NORMAL;
1814 return float64_is_zero_or_normal(a.s) && float64_is_normal(b.s);
1817 static bool f32_div_post(union_float32 a, union_float32 b)
1819 if (QEMU_HARDFLOAT_2F32_USE_FP) {
1820 return fpclassify(a.h) != FP_ZERO;
1822 return !float32_is_zero(a.s);
1825 static bool f64_div_post(union_float64 a, union_float64 b)
1827 if (QEMU_HARDFLOAT_2F64_USE_FP) {
1828 return fpclassify(a.h) != FP_ZERO;
1830 return !float64_is_zero(a.s);
1833 float32 QEMU_FLATTEN
1834 float32_div(float32 a, float32 b, float_status *s)
1836 return float32_gen2(a, b, s, hard_f32_div, soft_f32_div,
1837 f32_div_pre, f32_div_post, NULL, NULL);
1840 float64 QEMU_FLATTEN
1841 float64_div(float64 a, float64 b, float_status *s)
1843 return float64_gen2(a, b, s, hard_f64_div, soft_f64_div,
1844 f64_div_pre, f64_div_post, NULL, NULL);
1848 * Float to Float conversions
1850 * Returns the result of converting one float format to another. The
1851 * conversion is performed according to the IEC/IEEE Standard for
1852 * Binary Floating-Point Arithmetic.
1854 * The float_to_float helper only needs to take care of raising
1855 * invalid exceptions and handling the conversion on NaNs.
1858 static FloatParts float_to_float(FloatParts a, const FloatFmt *dstf,
1859 float_status *s)
1861 if (dstf->arm_althp) {
1862 switch (a.cls) {
1863 case float_class_qnan:
1864 case float_class_snan:
1865 /* There is no NaN in the destination format. Raise Invalid
1866 * and return a zero with the sign of the input NaN.
1868 s->float_exception_flags |= float_flag_invalid;
1869 a.cls = float_class_zero;
1870 a.frac = 0;
1871 a.exp = 0;
1872 break;
1874 case float_class_inf:
1875 /* There is no Inf in the destination format. Raise Invalid
1876 * and return the maximum normal with the correct sign.
1878 s->float_exception_flags |= float_flag_invalid;
1879 a.cls = float_class_normal;
1880 a.exp = dstf->exp_max;
1881 a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift;
1882 break;
1884 default:
1885 break;
1887 } else if (is_nan(a.cls)) {
1888 if (is_snan(a.cls)) {
1889 s->float_exception_flags |= float_flag_invalid;
1890 a = parts_silence_nan(a, s);
1892 if (s->default_nan_mode) {
1893 return parts_default_nan(s);
1896 return a;
1899 float32 float16_to_float32(float16 a, bool ieee, float_status *s)
1901 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1902 FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1903 FloatParts pr = float_to_float(p, &float32_params, s);
1904 return float32_round_pack_canonical(pr, s);
1907 float64 float16_to_float64(float16 a, bool ieee, float_status *s)
1909 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1910 FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1911 FloatParts pr = float_to_float(p, &float64_params, s);
1912 return float64_round_pack_canonical(pr, s);
1915 float16 float32_to_float16(float32 a, bool ieee, float_status *s)
1917 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1918 FloatParts p = float32_unpack_canonical(a, s);
1919 FloatParts pr = float_to_float(p, fmt16, s);
1920 return float16a_round_pack_canonical(pr, s, fmt16);
1923 float64 float32_to_float64(float32 a, float_status *s)
1925 FloatParts p = float32_unpack_canonical(a, s);
1926 FloatParts pr = float_to_float(p, &float64_params, s);
1927 return float64_round_pack_canonical(pr, s);
1930 float16 float64_to_float16(float64 a, bool ieee, float_status *s)
1932 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1933 FloatParts p = float64_unpack_canonical(a, s);
1934 FloatParts pr = float_to_float(p, fmt16, s);
1935 return float16a_round_pack_canonical(pr, s, fmt16);
1938 float32 float64_to_float32(float64 a, float_status *s)
1940 FloatParts p = float64_unpack_canonical(a, s);
1941 FloatParts pr = float_to_float(p, &float32_params, s);
1942 return float32_round_pack_canonical(pr, s);
1946 * Rounds the floating-point value `a' to an integer, and returns the
1947 * result as a floating-point value. The operation is performed
1948 * according to the IEC/IEEE Standard for Binary Floating-Point
1949 * Arithmetic.
1952 static FloatParts round_to_int(FloatParts a, int rmode,
1953 int scale, float_status *s)
1955 switch (a.cls) {
1956 case float_class_qnan:
1957 case float_class_snan:
1958 return return_nan(a, s);
1960 case float_class_zero:
1961 case float_class_inf:
1962 /* already "integral" */
1963 break;
1965 case float_class_normal:
1966 scale = MIN(MAX(scale, -0x10000), 0x10000);
1967 a.exp += scale;
1969 if (a.exp >= DECOMPOSED_BINARY_POINT) {
1970 /* already integral */
1971 break;
1973 if (a.exp < 0) {
1974 bool one;
1975 /* all fractional */
1976 s->float_exception_flags |= float_flag_inexact;
1977 switch (rmode) {
1978 case float_round_nearest_even:
1979 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
1980 break;
1981 case float_round_ties_away:
1982 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
1983 break;
1984 case float_round_to_zero:
1985 one = false;
1986 break;
1987 case float_round_up:
1988 one = !a.sign;
1989 break;
1990 case float_round_down:
1991 one = a.sign;
1992 break;
1993 case float_round_to_odd:
1994 one = true;
1995 break;
1996 default:
1997 g_assert_not_reached();
2000 if (one) {
2001 a.frac = DECOMPOSED_IMPLICIT_BIT;
2002 a.exp = 0;
2003 } else {
2004 a.cls = float_class_zero;
2006 } else {
2007 uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
2008 uint64_t frac_lsbm1 = frac_lsb >> 1;
2009 uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
2010 uint64_t rnd_mask = rnd_even_mask >> 1;
2011 uint64_t inc;
2013 switch (rmode) {
2014 case float_round_nearest_even:
2015 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
2016 break;
2017 case float_round_ties_away:
2018 inc = frac_lsbm1;
2019 break;
2020 case float_round_to_zero:
2021 inc = 0;
2022 break;
2023 case float_round_up:
2024 inc = a.sign ? 0 : rnd_mask;
2025 break;
2026 case float_round_down:
2027 inc = a.sign ? rnd_mask : 0;
2028 break;
2029 case float_round_to_odd:
2030 inc = a.frac & frac_lsb ? 0 : rnd_mask;
2031 break;
2032 default:
2033 g_assert_not_reached();
2036 if (a.frac & rnd_mask) {
2037 s->float_exception_flags |= float_flag_inexact;
2038 a.frac += inc;
2039 a.frac &= ~rnd_mask;
2040 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
2041 a.frac >>= 1;
2042 a.exp++;
2046 break;
2047 default:
2048 g_assert_not_reached();
2050 return a;
2053 float16 float16_round_to_int(float16 a, float_status *s)
2055 FloatParts pa = float16_unpack_canonical(a, s);
2056 FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2057 return float16_round_pack_canonical(pr, s);
2060 float32 float32_round_to_int(float32 a, float_status *s)
2062 FloatParts pa = float32_unpack_canonical(a, s);
2063 FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2064 return float32_round_pack_canonical(pr, s);
2067 float64 float64_round_to_int(float64 a, float_status *s)
2069 FloatParts pa = float64_unpack_canonical(a, s);
2070 FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2071 return float64_round_pack_canonical(pr, s);
2075 * Returns the result of converting the floating-point value `a' to
2076 * the two's complement integer format. The conversion is performed
2077 * according to the IEC/IEEE Standard for Binary Floating-Point
2078 * Arithmetic---which means in particular that the conversion is
2079 * rounded according to the current rounding mode. If `a' is a NaN,
2080 * the largest positive integer is returned. Otherwise, if the
2081 * conversion overflows, the largest integer with the same sign as `a'
2082 * is returned.
2085 static int64_t round_to_int_and_pack(FloatParts in, int rmode, int scale,
2086 int64_t min, int64_t max,
2087 float_status *s)
2089 uint64_t r;
2090 int orig_flags = get_float_exception_flags(s);
2091 FloatParts p = round_to_int(in, rmode, scale, s);
2093 switch (p.cls) {
2094 case float_class_snan:
2095 case float_class_qnan:
2096 s->float_exception_flags = orig_flags | float_flag_invalid;
2097 return max;
2098 case float_class_inf:
2099 s->float_exception_flags = orig_flags | float_flag_invalid;
2100 return p.sign ? min : max;
2101 case float_class_zero:
2102 return 0;
2103 case float_class_normal:
2104 if (p.exp < DECOMPOSED_BINARY_POINT) {
2105 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
2106 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
2107 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
2108 } else {
2109 r = UINT64_MAX;
2111 if (p.sign) {
2112 if (r <= -(uint64_t) min) {
2113 return -r;
2114 } else {
2115 s->float_exception_flags = orig_flags | float_flag_invalid;
2116 return min;
2118 } else {
2119 if (r <= max) {
2120 return r;
2121 } else {
2122 s->float_exception_flags = orig_flags | float_flag_invalid;
2123 return max;
2126 default:
2127 g_assert_not_reached();
2131 int16_t float16_to_int16_scalbn(float16 a, int rmode, int scale,
2132 float_status *s)
2134 return round_to_int_and_pack(float16_unpack_canonical(a, s),
2135 rmode, scale, INT16_MIN, INT16_MAX, s);
2138 int32_t float16_to_int32_scalbn(float16 a, int rmode, int scale,
2139 float_status *s)
2141 return round_to_int_and_pack(float16_unpack_canonical(a, s),
2142 rmode, scale, INT32_MIN, INT32_MAX, s);
2145 int64_t float16_to_int64_scalbn(float16 a, int rmode, int scale,
2146 float_status *s)
2148 return round_to_int_and_pack(float16_unpack_canonical(a, s),
2149 rmode, scale, INT64_MIN, INT64_MAX, s);
2152 int16_t float32_to_int16_scalbn(float32 a, int rmode, int scale,
2153 float_status *s)
2155 return round_to_int_and_pack(float32_unpack_canonical(a, s),
2156 rmode, scale, INT16_MIN, INT16_MAX, s);
2159 int32_t float32_to_int32_scalbn(float32 a, int rmode, int scale,
2160 float_status *s)
2162 return round_to_int_and_pack(float32_unpack_canonical(a, s),
2163 rmode, scale, INT32_MIN, INT32_MAX, s);
2166 int64_t float32_to_int64_scalbn(float32 a, int rmode, int scale,
2167 float_status *s)
2169 return round_to_int_and_pack(float32_unpack_canonical(a, s),
2170 rmode, scale, INT64_MIN, INT64_MAX, s);
2173 int16_t float64_to_int16_scalbn(float64 a, int rmode, int scale,
2174 float_status *s)
2176 return round_to_int_and_pack(float64_unpack_canonical(a, s),
2177 rmode, scale, INT16_MIN, INT16_MAX, s);
2180 int32_t float64_to_int32_scalbn(float64 a, int rmode, int scale,
2181 float_status *s)
2183 return round_to_int_and_pack(float64_unpack_canonical(a, s),
2184 rmode, scale, INT32_MIN, INT32_MAX, s);
2187 int64_t float64_to_int64_scalbn(float64 a, int rmode, int scale,
2188 float_status *s)
2190 return round_to_int_and_pack(float64_unpack_canonical(a, s),
2191 rmode, scale, INT64_MIN, INT64_MAX, s);
2194 int16_t float16_to_int16(float16 a, float_status *s)
2196 return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2199 int32_t float16_to_int32(float16 a, float_status *s)
2201 return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2204 int64_t float16_to_int64(float16 a, float_status *s)
2206 return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2209 int16_t float32_to_int16(float32 a, float_status *s)
2211 return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2214 int32_t float32_to_int32(float32 a, float_status *s)
2216 return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2219 int64_t float32_to_int64(float32 a, float_status *s)
2221 return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2224 int16_t float64_to_int16(float64 a, float_status *s)
2226 return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2229 int32_t float64_to_int32(float64 a, float_status *s)
2231 return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2234 int64_t float64_to_int64(float64 a, float_status *s)
2236 return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2239 int16_t float16_to_int16_round_to_zero(float16 a, float_status *s)
2241 return float16_to_int16_scalbn(a, float_round_to_zero, 0, s);
2244 int32_t float16_to_int32_round_to_zero(float16 a, float_status *s)
2246 return float16_to_int32_scalbn(a, float_round_to_zero, 0, s);
2249 int64_t float16_to_int64_round_to_zero(float16 a, float_status *s)
2251 return float16_to_int64_scalbn(a, float_round_to_zero, 0, s);
2254 int16_t float32_to_int16_round_to_zero(float32 a, float_status *s)
2256 return float32_to_int16_scalbn(a, float_round_to_zero, 0, s);
2259 int32_t float32_to_int32_round_to_zero(float32 a, float_status *s)
2261 return float32_to_int32_scalbn(a, float_round_to_zero, 0, s);
2264 int64_t float32_to_int64_round_to_zero(float32 a, float_status *s)
2266 return float32_to_int64_scalbn(a, float_round_to_zero, 0, s);
2269 int16_t float64_to_int16_round_to_zero(float64 a, float_status *s)
2271 return float64_to_int16_scalbn(a, float_round_to_zero, 0, s);
2274 int32_t float64_to_int32_round_to_zero(float64 a, float_status *s)
2276 return float64_to_int32_scalbn(a, float_round_to_zero, 0, s);
2279 int64_t float64_to_int64_round_to_zero(float64 a, float_status *s)
2281 return float64_to_int64_scalbn(a, float_round_to_zero, 0, s);
2285 * Returns the result of converting the floating-point value `a' to
2286 * the unsigned integer format. The conversion is performed according
2287 * to the IEC/IEEE Standard for Binary Floating-Point
2288 * Arithmetic---which means in particular that the conversion is
2289 * rounded according to the current rounding mode. If `a' is a NaN,
2290 * the largest unsigned integer is returned. Otherwise, if the
2291 * conversion overflows, the largest unsigned integer is returned. If
2292 * the 'a' is negative, the result is rounded and zero is returned;
2293 * values that do not round to zero will raise the inexact exception
2294 * flag.
2297 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, int scale,
2298 uint64_t max, float_status *s)
2300 int orig_flags = get_float_exception_flags(s);
2301 FloatParts p = round_to_int(in, rmode, scale, s);
2302 uint64_t r;
2304 switch (p.cls) {
2305 case float_class_snan:
2306 case float_class_qnan:
2307 s->float_exception_flags = orig_flags | float_flag_invalid;
2308 return max;
2309 case float_class_inf:
2310 s->float_exception_flags = orig_flags | float_flag_invalid;
2311 return p.sign ? 0 : max;
2312 case float_class_zero:
2313 return 0;
2314 case float_class_normal:
2315 if (p.sign) {
2316 s->float_exception_flags = orig_flags | float_flag_invalid;
2317 return 0;
2320 if (p.exp < DECOMPOSED_BINARY_POINT) {
2321 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
2322 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
2323 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
2324 } else {
2325 s->float_exception_flags = orig_flags | float_flag_invalid;
2326 return max;
2329 /* For uint64 this will never trip, but if p.exp is too large
2330 * to shift a decomposed fraction we shall have exited via the
2331 * 3rd leg above.
2333 if (r > max) {
2334 s->float_exception_flags = orig_flags | float_flag_invalid;
2335 return max;
2337 return r;
2338 default:
2339 g_assert_not_reached();
2343 uint16_t float16_to_uint16_scalbn(float16 a, int rmode, int scale,
2344 float_status *s)
2346 return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2347 rmode, scale, UINT16_MAX, s);
2350 uint32_t float16_to_uint32_scalbn(float16 a, int rmode, int scale,
2351 float_status *s)
2353 return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2354 rmode, scale, UINT32_MAX, s);
2357 uint64_t float16_to_uint64_scalbn(float16 a, int rmode, int scale,
2358 float_status *s)
2360 return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2361 rmode, scale, UINT64_MAX, s);
2364 uint16_t float32_to_uint16_scalbn(float32 a, int rmode, int scale,
2365 float_status *s)
2367 return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2368 rmode, scale, UINT16_MAX, s);
2371 uint32_t float32_to_uint32_scalbn(float32 a, int rmode, int scale,
2372 float_status *s)
2374 return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2375 rmode, scale, UINT32_MAX, s);
2378 uint64_t float32_to_uint64_scalbn(float32 a, int rmode, int scale,
2379 float_status *s)
2381 return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2382 rmode, scale, UINT64_MAX, s);
2385 uint16_t float64_to_uint16_scalbn(float64 a, int rmode, int scale,
2386 float_status *s)
2388 return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2389 rmode, scale, UINT16_MAX, s);
2392 uint32_t float64_to_uint32_scalbn(float64 a, int rmode, int scale,
2393 float_status *s)
2395 return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2396 rmode, scale, UINT32_MAX, s);
2399 uint64_t float64_to_uint64_scalbn(float64 a, int rmode, int scale,
2400 float_status *s)
2402 return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2403 rmode, scale, UINT64_MAX, s);
2406 uint16_t float16_to_uint16(float16 a, float_status *s)
2408 return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2411 uint32_t float16_to_uint32(float16 a, float_status *s)
2413 return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2416 uint64_t float16_to_uint64(float16 a, float_status *s)
2418 return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2421 uint16_t float32_to_uint16(float32 a, float_status *s)
2423 return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2426 uint32_t float32_to_uint32(float32 a, float_status *s)
2428 return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2431 uint64_t float32_to_uint64(float32 a, float_status *s)
2433 return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2436 uint16_t float64_to_uint16(float64 a, float_status *s)
2438 return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2441 uint32_t float64_to_uint32(float64 a, float_status *s)
2443 return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2446 uint64_t float64_to_uint64(float64 a, float_status *s)
2448 return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2451 uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s)
2453 return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2456 uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s)
2458 return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2461 uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s)
2463 return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2466 uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s)
2468 return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2471 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s)
2473 return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2476 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s)
2478 return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2481 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s)
2483 return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2486 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s)
2488 return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2491 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s)
2493 return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2497 * Integer to float conversions
2499 * Returns the result of converting the two's complement integer `a'
2500 * to the floating-point format. The conversion is performed according
2501 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2504 static FloatParts int_to_float(int64_t a, int scale, float_status *status)
2506 FloatParts r = { .sign = false };
2508 if (a == 0) {
2509 r.cls = float_class_zero;
2510 } else {
2511 uint64_t f = a;
2512 int shift;
2514 r.cls = float_class_normal;
2515 if (a < 0) {
2516 f = -f;
2517 r.sign = true;
2519 shift = clz64(f) - 1;
2520 scale = MIN(MAX(scale, -0x10000), 0x10000);
2522 r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2523 r.frac = (shift < 0 ? DECOMPOSED_IMPLICIT_BIT : f << shift);
2526 return r;
2529 float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status)
2531 FloatParts pa = int_to_float(a, scale, status);
2532 return float16_round_pack_canonical(pa, status);
2535 float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status)
2537 return int64_to_float16_scalbn(a, scale, status);
2540 float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status)
2542 return int64_to_float16_scalbn(a, scale, status);
2545 float16 int64_to_float16(int64_t a, float_status *status)
2547 return int64_to_float16_scalbn(a, 0, status);
2550 float16 int32_to_float16(int32_t a, float_status *status)
2552 return int64_to_float16_scalbn(a, 0, status);
2555 float16 int16_to_float16(int16_t a, float_status *status)
2557 return int64_to_float16_scalbn(a, 0, status);
2560 float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status)
2562 FloatParts pa = int_to_float(a, scale, status);
2563 return float32_round_pack_canonical(pa, status);
2566 float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status)
2568 return int64_to_float32_scalbn(a, scale, status);
2571 float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status)
2573 return int64_to_float32_scalbn(a, scale, status);
2576 float32 int64_to_float32(int64_t a, float_status *status)
2578 return int64_to_float32_scalbn(a, 0, status);
2581 float32 int32_to_float32(int32_t a, float_status *status)
2583 return int64_to_float32_scalbn(a, 0, status);
2586 float32 int16_to_float32(int16_t a, float_status *status)
2588 return int64_to_float32_scalbn(a, 0, status);
2591 float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status)
2593 FloatParts pa = int_to_float(a, scale, status);
2594 return float64_round_pack_canonical(pa, status);
2597 float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status)
2599 return int64_to_float64_scalbn(a, scale, status);
2602 float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status)
2604 return int64_to_float64_scalbn(a, scale, status);
2607 float64 int64_to_float64(int64_t a, float_status *status)
2609 return int64_to_float64_scalbn(a, 0, status);
2612 float64 int32_to_float64(int32_t a, float_status *status)
2614 return int64_to_float64_scalbn(a, 0, status);
2617 float64 int16_to_float64(int16_t a, float_status *status)
2619 return int64_to_float64_scalbn(a, 0, status);
2624 * Unsigned Integer to float conversions
2626 * Returns the result of converting the unsigned integer `a' to the
2627 * floating-point format. The conversion is performed according to the
2628 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2631 static FloatParts uint_to_float(uint64_t a, int scale, float_status *status)
2633 FloatParts r = { .sign = false };
2635 if (a == 0) {
2636 r.cls = float_class_zero;
2637 } else {
2638 scale = MIN(MAX(scale, -0x10000), 0x10000);
2639 r.cls = float_class_normal;
2640 if ((int64_t)a < 0) {
2641 r.exp = DECOMPOSED_BINARY_POINT + 1 + scale;
2642 shift64RightJamming(a, 1, &a);
2643 r.frac = a;
2644 } else {
2645 int shift = clz64(a) - 1;
2646 r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2647 r.frac = a << shift;
2651 return r;
2654 float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status)
2656 FloatParts pa = uint_to_float(a, scale, status);
2657 return float16_round_pack_canonical(pa, status);
2660 float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status)
2662 return uint64_to_float16_scalbn(a, scale, status);
2665 float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status)
2667 return uint64_to_float16_scalbn(a, scale, status);
2670 float16 uint64_to_float16(uint64_t a, float_status *status)
2672 return uint64_to_float16_scalbn(a, 0, status);
2675 float16 uint32_to_float16(uint32_t a, float_status *status)
2677 return uint64_to_float16_scalbn(a, 0, status);
2680 float16 uint16_to_float16(uint16_t a, float_status *status)
2682 return uint64_to_float16_scalbn(a, 0, status);
2685 float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status)
2687 FloatParts pa = uint_to_float(a, scale, status);
2688 return float32_round_pack_canonical(pa, status);
2691 float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status)
2693 return uint64_to_float32_scalbn(a, scale, status);
2696 float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status)
2698 return uint64_to_float32_scalbn(a, scale, status);
2701 float32 uint64_to_float32(uint64_t a, float_status *status)
2703 return uint64_to_float32_scalbn(a, 0, status);
2706 float32 uint32_to_float32(uint32_t a, float_status *status)
2708 return uint64_to_float32_scalbn(a, 0, status);
2711 float32 uint16_to_float32(uint16_t a, float_status *status)
2713 return uint64_to_float32_scalbn(a, 0, status);
2716 float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status)
2718 FloatParts pa = uint_to_float(a, scale, status);
2719 return float64_round_pack_canonical(pa, status);
2722 float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status)
2724 return uint64_to_float64_scalbn(a, scale, status);
2727 float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status)
2729 return uint64_to_float64_scalbn(a, scale, status);
2732 float64 uint64_to_float64(uint64_t a, float_status *status)
2734 return uint64_to_float64_scalbn(a, 0, status);
2737 float64 uint32_to_float64(uint32_t a, float_status *status)
2739 return uint64_to_float64_scalbn(a, 0, status);
2742 float64 uint16_to_float64(uint16_t a, float_status *status)
2744 return uint64_to_float64_scalbn(a, 0, status);
2747 /* Float Min/Max */
2748 /* min() and max() functions. These can't be implemented as
2749 * 'compare and pick one input' because that would mishandle
2750 * NaNs and +0 vs -0.
2752 * minnum() and maxnum() functions. These are similar to the min()
2753 * and max() functions but if one of the arguments is a QNaN and
2754 * the other is numerical then the numerical argument is returned.
2755 * SNaNs will get quietened before being returned.
2756 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
2757 * and maxNum() operations. min() and max() are the typical min/max
2758 * semantics provided by many CPUs which predate that specification.
2760 * minnummag() and maxnummag() functions correspond to minNumMag()
2761 * and minNumMag() from the IEEE-754 2008.
2763 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin,
2764 bool ieee, bool ismag, float_status *s)
2766 if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
2767 if (ieee) {
2768 /* Takes two floating-point values `a' and `b', one of
2769 * which is a NaN, and returns the appropriate NaN
2770 * result. If either `a' or `b' is a signaling NaN,
2771 * the invalid exception is raised.
2773 if (is_snan(a.cls) || is_snan(b.cls)) {
2774 return pick_nan(a, b, s);
2775 } else if (is_nan(a.cls) && !is_nan(b.cls)) {
2776 return b;
2777 } else if (is_nan(b.cls) && !is_nan(a.cls)) {
2778 return a;
2781 return pick_nan(a, b, s);
2782 } else {
2783 int a_exp, b_exp;
2785 switch (a.cls) {
2786 case float_class_normal:
2787 a_exp = a.exp;
2788 break;
2789 case float_class_inf:
2790 a_exp = INT_MAX;
2791 break;
2792 case float_class_zero:
2793 a_exp = INT_MIN;
2794 break;
2795 default:
2796 g_assert_not_reached();
2797 break;
2799 switch (b.cls) {
2800 case float_class_normal:
2801 b_exp = b.exp;
2802 break;
2803 case float_class_inf:
2804 b_exp = INT_MAX;
2805 break;
2806 case float_class_zero:
2807 b_exp = INT_MIN;
2808 break;
2809 default:
2810 g_assert_not_reached();
2811 break;
2814 if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
2815 bool a_less = a_exp < b_exp;
2816 if (a_exp == b_exp) {
2817 a_less = a.frac < b.frac;
2819 return a_less ^ ismin ? b : a;
2822 if (a.sign == b.sign) {
2823 bool a_less = a_exp < b_exp;
2824 if (a_exp == b_exp) {
2825 a_less = a.frac < b.frac;
2827 return a.sign ^ a_less ^ ismin ? b : a;
2828 } else {
2829 return a.sign ^ ismin ? b : a;
2834 #define MINMAX(sz, name, ismin, isiee, ismag) \
2835 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \
2836 float_status *s) \
2838 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
2839 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
2840 FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \
2842 return float ## sz ## _round_pack_canonical(pr, s); \
2845 MINMAX(16, min, true, false, false)
2846 MINMAX(16, minnum, true, true, false)
2847 MINMAX(16, minnummag, true, true, true)
2848 MINMAX(16, max, false, false, false)
2849 MINMAX(16, maxnum, false, true, false)
2850 MINMAX(16, maxnummag, false, true, true)
2852 MINMAX(32, min, true, false, false)
2853 MINMAX(32, minnum, true, true, false)
2854 MINMAX(32, minnummag, true, true, true)
2855 MINMAX(32, max, false, false, false)
2856 MINMAX(32, maxnum, false, true, false)
2857 MINMAX(32, maxnummag, false, true, true)
2859 MINMAX(64, min, true, false, false)
2860 MINMAX(64, minnum, true, true, false)
2861 MINMAX(64, minnummag, true, true, true)
2862 MINMAX(64, max, false, false, false)
2863 MINMAX(64, maxnum, false, true, false)
2864 MINMAX(64, maxnummag, false, true, true)
2866 #undef MINMAX
2868 /* Floating point compare */
2869 static int compare_floats(FloatParts a, FloatParts b, bool is_quiet,
2870 float_status *s)
2872 if (is_nan(a.cls) || is_nan(b.cls)) {
2873 if (!is_quiet ||
2874 a.cls == float_class_snan ||
2875 b.cls == float_class_snan) {
2876 s->float_exception_flags |= float_flag_invalid;
2878 return float_relation_unordered;
2881 if (a.cls == float_class_zero) {
2882 if (b.cls == float_class_zero) {
2883 return float_relation_equal;
2885 return b.sign ? float_relation_greater : float_relation_less;
2886 } else if (b.cls == float_class_zero) {
2887 return a.sign ? float_relation_less : float_relation_greater;
2890 /* The only really important thing about infinity is its sign. If
2891 * both are infinities the sign marks the smallest of the two.
2893 if (a.cls == float_class_inf) {
2894 if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
2895 return float_relation_equal;
2897 return a.sign ? float_relation_less : float_relation_greater;
2898 } else if (b.cls == float_class_inf) {
2899 return b.sign ? float_relation_greater : float_relation_less;
2902 if (a.sign != b.sign) {
2903 return a.sign ? float_relation_less : float_relation_greater;
2906 if (a.exp == b.exp) {
2907 if (a.frac == b.frac) {
2908 return float_relation_equal;
2910 if (a.sign) {
2911 return a.frac > b.frac ?
2912 float_relation_less : float_relation_greater;
2913 } else {
2914 return a.frac > b.frac ?
2915 float_relation_greater : float_relation_less;
2917 } else {
2918 if (a.sign) {
2919 return a.exp > b.exp ? float_relation_less : float_relation_greater;
2920 } else {
2921 return a.exp > b.exp ? float_relation_greater : float_relation_less;
2926 #define COMPARE(name, attr, sz) \
2927 static int attr \
2928 name(float ## sz a, float ## sz b, bool is_quiet, float_status *s) \
2930 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
2931 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
2932 return compare_floats(pa, pb, is_quiet, s); \
2935 COMPARE(soft_f16_compare, QEMU_FLATTEN, 16)
2936 COMPARE(soft_f32_compare, QEMU_SOFTFLOAT_ATTR, 32)
2937 COMPARE(soft_f64_compare, QEMU_SOFTFLOAT_ATTR, 64)
2939 #undef COMPARE
2941 int float16_compare(float16 a, float16 b, float_status *s)
2943 return soft_f16_compare(a, b, false, s);
2946 int float16_compare_quiet(float16 a, float16 b, float_status *s)
2948 return soft_f16_compare(a, b, true, s);
2951 static int QEMU_FLATTEN
2952 f32_compare(float32 xa, float32 xb, bool is_quiet, float_status *s)
2954 union_float32 ua, ub;
2956 ua.s = xa;
2957 ub.s = xb;
2959 if (QEMU_NO_HARDFLOAT) {
2960 goto soft;
2963 float32_input_flush2(&ua.s, &ub.s, s);
2964 if (isgreaterequal(ua.h, ub.h)) {
2965 if (isgreater(ua.h, ub.h)) {
2966 return float_relation_greater;
2968 return float_relation_equal;
2970 if (likely(isless(ua.h, ub.h))) {
2971 return float_relation_less;
2973 /* The only condition remaining is unordered.
2974 * Fall through to set flags.
2976 soft:
2977 return soft_f32_compare(ua.s, ub.s, is_quiet, s);
2980 int float32_compare(float32 a, float32 b, float_status *s)
2982 return f32_compare(a, b, false, s);
2985 int float32_compare_quiet(float32 a, float32 b, float_status *s)
2987 return f32_compare(a, b, true, s);
2990 static int QEMU_FLATTEN
2991 f64_compare(float64 xa, float64 xb, bool is_quiet, float_status *s)
2993 union_float64 ua, ub;
2995 ua.s = xa;
2996 ub.s = xb;
2998 if (QEMU_NO_HARDFLOAT) {
2999 goto soft;
3002 float64_input_flush2(&ua.s, &ub.s, s);
3003 if (isgreaterequal(ua.h, ub.h)) {
3004 if (isgreater(ua.h, ub.h)) {
3005 return float_relation_greater;
3007 return float_relation_equal;
3009 if (likely(isless(ua.h, ub.h))) {
3010 return float_relation_less;
3012 /* The only condition remaining is unordered.
3013 * Fall through to set flags.
3015 soft:
3016 return soft_f64_compare(ua.s, ub.s, is_quiet, s);
3019 int float64_compare(float64 a, float64 b, float_status *s)
3021 return f64_compare(a, b, false, s);
3024 int float64_compare_quiet(float64 a, float64 b, float_status *s)
3026 return f64_compare(a, b, true, s);
3029 /* Multiply A by 2 raised to the power N. */
3030 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s)
3032 if (unlikely(is_nan(a.cls))) {
3033 return return_nan(a, s);
3035 if (a.cls == float_class_normal) {
3036 /* The largest float type (even though not supported by FloatParts)
3037 * is float128, which has a 15 bit exponent. Bounding N to 16 bits
3038 * still allows rounding to infinity, without allowing overflow
3039 * within the int32_t that backs FloatParts.exp.
3041 n = MIN(MAX(n, -0x10000), 0x10000);
3042 a.exp += n;
3044 return a;
3047 float16 float16_scalbn(float16 a, int n, float_status *status)
3049 FloatParts pa = float16_unpack_canonical(a, status);
3050 FloatParts pr = scalbn_decomposed(pa, n, status);
3051 return float16_round_pack_canonical(pr, status);
3054 float32 float32_scalbn(float32 a, int n, float_status *status)
3056 FloatParts pa = float32_unpack_canonical(a, status);
3057 FloatParts pr = scalbn_decomposed(pa, n, status);
3058 return float32_round_pack_canonical(pr, status);
3061 float64 float64_scalbn(float64 a, int n, float_status *status)
3063 FloatParts pa = float64_unpack_canonical(a, status);
3064 FloatParts pr = scalbn_decomposed(pa, n, status);
3065 return float64_round_pack_canonical(pr, status);
3069 * Square Root
3071 * The old softfloat code did an approximation step before zeroing in
3072 * on the final result. However for simpleness we just compute the
3073 * square root by iterating down from the implicit bit to enough extra
3074 * bits to ensure we get a correctly rounded result.
3076 * This does mean however the calculation is slower than before,
3077 * especially for 64 bit floats.
3080 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p)
3082 uint64_t a_frac, r_frac, s_frac;
3083 int bit, last_bit;
3085 if (is_nan(a.cls)) {
3086 return return_nan(a, s);
3088 if (a.cls == float_class_zero) {
3089 return a; /* sqrt(+-0) = +-0 */
3091 if (a.sign) {
3092 s->float_exception_flags |= float_flag_invalid;
3093 return parts_default_nan(s);
3095 if (a.cls == float_class_inf) {
3096 return a; /* sqrt(+inf) = +inf */
3099 assert(a.cls == float_class_normal);
3101 /* We need two overflow bits at the top. Adding room for that is a
3102 * right shift. If the exponent is odd, we can discard the low bit
3103 * by multiplying the fraction by 2; that's a left shift. Combine
3104 * those and we shift right if the exponent is even.
3106 a_frac = a.frac;
3107 if (!(a.exp & 1)) {
3108 a_frac >>= 1;
3110 a.exp >>= 1;
3112 /* Bit-by-bit computation of sqrt. */
3113 r_frac = 0;
3114 s_frac = 0;
3116 /* Iterate from implicit bit down to the 3 extra bits to compute a
3117 * properly rounded result. Remember we've inserted one more bit
3118 * at the top, so these positions are one less.
3120 bit = DECOMPOSED_BINARY_POINT - 1;
3121 last_bit = MAX(p->frac_shift - 4, 0);
3122 do {
3123 uint64_t q = 1ULL << bit;
3124 uint64_t t_frac = s_frac + q;
3125 if (t_frac <= a_frac) {
3126 s_frac = t_frac + q;
3127 a_frac -= t_frac;
3128 r_frac += q;
3130 a_frac <<= 1;
3131 } while (--bit >= last_bit);
3133 /* Undo the right shift done above. If there is any remaining
3134 * fraction, the result is inexact. Set the sticky bit.
3136 a.frac = (r_frac << 1) + (a_frac != 0);
3138 return a;
3141 float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status)
3143 FloatParts pa = float16_unpack_canonical(a, status);
3144 FloatParts pr = sqrt_float(pa, status, &float16_params);
3145 return float16_round_pack_canonical(pr, status);
3148 static float32 QEMU_SOFTFLOAT_ATTR
3149 soft_f32_sqrt(float32 a, float_status *status)
3151 FloatParts pa = float32_unpack_canonical(a, status);
3152 FloatParts pr = sqrt_float(pa, status, &float32_params);
3153 return float32_round_pack_canonical(pr, status);
3156 static float64 QEMU_SOFTFLOAT_ATTR
3157 soft_f64_sqrt(float64 a, float_status *status)
3159 FloatParts pa = float64_unpack_canonical(a, status);
3160 FloatParts pr = sqrt_float(pa, status, &float64_params);
3161 return float64_round_pack_canonical(pr, status);
3164 float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s)
3166 union_float32 ua, ur;
3168 ua.s = xa;
3169 if (unlikely(!can_use_fpu(s))) {
3170 goto soft;
3173 float32_input_flush1(&ua.s, s);
3174 if (QEMU_HARDFLOAT_1F32_USE_FP) {
3175 if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
3176 fpclassify(ua.h) == FP_ZERO) ||
3177 signbit(ua.h))) {
3178 goto soft;
3180 } else if (unlikely(!float32_is_zero_or_normal(ua.s) ||
3181 float32_is_neg(ua.s))) {
3182 goto soft;
3184 ur.h = sqrtf(ua.h);
3185 return ur.s;
3187 soft:
3188 return soft_f32_sqrt(ua.s, s);
3191 float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s)
3193 union_float64 ua, ur;
3195 ua.s = xa;
3196 if (unlikely(!can_use_fpu(s))) {
3197 goto soft;
3200 float64_input_flush1(&ua.s, s);
3201 if (QEMU_HARDFLOAT_1F64_USE_FP) {
3202 if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
3203 fpclassify(ua.h) == FP_ZERO) ||
3204 signbit(ua.h))) {
3205 goto soft;
3207 } else if (unlikely(!float64_is_zero_or_normal(ua.s) ||
3208 float64_is_neg(ua.s))) {
3209 goto soft;
3211 ur.h = sqrt(ua.h);
3212 return ur.s;
3214 soft:
3215 return soft_f64_sqrt(ua.s, s);
3218 /*----------------------------------------------------------------------------
3219 | The pattern for a default generated NaN.
3220 *----------------------------------------------------------------------------*/
3222 float16 float16_default_nan(float_status *status)
3224 FloatParts p = parts_default_nan(status);
3225 p.frac >>= float16_params.frac_shift;
3226 return float16_pack_raw(p);
3229 float32 float32_default_nan(float_status *status)
3231 FloatParts p = parts_default_nan(status);
3232 p.frac >>= float32_params.frac_shift;
3233 return float32_pack_raw(p);
3236 float64 float64_default_nan(float_status *status)
3238 FloatParts p = parts_default_nan(status);
3239 p.frac >>= float64_params.frac_shift;
3240 return float64_pack_raw(p);
3243 float128 float128_default_nan(float_status *status)
3245 FloatParts p = parts_default_nan(status);
3246 float128 r;
3248 /* Extrapolate from the choices made by parts_default_nan to fill
3249 * in the quad-floating format. If the low bit is set, assume we
3250 * want to set all non-snan bits.
3252 r.low = -(p.frac & 1);
3253 r.high = p.frac >> (DECOMPOSED_BINARY_POINT - 48);
3254 r.high |= UINT64_C(0x7FFF000000000000);
3255 r.high |= (uint64_t)p.sign << 63;
3257 return r;
3260 /*----------------------------------------------------------------------------
3261 | Returns a quiet NaN from a signalling NaN for the floating point value `a'.
3262 *----------------------------------------------------------------------------*/
3264 float16 float16_silence_nan(float16 a, float_status *status)
3266 FloatParts p = float16_unpack_raw(a);
3267 p.frac <<= float16_params.frac_shift;
3268 p = parts_silence_nan(p, status);
3269 p.frac >>= float16_params.frac_shift;
3270 return float16_pack_raw(p);
3273 float32 float32_silence_nan(float32 a, float_status *status)
3275 FloatParts p = float32_unpack_raw(a);
3276 p.frac <<= float32_params.frac_shift;
3277 p = parts_silence_nan(p, status);
3278 p.frac >>= float32_params.frac_shift;
3279 return float32_pack_raw(p);
3282 float64 float64_silence_nan(float64 a, float_status *status)
3284 FloatParts p = float64_unpack_raw(a);
3285 p.frac <<= float64_params.frac_shift;
3286 p = parts_silence_nan(p, status);
3287 p.frac >>= float64_params.frac_shift;
3288 return float64_pack_raw(p);
3292 /*----------------------------------------------------------------------------
3293 | If `a' is denormal and we are in flush-to-zero mode then set the
3294 | input-denormal exception and return zero. Otherwise just return the value.
3295 *----------------------------------------------------------------------------*/
3297 static bool parts_squash_denormal(FloatParts p, float_status *status)
3299 if (p.exp == 0 && p.frac != 0) {
3300 float_raise(float_flag_input_denormal, status);
3301 return true;
3304 return false;
3307 float16 float16_squash_input_denormal(float16 a, float_status *status)
3309 if (status->flush_inputs_to_zero) {
3310 FloatParts p = float16_unpack_raw(a);
3311 if (parts_squash_denormal(p, status)) {
3312 return float16_set_sign(float16_zero, p.sign);
3315 return a;
3318 float32 float32_squash_input_denormal(float32 a, float_status *status)
3320 if (status->flush_inputs_to_zero) {
3321 FloatParts p = float32_unpack_raw(a);
3322 if (parts_squash_denormal(p, status)) {
3323 return float32_set_sign(float32_zero, p.sign);
3326 return a;
3329 float64 float64_squash_input_denormal(float64 a, float_status *status)
3331 if (status->flush_inputs_to_zero) {
3332 FloatParts p = float64_unpack_raw(a);
3333 if (parts_squash_denormal(p, status)) {
3334 return float64_set_sign(float64_zero, p.sign);
3337 return a;
3340 /*----------------------------------------------------------------------------
3341 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
3342 | and 7, and returns the properly rounded 32-bit integer corresponding to the
3343 | input. If `zSign' is 1, the input is negated before being converted to an
3344 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
3345 | is simply rounded to an integer, with the inexact exception raised if the
3346 | input cannot be represented exactly as an integer. However, if the fixed-
3347 | point input is too large, the invalid exception is raised and the largest
3348 | positive or negative integer is returned.
3349 *----------------------------------------------------------------------------*/
3351 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
3353 int8_t roundingMode;
3354 flag roundNearestEven;
3355 int8_t roundIncrement, roundBits;
3356 int32_t z;
3358 roundingMode = status->float_rounding_mode;
3359 roundNearestEven = ( roundingMode == float_round_nearest_even );
3360 switch (roundingMode) {
3361 case float_round_nearest_even:
3362 case float_round_ties_away:
3363 roundIncrement = 0x40;
3364 break;
3365 case float_round_to_zero:
3366 roundIncrement = 0;
3367 break;
3368 case float_round_up:
3369 roundIncrement = zSign ? 0 : 0x7f;
3370 break;
3371 case float_round_down:
3372 roundIncrement = zSign ? 0x7f : 0;
3373 break;
3374 case float_round_to_odd:
3375 roundIncrement = absZ & 0x80 ? 0 : 0x7f;
3376 break;
3377 default:
3378 abort();
3380 roundBits = absZ & 0x7F;
3381 absZ = ( absZ + roundIncrement )>>7;
3382 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
3383 z = absZ;
3384 if ( zSign ) z = - z;
3385 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
3386 float_raise(float_flag_invalid, status);
3387 return zSign ? INT32_MIN : INT32_MAX;
3389 if (roundBits) {
3390 status->float_exception_flags |= float_flag_inexact;
3392 return z;
3396 /*----------------------------------------------------------------------------
3397 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3398 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3399 | and returns the properly rounded 64-bit integer corresponding to the input.
3400 | If `zSign' is 1, the input is negated before being converted to an integer.
3401 | Ordinarily, the fixed-point input is simply rounded to an integer, with
3402 | the inexact exception raised if the input cannot be represented exactly as
3403 | an integer. However, if the fixed-point input is too large, the invalid
3404 | exception is raised and the largest positive or negative integer is
3405 | returned.
3406 *----------------------------------------------------------------------------*/
3408 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
3409 float_status *status)
3411 int8_t roundingMode;
3412 flag roundNearestEven, increment;
3413 int64_t z;
3415 roundingMode = status->float_rounding_mode;
3416 roundNearestEven = ( roundingMode == float_round_nearest_even );
3417 switch (roundingMode) {
3418 case float_round_nearest_even:
3419 case float_round_ties_away:
3420 increment = ((int64_t) absZ1 < 0);
3421 break;
3422 case float_round_to_zero:
3423 increment = 0;
3424 break;
3425 case float_round_up:
3426 increment = !zSign && absZ1;
3427 break;
3428 case float_round_down:
3429 increment = zSign && absZ1;
3430 break;
3431 case float_round_to_odd:
3432 increment = !(absZ0 & 1) && absZ1;
3433 break;
3434 default:
3435 abort();
3437 if ( increment ) {
3438 ++absZ0;
3439 if ( absZ0 == 0 ) goto overflow;
3440 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
3442 z = absZ0;
3443 if ( zSign ) z = - z;
3444 if ( z && ( ( z < 0 ) ^ zSign ) ) {
3445 overflow:
3446 float_raise(float_flag_invalid, status);
3447 return zSign ? INT64_MIN : INT64_MAX;
3449 if (absZ1) {
3450 status->float_exception_flags |= float_flag_inexact;
3452 return z;
3456 /*----------------------------------------------------------------------------
3457 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3458 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3459 | and returns the properly rounded 64-bit unsigned integer corresponding to the
3460 | input. Ordinarily, the fixed-point input is simply rounded to an integer,
3461 | with the inexact exception raised if the input cannot be represented exactly
3462 | as an integer. However, if the fixed-point input is too large, the invalid
3463 | exception is raised and the largest unsigned integer is returned.
3464 *----------------------------------------------------------------------------*/
3466 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
3467 uint64_t absZ1, float_status *status)
3469 int8_t roundingMode;
3470 flag roundNearestEven, increment;
3472 roundingMode = status->float_rounding_mode;
3473 roundNearestEven = (roundingMode == float_round_nearest_even);
3474 switch (roundingMode) {
3475 case float_round_nearest_even:
3476 case float_round_ties_away:
3477 increment = ((int64_t)absZ1 < 0);
3478 break;
3479 case float_round_to_zero:
3480 increment = 0;
3481 break;
3482 case float_round_up:
3483 increment = !zSign && absZ1;
3484 break;
3485 case float_round_down:
3486 increment = zSign && absZ1;
3487 break;
3488 case float_round_to_odd:
3489 increment = !(absZ0 & 1) && absZ1;
3490 break;
3491 default:
3492 abort();
3494 if (increment) {
3495 ++absZ0;
3496 if (absZ0 == 0) {
3497 float_raise(float_flag_invalid, status);
3498 return UINT64_MAX;
3500 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
3503 if (zSign && absZ0) {
3504 float_raise(float_flag_invalid, status);
3505 return 0;
3508 if (absZ1) {
3509 status->float_exception_flags |= float_flag_inexact;
3511 return absZ0;
3514 /*----------------------------------------------------------------------------
3515 | Normalizes the subnormal single-precision floating-point value represented
3516 | by the denormalized significand `aSig'. The normalized exponent and
3517 | significand are stored at the locations pointed to by `zExpPtr' and
3518 | `zSigPtr', respectively.
3519 *----------------------------------------------------------------------------*/
3521 static void
3522 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
3524 int8_t shiftCount;
3526 shiftCount = clz32(aSig) - 8;
3527 *zSigPtr = aSig<<shiftCount;
3528 *zExpPtr = 1 - shiftCount;
3532 /*----------------------------------------------------------------------------
3533 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3534 | and significand `zSig', and returns the proper single-precision floating-
3535 | point value corresponding to the abstract input. Ordinarily, the abstract
3536 | value is simply rounded and packed into the single-precision format, with
3537 | the inexact exception raised if the abstract input cannot be represented
3538 | exactly. However, if the abstract value is too large, the overflow and
3539 | inexact exceptions are raised and an infinity or maximal finite value is
3540 | returned. If the abstract value is too small, the input value is rounded to
3541 | a subnormal number, and the underflow and inexact exceptions are raised if
3542 | the abstract input cannot be represented exactly as a subnormal single-
3543 | precision floating-point number.
3544 | The input significand `zSig' has its binary point between bits 30
3545 | and 29, which is 7 bits to the left of the usual location. This shifted
3546 | significand must be normalized or smaller. If `zSig' is not normalized,
3547 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3548 | and it must not require rounding. In the usual case that `zSig' is
3549 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3550 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3551 | Binary Floating-Point Arithmetic.
3552 *----------------------------------------------------------------------------*/
3554 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
3555 float_status *status)
3557 int8_t roundingMode;
3558 flag roundNearestEven;
3559 int8_t roundIncrement, roundBits;
3560 flag isTiny;
3562 roundingMode = status->float_rounding_mode;
3563 roundNearestEven = ( roundingMode == float_round_nearest_even );
3564 switch (roundingMode) {
3565 case float_round_nearest_even:
3566 case float_round_ties_away:
3567 roundIncrement = 0x40;
3568 break;
3569 case float_round_to_zero:
3570 roundIncrement = 0;
3571 break;
3572 case float_round_up:
3573 roundIncrement = zSign ? 0 : 0x7f;
3574 break;
3575 case float_round_down:
3576 roundIncrement = zSign ? 0x7f : 0;
3577 break;
3578 case float_round_to_odd:
3579 roundIncrement = zSig & 0x80 ? 0 : 0x7f;
3580 break;
3581 default:
3582 abort();
3583 break;
3585 roundBits = zSig & 0x7F;
3586 if ( 0xFD <= (uint16_t) zExp ) {
3587 if ( ( 0xFD < zExp )
3588 || ( ( zExp == 0xFD )
3589 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
3591 bool overflow_to_inf = roundingMode != float_round_to_odd &&
3592 roundIncrement != 0;
3593 float_raise(float_flag_overflow | float_flag_inexact, status);
3594 return packFloat32(zSign, 0xFF, -!overflow_to_inf);
3596 if ( zExp < 0 ) {
3597 if (status->flush_to_zero) {
3598 float_raise(float_flag_output_denormal, status);
3599 return packFloat32(zSign, 0, 0);
3601 isTiny =
3602 (status->float_detect_tininess
3603 == float_tininess_before_rounding)
3604 || ( zExp < -1 )
3605 || ( zSig + roundIncrement < 0x80000000 );
3606 shift32RightJamming( zSig, - zExp, &zSig );
3607 zExp = 0;
3608 roundBits = zSig & 0x7F;
3609 if (isTiny && roundBits) {
3610 float_raise(float_flag_underflow, status);
3612 if (roundingMode == float_round_to_odd) {
3614 * For round-to-odd case, the roundIncrement depends on
3615 * zSig which just changed.
3617 roundIncrement = zSig & 0x80 ? 0 : 0x7f;
3621 if (roundBits) {
3622 status->float_exception_flags |= float_flag_inexact;
3624 zSig = ( zSig + roundIncrement )>>7;
3625 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
3626 if ( zSig == 0 ) zExp = 0;
3627 return packFloat32( zSign, zExp, zSig );
3631 /*----------------------------------------------------------------------------
3632 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3633 | and significand `zSig', and returns the proper single-precision floating-
3634 | point value corresponding to the abstract input. This routine is just like
3635 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
3636 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
3637 | floating-point exponent.
3638 *----------------------------------------------------------------------------*/
3640 static float32
3641 normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
3642 float_status *status)
3644 int8_t shiftCount;
3646 shiftCount = clz32(zSig) - 1;
3647 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
3648 status);
3652 /*----------------------------------------------------------------------------
3653 | Normalizes the subnormal double-precision floating-point value represented
3654 | by the denormalized significand `aSig'. The normalized exponent and
3655 | significand are stored at the locations pointed to by `zExpPtr' and
3656 | `zSigPtr', respectively.
3657 *----------------------------------------------------------------------------*/
3659 static void
3660 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
3662 int8_t shiftCount;
3664 shiftCount = clz64(aSig) - 11;
3665 *zSigPtr = aSig<<shiftCount;
3666 *zExpPtr = 1 - shiftCount;
3670 /*----------------------------------------------------------------------------
3671 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3672 | double-precision floating-point value, returning the result. After being
3673 | shifted into the proper positions, the three fields are simply added
3674 | together to form the result. This means that any integer portion of `zSig'
3675 | will be added into the exponent. Since a properly normalized significand
3676 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3677 | than the desired result exponent whenever `zSig' is a complete, normalized
3678 | significand.
3679 *----------------------------------------------------------------------------*/
3681 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
3684 return make_float64(
3685 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
3689 /*----------------------------------------------------------------------------
3690 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3691 | and significand `zSig', and returns the proper double-precision floating-
3692 | point value corresponding to the abstract input. Ordinarily, the abstract
3693 | value is simply rounded and packed into the double-precision format, with
3694 | the inexact exception raised if the abstract input cannot be represented
3695 | exactly. However, if the abstract value is too large, the overflow and
3696 | inexact exceptions are raised and an infinity or maximal finite value is
3697 | returned. If the abstract value is too small, the input value is rounded to
3698 | a subnormal number, and the underflow and inexact exceptions are raised if
3699 | the abstract input cannot be represented exactly as a subnormal double-
3700 | precision floating-point number.
3701 | The input significand `zSig' has its binary point between bits 62
3702 | and 61, which is 10 bits to the left of the usual location. This shifted
3703 | significand must be normalized or smaller. If `zSig' is not normalized,
3704 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3705 | and it must not require rounding. In the usual case that `zSig' is
3706 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3707 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3708 | Binary Floating-Point Arithmetic.
3709 *----------------------------------------------------------------------------*/
3711 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
3712 float_status *status)
3714 int8_t roundingMode;
3715 flag roundNearestEven;
3716 int roundIncrement, roundBits;
3717 flag isTiny;
3719 roundingMode = status->float_rounding_mode;
3720 roundNearestEven = ( roundingMode == float_round_nearest_even );
3721 switch (roundingMode) {
3722 case float_round_nearest_even:
3723 case float_round_ties_away:
3724 roundIncrement = 0x200;
3725 break;
3726 case float_round_to_zero:
3727 roundIncrement = 0;
3728 break;
3729 case float_round_up:
3730 roundIncrement = zSign ? 0 : 0x3ff;
3731 break;
3732 case float_round_down:
3733 roundIncrement = zSign ? 0x3ff : 0;
3734 break;
3735 case float_round_to_odd:
3736 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
3737 break;
3738 default:
3739 abort();
3741 roundBits = zSig & 0x3FF;
3742 if ( 0x7FD <= (uint16_t) zExp ) {
3743 if ( ( 0x7FD < zExp )
3744 || ( ( zExp == 0x7FD )
3745 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
3747 bool overflow_to_inf = roundingMode != float_round_to_odd &&
3748 roundIncrement != 0;
3749 float_raise(float_flag_overflow | float_flag_inexact, status);
3750 return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
3752 if ( zExp < 0 ) {
3753 if (status->flush_to_zero) {
3754 float_raise(float_flag_output_denormal, status);
3755 return packFloat64(zSign, 0, 0);
3757 isTiny =
3758 (status->float_detect_tininess
3759 == float_tininess_before_rounding)
3760 || ( zExp < -1 )
3761 || ( zSig + roundIncrement < UINT64_C(0x8000000000000000) );
3762 shift64RightJamming( zSig, - zExp, &zSig );
3763 zExp = 0;
3764 roundBits = zSig & 0x3FF;
3765 if (isTiny && roundBits) {
3766 float_raise(float_flag_underflow, status);
3768 if (roundingMode == float_round_to_odd) {
3770 * For round-to-odd case, the roundIncrement depends on
3771 * zSig which just changed.
3773 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
3777 if (roundBits) {
3778 status->float_exception_flags |= float_flag_inexact;
3780 zSig = ( zSig + roundIncrement )>>10;
3781 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
3782 if ( zSig == 0 ) zExp = 0;
3783 return packFloat64( zSign, zExp, zSig );
3787 /*----------------------------------------------------------------------------
3788 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3789 | and significand `zSig', and returns the proper double-precision floating-
3790 | point value corresponding to the abstract input. This routine is just like
3791 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
3792 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
3793 | floating-point exponent.
3794 *----------------------------------------------------------------------------*/
3796 static float64
3797 normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
3798 float_status *status)
3800 int8_t shiftCount;
3802 shiftCount = clz64(zSig) - 1;
3803 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
3804 status);
3808 /*----------------------------------------------------------------------------
3809 | Normalizes the subnormal extended double-precision floating-point value
3810 | represented by the denormalized significand `aSig'. The normalized exponent
3811 | and significand are stored at the locations pointed to by `zExpPtr' and
3812 | `zSigPtr', respectively.
3813 *----------------------------------------------------------------------------*/
3815 void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
3816 uint64_t *zSigPtr)
3818 int8_t shiftCount;
3820 shiftCount = clz64(aSig);
3821 *zSigPtr = aSig<<shiftCount;
3822 *zExpPtr = 1 - shiftCount;
3825 /*----------------------------------------------------------------------------
3826 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3827 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
3828 | and returns the proper extended double-precision floating-point value
3829 | corresponding to the abstract input. Ordinarily, the abstract value is
3830 | rounded and packed into the extended double-precision format, with the
3831 | inexact exception raised if the abstract input cannot be represented
3832 | exactly. However, if the abstract value is too large, the overflow and
3833 | inexact exceptions are raised and an infinity or maximal finite value is
3834 | returned. If the abstract value is too small, the input value is rounded to
3835 | a subnormal number, and the underflow and inexact exceptions are raised if
3836 | the abstract input cannot be represented exactly as a subnormal extended
3837 | double-precision floating-point number.
3838 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
3839 | number of bits as single or double precision, respectively. Otherwise, the
3840 | result is rounded to the full precision of the extended double-precision
3841 | format.
3842 | The input significand must be normalized or smaller. If the input
3843 | significand is not normalized, `zExp' must be 0; in that case, the result
3844 | returned is a subnormal number, and it must not require rounding. The
3845 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
3846 | Floating-Point Arithmetic.
3847 *----------------------------------------------------------------------------*/
3849 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
3850 int32_t zExp, uint64_t zSig0, uint64_t zSig1,
3851 float_status *status)
3853 int8_t roundingMode;
3854 flag roundNearestEven, increment, isTiny;
3855 int64_t roundIncrement, roundMask, roundBits;
3857 roundingMode = status->float_rounding_mode;
3858 roundNearestEven = ( roundingMode == float_round_nearest_even );
3859 if ( roundingPrecision == 80 ) goto precision80;
3860 if ( roundingPrecision == 64 ) {
3861 roundIncrement = UINT64_C(0x0000000000000400);
3862 roundMask = UINT64_C(0x00000000000007FF);
3864 else if ( roundingPrecision == 32 ) {
3865 roundIncrement = UINT64_C(0x0000008000000000);
3866 roundMask = UINT64_C(0x000000FFFFFFFFFF);
3868 else {
3869 goto precision80;
3871 zSig0 |= ( zSig1 != 0 );
3872 switch (roundingMode) {
3873 case float_round_nearest_even:
3874 case float_round_ties_away:
3875 break;
3876 case float_round_to_zero:
3877 roundIncrement = 0;
3878 break;
3879 case float_round_up:
3880 roundIncrement = zSign ? 0 : roundMask;
3881 break;
3882 case float_round_down:
3883 roundIncrement = zSign ? roundMask : 0;
3884 break;
3885 default:
3886 abort();
3888 roundBits = zSig0 & roundMask;
3889 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
3890 if ( ( 0x7FFE < zExp )
3891 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
3893 goto overflow;
3895 if ( zExp <= 0 ) {
3896 if (status->flush_to_zero) {
3897 float_raise(float_flag_output_denormal, status);
3898 return packFloatx80(zSign, 0, 0);
3900 isTiny =
3901 (status->float_detect_tininess
3902 == float_tininess_before_rounding)
3903 || ( zExp < 0 )
3904 || ( zSig0 <= zSig0 + roundIncrement );
3905 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
3906 zExp = 0;
3907 roundBits = zSig0 & roundMask;
3908 if (isTiny && roundBits) {
3909 float_raise(float_flag_underflow, status);
3911 if (roundBits) {
3912 status->float_exception_flags |= float_flag_inexact;
3914 zSig0 += roundIncrement;
3915 if ( (int64_t) zSig0 < 0 ) zExp = 1;
3916 roundIncrement = roundMask + 1;
3917 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
3918 roundMask |= roundIncrement;
3920 zSig0 &= ~ roundMask;
3921 return packFloatx80( zSign, zExp, zSig0 );
3924 if (roundBits) {
3925 status->float_exception_flags |= float_flag_inexact;
3927 zSig0 += roundIncrement;
3928 if ( zSig0 < roundIncrement ) {
3929 ++zExp;
3930 zSig0 = UINT64_C(0x8000000000000000);
3932 roundIncrement = roundMask + 1;
3933 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
3934 roundMask |= roundIncrement;
3936 zSig0 &= ~ roundMask;
3937 if ( zSig0 == 0 ) zExp = 0;
3938 return packFloatx80( zSign, zExp, zSig0 );
3939 precision80:
3940 switch (roundingMode) {
3941 case float_round_nearest_even:
3942 case float_round_ties_away:
3943 increment = ((int64_t)zSig1 < 0);
3944 break;
3945 case float_round_to_zero:
3946 increment = 0;
3947 break;
3948 case float_round_up:
3949 increment = !zSign && zSig1;
3950 break;
3951 case float_round_down:
3952 increment = zSign && zSig1;
3953 break;
3954 default:
3955 abort();
3957 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
3958 if ( ( 0x7FFE < zExp )
3959 || ( ( zExp == 0x7FFE )
3960 && ( zSig0 == UINT64_C(0xFFFFFFFFFFFFFFFF) )
3961 && increment
3964 roundMask = 0;
3965 overflow:
3966 float_raise(float_flag_overflow | float_flag_inexact, status);
3967 if ( ( roundingMode == float_round_to_zero )
3968 || ( zSign && ( roundingMode == float_round_up ) )
3969 || ( ! zSign && ( roundingMode == float_round_down ) )
3971 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
3973 return packFloatx80(zSign,
3974 floatx80_infinity_high,
3975 floatx80_infinity_low);
3977 if ( zExp <= 0 ) {
3978 isTiny =
3979 (status->float_detect_tininess
3980 == float_tininess_before_rounding)
3981 || ( zExp < 0 )
3982 || ! increment
3983 || ( zSig0 < UINT64_C(0xFFFFFFFFFFFFFFFF) );
3984 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
3985 zExp = 0;
3986 if (isTiny && zSig1) {
3987 float_raise(float_flag_underflow, status);
3989 if (zSig1) {
3990 status->float_exception_flags |= float_flag_inexact;
3992 switch (roundingMode) {
3993 case float_round_nearest_even:
3994 case float_round_ties_away:
3995 increment = ((int64_t)zSig1 < 0);
3996 break;
3997 case float_round_to_zero:
3998 increment = 0;
3999 break;
4000 case float_round_up:
4001 increment = !zSign && zSig1;
4002 break;
4003 case float_round_down:
4004 increment = zSign && zSig1;
4005 break;
4006 default:
4007 abort();
4009 if ( increment ) {
4010 ++zSig0;
4011 zSig0 &=
4012 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
4013 if ( (int64_t) zSig0 < 0 ) zExp = 1;
4015 return packFloatx80( zSign, zExp, zSig0 );
4018 if (zSig1) {
4019 status->float_exception_flags |= float_flag_inexact;
4021 if ( increment ) {
4022 ++zSig0;
4023 if ( zSig0 == 0 ) {
4024 ++zExp;
4025 zSig0 = UINT64_C(0x8000000000000000);
4027 else {
4028 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
4031 else {
4032 if ( zSig0 == 0 ) zExp = 0;
4034 return packFloatx80( zSign, zExp, zSig0 );
4038 /*----------------------------------------------------------------------------
4039 | Takes an abstract floating-point value having sign `zSign', exponent
4040 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
4041 | and returns the proper extended double-precision floating-point value
4042 | corresponding to the abstract input. This routine is just like
4043 | `roundAndPackFloatx80' except that the input significand does not have to be
4044 | normalized.
4045 *----------------------------------------------------------------------------*/
4047 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
4048 flag zSign, int32_t zExp,
4049 uint64_t zSig0, uint64_t zSig1,
4050 float_status *status)
4052 int8_t shiftCount;
4054 if ( zSig0 == 0 ) {
4055 zSig0 = zSig1;
4056 zSig1 = 0;
4057 zExp -= 64;
4059 shiftCount = clz64(zSig0);
4060 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4061 zExp -= shiftCount;
4062 return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
4063 zSig0, zSig1, status);
4067 /*----------------------------------------------------------------------------
4068 | Returns the least-significant 64 fraction bits of the quadruple-precision
4069 | floating-point value `a'.
4070 *----------------------------------------------------------------------------*/
4072 static inline uint64_t extractFloat128Frac1( float128 a )
4075 return a.low;
4079 /*----------------------------------------------------------------------------
4080 | Returns the most-significant 48 fraction bits of the quadruple-precision
4081 | floating-point value `a'.
4082 *----------------------------------------------------------------------------*/
4084 static inline uint64_t extractFloat128Frac0( float128 a )
4087 return a.high & UINT64_C(0x0000FFFFFFFFFFFF);
4091 /*----------------------------------------------------------------------------
4092 | Returns the exponent bits of the quadruple-precision floating-point value
4093 | `a'.
4094 *----------------------------------------------------------------------------*/
4096 static inline int32_t extractFloat128Exp( float128 a )
4099 return ( a.high>>48 ) & 0x7FFF;
4103 /*----------------------------------------------------------------------------
4104 | Returns the sign bit of the quadruple-precision floating-point value `a'.
4105 *----------------------------------------------------------------------------*/
4107 static inline flag extractFloat128Sign( float128 a )
4110 return a.high>>63;
4114 /*----------------------------------------------------------------------------
4115 | Normalizes the subnormal quadruple-precision floating-point value
4116 | represented by the denormalized significand formed by the concatenation of
4117 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
4118 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
4119 | significand are stored at the location pointed to by `zSig0Ptr', and the
4120 | least significant 64 bits of the normalized significand are stored at the
4121 | location pointed to by `zSig1Ptr'.
4122 *----------------------------------------------------------------------------*/
4124 static void
4125 normalizeFloat128Subnormal(
4126 uint64_t aSig0,
4127 uint64_t aSig1,
4128 int32_t *zExpPtr,
4129 uint64_t *zSig0Ptr,
4130 uint64_t *zSig1Ptr
4133 int8_t shiftCount;
4135 if ( aSig0 == 0 ) {
4136 shiftCount = clz64(aSig1) - 15;
4137 if ( shiftCount < 0 ) {
4138 *zSig0Ptr = aSig1>>( - shiftCount );
4139 *zSig1Ptr = aSig1<<( shiftCount & 63 );
4141 else {
4142 *zSig0Ptr = aSig1<<shiftCount;
4143 *zSig1Ptr = 0;
4145 *zExpPtr = - shiftCount - 63;
4147 else {
4148 shiftCount = clz64(aSig0) - 15;
4149 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
4150 *zExpPtr = 1 - shiftCount;
4155 /*----------------------------------------------------------------------------
4156 | Packs the sign `zSign', the exponent `zExp', and the significand formed
4157 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
4158 | floating-point value, returning the result. After being shifted into the
4159 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
4160 | added together to form the most significant 32 bits of the result. This
4161 | means that any integer portion of `zSig0' will be added into the exponent.
4162 | Since a properly normalized significand will have an integer portion equal
4163 | to 1, the `zExp' input should be 1 less than the desired result exponent
4164 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
4165 | significand.
4166 *----------------------------------------------------------------------------*/
4168 static inline float128
4169 packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
4171 float128 z;
4173 z.low = zSig1;
4174 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
4175 return z;
4179 /*----------------------------------------------------------------------------
4180 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4181 | and extended significand formed by the concatenation of `zSig0', `zSig1',
4182 | and `zSig2', and returns the proper quadruple-precision floating-point value
4183 | corresponding to the abstract input. Ordinarily, the abstract value is
4184 | simply rounded and packed into the quadruple-precision format, with the
4185 | inexact exception raised if the abstract input cannot be represented
4186 | exactly. However, if the abstract value is too large, the overflow and
4187 | inexact exceptions are raised and an infinity or maximal finite value is
4188 | returned. If the abstract value is too small, the input value is rounded to
4189 | a subnormal number, and the underflow and inexact exceptions are raised if
4190 | the abstract input cannot be represented exactly as a subnormal quadruple-
4191 | precision floating-point number.
4192 | The input significand must be normalized or smaller. If the input
4193 | significand is not normalized, `zExp' must be 0; in that case, the result
4194 | returned is a subnormal number, and it must not require rounding. In the
4195 | usual case that the input significand is normalized, `zExp' must be 1 less
4196 | than the ``true'' floating-point exponent. The handling of underflow and
4197 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4198 *----------------------------------------------------------------------------*/
4200 static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
4201 uint64_t zSig0, uint64_t zSig1,
4202 uint64_t zSig2, float_status *status)
4204 int8_t roundingMode;
4205 flag roundNearestEven, increment, isTiny;
4207 roundingMode = status->float_rounding_mode;
4208 roundNearestEven = ( roundingMode == float_round_nearest_even );
4209 switch (roundingMode) {
4210 case float_round_nearest_even:
4211 case float_round_ties_away:
4212 increment = ((int64_t)zSig2 < 0);
4213 break;
4214 case float_round_to_zero:
4215 increment = 0;
4216 break;
4217 case float_round_up:
4218 increment = !zSign && zSig2;
4219 break;
4220 case float_round_down:
4221 increment = zSign && zSig2;
4222 break;
4223 case float_round_to_odd:
4224 increment = !(zSig1 & 0x1) && zSig2;
4225 break;
4226 default:
4227 abort();
4229 if ( 0x7FFD <= (uint32_t) zExp ) {
4230 if ( ( 0x7FFD < zExp )
4231 || ( ( zExp == 0x7FFD )
4232 && eq128(
4233 UINT64_C(0x0001FFFFFFFFFFFF),
4234 UINT64_C(0xFFFFFFFFFFFFFFFF),
4235 zSig0,
4236 zSig1
4238 && increment
4241 float_raise(float_flag_overflow | float_flag_inexact, status);
4242 if ( ( roundingMode == float_round_to_zero )
4243 || ( zSign && ( roundingMode == float_round_up ) )
4244 || ( ! zSign && ( roundingMode == float_round_down ) )
4245 || (roundingMode == float_round_to_odd)
4247 return
4248 packFloat128(
4249 zSign,
4250 0x7FFE,
4251 UINT64_C(0x0000FFFFFFFFFFFF),
4252 UINT64_C(0xFFFFFFFFFFFFFFFF)
4255 return packFloat128( zSign, 0x7FFF, 0, 0 );
4257 if ( zExp < 0 ) {
4258 if (status->flush_to_zero) {
4259 float_raise(float_flag_output_denormal, status);
4260 return packFloat128(zSign, 0, 0, 0);
4262 isTiny =
4263 (status->float_detect_tininess
4264 == float_tininess_before_rounding)
4265 || ( zExp < -1 )
4266 || ! increment
4267 || lt128(
4268 zSig0,
4269 zSig1,
4270 UINT64_C(0x0001FFFFFFFFFFFF),
4271 UINT64_C(0xFFFFFFFFFFFFFFFF)
4273 shift128ExtraRightJamming(
4274 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
4275 zExp = 0;
4276 if (isTiny && zSig2) {
4277 float_raise(float_flag_underflow, status);
4279 switch (roundingMode) {
4280 case float_round_nearest_even:
4281 case float_round_ties_away:
4282 increment = ((int64_t)zSig2 < 0);
4283 break;
4284 case float_round_to_zero:
4285 increment = 0;
4286 break;
4287 case float_round_up:
4288 increment = !zSign && zSig2;
4289 break;
4290 case float_round_down:
4291 increment = zSign && zSig2;
4292 break;
4293 case float_round_to_odd:
4294 increment = !(zSig1 & 0x1) && zSig2;
4295 break;
4296 default:
4297 abort();
4301 if (zSig2) {
4302 status->float_exception_flags |= float_flag_inexact;
4304 if ( increment ) {
4305 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
4306 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
4308 else {
4309 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
4311 return packFloat128( zSign, zExp, zSig0, zSig1 );
4315 /*----------------------------------------------------------------------------
4316 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4317 | and significand formed by the concatenation of `zSig0' and `zSig1', and
4318 | returns the proper quadruple-precision floating-point value corresponding
4319 | to the abstract input. This routine is just like `roundAndPackFloat128'
4320 | except that the input significand has fewer bits and does not have to be
4321 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
4322 | point exponent.
4323 *----------------------------------------------------------------------------*/
4325 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
4326 uint64_t zSig0, uint64_t zSig1,
4327 float_status *status)
4329 int8_t shiftCount;
4330 uint64_t zSig2;
4332 if ( zSig0 == 0 ) {
4333 zSig0 = zSig1;
4334 zSig1 = 0;
4335 zExp -= 64;
4337 shiftCount = clz64(zSig0) - 15;
4338 if ( 0 <= shiftCount ) {
4339 zSig2 = 0;
4340 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4342 else {
4343 shift128ExtraRightJamming(
4344 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
4346 zExp -= shiftCount;
4347 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
4352 /*----------------------------------------------------------------------------
4353 | Returns the result of converting the 32-bit two's complement integer `a'
4354 | to the extended double-precision floating-point format. The conversion
4355 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4356 | Arithmetic.
4357 *----------------------------------------------------------------------------*/
4359 floatx80 int32_to_floatx80(int32_t a, float_status *status)
4361 flag zSign;
4362 uint32_t absA;
4363 int8_t shiftCount;
4364 uint64_t zSig;
4366 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
4367 zSign = ( a < 0 );
4368 absA = zSign ? - a : a;
4369 shiftCount = clz32(absA) + 32;
4370 zSig = absA;
4371 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
4375 /*----------------------------------------------------------------------------
4376 | Returns the result of converting the 32-bit two's complement integer `a' to
4377 | the quadruple-precision floating-point format. The conversion is performed
4378 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4379 *----------------------------------------------------------------------------*/
4381 float128 int32_to_float128(int32_t a, float_status *status)
4383 flag zSign;
4384 uint32_t absA;
4385 int8_t shiftCount;
4386 uint64_t zSig0;
4388 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
4389 zSign = ( a < 0 );
4390 absA = zSign ? - a : a;
4391 shiftCount = clz32(absA) + 17;
4392 zSig0 = absA;
4393 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
4397 /*----------------------------------------------------------------------------
4398 | Returns the result of converting the 64-bit two's complement integer `a'
4399 | to the extended double-precision floating-point format. The conversion
4400 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4401 | Arithmetic.
4402 *----------------------------------------------------------------------------*/
4404 floatx80 int64_to_floatx80(int64_t a, float_status *status)
4406 flag zSign;
4407 uint64_t absA;
4408 int8_t shiftCount;
4410 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
4411 zSign = ( a < 0 );
4412 absA = zSign ? - a : a;
4413 shiftCount = clz64(absA);
4414 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
4418 /*----------------------------------------------------------------------------
4419 | Returns the result of converting the 64-bit two's complement integer `a' to
4420 | the quadruple-precision floating-point format. The conversion is performed
4421 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4422 *----------------------------------------------------------------------------*/
4424 float128 int64_to_float128(int64_t a, float_status *status)
4426 flag zSign;
4427 uint64_t absA;
4428 int8_t shiftCount;
4429 int32_t zExp;
4430 uint64_t zSig0, zSig1;
4432 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
4433 zSign = ( a < 0 );
4434 absA = zSign ? - a : a;
4435 shiftCount = clz64(absA) + 49;
4436 zExp = 0x406E - shiftCount;
4437 if ( 64 <= shiftCount ) {
4438 zSig1 = 0;
4439 zSig0 = absA;
4440 shiftCount -= 64;
4442 else {
4443 zSig1 = absA;
4444 zSig0 = 0;
4446 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4447 return packFloat128( zSign, zExp, zSig0, zSig1 );
4451 /*----------------------------------------------------------------------------
4452 | Returns the result of converting the 64-bit unsigned integer `a'
4453 | to the quadruple-precision floating-point format. The conversion is performed
4454 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4455 *----------------------------------------------------------------------------*/
4457 float128 uint64_to_float128(uint64_t a, float_status *status)
4459 if (a == 0) {
4460 return float128_zero;
4462 return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status);
4465 /*----------------------------------------------------------------------------
4466 | Returns the result of converting the single-precision floating-point value
4467 | `a' to the extended double-precision floating-point format. The conversion
4468 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4469 | Arithmetic.
4470 *----------------------------------------------------------------------------*/
4472 floatx80 float32_to_floatx80(float32 a, float_status *status)
4474 flag aSign;
4475 int aExp;
4476 uint32_t aSig;
4478 a = float32_squash_input_denormal(a, status);
4479 aSig = extractFloat32Frac( a );
4480 aExp = extractFloat32Exp( a );
4481 aSign = extractFloat32Sign( a );
4482 if ( aExp == 0xFF ) {
4483 if (aSig) {
4484 return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
4486 return packFloatx80(aSign,
4487 floatx80_infinity_high,
4488 floatx80_infinity_low);
4490 if ( aExp == 0 ) {
4491 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4492 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4494 aSig |= 0x00800000;
4495 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
4499 /*----------------------------------------------------------------------------
4500 | Returns the result of converting the single-precision floating-point value
4501 | `a' to the double-precision floating-point format. The conversion is
4502 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4503 | Arithmetic.
4504 *----------------------------------------------------------------------------*/
4506 float128 float32_to_float128(float32 a, float_status *status)
4508 flag aSign;
4509 int aExp;
4510 uint32_t aSig;
4512 a = float32_squash_input_denormal(a, status);
4513 aSig = extractFloat32Frac( a );
4514 aExp = extractFloat32Exp( a );
4515 aSign = extractFloat32Sign( a );
4516 if ( aExp == 0xFF ) {
4517 if (aSig) {
4518 return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
4520 return packFloat128( aSign, 0x7FFF, 0, 0 );
4522 if ( aExp == 0 ) {
4523 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4524 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4525 --aExp;
4527 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
4531 /*----------------------------------------------------------------------------
4532 | Returns the remainder of the single-precision floating-point value `a'
4533 | with respect to the corresponding value `b'. The operation is performed
4534 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4535 *----------------------------------------------------------------------------*/
4537 float32 float32_rem(float32 a, float32 b, float_status *status)
4539 flag aSign, zSign;
4540 int aExp, bExp, expDiff;
4541 uint32_t aSig, bSig;
4542 uint32_t q;
4543 uint64_t aSig64, bSig64, q64;
4544 uint32_t alternateASig;
4545 int32_t sigMean;
4546 a = float32_squash_input_denormal(a, status);
4547 b = float32_squash_input_denormal(b, status);
4549 aSig = extractFloat32Frac( a );
4550 aExp = extractFloat32Exp( a );
4551 aSign = extractFloat32Sign( a );
4552 bSig = extractFloat32Frac( b );
4553 bExp = extractFloat32Exp( b );
4554 if ( aExp == 0xFF ) {
4555 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
4556 return propagateFloat32NaN(a, b, status);
4558 float_raise(float_flag_invalid, status);
4559 return float32_default_nan(status);
4561 if ( bExp == 0xFF ) {
4562 if (bSig) {
4563 return propagateFloat32NaN(a, b, status);
4565 return a;
4567 if ( bExp == 0 ) {
4568 if ( bSig == 0 ) {
4569 float_raise(float_flag_invalid, status);
4570 return float32_default_nan(status);
4572 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
4574 if ( aExp == 0 ) {
4575 if ( aSig == 0 ) return a;
4576 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4578 expDiff = aExp - bExp;
4579 aSig |= 0x00800000;
4580 bSig |= 0x00800000;
4581 if ( expDiff < 32 ) {
4582 aSig <<= 8;
4583 bSig <<= 8;
4584 if ( expDiff < 0 ) {
4585 if ( expDiff < -1 ) return a;
4586 aSig >>= 1;
4588 q = ( bSig <= aSig );
4589 if ( q ) aSig -= bSig;
4590 if ( 0 < expDiff ) {
4591 q = ( ( (uint64_t) aSig )<<32 ) / bSig;
4592 q >>= 32 - expDiff;
4593 bSig >>= 2;
4594 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4596 else {
4597 aSig >>= 2;
4598 bSig >>= 2;
4601 else {
4602 if ( bSig <= aSig ) aSig -= bSig;
4603 aSig64 = ( (uint64_t) aSig )<<40;
4604 bSig64 = ( (uint64_t) bSig )<<40;
4605 expDiff -= 64;
4606 while ( 0 < expDiff ) {
4607 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
4608 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
4609 aSig64 = - ( ( bSig * q64 )<<38 );
4610 expDiff -= 62;
4612 expDiff += 64;
4613 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
4614 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
4615 q = q64>>( 64 - expDiff );
4616 bSig <<= 6;
4617 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
4619 do {
4620 alternateASig = aSig;
4621 ++q;
4622 aSig -= bSig;
4623 } while ( 0 <= (int32_t) aSig );
4624 sigMean = aSig + alternateASig;
4625 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4626 aSig = alternateASig;
4628 zSign = ( (int32_t) aSig < 0 );
4629 if ( zSign ) aSig = - aSig;
4630 return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
4635 /*----------------------------------------------------------------------------
4636 | Returns the binary exponential of the single-precision floating-point value
4637 | `a'. The operation is performed according to the IEC/IEEE Standard for
4638 | Binary Floating-Point Arithmetic.
4640 | Uses the following identities:
4642 | 1. -------------------------------------------------------------------------
4643 | x x*ln(2)
4644 | 2 = e
4646 | 2. -------------------------------------------------------------------------
4647 | 2 3 4 5 n
4648 | x x x x x x x
4649 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
4650 | 1! 2! 3! 4! 5! n!
4651 *----------------------------------------------------------------------------*/
4653 static const float64 float32_exp2_coefficients[15] =
4655 const_float64( 0x3ff0000000000000ll ), /* 1 */
4656 const_float64( 0x3fe0000000000000ll ), /* 2 */
4657 const_float64( 0x3fc5555555555555ll ), /* 3 */
4658 const_float64( 0x3fa5555555555555ll ), /* 4 */
4659 const_float64( 0x3f81111111111111ll ), /* 5 */
4660 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */
4661 const_float64( 0x3f2a01a01a01a01all ), /* 7 */
4662 const_float64( 0x3efa01a01a01a01all ), /* 8 */
4663 const_float64( 0x3ec71de3a556c734ll ), /* 9 */
4664 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
4665 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
4666 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
4667 const_float64( 0x3de6124613a86d09ll ), /* 13 */
4668 const_float64( 0x3da93974a8c07c9dll ), /* 14 */
4669 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
4672 float32 float32_exp2(float32 a, float_status *status)
4674 flag aSign;
4675 int aExp;
4676 uint32_t aSig;
4677 float64 r, x, xn;
4678 int i;
4679 a = float32_squash_input_denormal(a, status);
4681 aSig = extractFloat32Frac( a );
4682 aExp = extractFloat32Exp( a );
4683 aSign = extractFloat32Sign( a );
4685 if ( aExp == 0xFF) {
4686 if (aSig) {
4687 return propagateFloat32NaN(a, float32_zero, status);
4689 return (aSign) ? float32_zero : a;
4691 if (aExp == 0) {
4692 if (aSig == 0) return float32_one;
4695 float_raise(float_flag_inexact, status);
4697 /* ******************************* */
4698 /* using float64 for approximation */
4699 /* ******************************* */
4700 x = float32_to_float64(a, status);
4701 x = float64_mul(x, float64_ln2, status);
4703 xn = x;
4704 r = float64_one;
4705 for (i = 0 ; i < 15 ; i++) {
4706 float64 f;
4708 f = float64_mul(xn, float32_exp2_coefficients[i], status);
4709 r = float64_add(r, f, status);
4711 xn = float64_mul(xn, x, status);
4714 return float64_to_float32(r, status);
4717 /*----------------------------------------------------------------------------
4718 | Returns the binary log of the single-precision floating-point value `a'.
4719 | The operation is performed according to the IEC/IEEE Standard for Binary
4720 | Floating-Point Arithmetic.
4721 *----------------------------------------------------------------------------*/
4722 float32 float32_log2(float32 a, float_status *status)
4724 flag aSign, zSign;
4725 int aExp;
4726 uint32_t aSig, zSig, i;
4728 a = float32_squash_input_denormal(a, status);
4729 aSig = extractFloat32Frac( a );
4730 aExp = extractFloat32Exp( a );
4731 aSign = extractFloat32Sign( a );
4733 if ( aExp == 0 ) {
4734 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
4735 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4737 if ( aSign ) {
4738 float_raise(float_flag_invalid, status);
4739 return float32_default_nan(status);
4741 if ( aExp == 0xFF ) {
4742 if (aSig) {
4743 return propagateFloat32NaN(a, float32_zero, status);
4745 return a;
4748 aExp -= 0x7F;
4749 aSig |= 0x00800000;
4750 zSign = aExp < 0;
4751 zSig = aExp << 23;
4753 for (i = 1 << 22; i > 0; i >>= 1) {
4754 aSig = ( (uint64_t)aSig * aSig ) >> 23;
4755 if ( aSig & 0x01000000 ) {
4756 aSig >>= 1;
4757 zSig |= i;
4761 if ( zSign )
4762 zSig = -zSig;
4764 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
4767 /*----------------------------------------------------------------------------
4768 | Returns 1 if the single-precision floating-point value `a' is equal to
4769 | the corresponding value `b', and 0 otherwise. The invalid exception is
4770 | raised if either operand is a NaN. Otherwise, the comparison is performed
4771 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4772 *----------------------------------------------------------------------------*/
4774 int float32_eq(float32 a, float32 b, float_status *status)
4776 uint32_t av, bv;
4777 a = float32_squash_input_denormal(a, status);
4778 b = float32_squash_input_denormal(b, status);
4780 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4781 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4783 float_raise(float_flag_invalid, status);
4784 return 0;
4786 av = float32_val(a);
4787 bv = float32_val(b);
4788 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
4791 /*----------------------------------------------------------------------------
4792 | Returns 1 if the single-precision floating-point value `a' is less than
4793 | or equal to the corresponding value `b', and 0 otherwise. The invalid
4794 | exception is raised if either operand is a NaN. The comparison is performed
4795 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4796 *----------------------------------------------------------------------------*/
4798 int float32_le(float32 a, float32 b, float_status *status)
4800 flag aSign, bSign;
4801 uint32_t av, bv;
4802 a = float32_squash_input_denormal(a, status);
4803 b = float32_squash_input_denormal(b, status);
4805 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4806 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4808 float_raise(float_flag_invalid, status);
4809 return 0;
4811 aSign = extractFloat32Sign( a );
4812 bSign = extractFloat32Sign( b );
4813 av = float32_val(a);
4814 bv = float32_val(b);
4815 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
4816 return ( av == bv ) || ( aSign ^ ( av < bv ) );
4820 /*----------------------------------------------------------------------------
4821 | Returns 1 if the single-precision floating-point value `a' is less than
4822 | the corresponding value `b', and 0 otherwise. The invalid exception is
4823 | raised if either operand is a NaN. The comparison is performed according
4824 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4825 *----------------------------------------------------------------------------*/
4827 int float32_lt(float32 a, float32 b, float_status *status)
4829 flag aSign, bSign;
4830 uint32_t av, bv;
4831 a = float32_squash_input_denormal(a, status);
4832 b = float32_squash_input_denormal(b, status);
4834 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4835 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4837 float_raise(float_flag_invalid, status);
4838 return 0;
4840 aSign = extractFloat32Sign( a );
4841 bSign = extractFloat32Sign( b );
4842 av = float32_val(a);
4843 bv = float32_val(b);
4844 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
4845 return ( av != bv ) && ( aSign ^ ( av < bv ) );
4849 /*----------------------------------------------------------------------------
4850 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
4851 | be compared, and 0 otherwise. The invalid exception is raised if either
4852 | operand is a NaN. The comparison is performed according to the IEC/IEEE
4853 | Standard for Binary Floating-Point Arithmetic.
4854 *----------------------------------------------------------------------------*/
4856 int float32_unordered(float32 a, float32 b, float_status *status)
4858 a = float32_squash_input_denormal(a, status);
4859 b = float32_squash_input_denormal(b, status);
4861 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4862 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4864 float_raise(float_flag_invalid, status);
4865 return 1;
4867 return 0;
4870 /*----------------------------------------------------------------------------
4871 | Returns 1 if the single-precision floating-point value `a' is equal to
4872 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4873 | exception. The comparison is performed according to the IEC/IEEE Standard
4874 | for Binary Floating-Point Arithmetic.
4875 *----------------------------------------------------------------------------*/
4877 int float32_eq_quiet(float32 a, float32 b, float_status *status)
4879 a = float32_squash_input_denormal(a, status);
4880 b = float32_squash_input_denormal(b, status);
4882 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4883 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4885 if (float32_is_signaling_nan(a, status)
4886 || float32_is_signaling_nan(b, status)) {
4887 float_raise(float_flag_invalid, status);
4889 return 0;
4891 return ( float32_val(a) == float32_val(b) ) ||
4892 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
4895 /*----------------------------------------------------------------------------
4896 | Returns 1 if the single-precision floating-point value `a' is less than or
4897 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4898 | cause an exception. Otherwise, the comparison is performed according to the
4899 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4900 *----------------------------------------------------------------------------*/
4902 int float32_le_quiet(float32 a, float32 b, float_status *status)
4904 flag aSign, bSign;
4905 uint32_t av, bv;
4906 a = float32_squash_input_denormal(a, status);
4907 b = float32_squash_input_denormal(b, status);
4909 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4910 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4912 if (float32_is_signaling_nan(a, status)
4913 || float32_is_signaling_nan(b, status)) {
4914 float_raise(float_flag_invalid, status);
4916 return 0;
4918 aSign = extractFloat32Sign( a );
4919 bSign = extractFloat32Sign( b );
4920 av = float32_val(a);
4921 bv = float32_val(b);
4922 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
4923 return ( av == bv ) || ( aSign ^ ( av < bv ) );
4927 /*----------------------------------------------------------------------------
4928 | Returns 1 if the single-precision floating-point value `a' is less than
4929 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4930 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
4931 | Standard for Binary Floating-Point Arithmetic.
4932 *----------------------------------------------------------------------------*/
4934 int float32_lt_quiet(float32 a, float32 b, float_status *status)
4936 flag aSign, bSign;
4937 uint32_t av, bv;
4938 a = float32_squash_input_denormal(a, status);
4939 b = float32_squash_input_denormal(b, status);
4941 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4942 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4944 if (float32_is_signaling_nan(a, status)
4945 || float32_is_signaling_nan(b, status)) {
4946 float_raise(float_flag_invalid, status);
4948 return 0;
4950 aSign = extractFloat32Sign( a );
4951 bSign = extractFloat32Sign( b );
4952 av = float32_val(a);
4953 bv = float32_val(b);
4954 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
4955 return ( av != bv ) && ( aSign ^ ( av < bv ) );
4959 /*----------------------------------------------------------------------------
4960 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
4961 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
4962 | comparison is performed according to the IEC/IEEE Standard for Binary
4963 | Floating-Point Arithmetic.
4964 *----------------------------------------------------------------------------*/
4966 int float32_unordered_quiet(float32 a, float32 b, float_status *status)
4968 a = float32_squash_input_denormal(a, status);
4969 b = float32_squash_input_denormal(b, status);
4971 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4972 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4974 if (float32_is_signaling_nan(a, status)
4975 || float32_is_signaling_nan(b, status)) {
4976 float_raise(float_flag_invalid, status);
4978 return 1;
4980 return 0;
4983 /*----------------------------------------------------------------------------
4984 | Returns the result of converting the double-precision floating-point value
4985 | `a' to the extended double-precision floating-point format. The conversion
4986 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4987 | Arithmetic.
4988 *----------------------------------------------------------------------------*/
4990 floatx80 float64_to_floatx80(float64 a, float_status *status)
4992 flag aSign;
4993 int aExp;
4994 uint64_t aSig;
4996 a = float64_squash_input_denormal(a, status);
4997 aSig = extractFloat64Frac( a );
4998 aExp = extractFloat64Exp( a );
4999 aSign = extractFloat64Sign( a );
5000 if ( aExp == 0x7FF ) {
5001 if (aSig) {
5002 return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
5004 return packFloatx80(aSign,
5005 floatx80_infinity_high,
5006 floatx80_infinity_low);
5008 if ( aExp == 0 ) {
5009 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
5010 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5012 return
5013 packFloatx80(
5014 aSign, aExp + 0x3C00, (aSig | UINT64_C(0x0010000000000000)) << 11);
5018 /*----------------------------------------------------------------------------
5019 | Returns the result of converting the double-precision floating-point value
5020 | `a' to the quadruple-precision floating-point format. The conversion is
5021 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5022 | Arithmetic.
5023 *----------------------------------------------------------------------------*/
5025 float128 float64_to_float128(float64 a, float_status *status)
5027 flag aSign;
5028 int aExp;
5029 uint64_t aSig, zSig0, zSig1;
5031 a = float64_squash_input_denormal(a, status);
5032 aSig = extractFloat64Frac( a );
5033 aExp = extractFloat64Exp( a );
5034 aSign = extractFloat64Sign( a );
5035 if ( aExp == 0x7FF ) {
5036 if (aSig) {
5037 return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
5039 return packFloat128( aSign, 0x7FFF, 0, 0 );
5041 if ( aExp == 0 ) {
5042 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
5043 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5044 --aExp;
5046 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
5047 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
5052 /*----------------------------------------------------------------------------
5053 | Returns the remainder of the double-precision floating-point value `a'
5054 | with respect to the corresponding value `b'. The operation is performed
5055 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5056 *----------------------------------------------------------------------------*/
5058 float64 float64_rem(float64 a, float64 b, float_status *status)
5060 flag aSign, zSign;
5061 int aExp, bExp, expDiff;
5062 uint64_t aSig, bSig;
5063 uint64_t q, alternateASig;
5064 int64_t sigMean;
5066 a = float64_squash_input_denormal(a, status);
5067 b = float64_squash_input_denormal(b, status);
5068 aSig = extractFloat64Frac( a );
5069 aExp = extractFloat64Exp( a );
5070 aSign = extractFloat64Sign( a );
5071 bSig = extractFloat64Frac( b );
5072 bExp = extractFloat64Exp( b );
5073 if ( aExp == 0x7FF ) {
5074 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
5075 return propagateFloat64NaN(a, b, status);
5077 float_raise(float_flag_invalid, status);
5078 return float64_default_nan(status);
5080 if ( bExp == 0x7FF ) {
5081 if (bSig) {
5082 return propagateFloat64NaN(a, b, status);
5084 return a;
5086 if ( bExp == 0 ) {
5087 if ( bSig == 0 ) {
5088 float_raise(float_flag_invalid, status);
5089 return float64_default_nan(status);
5091 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
5093 if ( aExp == 0 ) {
5094 if ( aSig == 0 ) return a;
5095 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5097 expDiff = aExp - bExp;
5098 aSig = (aSig | UINT64_C(0x0010000000000000)) << 11;
5099 bSig = (bSig | UINT64_C(0x0010000000000000)) << 11;
5100 if ( expDiff < 0 ) {
5101 if ( expDiff < -1 ) return a;
5102 aSig >>= 1;
5104 q = ( bSig <= aSig );
5105 if ( q ) aSig -= bSig;
5106 expDiff -= 64;
5107 while ( 0 < expDiff ) {
5108 q = estimateDiv128To64( aSig, 0, bSig );
5109 q = ( 2 < q ) ? q - 2 : 0;
5110 aSig = - ( ( bSig>>2 ) * q );
5111 expDiff -= 62;
5113 expDiff += 64;
5114 if ( 0 < expDiff ) {
5115 q = estimateDiv128To64( aSig, 0, bSig );
5116 q = ( 2 < q ) ? q - 2 : 0;
5117 q >>= 64 - expDiff;
5118 bSig >>= 2;
5119 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
5121 else {
5122 aSig >>= 2;
5123 bSig >>= 2;
5125 do {
5126 alternateASig = aSig;
5127 ++q;
5128 aSig -= bSig;
5129 } while ( 0 <= (int64_t) aSig );
5130 sigMean = aSig + alternateASig;
5131 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
5132 aSig = alternateASig;
5134 zSign = ( (int64_t) aSig < 0 );
5135 if ( zSign ) aSig = - aSig;
5136 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
5140 /*----------------------------------------------------------------------------
5141 | Returns the binary log of the double-precision floating-point value `a'.
5142 | The operation is performed according to the IEC/IEEE Standard for Binary
5143 | Floating-Point Arithmetic.
5144 *----------------------------------------------------------------------------*/
5145 float64 float64_log2(float64 a, float_status *status)
5147 flag aSign, zSign;
5148 int aExp;
5149 uint64_t aSig, aSig0, aSig1, zSig, i;
5150 a = float64_squash_input_denormal(a, status);
5152 aSig = extractFloat64Frac( a );
5153 aExp = extractFloat64Exp( a );
5154 aSign = extractFloat64Sign( a );
5156 if ( aExp == 0 ) {
5157 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
5158 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5160 if ( aSign ) {
5161 float_raise(float_flag_invalid, status);
5162 return float64_default_nan(status);
5164 if ( aExp == 0x7FF ) {
5165 if (aSig) {
5166 return propagateFloat64NaN(a, float64_zero, status);
5168 return a;
5171 aExp -= 0x3FF;
5172 aSig |= UINT64_C(0x0010000000000000);
5173 zSign = aExp < 0;
5174 zSig = (uint64_t)aExp << 52;
5175 for (i = 1LL << 51; i > 0; i >>= 1) {
5176 mul64To128( aSig, aSig, &aSig0, &aSig1 );
5177 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
5178 if ( aSig & UINT64_C(0x0020000000000000) ) {
5179 aSig >>= 1;
5180 zSig |= i;
5184 if ( zSign )
5185 zSig = -zSig;
5186 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
5189 /*----------------------------------------------------------------------------
5190 | Returns 1 if the double-precision floating-point value `a' is equal to the
5191 | corresponding value `b', and 0 otherwise. The invalid exception is raised
5192 | if either operand is a NaN. Otherwise, the comparison is performed
5193 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5194 *----------------------------------------------------------------------------*/
5196 int float64_eq(float64 a, float64 b, float_status *status)
5198 uint64_t av, bv;
5199 a = float64_squash_input_denormal(a, status);
5200 b = float64_squash_input_denormal(b, status);
5202 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5203 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5205 float_raise(float_flag_invalid, status);
5206 return 0;
5208 av = float64_val(a);
5209 bv = float64_val(b);
5210 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
5214 /*----------------------------------------------------------------------------
5215 | Returns 1 if the double-precision floating-point value `a' is less than or
5216 | equal to the corresponding value `b', and 0 otherwise. The invalid
5217 | exception is raised if either operand is a NaN. The comparison is performed
5218 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5219 *----------------------------------------------------------------------------*/
5221 int float64_le(float64 a, float64 b, float_status *status)
5223 flag aSign, bSign;
5224 uint64_t av, bv;
5225 a = float64_squash_input_denormal(a, status);
5226 b = float64_squash_input_denormal(b, status);
5228 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5229 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5231 float_raise(float_flag_invalid, status);
5232 return 0;
5234 aSign = extractFloat64Sign( a );
5235 bSign = extractFloat64Sign( b );
5236 av = float64_val(a);
5237 bv = float64_val(b);
5238 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
5239 return ( av == bv ) || ( aSign ^ ( av < bv ) );
5243 /*----------------------------------------------------------------------------
5244 | Returns 1 if the double-precision floating-point value `a' is less than
5245 | the corresponding value `b', and 0 otherwise. The invalid exception is
5246 | raised if either operand is a NaN. The comparison is performed according
5247 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5248 *----------------------------------------------------------------------------*/
5250 int float64_lt(float64 a, float64 b, float_status *status)
5252 flag aSign, bSign;
5253 uint64_t av, bv;
5255 a = float64_squash_input_denormal(a, status);
5256 b = float64_squash_input_denormal(b, status);
5257 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5258 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5260 float_raise(float_flag_invalid, status);
5261 return 0;
5263 aSign = extractFloat64Sign( a );
5264 bSign = extractFloat64Sign( b );
5265 av = float64_val(a);
5266 bv = float64_val(b);
5267 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
5268 return ( av != bv ) && ( aSign ^ ( av < bv ) );
5272 /*----------------------------------------------------------------------------
5273 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
5274 | be compared, and 0 otherwise. The invalid exception is raised if either
5275 | operand is a NaN. The comparison is performed according to the IEC/IEEE
5276 | Standard for Binary Floating-Point Arithmetic.
5277 *----------------------------------------------------------------------------*/
5279 int float64_unordered(float64 a, float64 b, float_status *status)
5281 a = float64_squash_input_denormal(a, status);
5282 b = float64_squash_input_denormal(b, status);
5284 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5285 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5287 float_raise(float_flag_invalid, status);
5288 return 1;
5290 return 0;
5293 /*----------------------------------------------------------------------------
5294 | Returns 1 if the double-precision floating-point value `a' is equal to the
5295 | corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5296 | exception.The comparison is performed according to the IEC/IEEE Standard
5297 | for Binary Floating-Point Arithmetic.
5298 *----------------------------------------------------------------------------*/
5300 int float64_eq_quiet(float64 a, float64 b, float_status *status)
5302 uint64_t av, bv;
5303 a = float64_squash_input_denormal(a, status);
5304 b = float64_squash_input_denormal(b, status);
5306 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5307 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5309 if (float64_is_signaling_nan(a, status)
5310 || float64_is_signaling_nan(b, status)) {
5311 float_raise(float_flag_invalid, status);
5313 return 0;
5315 av = float64_val(a);
5316 bv = float64_val(b);
5317 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
5321 /*----------------------------------------------------------------------------
5322 | Returns 1 if the double-precision floating-point value `a' is less than or
5323 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5324 | cause an exception. Otherwise, the comparison is performed according to the
5325 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5326 *----------------------------------------------------------------------------*/
5328 int float64_le_quiet(float64 a, float64 b, float_status *status)
5330 flag aSign, bSign;
5331 uint64_t av, bv;
5332 a = float64_squash_input_denormal(a, status);
5333 b = float64_squash_input_denormal(b, status);
5335 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5336 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5338 if (float64_is_signaling_nan(a, status)
5339 || float64_is_signaling_nan(b, status)) {
5340 float_raise(float_flag_invalid, status);
5342 return 0;
5344 aSign = extractFloat64Sign( a );
5345 bSign = extractFloat64Sign( b );
5346 av = float64_val(a);
5347 bv = float64_val(b);
5348 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
5349 return ( av == bv ) || ( aSign ^ ( av < bv ) );
5353 /*----------------------------------------------------------------------------
5354 | Returns 1 if the double-precision floating-point value `a' is less than
5355 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5356 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
5357 | Standard for Binary Floating-Point Arithmetic.
5358 *----------------------------------------------------------------------------*/
5360 int float64_lt_quiet(float64 a, float64 b, float_status *status)
5362 flag aSign, bSign;
5363 uint64_t av, bv;
5364 a = float64_squash_input_denormal(a, status);
5365 b = float64_squash_input_denormal(b, status);
5367 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5368 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5370 if (float64_is_signaling_nan(a, status)
5371 || float64_is_signaling_nan(b, status)) {
5372 float_raise(float_flag_invalid, status);
5374 return 0;
5376 aSign = extractFloat64Sign( a );
5377 bSign = extractFloat64Sign( b );
5378 av = float64_val(a);
5379 bv = float64_val(b);
5380 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
5381 return ( av != bv ) && ( aSign ^ ( av < bv ) );
5385 /*----------------------------------------------------------------------------
5386 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
5387 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
5388 | comparison is performed according to the IEC/IEEE Standard for Binary
5389 | Floating-Point Arithmetic.
5390 *----------------------------------------------------------------------------*/
5392 int float64_unordered_quiet(float64 a, float64 b, float_status *status)
5394 a = float64_squash_input_denormal(a, status);
5395 b = float64_squash_input_denormal(b, status);
5397 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5398 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5400 if (float64_is_signaling_nan(a, status)
5401 || float64_is_signaling_nan(b, status)) {
5402 float_raise(float_flag_invalid, status);
5404 return 1;
5406 return 0;
5409 /*----------------------------------------------------------------------------
5410 | Returns the result of converting the extended double-precision floating-
5411 | point value `a' to the 32-bit two's complement integer format. The
5412 | conversion is performed according to the IEC/IEEE Standard for Binary
5413 | Floating-Point Arithmetic---which means in particular that the conversion
5414 | is rounded according to the current rounding mode. If `a' is a NaN, the
5415 | largest positive integer is returned. Otherwise, if the conversion
5416 | overflows, the largest integer with the same sign as `a' is returned.
5417 *----------------------------------------------------------------------------*/
5419 int32_t floatx80_to_int32(floatx80 a, float_status *status)
5421 flag aSign;
5422 int32_t aExp, shiftCount;
5423 uint64_t aSig;
5425 if (floatx80_invalid_encoding(a)) {
5426 float_raise(float_flag_invalid, status);
5427 return 1 << 31;
5429 aSig = extractFloatx80Frac( a );
5430 aExp = extractFloatx80Exp( a );
5431 aSign = extractFloatx80Sign( a );
5432 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5433 shiftCount = 0x4037 - aExp;
5434 if ( shiftCount <= 0 ) shiftCount = 1;
5435 shift64RightJamming( aSig, shiftCount, &aSig );
5436 return roundAndPackInt32(aSign, aSig, status);
5440 /*----------------------------------------------------------------------------
5441 | Returns the result of converting the extended double-precision floating-
5442 | point value `a' to the 32-bit two's complement integer format. The
5443 | conversion is performed according to the IEC/IEEE Standard for Binary
5444 | Floating-Point Arithmetic, except that the conversion is always rounded
5445 | toward zero. If `a' is a NaN, the largest positive integer is returned.
5446 | Otherwise, if the conversion overflows, the largest integer with the same
5447 | sign as `a' is returned.
5448 *----------------------------------------------------------------------------*/
5450 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
5452 flag aSign;
5453 int32_t aExp, shiftCount;
5454 uint64_t aSig, savedASig;
5455 int32_t z;
5457 if (floatx80_invalid_encoding(a)) {
5458 float_raise(float_flag_invalid, status);
5459 return 1 << 31;
5461 aSig = extractFloatx80Frac( a );
5462 aExp = extractFloatx80Exp( a );
5463 aSign = extractFloatx80Sign( a );
5464 if ( 0x401E < aExp ) {
5465 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5466 goto invalid;
5468 else if ( aExp < 0x3FFF ) {
5469 if (aExp || aSig) {
5470 status->float_exception_flags |= float_flag_inexact;
5472 return 0;
5474 shiftCount = 0x403E - aExp;
5475 savedASig = aSig;
5476 aSig >>= shiftCount;
5477 z = aSig;
5478 if ( aSign ) z = - z;
5479 if ( ( z < 0 ) ^ aSign ) {
5480 invalid:
5481 float_raise(float_flag_invalid, status);
5482 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5484 if ( ( aSig<<shiftCount ) != savedASig ) {
5485 status->float_exception_flags |= float_flag_inexact;
5487 return z;
5491 /*----------------------------------------------------------------------------
5492 | Returns the result of converting the extended double-precision floating-
5493 | point value `a' to the 64-bit two's complement integer format. The
5494 | conversion is performed according to the IEC/IEEE Standard for Binary
5495 | Floating-Point Arithmetic---which means in particular that the conversion
5496 | is rounded according to the current rounding mode. If `a' is a NaN,
5497 | the largest positive integer is returned. Otherwise, if the conversion
5498 | overflows, the largest integer with the same sign as `a' is returned.
5499 *----------------------------------------------------------------------------*/
5501 int64_t floatx80_to_int64(floatx80 a, float_status *status)
5503 flag aSign;
5504 int32_t aExp, shiftCount;
5505 uint64_t aSig, aSigExtra;
5507 if (floatx80_invalid_encoding(a)) {
5508 float_raise(float_flag_invalid, status);
5509 return 1ULL << 63;
5511 aSig = extractFloatx80Frac( a );
5512 aExp = extractFloatx80Exp( a );
5513 aSign = extractFloatx80Sign( a );
5514 shiftCount = 0x403E - aExp;
5515 if ( shiftCount <= 0 ) {
5516 if ( shiftCount ) {
5517 float_raise(float_flag_invalid, status);
5518 if (!aSign || floatx80_is_any_nan(a)) {
5519 return INT64_MAX;
5521 return INT64_MIN;
5523 aSigExtra = 0;
5525 else {
5526 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
5528 return roundAndPackInt64(aSign, aSig, aSigExtra, status);
5532 /*----------------------------------------------------------------------------
5533 | Returns the result of converting the extended double-precision floating-
5534 | point value `a' to the 64-bit two's complement integer format. The
5535 | conversion is performed according to the IEC/IEEE Standard for Binary
5536 | Floating-Point Arithmetic, except that the conversion is always rounded
5537 | toward zero. If `a' is a NaN, the largest positive integer is returned.
5538 | Otherwise, if the conversion overflows, the largest integer with the same
5539 | sign as `a' is returned.
5540 *----------------------------------------------------------------------------*/
5542 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
5544 flag aSign;
5545 int32_t aExp, shiftCount;
5546 uint64_t aSig;
5547 int64_t z;
5549 if (floatx80_invalid_encoding(a)) {
5550 float_raise(float_flag_invalid, status);
5551 return 1ULL << 63;
5553 aSig = extractFloatx80Frac( a );
5554 aExp = extractFloatx80Exp( a );
5555 aSign = extractFloatx80Sign( a );
5556 shiftCount = aExp - 0x403E;
5557 if ( 0 <= shiftCount ) {
5558 aSig &= UINT64_C(0x7FFFFFFFFFFFFFFF);
5559 if ( ( a.high != 0xC03E ) || aSig ) {
5560 float_raise(float_flag_invalid, status);
5561 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
5562 return INT64_MAX;
5565 return INT64_MIN;
5567 else if ( aExp < 0x3FFF ) {
5568 if (aExp | aSig) {
5569 status->float_exception_flags |= float_flag_inexact;
5571 return 0;
5573 z = aSig>>( - shiftCount );
5574 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
5575 status->float_exception_flags |= float_flag_inexact;
5577 if ( aSign ) z = - z;
5578 return z;
5582 /*----------------------------------------------------------------------------
5583 | Returns the result of converting the extended double-precision floating-
5584 | point value `a' to the single-precision floating-point format. The
5585 | conversion is performed according to the IEC/IEEE Standard for Binary
5586 | Floating-Point Arithmetic.
5587 *----------------------------------------------------------------------------*/
5589 float32 floatx80_to_float32(floatx80 a, float_status *status)
5591 flag aSign;
5592 int32_t aExp;
5593 uint64_t aSig;
5595 if (floatx80_invalid_encoding(a)) {
5596 float_raise(float_flag_invalid, status);
5597 return float32_default_nan(status);
5599 aSig = extractFloatx80Frac( a );
5600 aExp = extractFloatx80Exp( a );
5601 aSign = extractFloatx80Sign( a );
5602 if ( aExp == 0x7FFF ) {
5603 if ( (uint64_t) ( aSig<<1 ) ) {
5604 return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
5606 return packFloat32( aSign, 0xFF, 0 );
5608 shift64RightJamming( aSig, 33, &aSig );
5609 if ( aExp || aSig ) aExp -= 0x3F81;
5610 return roundAndPackFloat32(aSign, aExp, aSig, status);
5614 /*----------------------------------------------------------------------------
5615 | Returns the result of converting the extended double-precision floating-
5616 | point value `a' to the double-precision floating-point format. The
5617 | conversion is performed according to the IEC/IEEE Standard for Binary
5618 | Floating-Point Arithmetic.
5619 *----------------------------------------------------------------------------*/
5621 float64 floatx80_to_float64(floatx80 a, float_status *status)
5623 flag aSign;
5624 int32_t aExp;
5625 uint64_t aSig, zSig;
5627 if (floatx80_invalid_encoding(a)) {
5628 float_raise(float_flag_invalid, status);
5629 return float64_default_nan(status);
5631 aSig = extractFloatx80Frac( a );
5632 aExp = extractFloatx80Exp( a );
5633 aSign = extractFloatx80Sign( a );
5634 if ( aExp == 0x7FFF ) {
5635 if ( (uint64_t) ( aSig<<1 ) ) {
5636 return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
5638 return packFloat64( aSign, 0x7FF, 0 );
5640 shift64RightJamming( aSig, 1, &zSig );
5641 if ( aExp || aSig ) aExp -= 0x3C01;
5642 return roundAndPackFloat64(aSign, aExp, zSig, status);
5646 /*----------------------------------------------------------------------------
5647 | Returns the result of converting the extended double-precision floating-
5648 | point value `a' to the quadruple-precision floating-point format. The
5649 | conversion is performed according to the IEC/IEEE Standard for Binary
5650 | Floating-Point Arithmetic.
5651 *----------------------------------------------------------------------------*/
5653 float128 floatx80_to_float128(floatx80 a, float_status *status)
5655 flag aSign;
5656 int aExp;
5657 uint64_t aSig, zSig0, zSig1;
5659 if (floatx80_invalid_encoding(a)) {
5660 float_raise(float_flag_invalid, status);
5661 return float128_default_nan(status);
5663 aSig = extractFloatx80Frac( a );
5664 aExp = extractFloatx80Exp( a );
5665 aSign = extractFloatx80Sign( a );
5666 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
5667 return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
5669 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
5670 return packFloat128( aSign, aExp, zSig0, zSig1 );
5674 /*----------------------------------------------------------------------------
5675 | Rounds the extended double-precision floating-point value `a'
5676 | to the precision provided by floatx80_rounding_precision and returns the
5677 | result as an extended double-precision floating-point value.
5678 | The operation is performed according to the IEC/IEEE Standard for Binary
5679 | Floating-Point Arithmetic.
5680 *----------------------------------------------------------------------------*/
5682 floatx80 floatx80_round(floatx80 a, float_status *status)
5684 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5685 extractFloatx80Sign(a),
5686 extractFloatx80Exp(a),
5687 extractFloatx80Frac(a), 0, status);
5690 /*----------------------------------------------------------------------------
5691 | Rounds the extended double-precision floating-point value `a' to an integer,
5692 | and returns the result as an extended quadruple-precision floating-point
5693 | value. The operation is performed according to the IEC/IEEE Standard for
5694 | Binary Floating-Point Arithmetic.
5695 *----------------------------------------------------------------------------*/
5697 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
5699 flag aSign;
5700 int32_t aExp;
5701 uint64_t lastBitMask, roundBitsMask;
5702 floatx80 z;
5704 if (floatx80_invalid_encoding(a)) {
5705 float_raise(float_flag_invalid, status);
5706 return floatx80_default_nan(status);
5708 aExp = extractFloatx80Exp( a );
5709 if ( 0x403E <= aExp ) {
5710 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
5711 return propagateFloatx80NaN(a, a, status);
5713 return a;
5715 if ( aExp < 0x3FFF ) {
5716 if ( ( aExp == 0 )
5717 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
5718 return a;
5720 status->float_exception_flags |= float_flag_inexact;
5721 aSign = extractFloatx80Sign( a );
5722 switch (status->float_rounding_mode) {
5723 case float_round_nearest_even:
5724 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
5726 return
5727 packFloatx80( aSign, 0x3FFF, UINT64_C(0x8000000000000000));
5729 break;
5730 case float_round_ties_away:
5731 if (aExp == 0x3FFE) {
5732 return packFloatx80(aSign, 0x3FFF, UINT64_C(0x8000000000000000));
5734 break;
5735 case float_round_down:
5736 return
5737 aSign ?
5738 packFloatx80( 1, 0x3FFF, UINT64_C(0x8000000000000000))
5739 : packFloatx80( 0, 0, 0 );
5740 case float_round_up:
5741 return
5742 aSign ? packFloatx80( 1, 0, 0 )
5743 : packFloatx80( 0, 0x3FFF, UINT64_C(0x8000000000000000));
5745 return packFloatx80( aSign, 0, 0 );
5747 lastBitMask = 1;
5748 lastBitMask <<= 0x403E - aExp;
5749 roundBitsMask = lastBitMask - 1;
5750 z = a;
5751 switch (status->float_rounding_mode) {
5752 case float_round_nearest_even:
5753 z.low += lastBitMask>>1;
5754 if ((z.low & roundBitsMask) == 0) {
5755 z.low &= ~lastBitMask;
5757 break;
5758 case float_round_ties_away:
5759 z.low += lastBitMask >> 1;
5760 break;
5761 case float_round_to_zero:
5762 break;
5763 case float_round_up:
5764 if (!extractFloatx80Sign(z)) {
5765 z.low += roundBitsMask;
5767 break;
5768 case float_round_down:
5769 if (extractFloatx80Sign(z)) {
5770 z.low += roundBitsMask;
5772 break;
5773 default:
5774 abort();
5776 z.low &= ~ roundBitsMask;
5777 if ( z.low == 0 ) {
5778 ++z.high;
5779 z.low = UINT64_C(0x8000000000000000);
5781 if (z.low != a.low) {
5782 status->float_exception_flags |= float_flag_inexact;
5784 return z;
5788 /*----------------------------------------------------------------------------
5789 | Returns the result of adding the absolute values of the extended double-
5790 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
5791 | negated before being returned. `zSign' is ignored if the result is a NaN.
5792 | The addition is performed according to the IEC/IEEE Standard for Binary
5793 | Floating-Point Arithmetic.
5794 *----------------------------------------------------------------------------*/
5796 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5797 float_status *status)
5799 int32_t aExp, bExp, zExp;
5800 uint64_t aSig, bSig, zSig0, zSig1;
5801 int32_t expDiff;
5803 aSig = extractFloatx80Frac( a );
5804 aExp = extractFloatx80Exp( a );
5805 bSig = extractFloatx80Frac( b );
5806 bExp = extractFloatx80Exp( b );
5807 expDiff = aExp - bExp;
5808 if ( 0 < expDiff ) {
5809 if ( aExp == 0x7FFF ) {
5810 if ((uint64_t)(aSig << 1)) {
5811 return propagateFloatx80NaN(a, b, status);
5813 return a;
5815 if ( bExp == 0 ) --expDiff;
5816 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5817 zExp = aExp;
5819 else if ( expDiff < 0 ) {
5820 if ( bExp == 0x7FFF ) {
5821 if ((uint64_t)(bSig << 1)) {
5822 return propagateFloatx80NaN(a, b, status);
5824 return packFloatx80(zSign,
5825 floatx80_infinity_high,
5826 floatx80_infinity_low);
5828 if ( aExp == 0 ) ++expDiff;
5829 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5830 zExp = bExp;
5832 else {
5833 if ( aExp == 0x7FFF ) {
5834 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5835 return propagateFloatx80NaN(a, b, status);
5837 return a;
5839 zSig1 = 0;
5840 zSig0 = aSig + bSig;
5841 if ( aExp == 0 ) {
5842 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
5843 goto roundAndPack;
5845 zExp = aExp;
5846 goto shiftRight1;
5848 zSig0 = aSig + bSig;
5849 if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
5850 shiftRight1:
5851 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
5852 zSig0 |= UINT64_C(0x8000000000000000);
5853 ++zExp;
5854 roundAndPack:
5855 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5856 zSign, zExp, zSig0, zSig1, status);
5859 /*----------------------------------------------------------------------------
5860 | Returns the result of subtracting the absolute values of the extended
5861 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
5862 | difference is negated before being returned. `zSign' is ignored if the
5863 | result is a NaN. The subtraction is performed according to the IEC/IEEE
5864 | Standard for Binary Floating-Point Arithmetic.
5865 *----------------------------------------------------------------------------*/
5867 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5868 float_status *status)
5870 int32_t aExp, bExp, zExp;
5871 uint64_t aSig, bSig, zSig0, zSig1;
5872 int32_t expDiff;
5874 aSig = extractFloatx80Frac( a );
5875 aExp = extractFloatx80Exp( a );
5876 bSig = extractFloatx80Frac( b );
5877 bExp = extractFloatx80Exp( b );
5878 expDiff = aExp - bExp;
5879 if ( 0 < expDiff ) goto aExpBigger;
5880 if ( expDiff < 0 ) goto bExpBigger;
5881 if ( aExp == 0x7FFF ) {
5882 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5883 return propagateFloatx80NaN(a, b, status);
5885 float_raise(float_flag_invalid, status);
5886 return floatx80_default_nan(status);
5888 if ( aExp == 0 ) {
5889 aExp = 1;
5890 bExp = 1;
5892 zSig1 = 0;
5893 if ( bSig < aSig ) goto aBigger;
5894 if ( aSig < bSig ) goto bBigger;
5895 return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
5896 bExpBigger:
5897 if ( bExp == 0x7FFF ) {
5898 if ((uint64_t)(bSig << 1)) {
5899 return propagateFloatx80NaN(a, b, status);
5901 return packFloatx80(zSign ^ 1, floatx80_infinity_high,
5902 floatx80_infinity_low);
5904 if ( aExp == 0 ) ++expDiff;
5905 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5906 bBigger:
5907 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
5908 zExp = bExp;
5909 zSign ^= 1;
5910 goto normalizeRoundAndPack;
5911 aExpBigger:
5912 if ( aExp == 0x7FFF ) {
5913 if ((uint64_t)(aSig << 1)) {
5914 return propagateFloatx80NaN(a, b, status);
5916 return a;
5918 if ( bExp == 0 ) --expDiff;
5919 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5920 aBigger:
5921 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
5922 zExp = aExp;
5923 normalizeRoundAndPack:
5924 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
5925 zSign, zExp, zSig0, zSig1, status);
5928 /*----------------------------------------------------------------------------
5929 | Returns the result of adding the extended double-precision floating-point
5930 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5931 | Standard for Binary Floating-Point Arithmetic.
5932 *----------------------------------------------------------------------------*/
5934 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
5936 flag aSign, bSign;
5938 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5939 float_raise(float_flag_invalid, status);
5940 return floatx80_default_nan(status);
5942 aSign = extractFloatx80Sign( a );
5943 bSign = extractFloatx80Sign( b );
5944 if ( aSign == bSign ) {
5945 return addFloatx80Sigs(a, b, aSign, status);
5947 else {
5948 return subFloatx80Sigs(a, b, aSign, status);
5953 /*----------------------------------------------------------------------------
5954 | Returns the result of subtracting the extended double-precision floating-
5955 | point values `a' and `b'. The operation is performed according to the
5956 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5957 *----------------------------------------------------------------------------*/
5959 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5961 flag aSign, bSign;
5963 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5964 float_raise(float_flag_invalid, status);
5965 return floatx80_default_nan(status);
5967 aSign = extractFloatx80Sign( a );
5968 bSign = extractFloatx80Sign( b );
5969 if ( aSign == bSign ) {
5970 return subFloatx80Sigs(a, b, aSign, status);
5972 else {
5973 return addFloatx80Sigs(a, b, aSign, status);
5978 /*----------------------------------------------------------------------------
5979 | Returns the result of multiplying the extended double-precision floating-
5980 | point values `a' and `b'. The operation is performed according to the
5981 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5982 *----------------------------------------------------------------------------*/
5984 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
5986 flag aSign, bSign, zSign;
5987 int32_t aExp, bExp, zExp;
5988 uint64_t aSig, bSig, zSig0, zSig1;
5990 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5991 float_raise(float_flag_invalid, status);
5992 return floatx80_default_nan(status);
5994 aSig = extractFloatx80Frac( a );
5995 aExp = extractFloatx80Exp( a );
5996 aSign = extractFloatx80Sign( a );
5997 bSig = extractFloatx80Frac( b );
5998 bExp = extractFloatx80Exp( b );
5999 bSign = extractFloatx80Sign( b );
6000 zSign = aSign ^ bSign;
6001 if ( aExp == 0x7FFF ) {
6002 if ( (uint64_t) ( aSig<<1 )
6003 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
6004 return propagateFloatx80NaN(a, b, status);
6006 if ( ( bExp | bSig ) == 0 ) goto invalid;
6007 return packFloatx80(zSign, floatx80_infinity_high,
6008 floatx80_infinity_low);
6010 if ( bExp == 0x7FFF ) {
6011 if ((uint64_t)(bSig << 1)) {
6012 return propagateFloatx80NaN(a, b, status);
6014 if ( ( aExp | aSig ) == 0 ) {
6015 invalid:
6016 float_raise(float_flag_invalid, status);
6017 return floatx80_default_nan(status);
6019 return packFloatx80(zSign, floatx80_infinity_high,
6020 floatx80_infinity_low);
6022 if ( aExp == 0 ) {
6023 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
6024 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
6026 if ( bExp == 0 ) {
6027 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
6028 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
6030 zExp = aExp + bExp - 0x3FFE;
6031 mul64To128( aSig, bSig, &zSig0, &zSig1 );
6032 if ( 0 < (int64_t) zSig0 ) {
6033 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
6034 --zExp;
6036 return roundAndPackFloatx80(status->floatx80_rounding_precision,
6037 zSign, zExp, zSig0, zSig1, status);
6040 /*----------------------------------------------------------------------------
6041 | Returns the result of dividing the extended double-precision floating-point
6042 | value `a' by the corresponding value `b'. The operation is performed
6043 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6044 *----------------------------------------------------------------------------*/
6046 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
6048 flag aSign, bSign, zSign;
6049 int32_t aExp, bExp, zExp;
6050 uint64_t aSig, bSig, zSig0, zSig1;
6051 uint64_t rem0, rem1, rem2, term0, term1, term2;
6053 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6054 float_raise(float_flag_invalid, status);
6055 return floatx80_default_nan(status);
6057 aSig = extractFloatx80Frac( a );
6058 aExp = extractFloatx80Exp( a );
6059 aSign = extractFloatx80Sign( a );
6060 bSig = extractFloatx80Frac( b );
6061 bExp = extractFloatx80Exp( b );
6062 bSign = extractFloatx80Sign( b );
6063 zSign = aSign ^ bSign;
6064 if ( aExp == 0x7FFF ) {
6065 if ((uint64_t)(aSig << 1)) {
6066 return propagateFloatx80NaN(a, b, status);
6068 if ( bExp == 0x7FFF ) {
6069 if ((uint64_t)(bSig << 1)) {
6070 return propagateFloatx80NaN(a, b, status);
6072 goto invalid;
6074 return packFloatx80(zSign, floatx80_infinity_high,
6075 floatx80_infinity_low);
6077 if ( bExp == 0x7FFF ) {
6078 if ((uint64_t)(bSig << 1)) {
6079 return propagateFloatx80NaN(a, b, status);
6081 return packFloatx80( zSign, 0, 0 );
6083 if ( bExp == 0 ) {
6084 if ( bSig == 0 ) {
6085 if ( ( aExp | aSig ) == 0 ) {
6086 invalid:
6087 float_raise(float_flag_invalid, status);
6088 return floatx80_default_nan(status);
6090 float_raise(float_flag_divbyzero, status);
6091 return packFloatx80(zSign, floatx80_infinity_high,
6092 floatx80_infinity_low);
6094 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
6096 if ( aExp == 0 ) {
6097 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
6098 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
6100 zExp = aExp - bExp + 0x3FFE;
6101 rem1 = 0;
6102 if ( bSig <= aSig ) {
6103 shift128Right( aSig, 0, 1, &aSig, &rem1 );
6104 ++zExp;
6106 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
6107 mul64To128( bSig, zSig0, &term0, &term1 );
6108 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
6109 while ( (int64_t) rem0 < 0 ) {
6110 --zSig0;
6111 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
6113 zSig1 = estimateDiv128To64( rem1, 0, bSig );
6114 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
6115 mul64To128( bSig, zSig1, &term1, &term2 );
6116 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6117 while ( (int64_t) rem1 < 0 ) {
6118 --zSig1;
6119 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
6121 zSig1 |= ( ( rem1 | rem2 ) != 0 );
6123 return roundAndPackFloatx80(status->floatx80_rounding_precision,
6124 zSign, zExp, zSig0, zSig1, status);
6127 /*----------------------------------------------------------------------------
6128 | Returns the remainder of the extended double-precision floating-point value
6129 | `a' with respect to the corresponding value `b'. The operation is performed
6130 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6131 *----------------------------------------------------------------------------*/
6133 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
6135 flag aSign, zSign;
6136 int32_t aExp, bExp, expDiff;
6137 uint64_t aSig0, aSig1, bSig;
6138 uint64_t q, term0, term1, alternateASig0, alternateASig1;
6140 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6141 float_raise(float_flag_invalid, status);
6142 return floatx80_default_nan(status);
6144 aSig0 = extractFloatx80Frac( a );
6145 aExp = extractFloatx80Exp( a );
6146 aSign = extractFloatx80Sign( a );
6147 bSig = extractFloatx80Frac( b );
6148 bExp = extractFloatx80Exp( b );
6149 if ( aExp == 0x7FFF ) {
6150 if ( (uint64_t) ( aSig0<<1 )
6151 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
6152 return propagateFloatx80NaN(a, b, status);
6154 goto invalid;
6156 if ( bExp == 0x7FFF ) {
6157 if ((uint64_t)(bSig << 1)) {
6158 return propagateFloatx80NaN(a, b, status);
6160 return a;
6162 if ( bExp == 0 ) {
6163 if ( bSig == 0 ) {
6164 invalid:
6165 float_raise(float_flag_invalid, status);
6166 return floatx80_default_nan(status);
6168 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
6170 if ( aExp == 0 ) {
6171 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
6172 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
6174 bSig |= UINT64_C(0x8000000000000000);
6175 zSign = aSign;
6176 expDiff = aExp - bExp;
6177 aSig1 = 0;
6178 if ( expDiff < 0 ) {
6179 if ( expDiff < -1 ) return a;
6180 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
6181 expDiff = 0;
6183 q = ( bSig <= aSig0 );
6184 if ( q ) aSig0 -= bSig;
6185 expDiff -= 64;
6186 while ( 0 < expDiff ) {
6187 q = estimateDiv128To64( aSig0, aSig1, bSig );
6188 q = ( 2 < q ) ? q - 2 : 0;
6189 mul64To128( bSig, q, &term0, &term1 );
6190 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6191 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
6192 expDiff -= 62;
6194 expDiff += 64;
6195 if ( 0 < expDiff ) {
6196 q = estimateDiv128To64( aSig0, aSig1, bSig );
6197 q = ( 2 < q ) ? q - 2 : 0;
6198 q >>= 64 - expDiff;
6199 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
6200 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6201 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
6202 while ( le128( term0, term1, aSig0, aSig1 ) ) {
6203 ++q;
6204 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6207 else {
6208 term1 = 0;
6209 term0 = bSig;
6211 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
6212 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
6213 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
6214 && ( q & 1 ) )
6216 aSig0 = alternateASig0;
6217 aSig1 = alternateASig1;
6218 zSign = ! zSign;
6220 return
6221 normalizeRoundAndPackFloatx80(
6222 80, zSign, bExp + expDiff, aSig0, aSig1, status);
6226 /*----------------------------------------------------------------------------
6227 | Returns the square root of the extended double-precision floating-point
6228 | value `a'. The operation is performed according to the IEC/IEEE Standard
6229 | for Binary Floating-Point Arithmetic.
6230 *----------------------------------------------------------------------------*/
6232 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
6234 flag aSign;
6235 int32_t aExp, zExp;
6236 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
6237 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6239 if (floatx80_invalid_encoding(a)) {
6240 float_raise(float_flag_invalid, status);
6241 return floatx80_default_nan(status);
6243 aSig0 = extractFloatx80Frac( a );
6244 aExp = extractFloatx80Exp( a );
6245 aSign = extractFloatx80Sign( a );
6246 if ( aExp == 0x7FFF ) {
6247 if ((uint64_t)(aSig0 << 1)) {
6248 return propagateFloatx80NaN(a, a, status);
6250 if ( ! aSign ) return a;
6251 goto invalid;
6253 if ( aSign ) {
6254 if ( ( aExp | aSig0 ) == 0 ) return a;
6255 invalid:
6256 float_raise(float_flag_invalid, status);
6257 return floatx80_default_nan(status);
6259 if ( aExp == 0 ) {
6260 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
6261 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
6263 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
6264 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
6265 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
6266 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6267 doubleZSig0 = zSig0<<1;
6268 mul64To128( zSig0, zSig0, &term0, &term1 );
6269 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6270 while ( (int64_t) rem0 < 0 ) {
6271 --zSig0;
6272 doubleZSig0 -= 2;
6273 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6275 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6276 if ( ( zSig1 & UINT64_C(0x3FFFFFFFFFFFFFFF) ) <= 5 ) {
6277 if ( zSig1 == 0 ) zSig1 = 1;
6278 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6279 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6280 mul64To128( zSig1, zSig1, &term2, &term3 );
6281 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6282 while ( (int64_t) rem1 < 0 ) {
6283 --zSig1;
6284 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6285 term3 |= 1;
6286 term2 |= doubleZSig0;
6287 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6289 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6291 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
6292 zSig0 |= doubleZSig0;
6293 return roundAndPackFloatx80(status->floatx80_rounding_precision,
6294 0, zExp, zSig0, zSig1, status);
6297 /*----------------------------------------------------------------------------
6298 | Returns 1 if the extended double-precision floating-point value `a' is equal
6299 | to the corresponding value `b', and 0 otherwise. The invalid exception is
6300 | raised if either operand is a NaN. Otherwise, the comparison is performed
6301 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6302 *----------------------------------------------------------------------------*/
6304 int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
6307 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
6308 || (extractFloatx80Exp(a) == 0x7FFF
6309 && (uint64_t) (extractFloatx80Frac(a) << 1))
6310 || (extractFloatx80Exp(b) == 0x7FFF
6311 && (uint64_t) (extractFloatx80Frac(b) << 1))
6313 float_raise(float_flag_invalid, status);
6314 return 0;
6316 return
6317 ( a.low == b.low )
6318 && ( ( a.high == b.high )
6319 || ( ( a.low == 0 )
6320 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6325 /*----------------------------------------------------------------------------
6326 | Returns 1 if the extended double-precision floating-point value `a' is
6327 | less than or equal to the corresponding value `b', and 0 otherwise. The
6328 | invalid exception is raised if either operand is a NaN. The comparison is
6329 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6330 | Arithmetic.
6331 *----------------------------------------------------------------------------*/
6333 int floatx80_le(floatx80 a, floatx80 b, float_status *status)
6335 flag aSign, bSign;
6337 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
6338 || (extractFloatx80Exp(a) == 0x7FFF
6339 && (uint64_t) (extractFloatx80Frac(a) << 1))
6340 || (extractFloatx80Exp(b) == 0x7FFF
6341 && (uint64_t) (extractFloatx80Frac(b) << 1))
6343 float_raise(float_flag_invalid, status);
6344 return 0;
6346 aSign = extractFloatx80Sign( a );
6347 bSign = extractFloatx80Sign( b );
6348 if ( aSign != bSign ) {
6349 return
6350 aSign
6351 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6352 == 0 );
6354 return
6355 aSign ? le128( b.high, b.low, a.high, a.low )
6356 : le128( a.high, a.low, b.high, b.low );
6360 /*----------------------------------------------------------------------------
6361 | Returns 1 if the extended double-precision floating-point value `a' is
6362 | less than the corresponding value `b', and 0 otherwise. The invalid
6363 | exception is raised if either operand is a NaN. The comparison is performed
6364 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6365 *----------------------------------------------------------------------------*/
6367 int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
6369 flag aSign, bSign;
6371 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
6372 || (extractFloatx80Exp(a) == 0x7FFF
6373 && (uint64_t) (extractFloatx80Frac(a) << 1))
6374 || (extractFloatx80Exp(b) == 0x7FFF
6375 && (uint64_t) (extractFloatx80Frac(b) << 1))
6377 float_raise(float_flag_invalid, status);
6378 return 0;
6380 aSign = extractFloatx80Sign( a );
6381 bSign = extractFloatx80Sign( b );
6382 if ( aSign != bSign ) {
6383 return
6384 aSign
6385 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6386 != 0 );
6388 return
6389 aSign ? lt128( b.high, b.low, a.high, a.low )
6390 : lt128( a.high, a.low, b.high, b.low );
6394 /*----------------------------------------------------------------------------
6395 | Returns 1 if the extended double-precision floating-point values `a' and `b'
6396 | cannot be compared, and 0 otherwise. The invalid exception is raised if
6397 | either operand is a NaN. The comparison is performed according to the
6398 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6399 *----------------------------------------------------------------------------*/
6400 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
6402 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
6403 || (extractFloatx80Exp(a) == 0x7FFF
6404 && (uint64_t) (extractFloatx80Frac(a) << 1))
6405 || (extractFloatx80Exp(b) == 0x7FFF
6406 && (uint64_t) (extractFloatx80Frac(b) << 1))
6408 float_raise(float_flag_invalid, status);
6409 return 1;
6411 return 0;
6414 /*----------------------------------------------------------------------------
6415 | Returns 1 if the extended double-precision floating-point value `a' is
6416 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
6417 | cause an exception. The comparison is performed according to the IEC/IEEE
6418 | Standard for Binary Floating-Point Arithmetic.
6419 *----------------------------------------------------------------------------*/
6421 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
6424 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6425 float_raise(float_flag_invalid, status);
6426 return 0;
6428 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
6429 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6430 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
6431 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6433 if (floatx80_is_signaling_nan(a, status)
6434 || floatx80_is_signaling_nan(b, status)) {
6435 float_raise(float_flag_invalid, status);
6437 return 0;
6439 return
6440 ( a.low == b.low )
6441 && ( ( a.high == b.high )
6442 || ( ( a.low == 0 )
6443 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6448 /*----------------------------------------------------------------------------
6449 | Returns 1 if the extended double-precision floating-point value `a' is less
6450 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
6451 | do not cause an exception. Otherwise, the comparison is performed according
6452 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6453 *----------------------------------------------------------------------------*/
6455 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
6457 flag aSign, bSign;
6459 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6460 float_raise(float_flag_invalid, status);
6461 return 0;
6463 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
6464 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6465 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
6466 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6468 if (floatx80_is_signaling_nan(a, status)
6469 || floatx80_is_signaling_nan(b, status)) {
6470 float_raise(float_flag_invalid, status);
6472 return 0;
6474 aSign = extractFloatx80Sign( a );
6475 bSign = extractFloatx80Sign( b );
6476 if ( aSign != bSign ) {
6477 return
6478 aSign
6479 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6480 == 0 );
6482 return
6483 aSign ? le128( b.high, b.low, a.high, a.low )
6484 : le128( a.high, a.low, b.high, b.low );
6488 /*----------------------------------------------------------------------------
6489 | Returns 1 if the extended double-precision floating-point value `a' is less
6490 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
6491 | an exception. Otherwise, the comparison is performed according to the
6492 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6493 *----------------------------------------------------------------------------*/
6495 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
6497 flag aSign, bSign;
6499 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6500 float_raise(float_flag_invalid, status);
6501 return 0;
6503 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
6504 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6505 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
6506 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6508 if (floatx80_is_signaling_nan(a, status)
6509 || floatx80_is_signaling_nan(b, status)) {
6510 float_raise(float_flag_invalid, status);
6512 return 0;
6514 aSign = extractFloatx80Sign( a );
6515 bSign = extractFloatx80Sign( b );
6516 if ( aSign != bSign ) {
6517 return
6518 aSign
6519 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6520 != 0 );
6522 return
6523 aSign ? lt128( b.high, b.low, a.high, a.low )
6524 : lt128( a.high, a.low, b.high, b.low );
6528 /*----------------------------------------------------------------------------
6529 | Returns 1 if the extended double-precision floating-point values `a' and `b'
6530 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
6531 | The comparison is performed according to the IEC/IEEE Standard for Binary
6532 | Floating-Point Arithmetic.
6533 *----------------------------------------------------------------------------*/
6534 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
6536 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6537 float_raise(float_flag_invalid, status);
6538 return 1;
6540 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
6541 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6542 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
6543 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6545 if (floatx80_is_signaling_nan(a, status)
6546 || floatx80_is_signaling_nan(b, status)) {
6547 float_raise(float_flag_invalid, status);
6549 return 1;
6551 return 0;
6554 /*----------------------------------------------------------------------------
6555 | Returns the result of converting the quadruple-precision floating-point
6556 | value `a' to the 32-bit two's complement integer format. The conversion
6557 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6558 | Arithmetic---which means in particular that the conversion is rounded
6559 | according to the current rounding mode. If `a' is a NaN, the largest
6560 | positive integer is returned. Otherwise, if the conversion overflows, the
6561 | largest integer with the same sign as `a' is returned.
6562 *----------------------------------------------------------------------------*/
6564 int32_t float128_to_int32(float128 a, float_status *status)
6566 flag aSign;
6567 int32_t aExp, shiftCount;
6568 uint64_t aSig0, aSig1;
6570 aSig1 = extractFloat128Frac1( a );
6571 aSig0 = extractFloat128Frac0( a );
6572 aExp = extractFloat128Exp( a );
6573 aSign = extractFloat128Sign( a );
6574 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
6575 if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
6576 aSig0 |= ( aSig1 != 0 );
6577 shiftCount = 0x4028 - aExp;
6578 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
6579 return roundAndPackInt32(aSign, aSig0, status);
6583 /*----------------------------------------------------------------------------
6584 | Returns the result of converting the quadruple-precision floating-point
6585 | value `a' to the 32-bit two's complement integer format. The conversion
6586 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6587 | Arithmetic, except that the conversion is always rounded toward zero. If
6588 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
6589 | conversion overflows, the largest integer with the same sign as `a' is
6590 | returned.
6591 *----------------------------------------------------------------------------*/
6593 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
6595 flag aSign;
6596 int32_t aExp, shiftCount;
6597 uint64_t aSig0, aSig1, savedASig;
6598 int32_t z;
6600 aSig1 = extractFloat128Frac1( a );
6601 aSig0 = extractFloat128Frac0( a );
6602 aExp = extractFloat128Exp( a );
6603 aSign = extractFloat128Sign( a );
6604 aSig0 |= ( aSig1 != 0 );
6605 if ( 0x401E < aExp ) {
6606 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
6607 goto invalid;
6609 else if ( aExp < 0x3FFF ) {
6610 if (aExp || aSig0) {
6611 status->float_exception_flags |= float_flag_inexact;
6613 return 0;
6615 aSig0 |= UINT64_C(0x0001000000000000);
6616 shiftCount = 0x402F - aExp;
6617 savedASig = aSig0;
6618 aSig0 >>= shiftCount;
6619 z = aSig0;
6620 if ( aSign ) z = - z;
6621 if ( ( z < 0 ) ^ aSign ) {
6622 invalid:
6623 float_raise(float_flag_invalid, status);
6624 return aSign ? INT32_MIN : INT32_MAX;
6626 if ( ( aSig0<<shiftCount ) != savedASig ) {
6627 status->float_exception_flags |= float_flag_inexact;
6629 return z;
6633 /*----------------------------------------------------------------------------
6634 | Returns the result of converting the quadruple-precision floating-point
6635 | value `a' to the 64-bit two's complement integer format. The conversion
6636 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6637 | Arithmetic---which means in particular that the conversion is rounded
6638 | according to the current rounding mode. If `a' is a NaN, the largest
6639 | positive integer is returned. Otherwise, if the conversion overflows, the
6640 | largest integer with the same sign as `a' is returned.
6641 *----------------------------------------------------------------------------*/
6643 int64_t float128_to_int64(float128 a, float_status *status)
6645 flag aSign;
6646 int32_t aExp, shiftCount;
6647 uint64_t aSig0, aSig1;
6649 aSig1 = extractFloat128Frac1( a );
6650 aSig0 = extractFloat128Frac0( a );
6651 aExp = extractFloat128Exp( a );
6652 aSign = extractFloat128Sign( a );
6653 if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
6654 shiftCount = 0x402F - aExp;
6655 if ( shiftCount <= 0 ) {
6656 if ( 0x403E < aExp ) {
6657 float_raise(float_flag_invalid, status);
6658 if ( ! aSign
6659 || ( ( aExp == 0x7FFF )
6660 && ( aSig1 || ( aSig0 != UINT64_C(0x0001000000000000) ) )
6663 return INT64_MAX;
6665 return INT64_MIN;
6667 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
6669 else {
6670 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
6672 return roundAndPackInt64(aSign, aSig0, aSig1, status);
6676 /*----------------------------------------------------------------------------
6677 | Returns the result of converting the quadruple-precision floating-point
6678 | value `a' to the 64-bit two's complement integer format. The conversion
6679 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6680 | Arithmetic, except that the conversion is always rounded toward zero.
6681 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
6682 | the conversion overflows, the largest integer with the same sign as `a' is
6683 | returned.
6684 *----------------------------------------------------------------------------*/
6686 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
6688 flag aSign;
6689 int32_t aExp, shiftCount;
6690 uint64_t aSig0, aSig1;
6691 int64_t z;
6693 aSig1 = extractFloat128Frac1( a );
6694 aSig0 = extractFloat128Frac0( a );
6695 aExp = extractFloat128Exp( a );
6696 aSign = extractFloat128Sign( a );
6697 if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
6698 shiftCount = aExp - 0x402F;
6699 if ( 0 < shiftCount ) {
6700 if ( 0x403E <= aExp ) {
6701 aSig0 &= UINT64_C(0x0000FFFFFFFFFFFF);
6702 if ( ( a.high == UINT64_C(0xC03E000000000000) )
6703 && ( aSig1 < UINT64_C(0x0002000000000000) ) ) {
6704 if (aSig1) {
6705 status->float_exception_flags |= float_flag_inexact;
6708 else {
6709 float_raise(float_flag_invalid, status);
6710 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
6711 return INT64_MAX;
6714 return INT64_MIN;
6716 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
6717 if ( (uint64_t) ( aSig1<<shiftCount ) ) {
6718 status->float_exception_flags |= float_flag_inexact;
6721 else {
6722 if ( aExp < 0x3FFF ) {
6723 if ( aExp | aSig0 | aSig1 ) {
6724 status->float_exception_flags |= float_flag_inexact;
6726 return 0;
6728 z = aSig0>>( - shiftCount );
6729 if ( aSig1
6730 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
6731 status->float_exception_flags |= float_flag_inexact;
6734 if ( aSign ) z = - z;
6735 return z;
6739 /*----------------------------------------------------------------------------
6740 | Returns the result of converting the quadruple-precision floating-point value
6741 | `a' to the 64-bit unsigned integer format. The conversion is
6742 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6743 | Arithmetic---which means in particular that the conversion is rounded
6744 | according to the current rounding mode. If `a' is a NaN, the largest
6745 | positive integer is returned. If the conversion overflows, the
6746 | largest unsigned integer is returned. If 'a' is negative, the value is
6747 | rounded and zero is returned; negative values that do not round to zero
6748 | will raise the inexact exception.
6749 *----------------------------------------------------------------------------*/
6751 uint64_t float128_to_uint64(float128 a, float_status *status)
6753 flag aSign;
6754 int aExp;
6755 int shiftCount;
6756 uint64_t aSig0, aSig1;
6758 aSig0 = extractFloat128Frac0(a);
6759 aSig1 = extractFloat128Frac1(a);
6760 aExp = extractFloat128Exp(a);
6761 aSign = extractFloat128Sign(a);
6762 if (aSign && (aExp > 0x3FFE)) {
6763 float_raise(float_flag_invalid, status);
6764 if (float128_is_any_nan(a)) {
6765 return UINT64_MAX;
6766 } else {
6767 return 0;
6770 if (aExp) {
6771 aSig0 |= UINT64_C(0x0001000000000000);
6773 shiftCount = 0x402F - aExp;
6774 if (shiftCount <= 0) {
6775 if (0x403E < aExp) {
6776 float_raise(float_flag_invalid, status);
6777 return UINT64_MAX;
6779 shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
6780 } else {
6781 shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
6783 return roundAndPackUint64(aSign, aSig0, aSig1, status);
6786 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
6788 uint64_t v;
6789 signed char current_rounding_mode = status->float_rounding_mode;
6791 set_float_rounding_mode(float_round_to_zero, status);
6792 v = float128_to_uint64(a, status);
6793 set_float_rounding_mode(current_rounding_mode, status);
6795 return v;
6798 /*----------------------------------------------------------------------------
6799 | Returns the result of converting the quadruple-precision floating-point
6800 | value `a' to the 32-bit unsigned integer format. The conversion
6801 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6802 | Arithmetic except that the conversion is always rounded toward zero.
6803 | If `a' is a NaN, the largest positive integer is returned. Otherwise,
6804 | if the conversion overflows, the largest unsigned integer is returned.
6805 | If 'a' is negative, the value is rounded and zero is returned; negative
6806 | values that do not round to zero will raise the inexact exception.
6807 *----------------------------------------------------------------------------*/
6809 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
6811 uint64_t v;
6812 uint32_t res;
6813 int old_exc_flags = get_float_exception_flags(status);
6815 v = float128_to_uint64_round_to_zero(a, status);
6816 if (v > 0xffffffff) {
6817 res = 0xffffffff;
6818 } else {
6819 return v;
6821 set_float_exception_flags(old_exc_flags, status);
6822 float_raise(float_flag_invalid, status);
6823 return res;
6826 /*----------------------------------------------------------------------------
6827 | Returns the result of converting the quadruple-precision floating-point value
6828 | `a' to the 32-bit unsigned integer format. The conversion is
6829 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6830 | Arithmetic---which means in particular that the conversion is rounded
6831 | according to the current rounding mode. If `a' is a NaN, the largest
6832 | positive integer is returned. If the conversion overflows, the
6833 | largest unsigned integer is returned. If 'a' is negative, the value is
6834 | rounded and zero is returned; negative values that do not round to zero
6835 | will raise the inexact exception.
6836 *----------------------------------------------------------------------------*/
6838 uint32_t float128_to_uint32(float128 a, float_status *status)
6840 uint64_t v;
6841 uint32_t res;
6842 int old_exc_flags = get_float_exception_flags(status);
6844 v = float128_to_uint64(a, status);
6845 if (v > 0xffffffff) {
6846 res = 0xffffffff;
6847 } else {
6848 return v;
6850 set_float_exception_flags(old_exc_flags, status);
6851 float_raise(float_flag_invalid, status);
6852 return res;
6855 /*----------------------------------------------------------------------------
6856 | Returns the result of converting the quadruple-precision floating-point
6857 | value `a' to the single-precision floating-point format. The conversion
6858 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6859 | Arithmetic.
6860 *----------------------------------------------------------------------------*/
6862 float32 float128_to_float32(float128 a, float_status *status)
6864 flag aSign;
6865 int32_t aExp;
6866 uint64_t aSig0, aSig1;
6867 uint32_t zSig;
6869 aSig1 = extractFloat128Frac1( a );
6870 aSig0 = extractFloat128Frac0( a );
6871 aExp = extractFloat128Exp( a );
6872 aSign = extractFloat128Sign( a );
6873 if ( aExp == 0x7FFF ) {
6874 if ( aSig0 | aSig1 ) {
6875 return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
6877 return packFloat32( aSign, 0xFF, 0 );
6879 aSig0 |= ( aSig1 != 0 );
6880 shift64RightJamming( aSig0, 18, &aSig0 );
6881 zSig = aSig0;
6882 if ( aExp || zSig ) {
6883 zSig |= 0x40000000;
6884 aExp -= 0x3F81;
6886 return roundAndPackFloat32(aSign, aExp, zSig, status);
6890 /*----------------------------------------------------------------------------
6891 | Returns the result of converting the quadruple-precision floating-point
6892 | value `a' to the double-precision floating-point format. The conversion
6893 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6894 | Arithmetic.
6895 *----------------------------------------------------------------------------*/
6897 float64 float128_to_float64(float128 a, float_status *status)
6899 flag aSign;
6900 int32_t aExp;
6901 uint64_t aSig0, aSig1;
6903 aSig1 = extractFloat128Frac1( a );
6904 aSig0 = extractFloat128Frac0( a );
6905 aExp = extractFloat128Exp( a );
6906 aSign = extractFloat128Sign( a );
6907 if ( aExp == 0x7FFF ) {
6908 if ( aSig0 | aSig1 ) {
6909 return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
6911 return packFloat64( aSign, 0x7FF, 0 );
6913 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6914 aSig0 |= ( aSig1 != 0 );
6915 if ( aExp || aSig0 ) {
6916 aSig0 |= UINT64_C(0x4000000000000000);
6917 aExp -= 0x3C01;
6919 return roundAndPackFloat64(aSign, aExp, aSig0, status);
6923 /*----------------------------------------------------------------------------
6924 | Returns the result of converting the quadruple-precision floating-point
6925 | value `a' to the extended double-precision floating-point format. The
6926 | conversion is performed according to the IEC/IEEE Standard for Binary
6927 | Floating-Point Arithmetic.
6928 *----------------------------------------------------------------------------*/
6930 floatx80 float128_to_floatx80(float128 a, float_status *status)
6932 flag aSign;
6933 int32_t aExp;
6934 uint64_t aSig0, aSig1;
6936 aSig1 = extractFloat128Frac1( a );
6937 aSig0 = extractFloat128Frac0( a );
6938 aExp = extractFloat128Exp( a );
6939 aSign = extractFloat128Sign( a );
6940 if ( aExp == 0x7FFF ) {
6941 if ( aSig0 | aSig1 ) {
6942 return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
6944 return packFloatx80(aSign, floatx80_infinity_high,
6945 floatx80_infinity_low);
6947 if ( aExp == 0 ) {
6948 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
6949 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6951 else {
6952 aSig0 |= UINT64_C(0x0001000000000000);
6954 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
6955 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
6959 /*----------------------------------------------------------------------------
6960 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6961 | returns the result as a quadruple-precision floating-point value. The
6962 | operation is performed according to the IEC/IEEE Standard for Binary
6963 | Floating-Point Arithmetic.
6964 *----------------------------------------------------------------------------*/
6966 float128 float128_round_to_int(float128 a, float_status *status)
6968 flag aSign;
6969 int32_t aExp;
6970 uint64_t lastBitMask, roundBitsMask;
6971 float128 z;
6973 aExp = extractFloat128Exp( a );
6974 if ( 0x402F <= aExp ) {
6975 if ( 0x406F <= aExp ) {
6976 if ( ( aExp == 0x7FFF )
6977 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6979 return propagateFloat128NaN(a, a, status);
6981 return a;
6983 lastBitMask = 1;
6984 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
6985 roundBitsMask = lastBitMask - 1;
6986 z = a;
6987 switch (status->float_rounding_mode) {
6988 case float_round_nearest_even:
6989 if ( lastBitMask ) {
6990 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6991 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6993 else {
6994 if ( (int64_t) z.low < 0 ) {
6995 ++z.high;
6996 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
6999 break;
7000 case float_round_ties_away:
7001 if (lastBitMask) {
7002 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
7003 } else {
7004 if ((int64_t) z.low < 0) {
7005 ++z.high;
7008 break;
7009 case float_round_to_zero:
7010 break;
7011 case float_round_up:
7012 if (!extractFloat128Sign(z)) {
7013 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
7015 break;
7016 case float_round_down:
7017 if (extractFloat128Sign(z)) {
7018 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
7020 break;
7021 case float_round_to_odd:
7023 * Note that if lastBitMask == 0, the last bit is the lsb
7024 * of high, and roundBitsMask == -1.
7026 if ((lastBitMask ? z.low & lastBitMask : z.high & 1) == 0) {
7027 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
7029 break;
7030 default:
7031 abort();
7033 z.low &= ~ roundBitsMask;
7035 else {
7036 if ( aExp < 0x3FFF ) {
7037 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
7038 status->float_exception_flags |= float_flag_inexact;
7039 aSign = extractFloat128Sign( a );
7040 switch (status->float_rounding_mode) {
7041 case float_round_nearest_even:
7042 if ( ( aExp == 0x3FFE )
7043 && ( extractFloat128Frac0( a )
7044 | extractFloat128Frac1( a ) )
7046 return packFloat128( aSign, 0x3FFF, 0, 0 );
7048 break;
7049 case float_round_ties_away:
7050 if (aExp == 0x3FFE) {
7051 return packFloat128(aSign, 0x3FFF, 0, 0);
7053 break;
7054 case float_round_down:
7055 return
7056 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
7057 : packFloat128( 0, 0, 0, 0 );
7058 case float_round_up:
7059 return
7060 aSign ? packFloat128( 1, 0, 0, 0 )
7061 : packFloat128( 0, 0x3FFF, 0, 0 );
7063 case float_round_to_odd:
7064 return packFloat128(aSign, 0x3FFF, 0, 0);
7066 return packFloat128( aSign, 0, 0, 0 );
7068 lastBitMask = 1;
7069 lastBitMask <<= 0x402F - aExp;
7070 roundBitsMask = lastBitMask - 1;
7071 z.low = 0;
7072 z.high = a.high;
7073 switch (status->float_rounding_mode) {
7074 case float_round_nearest_even:
7075 z.high += lastBitMask>>1;
7076 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
7077 z.high &= ~ lastBitMask;
7079 break;
7080 case float_round_ties_away:
7081 z.high += lastBitMask>>1;
7082 break;
7083 case float_round_to_zero:
7084 break;
7085 case float_round_up:
7086 if (!extractFloat128Sign(z)) {
7087 z.high |= ( a.low != 0 );
7088 z.high += roundBitsMask;
7090 break;
7091 case float_round_down:
7092 if (extractFloat128Sign(z)) {
7093 z.high |= (a.low != 0);
7094 z.high += roundBitsMask;
7096 break;
7097 case float_round_to_odd:
7098 if ((z.high & lastBitMask) == 0) {
7099 z.high |= (a.low != 0);
7100 z.high += roundBitsMask;
7102 break;
7103 default:
7104 abort();
7106 z.high &= ~ roundBitsMask;
7108 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
7109 status->float_exception_flags |= float_flag_inexact;
7111 return z;
7115 /*----------------------------------------------------------------------------
7116 | Returns the result of adding the absolute values of the quadruple-precision
7117 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
7118 | before being returned. `zSign' is ignored if the result is a NaN.
7119 | The addition is performed according to the IEC/IEEE Standard for Binary
7120 | Floating-Point Arithmetic.
7121 *----------------------------------------------------------------------------*/
7123 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
7124 float_status *status)
7126 int32_t aExp, bExp, zExp;
7127 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
7128 int32_t expDiff;
7130 aSig1 = extractFloat128Frac1( a );
7131 aSig0 = extractFloat128Frac0( a );
7132 aExp = extractFloat128Exp( a );
7133 bSig1 = extractFloat128Frac1( b );
7134 bSig0 = extractFloat128Frac0( b );
7135 bExp = extractFloat128Exp( b );
7136 expDiff = aExp - bExp;
7137 if ( 0 < expDiff ) {
7138 if ( aExp == 0x7FFF ) {
7139 if (aSig0 | aSig1) {
7140 return propagateFloat128NaN(a, b, status);
7142 return a;
7144 if ( bExp == 0 ) {
7145 --expDiff;
7147 else {
7148 bSig0 |= UINT64_C(0x0001000000000000);
7150 shift128ExtraRightJamming(
7151 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
7152 zExp = aExp;
7154 else if ( expDiff < 0 ) {
7155 if ( bExp == 0x7FFF ) {
7156 if (bSig0 | bSig1) {
7157 return propagateFloat128NaN(a, b, status);
7159 return packFloat128( zSign, 0x7FFF, 0, 0 );
7161 if ( aExp == 0 ) {
7162 ++expDiff;
7164 else {
7165 aSig0 |= UINT64_C(0x0001000000000000);
7167 shift128ExtraRightJamming(
7168 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
7169 zExp = bExp;
7171 else {
7172 if ( aExp == 0x7FFF ) {
7173 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
7174 return propagateFloat128NaN(a, b, status);
7176 return a;
7178 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
7179 if ( aExp == 0 ) {
7180 if (status->flush_to_zero) {
7181 if (zSig0 | zSig1) {
7182 float_raise(float_flag_output_denormal, status);
7184 return packFloat128(zSign, 0, 0, 0);
7186 return packFloat128( zSign, 0, zSig0, zSig1 );
7188 zSig2 = 0;
7189 zSig0 |= UINT64_C(0x0002000000000000);
7190 zExp = aExp;
7191 goto shiftRight1;
7193 aSig0 |= UINT64_C(0x0001000000000000);
7194 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
7195 --zExp;
7196 if ( zSig0 < UINT64_C(0x0002000000000000) ) goto roundAndPack;
7197 ++zExp;
7198 shiftRight1:
7199 shift128ExtraRightJamming(
7200 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
7201 roundAndPack:
7202 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
7206 /*----------------------------------------------------------------------------
7207 | Returns the result of subtracting the absolute values of the quadruple-
7208 | precision floating-point values `a' and `b'. If `zSign' is 1, the
7209 | difference is negated before being returned. `zSign' is ignored if the
7210 | result is a NaN. The subtraction is performed according to the IEC/IEEE
7211 | Standard for Binary Floating-Point Arithmetic.
7212 *----------------------------------------------------------------------------*/
7214 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
7215 float_status *status)
7217 int32_t aExp, bExp, zExp;
7218 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
7219 int32_t expDiff;
7221 aSig1 = extractFloat128Frac1( a );
7222 aSig0 = extractFloat128Frac0( a );
7223 aExp = extractFloat128Exp( a );
7224 bSig1 = extractFloat128Frac1( b );
7225 bSig0 = extractFloat128Frac0( b );
7226 bExp = extractFloat128Exp( b );
7227 expDiff = aExp - bExp;
7228 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
7229 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
7230 if ( 0 < expDiff ) goto aExpBigger;
7231 if ( expDiff < 0 ) goto bExpBigger;
7232 if ( aExp == 0x7FFF ) {
7233 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
7234 return propagateFloat128NaN(a, b, status);
7236 float_raise(float_flag_invalid, status);
7237 return float128_default_nan(status);
7239 if ( aExp == 0 ) {
7240 aExp = 1;
7241 bExp = 1;
7243 if ( bSig0 < aSig0 ) goto aBigger;
7244 if ( aSig0 < bSig0 ) goto bBigger;
7245 if ( bSig1 < aSig1 ) goto aBigger;
7246 if ( aSig1 < bSig1 ) goto bBigger;
7247 return packFloat128(status->float_rounding_mode == float_round_down,
7248 0, 0, 0);
7249 bExpBigger:
7250 if ( bExp == 0x7FFF ) {
7251 if (bSig0 | bSig1) {
7252 return propagateFloat128NaN(a, b, status);
7254 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
7256 if ( aExp == 0 ) {
7257 ++expDiff;
7259 else {
7260 aSig0 |= UINT64_C(0x4000000000000000);
7262 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
7263 bSig0 |= UINT64_C(0x4000000000000000);
7264 bBigger:
7265 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
7266 zExp = bExp;
7267 zSign ^= 1;
7268 goto normalizeRoundAndPack;
7269 aExpBigger:
7270 if ( aExp == 0x7FFF ) {
7271 if (aSig0 | aSig1) {
7272 return propagateFloat128NaN(a, b, status);
7274 return a;
7276 if ( bExp == 0 ) {
7277 --expDiff;
7279 else {
7280 bSig0 |= UINT64_C(0x4000000000000000);
7282 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
7283 aSig0 |= UINT64_C(0x4000000000000000);
7284 aBigger:
7285 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
7286 zExp = aExp;
7287 normalizeRoundAndPack:
7288 --zExp;
7289 return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
7290 status);
7294 /*----------------------------------------------------------------------------
7295 | Returns the result of adding the quadruple-precision floating-point values
7296 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
7297 | for Binary Floating-Point Arithmetic.
7298 *----------------------------------------------------------------------------*/
7300 float128 float128_add(float128 a, float128 b, float_status *status)
7302 flag aSign, bSign;
7304 aSign = extractFloat128Sign( a );
7305 bSign = extractFloat128Sign( b );
7306 if ( aSign == bSign ) {
7307 return addFloat128Sigs(a, b, aSign, status);
7309 else {
7310 return subFloat128Sigs(a, b, aSign, status);
7315 /*----------------------------------------------------------------------------
7316 | Returns the result of subtracting the quadruple-precision floating-point
7317 | values `a' and `b'. The operation is performed according to the IEC/IEEE
7318 | Standard for Binary Floating-Point Arithmetic.
7319 *----------------------------------------------------------------------------*/
7321 float128 float128_sub(float128 a, float128 b, float_status *status)
7323 flag aSign, bSign;
7325 aSign = extractFloat128Sign( a );
7326 bSign = extractFloat128Sign( b );
7327 if ( aSign == bSign ) {
7328 return subFloat128Sigs(a, b, aSign, status);
7330 else {
7331 return addFloat128Sigs(a, b, aSign, status);
7336 /*----------------------------------------------------------------------------
7337 | Returns the result of multiplying the quadruple-precision floating-point
7338 | values `a' and `b'. The operation is performed according to the IEC/IEEE
7339 | Standard for Binary Floating-Point Arithmetic.
7340 *----------------------------------------------------------------------------*/
7342 float128 float128_mul(float128 a, float128 b, float_status *status)
7344 flag aSign, bSign, zSign;
7345 int32_t aExp, bExp, zExp;
7346 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
7348 aSig1 = extractFloat128Frac1( a );
7349 aSig0 = extractFloat128Frac0( a );
7350 aExp = extractFloat128Exp( a );
7351 aSign = extractFloat128Sign( a );
7352 bSig1 = extractFloat128Frac1( b );
7353 bSig0 = extractFloat128Frac0( b );
7354 bExp = extractFloat128Exp( b );
7355 bSign = extractFloat128Sign( b );
7356 zSign = aSign ^ bSign;
7357 if ( aExp == 0x7FFF ) {
7358 if ( ( aSig0 | aSig1 )
7359 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
7360 return propagateFloat128NaN(a, b, status);
7362 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
7363 return packFloat128( zSign, 0x7FFF, 0, 0 );
7365 if ( bExp == 0x7FFF ) {
7366 if (bSig0 | bSig1) {
7367 return propagateFloat128NaN(a, b, status);
7369 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
7370 invalid:
7371 float_raise(float_flag_invalid, status);
7372 return float128_default_nan(status);
7374 return packFloat128( zSign, 0x7FFF, 0, 0 );
7376 if ( aExp == 0 ) {
7377 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
7378 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7380 if ( bExp == 0 ) {
7381 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
7382 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
7384 zExp = aExp + bExp - 0x4000;
7385 aSig0 |= UINT64_C(0x0001000000000000);
7386 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
7387 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
7388 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
7389 zSig2 |= ( zSig3 != 0 );
7390 if (UINT64_C( 0x0002000000000000) <= zSig0 ) {
7391 shift128ExtraRightJamming(
7392 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
7393 ++zExp;
7395 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
7399 /*----------------------------------------------------------------------------
7400 | Returns the result of dividing the quadruple-precision floating-point value
7401 | `a' by the corresponding value `b'. The operation is performed according to
7402 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7403 *----------------------------------------------------------------------------*/
7405 float128 float128_div(float128 a, float128 b, float_status *status)
7407 flag aSign, bSign, zSign;
7408 int32_t aExp, bExp, zExp;
7409 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
7410 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
7412 aSig1 = extractFloat128Frac1( a );
7413 aSig0 = extractFloat128Frac0( a );
7414 aExp = extractFloat128Exp( a );
7415 aSign = extractFloat128Sign( a );
7416 bSig1 = extractFloat128Frac1( b );
7417 bSig0 = extractFloat128Frac0( b );
7418 bExp = extractFloat128Exp( b );
7419 bSign = extractFloat128Sign( b );
7420 zSign = aSign ^ bSign;
7421 if ( aExp == 0x7FFF ) {
7422 if (aSig0 | aSig1) {
7423 return propagateFloat128NaN(a, b, status);
7425 if ( bExp == 0x7FFF ) {
7426 if (bSig0 | bSig1) {
7427 return propagateFloat128NaN(a, b, status);
7429 goto invalid;
7431 return packFloat128( zSign, 0x7FFF, 0, 0 );
7433 if ( bExp == 0x7FFF ) {
7434 if (bSig0 | bSig1) {
7435 return propagateFloat128NaN(a, b, status);
7437 return packFloat128( zSign, 0, 0, 0 );
7439 if ( bExp == 0 ) {
7440 if ( ( bSig0 | bSig1 ) == 0 ) {
7441 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
7442 invalid:
7443 float_raise(float_flag_invalid, status);
7444 return float128_default_nan(status);
7446 float_raise(float_flag_divbyzero, status);
7447 return packFloat128( zSign, 0x7FFF, 0, 0 );
7449 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
7451 if ( aExp == 0 ) {
7452 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
7453 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7455 zExp = aExp - bExp + 0x3FFD;
7456 shortShift128Left(
7457 aSig0 | UINT64_C(0x0001000000000000), aSig1, 15, &aSig0, &aSig1 );
7458 shortShift128Left(
7459 bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 );
7460 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
7461 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
7462 ++zExp;
7464 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
7465 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
7466 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
7467 while ( (int64_t) rem0 < 0 ) {
7468 --zSig0;
7469 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
7471 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
7472 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
7473 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
7474 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
7475 while ( (int64_t) rem1 < 0 ) {
7476 --zSig1;
7477 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
7479 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
7481 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
7482 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
7486 /*----------------------------------------------------------------------------
7487 | Returns the remainder of the quadruple-precision floating-point value `a'
7488 | with respect to the corresponding value `b'. The operation is performed
7489 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7490 *----------------------------------------------------------------------------*/
7492 float128 float128_rem(float128 a, float128 b, float_status *status)
7494 flag aSign, zSign;
7495 int32_t aExp, bExp, expDiff;
7496 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
7497 uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
7498 int64_t sigMean0;
7500 aSig1 = extractFloat128Frac1( a );
7501 aSig0 = extractFloat128Frac0( a );
7502 aExp = extractFloat128Exp( a );
7503 aSign = extractFloat128Sign( a );
7504 bSig1 = extractFloat128Frac1( b );
7505 bSig0 = extractFloat128Frac0( b );
7506 bExp = extractFloat128Exp( b );
7507 if ( aExp == 0x7FFF ) {
7508 if ( ( aSig0 | aSig1 )
7509 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
7510 return propagateFloat128NaN(a, b, status);
7512 goto invalid;
7514 if ( bExp == 0x7FFF ) {
7515 if (bSig0 | bSig1) {
7516 return propagateFloat128NaN(a, b, status);
7518 return a;
7520 if ( bExp == 0 ) {
7521 if ( ( bSig0 | bSig1 ) == 0 ) {
7522 invalid:
7523 float_raise(float_flag_invalid, status);
7524 return float128_default_nan(status);
7526 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
7528 if ( aExp == 0 ) {
7529 if ( ( aSig0 | aSig1 ) == 0 ) return a;
7530 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7532 expDiff = aExp - bExp;
7533 if ( expDiff < -1 ) return a;
7534 shortShift128Left(
7535 aSig0 | UINT64_C(0x0001000000000000),
7536 aSig1,
7537 15 - ( expDiff < 0 ),
7538 &aSig0,
7539 &aSig1
7541 shortShift128Left(
7542 bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 );
7543 q = le128( bSig0, bSig1, aSig0, aSig1 );
7544 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
7545 expDiff -= 64;
7546 while ( 0 < expDiff ) {
7547 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
7548 q = ( 4 < q ) ? q - 4 : 0;
7549 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
7550 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
7551 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
7552 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
7553 expDiff -= 61;
7555 if ( -64 < expDiff ) {
7556 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
7557 q = ( 4 < q ) ? q - 4 : 0;
7558 q >>= - expDiff;
7559 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
7560 expDiff += 52;
7561 if ( expDiff < 0 ) {
7562 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
7564 else {
7565 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
7567 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
7568 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
7570 else {
7571 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
7572 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
7574 do {
7575 alternateASig0 = aSig0;
7576 alternateASig1 = aSig1;
7577 ++q;
7578 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
7579 } while ( 0 <= (int64_t) aSig0 );
7580 add128(
7581 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
7582 if ( ( sigMean0 < 0 )
7583 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
7584 aSig0 = alternateASig0;
7585 aSig1 = alternateASig1;
7587 zSign = ( (int64_t) aSig0 < 0 );
7588 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
7589 return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
7590 status);
7593 /*----------------------------------------------------------------------------
7594 | Returns the square root of the quadruple-precision floating-point value `a'.
7595 | The operation is performed according to the IEC/IEEE Standard for Binary
7596 | Floating-Point Arithmetic.
7597 *----------------------------------------------------------------------------*/
7599 float128 float128_sqrt(float128 a, float_status *status)
7601 flag aSign;
7602 int32_t aExp, zExp;
7603 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
7604 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
7606 aSig1 = extractFloat128Frac1( a );
7607 aSig0 = extractFloat128Frac0( a );
7608 aExp = extractFloat128Exp( a );
7609 aSign = extractFloat128Sign( a );
7610 if ( aExp == 0x7FFF ) {
7611 if (aSig0 | aSig1) {
7612 return propagateFloat128NaN(a, a, status);
7614 if ( ! aSign ) return a;
7615 goto invalid;
7617 if ( aSign ) {
7618 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
7619 invalid:
7620 float_raise(float_flag_invalid, status);
7621 return float128_default_nan(status);
7623 if ( aExp == 0 ) {
7624 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
7625 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7627 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
7628 aSig0 |= UINT64_C(0x0001000000000000);
7629 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
7630 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
7631 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
7632 doubleZSig0 = zSig0<<1;
7633 mul64To128( zSig0, zSig0, &term0, &term1 );
7634 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
7635 while ( (int64_t) rem0 < 0 ) {
7636 --zSig0;
7637 doubleZSig0 -= 2;
7638 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
7640 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
7641 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
7642 if ( zSig1 == 0 ) zSig1 = 1;
7643 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
7644 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
7645 mul64To128( zSig1, zSig1, &term2, &term3 );
7646 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
7647 while ( (int64_t) rem1 < 0 ) {
7648 --zSig1;
7649 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
7650 term3 |= 1;
7651 term2 |= doubleZSig0;
7652 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
7654 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
7656 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
7657 return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
7661 /*----------------------------------------------------------------------------
7662 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
7663 | the corresponding value `b', and 0 otherwise. The invalid exception is
7664 | raised if either operand is a NaN. Otherwise, the comparison is performed
7665 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7666 *----------------------------------------------------------------------------*/
7668 int float128_eq(float128 a, float128 b, float_status *status)
7671 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
7672 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7673 || ( ( extractFloat128Exp( b ) == 0x7FFF )
7674 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7676 float_raise(float_flag_invalid, status);
7677 return 0;
7679 return
7680 ( a.low == b.low )
7681 && ( ( a.high == b.high )
7682 || ( ( a.low == 0 )
7683 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
7688 /*----------------------------------------------------------------------------
7689 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7690 | or equal to the corresponding value `b', and 0 otherwise. The invalid
7691 | exception is raised if either operand is a NaN. The comparison is performed
7692 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7693 *----------------------------------------------------------------------------*/
7695 int float128_le(float128 a, float128 b, float_status *status)
7697 flag aSign, bSign;
7699 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
7700 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7701 || ( ( extractFloat128Exp( b ) == 0x7FFF )
7702 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7704 float_raise(float_flag_invalid, status);
7705 return 0;
7707 aSign = extractFloat128Sign( a );
7708 bSign = extractFloat128Sign( b );
7709 if ( aSign != bSign ) {
7710 return
7711 aSign
7712 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7713 == 0 );
7715 return
7716 aSign ? le128( b.high, b.low, a.high, a.low )
7717 : le128( a.high, a.low, b.high, b.low );
7721 /*----------------------------------------------------------------------------
7722 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7723 | the corresponding value `b', and 0 otherwise. The invalid exception is
7724 | raised if either operand is a NaN. The comparison is performed according
7725 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7726 *----------------------------------------------------------------------------*/
7728 int float128_lt(float128 a, float128 b, float_status *status)
7730 flag aSign, bSign;
7732 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
7733 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7734 || ( ( extractFloat128Exp( b ) == 0x7FFF )
7735 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7737 float_raise(float_flag_invalid, status);
7738 return 0;
7740 aSign = extractFloat128Sign( a );
7741 bSign = extractFloat128Sign( b );
7742 if ( aSign != bSign ) {
7743 return
7744 aSign
7745 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7746 != 0 );
7748 return
7749 aSign ? lt128( b.high, b.low, a.high, a.low )
7750 : lt128( a.high, a.low, b.high, b.low );
7754 /*----------------------------------------------------------------------------
7755 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7756 | be compared, and 0 otherwise. The invalid exception is raised if either
7757 | operand is a NaN. The comparison is performed according to the IEC/IEEE
7758 | Standard for Binary Floating-Point Arithmetic.
7759 *----------------------------------------------------------------------------*/
7761 int float128_unordered(float128 a, float128 b, float_status *status)
7763 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
7764 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7765 || ( ( extractFloat128Exp( b ) == 0x7FFF )
7766 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7768 float_raise(float_flag_invalid, status);
7769 return 1;
7771 return 0;
7774 /*----------------------------------------------------------------------------
7775 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
7776 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
7777 | exception. The comparison is performed according to the IEC/IEEE Standard
7778 | for Binary Floating-Point Arithmetic.
7779 *----------------------------------------------------------------------------*/
7781 int float128_eq_quiet(float128 a, float128 b, float_status *status)
7784 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
7785 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7786 || ( ( extractFloat128Exp( b ) == 0x7FFF )
7787 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7789 if (float128_is_signaling_nan(a, status)
7790 || float128_is_signaling_nan(b, status)) {
7791 float_raise(float_flag_invalid, status);
7793 return 0;
7795 return
7796 ( a.low == b.low )
7797 && ( ( a.high == b.high )
7798 || ( ( a.low == 0 )
7799 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
7804 /*----------------------------------------------------------------------------
7805 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7806 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
7807 | cause an exception. Otherwise, the comparison is performed according to the
7808 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7809 *----------------------------------------------------------------------------*/
7811 int float128_le_quiet(float128 a, float128 b, float_status *status)
7813 flag aSign, bSign;
7815 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
7816 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7817 || ( ( extractFloat128Exp( b ) == 0x7FFF )
7818 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7820 if (float128_is_signaling_nan(a, status)
7821 || float128_is_signaling_nan(b, status)) {
7822 float_raise(float_flag_invalid, status);
7824 return 0;
7826 aSign = extractFloat128Sign( a );
7827 bSign = extractFloat128Sign( b );
7828 if ( aSign != bSign ) {
7829 return
7830 aSign
7831 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7832 == 0 );
7834 return
7835 aSign ? le128( b.high, b.low, a.high, a.low )
7836 : le128( a.high, a.low, b.high, b.low );
7840 /*----------------------------------------------------------------------------
7841 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7842 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
7843 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
7844 | Standard for Binary Floating-Point Arithmetic.
7845 *----------------------------------------------------------------------------*/
7847 int float128_lt_quiet(float128 a, float128 b, float_status *status)
7849 flag aSign, bSign;
7851 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
7852 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7853 || ( ( extractFloat128Exp( b ) == 0x7FFF )
7854 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7856 if (float128_is_signaling_nan(a, status)
7857 || float128_is_signaling_nan(b, status)) {
7858 float_raise(float_flag_invalid, status);
7860 return 0;
7862 aSign = extractFloat128Sign( a );
7863 bSign = extractFloat128Sign( b );
7864 if ( aSign != bSign ) {
7865 return
7866 aSign
7867 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7868 != 0 );
7870 return
7871 aSign ? lt128( b.high, b.low, a.high, a.low )
7872 : lt128( a.high, a.low, b.high, b.low );
7876 /*----------------------------------------------------------------------------
7877 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7878 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
7879 | comparison is performed according to the IEC/IEEE Standard for Binary
7880 | Floating-Point Arithmetic.
7881 *----------------------------------------------------------------------------*/
7883 int float128_unordered_quiet(float128 a, float128 b, float_status *status)
7885 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
7886 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7887 || ( ( extractFloat128Exp( b ) == 0x7FFF )
7888 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7890 if (float128_is_signaling_nan(a, status)
7891 || float128_is_signaling_nan(b, status)) {
7892 float_raise(float_flag_invalid, status);
7894 return 1;
7896 return 0;
7899 static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
7900 int is_quiet, float_status *status)
7902 flag aSign, bSign;
7904 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
7905 float_raise(float_flag_invalid, status);
7906 return float_relation_unordered;
7908 if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7909 ( extractFloatx80Frac( a )<<1 ) ) ||
7910 ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7911 ( extractFloatx80Frac( b )<<1 ) )) {
7912 if (!is_quiet ||
7913 floatx80_is_signaling_nan(a, status) ||
7914 floatx80_is_signaling_nan(b, status)) {
7915 float_raise(float_flag_invalid, status);
7917 return float_relation_unordered;
7919 aSign = extractFloatx80Sign( a );
7920 bSign = extractFloatx80Sign( b );
7921 if ( aSign != bSign ) {
7923 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7924 ( ( a.low | b.low ) == 0 ) ) {
7925 /* zero case */
7926 return float_relation_equal;
7927 } else {
7928 return 1 - (2 * aSign);
7930 } else {
7931 if (a.low == b.low && a.high == b.high) {
7932 return float_relation_equal;
7933 } else {
7934 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7939 int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
7941 return floatx80_compare_internal(a, b, 0, status);
7944 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
7946 return floatx80_compare_internal(a, b, 1, status);
7949 static inline int float128_compare_internal(float128 a, float128 b,
7950 int is_quiet, float_status *status)
7952 flag aSign, bSign;
7954 if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7955 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7956 ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7957 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7958 if (!is_quiet ||
7959 float128_is_signaling_nan(a, status) ||
7960 float128_is_signaling_nan(b, status)) {
7961 float_raise(float_flag_invalid, status);
7963 return float_relation_unordered;
7965 aSign = extractFloat128Sign( a );
7966 bSign = extractFloat128Sign( b );
7967 if ( aSign != bSign ) {
7968 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7969 /* zero case */
7970 return float_relation_equal;
7971 } else {
7972 return 1 - (2 * aSign);
7974 } else {
7975 if (a.low == b.low && a.high == b.high) {
7976 return float_relation_equal;
7977 } else {
7978 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7983 int float128_compare(float128 a, float128 b, float_status *status)
7985 return float128_compare_internal(a, b, 0, status);
7988 int float128_compare_quiet(float128 a, float128 b, float_status *status)
7990 return float128_compare_internal(a, b, 1, status);
7993 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7995 flag aSign;
7996 int32_t aExp;
7997 uint64_t aSig;
7999 if (floatx80_invalid_encoding(a)) {
8000 float_raise(float_flag_invalid, status);
8001 return floatx80_default_nan(status);
8003 aSig = extractFloatx80Frac( a );
8004 aExp = extractFloatx80Exp( a );
8005 aSign = extractFloatx80Sign( a );
8007 if ( aExp == 0x7FFF ) {
8008 if ( aSig<<1 ) {
8009 return propagateFloatx80NaN(a, a, status);
8011 return a;
8014 if (aExp == 0) {
8015 if (aSig == 0) {
8016 return a;
8018 aExp++;
8021 if (n > 0x10000) {
8022 n = 0x10000;
8023 } else if (n < -0x10000) {
8024 n = -0x10000;
8027 aExp += n;
8028 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
8029 aSign, aExp, aSig, 0, status);
8032 float128 float128_scalbn(float128 a, int n, float_status *status)
8034 flag aSign;
8035 int32_t aExp;
8036 uint64_t aSig0, aSig1;
8038 aSig1 = extractFloat128Frac1( a );
8039 aSig0 = extractFloat128Frac0( a );
8040 aExp = extractFloat128Exp( a );
8041 aSign = extractFloat128Sign( a );
8042 if ( aExp == 0x7FFF ) {
8043 if ( aSig0 | aSig1 ) {
8044 return propagateFloat128NaN(a, a, status);
8046 return a;
8048 if (aExp != 0) {
8049 aSig0 |= UINT64_C(0x0001000000000000);
8050 } else if (aSig0 == 0 && aSig1 == 0) {
8051 return a;
8052 } else {
8053 aExp++;
8056 if (n > 0x10000) {
8057 n = 0x10000;
8058 } else if (n < -0x10000) {
8059 n = -0x10000;
8062 aExp += n - 1;
8063 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
8064 , status);
8068 static void __attribute__((constructor)) softfloat_init(void)
8070 union_float64 ua, ub, uc, ur;
8072 if (QEMU_NO_HARDFLOAT) {
8073 return;
8076 * Test that the host's FMA is not obviously broken. For example,
8077 * glibc < 2.23 can perform an incorrect FMA on certain hosts; see
8078 * https://sourceware.org/bugzilla/show_bug.cgi?id=13304
8080 ua.s = 0x0020000000000001ULL;
8081 ub.s = 0x3ca0000000000000ULL;
8082 uc.s = 0x0020000000000000ULL;
8083 ur.h = fma(ua.h, ub.h, uc.h);
8084 if (ur.s != 0x0020000000000001ULL) {
8085 force_soft_fma = true;