display: ensure qxl log_buf is a nul terminated string
[qemu/ar7.git] / fpu / softfloat.c
blob59eac97d1010385b06b39ebb6f888f6b5ba37486
1 /*
2 * QEMU float support
4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
10 * the BSD license
11 * GPL-v2-or-later
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
44 ===============================================================================
47 /* BSD licensing:
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
85 #include "qemu/osdep.h"
86 #include <math.h>
87 #include "qemu/bitops.h"
88 #include "fpu/softfloat.h"
90 /* We only need stdlib for abort() */
92 /*----------------------------------------------------------------------------
93 | Primitive arithmetic functions, including multi-word arithmetic, and
94 | division and square root approximations. (Can be specialized to target if
95 | desired.)
96 *----------------------------------------------------------------------------*/
97 #include "fpu/softfloat-macros.h"
/*
 * Hardfloat
 *
 * Fast emulation of guest FP instructions is challenging for two reasons.
 * First, FP instruction semantics are similar but not identical, particularly
 * when handling NaNs. Second, emulating at reasonable speed the guest FP
 * exception flags is not trivial: reading the host's flags register with a
 * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp],
 * and trapping on every FP exception is neither fast nor pleasant to work
 * with.
 *
 * We address these challenges by leveraging the host FPU for a subset of the
 * operations. To do this we expand on the idea presented in this paper:
 *
 * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a
 * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615.
 *
 * The idea is thus to leverage the host FPU to (1) compute FP operations
 * and (2) identify whether FP exceptions occurred while avoiding
 * expensive exception flag register accesses.
 *
 * An important optimization shown in the paper is that given that exception
 * flags are rarely cleared by the guest, we can avoid recomputing some flags.
 * This is particularly useful for the inexact flag, which is very frequently
 * raised in floating-point workloads.
 *
 * We optimize the code further by deferring to soft-fp whenever FP exception
 * detection might get hairy. Two examples: (1) when at least one operand is
 * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result
 * and the result is < the minimum normal.
 */
129 #define GEN_INPUT_FLUSH__NOCHECK(name, soft_t) \
130 static inline void name(soft_t *a, float_status *s) \
132 if (unlikely(soft_t ## _is_denormal(*a))) { \
133 *a = soft_t ## _set_sign(soft_t ## _zero, \
134 soft_t ## _is_neg(*a)); \
135 s->float_exception_flags |= float_flag_input_denormal; \
139 GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32)
140 GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64)
141 #undef GEN_INPUT_FLUSH__NOCHECK
143 #define GEN_INPUT_FLUSH1(name, soft_t) \
144 static inline void name(soft_t *a, float_status *s) \
146 if (likely(!s->flush_inputs_to_zero)) { \
147 return; \
149 soft_t ## _input_flush__nocheck(a, s); \
152 GEN_INPUT_FLUSH1(float32_input_flush1, float32)
153 GEN_INPUT_FLUSH1(float64_input_flush1, float64)
154 #undef GEN_INPUT_FLUSH1
156 #define GEN_INPUT_FLUSH2(name, soft_t) \
157 static inline void name(soft_t *a, soft_t *b, float_status *s) \
159 if (likely(!s->flush_inputs_to_zero)) { \
160 return; \
162 soft_t ## _input_flush__nocheck(a, s); \
163 soft_t ## _input_flush__nocheck(b, s); \
166 GEN_INPUT_FLUSH2(float32_input_flush2, float32)
167 GEN_INPUT_FLUSH2(float64_input_flush2, float64)
168 #undef GEN_INPUT_FLUSH2
170 #define GEN_INPUT_FLUSH3(name, soft_t) \
171 static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
173 if (likely(!s->flush_inputs_to_zero)) { \
174 return; \
176 soft_t ## _input_flush__nocheck(a, s); \
177 soft_t ## _input_flush__nocheck(b, s); \
178 soft_t ## _input_flush__nocheck(c, s); \
181 GEN_INPUT_FLUSH3(float32_input_flush3, float32)
182 GEN_INPUT_FLUSH3(float64_input_flush3, float64)
183 #undef GEN_INPUT_FLUSH3
/*
 * Choose whether to use fpclassify or float32/64_* primitives in the generated
 * hardfloat functions. Each combination of number of inputs and float size
 * gets its own value.
 *
 * The float32 variants use the softfloat primitives on every host; only the
 * float64 variants differ, using fpclassify on x86_64 hosts.
 */
#define QEMU_HARDFLOAT_1F32_USE_FP 0
#define QEMU_HARDFLOAT_2F32_USE_FP 0
#define QEMU_HARDFLOAT_3F32_USE_FP 0

#if defined(__x86_64__)
# define QEMU_HARDFLOAT_1F64_USE_FP 1
# define QEMU_HARDFLOAT_2F64_USE_FP 1
# define QEMU_HARDFLOAT_3F64_USE_FP 1
#else
# define QEMU_HARDFLOAT_1F64_USE_FP 0
# define QEMU_HARDFLOAT_2F64_USE_FP 0
# define QEMU_HARDFLOAT_3F64_USE_FP 0
#endif
/*
 * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over
 * float{32,64}_is_infinity when !USE_FP.
 * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup.
 * On power64 however, using isinf() reduces fp-bench performance by up to 50%.
 */
#if !defined(__x86_64__) && !defined(__aarch64__)
# define QEMU_HARDFLOAT_USE_ISINF   0
#else
# define QEMU_HARDFLOAT_USE_ISINF   1
#endif
/*
 * Some targets clear the FP flags before most FP operations. This prevents
 * the use of hardfloat, since hardfloat relies on the inexact flag being
 * already set.
 *
 * Hardfloat is likewise disabled when the file is built with -ffast-math.
 */
#if defined(__FAST_MATH__)
# warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
IEEE implementation
#endif

#if defined(TARGET_PPC) || defined(__FAST_MATH__)
# define QEMU_NO_HARDFLOAT 1
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
#else
# define QEMU_NO_HARDFLOAT 0
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
#endif
235 static inline bool can_use_fpu(const float_status *s)
237 if (QEMU_NO_HARDFLOAT) {
238 return false;
240 return likely(s->float_exception_flags & float_flag_inexact &&
241 s->float_rounding_mode == float_round_nearest_even);
/*
 * Hardfloat generation functions. Each operation can have two flavors:
 * either using softfloat primitives (e.g. float32_is_zero_or_normal) for
 * most condition checks, or native ones (e.g. fpclassify).
 *
 * The flavor is chosen by the callers. Instead of using macros, we rely on the
 * compiler to propagate constants and inline everything into the callers.
 *
 * We only generate functions for operations with two inputs, since only
 * these are common enough to justify consolidating them into common code.
 */
256 typedef union {
257 float32 s;
258 float h;
259 } union_float32;
261 typedef union {
262 float64 s;
263 double h;
264 } union_float64;
266 typedef bool (*f32_check_fn)(union_float32 a, union_float32 b);
267 typedef bool (*f64_check_fn)(union_float64 a, union_float64 b);
269 typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s);
270 typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s);
271 typedef float (*hard_f32_op2_fn)(float a, float b);
272 typedef double (*hard_f64_op2_fn)(double a, double b);
274 /* 2-input is-zero-or-normal */
275 static inline bool f32_is_zon2(union_float32 a, union_float32 b)
277 if (QEMU_HARDFLOAT_2F32_USE_FP) {
279 * Not using a temp variable for consecutive fpclassify calls ends up
280 * generating faster code.
282 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
283 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
285 return float32_is_zero_or_normal(a.s) &&
286 float32_is_zero_or_normal(b.s);
289 static inline bool f64_is_zon2(union_float64 a, union_float64 b)
291 if (QEMU_HARDFLOAT_2F64_USE_FP) {
292 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
293 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
295 return float64_is_zero_or_normal(a.s) &&
296 float64_is_zero_or_normal(b.s);
299 /* 3-input is-zero-or-normal */
300 static inline
301 bool f32_is_zon3(union_float32 a, union_float32 b, union_float32 c)
303 if (QEMU_HARDFLOAT_3F32_USE_FP) {
304 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
305 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
306 (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
308 return float32_is_zero_or_normal(a.s) &&
309 float32_is_zero_or_normal(b.s) &&
310 float32_is_zero_or_normal(c.s);
313 static inline
314 bool f64_is_zon3(union_float64 a, union_float64 b, union_float64 c)
316 if (QEMU_HARDFLOAT_3F64_USE_FP) {
317 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
318 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
319 (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
321 return float64_is_zero_or_normal(a.s) &&
322 float64_is_zero_or_normal(b.s) &&
323 float64_is_zero_or_normal(c.s);
326 static inline bool f32_is_inf(union_float32 a)
328 if (QEMU_HARDFLOAT_USE_ISINF) {
329 return isinf(a.h);
331 return float32_is_infinity(a.s);
334 static inline bool f64_is_inf(union_float64 a)
336 if (QEMU_HARDFLOAT_USE_ISINF) {
337 return isinf(a.h);
339 return float64_is_infinity(a.s);
342 /* Note: @fast_test and @post can be NULL */
343 static inline float32
344 float32_gen2(float32 xa, float32 xb, float_status *s,
345 hard_f32_op2_fn hard, soft_f32_op2_fn soft,
346 f32_check_fn pre, f32_check_fn post,
347 f32_check_fn fast_test, soft_f32_op2_fn fast_op)
349 union_float32 ua, ub, ur;
351 ua.s = xa;
352 ub.s = xb;
354 if (unlikely(!can_use_fpu(s))) {
355 goto soft;
358 float32_input_flush2(&ua.s, &ub.s, s);
359 if (unlikely(!pre(ua, ub))) {
360 goto soft;
362 if (fast_test && fast_test(ua, ub)) {
363 return fast_op(ua.s, ub.s, s);
366 ur.h = hard(ua.h, ub.h);
367 if (unlikely(f32_is_inf(ur))) {
368 s->float_exception_flags |= float_flag_overflow;
369 } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
370 if (post == NULL || post(ua, ub)) {
371 goto soft;
374 return ur.s;
376 soft:
377 return soft(ua.s, ub.s, s);
380 static inline float64
381 float64_gen2(float64 xa, float64 xb, float_status *s,
382 hard_f64_op2_fn hard, soft_f64_op2_fn soft,
383 f64_check_fn pre, f64_check_fn post,
384 f64_check_fn fast_test, soft_f64_op2_fn fast_op)
386 union_float64 ua, ub, ur;
388 ua.s = xa;
389 ub.s = xb;
391 if (unlikely(!can_use_fpu(s))) {
392 goto soft;
395 float64_input_flush2(&ua.s, &ub.s, s);
396 if (unlikely(!pre(ua, ub))) {
397 goto soft;
399 if (fast_test && fast_test(ua, ub)) {
400 return fast_op(ua.s, ub.s, s);
403 ur.h = hard(ua.h, ub.h);
404 if (unlikely(f64_is_inf(ur))) {
405 s->float_exception_flags |= float_flag_overflow;
406 } else if (unlikely(fabs(ur.h) <= DBL_MIN)) {
407 if (post == NULL || post(ua, ub)) {
408 goto soft;
411 return ur.s;
413 soft:
414 return soft(ua.s, ub.s, s);
417 /*----------------------------------------------------------------------------
418 | Returns the fraction bits of the half-precision floating-point value `a'.
419 *----------------------------------------------------------------------------*/
421 static inline uint32_t extractFloat16Frac(float16 a)
423 return float16_val(a) & 0x3ff;
426 /*----------------------------------------------------------------------------
427 | Returns the exponent bits of the half-precision floating-point value `a'.
428 *----------------------------------------------------------------------------*/
430 static inline int extractFloat16Exp(float16 a)
432 return (float16_val(a) >> 10) & 0x1f;
435 /*----------------------------------------------------------------------------
436 | Returns the fraction bits of the single-precision floating-point value `a'.
437 *----------------------------------------------------------------------------*/
439 static inline uint32_t extractFloat32Frac(float32 a)
441 return float32_val(a) & 0x007FFFFF;
444 /*----------------------------------------------------------------------------
445 | Returns the exponent bits of the single-precision floating-point value `a'.
446 *----------------------------------------------------------------------------*/
448 static inline int extractFloat32Exp(float32 a)
450 return (float32_val(a) >> 23) & 0xFF;
453 /*----------------------------------------------------------------------------
454 | Returns the sign bit of the single-precision floating-point value `a'.
455 *----------------------------------------------------------------------------*/
457 static inline flag extractFloat32Sign(float32 a)
459 return float32_val(a) >> 31;
462 /*----------------------------------------------------------------------------
463 | Returns the fraction bits of the double-precision floating-point value `a'.
464 *----------------------------------------------------------------------------*/
466 static inline uint64_t extractFloat64Frac(float64 a)
468 return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF);
471 /*----------------------------------------------------------------------------
472 | Returns the exponent bits of the double-precision floating-point value `a'.
473 *----------------------------------------------------------------------------*/
475 static inline int extractFloat64Exp(float64 a)
477 return (float64_val(a) >> 52) & 0x7FF;
480 /*----------------------------------------------------------------------------
481 | Returns the sign bit of the double-precision floating-point value `a'.
482 *----------------------------------------------------------------------------*/
484 static inline flag extractFloat64Sign(float64 a)
486 return float64_val(a) >> 63;
/*
 * Classify a floating point number. The NaN classes are ordered last, so
 * cls >= float_class_qnan is true for any NaN and for nothing else.
 */

typedef enum __attribute__ ((__packed__)) {
    float_class_unclassified = 0,
    float_class_zero         = 1,
    float_class_normal       = 2,
    float_class_inf          = 3,
    float_class_qnan         = 4,  /* all NaNs from here */
    float_class_snan         = 5,
} FloatClass;
503 /* Simple helpers for checking if, or what kind of, NaN we have */
504 static inline __attribute__((unused)) bool is_nan(FloatClass c)
506 return unlikely(c >= float_class_qnan);
509 static inline __attribute__((unused)) bool is_snan(FloatClass c)
511 return c == float_class_snan;
514 static inline __attribute__((unused)) bool is_qnan(FloatClass c)
516 return c == float_class_qnan;
520 * Structure holding all of the decomposed parts of a float. The
521 * exponent is unbiased and the fraction is normalized. All
522 * calculations are done with a 64 bit fraction and then rounded as
523 * appropriate for the final format.
525 * Thanks to the packed FloatClass a decent compiler should be able to
526 * fit the whole structure into registers and avoid using the stack
527 * for parameter passing.
530 typedef struct {
531 uint64_t frac;
532 int32_t exp;
533 FloatClass cls;
534 bool sign;
535 } FloatParts;
/*
 * Layout of the 64-bit decomposed fraction: the implicit (integer) bit sits
 * at bit 62, leaving bit 63 free to catch a carry out of an addition.
 */
#define DECOMPOSED_BINARY_POINT   (64 - 2)
#define DECOMPOSED_IMPLICIT_BIT   (1ull << DECOMPOSED_BINARY_POINT)
#define DECOMPOSED_OVERFLOW_BIT   (1ull << (DECOMPOSED_BINARY_POINT + 1))
/* Structure holding all of the relevant parameters for a format.
 *   exp_size: the size of the exponent field
 *   exp_bias: the offset applied to the exponent field
 *   exp_max: the maximum normalised exponent
 *   frac_size: the size of the fraction field
 *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
 * The following are computed based on the size of the fraction
 *   frac_lsb: least significant bit of fraction
 *   frac_lsbm1: the bit below the least significant bit (for rounding)
 *   round_mask/roundeven_mask: masks used for rounding
 * The following optional modifiers are available:
 *   arm_althp: handle ARM Alternative Half Precision
 */
typedef struct {
    /* Field geometry */
    int exp_size;
    int exp_bias;
    int exp_max;
    int frac_size;
    int frac_shift;
    /* Rounding helpers derived from frac_size */
    uint64_t frac_lsb;
    uint64_t frac_lsbm1;
    uint64_t round_mask;
    uint64_t roundeven_mask;
    /* Optional modifiers */
    bool arm_althp;
} FloatFmt;
/*
 * Expand fields based on the size of exponent and fraction.
 *
 * E is the exponent field width and F the fraction field width, both in
 * bits.  The expansion is a comma-separated list of designated
 * initializers for a FloatFmt, so it must appear inside a FloatFmt
 * initializer; members it does not name (arm_althp) may be appended by
 * the user.
 *
 * Every use of E and F is parenthesized so that expression arguments
 * expand with the intended precedence.
 */
#define FLOAT_PARAMS(E, F)                                           \
    .exp_size       = (E),                                           \
    .exp_bias       = ((1 << (E)) - 1) >> 1,                         \
    .exp_max        = (1 << (E)) - 1,                                \
    .frac_size      = (F),                                           \
    .frac_shift     = DECOMPOSED_BINARY_POINT - (F),                 \
    .frac_lsb       = 1ull << (DECOMPOSED_BINARY_POINT - (F)),       \
    .frac_lsbm1     = 1ull << ((DECOMPOSED_BINARY_POINT - (F)) - 1), \
    .round_mask     = (1ull << (DECOMPOSED_BINARY_POINT - (F))) - 1, \
    .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - (F))) - 1
579 static const FloatFmt float16_params = {
580 FLOAT_PARAMS(5, 10)
583 static const FloatFmt float16_params_ahp = {
584 FLOAT_PARAMS(5, 10),
585 .arm_althp = true
588 static const FloatFmt float32_params = {
589 FLOAT_PARAMS(8, 23)
592 static const FloatFmt float64_params = {
593 FLOAT_PARAMS(11, 52)
596 /* Unpack a float to parts, but do not canonicalize. */
597 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
599 const int sign_pos = fmt.frac_size + fmt.exp_size;
601 return (FloatParts) {
602 .cls = float_class_unclassified,
603 .sign = extract64(raw, sign_pos, 1),
604 .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
605 .frac = extract64(raw, 0, fmt.frac_size),
609 static inline FloatParts float16_unpack_raw(float16 f)
611 return unpack_raw(float16_params, f);
614 static inline FloatParts float32_unpack_raw(float32 f)
616 return unpack_raw(float32_params, f);
619 static inline FloatParts float64_unpack_raw(float64 f)
621 return unpack_raw(float64_params, f);
624 /* Pack a float from parts, but do not canonicalize. */
625 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
627 const int sign_pos = fmt.frac_size + fmt.exp_size;
628 uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
629 return deposit64(ret, sign_pos, 1, p.sign);
632 static inline float16 float16_pack_raw(FloatParts p)
634 return make_float16(pack_raw(float16_params, p));
637 static inline float32 float32_pack_raw(FloatParts p)
639 return make_float32(pack_raw(float32_params, p));
642 static inline float64 float64_pack_raw(FloatParts p)
644 return make_float64(pack_raw(float64_params, p));
647 /*----------------------------------------------------------------------------
648 | Functions and definitions to determine: (1) whether tininess for underflow
649 | is detected before or after rounding by default, (2) what (if anything)
650 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
651 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
652 | are propagated from function inputs to output. These details are target-
653 | specific.
654 *----------------------------------------------------------------------------*/
655 #include "softfloat-specialize.h"
657 /* Canonicalize EXP and FRAC, setting CLS. */
658 static FloatParts sf_canonicalize(FloatParts part, const FloatFmt *parm,
659 float_status *status)
661 if (part.exp == parm->exp_max && !parm->arm_althp) {
662 if (part.frac == 0) {
663 part.cls = float_class_inf;
664 } else {
665 part.frac <<= parm->frac_shift;
666 part.cls = (parts_is_snan_frac(part.frac, status)
667 ? float_class_snan : float_class_qnan);
669 } else if (part.exp == 0) {
670 if (likely(part.frac == 0)) {
671 part.cls = float_class_zero;
672 } else if (status->flush_inputs_to_zero) {
673 float_raise(float_flag_input_denormal, status);
674 part.cls = float_class_zero;
675 part.frac = 0;
676 } else {
677 int shift = clz64(part.frac) - 1;
678 part.cls = float_class_normal;
679 part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
680 part.frac <<= shift;
682 } else {
683 part.cls = float_class_normal;
684 part.exp -= parm->exp_bias;
685 part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
687 return part;
690 /* Round and uncanonicalize a floating-point number by parts. There
691 * are FRAC_SHIFT bits that may require rounding at the bottom of the
692 * fraction; these bits will be removed. The exponent will be biased
693 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
696 static FloatParts round_canonical(FloatParts p, float_status *s,
697 const FloatFmt *parm)
699 const uint64_t frac_lsbm1 = parm->frac_lsbm1;
700 const uint64_t round_mask = parm->round_mask;
701 const uint64_t roundeven_mask = parm->roundeven_mask;
702 const int exp_max = parm->exp_max;
703 const int frac_shift = parm->frac_shift;
704 uint64_t frac, inc;
705 int exp, flags = 0;
706 bool overflow_norm;
708 frac = p.frac;
709 exp = p.exp;
711 switch (p.cls) {
712 case float_class_normal:
713 switch (s->float_rounding_mode) {
714 case float_round_nearest_even:
715 overflow_norm = false;
716 inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
717 break;
718 case float_round_ties_away:
719 overflow_norm = false;
720 inc = frac_lsbm1;
721 break;
722 case float_round_to_zero:
723 overflow_norm = true;
724 inc = 0;
725 break;
726 case float_round_up:
727 inc = p.sign ? 0 : round_mask;
728 overflow_norm = p.sign;
729 break;
730 case float_round_down:
731 inc = p.sign ? round_mask : 0;
732 overflow_norm = !p.sign;
733 break;
734 default:
735 g_assert_not_reached();
738 exp += parm->exp_bias;
739 if (likely(exp > 0)) {
740 if (frac & round_mask) {
741 flags |= float_flag_inexact;
742 frac += inc;
743 if (frac & DECOMPOSED_OVERFLOW_BIT) {
744 frac >>= 1;
745 exp++;
748 frac >>= frac_shift;
750 if (parm->arm_althp) {
751 /* ARM Alt HP eschews Inf and NaN for a wider exponent. */
752 if (unlikely(exp > exp_max)) {
753 /* Overflow. Return the maximum normal. */
754 flags = float_flag_invalid;
755 exp = exp_max;
756 frac = -1;
758 } else if (unlikely(exp >= exp_max)) {
759 flags |= float_flag_overflow | float_flag_inexact;
760 if (overflow_norm) {
761 exp = exp_max - 1;
762 frac = -1;
763 } else {
764 p.cls = float_class_inf;
765 goto do_inf;
768 } else if (s->flush_to_zero) {
769 flags |= float_flag_output_denormal;
770 p.cls = float_class_zero;
771 goto do_zero;
772 } else {
773 bool is_tiny = (s->float_detect_tininess
774 == float_tininess_before_rounding)
775 || (exp < 0)
776 || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);
778 shift64RightJamming(frac, 1 - exp, &frac);
779 if (frac & round_mask) {
780 /* Need to recompute round-to-even. */
781 if (s->float_rounding_mode == float_round_nearest_even) {
782 inc = ((frac & roundeven_mask) != frac_lsbm1
783 ? frac_lsbm1 : 0);
785 flags |= float_flag_inexact;
786 frac += inc;
789 exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
790 frac >>= frac_shift;
792 if (is_tiny && (flags & float_flag_inexact)) {
793 flags |= float_flag_underflow;
795 if (exp == 0 && frac == 0) {
796 p.cls = float_class_zero;
799 break;
801 case float_class_zero:
802 do_zero:
803 exp = 0;
804 frac = 0;
805 break;
807 case float_class_inf:
808 do_inf:
809 assert(!parm->arm_althp);
810 exp = exp_max;
811 frac = 0;
812 break;
814 case float_class_qnan:
815 case float_class_snan:
816 assert(!parm->arm_althp);
817 exp = exp_max;
818 frac >>= parm->frac_shift;
819 break;
821 default:
822 g_assert_not_reached();
825 float_raise(flags, s);
826 p.exp = exp;
827 p.frac = frac;
828 return p;
831 /* Explicit FloatFmt version */
832 static FloatParts float16a_unpack_canonical(float16 f, float_status *s,
833 const FloatFmt *params)
835 return sf_canonicalize(float16_unpack_raw(f), params, s);
838 static FloatParts float16_unpack_canonical(float16 f, float_status *s)
840 return float16a_unpack_canonical(f, s, &float16_params);
843 static float16 float16a_round_pack_canonical(FloatParts p, float_status *s,
844 const FloatFmt *params)
846 return float16_pack_raw(round_canonical(p, s, params));
849 static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
851 return float16a_round_pack_canonical(p, s, &float16_params);
854 static FloatParts float32_unpack_canonical(float32 f, float_status *s)
856 return sf_canonicalize(float32_unpack_raw(f), &float32_params, s);
859 static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
861 return float32_pack_raw(round_canonical(p, s, &float32_params));
864 static FloatParts float64_unpack_canonical(float64 f, float_status *s)
866 return sf_canonicalize(float64_unpack_raw(f), &float64_params, s);
869 static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
871 return float64_pack_raw(round_canonical(p, s, &float64_params));
874 static FloatParts return_nan(FloatParts a, float_status *s)
876 switch (a.cls) {
877 case float_class_snan:
878 s->float_exception_flags |= float_flag_invalid;
879 a = parts_silence_nan(a, s);
880 /* fall through */
881 case float_class_qnan:
882 if (s->default_nan_mode) {
883 return parts_default_nan(s);
885 break;
887 default:
888 g_assert_not_reached();
890 return a;
893 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
895 if (is_snan(a.cls) || is_snan(b.cls)) {
896 s->float_exception_flags |= float_flag_invalid;
899 if (s->default_nan_mode) {
900 return parts_default_nan(s);
901 } else {
902 if (pickNaN(a.cls, b.cls,
903 a.frac > b.frac ||
904 (a.frac == b.frac && a.sign < b.sign))) {
905 a = b;
907 if (is_snan(a.cls)) {
908 return parts_silence_nan(a, s);
911 return a;
914 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
915 bool inf_zero, float_status *s)
917 int which;
919 if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
920 s->float_exception_flags |= float_flag_invalid;
923 which = pickNaNMulAdd(a.cls, b.cls, c.cls, inf_zero, s);
925 if (s->default_nan_mode) {
926 /* Note that this check is after pickNaNMulAdd so that function
927 * has an opportunity to set the Invalid flag.
929 which = 3;
932 switch (which) {
933 case 0:
934 break;
935 case 1:
936 a = b;
937 break;
938 case 2:
939 a = c;
940 break;
941 case 3:
942 return parts_default_nan(s);
943 default:
944 g_assert_not_reached();
947 if (is_snan(a.cls)) {
948 return parts_silence_nan(a, s);
950 return a;
954 * Returns the result of adding or subtracting the values of the
955 * floating-point values `a' and `b'. The operation is performed
956 * according to the IEC/IEEE Standard for Binary Floating-Point
957 * Arithmetic.
960 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
961 float_status *s)
963 bool a_sign = a.sign;
964 bool b_sign = b.sign ^ subtract;
966 if (a_sign != b_sign) {
967 /* Subtraction */
969 if (a.cls == float_class_normal && b.cls == float_class_normal) {
970 if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
971 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
972 a.frac = a.frac - b.frac;
973 } else {
974 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
975 a.frac = b.frac - a.frac;
976 a.exp = b.exp;
977 a_sign ^= 1;
980 if (a.frac == 0) {
981 a.cls = float_class_zero;
982 a.sign = s->float_rounding_mode == float_round_down;
983 } else {
984 int shift = clz64(a.frac) - 1;
985 a.frac = a.frac << shift;
986 a.exp = a.exp - shift;
987 a.sign = a_sign;
989 return a;
991 if (is_nan(a.cls) || is_nan(b.cls)) {
992 return pick_nan(a, b, s);
994 if (a.cls == float_class_inf) {
995 if (b.cls == float_class_inf) {
996 float_raise(float_flag_invalid, s);
997 return parts_default_nan(s);
999 return a;
1001 if (a.cls == float_class_zero && b.cls == float_class_zero) {
1002 a.sign = s->float_rounding_mode == float_round_down;
1003 return a;
1005 if (a.cls == float_class_zero || b.cls == float_class_inf) {
1006 b.sign = a_sign ^ 1;
1007 return b;
1009 if (b.cls == float_class_zero) {
1010 return a;
1012 } else {
1013 /* Addition */
1014 if (a.cls == float_class_normal && b.cls == float_class_normal) {
1015 if (a.exp > b.exp) {
1016 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
1017 } else if (a.exp < b.exp) {
1018 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
1019 a.exp = b.exp;
1021 a.frac += b.frac;
1022 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
1023 shift64RightJamming(a.frac, 1, &a.frac);
1024 a.exp += 1;
1026 return a;
1028 if (is_nan(a.cls) || is_nan(b.cls)) {
1029 return pick_nan(a, b, s);
1031 if (a.cls == float_class_inf || b.cls == float_class_zero) {
1032 return a;
1034 if (b.cls == float_class_inf || a.cls == float_class_zero) {
1035 b.sign = b_sign;
1036 return b;
1039 g_assert_not_reached();
1043 * Returns the result of adding or subtracting the floating-point
1044 * values `a' and `b'. The operation is performed according to the
1045 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1048 float16 QEMU_FLATTEN float16_add(float16 a, float16 b, float_status *status)
1050 FloatParts pa = float16_unpack_canonical(a, status);
1051 FloatParts pb = float16_unpack_canonical(b, status);
1052 FloatParts pr = addsub_floats(pa, pb, false, status);
1054 return float16_round_pack_canonical(pr, status);
1057 float16 QEMU_FLATTEN float16_sub(float16 a, float16 b, float_status *status)
1059 FloatParts pa = float16_unpack_canonical(a, status);
1060 FloatParts pb = float16_unpack_canonical(b, status);
1061 FloatParts pr = addsub_floats(pa, pb, true, status);
1063 return float16_round_pack_canonical(pr, status);
1066 static float32 QEMU_SOFTFLOAT_ATTR
1067 soft_f32_addsub(float32 a, float32 b, bool subtract, float_status *status)
1069 FloatParts pa = float32_unpack_canonical(a, status);
1070 FloatParts pb = float32_unpack_canonical(b, status);
1071 FloatParts pr = addsub_floats(pa, pb, subtract, status);
1073 return float32_round_pack_canonical(pr, status);
1076 static inline float32 soft_f32_add(float32 a, float32 b, float_status *status)
1078 return soft_f32_addsub(a, b, false, status);
1081 static inline float32 soft_f32_sub(float32 a, float32 b, float_status *status)
1083 return soft_f32_addsub(a, b, true, status);
1086 static float64 QEMU_SOFTFLOAT_ATTR
1087 soft_f64_addsub(float64 a, float64 b, bool subtract, float_status *status)
1089 FloatParts pa = float64_unpack_canonical(a, status);
1090 FloatParts pb = float64_unpack_canonical(b, status);
1091 FloatParts pr = addsub_floats(pa, pb, subtract, status);
1093 return float64_round_pack_canonical(pr, status);
1096 static inline float64 soft_f64_add(float64 a, float64 b, float_status *status)
1098 return soft_f64_addsub(a, b, false, status);
1101 static inline float64 soft_f64_sub(float64 a, float64 b, float_status *status)
1103 return soft_f64_addsub(a, b, true, status);
/* Host-FPU single-precision addition. */
static float hard_f32_add(float a, float b)
{
    return (a + b);
}
/* Host-FPU single-precision subtraction. */
static float hard_f32_sub(float a, float b)
{
    return (a - b);
}
/* Host-FPU double-precision addition. */
static double hard_f64_add(double a, double b)
{
    return (a + b);
}
/* Host-FPU double-precision subtraction. */
static double hard_f64_sub(double a, double b)
{
    return (a - b);
}
1126 static bool f32_addsub_post(union_float32 a, union_float32 b)
1128 if (QEMU_HARDFLOAT_2F32_USE_FP) {
1129 return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1131 return !(float32_is_zero(a.s) && float32_is_zero(b.s));
1134 static bool f64_addsub_post(union_float64 a, union_float64 b)
1136 if (QEMU_HARDFLOAT_2F64_USE_FP) {
1137 return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1138 } else {
1139 return !(float64_is_zero(a.s) && float64_is_zero(b.s));
1143 static float32 float32_addsub(float32 a, float32 b, float_status *s,
1144 hard_f32_op2_fn hard, soft_f32_op2_fn soft)
1146 return float32_gen2(a, b, s, hard, soft,
1147 f32_is_zon2, f32_addsub_post, NULL, NULL);
1150 static float64 float64_addsub(float64 a, float64 b, float_status *s,
1151 hard_f64_op2_fn hard, soft_f64_op2_fn soft)
1153 return float64_gen2(a, b, s, hard, soft,
1154 f64_is_zon2, f64_addsub_post, NULL, NULL);
1157 float32 QEMU_FLATTEN
1158 float32_add(float32 a, float32 b, float_status *s)
1160 return float32_addsub(a, b, s, hard_f32_add, soft_f32_add);
1163 float32 QEMU_FLATTEN
1164 float32_sub(float32 a, float32 b, float_status *s)
1166 return float32_addsub(a, b, s, hard_f32_sub, soft_f32_sub);
1169 float64 QEMU_FLATTEN
1170 float64_add(float64 a, float64 b, float_status *s)
1172 return float64_addsub(a, b, s, hard_f64_add, soft_f64_add);
1175 float64 QEMU_FLATTEN
1176 float64_sub(float64 a, float64 b, float_status *s)
1178 return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub);
1182 * Returns the result of multiplying the floating-point values `a' and
1183 * `b'. The operation is performed according to the IEC/IEEE Standard
1184 * for Binary Floating-Point Arithmetic.
1187 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
1189 bool sign = a.sign ^ b.sign;
1191 if (a.cls == float_class_normal && b.cls == float_class_normal) {
1192 uint64_t hi, lo;
1193 int exp = a.exp + b.exp;
1195 mul64To128(a.frac, b.frac, &hi, &lo);
1196 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1197 if (lo & DECOMPOSED_OVERFLOW_BIT) {
1198 shift64RightJamming(lo, 1, &lo);
1199 exp += 1;
1202 /* Re-use a */
1203 a.exp = exp;
1204 a.sign = sign;
1205 a.frac = lo;
1206 return a;
1208 /* handle all the NaN cases */
1209 if (is_nan(a.cls) || is_nan(b.cls)) {
1210 return pick_nan(a, b, s);
1212 /* Inf * Zero == NaN */
1213 if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
1214 (a.cls == float_class_zero && b.cls == float_class_inf)) {
1215 s->float_exception_flags |= float_flag_invalid;
1216 return parts_default_nan(s);
1218 /* Multiply by 0 or Inf */
1219 if (a.cls == float_class_inf || a.cls == float_class_zero) {
1220 a.sign = sign;
1221 return a;
1223 if (b.cls == float_class_inf || b.cls == float_class_zero) {
1224 b.sign = sign;
1225 return b;
1227 g_assert_not_reached();
1230 float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status)
1232 FloatParts pa = float16_unpack_canonical(a, status);
1233 FloatParts pb = float16_unpack_canonical(b, status);
1234 FloatParts pr = mul_floats(pa, pb, status);
1236 return float16_round_pack_canonical(pr, status);
1239 static float32 QEMU_SOFTFLOAT_ATTR
1240 soft_f32_mul(float32 a, float32 b, float_status *status)
1242 FloatParts pa = float32_unpack_canonical(a, status);
1243 FloatParts pb = float32_unpack_canonical(b, status);
1244 FloatParts pr = mul_floats(pa, pb, status);
1246 return float32_round_pack_canonical(pr, status);
1249 static float64 QEMU_SOFTFLOAT_ATTR
1250 soft_f64_mul(float64 a, float64 b, float_status *status)
1252 FloatParts pa = float64_unpack_canonical(a, status);
1253 FloatParts pb = float64_unpack_canonical(b, status);
1254 FloatParts pr = mul_floats(pa, pb, status);
1256 return float64_round_pack_canonical(pr, status);
/* Host-FPU single-precision multiplication. */
static float hard_f32_mul(float a, float b)
{
    return (a * b);
}
/* Host-FPU double-precision multiplication. */
static double hard_f64_mul(double a, double b)
{
    return (a * b);
}
1269 static bool f32_mul_fast_test(union_float32 a, union_float32 b)
1271 return float32_is_zero(a.s) || float32_is_zero(b.s);
1274 static bool f64_mul_fast_test(union_float64 a, union_float64 b)
1276 return float64_is_zero(a.s) || float64_is_zero(b.s);
1279 static float32 f32_mul_fast_op(float32 a, float32 b, float_status *s)
1281 bool signbit = float32_is_neg(a) ^ float32_is_neg(b);
1283 return float32_set_sign(float32_zero, signbit);
1286 static float64 f64_mul_fast_op(float64 a, float64 b, float_status *s)
1288 bool signbit = float64_is_neg(a) ^ float64_is_neg(b);
1290 return float64_set_sign(float64_zero, signbit);
1293 float32 QEMU_FLATTEN
1294 float32_mul(float32 a, float32 b, float_status *s)
1296 return float32_gen2(a, b, s, hard_f32_mul, soft_f32_mul,
1297 f32_is_zon2, NULL, f32_mul_fast_test, f32_mul_fast_op);
1300 float64 QEMU_FLATTEN
1301 float64_mul(float64 a, float64 b, float_status *s)
1303 return float64_gen2(a, b, s, hard_f64_mul, soft_f64_mul,
1304 f64_is_zon2, NULL, f64_mul_fast_test, f64_mul_fast_op);
1308 * Returns the result of multiplying the floating-point values `a' and
1309 * `b' then adding 'c', with no intermediate rounding step after the
1310 * multiplication. The operation is performed according to the
1311 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
1312 * The flags argument allows the caller to select negation of the
1313 * addend, the intermediate product, or the final result. (The
1314 * difference between this and having the caller do a separate
1315 * negation is that negating externally will flip the sign bit on
1316 * NaNs.)
1319 static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
1320 int flags, float_status *s)
1322 bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
1323 ((1 << float_class_inf) | (1 << float_class_zero));
1324 bool p_sign;
1325 bool sign_flip = flags & float_muladd_negate_result;
1326 FloatClass p_class;
1327 uint64_t hi, lo;
1328 int p_exp;
1330 /* It is implementation-defined whether the cases of (0,inf,qnan)
1331 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
1332 * they return if they do), so we have to hand this information
1333 * off to the target-specific pick-a-NaN routine.
1335 if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
1336 return pick_nan_muladd(a, b, c, inf_zero, s);
1339 if (inf_zero) {
1340 s->float_exception_flags |= float_flag_invalid;
1341 return parts_default_nan(s);
1344 if (flags & float_muladd_negate_c) {
1345 c.sign ^= 1;
1348 p_sign = a.sign ^ b.sign;
1350 if (flags & float_muladd_negate_product) {
1351 p_sign ^= 1;
1354 if (a.cls == float_class_inf || b.cls == float_class_inf) {
1355 p_class = float_class_inf;
1356 } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
1357 p_class = float_class_zero;
1358 } else {
1359 p_class = float_class_normal;
1362 if (c.cls == float_class_inf) {
1363 if (p_class == float_class_inf && p_sign != c.sign) {
1364 s->float_exception_flags |= float_flag_invalid;
1365 return parts_default_nan(s);
1366 } else {
1367 a.cls = float_class_inf;
1368 a.sign = c.sign ^ sign_flip;
1369 return a;
1373 if (p_class == float_class_inf) {
1374 a.cls = float_class_inf;
1375 a.sign = p_sign ^ sign_flip;
1376 return a;
1379 if (p_class == float_class_zero) {
1380 if (c.cls == float_class_zero) {
1381 if (p_sign != c.sign) {
1382 p_sign = s->float_rounding_mode == float_round_down;
1384 c.sign = p_sign;
1385 } else if (flags & float_muladd_halve_result) {
1386 c.exp -= 1;
1388 c.sign ^= sign_flip;
1389 return c;
1392 /* a & b should be normals now... */
1393 assert(a.cls == float_class_normal &&
1394 b.cls == float_class_normal);
1396 p_exp = a.exp + b.exp;
1398 /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
1399 * result.
1401 mul64To128(a.frac, b.frac, &hi, &lo);
1402 /* binary point now at bit 124 */
1404 /* check for overflow */
1405 if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
1406 shift128RightJamming(hi, lo, 1, &hi, &lo);
1407 p_exp += 1;
1410 /* + add/sub */
1411 if (c.cls == float_class_zero) {
1412 /* move binary point back to 62 */
1413 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1414 } else {
1415 int exp_diff = p_exp - c.exp;
1416 if (p_sign == c.sign) {
1417 /* Addition */
1418 if (exp_diff <= 0) {
1419 shift128RightJamming(hi, lo,
1420 DECOMPOSED_BINARY_POINT - exp_diff,
1421 &hi, &lo);
1422 lo += c.frac;
1423 p_exp = c.exp;
1424 } else {
1425 uint64_t c_hi, c_lo;
1426 /* shift c to the same binary point as the product (124) */
1427 c_hi = c.frac >> 2;
1428 c_lo = 0;
1429 shift128RightJamming(c_hi, c_lo,
1430 exp_diff,
1431 &c_hi, &c_lo);
1432 add128(hi, lo, c_hi, c_lo, &hi, &lo);
1433 /* move binary point back to 62 */
1434 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1437 if (lo & DECOMPOSED_OVERFLOW_BIT) {
1438 shift64RightJamming(lo, 1, &lo);
1439 p_exp += 1;
1442 } else {
1443 /* Subtraction */
1444 uint64_t c_hi, c_lo;
1445 /* make C binary point match product at bit 124 */
1446 c_hi = c.frac >> 2;
1447 c_lo = 0;
1449 if (exp_diff <= 0) {
1450 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
1451 if (exp_diff == 0
1453 (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
1454 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1455 } else {
1456 sub128(c_hi, c_lo, hi, lo, &hi, &lo);
1457 p_sign ^= 1;
1458 p_exp = c.exp;
1460 } else {
1461 shift128RightJamming(c_hi, c_lo,
1462 exp_diff,
1463 &c_hi, &c_lo);
1464 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1467 if (hi == 0 && lo == 0) {
1468 a.cls = float_class_zero;
1469 a.sign = s->float_rounding_mode == float_round_down;
1470 a.sign ^= sign_flip;
1471 return a;
1472 } else {
1473 int shift;
1474 if (hi != 0) {
1475 shift = clz64(hi);
1476 } else {
1477 shift = clz64(lo) + 64;
1479 /* Normalizing to a binary point of 124 is the
1480 correct adjust for the exponent. However since we're
1481 shifting, we might as well put the binary point back
1482 at 62 where we really want it. Therefore shift as
1483 if we're leaving 1 bit at the top of the word, but
1484 adjust the exponent as if we're leaving 3 bits. */
1485 shift -= 1;
1486 if (shift >= 64) {
1487 lo = lo << (shift - 64);
1488 } else {
1489 hi = (hi << shift) | (lo >> (64 - shift));
1490 lo = hi | ((lo << shift) != 0);
1492 p_exp -= shift - 2;
1497 if (flags & float_muladd_halve_result) {
1498 p_exp -= 1;
1501 /* finally prepare our result */
1502 a.cls = float_class_normal;
1503 a.sign = p_sign ^ sign_flip;
1504 a.exp = p_exp;
1505 a.frac = lo;
1507 return a;
1510 float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c,
1511 int flags, float_status *status)
1513 FloatParts pa = float16_unpack_canonical(a, status);
1514 FloatParts pb = float16_unpack_canonical(b, status);
1515 FloatParts pc = float16_unpack_canonical(c, status);
1516 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1518 return float16_round_pack_canonical(pr, status);
1521 static float32 QEMU_SOFTFLOAT_ATTR
1522 soft_f32_muladd(float32 a, float32 b, float32 c, int flags,
1523 float_status *status)
1525 FloatParts pa = float32_unpack_canonical(a, status);
1526 FloatParts pb = float32_unpack_canonical(b, status);
1527 FloatParts pc = float32_unpack_canonical(c, status);
1528 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1530 return float32_round_pack_canonical(pr, status);
1533 static float64 QEMU_SOFTFLOAT_ATTR
1534 soft_f64_muladd(float64 a, float64 b, float64 c, int flags,
1535 float_status *status)
1537 FloatParts pa = float64_unpack_canonical(a, status);
1538 FloatParts pb = float64_unpack_canonical(b, status);
1539 FloatParts pc = float64_unpack_canonical(c, status);
1540 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1542 return float64_round_pack_canonical(pr, status);
1545 float32 QEMU_FLATTEN
1546 float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s)
1548 union_float32 ua, ub, uc, ur;
1550 ua.s = xa;
1551 ub.s = xb;
1552 uc.s = xc;
1554 if (unlikely(!can_use_fpu(s))) {
1555 goto soft;
1557 if (unlikely(flags & float_muladd_halve_result)) {
1558 goto soft;
1561 float32_input_flush3(&ua.s, &ub.s, &uc.s, s);
1562 if (unlikely(!f32_is_zon3(ua, ub, uc))) {
1563 goto soft;
1566 * When (a || b) == 0, there's no need to check for under/over flow,
1567 * since we know the addend is (normal || 0) and the product is 0.
1569 if (float32_is_zero(ua.s) || float32_is_zero(ub.s)) {
1570 union_float32 up;
1571 bool prod_sign;
1573 prod_sign = float32_is_neg(ua.s) ^ float32_is_neg(ub.s);
1574 prod_sign ^= !!(flags & float_muladd_negate_product);
1575 up.s = float32_set_sign(float32_zero, prod_sign);
1577 if (flags & float_muladd_negate_c) {
1578 uc.h = -uc.h;
1580 ur.h = up.h + uc.h;
1581 } else {
1582 if (flags & float_muladd_negate_product) {
1583 ua.h = -ua.h;
1585 if (flags & float_muladd_negate_c) {
1586 uc.h = -uc.h;
1589 ur.h = fmaf(ua.h, ub.h, uc.h);
1591 if (unlikely(f32_is_inf(ur))) {
1592 s->float_exception_flags |= float_flag_overflow;
1593 } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
1594 goto soft;
1597 if (flags & float_muladd_negate_result) {
1598 return float32_chs(ur.s);
1600 return ur.s;
1602 soft:
1603 return soft_f32_muladd(ua.s, ub.s, uc.s, flags, s);
1606 float64 QEMU_FLATTEN
1607 float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s)
1609 union_float64 ua, ub, uc, ur;
1611 ua.s = xa;
1612 ub.s = xb;
1613 uc.s = xc;
1615 if (unlikely(!can_use_fpu(s))) {
1616 goto soft;
1618 if (unlikely(flags & float_muladd_halve_result)) {
1619 goto soft;
1622 float64_input_flush3(&ua.s, &ub.s, &uc.s, s);
1623 if (unlikely(!f64_is_zon3(ua, ub, uc))) {
1624 goto soft;
1627 * When (a || b) == 0, there's no need to check for under/over flow,
1628 * since we know the addend is (normal || 0) and the product is 0.
1630 if (float64_is_zero(ua.s) || float64_is_zero(ub.s)) {
1631 union_float64 up;
1632 bool prod_sign;
1634 prod_sign = float64_is_neg(ua.s) ^ float64_is_neg(ub.s);
1635 prod_sign ^= !!(flags & float_muladd_negate_product);
1636 up.s = float64_set_sign(float64_zero, prod_sign);
1638 if (flags & float_muladd_negate_c) {
1639 uc.h = -uc.h;
1641 ur.h = up.h + uc.h;
1642 } else {
1643 if (flags & float_muladd_negate_product) {
1644 ua.h = -ua.h;
1646 if (flags & float_muladd_negate_c) {
1647 uc.h = -uc.h;
1650 ur.h = fma(ua.h, ub.h, uc.h);
1652 if (unlikely(f64_is_inf(ur))) {
1653 s->float_exception_flags |= float_flag_overflow;
1654 } else if (unlikely(fabs(ur.h) <= FLT_MIN)) {
1655 goto soft;
1658 if (flags & float_muladd_negate_result) {
1659 return float64_chs(ur.s);
1661 return ur.s;
1663 soft:
1664 return soft_f64_muladd(ua.s, ub.s, uc.s, flags, s);
1668 * Returns the result of dividing the floating-point value `a' by the
1669 * corresponding value `b'. The operation is performed according to
1670 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1673 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
1675 bool sign = a.sign ^ b.sign;
1677 if (a.cls == float_class_normal && b.cls == float_class_normal) {
1678 uint64_t n0, n1, q, r;
1679 int exp = a.exp - b.exp;
1682 * We want a 2*N / N-bit division to produce exactly an N-bit
1683 * result, so that we do not lose any precision and so that we
1684 * do not have to renormalize afterward. If A.frac < B.frac,
1685 * then division would produce an (N-1)-bit result; shift A left
1686 * by one to produce the an N-bit result, and decrement the
1687 * exponent to match.
1689 * The udiv_qrnnd algorithm that we're using requires normalization,
1690 * i.e. the msb of the denominator must be set. Since we know that
1691 * DECOMPOSED_BINARY_POINT is msb-1, the inputs must be shifted left
1692 * by one (more), and the remainder must be shifted right by one.
1694 if (a.frac < b.frac) {
1695 exp -= 1;
1696 shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 2, &n1, &n0);
1697 } else {
1698 shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, &n1, &n0);
1700 q = udiv_qrnnd(&r, n1, n0, b.frac << 1);
1703 * Set lsb if there is a remainder, to set inexact.
1704 * As mentioned above, to find the actual value of the remainder we
1705 * would need to shift right, but (1) we are only concerned about
1706 * non-zero-ness, and (2) the remainder will always be even because
1707 * both inputs to the division primitive are even.
1709 a.frac = q | (r != 0);
1710 a.sign = sign;
1711 a.exp = exp;
1712 return a;
1714 /* handle all the NaN cases */
1715 if (is_nan(a.cls) || is_nan(b.cls)) {
1716 return pick_nan(a, b, s);
1718 /* 0/0 or Inf/Inf */
1719 if (a.cls == b.cls
1721 (a.cls == float_class_inf || a.cls == float_class_zero)) {
1722 s->float_exception_flags |= float_flag_invalid;
1723 return parts_default_nan(s);
1725 /* Inf / x or 0 / x */
1726 if (a.cls == float_class_inf || a.cls == float_class_zero) {
1727 a.sign = sign;
1728 return a;
1730 /* Div 0 => Inf */
1731 if (b.cls == float_class_zero) {
1732 s->float_exception_flags |= float_flag_divbyzero;
1733 a.cls = float_class_inf;
1734 a.sign = sign;
1735 return a;
1737 /* Div by Inf */
1738 if (b.cls == float_class_inf) {
1739 a.cls = float_class_zero;
1740 a.sign = sign;
1741 return a;
1743 g_assert_not_reached();
1746 float16 float16_div(float16 a, float16 b, float_status *status)
1748 FloatParts pa = float16_unpack_canonical(a, status);
1749 FloatParts pb = float16_unpack_canonical(b, status);
1750 FloatParts pr = div_floats(pa, pb, status);
1752 return float16_round_pack_canonical(pr, status);
1755 static float32 QEMU_SOFTFLOAT_ATTR
1756 soft_f32_div(float32 a, float32 b, float_status *status)
1758 FloatParts pa = float32_unpack_canonical(a, status);
1759 FloatParts pb = float32_unpack_canonical(b, status);
1760 FloatParts pr = div_floats(pa, pb, status);
1762 return float32_round_pack_canonical(pr, status);
1765 static float64 QEMU_SOFTFLOAT_ATTR
1766 soft_f64_div(float64 a, float64 b, float_status *status)
1768 FloatParts pa = float64_unpack_canonical(a, status);
1769 FloatParts pb = float64_unpack_canonical(b, status);
1770 FloatParts pr = div_floats(pa, pb, status);
1772 return float64_round_pack_canonical(pr, status);
/* Host-FPU single-precision division. */
static float hard_f32_div(float a, float b)
{
    return (a / b);
}
/* Host-FPU double-precision division. */
static double hard_f64_div(double a, double b)
{
    return (a / b);
}
1785 static bool f32_div_pre(union_float32 a, union_float32 b)
1787 if (QEMU_HARDFLOAT_2F32_USE_FP) {
1788 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
1789 fpclassify(b.h) == FP_NORMAL;
1791 return float32_is_zero_or_normal(a.s) && float32_is_normal(b.s);
1794 static bool f64_div_pre(union_float64 a, union_float64 b)
1796 if (QEMU_HARDFLOAT_2F64_USE_FP) {
1797 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
1798 fpclassify(b.h) == FP_NORMAL;
1800 return float64_is_zero_or_normal(a.s) && float64_is_normal(b.s);
1803 static bool f32_div_post(union_float32 a, union_float32 b)
1805 if (QEMU_HARDFLOAT_2F32_USE_FP) {
1806 return fpclassify(a.h) != FP_ZERO;
1808 return !float32_is_zero(a.s);
1811 static bool f64_div_post(union_float64 a, union_float64 b)
1813 if (QEMU_HARDFLOAT_2F64_USE_FP) {
1814 return fpclassify(a.h) != FP_ZERO;
1816 return !float64_is_zero(a.s);
1819 float32 QEMU_FLATTEN
1820 float32_div(float32 a, float32 b, float_status *s)
1822 return float32_gen2(a, b, s, hard_f32_div, soft_f32_div,
1823 f32_div_pre, f32_div_post, NULL, NULL);
1826 float64 QEMU_FLATTEN
1827 float64_div(float64 a, float64 b, float_status *s)
1829 return float64_gen2(a, b, s, hard_f64_div, soft_f64_div,
1830 f64_div_pre, f64_div_post, NULL, NULL);
1834 * Float to Float conversions
1836 * Returns the result of converting one float format to another. The
1837 * conversion is performed according to the IEC/IEEE Standard for
1838 * Binary Floating-Point Arithmetic.
1840 * The float_to_float helper only needs to take care of raising
1841 * invalid exceptions and handling the conversion on NaNs.
1844 static FloatParts float_to_float(FloatParts a, const FloatFmt *dstf,
1845 float_status *s)
1847 if (dstf->arm_althp) {
1848 switch (a.cls) {
1849 case float_class_qnan:
1850 case float_class_snan:
1851 /* There is no NaN in the destination format. Raise Invalid
1852 * and return a zero with the sign of the input NaN.
1854 s->float_exception_flags |= float_flag_invalid;
1855 a.cls = float_class_zero;
1856 a.frac = 0;
1857 a.exp = 0;
1858 break;
1860 case float_class_inf:
1861 /* There is no Inf in the destination format. Raise Invalid
1862 * and return the maximum normal with the correct sign.
1864 s->float_exception_flags |= float_flag_invalid;
1865 a.cls = float_class_normal;
1866 a.exp = dstf->exp_max;
1867 a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift;
1868 break;
1870 default:
1871 break;
1873 } else if (is_nan(a.cls)) {
1874 if (is_snan(a.cls)) {
1875 s->float_exception_flags |= float_flag_invalid;
1876 a = parts_silence_nan(a, s);
1878 if (s->default_nan_mode) {
1879 return parts_default_nan(s);
1882 return a;
1885 float32 float16_to_float32(float16 a, bool ieee, float_status *s)
1887 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1888 FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1889 FloatParts pr = float_to_float(p, &float32_params, s);
1890 return float32_round_pack_canonical(pr, s);
1893 float64 float16_to_float64(float16 a, bool ieee, float_status *s)
1895 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1896 FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1897 FloatParts pr = float_to_float(p, &float64_params, s);
1898 return float64_round_pack_canonical(pr, s);
1901 float16 float32_to_float16(float32 a, bool ieee, float_status *s)
1903 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1904 FloatParts p = float32_unpack_canonical(a, s);
1905 FloatParts pr = float_to_float(p, fmt16, s);
1906 return float16a_round_pack_canonical(pr, s, fmt16);
1909 float64 float32_to_float64(float32 a, float_status *s)
1911 FloatParts p = float32_unpack_canonical(a, s);
1912 FloatParts pr = float_to_float(p, &float64_params, s);
1913 return float64_round_pack_canonical(pr, s);
1916 float16 float64_to_float16(float64 a, bool ieee, float_status *s)
1918 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1919 FloatParts p = float64_unpack_canonical(a, s);
1920 FloatParts pr = float_to_float(p, fmt16, s);
1921 return float16a_round_pack_canonical(pr, s, fmt16);
1924 float32 float64_to_float32(float64 a, float_status *s)
1926 FloatParts p = float64_unpack_canonical(a, s);
1927 FloatParts pr = float_to_float(p, &float32_params, s);
1928 return float32_round_pack_canonical(pr, s);
1932 * Rounds the floating-point value `a' to an integer, and returns the
1933 * result as a floating-point value. The operation is performed
1934 * according to the IEC/IEEE Standard for Binary Floating-Point
1935 * Arithmetic.
1938 static FloatParts round_to_int(FloatParts a, int rmode,
1939 int scale, float_status *s)
1941 switch (a.cls) {
1942 case float_class_qnan:
1943 case float_class_snan:
1944 return return_nan(a, s);
1946 case float_class_zero:
1947 case float_class_inf:
1948 /* already "integral" */
1949 break;
1951 case float_class_normal:
1952 scale = MIN(MAX(scale, -0x10000), 0x10000);
1953 a.exp += scale;
1955 if (a.exp >= DECOMPOSED_BINARY_POINT) {
1956 /* already integral */
1957 break;
1959 if (a.exp < 0) {
1960 bool one;
1961 /* all fractional */
1962 s->float_exception_flags |= float_flag_inexact;
1963 switch (rmode) {
1964 case float_round_nearest_even:
1965 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
1966 break;
1967 case float_round_ties_away:
1968 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
1969 break;
1970 case float_round_to_zero:
1971 one = false;
1972 break;
1973 case float_round_up:
1974 one = !a.sign;
1975 break;
1976 case float_round_down:
1977 one = a.sign;
1978 break;
1979 default:
1980 g_assert_not_reached();
1983 if (one) {
1984 a.frac = DECOMPOSED_IMPLICIT_BIT;
1985 a.exp = 0;
1986 } else {
1987 a.cls = float_class_zero;
1989 } else {
1990 uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
1991 uint64_t frac_lsbm1 = frac_lsb >> 1;
1992 uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
1993 uint64_t rnd_mask = rnd_even_mask >> 1;
1994 uint64_t inc;
1996 switch (rmode) {
1997 case float_round_nearest_even:
1998 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
1999 break;
2000 case float_round_ties_away:
2001 inc = frac_lsbm1;
2002 break;
2003 case float_round_to_zero:
2004 inc = 0;
2005 break;
2006 case float_round_up:
2007 inc = a.sign ? 0 : rnd_mask;
2008 break;
2009 case float_round_down:
2010 inc = a.sign ? rnd_mask : 0;
2011 break;
2012 default:
2013 g_assert_not_reached();
2016 if (a.frac & rnd_mask) {
2017 s->float_exception_flags |= float_flag_inexact;
2018 a.frac += inc;
2019 a.frac &= ~rnd_mask;
2020 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
2021 a.frac >>= 1;
2022 a.exp++;
2026 break;
2027 default:
2028 g_assert_not_reached();
2030 return a;
2033 float16 float16_round_to_int(float16 a, float_status *s)
2035 FloatParts pa = float16_unpack_canonical(a, s);
2036 FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2037 return float16_round_pack_canonical(pr, s);
2040 float32 float32_round_to_int(float32 a, float_status *s)
2042 FloatParts pa = float32_unpack_canonical(a, s);
2043 FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2044 return float32_round_pack_canonical(pr, s);
2047 float64 float64_round_to_int(float64 a, float_status *s)
2049 FloatParts pa = float64_unpack_canonical(a, s);
2050 FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2051 return float64_round_pack_canonical(pr, s);
2055 * Returns the result of converting the floating-point value `a' to
2056 * the two's complement integer format. The conversion is performed
2057 * according to the IEC/IEEE Standard for Binary Floating-Point
2058 * Arithmetic---which means in particular that the conversion is
2059 * rounded according to the current rounding mode. If `a' is a NaN,
2060 * the largest positive integer is returned. Otherwise, if the
2061 * conversion overflows, the largest integer with the same sign as `a'
2062 * is returned.
2065 static int64_t round_to_int_and_pack(FloatParts in, int rmode, int scale,
2066 int64_t min, int64_t max,
2067 float_status *s)
2069 uint64_t r;
2070 int orig_flags = get_float_exception_flags(s);
2071 FloatParts p = round_to_int(in, rmode, scale, s);
2073 switch (p.cls) {
2074 case float_class_snan:
2075 case float_class_qnan:
2076 s->float_exception_flags = orig_flags | float_flag_invalid;
2077 return max;
2078 case float_class_inf:
2079 s->float_exception_flags = orig_flags | float_flag_invalid;
2080 return p.sign ? min : max;
2081 case float_class_zero:
2082 return 0;
2083 case float_class_normal:
2084 if (p.exp < DECOMPOSED_BINARY_POINT) {
2085 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
2086 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
2087 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
2088 } else {
2089 r = UINT64_MAX;
2091 if (p.sign) {
2092 if (r <= -(uint64_t) min) {
2093 return -r;
2094 } else {
2095 s->float_exception_flags = orig_flags | float_flag_invalid;
2096 return min;
2098 } else {
2099 if (r <= max) {
2100 return r;
2101 } else {
2102 s->float_exception_flags = orig_flags | float_flag_invalid;
2103 return max;
2106 default:
2107 g_assert_not_reached();
2111 int16_t float16_to_int16_scalbn(float16 a, int rmode, int scale,
2112 float_status *s)
2114 return round_to_int_and_pack(float16_unpack_canonical(a, s),
2115 rmode, scale, INT16_MIN, INT16_MAX, s);
2118 int32_t float16_to_int32_scalbn(float16 a, int rmode, int scale,
2119 float_status *s)
2121 return round_to_int_and_pack(float16_unpack_canonical(a, s),
2122 rmode, scale, INT32_MIN, INT32_MAX, s);
2125 int64_t float16_to_int64_scalbn(float16 a, int rmode, int scale,
2126 float_status *s)
2128 return round_to_int_and_pack(float16_unpack_canonical(a, s),
2129 rmode, scale, INT64_MIN, INT64_MAX, s);
2132 int16_t float32_to_int16_scalbn(float32 a, int rmode, int scale,
2133 float_status *s)
2135 return round_to_int_and_pack(float32_unpack_canonical(a, s),
2136 rmode, scale, INT16_MIN, INT16_MAX, s);
2139 int32_t float32_to_int32_scalbn(float32 a, int rmode, int scale,
2140 float_status *s)
2142 return round_to_int_and_pack(float32_unpack_canonical(a, s),
2143 rmode, scale, INT32_MIN, INT32_MAX, s);
2146 int64_t float32_to_int64_scalbn(float32 a, int rmode, int scale,
2147 float_status *s)
2149 return round_to_int_and_pack(float32_unpack_canonical(a, s),
2150 rmode, scale, INT64_MIN, INT64_MAX, s);
2153 int16_t float64_to_int16_scalbn(float64 a, int rmode, int scale,
2154 float_status *s)
2156 return round_to_int_and_pack(float64_unpack_canonical(a, s),
2157 rmode, scale, INT16_MIN, INT16_MAX, s);
2160 int32_t float64_to_int32_scalbn(float64 a, int rmode, int scale,
2161 float_status *s)
2163 return round_to_int_and_pack(float64_unpack_canonical(a, s),
2164 rmode, scale, INT32_MIN, INT32_MAX, s);
2167 int64_t float64_to_int64_scalbn(float64 a, int rmode, int scale,
2168 float_status *s)
2170 return round_to_int_and_pack(float64_unpack_canonical(a, s),
2171 rmode, scale, INT64_MIN, INT64_MAX, s);
2174 int16_t float16_to_int16(float16 a, float_status *s)
2176 return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2179 int32_t float16_to_int32(float16 a, float_status *s)
2181 return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2184 int64_t float16_to_int64(float16 a, float_status *s)
2186 return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2189 int16_t float32_to_int16(float32 a, float_status *s)
2191 return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2194 int32_t float32_to_int32(float32 a, float_status *s)
2196 return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2199 int64_t float32_to_int64(float32 a, float_status *s)
2201 return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2204 int16_t float64_to_int16(float64 a, float_status *s)
2206 return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2209 int32_t float64_to_int32(float64 a, float_status *s)
2211 return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2214 int64_t float64_to_int64(float64 a, float_status *s)
2216 return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2219 int16_t float16_to_int16_round_to_zero(float16 a, float_status *s)
2221 return float16_to_int16_scalbn(a, float_round_to_zero, 0, s);
2224 int32_t float16_to_int32_round_to_zero(float16 a, float_status *s)
2226 return float16_to_int32_scalbn(a, float_round_to_zero, 0, s);
2229 int64_t float16_to_int64_round_to_zero(float16 a, float_status *s)
2231 return float16_to_int64_scalbn(a, float_round_to_zero, 0, s);
2234 int16_t float32_to_int16_round_to_zero(float32 a, float_status *s)
2236 return float32_to_int16_scalbn(a, float_round_to_zero, 0, s);
2239 int32_t float32_to_int32_round_to_zero(float32 a, float_status *s)
2241 return float32_to_int32_scalbn(a, float_round_to_zero, 0, s);
2244 int64_t float32_to_int64_round_to_zero(float32 a, float_status *s)
2246 return float32_to_int64_scalbn(a, float_round_to_zero, 0, s);
2249 int16_t float64_to_int16_round_to_zero(float64 a, float_status *s)
2251 return float64_to_int16_scalbn(a, float_round_to_zero, 0, s);
2254 int32_t float64_to_int32_round_to_zero(float64 a, float_status *s)
2256 return float64_to_int32_scalbn(a, float_round_to_zero, 0, s);
2259 int64_t float64_to_int64_round_to_zero(float64 a, float_status *s)
2261 return float64_to_int64_scalbn(a, float_round_to_zero, 0, s);
2265 * Returns the result of converting the floating-point value `a' to
2266 * the unsigned integer format. The conversion is performed according
2267 * to the IEC/IEEE Standard for Binary Floating-Point
2268 * Arithmetic---which means in particular that the conversion is
2269 * rounded according to the current rounding mode. If `a' is a NaN,
2270 * the largest unsigned integer is returned. Otherwise, if the
2271 * conversion overflows, the largest unsigned integer is returned. If
2272 * `a' is negative, the result is rounded and zero is returned;
2273 * values that do not round to zero will raise the invalid exception
2274 * flag.
2277 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, int scale,
2278 uint64_t max, float_status *s)
2280 int orig_flags = get_float_exception_flags(s);
2281 FloatParts p = round_to_int(in, rmode, scale, s);
2282 uint64_t r;
2284 switch (p.cls) {
2285 case float_class_snan:
2286 case float_class_qnan:
2287 s->float_exception_flags = orig_flags | float_flag_invalid;
2288 return max;
2289 case float_class_inf:
2290 s->float_exception_flags = orig_flags | float_flag_invalid;
2291 return p.sign ? 0 : max;
2292 case float_class_zero:
2293 return 0;
2294 case float_class_normal:
2295 if (p.sign) {
2296 s->float_exception_flags = orig_flags | float_flag_invalid;
2297 return 0;
2300 if (p.exp < DECOMPOSED_BINARY_POINT) {
2301 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
2302 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
2303 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
2304 } else {
2305 s->float_exception_flags = orig_flags | float_flag_invalid;
2306 return max;
2309 /* For uint64 this will never trip, but if p.exp is too large
2310 * to shift a decomposed fraction we shall have exited via the
2311 * 3rd leg above.
2313 if (r > max) {
2314 s->float_exception_flags = orig_flags | float_flag_invalid;
2315 return max;
2317 return r;
2318 default:
2319 g_assert_not_reached();
2323 uint16_t float16_to_uint16_scalbn(float16 a, int rmode, int scale,
2324 float_status *s)
2326 return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2327 rmode, scale, UINT16_MAX, s);
2330 uint32_t float16_to_uint32_scalbn(float16 a, int rmode, int scale,
2331 float_status *s)
2333 return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2334 rmode, scale, UINT32_MAX, s);
2337 uint64_t float16_to_uint64_scalbn(float16 a, int rmode, int scale,
2338 float_status *s)
2340 return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2341 rmode, scale, UINT64_MAX, s);
2344 uint16_t float32_to_uint16_scalbn(float32 a, int rmode, int scale,
2345 float_status *s)
2347 return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2348 rmode, scale, UINT16_MAX, s);
2351 uint32_t float32_to_uint32_scalbn(float32 a, int rmode, int scale,
2352 float_status *s)
2354 return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2355 rmode, scale, UINT32_MAX, s);
2358 uint64_t float32_to_uint64_scalbn(float32 a, int rmode, int scale,
2359 float_status *s)
2361 return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2362 rmode, scale, UINT64_MAX, s);
2365 uint16_t float64_to_uint16_scalbn(float64 a, int rmode, int scale,
2366 float_status *s)
2368 return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2369 rmode, scale, UINT16_MAX, s);
2372 uint32_t float64_to_uint32_scalbn(float64 a, int rmode, int scale,
2373 float_status *s)
2375 return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2376 rmode, scale, UINT32_MAX, s);
2379 uint64_t float64_to_uint64_scalbn(float64 a, int rmode, int scale,
2380 float_status *s)
2382 return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2383 rmode, scale, UINT64_MAX, s);
2386 uint16_t float16_to_uint16(float16 a, float_status *s)
2388 return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2391 uint32_t float16_to_uint32(float16 a, float_status *s)
2393 return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2396 uint64_t float16_to_uint64(float16 a, float_status *s)
2398 return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2401 uint16_t float32_to_uint16(float32 a, float_status *s)
2403 return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2406 uint32_t float32_to_uint32(float32 a, float_status *s)
2408 return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2411 uint64_t float32_to_uint64(float32 a, float_status *s)
2413 return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2416 uint16_t float64_to_uint16(float64 a, float_status *s)
2418 return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2421 uint32_t float64_to_uint32(float64 a, float_status *s)
2423 return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2426 uint64_t float64_to_uint64(float64 a, float_status *s)
2428 return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2431 uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s)
2433 return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2436 uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s)
2438 return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2441 uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s)
2443 return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2446 uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s)
2448 return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2451 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s)
2453 return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2456 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s)
2458 return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2461 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s)
2463 return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2466 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s)
2468 return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2471 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s)
2473 return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2477 * Integer to float conversions
2479 * Returns the result of converting the two's complement integer `a'
2480 * to the floating-point format. The conversion is performed according
2481 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2484 static FloatParts int_to_float(int64_t a, int scale, float_status *status)
2486 FloatParts r = { .sign = false };
2488 if (a == 0) {
2489 r.cls = float_class_zero;
2490 } else {
2491 uint64_t f = a;
2492 int shift;
2494 r.cls = float_class_normal;
2495 if (a < 0) {
2496 f = -f;
2497 r.sign = true;
2499 shift = clz64(f) - 1;
2500 scale = MIN(MAX(scale, -0x10000), 0x10000);
2502 r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2503 r.frac = (shift < 0 ? DECOMPOSED_IMPLICIT_BIT : f << shift);
2506 return r;
2509 float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status)
2511 FloatParts pa = int_to_float(a, scale, status);
2512 return float16_round_pack_canonical(pa, status);
2515 float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status)
2517 return int64_to_float16_scalbn(a, scale, status);
2520 float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status)
2522 return int64_to_float16_scalbn(a, scale, status);
2525 float16 int64_to_float16(int64_t a, float_status *status)
2527 return int64_to_float16_scalbn(a, 0, status);
2530 float16 int32_to_float16(int32_t a, float_status *status)
2532 return int64_to_float16_scalbn(a, 0, status);
2535 float16 int16_to_float16(int16_t a, float_status *status)
2537 return int64_to_float16_scalbn(a, 0, status);
2540 float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status)
2542 FloatParts pa = int_to_float(a, scale, status);
2543 return float32_round_pack_canonical(pa, status);
2546 float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status)
2548 return int64_to_float32_scalbn(a, scale, status);
2551 float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status)
2553 return int64_to_float32_scalbn(a, scale, status);
2556 float32 int64_to_float32(int64_t a, float_status *status)
2558 return int64_to_float32_scalbn(a, 0, status);
2561 float32 int32_to_float32(int32_t a, float_status *status)
2563 return int64_to_float32_scalbn(a, 0, status);
2566 float32 int16_to_float32(int16_t a, float_status *status)
2568 return int64_to_float32_scalbn(a, 0, status);
2571 float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status)
2573 FloatParts pa = int_to_float(a, scale, status);
2574 return float64_round_pack_canonical(pa, status);
2577 float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status)
2579 return int64_to_float64_scalbn(a, scale, status);
2582 float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status)
2584 return int64_to_float64_scalbn(a, scale, status);
2587 float64 int64_to_float64(int64_t a, float_status *status)
2589 return int64_to_float64_scalbn(a, 0, status);
2592 float64 int32_to_float64(int32_t a, float_status *status)
2594 return int64_to_float64_scalbn(a, 0, status);
2597 float64 int16_to_float64(int16_t a, float_status *status)
2599 return int64_to_float64_scalbn(a, 0, status);
2604 * Unsigned Integer to float conversions
2606 * Returns the result of converting the unsigned integer `a' to the
2607 * floating-point format. The conversion is performed according to the
2608 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2611 static FloatParts uint_to_float(uint64_t a, int scale, float_status *status)
2613 FloatParts r = { .sign = false };
2615 if (a == 0) {
2616 r.cls = float_class_zero;
2617 } else {
2618 scale = MIN(MAX(scale, -0x10000), 0x10000);
2619 r.cls = float_class_normal;
2620 if ((int64_t)a < 0) {
2621 r.exp = DECOMPOSED_BINARY_POINT + 1 + scale;
2622 shift64RightJamming(a, 1, &a);
2623 r.frac = a;
2624 } else {
2625 int shift = clz64(a) - 1;
2626 r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2627 r.frac = a << shift;
2631 return r;
2634 float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status)
2636 FloatParts pa = uint_to_float(a, scale, status);
2637 return float16_round_pack_canonical(pa, status);
2640 float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status)
2642 return uint64_to_float16_scalbn(a, scale, status);
2645 float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status)
2647 return uint64_to_float16_scalbn(a, scale, status);
2650 float16 uint64_to_float16(uint64_t a, float_status *status)
2652 return uint64_to_float16_scalbn(a, 0, status);
2655 float16 uint32_to_float16(uint32_t a, float_status *status)
2657 return uint64_to_float16_scalbn(a, 0, status);
2660 float16 uint16_to_float16(uint16_t a, float_status *status)
2662 return uint64_to_float16_scalbn(a, 0, status);
2665 float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status)
2667 FloatParts pa = uint_to_float(a, scale, status);
2668 return float32_round_pack_canonical(pa, status);
2671 float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status)
2673 return uint64_to_float32_scalbn(a, scale, status);
2676 float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status)
2678 return uint64_to_float32_scalbn(a, scale, status);
2681 float32 uint64_to_float32(uint64_t a, float_status *status)
2683 return uint64_to_float32_scalbn(a, 0, status);
2686 float32 uint32_to_float32(uint32_t a, float_status *status)
2688 return uint64_to_float32_scalbn(a, 0, status);
2691 float32 uint16_to_float32(uint16_t a, float_status *status)
2693 return uint64_to_float32_scalbn(a, 0, status);
2696 float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status)
2698 FloatParts pa = uint_to_float(a, scale, status);
2699 return float64_round_pack_canonical(pa, status);
2702 float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status)
2704 return uint64_to_float64_scalbn(a, scale, status);
2707 float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status)
2709 return uint64_to_float64_scalbn(a, scale, status);
2712 float64 uint64_to_float64(uint64_t a, float_status *status)
2714 return uint64_to_float64_scalbn(a, 0, status);
2717 float64 uint32_to_float64(uint32_t a, float_status *status)
2719 return uint64_to_float64_scalbn(a, 0, status);
2722 float64 uint16_to_float64(uint16_t a, float_status *status)
2724 return uint64_to_float64_scalbn(a, 0, status);
2727 /* Float Min/Max */
2728 /* min() and max() functions. These can't be implemented as
2729 * 'compare and pick one input' because that would mishandle
2730 * NaNs and +0 vs -0.
2732 * minnum() and maxnum() functions. These are similar to the min()
2733 * and max() functions but if one of the arguments is a QNaN and
2734 * the other is numerical then the numerical argument is returned.
2735 * SNaNs will get quietened before being returned.
2736 * minnum() and maxnum() correspond to the IEEE 754-2008 minNum()
2737 * and maxNum() operations. min() and max() are the typical min/max
2738 * semantics provided by many CPUs which predate that specification.
2740 * minnummag() and maxnummag() functions correspond to minNumMag()
2741 * and maxNumMag() from the IEEE-754 2008.
2743 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin,
2744 bool ieee, bool ismag, float_status *s)
2746 if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
2747 if (ieee) {
2748 /* Takes two floating-point values `a' and `b', one of
2749 * which is a NaN, and returns the appropriate NaN
2750 * result. If either `a' or `b' is a signaling NaN,
2751 * the invalid exception is raised.
2753 if (is_snan(a.cls) || is_snan(b.cls)) {
2754 return pick_nan(a, b, s);
2755 } else if (is_nan(a.cls) && !is_nan(b.cls)) {
2756 return b;
2757 } else if (is_nan(b.cls) && !is_nan(a.cls)) {
2758 return a;
2761 return pick_nan(a, b, s);
2762 } else {
2763 int a_exp, b_exp;
2765 switch (a.cls) {
2766 case float_class_normal:
2767 a_exp = a.exp;
2768 break;
2769 case float_class_inf:
2770 a_exp = INT_MAX;
2771 break;
2772 case float_class_zero:
2773 a_exp = INT_MIN;
2774 break;
2775 default:
2776 g_assert_not_reached();
2777 break;
2779 switch (b.cls) {
2780 case float_class_normal:
2781 b_exp = b.exp;
2782 break;
2783 case float_class_inf:
2784 b_exp = INT_MAX;
2785 break;
2786 case float_class_zero:
2787 b_exp = INT_MIN;
2788 break;
2789 default:
2790 g_assert_not_reached();
2791 break;
2794 if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
2795 bool a_less = a_exp < b_exp;
2796 if (a_exp == b_exp) {
2797 a_less = a.frac < b.frac;
2799 return a_less ^ ismin ? b : a;
2802 if (a.sign == b.sign) {
2803 bool a_less = a_exp < b_exp;
2804 if (a_exp == b_exp) {
2805 a_less = a.frac < b.frac;
2807 return a.sign ^ a_less ^ ismin ? b : a;
2808 } else {
2809 return a.sign ^ ismin ? b : a;
2814 #define MINMAX(sz, name, ismin, isiee, ismag) \
2815 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \
2816 float_status *s) \
2818 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
2819 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
2820 FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \
2822 return float ## sz ## _round_pack_canonical(pr, s); \
2825 MINMAX(16, min, true, false, false)
2826 MINMAX(16, minnum, true, true, false)
2827 MINMAX(16, minnummag, true, true, true)
2828 MINMAX(16, max, false, false, false)
2829 MINMAX(16, maxnum, false, true, false)
2830 MINMAX(16, maxnummag, false, true, true)
2832 MINMAX(32, min, true, false, false)
2833 MINMAX(32, minnum, true, true, false)
2834 MINMAX(32, minnummag, true, true, true)
2835 MINMAX(32, max, false, false, false)
2836 MINMAX(32, maxnum, false, true, false)
2837 MINMAX(32, maxnummag, false, true, true)
2839 MINMAX(64, min, true, false, false)
2840 MINMAX(64, minnum, true, true, false)
2841 MINMAX(64, minnummag, true, true, true)
2842 MINMAX(64, max, false, false, false)
2843 MINMAX(64, maxnum, false, true, false)
2844 MINMAX(64, maxnummag, false, true, true)
2846 #undef MINMAX
2848 /* Floating point compare */
2849 static int compare_floats(FloatParts a, FloatParts b, bool is_quiet,
2850 float_status *s)
2852 if (is_nan(a.cls) || is_nan(b.cls)) {
2853 if (!is_quiet ||
2854 a.cls == float_class_snan ||
2855 b.cls == float_class_snan) {
2856 s->float_exception_flags |= float_flag_invalid;
2858 return float_relation_unordered;
2861 if (a.cls == float_class_zero) {
2862 if (b.cls == float_class_zero) {
2863 return float_relation_equal;
2865 return b.sign ? float_relation_greater : float_relation_less;
2866 } else if (b.cls == float_class_zero) {
2867 return a.sign ? float_relation_less : float_relation_greater;
2870 /* The only really important thing about infinity is its sign. If
2871 * both are infinities the sign marks the smallest of the two.
2873 if (a.cls == float_class_inf) {
2874 if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
2875 return float_relation_equal;
2877 return a.sign ? float_relation_less : float_relation_greater;
2878 } else if (b.cls == float_class_inf) {
2879 return b.sign ? float_relation_greater : float_relation_less;
2882 if (a.sign != b.sign) {
2883 return a.sign ? float_relation_less : float_relation_greater;
2886 if (a.exp == b.exp) {
2887 if (a.frac == b.frac) {
2888 return float_relation_equal;
2890 if (a.sign) {
2891 return a.frac > b.frac ?
2892 float_relation_less : float_relation_greater;
2893 } else {
2894 return a.frac > b.frac ?
2895 float_relation_greater : float_relation_less;
2897 } else {
2898 if (a.sign) {
2899 return a.exp > b.exp ? float_relation_less : float_relation_greater;
2900 } else {
2901 return a.exp > b.exp ? float_relation_greater : float_relation_less;
2906 #define COMPARE(name, attr, sz) \
2907 static int attr \
2908 name(float ## sz a, float ## sz b, bool is_quiet, float_status *s) \
2910 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
2911 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
2912 return compare_floats(pa, pb, is_quiet, s); \
2915 COMPARE(soft_f16_compare, QEMU_FLATTEN, 16)
2916 COMPARE(soft_f32_compare, QEMU_SOFTFLOAT_ATTR, 32)
2917 COMPARE(soft_f64_compare, QEMU_SOFTFLOAT_ATTR, 64)
2919 #undef COMPARE
2921 int float16_compare(float16 a, float16 b, float_status *s)
2923 return soft_f16_compare(a, b, false, s);
2926 int float16_compare_quiet(float16 a, float16 b, float_status *s)
2928 return soft_f16_compare(a, b, true, s);
2931 static int QEMU_FLATTEN
2932 f32_compare(float32 xa, float32 xb, bool is_quiet, float_status *s)
2934 union_float32 ua, ub;
2936 ua.s = xa;
2937 ub.s = xb;
2939 if (QEMU_NO_HARDFLOAT) {
2940 goto soft;
2943 float32_input_flush2(&ua.s, &ub.s, s);
2944 if (isgreaterequal(ua.h, ub.h)) {
2945 if (isgreater(ua.h, ub.h)) {
2946 return float_relation_greater;
2948 return float_relation_equal;
2950 if (likely(isless(ua.h, ub.h))) {
2951 return float_relation_less;
2953 /* The only condition remaining is unordered.
2954 * Fall through to set flags.
2956 soft:
2957 return soft_f32_compare(ua.s, ub.s, is_quiet, s);
2960 int float32_compare(float32 a, float32 b, float_status *s)
2962 return f32_compare(a, b, false, s);
2965 int float32_compare_quiet(float32 a, float32 b, float_status *s)
2967 return f32_compare(a, b, true, s);
2970 static int QEMU_FLATTEN
2971 f64_compare(float64 xa, float64 xb, bool is_quiet, float_status *s)
2973 union_float64 ua, ub;
2975 ua.s = xa;
2976 ub.s = xb;
2978 if (QEMU_NO_HARDFLOAT) {
2979 goto soft;
2982 float64_input_flush2(&ua.s, &ub.s, s);
2983 if (isgreaterequal(ua.h, ub.h)) {
2984 if (isgreater(ua.h, ub.h)) {
2985 return float_relation_greater;
2987 return float_relation_equal;
2989 if (likely(isless(ua.h, ub.h))) {
2990 return float_relation_less;
2992 /* The only condition remaining is unordered.
2993 * Fall through to set flags.
2995 soft:
2996 return soft_f64_compare(ua.s, ub.s, is_quiet, s);
2999 int float64_compare(float64 a, float64 b, float_status *s)
3001 return f64_compare(a, b, false, s);
3004 int float64_compare_quiet(float64 a, float64 b, float_status *s)
3006 return f64_compare(a, b, true, s);
3009 /* Multiply A by 2 raised to the power N. */
3010 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s)
3012 if (unlikely(is_nan(a.cls))) {
3013 return return_nan(a, s);
3015 if (a.cls == float_class_normal) {
3016 /* The largest float type (even though not supported by FloatParts)
3017 * is float128, which has a 15 bit exponent. Bounding N to 16 bits
3018 * still allows rounding to infinity, without allowing overflow
3019 * within the int32_t that backs FloatParts.exp.
3021 n = MIN(MAX(n, -0x10000), 0x10000);
3022 a.exp += n;
3024 return a;
3027 float16 float16_scalbn(float16 a, int n, float_status *status)
3029 FloatParts pa = float16_unpack_canonical(a, status);
3030 FloatParts pr = scalbn_decomposed(pa, n, status);
3031 return float16_round_pack_canonical(pr, status);
3034 float32 float32_scalbn(float32 a, int n, float_status *status)
3036 FloatParts pa = float32_unpack_canonical(a, status);
3037 FloatParts pr = scalbn_decomposed(pa, n, status);
3038 return float32_round_pack_canonical(pr, status);
3041 float64 float64_scalbn(float64 a, int n, float_status *status)
3043 FloatParts pa = float64_unpack_canonical(a, status);
3044 FloatParts pr = scalbn_decomposed(pa, n, status);
3045 return float64_round_pack_canonical(pr, status);
3049 * Square Root
3051 * The old softfloat code did an approximation step before zeroing in
3052 * on the final result. However for simplicity we just compute the
3053 * square root by iterating down from the implicit bit to enough extra
3054 * bits to ensure we get a correctly rounded result.
3056 * This does mean however the calculation is slower than before,
3057 * especially for 64 bit floats.
3060 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p)
3062 uint64_t a_frac, r_frac, s_frac;
3063 int bit, last_bit;
3065 if (is_nan(a.cls)) {
3066 return return_nan(a, s);
3068 if (a.cls == float_class_zero) {
3069 return a; /* sqrt(+-0) = +-0 */
3071 if (a.sign) {
3072 s->float_exception_flags |= float_flag_invalid;
3073 return parts_default_nan(s);
3075 if (a.cls == float_class_inf) {
3076 return a; /* sqrt(+inf) = +inf */
3079 assert(a.cls == float_class_normal);
3081 /* We need two overflow bits at the top. Adding room for that is a
3082 * right shift. If the exponent is odd, we can discard the low bit
3083 * by multiplying the fraction by 2; that's a left shift. Combine
3084 * those and we shift right if the exponent is even.
3086 a_frac = a.frac;
3087 if (!(a.exp & 1)) {
3088 a_frac >>= 1;
3090 a.exp >>= 1;
3092 /* Bit-by-bit computation of sqrt. */
3093 r_frac = 0;
3094 s_frac = 0;
3096 /* Iterate from implicit bit down to the 3 extra bits to compute a
3097 * properly rounded result. Remember we've inserted one more bit
3098 * at the top, so these positions are one less.
3100 bit = DECOMPOSED_BINARY_POINT - 1;
3101 last_bit = MAX(p->frac_shift - 4, 0);
3102 do {
3103 uint64_t q = 1ULL << bit;
3104 uint64_t t_frac = s_frac + q;
3105 if (t_frac <= a_frac) {
3106 s_frac = t_frac + q;
3107 a_frac -= t_frac;
3108 r_frac += q;
3110 a_frac <<= 1;
3111 } while (--bit >= last_bit);
3113 /* Undo the right shift done above. If there is any remaining
3114 * fraction, the result is inexact. Set the sticky bit.
3116 a.frac = (r_frac << 1) + (a_frac != 0);
3118 return a;
3121 float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status)
3123 FloatParts pa = float16_unpack_canonical(a, status);
3124 FloatParts pr = sqrt_float(pa, status, &float16_params);
3125 return float16_round_pack_canonical(pr, status);
3128 static float32 QEMU_SOFTFLOAT_ATTR
3129 soft_f32_sqrt(float32 a, float_status *status)
3131 FloatParts pa = float32_unpack_canonical(a, status);
3132 FloatParts pr = sqrt_float(pa, status, &float32_params);
3133 return float32_round_pack_canonical(pr, status);
3136 static float64 QEMU_SOFTFLOAT_ATTR
3137 soft_f64_sqrt(float64 a, float_status *status)
3139 FloatParts pa = float64_unpack_canonical(a, status);
3140 FloatParts pr = sqrt_float(pa, status, &float64_params);
3141 return float64_round_pack_canonical(pr, status);
3144 float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s)
3146 union_float32 ua, ur;
3148 ua.s = xa;
3149 if (unlikely(!can_use_fpu(s))) {
3150 goto soft;
3153 float32_input_flush1(&ua.s, s);
3154 if (QEMU_HARDFLOAT_1F32_USE_FP) {
3155 if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
3156 fpclassify(ua.h) == FP_ZERO) ||
3157 signbit(ua.h))) {
3158 goto soft;
3160 } else if (unlikely(!float32_is_zero_or_normal(ua.s) ||
3161 float32_is_neg(ua.s))) {
3162 goto soft;
3164 ur.h = sqrtf(ua.h);
3165 return ur.s;
3167 soft:
3168 return soft_f32_sqrt(ua.s, s);
3171 float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s)
3173 union_float64 ua, ur;
3175 ua.s = xa;
3176 if (unlikely(!can_use_fpu(s))) {
3177 goto soft;
3180 float64_input_flush1(&ua.s, s);
3181 if (QEMU_HARDFLOAT_1F64_USE_FP) {
3182 if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
3183 fpclassify(ua.h) == FP_ZERO) ||
3184 signbit(ua.h))) {
3185 goto soft;
3187 } else if (unlikely(!float64_is_zero_or_normal(ua.s) ||
3188 float64_is_neg(ua.s))) {
3189 goto soft;
3191 ur.h = sqrt(ua.h);
3192 return ur.s;
3194 soft:
3195 return soft_f64_sqrt(ua.s, s);
3198 /*----------------------------------------------------------------------------
3199 | The pattern for a default generated NaN.
3200 *----------------------------------------------------------------------------*/
3202 float16 float16_default_nan(float_status *status)
3204 FloatParts p = parts_default_nan(status);
3205 p.frac >>= float16_params.frac_shift;
3206 return float16_pack_raw(p);
3209 float32 float32_default_nan(float_status *status)
3211 FloatParts p = parts_default_nan(status);
3212 p.frac >>= float32_params.frac_shift;
3213 return float32_pack_raw(p);
3216 float64 float64_default_nan(float_status *status)
3218 FloatParts p = parts_default_nan(status);
3219 p.frac >>= float64_params.frac_shift;
3220 return float64_pack_raw(p);
3223 float128 float128_default_nan(float_status *status)
3225 FloatParts p = parts_default_nan(status);
3226 float128 r;
3228 /* Extrapolate from the choices made by parts_default_nan to fill
3229 * in the quad-floating format. If the low bit is set, assume we
3230 * want to set all non-snan bits.
3232 r.low = -(p.frac & 1);
3233 r.high = p.frac >> (DECOMPOSED_BINARY_POINT - 48);
3234 r.high |= LIT64(0x7FFF000000000000);
3235 r.high |= (uint64_t)p.sign << 63;
3237 return r;
3240 /*----------------------------------------------------------------------------
3241 | Returns a quiet NaN from a signalling NaN for the floating point value `a'.
3242 *----------------------------------------------------------------------------*/
3244 float16 float16_silence_nan(float16 a, float_status *status)
3246 FloatParts p = float16_unpack_raw(a);
3247 p.frac <<= float16_params.frac_shift;
3248 p = parts_silence_nan(p, status);
3249 p.frac >>= float16_params.frac_shift;
3250 return float16_pack_raw(p);
3253 float32 float32_silence_nan(float32 a, float_status *status)
3255 FloatParts p = float32_unpack_raw(a);
3256 p.frac <<= float32_params.frac_shift;
3257 p = parts_silence_nan(p, status);
3258 p.frac >>= float32_params.frac_shift;
3259 return float32_pack_raw(p);
3262 float64 float64_silence_nan(float64 a, float_status *status)
3264 FloatParts p = float64_unpack_raw(a);
3265 p.frac <<= float64_params.frac_shift;
3266 p = parts_silence_nan(p, status);
3267 p.frac >>= float64_params.frac_shift;
3268 return float64_pack_raw(p);
3271 /*----------------------------------------------------------------------------
3272 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
3273 | and 7, and returns the properly rounded 32-bit integer corresponding to the
3274 | input. If `zSign' is 1, the input is negated before being converted to an
3275 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
3276 | is simply rounded to an integer, with the inexact exception raised if the
3277 | input cannot be represented exactly as an integer. However, if the fixed-
3278 | point input is too large, the invalid exception is raised and the largest
3279 | positive or negative integer is returned.
3280 *----------------------------------------------------------------------------*/
3282 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
3284 int8_t roundingMode;
3285 flag roundNearestEven;
3286 int8_t roundIncrement, roundBits;
3287 int32_t z;
3289 roundingMode = status->float_rounding_mode;
3290 roundNearestEven = ( roundingMode == float_round_nearest_even );
3291 switch (roundingMode) {
3292 case float_round_nearest_even:
3293 case float_round_ties_away:
3294 roundIncrement = 0x40;
3295 break;
3296 case float_round_to_zero:
3297 roundIncrement = 0;
3298 break;
3299 case float_round_up:
3300 roundIncrement = zSign ? 0 : 0x7f;
3301 break;
3302 case float_round_down:
3303 roundIncrement = zSign ? 0x7f : 0;
3304 break;
3305 default:
3306 abort();
3308 roundBits = absZ & 0x7F;
3309 absZ = ( absZ + roundIncrement )>>7;
3310 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
3311 z = absZ;
3312 if ( zSign ) z = - z;
3313 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
3314 float_raise(float_flag_invalid, status);
3315 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
3317 if (roundBits) {
3318 status->float_exception_flags |= float_flag_inexact;
3320 return z;
3324 /*----------------------------------------------------------------------------
3325 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3326 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3327 | and returns the properly rounded 64-bit integer corresponding to the input.
3328 | If `zSign' is 1, the input is negated before being converted to an integer.
3329 | Ordinarily, the fixed-point input is simply rounded to an integer, with
3330 | the inexact exception raised if the input cannot be represented exactly as
3331 | an integer. However, if the fixed-point input is too large, the invalid
3332 | exception is raised and the largest positive or negative integer is
3333 | returned.
3334 *----------------------------------------------------------------------------*/
3336 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
3337 float_status *status)
3339 int8_t roundingMode;
3340 flag roundNearestEven, increment;
3341 int64_t z;
3343 roundingMode = status->float_rounding_mode;
3344 roundNearestEven = ( roundingMode == float_round_nearest_even );
3345 switch (roundingMode) {
3346 case float_round_nearest_even:
3347 case float_round_ties_away:
3348 increment = ((int64_t) absZ1 < 0);
3349 break;
3350 case float_round_to_zero:
3351 increment = 0;
3352 break;
3353 case float_round_up:
3354 increment = !zSign && absZ1;
3355 break;
3356 case float_round_down:
3357 increment = zSign && absZ1;
3358 break;
3359 default:
3360 abort();
3362 if ( increment ) {
3363 ++absZ0;
3364 if ( absZ0 == 0 ) goto overflow;
3365 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
3367 z = absZ0;
3368 if ( zSign ) z = - z;
3369 if ( z && ( ( z < 0 ) ^ zSign ) ) {
3370 overflow:
3371 float_raise(float_flag_invalid, status);
3372 return
3373 zSign ? (int64_t) LIT64( 0x8000000000000000 )
3374 : LIT64( 0x7FFFFFFFFFFFFFFF );
3376 if (absZ1) {
3377 status->float_exception_flags |= float_flag_inexact;
3379 return z;
3383 /*----------------------------------------------------------------------------
3384 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3385 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3386 | and returns the properly rounded 64-bit unsigned integer corresponding to the
3387 | input. Ordinarily, the fixed-point input is simply rounded to an integer,
3388 | with the inexact exception raised if the input cannot be represented exactly
3389 | as an integer. However, if the fixed-point input is too large, the invalid
3390 | exception is raised and the largest unsigned integer is returned.
3391 *----------------------------------------------------------------------------*/
3393 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
3394 uint64_t absZ1, float_status *status)
3396 int8_t roundingMode;
3397 flag roundNearestEven, increment;
3399 roundingMode = status->float_rounding_mode;
3400 roundNearestEven = (roundingMode == float_round_nearest_even);
3401 switch (roundingMode) {
3402 case float_round_nearest_even:
3403 case float_round_ties_away:
3404 increment = ((int64_t)absZ1 < 0);
3405 break;
3406 case float_round_to_zero:
3407 increment = 0;
3408 break;
3409 case float_round_up:
3410 increment = !zSign && absZ1;
3411 break;
3412 case float_round_down:
3413 increment = zSign && absZ1;
3414 break;
3415 default:
3416 abort();
3418 if (increment) {
3419 ++absZ0;
3420 if (absZ0 == 0) {
3421 float_raise(float_flag_invalid, status);
3422 return LIT64(0xFFFFFFFFFFFFFFFF);
3424 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
3427 if (zSign && absZ0) {
3428 float_raise(float_flag_invalid, status);
3429 return 0;
3432 if (absZ1) {
3433 status->float_exception_flags |= float_flag_inexact;
3435 return absZ0;
3438 /*----------------------------------------------------------------------------
3439 | If `a' is denormal and we are in flush-to-zero mode then set the
3440 | input-denormal exception and return zero. Otherwise just return the value.
3441 *----------------------------------------------------------------------------*/
3442 float32 float32_squash_input_denormal(float32 a, float_status *status)
3444 if (status->flush_inputs_to_zero) {
3445 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
3446 float_raise(float_flag_input_denormal, status);
3447 return make_float32(float32_val(a) & 0x80000000);
3450 return a;
/*----------------------------------------------------------------------------
| Normalizes a subnormal single-precision significand.  `aSig' is shifted
| left by (clz32(aSig) - 8) bit positions; the shifted significand is written
| to `*zSigPtr' and the matching exponent, 1 minus the shift distance, is
| written to `*zExpPtr'.
*----------------------------------------------------------------------------*/

static void
normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
{
    const int8_t shift = clz32(aSig) - 8;

    *zSigPtr = aSig << shift;
    *zExpPtr = 1 - shift;
}
3471 /*----------------------------------------------------------------------------
3472 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3473 | and significand `zSig', and returns the proper single-precision floating-
3474 | point value corresponding to the abstract input. Ordinarily, the abstract
3475 | value is simply rounded and packed into the single-precision format, with
3476 | the inexact exception raised if the abstract input cannot be represented
3477 | exactly. However, if the abstract value is too large, the overflow and
3478 | inexact exceptions are raised and an infinity or maximal finite value is
3479 | returned. If the abstract value is too small, the input value is rounded to
3480 | a subnormal number, and the underflow and inexact exceptions are raised if
3481 | the abstract input cannot be represented exactly as a subnormal single-
3482 | precision floating-point number.
3483 | The input significand `zSig' has its binary point between bits 30
3484 | and 29, which is 7 bits to the left of the usual location. This shifted
3485 | significand must be normalized or smaller. If `zSig' is not normalized,
3486 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3487 | and it must not require rounding. In the usual case that `zSig' is
3488 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3489 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3490 | Binary Floating-Point Arithmetic.
3491 *----------------------------------------------------------------------------*/
3493 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
3494 float_status *status)
3496 int8_t roundingMode;
3497 flag roundNearestEven;
3498 int8_t roundIncrement, roundBits;
3499 flag isTiny;
3501 roundingMode = status->float_rounding_mode;
3502 roundNearestEven = ( roundingMode == float_round_nearest_even );
3503 switch (roundingMode) {
3504 case float_round_nearest_even:
3505 case float_round_ties_away:
3506 roundIncrement = 0x40;
3507 break;
3508 case float_round_to_zero:
3509 roundIncrement = 0;
3510 break;
3511 case float_round_up:
3512 roundIncrement = zSign ? 0 : 0x7f;
3513 break;
3514 case float_round_down:
3515 roundIncrement = zSign ? 0x7f : 0;
3516 break;
3517 default:
3518 abort();
3519 break;
3521 roundBits = zSig & 0x7F;
3522 if ( 0xFD <= (uint16_t) zExp ) {
3523 if ( ( 0xFD < zExp )
3524 || ( ( zExp == 0xFD )
3525 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
3527 float_raise(float_flag_overflow | float_flag_inexact, status);
3528 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
3530 if ( zExp < 0 ) {
3531 if (status->flush_to_zero) {
3532 float_raise(float_flag_output_denormal, status);
3533 return packFloat32(zSign, 0, 0);
3535 isTiny =
3536 (status->float_detect_tininess
3537 == float_tininess_before_rounding)
3538 || ( zExp < -1 )
3539 || ( zSig + roundIncrement < 0x80000000 );
3540 shift32RightJamming( zSig, - zExp, &zSig );
3541 zExp = 0;
3542 roundBits = zSig & 0x7F;
3543 if (isTiny && roundBits) {
3544 float_raise(float_flag_underflow, status);
3548 if (roundBits) {
3549 status->float_exception_flags |= float_flag_inexact;
3551 zSig = ( zSig + roundIncrement )>>7;
3552 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
3553 if ( zSig == 0 ) zExp = 0;
3554 return packFloat32( zSign, zExp, zSig );
3558 /*----------------------------------------------------------------------------
3559 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3560 | and significand `zSig', and returns the proper single-precision floating-
3561 | point value corresponding to the abstract input. This routine is just like
3562 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
3563 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
3564 | floating-point exponent.
3565 *----------------------------------------------------------------------------*/
3567 static float32
3568 normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
3569 float_status *status)
3571 int8_t shiftCount;
3573 shiftCount = clz32(zSig) - 1;
3574 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
3575 status);
3579 /*----------------------------------------------------------------------------
3580 | If `a' is denormal and we are in flush-to-zero mode then set the
3581 | input-denormal exception and return zero. Otherwise just return the value.
3582 *----------------------------------------------------------------------------*/
3583 float64 float64_squash_input_denormal(float64 a, float_status *status)
3585 if (status->flush_inputs_to_zero) {
3586 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
3587 float_raise(float_flag_input_denormal, status);
3588 return make_float64(float64_val(a) & (1ULL << 63));
3591 return a;
/*----------------------------------------------------------------------------
| Normalizes a subnormal double-precision significand.  `aSig' is shifted
| left by (clz64(aSig) - 11) bit positions; the shifted significand is
| written to `*zSigPtr' and the matching exponent, 1 minus the shift
| distance, is written to `*zExpPtr'.
*----------------------------------------------------------------------------*/

static void
normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
{
    const int8_t shift = clz64(aSig) - 11;

    *zSigPtr = aSig << shift;
    *zExpPtr = 1 - shift;
}
3612 /*----------------------------------------------------------------------------
3613 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3614 | double-precision floating-point value, returning the result. After being
3615 | shifted into the proper positions, the three fields are simply added
3616 | together to form the result. This means that any integer portion of `zSig'
3617 | will be added into the exponent. Since a properly normalized significand
3618 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3619 | than the desired result exponent whenever `zSig' is a complete, normalized
3620 | significand.
3621 *----------------------------------------------------------------------------*/
3623 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
3626 return make_float64(
3627 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
3631 /*----------------------------------------------------------------------------
3632 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3633 | and significand `zSig', and returns the proper double-precision floating-
3634 | point value corresponding to the abstract input. Ordinarily, the abstract
3635 | value is simply rounded and packed into the double-precision format, with
3636 | the inexact exception raised if the abstract input cannot be represented
3637 | exactly. However, if the abstract value is too large, the overflow and
3638 | inexact exceptions are raised and an infinity or maximal finite value is
3639 | returned. If the abstract value is too small, the input value is rounded to
3640 | a subnormal number, and the underflow and inexact exceptions are raised if
3641 | the abstract input cannot be represented exactly as a subnormal double-
3642 | precision floating-point number.
3643 | The input significand `zSig' has its binary point between bits 62
3644 | and 61, which is 10 bits to the left of the usual location. This shifted
3645 | significand must be normalized or smaller. If `zSig' is not normalized,
3646 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3647 | and it must not require rounding. In the usual case that `zSig' is
3648 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3649 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3650 | Binary Floating-Point Arithmetic.
3651 *----------------------------------------------------------------------------*/
3653 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
3654 float_status *status)
3656 int8_t roundingMode;
3657 flag roundNearestEven;
3658 int roundIncrement, roundBits;
3659 flag isTiny;
3661 roundingMode = status->float_rounding_mode;
3662 roundNearestEven = ( roundingMode == float_round_nearest_even );
3663 switch (roundingMode) {
3664 case float_round_nearest_even:
3665 case float_round_ties_away:
3666 roundIncrement = 0x200;
3667 break;
3668 case float_round_to_zero:
3669 roundIncrement = 0;
3670 break;
3671 case float_round_up:
3672 roundIncrement = zSign ? 0 : 0x3ff;
3673 break;
3674 case float_round_down:
3675 roundIncrement = zSign ? 0x3ff : 0;
3676 break;
3677 case float_round_to_odd:
3678 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
3679 break;
3680 default:
3681 abort();
3683 roundBits = zSig & 0x3FF;
3684 if ( 0x7FD <= (uint16_t) zExp ) {
3685 if ( ( 0x7FD < zExp )
3686 || ( ( zExp == 0x7FD )
3687 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
3689 bool overflow_to_inf = roundingMode != float_round_to_odd &&
3690 roundIncrement != 0;
3691 float_raise(float_flag_overflow | float_flag_inexact, status);
3692 return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
3694 if ( zExp < 0 ) {
3695 if (status->flush_to_zero) {
3696 float_raise(float_flag_output_denormal, status);
3697 return packFloat64(zSign, 0, 0);
3699 isTiny =
3700 (status->float_detect_tininess
3701 == float_tininess_before_rounding)
3702 || ( zExp < -1 )
3703 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
3704 shift64RightJamming( zSig, - zExp, &zSig );
3705 zExp = 0;
3706 roundBits = zSig & 0x3FF;
3707 if (isTiny && roundBits) {
3708 float_raise(float_flag_underflow, status);
3710 if (roundingMode == float_round_to_odd) {
3712 * For round-to-odd case, the roundIncrement depends on
3713 * zSig which just changed.
3715 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
3719 if (roundBits) {
3720 status->float_exception_flags |= float_flag_inexact;
3722 zSig = ( zSig + roundIncrement )>>10;
3723 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
3724 if ( zSig == 0 ) zExp = 0;
3725 return packFloat64( zSign, zExp, zSig );
3729 /*----------------------------------------------------------------------------
3730 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3731 | and significand `zSig', and returns the proper double-precision floating-
3732 | point value corresponding to the abstract input. This routine is just like
3733 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
3734 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
3735 | floating-point exponent.
3736 *----------------------------------------------------------------------------*/
3738 static float64
3739 normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
3740 float_status *status)
3742 int8_t shiftCount;
3744 shiftCount = clz64(zSig) - 1;
3745 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
3746 status);
/*----------------------------------------------------------------------------
| Normalizes a subnormal extended double-precision significand.  `aSig' is
| shifted left by clz64(aSig) bit positions; the shifted significand is
| written to `*zSigPtr' and the matching exponent, 1 minus the shift
| distance, is written to `*zExpPtr'.
*----------------------------------------------------------------------------*/

void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
                                uint64_t *zSigPtr)
{
    const int8_t leadingZeros = clz64(aSig);

    *zSigPtr = aSig << leadingZeros;
    *zExpPtr = 1 - leadingZeros;
}
3767 /*----------------------------------------------------------------------------
3768 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3769 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
3770 | and returns the proper extended double-precision floating-point value
3771 | corresponding to the abstract input. Ordinarily, the abstract value is
3772 | rounded and packed into the extended double-precision format, with the
3773 | inexact exception raised if the abstract input cannot be represented
3774 | exactly. However, if the abstract value is too large, the overflow and
3775 | inexact exceptions are raised and an infinity or maximal finite value is
3776 | returned. If the abstract value is too small, the input value is rounded to
3777 | a subnormal number, and the underflow and inexact exceptions are raised if
3778 | the abstract input cannot be represented exactly as a subnormal extended
3779 | double-precision floating-point number.
3780 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
3781 | number of bits as single or double precision, respectively. Otherwise, the
3782 | result is rounded to the full precision of the extended double-precision
3783 | format.
3784 | The input significand must be normalized or smaller. If the input
3785 | significand is not normalized, `zExp' must be 0; in that case, the result
3786 | returned is a subnormal number, and it must not require rounding. The
3787 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
3788 | Floating-Point Arithmetic.
3789 *----------------------------------------------------------------------------*/
3791 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
3792 int32_t zExp, uint64_t zSig0, uint64_t zSig1,
3793 float_status *status)
3795 int8_t roundingMode;
3796 flag roundNearestEven, increment, isTiny;
3797 int64_t roundIncrement, roundMask, roundBits;
3799 roundingMode = status->float_rounding_mode;
3800 roundNearestEven = ( roundingMode == float_round_nearest_even );
3801 if ( roundingPrecision == 80 ) goto precision80;
3802 if ( roundingPrecision == 64 ) {
3803 roundIncrement = LIT64( 0x0000000000000400 );
3804 roundMask = LIT64( 0x00000000000007FF );
3806 else if ( roundingPrecision == 32 ) {
3807 roundIncrement = LIT64( 0x0000008000000000 );
3808 roundMask = LIT64( 0x000000FFFFFFFFFF );
3810 else {
3811 goto precision80;
3813 zSig0 |= ( zSig1 != 0 );
3814 switch (roundingMode) {
3815 case float_round_nearest_even:
3816 case float_round_ties_away:
3817 break;
3818 case float_round_to_zero:
3819 roundIncrement = 0;
3820 break;
3821 case float_round_up:
3822 roundIncrement = zSign ? 0 : roundMask;
3823 break;
3824 case float_round_down:
3825 roundIncrement = zSign ? roundMask : 0;
3826 break;
3827 default:
3828 abort();
3830 roundBits = zSig0 & roundMask;
3831 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
3832 if ( ( 0x7FFE < zExp )
3833 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
3835 goto overflow;
3837 if ( zExp <= 0 ) {
3838 if (status->flush_to_zero) {
3839 float_raise(float_flag_output_denormal, status);
3840 return packFloatx80(zSign, 0, 0);
3842 isTiny =
3843 (status->float_detect_tininess
3844 == float_tininess_before_rounding)
3845 || ( zExp < 0 )
3846 || ( zSig0 <= zSig0 + roundIncrement );
3847 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
3848 zExp = 0;
3849 roundBits = zSig0 & roundMask;
3850 if (isTiny && roundBits) {
3851 float_raise(float_flag_underflow, status);
3853 if (roundBits) {
3854 status->float_exception_flags |= float_flag_inexact;
3856 zSig0 += roundIncrement;
3857 if ( (int64_t) zSig0 < 0 ) zExp = 1;
3858 roundIncrement = roundMask + 1;
3859 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
3860 roundMask |= roundIncrement;
3862 zSig0 &= ~ roundMask;
3863 return packFloatx80( zSign, zExp, zSig0 );
3866 if (roundBits) {
3867 status->float_exception_flags |= float_flag_inexact;
3869 zSig0 += roundIncrement;
3870 if ( zSig0 < roundIncrement ) {
3871 ++zExp;
3872 zSig0 = LIT64( 0x8000000000000000 );
3874 roundIncrement = roundMask + 1;
3875 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
3876 roundMask |= roundIncrement;
3878 zSig0 &= ~ roundMask;
3879 if ( zSig0 == 0 ) zExp = 0;
3880 return packFloatx80( zSign, zExp, zSig0 );
3881 precision80:
3882 switch (roundingMode) {
3883 case float_round_nearest_even:
3884 case float_round_ties_away:
3885 increment = ((int64_t)zSig1 < 0);
3886 break;
3887 case float_round_to_zero:
3888 increment = 0;
3889 break;
3890 case float_round_up:
3891 increment = !zSign && zSig1;
3892 break;
3893 case float_round_down:
3894 increment = zSign && zSig1;
3895 break;
3896 default:
3897 abort();
3899 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
3900 if ( ( 0x7FFE < zExp )
3901 || ( ( zExp == 0x7FFE )
3902 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
3903 && increment
3906 roundMask = 0;
3907 overflow:
3908 float_raise(float_flag_overflow | float_flag_inexact, status);
3909 if ( ( roundingMode == float_round_to_zero )
3910 || ( zSign && ( roundingMode == float_round_up ) )
3911 || ( ! zSign && ( roundingMode == float_round_down ) )
3913 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
3915 return packFloatx80(zSign,
3916 floatx80_infinity_high,
3917 floatx80_infinity_low);
3919 if ( zExp <= 0 ) {
3920 isTiny =
3921 (status->float_detect_tininess
3922 == float_tininess_before_rounding)
3923 || ( zExp < 0 )
3924 || ! increment
3925 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
3926 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
3927 zExp = 0;
3928 if (isTiny && zSig1) {
3929 float_raise(float_flag_underflow, status);
3931 if (zSig1) {
3932 status->float_exception_flags |= float_flag_inexact;
3934 switch (roundingMode) {
3935 case float_round_nearest_even:
3936 case float_round_ties_away:
3937 increment = ((int64_t)zSig1 < 0);
3938 break;
3939 case float_round_to_zero:
3940 increment = 0;
3941 break;
3942 case float_round_up:
3943 increment = !zSign && zSig1;
3944 break;
3945 case float_round_down:
3946 increment = zSign && zSig1;
3947 break;
3948 default:
3949 abort();
3951 if ( increment ) {
3952 ++zSig0;
3953 zSig0 &=
3954 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
3955 if ( (int64_t) zSig0 < 0 ) zExp = 1;
3957 return packFloatx80( zSign, zExp, zSig0 );
3960 if (zSig1) {
3961 status->float_exception_flags |= float_flag_inexact;
3963 if ( increment ) {
3964 ++zSig0;
3965 if ( zSig0 == 0 ) {
3966 ++zExp;
3967 zSig0 = LIT64( 0x8000000000000000 );
3969 else {
3970 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
3973 else {
3974 if ( zSig0 == 0 ) zExp = 0;
3976 return packFloatx80( zSign, zExp, zSig0 );
3980 /*----------------------------------------------------------------------------
3981 | Takes an abstract floating-point value having sign `zSign', exponent
3982 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
3983 | and returns the proper extended double-precision floating-point value
3984 | corresponding to the abstract input. This routine is just like
3985 | `roundAndPackFloatx80' except that the input significand does not have to be
3986 | normalized.
3987 *----------------------------------------------------------------------------*/
3989 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
3990 flag zSign, int32_t zExp,
3991 uint64_t zSig0, uint64_t zSig1,
3992 float_status *status)
3994 int8_t shiftCount;
3996 if ( zSig0 == 0 ) {
3997 zSig0 = zSig1;
3998 zSig1 = 0;
3999 zExp -= 64;
4001 shiftCount = clz64(zSig0);
4002 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4003 zExp -= shiftCount;
4004 return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
4005 zSig0, zSig1, status);
4009 /*----------------------------------------------------------------------------
4010 | Returns the least-significant 64 fraction bits of the quadruple-precision
4011 | floating-point value `a'.
4012 *----------------------------------------------------------------------------*/
4014 static inline uint64_t extractFloat128Frac1( float128 a )
4017 return a.low;
4021 /*----------------------------------------------------------------------------
4022 | Returns the most-significant 48 fraction bits of the quadruple-precision
4023 | floating-point value `a'.
4024 *----------------------------------------------------------------------------*/
4026 static inline uint64_t extractFloat128Frac0( float128 a )
4029 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
4033 /*----------------------------------------------------------------------------
4034 | Returns the exponent bits of the quadruple-precision floating-point value
4035 | `a'.
4036 *----------------------------------------------------------------------------*/
4038 static inline int32_t extractFloat128Exp( float128 a )
4041 return ( a.high>>48 ) & 0x7FFF;
4045 /*----------------------------------------------------------------------------
4046 | Returns the sign bit of the quadruple-precision floating-point value `a'.
4047 *----------------------------------------------------------------------------*/
4049 static inline flag extractFloat128Sign( float128 a )
4052 return a.high>>63;
/*----------------------------------------------------------------------------
| Normalizes the subnormal quadruple-precision significand `aSig0':`aSig1'.
| The normalized exponent is written to `*zExpPtr'; the most significant 49
| bits of the normalized significand go to `*zSig0Ptr' and the least
| significant 64 bits to `*zSig1Ptr'.  When the high input word is zero the
| normalization works from the low word alone, which may require moving bits
| in either direction between the two output words.
*----------------------------------------------------------------------------*/

static void
normalizeFloat128Subnormal(
    uint64_t aSig0,
    uint64_t aSig1,
    int32_t *zExpPtr,
    uint64_t *zSig0Ptr,
    uint64_t *zSig1Ptr
)
{
    int8_t shift;

    if (aSig0 != 0) {
        /* High word is populated: an ordinary 128-bit left shift. */
        shift = clz64(aSig0) - 15;
        shortShift128Left(aSig0, aSig1, shift, zSig0Ptr, zSig1Ptr);
        *zExpPtr = 1 - shift;
        return;
    }

    /* High word is empty: distribute the low word over both outputs. */
    shift = clz64(aSig1) - 15;
    if (shift >= 0) {
        *zSig0Ptr = aSig1 << shift;
        *zSig1Ptr = 0;
    } else {
        /* Fewer than 15 leading zeros: the top bits spill into word 0. */
        *zSig0Ptr = aSig1 >> (-shift);
        *zSig1Ptr = aSig1 << (shift & 63);
    }
    *zExpPtr = -shift - 63;
}
4097 /*----------------------------------------------------------------------------
4098 | Packs the sign `zSign', the exponent `zExp', and the significand formed
4099 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
4100 | floating-point value, returning the result. After being shifted into the
4101 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
4102 | added together to form the most significant 32 bits of the result. This
4103 | means that any integer portion of `zSig0' will be added into the exponent.
4104 | Since a properly normalized significand will have an integer portion equal
4105 | to 1, the `zExp' input should be 1 less than the desired result exponent
4106 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
4107 | significand.
4108 *----------------------------------------------------------------------------*/
4110 static inline float128
4111 packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
4113 float128 z;
4115 z.low = zSig1;
4116 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
4117 return z;
4121 /*----------------------------------------------------------------------------
4122 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4123 | and extended significand formed by the concatenation of `zSig0', `zSig1',
4124 | and `zSig2', and returns the proper quadruple-precision floating-point value
4125 | corresponding to the abstract input. Ordinarily, the abstract value is
4126 | simply rounded and packed into the quadruple-precision format, with the
4127 | inexact exception raised if the abstract input cannot be represented
4128 | exactly. However, if the abstract value is too large, the overflow and
4129 | inexact exceptions are raised and an infinity or maximal finite value is
4130 | returned. If the abstract value is too small, the input value is rounded to
4131 | a subnormal number, and the underflow and inexact exceptions are raised if
4132 | the abstract input cannot be represented exactly as a subnormal quadruple-
4133 | precision floating-point number.
4134 | The input significand must be normalized or smaller. If the input
4135 | significand is not normalized, `zExp' must be 0; in that case, the result
4136 | returned is a subnormal number, and it must not require rounding. In the
4137 | usual case that the input significand is normalized, `zExp' must be 1 less
4138 | than the ``true'' floating-point exponent. The handling of underflow and
4139 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4140 *----------------------------------------------------------------------------*/
4142 static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
4143 uint64_t zSig0, uint64_t zSig1,
4144 uint64_t zSig2, float_status *status)
4146 int8_t roundingMode;
4147 flag roundNearestEven, increment, isTiny;
4149 roundingMode = status->float_rounding_mode;
4150 roundNearestEven = ( roundingMode == float_round_nearest_even );
4151 switch (roundingMode) {
4152 case float_round_nearest_even:
4153 case float_round_ties_away:
4154 increment = ((int64_t)zSig2 < 0);
4155 break;
4156 case float_round_to_zero:
4157 increment = 0;
4158 break;
4159 case float_round_up:
4160 increment = !zSign && zSig2;
4161 break;
4162 case float_round_down:
4163 increment = zSign && zSig2;
4164 break;
4165 case float_round_to_odd:
4166 increment = !(zSig1 & 0x1) && zSig2;
4167 break;
4168 default:
4169 abort();
4171 if ( 0x7FFD <= (uint32_t) zExp ) {
4172 if ( ( 0x7FFD < zExp )
4173 || ( ( zExp == 0x7FFD )
4174 && eq128(
4175 LIT64( 0x0001FFFFFFFFFFFF ),
4176 LIT64( 0xFFFFFFFFFFFFFFFF ),
4177 zSig0,
4178 zSig1
4180 && increment
4183 float_raise(float_flag_overflow | float_flag_inexact, status);
4184 if ( ( roundingMode == float_round_to_zero )
4185 || ( zSign && ( roundingMode == float_round_up ) )
4186 || ( ! zSign && ( roundingMode == float_round_down ) )
4187 || (roundingMode == float_round_to_odd)
4189 return
4190 packFloat128(
4191 zSign,
4192 0x7FFE,
4193 LIT64( 0x0000FFFFFFFFFFFF ),
4194 LIT64( 0xFFFFFFFFFFFFFFFF )
4197 return packFloat128( zSign, 0x7FFF, 0, 0 );
4199 if ( zExp < 0 ) {
4200 if (status->flush_to_zero) {
4201 float_raise(float_flag_output_denormal, status);
4202 return packFloat128(zSign, 0, 0, 0);
4204 isTiny =
4205 (status->float_detect_tininess
4206 == float_tininess_before_rounding)
4207 || ( zExp < -1 )
4208 || ! increment
4209 || lt128(
4210 zSig0,
4211 zSig1,
4212 LIT64( 0x0001FFFFFFFFFFFF ),
4213 LIT64( 0xFFFFFFFFFFFFFFFF )
4215 shift128ExtraRightJamming(
4216 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
4217 zExp = 0;
4218 if (isTiny && zSig2) {
4219 float_raise(float_flag_underflow, status);
4221 switch (roundingMode) {
4222 case float_round_nearest_even:
4223 case float_round_ties_away:
4224 increment = ((int64_t)zSig2 < 0);
4225 break;
4226 case float_round_to_zero:
4227 increment = 0;
4228 break;
4229 case float_round_up:
4230 increment = !zSign && zSig2;
4231 break;
4232 case float_round_down:
4233 increment = zSign && zSig2;
4234 break;
4235 case float_round_to_odd:
4236 increment = !(zSig1 & 0x1) && zSig2;
4237 break;
4238 default:
4239 abort();
4243 if (zSig2) {
4244 status->float_exception_flags |= float_flag_inexact;
4246 if ( increment ) {
4247 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
4248 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
4250 else {
4251 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
4253 return packFloat128( zSign, zExp, zSig0, zSig1 );
4257 /*----------------------------------------------------------------------------
4258 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4259 | and significand formed by the concatenation of `zSig0' and `zSig1', and
4260 | returns the proper quadruple-precision floating-point value corresponding
4261 | to the abstract input. This routine is just like `roundAndPackFloat128'
4262 | except that the input significand has fewer bits and does not have to be
4263 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
4264 | point exponent.
4265 *----------------------------------------------------------------------------*/
4267 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
4268 uint64_t zSig0, uint64_t zSig1,
4269 float_status *status)
4271 int8_t shiftCount;
4272 uint64_t zSig2;
4274 if ( zSig0 == 0 ) {
4275 zSig0 = zSig1;
4276 zSig1 = 0;
4277 zExp -= 64;
4279 shiftCount = clz64(zSig0) - 15;
4280 if ( 0 <= shiftCount ) {
4281 zSig2 = 0;
4282 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4284 else {
4285 shift128ExtraRightJamming(
4286 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
4288 zExp -= shiftCount;
4289 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
4294 /*----------------------------------------------------------------------------
4295 | Returns the result of converting the 32-bit two's complement integer `a'
4296 | to the extended double-precision floating-point format. The conversion
4297 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4298 | Arithmetic.
4299 *----------------------------------------------------------------------------*/
4301 floatx80 int32_to_floatx80(int32_t a, float_status *status)
4303 flag zSign;
4304 uint32_t absA;
4305 int8_t shiftCount;
4306 uint64_t zSig;
4308 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
4309 zSign = ( a < 0 );
4310 absA = zSign ? - a : a;
4311 shiftCount = clz32(absA) + 32;
4312 zSig = absA;
4313 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
4317 /*----------------------------------------------------------------------------
4318 | Returns the result of converting the 32-bit two's complement integer `a' to
4319 | the quadruple-precision floating-point format. The conversion is performed
4320 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4321 *----------------------------------------------------------------------------*/
4323 float128 int32_to_float128(int32_t a, float_status *status)
4325 flag zSign;
4326 uint32_t absA;
4327 int8_t shiftCount;
4328 uint64_t zSig0;
4330 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
4331 zSign = ( a < 0 );
4332 absA = zSign ? - a : a;
4333 shiftCount = clz32(absA) + 17;
4334 zSig0 = absA;
4335 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
4339 /*----------------------------------------------------------------------------
4340 | Returns the result of converting the 64-bit two's complement integer `a'
4341 | to the extended double-precision floating-point format. The conversion
4342 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4343 | Arithmetic.
4344 *----------------------------------------------------------------------------*/
4346 floatx80 int64_to_floatx80(int64_t a, float_status *status)
4348 flag zSign;
4349 uint64_t absA;
4350 int8_t shiftCount;
4352 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
4353 zSign = ( a < 0 );
4354 absA = zSign ? - a : a;
4355 shiftCount = clz64(absA);
4356 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
4360 /*----------------------------------------------------------------------------
4361 | Returns the result of converting the 64-bit two's complement integer `a' to
4362 | the quadruple-precision floating-point format. The conversion is performed
4363 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4364 *----------------------------------------------------------------------------*/
4366 float128 int64_to_float128(int64_t a, float_status *status)
4368 flag zSign;
4369 uint64_t absA;
4370 int8_t shiftCount;
4371 int32_t zExp;
4372 uint64_t zSig0, zSig1;
4374 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
4375 zSign = ( a < 0 );
4376 absA = zSign ? - a : a;
4377 shiftCount = clz64(absA) + 49;
4378 zExp = 0x406E - shiftCount;
4379 if ( 64 <= shiftCount ) {
4380 zSig1 = 0;
4381 zSig0 = absA;
4382 shiftCount -= 64;
4384 else {
4385 zSig1 = absA;
4386 zSig0 = 0;
4388 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4389 return packFloat128( zSign, zExp, zSig0, zSig1 );
4393 /*----------------------------------------------------------------------------
4394 | Returns the result of converting the 64-bit unsigned integer `a'
4395 | to the quadruple-precision floating-point format. The conversion is performed
4396 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4397 *----------------------------------------------------------------------------*/
4399 float128 uint64_to_float128(uint64_t a, float_status *status)
4401 if (a == 0) {
4402 return float128_zero;
4404 return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status);
4407 /*----------------------------------------------------------------------------
4408 | Returns the result of converting the single-precision floating-point value
4409 | `a' to the extended double-precision floating-point format. The conversion
4410 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4411 | Arithmetic.
4412 *----------------------------------------------------------------------------*/
4414 floatx80 float32_to_floatx80(float32 a, float_status *status)
4416 flag aSign;
4417 int aExp;
4418 uint32_t aSig;
4420 a = float32_squash_input_denormal(a, status);
4421 aSig = extractFloat32Frac( a );
4422 aExp = extractFloat32Exp( a );
4423 aSign = extractFloat32Sign( a );
4424 if ( aExp == 0xFF ) {
4425 if (aSig) {
4426 return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
4428 return packFloatx80(aSign,
4429 floatx80_infinity_high,
4430 floatx80_infinity_low);
4432 if ( aExp == 0 ) {
4433 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4434 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4436 aSig |= 0x00800000;
4437 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
4441 /*----------------------------------------------------------------------------
4442 | Returns the result of converting the single-precision floating-point value
4443 | `a' to the double-precision floating-point format. The conversion is
4444 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4445 | Arithmetic.
4446 *----------------------------------------------------------------------------*/
4448 float128 float32_to_float128(float32 a, float_status *status)
4450 flag aSign;
4451 int aExp;
4452 uint32_t aSig;
4454 a = float32_squash_input_denormal(a, status);
4455 aSig = extractFloat32Frac( a );
4456 aExp = extractFloat32Exp( a );
4457 aSign = extractFloat32Sign( a );
4458 if ( aExp == 0xFF ) {
4459 if (aSig) {
4460 return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
4462 return packFloat128( aSign, 0x7FFF, 0, 0 );
4464 if ( aExp == 0 ) {
4465 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4466 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4467 --aExp;
4469 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
4473 /*----------------------------------------------------------------------------
4474 | Returns the remainder of the single-precision floating-point value `a'
4475 | with respect to the corresponding value `b'. The operation is performed
4476 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4477 *----------------------------------------------------------------------------*/
4479 float32 float32_rem(float32 a, float32 b, float_status *status)
4481 flag aSign, zSign;
4482 int aExp, bExp, expDiff;
4483 uint32_t aSig, bSig;
4484 uint32_t q;
4485 uint64_t aSig64, bSig64, q64;
4486 uint32_t alternateASig;
4487 int32_t sigMean;
4488 a = float32_squash_input_denormal(a, status);
4489 b = float32_squash_input_denormal(b, status);
4491 aSig = extractFloat32Frac( a );
4492 aExp = extractFloat32Exp( a );
4493 aSign = extractFloat32Sign( a );
4494 bSig = extractFloat32Frac( b );
4495 bExp = extractFloat32Exp( b );
4496 if ( aExp == 0xFF ) {
4497 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
4498 return propagateFloat32NaN(a, b, status);
4500 float_raise(float_flag_invalid, status);
4501 return float32_default_nan(status);
4503 if ( bExp == 0xFF ) {
4504 if (bSig) {
4505 return propagateFloat32NaN(a, b, status);
4507 return a;
4509 if ( bExp == 0 ) {
4510 if ( bSig == 0 ) {
4511 float_raise(float_flag_invalid, status);
4512 return float32_default_nan(status);
4514 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
4516 if ( aExp == 0 ) {
4517 if ( aSig == 0 ) return a;
4518 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4520 expDiff = aExp - bExp;
4521 aSig |= 0x00800000;
4522 bSig |= 0x00800000;
4523 if ( expDiff < 32 ) {
4524 aSig <<= 8;
4525 bSig <<= 8;
4526 if ( expDiff < 0 ) {
4527 if ( expDiff < -1 ) return a;
4528 aSig >>= 1;
4530 q = ( bSig <= aSig );
4531 if ( q ) aSig -= bSig;
4532 if ( 0 < expDiff ) {
4533 q = ( ( (uint64_t) aSig )<<32 ) / bSig;
4534 q >>= 32 - expDiff;
4535 bSig >>= 2;
4536 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4538 else {
4539 aSig >>= 2;
4540 bSig >>= 2;
4543 else {
4544 if ( bSig <= aSig ) aSig -= bSig;
4545 aSig64 = ( (uint64_t) aSig )<<40;
4546 bSig64 = ( (uint64_t) bSig )<<40;
4547 expDiff -= 64;
4548 while ( 0 < expDiff ) {
4549 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
4550 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
4551 aSig64 = - ( ( bSig * q64 )<<38 );
4552 expDiff -= 62;
4554 expDiff += 64;
4555 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
4556 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
4557 q = q64>>( 64 - expDiff );
4558 bSig <<= 6;
4559 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
4561 do {
4562 alternateASig = aSig;
4563 ++q;
4564 aSig -= bSig;
4565 } while ( 0 <= (int32_t) aSig );
4566 sigMean = aSig + alternateASig;
4567 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4568 aSig = alternateASig;
4570 zSign = ( (int32_t) aSig < 0 );
4571 if ( zSign ) aSig = - aSig;
4572 return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
/*----------------------------------------------------------------------------
| Returns the binary exponential of the single-precision floating-point value
| `a'. The operation is performed according to the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
|
| Uses the following identities:
|
| 1. -------------------------------------------------------------------------
|      x    x*ln(2)
|     2  = e
|
| 2. -------------------------------------------------------------------------
|                      2     3     4     5           n
|      x        x     x     x     x     x           x
|     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
|               1!    2!    3!    4!    5!          n!
*----------------------------------------------------------------------------*/
4595 static const float64 float32_exp2_coefficients[15] =
4597 const_float64( 0x3ff0000000000000ll ), /* 1 */
4598 const_float64( 0x3fe0000000000000ll ), /* 2 */
4599 const_float64( 0x3fc5555555555555ll ), /* 3 */
4600 const_float64( 0x3fa5555555555555ll ), /* 4 */
4601 const_float64( 0x3f81111111111111ll ), /* 5 */
4602 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */
4603 const_float64( 0x3f2a01a01a01a01all ), /* 7 */
4604 const_float64( 0x3efa01a01a01a01all ), /* 8 */
4605 const_float64( 0x3ec71de3a556c734ll ), /* 9 */
4606 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
4607 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
4608 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
4609 const_float64( 0x3de6124613a86d09ll ), /* 13 */
4610 const_float64( 0x3da93974a8c07c9dll ), /* 14 */
4611 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
4614 float32 float32_exp2(float32 a, float_status *status)
4616 flag aSign;
4617 int aExp;
4618 uint32_t aSig;
4619 float64 r, x, xn;
4620 int i;
4621 a = float32_squash_input_denormal(a, status);
4623 aSig = extractFloat32Frac( a );
4624 aExp = extractFloat32Exp( a );
4625 aSign = extractFloat32Sign( a );
4627 if ( aExp == 0xFF) {
4628 if (aSig) {
4629 return propagateFloat32NaN(a, float32_zero, status);
4631 return (aSign) ? float32_zero : a;
4633 if (aExp == 0) {
4634 if (aSig == 0) return float32_one;
4637 float_raise(float_flag_inexact, status);
4639 /* ******************************* */
4640 /* using float64 for approximation */
4641 /* ******************************* */
4642 x = float32_to_float64(a, status);
4643 x = float64_mul(x, float64_ln2, status);
4645 xn = x;
4646 r = float64_one;
4647 for (i = 0 ; i < 15 ; i++) {
4648 float64 f;
4650 f = float64_mul(xn, float32_exp2_coefficients[i], status);
4651 r = float64_add(r, f, status);
4653 xn = float64_mul(xn, x, status);
4656 return float64_to_float32(r, status);
4659 /*----------------------------------------------------------------------------
4660 | Returns the binary log of the single-precision floating-point value `a'.
4661 | The operation is performed according to the IEC/IEEE Standard for Binary
4662 | Floating-Point Arithmetic.
4663 *----------------------------------------------------------------------------*/
4664 float32 float32_log2(float32 a, float_status *status)
4666 flag aSign, zSign;
4667 int aExp;
4668 uint32_t aSig, zSig, i;
4670 a = float32_squash_input_denormal(a, status);
4671 aSig = extractFloat32Frac( a );
4672 aExp = extractFloat32Exp( a );
4673 aSign = extractFloat32Sign( a );
4675 if ( aExp == 0 ) {
4676 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
4677 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4679 if ( aSign ) {
4680 float_raise(float_flag_invalid, status);
4681 return float32_default_nan(status);
4683 if ( aExp == 0xFF ) {
4684 if (aSig) {
4685 return propagateFloat32NaN(a, float32_zero, status);
4687 return a;
4690 aExp -= 0x7F;
4691 aSig |= 0x00800000;
4692 zSign = aExp < 0;
4693 zSig = aExp << 23;
4695 for (i = 1 << 22; i > 0; i >>= 1) {
4696 aSig = ( (uint64_t)aSig * aSig ) >> 23;
4697 if ( aSig & 0x01000000 ) {
4698 aSig >>= 1;
4699 zSig |= i;
4703 if ( zSign )
4704 zSig = -zSig;
4706 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
4709 /*----------------------------------------------------------------------------
4710 | Returns 1 if the single-precision floating-point value `a' is equal to
4711 | the corresponding value `b', and 0 otherwise. The invalid exception is
4712 | raised if either operand is a NaN. Otherwise, the comparison is performed
4713 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4714 *----------------------------------------------------------------------------*/
4716 int float32_eq(float32 a, float32 b, float_status *status)
4718 uint32_t av, bv;
4719 a = float32_squash_input_denormal(a, status);
4720 b = float32_squash_input_denormal(b, status);
4722 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4723 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4725 float_raise(float_flag_invalid, status);
4726 return 0;
4728 av = float32_val(a);
4729 bv = float32_val(b);
4730 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
4733 /*----------------------------------------------------------------------------
4734 | Returns 1 if the single-precision floating-point value `a' is less than
4735 | or equal to the corresponding value `b', and 0 otherwise. The invalid
4736 | exception is raised if either operand is a NaN. The comparison is performed
4737 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4738 *----------------------------------------------------------------------------*/
4740 int float32_le(float32 a, float32 b, float_status *status)
4742 flag aSign, bSign;
4743 uint32_t av, bv;
4744 a = float32_squash_input_denormal(a, status);
4745 b = float32_squash_input_denormal(b, status);
4747 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4748 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4750 float_raise(float_flag_invalid, status);
4751 return 0;
4753 aSign = extractFloat32Sign( a );
4754 bSign = extractFloat32Sign( b );
4755 av = float32_val(a);
4756 bv = float32_val(b);
4757 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
4758 return ( av == bv ) || ( aSign ^ ( av < bv ) );
4762 /*----------------------------------------------------------------------------
4763 | Returns 1 if the single-precision floating-point value `a' is less than
4764 | the corresponding value `b', and 0 otherwise. The invalid exception is
4765 | raised if either operand is a NaN. The comparison is performed according
4766 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4767 *----------------------------------------------------------------------------*/
4769 int float32_lt(float32 a, float32 b, float_status *status)
4771 flag aSign, bSign;
4772 uint32_t av, bv;
4773 a = float32_squash_input_denormal(a, status);
4774 b = float32_squash_input_denormal(b, status);
4776 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4777 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4779 float_raise(float_flag_invalid, status);
4780 return 0;
4782 aSign = extractFloat32Sign( a );
4783 bSign = extractFloat32Sign( b );
4784 av = float32_val(a);
4785 bv = float32_val(b);
4786 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
4787 return ( av != bv ) && ( aSign ^ ( av < bv ) );
4791 /*----------------------------------------------------------------------------
4792 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
4793 | be compared, and 0 otherwise. The invalid exception is raised if either
4794 | operand is a NaN. The comparison is performed according to the IEC/IEEE
4795 | Standard for Binary Floating-Point Arithmetic.
4796 *----------------------------------------------------------------------------*/
4798 int float32_unordered(float32 a, float32 b, float_status *status)
4800 a = float32_squash_input_denormal(a, status);
4801 b = float32_squash_input_denormal(b, status);
4803 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4804 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4806 float_raise(float_flag_invalid, status);
4807 return 1;
4809 return 0;
4812 /*----------------------------------------------------------------------------
4813 | Returns 1 if the single-precision floating-point value `a' is equal to
4814 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4815 | exception. The comparison is performed according to the IEC/IEEE Standard
4816 | for Binary Floating-Point Arithmetic.
4817 *----------------------------------------------------------------------------*/
4819 int float32_eq_quiet(float32 a, float32 b, float_status *status)
4821 a = float32_squash_input_denormal(a, status);
4822 b = float32_squash_input_denormal(b, status);
4824 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4825 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4827 if (float32_is_signaling_nan(a, status)
4828 || float32_is_signaling_nan(b, status)) {
4829 float_raise(float_flag_invalid, status);
4831 return 0;
4833 return ( float32_val(a) == float32_val(b) ) ||
4834 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
4837 /*----------------------------------------------------------------------------
4838 | Returns 1 if the single-precision floating-point value `a' is less than or
4839 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4840 | cause an exception. Otherwise, the comparison is performed according to the
4841 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4842 *----------------------------------------------------------------------------*/
4844 int float32_le_quiet(float32 a, float32 b, float_status *status)
4846 flag aSign, bSign;
4847 uint32_t av, bv;
4848 a = float32_squash_input_denormal(a, status);
4849 b = float32_squash_input_denormal(b, status);
4851 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4852 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4854 if (float32_is_signaling_nan(a, status)
4855 || float32_is_signaling_nan(b, status)) {
4856 float_raise(float_flag_invalid, status);
4858 return 0;
4860 aSign = extractFloat32Sign( a );
4861 bSign = extractFloat32Sign( b );
4862 av = float32_val(a);
4863 bv = float32_val(b);
4864 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
4865 return ( av == bv ) || ( aSign ^ ( av < bv ) );
4869 /*----------------------------------------------------------------------------
4870 | Returns 1 if the single-precision floating-point value `a' is less than
4871 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4872 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
4873 | Standard for Binary Floating-Point Arithmetic.
4874 *----------------------------------------------------------------------------*/
4876 int float32_lt_quiet(float32 a, float32 b, float_status *status)
4878 flag aSign, bSign;
4879 uint32_t av, bv;
4880 a = float32_squash_input_denormal(a, status);
4881 b = float32_squash_input_denormal(b, status);
4883 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4884 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4886 if (float32_is_signaling_nan(a, status)
4887 || float32_is_signaling_nan(b, status)) {
4888 float_raise(float_flag_invalid, status);
4890 return 0;
4892 aSign = extractFloat32Sign( a );
4893 bSign = extractFloat32Sign( b );
4894 av = float32_val(a);
4895 bv = float32_val(b);
4896 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
4897 return ( av != bv ) && ( aSign ^ ( av < bv ) );
4901 /*----------------------------------------------------------------------------
4902 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
4903 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
4904 | comparison is performed according to the IEC/IEEE Standard for Binary
4905 | Floating-Point Arithmetic.
4906 *----------------------------------------------------------------------------*/
4908 int float32_unordered_quiet(float32 a, float32 b, float_status *status)
4910 a = float32_squash_input_denormal(a, status);
4911 b = float32_squash_input_denormal(b, status);
4913 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4914 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4916 if (float32_is_signaling_nan(a, status)
4917 || float32_is_signaling_nan(b, status)) {
4918 float_raise(float_flag_invalid, status);
4920 return 1;
4922 return 0;
4925 /*----------------------------------------------------------------------------
4926 | If `a' is denormal and we are in flush-to-zero mode then set the
4927 | input-denormal exception and return zero. Otherwise just return the value.
4928 *----------------------------------------------------------------------------*/
4929 float16 float16_squash_input_denormal(float16 a, float_status *status)
4931 if (status->flush_inputs_to_zero) {
4932 if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) {
4933 float_raise(float_flag_input_denormal, status);
4934 return make_float16(float16_val(a) & 0x8000);
4937 return a;
4940 /*----------------------------------------------------------------------------
4941 | Returns the result of converting the double-precision floating-point value
4942 | `a' to the extended double-precision floating-point format. The conversion
4943 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4944 | Arithmetic.
4945 *----------------------------------------------------------------------------*/
4947 floatx80 float64_to_floatx80(float64 a, float_status *status)
4949 flag aSign;
4950 int aExp;
4951 uint64_t aSig;
4953 a = float64_squash_input_denormal(a, status);
4954 aSig = extractFloat64Frac( a );
4955 aExp = extractFloat64Exp( a );
4956 aSign = extractFloat64Sign( a );
4957 if ( aExp == 0x7FF ) {
4958 if (aSig) {
4959 return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
4961 return packFloatx80(aSign,
4962 floatx80_infinity_high,
4963 floatx80_infinity_low);
4965 if ( aExp == 0 ) {
4966 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4967 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4969 return
4970 packFloatx80(
4971 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
4975 /*----------------------------------------------------------------------------
4976 | Returns the result of converting the double-precision floating-point value
4977 | `a' to the quadruple-precision floating-point format. The conversion is
4978 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4979 | Arithmetic.
4980 *----------------------------------------------------------------------------*/
4982 float128 float64_to_float128(float64 a, float_status *status)
4984 flag aSign;
4985 int aExp;
4986 uint64_t aSig, zSig0, zSig1;
4988 a = float64_squash_input_denormal(a, status);
4989 aSig = extractFloat64Frac( a );
4990 aExp = extractFloat64Exp( a );
4991 aSign = extractFloat64Sign( a );
4992 if ( aExp == 0x7FF ) {
4993 if (aSig) {
4994 return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
4996 return packFloat128( aSign, 0x7FFF, 0, 0 );
4998 if ( aExp == 0 ) {
4999 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
5000 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5001 --aExp;
5003 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
5004 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
5009 /*----------------------------------------------------------------------------
5010 | Returns the remainder of the double-precision floating-point value `a'
5011 | with respect to the corresponding value `b'. The operation is performed
5012 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5013 *----------------------------------------------------------------------------*/
5015 float64 float64_rem(float64 a, float64 b, float_status *status)
5017 flag aSign, zSign;
5018 int aExp, bExp, expDiff;
5019 uint64_t aSig, bSig;
5020 uint64_t q, alternateASig;
5021 int64_t sigMean;
5023 a = float64_squash_input_denormal(a, status);
5024 b = float64_squash_input_denormal(b, status);
5025 aSig = extractFloat64Frac( a );
5026 aExp = extractFloat64Exp( a );
5027 aSign = extractFloat64Sign( a );
5028 bSig = extractFloat64Frac( b );
5029 bExp = extractFloat64Exp( b );
5030 if ( aExp == 0x7FF ) {
5031 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
5032 return propagateFloat64NaN(a, b, status);
5034 float_raise(float_flag_invalid, status);
5035 return float64_default_nan(status);
5037 if ( bExp == 0x7FF ) {
5038 if (bSig) {
5039 return propagateFloat64NaN(a, b, status);
5041 return a;
5043 if ( bExp == 0 ) {
5044 if ( bSig == 0 ) {
5045 float_raise(float_flag_invalid, status);
5046 return float64_default_nan(status);
5048 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
5050 if ( aExp == 0 ) {
5051 if ( aSig == 0 ) return a;
5052 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5054 expDiff = aExp - bExp;
5055 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
5056 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
5057 if ( expDiff < 0 ) {
5058 if ( expDiff < -1 ) return a;
5059 aSig >>= 1;
5061 q = ( bSig <= aSig );
5062 if ( q ) aSig -= bSig;
5063 expDiff -= 64;
5064 while ( 0 < expDiff ) {
5065 q = estimateDiv128To64( aSig, 0, bSig );
5066 q = ( 2 < q ) ? q - 2 : 0;
5067 aSig = - ( ( bSig>>2 ) * q );
5068 expDiff -= 62;
5070 expDiff += 64;
5071 if ( 0 < expDiff ) {
5072 q = estimateDiv128To64( aSig, 0, bSig );
5073 q = ( 2 < q ) ? q - 2 : 0;
5074 q >>= 64 - expDiff;
5075 bSig >>= 2;
5076 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
5078 else {
5079 aSig >>= 2;
5080 bSig >>= 2;
5082 do {
5083 alternateASig = aSig;
5084 ++q;
5085 aSig -= bSig;
5086 } while ( 0 <= (int64_t) aSig );
5087 sigMean = aSig + alternateASig;
5088 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
5089 aSig = alternateASig;
5091 zSign = ( (int64_t) aSig < 0 );
5092 if ( zSign ) aSig = - aSig;
5093 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
5097 /*----------------------------------------------------------------------------
5098 | Returns the binary log of the double-precision floating-point value `a'.
5099 | The operation is performed according to the IEC/IEEE Standard for Binary
5100 | Floating-Point Arithmetic.
5101 *----------------------------------------------------------------------------*/
5102 float64 float64_log2(float64 a, float_status *status)
5104 flag aSign, zSign;
5105 int aExp;
5106 uint64_t aSig, aSig0, aSig1, zSig, i;
5107 a = float64_squash_input_denormal(a, status);
5109 aSig = extractFloat64Frac( a );
5110 aExp = extractFloat64Exp( a );
5111 aSign = extractFloat64Sign( a );
5113 if ( aExp == 0 ) {
5114 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
5115 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5117 if ( aSign ) {
5118 float_raise(float_flag_invalid, status);
5119 return float64_default_nan(status);
5121 if ( aExp == 0x7FF ) {
5122 if (aSig) {
5123 return propagateFloat64NaN(a, float64_zero, status);
5125 return a;
5128 aExp -= 0x3FF;
5129 aSig |= LIT64( 0x0010000000000000 );
5130 zSign = aExp < 0;
5131 zSig = (uint64_t)aExp << 52;
5132 for (i = 1LL << 51; i > 0; i >>= 1) {
5133 mul64To128( aSig, aSig, &aSig0, &aSig1 );
5134 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
5135 if ( aSig & LIT64( 0x0020000000000000 ) ) {
5136 aSig >>= 1;
5137 zSig |= i;
5141 if ( zSign )
5142 zSig = -zSig;
5143 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
5146 /*----------------------------------------------------------------------------
5147 | Returns 1 if the double-precision floating-point value `a' is equal to the
5148 | corresponding value `b', and 0 otherwise. The invalid exception is raised
5149 | if either operand is a NaN. Otherwise, the comparison is performed
5150 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5151 *----------------------------------------------------------------------------*/
5153 int float64_eq(float64 a, float64 b, float_status *status)
5155 uint64_t av, bv;
5156 a = float64_squash_input_denormal(a, status);
5157 b = float64_squash_input_denormal(b, status);
5159 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5160 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5162 float_raise(float_flag_invalid, status);
5163 return 0;
5165 av = float64_val(a);
5166 bv = float64_val(b);
5167 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
5171 /*----------------------------------------------------------------------------
5172 | Returns 1 if the double-precision floating-point value `a' is less than or
5173 | equal to the corresponding value `b', and 0 otherwise. The invalid
5174 | exception is raised if either operand is a NaN. The comparison is performed
5175 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5176 *----------------------------------------------------------------------------*/
5178 int float64_le(float64 a, float64 b, float_status *status)
5180 flag aSign, bSign;
5181 uint64_t av, bv;
5182 a = float64_squash_input_denormal(a, status);
5183 b = float64_squash_input_denormal(b, status);
5185 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5186 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5188 float_raise(float_flag_invalid, status);
5189 return 0;
5191 aSign = extractFloat64Sign( a );
5192 bSign = extractFloat64Sign( b );
5193 av = float64_val(a);
5194 bv = float64_val(b);
5195 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
5196 return ( av == bv ) || ( aSign ^ ( av < bv ) );
5200 /*----------------------------------------------------------------------------
5201 | Returns 1 if the double-precision floating-point value `a' is less than
5202 | the corresponding value `b', and 0 otherwise. The invalid exception is
5203 | raised if either operand is a NaN. The comparison is performed according
5204 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5205 *----------------------------------------------------------------------------*/
5207 int float64_lt(float64 a, float64 b, float_status *status)
5209 flag aSign, bSign;
5210 uint64_t av, bv;
5212 a = float64_squash_input_denormal(a, status);
5213 b = float64_squash_input_denormal(b, status);
5214 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5215 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5217 float_raise(float_flag_invalid, status);
5218 return 0;
5220 aSign = extractFloat64Sign( a );
5221 bSign = extractFloat64Sign( b );
5222 av = float64_val(a);
5223 bv = float64_val(b);
5224 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
5225 return ( av != bv ) && ( aSign ^ ( av < bv ) );
5229 /*----------------------------------------------------------------------------
5230 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
5231 | be compared, and 0 otherwise. The invalid exception is raised if either
5232 | operand is a NaN. The comparison is performed according to the IEC/IEEE
5233 | Standard for Binary Floating-Point Arithmetic.
5234 *----------------------------------------------------------------------------*/
5236 int float64_unordered(float64 a, float64 b, float_status *status)
5238 a = float64_squash_input_denormal(a, status);
5239 b = float64_squash_input_denormal(b, status);
5241 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5242 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5244 float_raise(float_flag_invalid, status);
5245 return 1;
5247 return 0;
5250 /*----------------------------------------------------------------------------
5251 | Returns 1 if the double-precision floating-point value `a' is equal to the
5252 | corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5253 | exception.The comparison is performed according to the IEC/IEEE Standard
5254 | for Binary Floating-Point Arithmetic.
5255 *----------------------------------------------------------------------------*/
5257 int float64_eq_quiet(float64 a, float64 b, float_status *status)
5259 uint64_t av, bv;
5260 a = float64_squash_input_denormal(a, status);
5261 b = float64_squash_input_denormal(b, status);
5263 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5264 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5266 if (float64_is_signaling_nan(a, status)
5267 || float64_is_signaling_nan(b, status)) {
5268 float_raise(float_flag_invalid, status);
5270 return 0;
5272 av = float64_val(a);
5273 bv = float64_val(b);
5274 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
5278 /*----------------------------------------------------------------------------
5279 | Returns 1 if the double-precision floating-point value `a' is less than or
5280 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5281 | cause an exception. Otherwise, the comparison is performed according to the
5282 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5283 *----------------------------------------------------------------------------*/
5285 int float64_le_quiet(float64 a, float64 b, float_status *status)
5287 flag aSign, bSign;
5288 uint64_t av, bv;
5289 a = float64_squash_input_denormal(a, status);
5290 b = float64_squash_input_denormal(b, status);
5292 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5293 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5295 if (float64_is_signaling_nan(a, status)
5296 || float64_is_signaling_nan(b, status)) {
5297 float_raise(float_flag_invalid, status);
5299 return 0;
5301 aSign = extractFloat64Sign( a );
5302 bSign = extractFloat64Sign( b );
5303 av = float64_val(a);
5304 bv = float64_val(b);
5305 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
5306 return ( av == bv ) || ( aSign ^ ( av < bv ) );
5310 /*----------------------------------------------------------------------------
5311 | Returns 1 if the double-precision floating-point value `a' is less than
5312 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5313 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
5314 | Standard for Binary Floating-Point Arithmetic.
5315 *----------------------------------------------------------------------------*/
5317 int float64_lt_quiet(float64 a, float64 b, float_status *status)
5319 flag aSign, bSign;
5320 uint64_t av, bv;
5321 a = float64_squash_input_denormal(a, status);
5322 b = float64_squash_input_denormal(b, status);
5324 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5325 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5327 if (float64_is_signaling_nan(a, status)
5328 || float64_is_signaling_nan(b, status)) {
5329 float_raise(float_flag_invalid, status);
5331 return 0;
5333 aSign = extractFloat64Sign( a );
5334 bSign = extractFloat64Sign( b );
5335 av = float64_val(a);
5336 bv = float64_val(b);
5337 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
5338 return ( av != bv ) && ( aSign ^ ( av < bv ) );
5342 /*----------------------------------------------------------------------------
5343 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
5344 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
5345 | comparison is performed according to the IEC/IEEE Standard for Binary
5346 | Floating-Point Arithmetic.
5347 *----------------------------------------------------------------------------*/
5349 int float64_unordered_quiet(float64 a, float64 b, float_status *status)
5351 a = float64_squash_input_denormal(a, status);
5352 b = float64_squash_input_denormal(b, status);
5354 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5355 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5357 if (float64_is_signaling_nan(a, status)
5358 || float64_is_signaling_nan(b, status)) {
5359 float_raise(float_flag_invalid, status);
5361 return 1;
5363 return 0;
5366 /*----------------------------------------------------------------------------
5367 | Returns the result of converting the extended double-precision floating-
5368 | point value `a' to the 32-bit two's complement integer format. The
5369 | conversion is performed according to the IEC/IEEE Standard for Binary
5370 | Floating-Point Arithmetic---which means in particular that the conversion
5371 | is rounded according to the current rounding mode. If `a' is a NaN, the
5372 | largest positive integer is returned. Otherwise, if the conversion
5373 | overflows, the largest integer with the same sign as `a' is returned.
5374 *----------------------------------------------------------------------------*/
5376 int32_t floatx80_to_int32(floatx80 a, float_status *status)
5378 flag aSign;
5379 int32_t aExp, shiftCount;
5380 uint64_t aSig;
5382 if (floatx80_invalid_encoding(a)) {
5383 float_raise(float_flag_invalid, status);
5384 return 1 << 31;
5386 aSig = extractFloatx80Frac( a );
5387 aExp = extractFloatx80Exp( a );
5388 aSign = extractFloatx80Sign( a );
5389 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5390 shiftCount = 0x4037 - aExp;
5391 if ( shiftCount <= 0 ) shiftCount = 1;
5392 shift64RightJamming( aSig, shiftCount, &aSig );
5393 return roundAndPackInt32(aSign, aSig, status);
5397 /*----------------------------------------------------------------------------
5398 | Returns the result of converting the extended double-precision floating-
5399 | point value `a' to the 32-bit two's complement integer format. The
5400 | conversion is performed according to the IEC/IEEE Standard for Binary
5401 | Floating-Point Arithmetic, except that the conversion is always rounded
5402 | toward zero. If `a' is a NaN, the largest positive integer is returned.
5403 | Otherwise, if the conversion overflows, the largest integer with the same
5404 | sign as `a' is returned.
5405 *----------------------------------------------------------------------------*/
5407 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
5409 flag aSign;
5410 int32_t aExp, shiftCount;
5411 uint64_t aSig, savedASig;
5412 int32_t z;
5414 if (floatx80_invalid_encoding(a)) {
5415 float_raise(float_flag_invalid, status);
5416 return 1 << 31;
5418 aSig = extractFloatx80Frac( a );
5419 aExp = extractFloatx80Exp( a );
5420 aSign = extractFloatx80Sign( a );
5421 if ( 0x401E < aExp ) {
5422 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5423 goto invalid;
5425 else if ( aExp < 0x3FFF ) {
5426 if (aExp || aSig) {
5427 status->float_exception_flags |= float_flag_inexact;
5429 return 0;
5431 shiftCount = 0x403E - aExp;
5432 savedASig = aSig;
5433 aSig >>= shiftCount;
5434 z = aSig;
5435 if ( aSign ) z = - z;
5436 if ( ( z < 0 ) ^ aSign ) {
5437 invalid:
5438 float_raise(float_flag_invalid, status);
5439 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5441 if ( ( aSig<<shiftCount ) != savedASig ) {
5442 status->float_exception_flags |= float_flag_inexact;
5444 return z;
5448 /*----------------------------------------------------------------------------
5449 | Returns the result of converting the extended double-precision floating-
5450 | point value `a' to the 64-bit two's complement integer format. The
5451 | conversion is performed according to the IEC/IEEE Standard for Binary
5452 | Floating-Point Arithmetic---which means in particular that the conversion
5453 | is rounded according to the current rounding mode. If `a' is a NaN,
5454 | the largest positive integer is returned. Otherwise, if the conversion
5455 | overflows, the largest integer with the same sign as `a' is returned.
5456 *----------------------------------------------------------------------------*/
5458 int64_t floatx80_to_int64(floatx80 a, float_status *status)
5460 flag aSign;
5461 int32_t aExp, shiftCount;
5462 uint64_t aSig, aSigExtra;
5464 if (floatx80_invalid_encoding(a)) {
5465 float_raise(float_flag_invalid, status);
5466 return 1ULL << 63;
5468 aSig = extractFloatx80Frac( a );
5469 aExp = extractFloatx80Exp( a );
5470 aSign = extractFloatx80Sign( a );
5471 shiftCount = 0x403E - aExp;
5472 if ( shiftCount <= 0 ) {
5473 if ( shiftCount ) {
5474 float_raise(float_flag_invalid, status);
5475 if (!aSign || floatx80_is_any_nan(a)) {
5476 return LIT64( 0x7FFFFFFFFFFFFFFF );
5478 return (int64_t) LIT64( 0x8000000000000000 );
5480 aSigExtra = 0;
5482 else {
5483 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
5485 return roundAndPackInt64(aSign, aSig, aSigExtra, status);
5489 /*----------------------------------------------------------------------------
5490 | Returns the result of converting the extended double-precision floating-
5491 | point value `a' to the 64-bit two's complement integer format. The
5492 | conversion is performed according to the IEC/IEEE Standard for Binary
5493 | Floating-Point Arithmetic, except that the conversion is always rounded
5494 | toward zero. If `a' is a NaN, the largest positive integer is returned.
5495 | Otherwise, if the conversion overflows, the largest integer with the same
5496 | sign as `a' is returned.
5497 *----------------------------------------------------------------------------*/
5499 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
5501 flag aSign;
5502 int32_t aExp, shiftCount;
5503 uint64_t aSig;
5504 int64_t z;
5506 if (floatx80_invalid_encoding(a)) {
5507 float_raise(float_flag_invalid, status);
5508 return 1ULL << 63;
5510 aSig = extractFloatx80Frac( a );
5511 aExp = extractFloatx80Exp( a );
5512 aSign = extractFloatx80Sign( a );
5513 shiftCount = aExp - 0x403E;
5514 if ( 0 <= shiftCount ) {
5515 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
5516 if ( ( a.high != 0xC03E ) || aSig ) {
5517 float_raise(float_flag_invalid, status);
5518 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
5519 return LIT64( 0x7FFFFFFFFFFFFFFF );
5522 return (int64_t) LIT64( 0x8000000000000000 );
5524 else if ( aExp < 0x3FFF ) {
5525 if (aExp | aSig) {
5526 status->float_exception_flags |= float_flag_inexact;
5528 return 0;
5530 z = aSig>>( - shiftCount );
5531 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
5532 status->float_exception_flags |= float_flag_inexact;
5534 if ( aSign ) z = - z;
5535 return z;
5539 /*----------------------------------------------------------------------------
5540 | Returns the result of converting the extended double-precision floating-
5541 | point value `a' to the single-precision floating-point format. The
5542 | conversion is performed according to the IEC/IEEE Standard for Binary
5543 | Floating-Point Arithmetic.
5544 *----------------------------------------------------------------------------*/
5546 float32 floatx80_to_float32(floatx80 a, float_status *status)
5548 flag aSign;
5549 int32_t aExp;
5550 uint64_t aSig;
5552 if (floatx80_invalid_encoding(a)) {
5553 float_raise(float_flag_invalid, status);
5554 return float32_default_nan(status);
5556 aSig = extractFloatx80Frac( a );
5557 aExp = extractFloatx80Exp( a );
5558 aSign = extractFloatx80Sign( a );
5559 if ( aExp == 0x7FFF ) {
5560 if ( (uint64_t) ( aSig<<1 ) ) {
5561 return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
5563 return packFloat32( aSign, 0xFF, 0 );
5565 shift64RightJamming( aSig, 33, &aSig );
5566 if ( aExp || aSig ) aExp -= 0x3F81;
5567 return roundAndPackFloat32(aSign, aExp, aSig, status);
5571 /*----------------------------------------------------------------------------
5572 | Returns the result of converting the extended double-precision floating-
5573 | point value `a' to the double-precision floating-point format. The
5574 | conversion is performed according to the IEC/IEEE Standard for Binary
5575 | Floating-Point Arithmetic.
5576 *----------------------------------------------------------------------------*/
5578 float64 floatx80_to_float64(floatx80 a, float_status *status)
5580 flag aSign;
5581 int32_t aExp;
5582 uint64_t aSig, zSig;
5584 if (floatx80_invalid_encoding(a)) {
5585 float_raise(float_flag_invalid, status);
5586 return float64_default_nan(status);
5588 aSig = extractFloatx80Frac( a );
5589 aExp = extractFloatx80Exp( a );
5590 aSign = extractFloatx80Sign( a );
5591 if ( aExp == 0x7FFF ) {
5592 if ( (uint64_t) ( aSig<<1 ) ) {
5593 return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
5595 return packFloat64( aSign, 0x7FF, 0 );
5597 shift64RightJamming( aSig, 1, &zSig );
5598 if ( aExp || aSig ) aExp -= 0x3C01;
5599 return roundAndPackFloat64(aSign, aExp, zSig, status);
5603 /*----------------------------------------------------------------------------
5604 | Returns the result of converting the extended double-precision floating-
5605 | point value `a' to the quadruple-precision floating-point format. The
5606 | conversion is performed according to the IEC/IEEE Standard for Binary
5607 | Floating-Point Arithmetic.
5608 *----------------------------------------------------------------------------*/
5610 float128 floatx80_to_float128(floatx80 a, float_status *status)
5612 flag aSign;
5613 int aExp;
5614 uint64_t aSig, zSig0, zSig1;
5616 if (floatx80_invalid_encoding(a)) {
5617 float_raise(float_flag_invalid, status);
5618 return float128_default_nan(status);
5620 aSig = extractFloatx80Frac( a );
5621 aExp = extractFloatx80Exp( a );
5622 aSign = extractFloatx80Sign( a );
5623 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
5624 return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
5626 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
5627 return packFloat128( aSign, aExp, zSig0, zSig1 );
5631 /*----------------------------------------------------------------------------
5632 | Rounds the extended double-precision floating-point value `a'
5633 | to the precision provided by floatx80_rounding_precision and returns the
5634 | result as an extended double-precision floating-point value.
5635 | The operation is performed according to the IEC/IEEE Standard for Binary
5636 | Floating-Point Arithmetic.
5637 *----------------------------------------------------------------------------*/
5639 floatx80 floatx80_round(floatx80 a, float_status *status)
5641 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5642 extractFloatx80Sign(a),
5643 extractFloatx80Exp(a),
5644 extractFloatx80Frac(a), 0, status);
5647 /*----------------------------------------------------------------------------
5648 | Rounds the extended double-precision floating-point value `a' to an integer,
5649 | and returns the result as an extended quadruple-precision floating-point
5650 | value. The operation is performed according to the IEC/IEEE Standard for
5651 | Binary Floating-Point Arithmetic.
5652 *----------------------------------------------------------------------------*/
5654 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
5656 flag aSign;
5657 int32_t aExp;
5658 uint64_t lastBitMask, roundBitsMask;
5659 floatx80 z;
5661 if (floatx80_invalid_encoding(a)) {
5662 float_raise(float_flag_invalid, status);
5663 return floatx80_default_nan(status);
5665 aExp = extractFloatx80Exp( a );
5666 if ( 0x403E <= aExp ) {
5667 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
5668 return propagateFloatx80NaN(a, a, status);
5670 return a;
5672 if ( aExp < 0x3FFF ) {
5673 if ( ( aExp == 0 )
5674 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
5675 return a;
5677 status->float_exception_flags |= float_flag_inexact;
5678 aSign = extractFloatx80Sign( a );
5679 switch (status->float_rounding_mode) {
5680 case float_round_nearest_even:
5681 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
5683 return
5684 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
5686 break;
5687 case float_round_ties_away:
5688 if (aExp == 0x3FFE) {
5689 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
5691 break;
5692 case float_round_down:
5693 return
5694 aSign ?
5695 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
5696 : packFloatx80( 0, 0, 0 );
5697 case float_round_up:
5698 return
5699 aSign ? packFloatx80( 1, 0, 0 )
5700 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
5702 return packFloatx80( aSign, 0, 0 );
5704 lastBitMask = 1;
5705 lastBitMask <<= 0x403E - aExp;
5706 roundBitsMask = lastBitMask - 1;
5707 z = a;
5708 switch (status->float_rounding_mode) {
5709 case float_round_nearest_even:
5710 z.low += lastBitMask>>1;
5711 if ((z.low & roundBitsMask) == 0) {
5712 z.low &= ~lastBitMask;
5714 break;
5715 case float_round_ties_away:
5716 z.low += lastBitMask >> 1;
5717 break;
5718 case float_round_to_zero:
5719 break;
5720 case float_round_up:
5721 if (!extractFloatx80Sign(z)) {
5722 z.low += roundBitsMask;
5724 break;
5725 case float_round_down:
5726 if (extractFloatx80Sign(z)) {
5727 z.low += roundBitsMask;
5729 break;
5730 default:
5731 abort();
5733 z.low &= ~ roundBitsMask;
5734 if ( z.low == 0 ) {
5735 ++z.high;
5736 z.low = LIT64( 0x8000000000000000 );
5738 if (z.low != a.low) {
5739 status->float_exception_flags |= float_flag_inexact;
5741 return z;
5745 /*----------------------------------------------------------------------------
5746 | Returns the result of adding the absolute values of the extended double-
5747 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
5748 | negated before being returned. `zSign' is ignored if the result is a NaN.
5749 | The addition is performed according to the IEC/IEEE Standard for Binary
5750 | Floating-Point Arithmetic.
5751 *----------------------------------------------------------------------------*/
5753 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5754 float_status *status)
5756 int32_t aExp, bExp, zExp;
5757 uint64_t aSig, bSig, zSig0, zSig1;
5758 int32_t expDiff;
5760 aSig = extractFloatx80Frac( a );
5761 aExp = extractFloatx80Exp( a );
5762 bSig = extractFloatx80Frac( b );
5763 bExp = extractFloatx80Exp( b );
5764 expDiff = aExp - bExp;
5765 if ( 0 < expDiff ) {
5766 if ( aExp == 0x7FFF ) {
5767 if ((uint64_t)(aSig << 1)) {
5768 return propagateFloatx80NaN(a, b, status);
5770 return a;
5772 if ( bExp == 0 ) --expDiff;
5773 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5774 zExp = aExp;
5776 else if ( expDiff < 0 ) {
5777 if ( bExp == 0x7FFF ) {
5778 if ((uint64_t)(bSig << 1)) {
5779 return propagateFloatx80NaN(a, b, status);
5781 return packFloatx80(zSign,
5782 floatx80_infinity_high,
5783 floatx80_infinity_low);
5785 if ( aExp == 0 ) ++expDiff;
5786 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5787 zExp = bExp;
5789 else {
5790 if ( aExp == 0x7FFF ) {
5791 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5792 return propagateFloatx80NaN(a, b, status);
5794 return a;
5796 zSig1 = 0;
5797 zSig0 = aSig + bSig;
5798 if ( aExp == 0 ) {
5799 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
5800 goto roundAndPack;
5802 zExp = aExp;
5803 goto shiftRight1;
5805 zSig0 = aSig + bSig;
5806 if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
5807 shiftRight1:
5808 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
5809 zSig0 |= LIT64( 0x8000000000000000 );
5810 ++zExp;
5811 roundAndPack:
5812 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5813 zSign, zExp, zSig0, zSig1, status);
5816 /*----------------------------------------------------------------------------
5817 | Returns the result of subtracting the absolute values of the extended
5818 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
5819 | difference is negated before being returned. `zSign' is ignored if the
5820 | result is a NaN. The subtraction is performed according to the IEC/IEEE
5821 | Standard for Binary Floating-Point Arithmetic.
5822 *----------------------------------------------------------------------------*/
5824 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5825 float_status *status)
5827 int32_t aExp, bExp, zExp;
5828 uint64_t aSig, bSig, zSig0, zSig1;
5829 int32_t expDiff;
5831 aSig = extractFloatx80Frac( a );
5832 aExp = extractFloatx80Exp( a );
5833 bSig = extractFloatx80Frac( b );
5834 bExp = extractFloatx80Exp( b );
5835 expDiff = aExp - bExp;
5836 if ( 0 < expDiff ) goto aExpBigger;
5837 if ( expDiff < 0 ) goto bExpBigger;
5838 if ( aExp == 0x7FFF ) {
5839 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5840 return propagateFloatx80NaN(a, b, status);
5842 float_raise(float_flag_invalid, status);
5843 return floatx80_default_nan(status);
5845 if ( aExp == 0 ) {
5846 aExp = 1;
5847 bExp = 1;
5849 zSig1 = 0;
5850 if ( bSig < aSig ) goto aBigger;
5851 if ( aSig < bSig ) goto bBigger;
5852 return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
5853 bExpBigger:
5854 if ( bExp == 0x7FFF ) {
5855 if ((uint64_t)(bSig << 1)) {
5856 return propagateFloatx80NaN(a, b, status);
5858 return packFloatx80(zSign ^ 1, floatx80_infinity_high,
5859 floatx80_infinity_low);
5861 if ( aExp == 0 ) ++expDiff;
5862 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5863 bBigger:
5864 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
5865 zExp = bExp;
5866 zSign ^= 1;
5867 goto normalizeRoundAndPack;
5868 aExpBigger:
5869 if ( aExp == 0x7FFF ) {
5870 if ((uint64_t)(aSig << 1)) {
5871 return propagateFloatx80NaN(a, b, status);
5873 return a;
5875 if ( bExp == 0 ) --expDiff;
5876 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5877 aBigger:
5878 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
5879 zExp = aExp;
5880 normalizeRoundAndPack:
5881 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
5882 zSign, zExp, zSig0, zSig1, status);
5885 /*----------------------------------------------------------------------------
5886 | Returns the result of adding the extended double-precision floating-point
5887 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5888 | Standard for Binary Floating-Point Arithmetic.
5889 *----------------------------------------------------------------------------*/
5891 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
5893 flag aSign, bSign;
5895 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5896 float_raise(float_flag_invalid, status);
5897 return floatx80_default_nan(status);
5899 aSign = extractFloatx80Sign( a );
5900 bSign = extractFloatx80Sign( b );
5901 if ( aSign == bSign ) {
5902 return addFloatx80Sigs(a, b, aSign, status);
5904 else {
5905 return subFloatx80Sigs(a, b, aSign, status);
5910 /*----------------------------------------------------------------------------
5911 | Returns the result of subtracting the extended double-precision floating-
5912 | point values `a' and `b'. The operation is performed according to the
5913 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5914 *----------------------------------------------------------------------------*/
5916 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5918 flag aSign, bSign;
5920 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5921 float_raise(float_flag_invalid, status);
5922 return floatx80_default_nan(status);
5924 aSign = extractFloatx80Sign( a );
5925 bSign = extractFloatx80Sign( b );
5926 if ( aSign == bSign ) {
5927 return subFloatx80Sigs(a, b, aSign, status);
5929 else {
5930 return addFloatx80Sigs(a, b, aSign, status);
5935 /*----------------------------------------------------------------------------
5936 | Returns the result of multiplying the extended double-precision floating-
5937 | point values `a' and `b'. The operation is performed according to the
5938 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5939 *----------------------------------------------------------------------------*/
5941 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
5943 flag aSign, bSign, zSign;
5944 int32_t aExp, bExp, zExp;
5945 uint64_t aSig, bSig, zSig0, zSig1;
5947 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5948 float_raise(float_flag_invalid, status);
5949 return floatx80_default_nan(status);
5951 aSig = extractFloatx80Frac( a );
5952 aExp = extractFloatx80Exp( a );
5953 aSign = extractFloatx80Sign( a );
5954 bSig = extractFloatx80Frac( b );
5955 bExp = extractFloatx80Exp( b );
5956 bSign = extractFloatx80Sign( b );
5957 zSign = aSign ^ bSign;
5958 if ( aExp == 0x7FFF ) {
5959 if ( (uint64_t) ( aSig<<1 )
5960 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5961 return propagateFloatx80NaN(a, b, status);
5963 if ( ( bExp | bSig ) == 0 ) goto invalid;
5964 return packFloatx80(zSign, floatx80_infinity_high,
5965 floatx80_infinity_low);
5967 if ( bExp == 0x7FFF ) {
5968 if ((uint64_t)(bSig << 1)) {
5969 return propagateFloatx80NaN(a, b, status);
5971 if ( ( aExp | aSig ) == 0 ) {
5972 invalid:
5973 float_raise(float_flag_invalid, status);
5974 return floatx80_default_nan(status);
5976 return packFloatx80(zSign, floatx80_infinity_high,
5977 floatx80_infinity_low);
5979 if ( aExp == 0 ) {
5980 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5981 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5983 if ( bExp == 0 ) {
5984 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5985 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5987 zExp = aExp + bExp - 0x3FFE;
5988 mul64To128( aSig, bSig, &zSig0, &zSig1 );
5989 if ( 0 < (int64_t) zSig0 ) {
5990 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5991 --zExp;
5993 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5994 zSign, zExp, zSig0, zSig1, status);
5997 /*----------------------------------------------------------------------------
5998 | Returns the result of dividing the extended double-precision floating-point
5999 | value `a' by the corresponding value `b'. The operation is performed
6000 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6001 *----------------------------------------------------------------------------*/
6003 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
6005 flag aSign, bSign, zSign;
6006 int32_t aExp, bExp, zExp;
6007 uint64_t aSig, bSig, zSig0, zSig1;
6008 uint64_t rem0, rem1, rem2, term0, term1, term2;
6010 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6011 float_raise(float_flag_invalid, status);
6012 return floatx80_default_nan(status);
6014 aSig = extractFloatx80Frac( a );
6015 aExp = extractFloatx80Exp( a );
6016 aSign = extractFloatx80Sign( a );
6017 bSig = extractFloatx80Frac( b );
6018 bExp = extractFloatx80Exp( b );
6019 bSign = extractFloatx80Sign( b );
6020 zSign = aSign ^ bSign;
6021 if ( aExp == 0x7FFF ) {
6022 if ((uint64_t)(aSig << 1)) {
6023 return propagateFloatx80NaN(a, b, status);
6025 if ( bExp == 0x7FFF ) {
6026 if ((uint64_t)(bSig << 1)) {
6027 return propagateFloatx80NaN(a, b, status);
6029 goto invalid;
6031 return packFloatx80(zSign, floatx80_infinity_high,
6032 floatx80_infinity_low);
6034 if ( bExp == 0x7FFF ) {
6035 if ((uint64_t)(bSig << 1)) {
6036 return propagateFloatx80NaN(a, b, status);
6038 return packFloatx80( zSign, 0, 0 );
6040 if ( bExp == 0 ) {
6041 if ( bSig == 0 ) {
6042 if ( ( aExp | aSig ) == 0 ) {
6043 invalid:
6044 float_raise(float_flag_invalid, status);
6045 return floatx80_default_nan(status);
6047 float_raise(float_flag_divbyzero, status);
6048 return packFloatx80(zSign, floatx80_infinity_high,
6049 floatx80_infinity_low);
6051 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
6053 if ( aExp == 0 ) {
6054 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
6055 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
6057 zExp = aExp - bExp + 0x3FFE;
6058 rem1 = 0;
6059 if ( bSig <= aSig ) {
6060 shift128Right( aSig, 0, 1, &aSig, &rem1 );
6061 ++zExp;
6063 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
6064 mul64To128( bSig, zSig0, &term0, &term1 );
6065 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
6066 while ( (int64_t) rem0 < 0 ) {
6067 --zSig0;
6068 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
6070 zSig1 = estimateDiv128To64( rem1, 0, bSig );
6071 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
6072 mul64To128( bSig, zSig1, &term1, &term2 );
6073 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6074 while ( (int64_t) rem1 < 0 ) {
6075 --zSig1;
6076 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
6078 zSig1 |= ( ( rem1 | rem2 ) != 0 );
6080 return roundAndPackFloatx80(status->floatx80_rounding_precision,
6081 zSign, zExp, zSig0, zSig1, status);
6084 /*----------------------------------------------------------------------------
6085 | Returns the remainder of the extended double-precision floating-point value
6086 | `a' with respect to the corresponding value `b'. The operation is performed
6087 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6088 *----------------------------------------------------------------------------*/
6090 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
6092 flag aSign, zSign;
6093 int32_t aExp, bExp, expDiff;
6094 uint64_t aSig0, aSig1, bSig;
6095 uint64_t q, term0, term1, alternateASig0, alternateASig1;
6097 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6098 float_raise(float_flag_invalid, status);
6099 return floatx80_default_nan(status);
6101 aSig0 = extractFloatx80Frac( a );
6102 aExp = extractFloatx80Exp( a );
6103 aSign = extractFloatx80Sign( a );
6104 bSig = extractFloatx80Frac( b );
6105 bExp = extractFloatx80Exp( b );
6106 if ( aExp == 0x7FFF ) {
6107 if ( (uint64_t) ( aSig0<<1 )
6108 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
6109 return propagateFloatx80NaN(a, b, status);
6111 goto invalid;
6113 if ( bExp == 0x7FFF ) {
6114 if ((uint64_t)(bSig << 1)) {
6115 return propagateFloatx80NaN(a, b, status);
6117 return a;
6119 if ( bExp == 0 ) {
6120 if ( bSig == 0 ) {
6121 invalid:
6122 float_raise(float_flag_invalid, status);
6123 return floatx80_default_nan(status);
6125 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
6127 if ( aExp == 0 ) {
6128 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
6129 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
6131 bSig |= LIT64( 0x8000000000000000 );
6132 zSign = aSign;
6133 expDiff = aExp - bExp;
6134 aSig1 = 0;
6135 if ( expDiff < 0 ) {
6136 if ( expDiff < -1 ) return a;
6137 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
6138 expDiff = 0;
6140 q = ( bSig <= aSig0 );
6141 if ( q ) aSig0 -= bSig;
6142 expDiff -= 64;
6143 while ( 0 < expDiff ) {
6144 q = estimateDiv128To64( aSig0, aSig1, bSig );
6145 q = ( 2 < q ) ? q - 2 : 0;
6146 mul64To128( bSig, q, &term0, &term1 );
6147 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6148 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
6149 expDiff -= 62;
6151 expDiff += 64;
6152 if ( 0 < expDiff ) {
6153 q = estimateDiv128To64( aSig0, aSig1, bSig );
6154 q = ( 2 < q ) ? q - 2 : 0;
6155 q >>= 64 - expDiff;
6156 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
6157 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6158 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
6159 while ( le128( term0, term1, aSig0, aSig1 ) ) {
6160 ++q;
6161 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6164 else {
6165 term1 = 0;
6166 term0 = bSig;
6168 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
6169 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
6170 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
6171 && ( q & 1 ) )
6173 aSig0 = alternateASig0;
6174 aSig1 = alternateASig1;
6175 zSign = ! zSign;
6177 return
6178 normalizeRoundAndPackFloatx80(
6179 80, zSign, bExp + expDiff, aSig0, aSig1, status);
6183 /*----------------------------------------------------------------------------
6184 | Returns the square root of the extended double-precision floating-point
6185 | value `a'. The operation is performed according to the IEC/IEEE Standard
6186 | for Binary Floating-Point Arithmetic.
6187 *----------------------------------------------------------------------------*/
6189 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
6191 flag aSign;
6192 int32_t aExp, zExp;
6193 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
6194 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6196 if (floatx80_invalid_encoding(a)) {
6197 float_raise(float_flag_invalid, status);
6198 return floatx80_default_nan(status);
6200 aSig0 = extractFloatx80Frac( a );
6201 aExp = extractFloatx80Exp( a );
6202 aSign = extractFloatx80Sign( a );
6203 if ( aExp == 0x7FFF ) {
6204 if ((uint64_t)(aSig0 << 1)) {
6205 return propagateFloatx80NaN(a, a, status);
6207 if ( ! aSign ) return a;
6208 goto invalid;
6210 if ( aSign ) {
6211 if ( ( aExp | aSig0 ) == 0 ) return a;
6212 invalid:
6213 float_raise(float_flag_invalid, status);
6214 return floatx80_default_nan(status);
6216 if ( aExp == 0 ) {
6217 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
6218 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
6220 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
6221 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
6222 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
6223 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6224 doubleZSig0 = zSig0<<1;
6225 mul64To128( zSig0, zSig0, &term0, &term1 );
6226 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6227 while ( (int64_t) rem0 < 0 ) {
6228 --zSig0;
6229 doubleZSig0 -= 2;
6230 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6232 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6233 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
6234 if ( zSig1 == 0 ) zSig1 = 1;
6235 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6236 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6237 mul64To128( zSig1, zSig1, &term2, &term3 );
6238 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6239 while ( (int64_t) rem1 < 0 ) {
6240 --zSig1;
6241 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6242 term3 |= 1;
6243 term2 |= doubleZSig0;
6244 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6246 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6248 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
6249 zSig0 |= doubleZSig0;
6250 return roundAndPackFloatx80(status->floatx80_rounding_precision,
6251 0, zExp, zSig0, zSig1, status);
6254 /*----------------------------------------------------------------------------
6255 | Returns 1 if the extended double-precision floating-point value `a' is equal
6256 | to the corresponding value `b', and 0 otherwise. The invalid exception is
6257 | raised if either operand is a NaN. Otherwise, the comparison is performed
6258 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6259 *----------------------------------------------------------------------------*/
6261 int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
6264 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
6265 || (extractFloatx80Exp(a) == 0x7FFF
6266 && (uint64_t) (extractFloatx80Frac(a) << 1))
6267 || (extractFloatx80Exp(b) == 0x7FFF
6268 && (uint64_t) (extractFloatx80Frac(b) << 1))
6270 float_raise(float_flag_invalid, status);
6271 return 0;
6273 return
6274 ( a.low == b.low )
6275 && ( ( a.high == b.high )
6276 || ( ( a.low == 0 )
6277 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6282 /*----------------------------------------------------------------------------
6283 | Returns 1 if the extended double-precision floating-point value `a' is
6284 | less than or equal to the corresponding value `b', and 0 otherwise. The
6285 | invalid exception is raised if either operand is a NaN. The comparison is
6286 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6287 | Arithmetic.
6288 *----------------------------------------------------------------------------*/
6290 int floatx80_le(floatx80 a, floatx80 b, float_status *status)
6292 flag aSign, bSign;
6294 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
6295 || (extractFloatx80Exp(a) == 0x7FFF
6296 && (uint64_t) (extractFloatx80Frac(a) << 1))
6297 || (extractFloatx80Exp(b) == 0x7FFF
6298 && (uint64_t) (extractFloatx80Frac(b) << 1))
6300 float_raise(float_flag_invalid, status);
6301 return 0;
6303 aSign = extractFloatx80Sign( a );
6304 bSign = extractFloatx80Sign( b );
6305 if ( aSign != bSign ) {
6306 return
6307 aSign
6308 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6309 == 0 );
6311 return
6312 aSign ? le128( b.high, b.low, a.high, a.low )
6313 : le128( a.high, a.low, b.high, b.low );
6317 /*----------------------------------------------------------------------------
6318 | Returns 1 if the extended double-precision floating-point value `a' is
6319 | less than the corresponding value `b', and 0 otherwise. The invalid
6320 | exception is raised if either operand is a NaN. The comparison is performed
6321 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6322 *----------------------------------------------------------------------------*/
6324 int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
6326 flag aSign, bSign;
6328 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
6329 || (extractFloatx80Exp(a) == 0x7FFF
6330 && (uint64_t) (extractFloatx80Frac(a) << 1))
6331 || (extractFloatx80Exp(b) == 0x7FFF
6332 && (uint64_t) (extractFloatx80Frac(b) << 1))
6334 float_raise(float_flag_invalid, status);
6335 return 0;
6337 aSign = extractFloatx80Sign( a );
6338 bSign = extractFloatx80Sign( b );
6339 if ( aSign != bSign ) {
6340 return
6341 aSign
6342 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6343 != 0 );
6345 return
6346 aSign ? lt128( b.high, b.low, a.high, a.low )
6347 : lt128( a.high, a.low, b.high, b.low );
6351 /*----------------------------------------------------------------------------
6352 | Returns 1 if the extended double-precision floating-point values `a' and `b'
6353 | cannot be compared, and 0 otherwise. The invalid exception is raised if
6354 | either operand is a NaN. The comparison is performed according to the
6355 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6356 *----------------------------------------------------------------------------*/
6357 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
6359 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
6360 || (extractFloatx80Exp(a) == 0x7FFF
6361 && (uint64_t) (extractFloatx80Frac(a) << 1))
6362 || (extractFloatx80Exp(b) == 0x7FFF
6363 && (uint64_t) (extractFloatx80Frac(b) << 1))
6365 float_raise(float_flag_invalid, status);
6366 return 1;
6368 return 0;
6371 /*----------------------------------------------------------------------------
6372 | Returns 1 if the extended double-precision floating-point value `a' is
6373 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
6374 | cause an exception. The comparison is performed according to the IEC/IEEE
6375 | Standard for Binary Floating-Point Arithmetic.
6376 *----------------------------------------------------------------------------*/
6378 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
6381 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6382 float_raise(float_flag_invalid, status);
6383 return 0;
6385 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
6386 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6387 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
6388 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6390 if (floatx80_is_signaling_nan(a, status)
6391 || floatx80_is_signaling_nan(b, status)) {
6392 float_raise(float_flag_invalid, status);
6394 return 0;
6396 return
6397 ( a.low == b.low )
6398 && ( ( a.high == b.high )
6399 || ( ( a.low == 0 )
6400 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6405 /*----------------------------------------------------------------------------
6406 | Returns 1 if the extended double-precision floating-point value `a' is less
6407 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
6408 | do not cause an exception. Otherwise, the comparison is performed according
6409 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6410 *----------------------------------------------------------------------------*/
6412 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
6414 flag aSign, bSign;
6416 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6417 float_raise(float_flag_invalid, status);
6418 return 0;
6420 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
6421 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6422 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
6423 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6425 if (floatx80_is_signaling_nan(a, status)
6426 || floatx80_is_signaling_nan(b, status)) {
6427 float_raise(float_flag_invalid, status);
6429 return 0;
6431 aSign = extractFloatx80Sign( a );
6432 bSign = extractFloatx80Sign( b );
6433 if ( aSign != bSign ) {
6434 return
6435 aSign
6436 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6437 == 0 );
6439 return
6440 aSign ? le128( b.high, b.low, a.high, a.low )
6441 : le128( a.high, a.low, b.high, b.low );
6445 /*----------------------------------------------------------------------------
6446 | Returns 1 if the extended double-precision floating-point value `a' is less
6447 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
6448 | an exception. Otherwise, the comparison is performed according to the
6449 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6450 *----------------------------------------------------------------------------*/
6452 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
6454 flag aSign, bSign;
6456 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6457 float_raise(float_flag_invalid, status);
6458 return 0;
6460 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
6461 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6462 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
6463 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6465 if (floatx80_is_signaling_nan(a, status)
6466 || floatx80_is_signaling_nan(b, status)) {
6467 float_raise(float_flag_invalid, status);
6469 return 0;
6471 aSign = extractFloatx80Sign( a );
6472 bSign = extractFloatx80Sign( b );
6473 if ( aSign != bSign ) {
6474 return
6475 aSign
6476 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6477 != 0 );
6479 return
6480 aSign ? lt128( b.high, b.low, a.high, a.low )
6481 : lt128( a.high, a.low, b.high, b.low );
6485 /*----------------------------------------------------------------------------
6486 | Returns 1 if the extended double-precision floating-point values `a' and `b'
6487 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
6488 | The comparison is performed according to the IEC/IEEE Standard for Binary
6489 | Floating-Point Arithmetic.
6490 *----------------------------------------------------------------------------*/
6491 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
6493 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6494 float_raise(float_flag_invalid, status);
6495 return 1;
6497 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
6498 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6499 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
6500 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6502 if (floatx80_is_signaling_nan(a, status)
6503 || floatx80_is_signaling_nan(b, status)) {
6504 float_raise(float_flag_invalid, status);
6506 return 1;
6508 return 0;
6511 /*----------------------------------------------------------------------------
6512 | Returns the result of converting the quadruple-precision floating-point
6513 | value `a' to the 32-bit two's complement integer format. The conversion
6514 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6515 | Arithmetic---which means in particular that the conversion is rounded
6516 | according to the current rounding mode. If `a' is a NaN, the largest
6517 | positive integer is returned. Otherwise, if the conversion overflows, the
6518 | largest integer with the same sign as `a' is returned.
6519 *----------------------------------------------------------------------------*/
6521 int32_t float128_to_int32(float128 a, float_status *status)
6523 flag aSign;
6524 int32_t aExp, shiftCount;
6525 uint64_t aSig0, aSig1;
6527 aSig1 = extractFloat128Frac1( a );
6528 aSig0 = extractFloat128Frac0( a );
6529 aExp = extractFloat128Exp( a );
6530 aSign = extractFloat128Sign( a );
6531 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
6532 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
6533 aSig0 |= ( aSig1 != 0 );
6534 shiftCount = 0x4028 - aExp;
6535 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
6536 return roundAndPackInt32(aSign, aSig0, status);
6540 /*----------------------------------------------------------------------------
6541 | Returns the result of converting the quadruple-precision floating-point
6542 | value `a' to the 32-bit two's complement integer format. The conversion
6543 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6544 | Arithmetic, except that the conversion is always rounded toward zero. If
6545 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
6546 | conversion overflows, the largest integer with the same sign as `a' is
6547 | returned.
6548 *----------------------------------------------------------------------------*/
6550 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
6552 flag aSign;
6553 int32_t aExp, shiftCount;
6554 uint64_t aSig0, aSig1, savedASig;
6555 int32_t z;
6557 aSig1 = extractFloat128Frac1( a );
6558 aSig0 = extractFloat128Frac0( a );
6559 aExp = extractFloat128Exp( a );
6560 aSign = extractFloat128Sign( a );
6561 aSig0 |= ( aSig1 != 0 );
6562 if ( 0x401E < aExp ) {
6563 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
6564 goto invalid;
6566 else if ( aExp < 0x3FFF ) {
6567 if (aExp || aSig0) {
6568 status->float_exception_flags |= float_flag_inexact;
6570 return 0;
6572 aSig0 |= LIT64( 0x0001000000000000 );
6573 shiftCount = 0x402F - aExp;
6574 savedASig = aSig0;
6575 aSig0 >>= shiftCount;
6576 z = aSig0;
6577 if ( aSign ) z = - z;
6578 if ( ( z < 0 ) ^ aSign ) {
6579 invalid:
6580 float_raise(float_flag_invalid, status);
6581 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
6583 if ( ( aSig0<<shiftCount ) != savedASig ) {
6584 status->float_exception_flags |= float_flag_inexact;
6586 return z;
6590 /*----------------------------------------------------------------------------
6591 | Returns the result of converting the quadruple-precision floating-point
6592 | value `a' to the 64-bit two's complement integer format. The conversion
6593 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6594 | Arithmetic---which means in particular that the conversion is rounded
6595 | according to the current rounding mode. If `a' is a NaN, the largest
6596 | positive integer is returned. Otherwise, if the conversion overflows, the
6597 | largest integer with the same sign as `a' is returned.
6598 *----------------------------------------------------------------------------*/
6600 int64_t float128_to_int64(float128 a, float_status *status)
6602 flag aSign;
6603 int32_t aExp, shiftCount;
6604 uint64_t aSig0, aSig1;
6606 aSig1 = extractFloat128Frac1( a );
6607 aSig0 = extractFloat128Frac0( a );
6608 aExp = extractFloat128Exp( a );
6609 aSign = extractFloat128Sign( a );
6610 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
6611 shiftCount = 0x402F - aExp;
6612 if ( shiftCount <= 0 ) {
6613 if ( 0x403E < aExp ) {
6614 float_raise(float_flag_invalid, status);
6615 if ( ! aSign
6616 || ( ( aExp == 0x7FFF )
6617 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
6620 return LIT64( 0x7FFFFFFFFFFFFFFF );
6622 return (int64_t) LIT64( 0x8000000000000000 );
6624 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
6626 else {
6627 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
6629 return roundAndPackInt64(aSign, aSig0, aSig1, status);
6633 /*----------------------------------------------------------------------------
6634 | Returns the result of converting the quadruple-precision floating-point
6635 | value `a' to the 64-bit two's complement integer format. The conversion
6636 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6637 | Arithmetic, except that the conversion is always rounded toward zero.
6638 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
6639 | the conversion overflows, the largest integer with the same sign as `a' is
6640 | returned.
6641 *----------------------------------------------------------------------------*/
6643 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
6645 flag aSign;
6646 int32_t aExp, shiftCount;
6647 uint64_t aSig0, aSig1;
6648 int64_t z;
6650 aSig1 = extractFloat128Frac1( a );
6651 aSig0 = extractFloat128Frac0( a );
6652 aExp = extractFloat128Exp( a );
6653 aSign = extractFloat128Sign( a );
6654 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
6655 shiftCount = aExp - 0x402F;
6656 if ( 0 < shiftCount ) {
6657 if ( 0x403E <= aExp ) {
6658 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
6659 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
6660 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
6661 if (aSig1) {
6662 status->float_exception_flags |= float_flag_inexact;
6665 else {
6666 float_raise(float_flag_invalid, status);
6667 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
6668 return LIT64( 0x7FFFFFFFFFFFFFFF );
6671 return (int64_t) LIT64( 0x8000000000000000 );
6673 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
6674 if ( (uint64_t) ( aSig1<<shiftCount ) ) {
6675 status->float_exception_flags |= float_flag_inexact;
6678 else {
6679 if ( aExp < 0x3FFF ) {
6680 if ( aExp | aSig0 | aSig1 ) {
6681 status->float_exception_flags |= float_flag_inexact;
6683 return 0;
6685 z = aSig0>>( - shiftCount );
6686 if ( aSig1
6687 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
6688 status->float_exception_flags |= float_flag_inexact;
6691 if ( aSign ) z = - z;
6692 return z;
6696 /*----------------------------------------------------------------------------
6697 | Returns the result of converting the quadruple-precision floating-point value
6698 | `a' to the 64-bit unsigned integer format. The conversion is
6699 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6700 | Arithmetic---which means in particular that the conversion is rounded
6701 | according to the current rounding mode. If `a' is a NaN, the largest
6702 | positive integer is returned. If the conversion overflows, the
6703 | largest unsigned integer is returned. If 'a' is negative, the value is
6704 | rounded and zero is returned; negative values that do not round to zero
6705 | will raise the inexact exception.
6706 *----------------------------------------------------------------------------*/
6708 uint64_t float128_to_uint64(float128 a, float_status *status)
6710 flag aSign;
6711 int aExp;
6712 int shiftCount;
6713 uint64_t aSig0, aSig1;
6715 aSig0 = extractFloat128Frac0(a);
6716 aSig1 = extractFloat128Frac1(a);
6717 aExp = extractFloat128Exp(a);
6718 aSign = extractFloat128Sign(a);
6719 if (aSign && (aExp > 0x3FFE)) {
6720 float_raise(float_flag_invalid, status);
6721 if (float128_is_any_nan(a)) {
6722 return LIT64(0xFFFFFFFFFFFFFFFF);
6723 } else {
6724 return 0;
6727 if (aExp) {
6728 aSig0 |= LIT64(0x0001000000000000);
6730 shiftCount = 0x402F - aExp;
6731 if (shiftCount <= 0) {
6732 if (0x403E < aExp) {
6733 float_raise(float_flag_invalid, status);
6734 return LIT64(0xFFFFFFFFFFFFFFFF);
6736 shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
6737 } else {
6738 shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
6740 return roundAndPackUint64(aSign, aSig0, aSig1, status);
6743 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
6745 uint64_t v;
6746 signed char current_rounding_mode = status->float_rounding_mode;
6748 set_float_rounding_mode(float_round_to_zero, status);
6749 v = float128_to_uint64(a, status);
6750 set_float_rounding_mode(current_rounding_mode, status);
6752 return v;
6755 /*----------------------------------------------------------------------------
6756 | Returns the result of converting the quadruple-precision floating-point
6757 | value `a' to the 32-bit unsigned integer format. The conversion
6758 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6759 | Arithmetic except that the conversion is always rounded toward zero.
6760 | If `a' is a NaN, the largest positive integer is returned. Otherwise,
6761 | if the conversion overflows, the largest unsigned integer is returned.
6762 | If 'a' is negative, the value is rounded and zero is returned; negative
6763 | values that do not round to zero will raise the inexact exception.
6764 *----------------------------------------------------------------------------*/
6766 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
6768 uint64_t v;
6769 uint32_t res;
6770 int old_exc_flags = get_float_exception_flags(status);
6772 v = float128_to_uint64_round_to_zero(a, status);
6773 if (v > 0xffffffff) {
6774 res = 0xffffffff;
6775 } else {
6776 return v;
6778 set_float_exception_flags(old_exc_flags, status);
6779 float_raise(float_flag_invalid, status);
6780 return res;
6783 /*----------------------------------------------------------------------------
6784 | Returns the result of converting the quadruple-precision floating-point
6785 | value `a' to the single-precision floating-point format. The conversion
6786 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6787 | Arithmetic.
6788 *----------------------------------------------------------------------------*/
6790 float32 float128_to_float32(float128 a, float_status *status)
6792 flag aSign;
6793 int32_t aExp;
6794 uint64_t aSig0, aSig1;
6795 uint32_t zSig;
6797 aSig1 = extractFloat128Frac1( a );
6798 aSig0 = extractFloat128Frac0( a );
6799 aExp = extractFloat128Exp( a );
6800 aSign = extractFloat128Sign( a );
6801 if ( aExp == 0x7FFF ) {
6802 if ( aSig0 | aSig1 ) {
6803 return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
6805 return packFloat32( aSign, 0xFF, 0 );
6807 aSig0 |= ( aSig1 != 0 );
6808 shift64RightJamming( aSig0, 18, &aSig0 );
6809 zSig = aSig0;
6810 if ( aExp || zSig ) {
6811 zSig |= 0x40000000;
6812 aExp -= 0x3F81;
6814 return roundAndPackFloat32(aSign, aExp, zSig, status);
6818 /*----------------------------------------------------------------------------
6819 | Returns the result of converting the quadruple-precision floating-point
6820 | value `a' to the double-precision floating-point format. The conversion
6821 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6822 | Arithmetic.
6823 *----------------------------------------------------------------------------*/
6825 float64 float128_to_float64(float128 a, float_status *status)
6827 flag aSign;
6828 int32_t aExp;
6829 uint64_t aSig0, aSig1;
6831 aSig1 = extractFloat128Frac1( a );
6832 aSig0 = extractFloat128Frac0( a );
6833 aExp = extractFloat128Exp( a );
6834 aSign = extractFloat128Sign( a );
6835 if ( aExp == 0x7FFF ) {
6836 if ( aSig0 | aSig1 ) {
6837 return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
6839 return packFloat64( aSign, 0x7FF, 0 );
6841 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6842 aSig0 |= ( aSig1 != 0 );
6843 if ( aExp || aSig0 ) {
6844 aSig0 |= LIT64( 0x4000000000000000 );
6845 aExp -= 0x3C01;
6847 return roundAndPackFloat64(aSign, aExp, aSig0, status);
6851 /*----------------------------------------------------------------------------
6852 | Returns the result of converting the quadruple-precision floating-point
6853 | value `a' to the extended double-precision floating-point format. The
6854 | conversion is performed according to the IEC/IEEE Standard for Binary
6855 | Floating-Point Arithmetic.
6856 *----------------------------------------------------------------------------*/
6858 floatx80 float128_to_floatx80(float128 a, float_status *status)
6860 flag aSign;
6861 int32_t aExp;
6862 uint64_t aSig0, aSig1;
6864 aSig1 = extractFloat128Frac1( a );
6865 aSig0 = extractFloat128Frac0( a );
6866 aExp = extractFloat128Exp( a );
6867 aSign = extractFloat128Sign( a );
6868 if ( aExp == 0x7FFF ) {
6869 if ( aSig0 | aSig1 ) {
6870 return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
6872 return packFloatx80(aSign, floatx80_infinity_high,
6873 floatx80_infinity_low);
6875 if ( aExp == 0 ) {
6876 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
6877 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6879 else {
6880 aSig0 |= LIT64( 0x0001000000000000 );
6882 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
6883 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
6887 /*----------------------------------------------------------------------------
6888 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6889 | returns the result as a quadruple-precision floating-point value. The
6890 | operation is performed according to the IEC/IEEE Standard for Binary
6891 | Floating-Point Arithmetic.
6892 *----------------------------------------------------------------------------*/
6894 float128 float128_round_to_int(float128 a, float_status *status)
6896 flag aSign;
6897 int32_t aExp;
6898 uint64_t lastBitMask, roundBitsMask;
6899 float128 z;
6901 aExp = extractFloat128Exp( a );
6902 if ( 0x402F <= aExp ) {
6903 if ( 0x406F <= aExp ) {
6904 if ( ( aExp == 0x7FFF )
6905 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6907 return propagateFloat128NaN(a, a, status);
6909 return a;
6911 lastBitMask = 1;
6912 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
6913 roundBitsMask = lastBitMask - 1;
6914 z = a;
6915 switch (status->float_rounding_mode) {
6916 case float_round_nearest_even:
6917 if ( lastBitMask ) {
6918 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6919 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6921 else {
6922 if ( (int64_t) z.low < 0 ) {
6923 ++z.high;
6924 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
6927 break;
6928 case float_round_ties_away:
6929 if (lastBitMask) {
6930 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6931 } else {
6932 if ((int64_t) z.low < 0) {
6933 ++z.high;
6936 break;
6937 case float_round_to_zero:
6938 break;
6939 case float_round_up:
6940 if (!extractFloat128Sign(z)) {
6941 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6943 break;
6944 case float_round_down:
6945 if (extractFloat128Sign(z)) {
6946 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6948 break;
6949 default:
6950 abort();
6952 z.low &= ~ roundBitsMask;
6954 else {
6955 if ( aExp < 0x3FFF ) {
6956 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
6957 status->float_exception_flags |= float_flag_inexact;
6958 aSign = extractFloat128Sign( a );
6959 switch (status->float_rounding_mode) {
6960 case float_round_nearest_even:
6961 if ( ( aExp == 0x3FFE )
6962 && ( extractFloat128Frac0( a )
6963 | extractFloat128Frac1( a ) )
6965 return packFloat128( aSign, 0x3FFF, 0, 0 );
6967 break;
6968 case float_round_ties_away:
6969 if (aExp == 0x3FFE) {
6970 return packFloat128(aSign, 0x3FFF, 0, 0);
6972 break;
6973 case float_round_down:
6974 return
6975 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6976 : packFloat128( 0, 0, 0, 0 );
6977 case float_round_up:
6978 return
6979 aSign ? packFloat128( 1, 0, 0, 0 )
6980 : packFloat128( 0, 0x3FFF, 0, 0 );
6982 return packFloat128( aSign, 0, 0, 0 );
6984 lastBitMask = 1;
6985 lastBitMask <<= 0x402F - aExp;
6986 roundBitsMask = lastBitMask - 1;
6987 z.low = 0;
6988 z.high = a.high;
6989 switch (status->float_rounding_mode) {
6990 case float_round_nearest_even:
6991 z.high += lastBitMask>>1;
6992 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6993 z.high &= ~ lastBitMask;
6995 break;
6996 case float_round_ties_away:
6997 z.high += lastBitMask>>1;
6998 break;
6999 case float_round_to_zero:
7000 break;
7001 case float_round_up:
7002 if (!extractFloat128Sign(z)) {
7003 z.high |= ( a.low != 0 );
7004 z.high += roundBitsMask;
7006 break;
7007 case float_round_down:
7008 if (extractFloat128Sign(z)) {
7009 z.high |= (a.low != 0);
7010 z.high += roundBitsMask;
7012 break;
7013 default:
7014 abort();
7016 z.high &= ~ roundBitsMask;
7018 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
7019 status->float_exception_flags |= float_flag_inexact;
7021 return z;
7025 /*----------------------------------------------------------------------------
7026 | Returns the result of adding the absolute values of the quadruple-precision
7027 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
7028 | before being returned. `zSign' is ignored if the result is a NaN.
7029 | The addition is performed according to the IEC/IEEE Standard for Binary
7030 | Floating-Point Arithmetic.
7031 *----------------------------------------------------------------------------*/
7033 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
7034 float_status *status)
7036 int32_t aExp, bExp, zExp;
7037 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
7038 int32_t expDiff;
7040 aSig1 = extractFloat128Frac1( a );
7041 aSig0 = extractFloat128Frac0( a );
7042 aExp = extractFloat128Exp( a );
7043 bSig1 = extractFloat128Frac1( b );
7044 bSig0 = extractFloat128Frac0( b );
7045 bExp = extractFloat128Exp( b );
7046 expDiff = aExp - bExp;
7047 if ( 0 < expDiff ) {
7048 if ( aExp == 0x7FFF ) {
7049 if (aSig0 | aSig1) {
7050 return propagateFloat128NaN(a, b, status);
7052 return a;
7054 if ( bExp == 0 ) {
7055 --expDiff;
7057 else {
7058 bSig0 |= LIT64( 0x0001000000000000 );
7060 shift128ExtraRightJamming(
7061 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
7062 zExp = aExp;
7064 else if ( expDiff < 0 ) {
7065 if ( bExp == 0x7FFF ) {
7066 if (bSig0 | bSig1) {
7067 return propagateFloat128NaN(a, b, status);
7069 return packFloat128( zSign, 0x7FFF, 0, 0 );
7071 if ( aExp == 0 ) {
7072 ++expDiff;
7074 else {
7075 aSig0 |= LIT64( 0x0001000000000000 );
7077 shift128ExtraRightJamming(
7078 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
7079 zExp = bExp;
7081 else {
7082 if ( aExp == 0x7FFF ) {
7083 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
7084 return propagateFloat128NaN(a, b, status);
7086 return a;
7088 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
7089 if ( aExp == 0 ) {
7090 if (status->flush_to_zero) {
7091 if (zSig0 | zSig1) {
7092 float_raise(float_flag_output_denormal, status);
7094 return packFloat128(zSign, 0, 0, 0);
7096 return packFloat128( zSign, 0, zSig0, zSig1 );
7098 zSig2 = 0;
7099 zSig0 |= LIT64( 0x0002000000000000 );
7100 zExp = aExp;
7101 goto shiftRight1;
7103 aSig0 |= LIT64( 0x0001000000000000 );
7104 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
7105 --zExp;
7106 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
7107 ++zExp;
7108 shiftRight1:
7109 shift128ExtraRightJamming(
7110 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
7111 roundAndPack:
7112 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
7116 /*----------------------------------------------------------------------------
7117 | Returns the result of subtracting the absolute values of the quadruple-
7118 | precision floating-point values `a' and `b'. If `zSign' is 1, the
7119 | difference is negated before being returned. `zSign' is ignored if the
7120 | result is a NaN. The subtraction is performed according to the IEC/IEEE
7121 | Standard for Binary Floating-Point Arithmetic.
7122 *----------------------------------------------------------------------------*/
7124 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
7125 float_status *status)
7127 int32_t aExp, bExp, zExp;
7128 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
7129 int32_t expDiff;
7131 aSig1 = extractFloat128Frac1( a );
7132 aSig0 = extractFloat128Frac0( a );
7133 aExp = extractFloat128Exp( a );
7134 bSig1 = extractFloat128Frac1( b );
7135 bSig0 = extractFloat128Frac0( b );
7136 bExp = extractFloat128Exp( b );
7137 expDiff = aExp - bExp;
7138 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
7139 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
7140 if ( 0 < expDiff ) goto aExpBigger;
7141 if ( expDiff < 0 ) goto bExpBigger;
7142 if ( aExp == 0x7FFF ) {
7143 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
7144 return propagateFloat128NaN(a, b, status);
7146 float_raise(float_flag_invalid, status);
7147 return float128_default_nan(status);
7149 if ( aExp == 0 ) {
7150 aExp = 1;
7151 bExp = 1;
7153 if ( bSig0 < aSig0 ) goto aBigger;
7154 if ( aSig0 < bSig0 ) goto bBigger;
7155 if ( bSig1 < aSig1 ) goto aBigger;
7156 if ( aSig1 < bSig1 ) goto bBigger;
7157 return packFloat128(status->float_rounding_mode == float_round_down,
7158 0, 0, 0);
7159 bExpBigger:
7160 if ( bExp == 0x7FFF ) {
7161 if (bSig0 | bSig1) {
7162 return propagateFloat128NaN(a, b, status);
7164 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
7166 if ( aExp == 0 ) {
7167 ++expDiff;
7169 else {
7170 aSig0 |= LIT64( 0x4000000000000000 );
7172 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
7173 bSig0 |= LIT64( 0x4000000000000000 );
7174 bBigger:
7175 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
7176 zExp = bExp;
7177 zSign ^= 1;
7178 goto normalizeRoundAndPack;
7179 aExpBigger:
7180 if ( aExp == 0x7FFF ) {
7181 if (aSig0 | aSig1) {
7182 return propagateFloat128NaN(a, b, status);
7184 return a;
7186 if ( bExp == 0 ) {
7187 --expDiff;
7189 else {
7190 bSig0 |= LIT64( 0x4000000000000000 );
7192 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
7193 aSig0 |= LIT64( 0x4000000000000000 );
7194 aBigger:
7195 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
7196 zExp = aExp;
7197 normalizeRoundAndPack:
7198 --zExp;
7199 return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
7200 status);
7204 /*----------------------------------------------------------------------------
7205 | Returns the result of adding the quadruple-precision floating-point values
7206 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
7207 | for Binary Floating-Point Arithmetic.
7208 *----------------------------------------------------------------------------*/
7210 float128 float128_add(float128 a, float128 b, float_status *status)
7212 flag aSign, bSign;
7214 aSign = extractFloat128Sign( a );
7215 bSign = extractFloat128Sign( b );
7216 if ( aSign == bSign ) {
7217 return addFloat128Sigs(a, b, aSign, status);
7219 else {
7220 return subFloat128Sigs(a, b, aSign, status);
7225 /*----------------------------------------------------------------------------
7226 | Returns the result of subtracting the quadruple-precision floating-point
7227 | values `a' and `b'. The operation is performed according to the IEC/IEEE
7228 | Standard for Binary Floating-Point Arithmetic.
7229 *----------------------------------------------------------------------------*/
7231 float128 float128_sub(float128 a, float128 b, float_status *status)
7233 flag aSign, bSign;
7235 aSign = extractFloat128Sign( a );
7236 bSign = extractFloat128Sign( b );
7237 if ( aSign == bSign ) {
7238 return subFloat128Sigs(a, b, aSign, status);
7240 else {
7241 return addFloat128Sigs(a, b, aSign, status);
7246 /*----------------------------------------------------------------------------
7247 | Returns the result of multiplying the quadruple-precision floating-point
7248 | values `a' and `b'. The operation is performed according to the IEC/IEEE
7249 | Standard for Binary Floating-Point Arithmetic.
7250 *----------------------------------------------------------------------------*/
7252 float128 float128_mul(float128 a, float128 b, float_status *status)
7254 flag aSign, bSign, zSign;
7255 int32_t aExp, bExp, zExp;
7256 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
7258 aSig1 = extractFloat128Frac1( a );
7259 aSig0 = extractFloat128Frac0( a );
7260 aExp = extractFloat128Exp( a );
7261 aSign = extractFloat128Sign( a );
7262 bSig1 = extractFloat128Frac1( b );
7263 bSig0 = extractFloat128Frac0( b );
7264 bExp = extractFloat128Exp( b );
7265 bSign = extractFloat128Sign( b );
7266 zSign = aSign ^ bSign;
7267 if ( aExp == 0x7FFF ) {
7268 if ( ( aSig0 | aSig1 )
7269 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
7270 return propagateFloat128NaN(a, b, status);
7272 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
7273 return packFloat128( zSign, 0x7FFF, 0, 0 );
7275 if ( bExp == 0x7FFF ) {
7276 if (bSig0 | bSig1) {
7277 return propagateFloat128NaN(a, b, status);
7279 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
7280 invalid:
7281 float_raise(float_flag_invalid, status);
7282 return float128_default_nan(status);
7284 return packFloat128( zSign, 0x7FFF, 0, 0 );
7286 if ( aExp == 0 ) {
7287 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
7288 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7290 if ( bExp == 0 ) {
7291 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
7292 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
7294 zExp = aExp + bExp - 0x4000;
7295 aSig0 |= LIT64( 0x0001000000000000 );
7296 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
7297 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
7298 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
7299 zSig2 |= ( zSig3 != 0 );
7300 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
7301 shift128ExtraRightJamming(
7302 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
7303 ++zExp;
7305 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
7309 /*----------------------------------------------------------------------------
7310 | Returns the result of dividing the quadruple-precision floating-point value
7311 | `a' by the corresponding value `b'. The operation is performed according to
7312 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7313 *----------------------------------------------------------------------------*/
7315 float128 float128_div(float128 a, float128 b, float_status *status)
7317 flag aSign, bSign, zSign;
7318 int32_t aExp, bExp, zExp;
7319 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
7320 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
7322 aSig1 = extractFloat128Frac1( a );
7323 aSig0 = extractFloat128Frac0( a );
7324 aExp = extractFloat128Exp( a );
7325 aSign = extractFloat128Sign( a );
7326 bSig1 = extractFloat128Frac1( b );
7327 bSig0 = extractFloat128Frac0( b );
7328 bExp = extractFloat128Exp( b );
7329 bSign = extractFloat128Sign( b );
7330 zSign = aSign ^ bSign;
7331 if ( aExp == 0x7FFF ) {
7332 if (aSig0 | aSig1) {
7333 return propagateFloat128NaN(a, b, status);
7335 if ( bExp == 0x7FFF ) {
7336 if (bSig0 | bSig1) {
7337 return propagateFloat128NaN(a, b, status);
7339 goto invalid;
7341 return packFloat128( zSign, 0x7FFF, 0, 0 );
7343 if ( bExp == 0x7FFF ) {
7344 if (bSig0 | bSig1) {
7345 return propagateFloat128NaN(a, b, status);
7347 return packFloat128( zSign, 0, 0, 0 );
7349 if ( bExp == 0 ) {
7350 if ( ( bSig0 | bSig1 ) == 0 ) {
7351 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
7352 invalid:
7353 float_raise(float_flag_invalid, status);
7354 return float128_default_nan(status);
7356 float_raise(float_flag_divbyzero, status);
7357 return packFloat128( zSign, 0x7FFF, 0, 0 );
7359 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
7361 if ( aExp == 0 ) {
7362 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
7363 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7365 zExp = aExp - bExp + 0x3FFD;
7366 shortShift128Left(
7367 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
7368 shortShift128Left(
7369 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
7370 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
7371 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
7372 ++zExp;
7374 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
7375 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
7376 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
7377 while ( (int64_t) rem0 < 0 ) {
7378 --zSig0;
7379 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
7381 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
7382 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
7383 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
7384 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
7385 while ( (int64_t) rem1 < 0 ) {
7386 --zSig1;
7387 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
7389 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
7391 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
7392 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
7396 /*----------------------------------------------------------------------------
7397 | Returns the remainder of the quadruple-precision floating-point value `a'
7398 | with respect to the corresponding value `b'. The operation is performed
7399 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7400 *----------------------------------------------------------------------------*/
7402 float128 float128_rem(float128 a, float128 b, float_status *status)
7404 flag aSign, zSign;
7405 int32_t aExp, bExp, expDiff;
7406 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
7407 uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
7408 int64_t sigMean0;
7410 aSig1 = extractFloat128Frac1( a );
7411 aSig0 = extractFloat128Frac0( a );
7412 aExp = extractFloat128Exp( a );
7413 aSign = extractFloat128Sign( a );
7414 bSig1 = extractFloat128Frac1( b );
7415 bSig0 = extractFloat128Frac0( b );
7416 bExp = extractFloat128Exp( b );
7417 if ( aExp == 0x7FFF ) {
7418 if ( ( aSig0 | aSig1 )
7419 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
7420 return propagateFloat128NaN(a, b, status);
7422 goto invalid;
7424 if ( bExp == 0x7FFF ) {
7425 if (bSig0 | bSig1) {
7426 return propagateFloat128NaN(a, b, status);
7428 return a;
7430 if ( bExp == 0 ) {
7431 if ( ( bSig0 | bSig1 ) == 0 ) {
7432 invalid:
7433 float_raise(float_flag_invalid, status);
7434 return float128_default_nan(status);
7436 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
7438 if ( aExp == 0 ) {
7439 if ( ( aSig0 | aSig1 ) == 0 ) return a;
7440 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7442 expDiff = aExp - bExp;
7443 if ( expDiff < -1 ) return a;
7444 shortShift128Left(
7445 aSig0 | LIT64( 0x0001000000000000 ),
7446 aSig1,
7447 15 - ( expDiff < 0 ),
7448 &aSig0,
7449 &aSig1
7451 shortShift128Left(
7452 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
7453 q = le128( bSig0, bSig1, aSig0, aSig1 );
7454 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
7455 expDiff -= 64;
7456 while ( 0 < expDiff ) {
7457 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
7458 q = ( 4 < q ) ? q - 4 : 0;
7459 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
7460 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
7461 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
7462 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
7463 expDiff -= 61;
7465 if ( -64 < expDiff ) {
7466 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
7467 q = ( 4 < q ) ? q - 4 : 0;
7468 q >>= - expDiff;
7469 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
7470 expDiff += 52;
7471 if ( expDiff < 0 ) {
7472 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
7474 else {
7475 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
7477 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
7478 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
7480 else {
7481 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
7482 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
7484 do {
7485 alternateASig0 = aSig0;
7486 alternateASig1 = aSig1;
7487 ++q;
7488 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
7489 } while ( 0 <= (int64_t) aSig0 );
7490 add128(
7491 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
7492 if ( ( sigMean0 < 0 )
7493 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
7494 aSig0 = alternateASig0;
7495 aSig1 = alternateASig1;
7497 zSign = ( (int64_t) aSig0 < 0 );
7498 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
7499 return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
7500 status);
7503 /*----------------------------------------------------------------------------
7504 | Returns the square root of the quadruple-precision floating-point value `a'.
7505 | The operation is performed according to the IEC/IEEE Standard for Binary
7506 | Floating-Point Arithmetic.
7507 *----------------------------------------------------------------------------*/
7509 float128 float128_sqrt(float128 a, float_status *status)
7511 flag aSign;
7512 int32_t aExp, zExp;
7513 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
7514 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
7516 aSig1 = extractFloat128Frac1( a );
7517 aSig0 = extractFloat128Frac0( a );
7518 aExp = extractFloat128Exp( a );
7519 aSign = extractFloat128Sign( a );
7520 if ( aExp == 0x7FFF ) {
7521 if (aSig0 | aSig1) {
7522 return propagateFloat128NaN(a, a, status);
7524 if ( ! aSign ) return a;
7525 goto invalid;
7527 if ( aSign ) {
7528 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
7529 invalid:
7530 float_raise(float_flag_invalid, status);
7531 return float128_default_nan(status);
7533 if ( aExp == 0 ) {
7534 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
7535 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7537 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
7538 aSig0 |= LIT64( 0x0001000000000000 );
7539 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
7540 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
7541 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
7542 doubleZSig0 = zSig0<<1;
7543 mul64To128( zSig0, zSig0, &term0, &term1 );
7544 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
7545 while ( (int64_t) rem0 < 0 ) {
7546 --zSig0;
7547 doubleZSig0 -= 2;
7548 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
7550 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
7551 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
7552 if ( zSig1 == 0 ) zSig1 = 1;
7553 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
7554 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
7555 mul64To128( zSig1, zSig1, &term2, &term3 );
7556 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
7557 while ( (int64_t) rem1 < 0 ) {
7558 --zSig1;
7559 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
7560 term3 |= 1;
7561 term2 |= doubleZSig0;
7562 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
7564 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
7566 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
7567 return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
7571 /*----------------------------------------------------------------------------
7572 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
7573 | the corresponding value `b', and 0 otherwise. The invalid exception is
7574 | raised if either operand is a NaN. Otherwise, the comparison is performed
7575 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7576 *----------------------------------------------------------------------------*/
7578 int float128_eq(float128 a, float128 b, float_status *status)
7581 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
7582 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7583 || ( ( extractFloat128Exp( b ) == 0x7FFF )
7584 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7586 float_raise(float_flag_invalid, status);
7587 return 0;
7589 return
7590 ( a.low == b.low )
7591 && ( ( a.high == b.high )
7592 || ( ( a.low == 0 )
7593 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
7598 /*----------------------------------------------------------------------------
7599 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7600 | or equal to the corresponding value `b', and 0 otherwise. The invalid
7601 | exception is raised if either operand is a NaN. The comparison is performed
7602 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7603 *----------------------------------------------------------------------------*/
7605 int float128_le(float128 a, float128 b, float_status *status)
7607 flag aSign, bSign;
7609 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
7610 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7611 || ( ( extractFloat128Exp( b ) == 0x7FFF )
7612 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7614 float_raise(float_flag_invalid, status);
7615 return 0;
7617 aSign = extractFloat128Sign( a );
7618 bSign = extractFloat128Sign( b );
7619 if ( aSign != bSign ) {
7620 return
7621 aSign
7622 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7623 == 0 );
7625 return
7626 aSign ? le128( b.high, b.low, a.high, a.low )
7627 : le128( a.high, a.low, b.high, b.low );
7631 /*----------------------------------------------------------------------------
7632 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7633 | the corresponding value `b', and 0 otherwise. The invalid exception is
7634 | raised if either operand is a NaN. The comparison is performed according
7635 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7636 *----------------------------------------------------------------------------*/
7638 int float128_lt(float128 a, float128 b, float_status *status)
7640 flag aSign, bSign;
7642 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
7643 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7644 || ( ( extractFloat128Exp( b ) == 0x7FFF )
7645 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7647 float_raise(float_flag_invalid, status);
7648 return 0;
7650 aSign = extractFloat128Sign( a );
7651 bSign = extractFloat128Sign( b );
7652 if ( aSign != bSign ) {
7653 return
7654 aSign
7655 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7656 != 0 );
7658 return
7659 aSign ? lt128( b.high, b.low, a.high, a.low )
7660 : lt128( a.high, a.low, b.high, b.low );
7664 /*----------------------------------------------------------------------------
7665 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7666 | be compared, and 0 otherwise. The invalid exception is raised if either
7667 | operand is a NaN. The comparison is performed according to the IEC/IEEE
7668 | Standard for Binary Floating-Point Arithmetic.
7669 *----------------------------------------------------------------------------*/
7671 int float128_unordered(float128 a, float128 b, float_status *status)
7673 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
7674 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7675 || ( ( extractFloat128Exp( b ) == 0x7FFF )
7676 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7678 float_raise(float_flag_invalid, status);
7679 return 1;
7681 return 0;
7684 /*----------------------------------------------------------------------------
7685 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
7686 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
7687 | exception. The comparison is performed according to the IEC/IEEE Standard
7688 | for Binary Floating-Point Arithmetic.
7689 *----------------------------------------------------------------------------*/
7691 int float128_eq_quiet(float128 a, float128 b, float_status *status)
7694 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
7695 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7696 || ( ( extractFloat128Exp( b ) == 0x7FFF )
7697 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7699 if (float128_is_signaling_nan(a, status)
7700 || float128_is_signaling_nan(b, status)) {
7701 float_raise(float_flag_invalid, status);
7703 return 0;
7705 return
7706 ( a.low == b.low )
7707 && ( ( a.high == b.high )
7708 || ( ( a.low == 0 )
7709 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
7714 /*----------------------------------------------------------------------------
7715 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7716 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
7717 | cause an exception. Otherwise, the comparison is performed according to the
7718 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7719 *----------------------------------------------------------------------------*/
7721 int float128_le_quiet(float128 a, float128 b, float_status *status)
7723 flag aSign, bSign;
7725 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
7726 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7727 || ( ( extractFloat128Exp( b ) == 0x7FFF )
7728 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7730 if (float128_is_signaling_nan(a, status)
7731 || float128_is_signaling_nan(b, status)) {
7732 float_raise(float_flag_invalid, status);
7734 return 0;
7736 aSign = extractFloat128Sign( a );
7737 bSign = extractFloat128Sign( b );
7738 if ( aSign != bSign ) {
7739 return
7740 aSign
7741 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7742 == 0 );
7744 return
7745 aSign ? le128( b.high, b.low, a.high, a.low )
7746 : le128( a.high, a.low, b.high, b.low );
7750 /*----------------------------------------------------------------------------
7751 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7752 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
7753 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
7754 | Standard for Binary Floating-Point Arithmetic.
7755 *----------------------------------------------------------------------------*/
7757 int float128_lt_quiet(float128 a, float128 b, float_status *status)
7759 flag aSign, bSign;
7761 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
7762 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7763 || ( ( extractFloat128Exp( b ) == 0x7FFF )
7764 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7766 if (float128_is_signaling_nan(a, status)
7767 || float128_is_signaling_nan(b, status)) {
7768 float_raise(float_flag_invalid, status);
7770 return 0;
7772 aSign = extractFloat128Sign( a );
7773 bSign = extractFloat128Sign( b );
7774 if ( aSign != bSign ) {
7775 return
7776 aSign
7777 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7778 != 0 );
7780 return
7781 aSign ? lt128( b.high, b.low, a.high, a.low )
7782 : lt128( a.high, a.low, b.high, b.low );
7786 /*----------------------------------------------------------------------------
7787 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7788 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
7789 | comparison is performed according to the IEC/IEEE Standard for Binary
7790 | Floating-Point Arithmetic.
7791 *----------------------------------------------------------------------------*/
7793 int float128_unordered_quiet(float128 a, float128 b, float_status *status)
7795 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
7796 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7797 || ( ( extractFloat128Exp( b ) == 0x7FFF )
7798 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7800 if (float128_is_signaling_nan(a, status)
7801 || float128_is_signaling_nan(b, status)) {
7802 float_raise(float_flag_invalid, status);
7804 return 1;
7806 return 0;
7809 static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
7810 int is_quiet, float_status *status)
7812 flag aSign, bSign;
7814 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
7815 float_raise(float_flag_invalid, status);
7816 return float_relation_unordered;
7818 if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7819 ( extractFloatx80Frac( a )<<1 ) ) ||
7820 ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7821 ( extractFloatx80Frac( b )<<1 ) )) {
7822 if (!is_quiet ||
7823 floatx80_is_signaling_nan(a, status) ||
7824 floatx80_is_signaling_nan(b, status)) {
7825 float_raise(float_flag_invalid, status);
7827 return float_relation_unordered;
7829 aSign = extractFloatx80Sign( a );
7830 bSign = extractFloatx80Sign( b );
7831 if ( aSign != bSign ) {
7833 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7834 ( ( a.low | b.low ) == 0 ) ) {
7835 /* zero case */
7836 return float_relation_equal;
7837 } else {
7838 return 1 - (2 * aSign);
7840 } else {
7841 if (a.low == b.low && a.high == b.high) {
7842 return float_relation_equal;
7843 } else {
7844 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7849 int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
7851 return floatx80_compare_internal(a, b, 0, status);
7854 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
7856 return floatx80_compare_internal(a, b, 1, status);
7859 static inline int float128_compare_internal(float128 a, float128 b,
7860 int is_quiet, float_status *status)
7862 flag aSign, bSign;
7864 if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7865 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7866 ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7867 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7868 if (!is_quiet ||
7869 float128_is_signaling_nan(a, status) ||
7870 float128_is_signaling_nan(b, status)) {
7871 float_raise(float_flag_invalid, status);
7873 return float_relation_unordered;
7875 aSign = extractFloat128Sign( a );
7876 bSign = extractFloat128Sign( b );
7877 if ( aSign != bSign ) {
7878 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7879 /* zero case */
7880 return float_relation_equal;
7881 } else {
7882 return 1 - (2 * aSign);
7884 } else {
7885 if (a.low == b.low && a.high == b.high) {
7886 return float_relation_equal;
7887 } else {
7888 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7893 int float128_compare(float128 a, float128 b, float_status *status)
7895 return float128_compare_internal(a, b, 0, status);
7898 int float128_compare_quiet(float128 a, float128 b, float_status *status)
7900 return float128_compare_internal(a, b, 1, status);
7903 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7905 flag aSign;
7906 int32_t aExp;
7907 uint64_t aSig;
7909 if (floatx80_invalid_encoding(a)) {
7910 float_raise(float_flag_invalid, status);
7911 return floatx80_default_nan(status);
7913 aSig = extractFloatx80Frac( a );
7914 aExp = extractFloatx80Exp( a );
7915 aSign = extractFloatx80Sign( a );
7917 if ( aExp == 0x7FFF ) {
7918 if ( aSig<<1 ) {
7919 return propagateFloatx80NaN(a, a, status);
7921 return a;
7924 if (aExp == 0) {
7925 if (aSig == 0) {
7926 return a;
7928 aExp++;
7931 if (n > 0x10000) {
7932 n = 0x10000;
7933 } else if (n < -0x10000) {
7934 n = -0x10000;
7937 aExp += n;
7938 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7939 aSign, aExp, aSig, 0, status);
7942 float128 float128_scalbn(float128 a, int n, float_status *status)
7944 flag aSign;
7945 int32_t aExp;
7946 uint64_t aSig0, aSig1;
7948 aSig1 = extractFloat128Frac1( a );
7949 aSig0 = extractFloat128Frac0( a );
7950 aExp = extractFloat128Exp( a );
7951 aSign = extractFloat128Sign( a );
7952 if ( aExp == 0x7FFF ) {
7953 if ( aSig0 | aSig1 ) {
7954 return propagateFloat128NaN(a, a, status);
7956 return a;
7958 if (aExp != 0) {
7959 aSig0 |= LIT64( 0x0001000000000000 );
7960 } else if (aSig0 == 0 && aSig1 == 0) {
7961 return a;
7962 } else {
7963 aExp++;
7966 if (n > 0x10000) {
7967 n = 0x10000;
7968 } else if (n < -0x10000) {
7969 n = -0x10000;
7972 aExp += n - 1;
7973 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7974 , status);