target/ppc: Tidy helper_fadd, helper_fsub
[qemu.git] / fpu / softfloat.c
blob7d63cffdeb8614b83afc93fbfa0a97c92d5395d4
1 /*
2 * QEMU float support
4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
10 * the BSD license
11 * GPL-v2-or-later
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
44 ===============================================================================
47 /* BSD licensing:
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
85 #include "qemu/osdep.h"
86 #include "qemu/bitops.h"
87 #include "fpu/softfloat.h"
89 /* We only need stdlib for abort() */
91 /*----------------------------------------------------------------------------
92 | Primitive arithmetic functions, including multi-word arithmetic, and
93 | division and square root approximations. (Can be specialized to target if
94 | desired.)
95 *----------------------------------------------------------------------------*/
96 #include "fpu/softfloat-macros.h"
98 /*----------------------------------------------------------------------------
99 | Returns the fraction bits of the half-precision floating-point value `a'.
100 *----------------------------------------------------------------------------*/
102 static inline uint32_t extractFloat16Frac(float16 a)
104 return float16_val(a) & 0x3ff;
107 /*----------------------------------------------------------------------------
108 | Returns the exponent bits of the half-precision floating-point value `a'.
109 *----------------------------------------------------------------------------*/
111 static inline int extractFloat16Exp(float16 a)
113 return (float16_val(a) >> 10) & 0x1f;
116 /*----------------------------------------------------------------------------
117 | Returns the fraction bits of the single-precision floating-point value `a'.
118 *----------------------------------------------------------------------------*/
120 static inline uint32_t extractFloat32Frac(float32 a)
122 return float32_val(a) & 0x007FFFFF;
125 /*----------------------------------------------------------------------------
126 | Returns the exponent bits of the single-precision floating-point value `a'.
127 *----------------------------------------------------------------------------*/
129 static inline int extractFloat32Exp(float32 a)
131 return (float32_val(a) >> 23) & 0xFF;
134 /*----------------------------------------------------------------------------
135 | Returns the sign bit of the single-precision floating-point value `a'.
136 *----------------------------------------------------------------------------*/
138 static inline flag extractFloat32Sign(float32 a)
140 return float32_val(a) >> 31;
143 /*----------------------------------------------------------------------------
144 | Returns the fraction bits of the double-precision floating-point value `a'.
145 *----------------------------------------------------------------------------*/
147 static inline uint64_t extractFloat64Frac(float64 a)
149 return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF);
152 /*----------------------------------------------------------------------------
153 | Returns the exponent bits of the double-precision floating-point value `a'.
154 *----------------------------------------------------------------------------*/
156 static inline int extractFloat64Exp(float64 a)
158 return (float64_val(a) >> 52) & 0x7FF;
161 /*----------------------------------------------------------------------------
162 | Returns the sign bit of the double-precision floating-point value `a'.
163 *----------------------------------------------------------------------------*/
165 static inline flag extractFloat64Sign(float64 a)
167 return float64_val(a) >> 63;
171 * Classify a floating point number. Everything above float_class_qnan
172 * is a NaN so cls >= float_class_qnan is any NaN.
175 typedef enum __attribute__ ((__packed__)) {
176 float_class_unclassified,
177 float_class_zero,
178 float_class_normal,
179 float_class_inf,
180 float_class_qnan, /* all NaNs from here */
181 float_class_snan,
182 } FloatClass;
184 /* Simple helpers for checking if, or what kind of, NaN we have */
185 static inline __attribute__((unused)) bool is_nan(FloatClass c)
187 return unlikely(c >= float_class_qnan);
190 static inline __attribute__((unused)) bool is_snan(FloatClass c)
192 return c == float_class_snan;
195 static inline __attribute__((unused)) bool is_qnan(FloatClass c)
197 return c == float_class_qnan;
201 * Structure holding all of the decomposed parts of a float. The
202 * exponent is unbiased and the fraction is normalized. All
203 * calculations are done with a 64 bit fraction and then rounded as
204 * appropriate for the final format.
206 * Thanks to the packed FloatClass a decent compiler should be able to
207 * fit the whole structure into registers and avoid using the stack
208 * for parameter passing.
211 typedef struct {
212 uint64_t frac;
213 int32_t exp;
214 FloatClass cls;
215 bool sign;
216 } FloatParts;
218 #define DECOMPOSED_BINARY_POINT (64 - 2)
219 #define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT)
220 #define DECOMPOSED_OVERFLOW_BIT (DECOMPOSED_IMPLICIT_BIT << 1)
222 /* Structure holding all of the relevant parameters for a format.
223 * exp_size: the size of the exponent field
224 * exp_bias: the offset applied to the exponent field
225 * exp_max: the maximum normalised exponent
226 * frac_size: the size of the fraction field
227 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
228 * The following are computed based the size of fraction
229 * frac_lsb: least significant bit of fraction
230 * frac_lsbm1: the bit below the least significant bit (for rounding)
231 * round_mask/roundeven_mask: masks used for rounding
232 * The following optional modifiers are available:
233 * arm_althp: handle ARM Alternative Half Precision
235 typedef struct {
236 int exp_size;
237 int exp_bias;
238 int exp_max;
239 int frac_size;
240 int frac_shift;
241 uint64_t frac_lsb;
242 uint64_t frac_lsbm1;
243 uint64_t round_mask;
244 uint64_t roundeven_mask;
245 bool arm_althp;
246 } FloatFmt;
248 /* Expand fields based on the size of exponent and fraction */
249 #define FLOAT_PARAMS(E, F) \
250 .exp_size = E, \
251 .exp_bias = ((1 << E) - 1) >> 1, \
252 .exp_max = (1 << E) - 1, \
253 .frac_size = F, \
254 .frac_shift = DECOMPOSED_BINARY_POINT - F, \
255 .frac_lsb = 1ull << (DECOMPOSED_BINARY_POINT - F), \
256 .frac_lsbm1 = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1), \
257 .round_mask = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1, \
258 .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1
260 static const FloatFmt float16_params = {
261 FLOAT_PARAMS(5, 10)
264 static const FloatFmt float16_params_ahp = {
265 FLOAT_PARAMS(5, 10),
266 .arm_althp = true
269 static const FloatFmt float32_params = {
270 FLOAT_PARAMS(8, 23)
273 static const FloatFmt float64_params = {
274 FLOAT_PARAMS(11, 52)
277 /* Unpack a float to parts, but do not canonicalize. */
278 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
280 const int sign_pos = fmt.frac_size + fmt.exp_size;
282 return (FloatParts) {
283 .cls = float_class_unclassified,
284 .sign = extract64(raw, sign_pos, 1),
285 .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
286 .frac = extract64(raw, 0, fmt.frac_size),
290 static inline FloatParts float16_unpack_raw(float16 f)
292 return unpack_raw(float16_params, f);
295 static inline FloatParts float32_unpack_raw(float32 f)
297 return unpack_raw(float32_params, f);
300 static inline FloatParts float64_unpack_raw(float64 f)
302 return unpack_raw(float64_params, f);
305 /* Pack a float from parts, but do not canonicalize. */
306 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
308 const int sign_pos = fmt.frac_size + fmt.exp_size;
309 uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
310 return deposit64(ret, sign_pos, 1, p.sign);
313 static inline float16 float16_pack_raw(FloatParts p)
315 return make_float16(pack_raw(float16_params, p));
318 static inline float32 float32_pack_raw(FloatParts p)
320 return make_float32(pack_raw(float32_params, p));
323 static inline float64 float64_pack_raw(FloatParts p)
325 return make_float64(pack_raw(float64_params, p));
328 /*----------------------------------------------------------------------------
329 | Functions and definitions to determine: (1) whether tininess for underflow
330 | is detected before or after rounding by default, (2) what (if anything)
331 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
332 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
333 | are propagated from function inputs to output. These details are target-
334 | specific.
335 *----------------------------------------------------------------------------*/
336 #include "softfloat-specialize.h"
338 /* Canonicalize EXP and FRAC, setting CLS. */
339 static FloatParts canonicalize(FloatParts part, const FloatFmt *parm,
340 float_status *status)
342 if (part.exp == parm->exp_max && !parm->arm_althp) {
343 if (part.frac == 0) {
344 part.cls = float_class_inf;
345 } else {
346 part.frac <<= parm->frac_shift;
347 part.cls = (parts_is_snan_frac(part.frac, status)
348 ? float_class_snan : float_class_qnan);
350 } else if (part.exp == 0) {
351 if (likely(part.frac == 0)) {
352 part.cls = float_class_zero;
353 } else if (status->flush_inputs_to_zero) {
354 float_raise(float_flag_input_denormal, status);
355 part.cls = float_class_zero;
356 part.frac = 0;
357 } else {
358 int shift = clz64(part.frac) - 1;
359 part.cls = float_class_normal;
360 part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
361 part.frac <<= shift;
363 } else {
364 part.cls = float_class_normal;
365 part.exp -= parm->exp_bias;
366 part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
368 return part;
371 /* Round and uncanonicalize a floating-point number by parts. There
372 * are FRAC_SHIFT bits that may require rounding at the bottom of the
373 * fraction; these bits will be removed. The exponent will be biased
374 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
377 static FloatParts round_canonical(FloatParts p, float_status *s,
378 const FloatFmt *parm)
380 const uint64_t frac_lsbm1 = parm->frac_lsbm1;
381 const uint64_t round_mask = parm->round_mask;
382 const uint64_t roundeven_mask = parm->roundeven_mask;
383 const int exp_max = parm->exp_max;
384 const int frac_shift = parm->frac_shift;
385 uint64_t frac, inc;
386 int exp, flags = 0;
387 bool overflow_norm;
389 frac = p.frac;
390 exp = p.exp;
392 switch (p.cls) {
393 case float_class_normal:
394 switch (s->float_rounding_mode) {
395 case float_round_nearest_even:
396 overflow_norm = false;
397 inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
398 break;
399 case float_round_ties_away:
400 overflow_norm = false;
401 inc = frac_lsbm1;
402 break;
403 case float_round_to_zero:
404 overflow_norm = true;
405 inc = 0;
406 break;
407 case float_round_up:
408 inc = p.sign ? 0 : round_mask;
409 overflow_norm = p.sign;
410 break;
411 case float_round_down:
412 inc = p.sign ? round_mask : 0;
413 overflow_norm = !p.sign;
414 break;
415 default:
416 g_assert_not_reached();
419 exp += parm->exp_bias;
420 if (likely(exp > 0)) {
421 if (frac & round_mask) {
422 flags |= float_flag_inexact;
423 frac += inc;
424 if (frac & DECOMPOSED_OVERFLOW_BIT) {
425 frac >>= 1;
426 exp++;
429 frac >>= frac_shift;
431 if (parm->arm_althp) {
432 /* ARM Alt HP eschews Inf and NaN for a wider exponent. */
433 if (unlikely(exp > exp_max)) {
434 /* Overflow. Return the maximum normal. */
435 flags = float_flag_invalid;
436 exp = exp_max;
437 frac = -1;
439 } else if (unlikely(exp >= exp_max)) {
440 flags |= float_flag_overflow | float_flag_inexact;
441 if (overflow_norm) {
442 exp = exp_max - 1;
443 frac = -1;
444 } else {
445 p.cls = float_class_inf;
446 goto do_inf;
449 } else if (s->flush_to_zero) {
450 flags |= float_flag_output_denormal;
451 p.cls = float_class_zero;
452 goto do_zero;
453 } else {
454 bool is_tiny = (s->float_detect_tininess
455 == float_tininess_before_rounding)
456 || (exp < 0)
457 || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);
459 shift64RightJamming(frac, 1 - exp, &frac);
460 if (frac & round_mask) {
461 /* Need to recompute round-to-even. */
462 if (s->float_rounding_mode == float_round_nearest_even) {
463 inc = ((frac & roundeven_mask) != frac_lsbm1
464 ? frac_lsbm1 : 0);
466 flags |= float_flag_inexact;
467 frac += inc;
470 exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
471 frac >>= frac_shift;
473 if (is_tiny && (flags & float_flag_inexact)) {
474 flags |= float_flag_underflow;
476 if (exp == 0 && frac == 0) {
477 p.cls = float_class_zero;
480 break;
482 case float_class_zero:
483 do_zero:
484 exp = 0;
485 frac = 0;
486 break;
488 case float_class_inf:
489 do_inf:
490 assert(!parm->arm_althp);
491 exp = exp_max;
492 frac = 0;
493 break;
495 case float_class_qnan:
496 case float_class_snan:
497 assert(!parm->arm_althp);
498 exp = exp_max;
499 frac >>= parm->frac_shift;
500 break;
502 default:
503 g_assert_not_reached();
506 float_raise(flags, s);
507 p.exp = exp;
508 p.frac = frac;
509 return p;
512 /* Explicit FloatFmt version */
513 static FloatParts float16a_unpack_canonical(float16 f, float_status *s,
514 const FloatFmt *params)
516 return canonicalize(float16_unpack_raw(f), params, s);
519 static FloatParts float16_unpack_canonical(float16 f, float_status *s)
521 return float16a_unpack_canonical(f, s, &float16_params);
524 static float16 float16a_round_pack_canonical(FloatParts p, float_status *s,
525 const FloatFmt *params)
527 return float16_pack_raw(round_canonical(p, s, params));
530 static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
532 return float16a_round_pack_canonical(p, s, &float16_params);
535 static FloatParts float32_unpack_canonical(float32 f, float_status *s)
537 return canonicalize(float32_unpack_raw(f), &float32_params, s);
540 static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
542 return float32_pack_raw(round_canonical(p, s, &float32_params));
545 static FloatParts float64_unpack_canonical(float64 f, float_status *s)
547 return canonicalize(float64_unpack_raw(f), &float64_params, s);
550 static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
552 return float64_pack_raw(round_canonical(p, s, &float64_params));
555 static FloatParts return_nan(FloatParts a, float_status *s)
557 switch (a.cls) {
558 case float_class_snan:
559 s->float_exception_flags |= float_flag_invalid;
560 a = parts_silence_nan(a, s);
561 /* fall through */
562 case float_class_qnan:
563 if (s->default_nan_mode) {
564 return parts_default_nan(s);
566 break;
568 default:
569 g_assert_not_reached();
571 return a;
574 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
576 if (is_snan(a.cls) || is_snan(b.cls)) {
577 s->float_exception_flags |= float_flag_invalid;
580 if (s->default_nan_mode) {
581 return parts_default_nan(s);
582 } else {
583 if (pickNaN(a.cls, b.cls,
584 a.frac > b.frac ||
585 (a.frac == b.frac && a.sign < b.sign))) {
586 a = b;
588 if (is_snan(a.cls)) {
589 return parts_silence_nan(a, s);
592 return a;
595 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
596 bool inf_zero, float_status *s)
598 int which;
600 if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
601 s->float_exception_flags |= float_flag_invalid;
604 which = pickNaNMulAdd(a.cls, b.cls, c.cls, inf_zero, s);
606 if (s->default_nan_mode) {
607 /* Note that this check is after pickNaNMulAdd so that function
608 * has an opportunity to set the Invalid flag.
610 which = 3;
613 switch (which) {
614 case 0:
615 break;
616 case 1:
617 a = b;
618 break;
619 case 2:
620 a = c;
621 break;
622 case 3:
623 return parts_default_nan(s);
624 default:
625 g_assert_not_reached();
628 if (is_snan(a.cls)) {
629 return parts_silence_nan(a, s);
631 return a;
635 * Returns the result of adding or subtracting the values of the
636 * floating-point values `a' and `b'. The operation is performed
637 * according to the IEC/IEEE Standard for Binary Floating-Point
638 * Arithmetic.
641 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
642 float_status *s)
644 bool a_sign = a.sign;
645 bool b_sign = b.sign ^ subtract;
647 if (a_sign != b_sign) {
648 /* Subtraction */
650 if (a.cls == float_class_normal && b.cls == float_class_normal) {
651 if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
652 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
653 a.frac = a.frac - b.frac;
654 } else {
655 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
656 a.frac = b.frac - a.frac;
657 a.exp = b.exp;
658 a_sign ^= 1;
661 if (a.frac == 0) {
662 a.cls = float_class_zero;
663 a.sign = s->float_rounding_mode == float_round_down;
664 } else {
665 int shift = clz64(a.frac) - 1;
666 a.frac = a.frac << shift;
667 a.exp = a.exp - shift;
668 a.sign = a_sign;
670 return a;
672 if (is_nan(a.cls) || is_nan(b.cls)) {
673 return pick_nan(a, b, s);
675 if (a.cls == float_class_inf) {
676 if (b.cls == float_class_inf) {
677 float_raise(float_flag_invalid, s);
678 return parts_default_nan(s);
680 return a;
682 if (a.cls == float_class_zero && b.cls == float_class_zero) {
683 a.sign = s->float_rounding_mode == float_round_down;
684 return a;
686 if (a.cls == float_class_zero || b.cls == float_class_inf) {
687 b.sign = a_sign ^ 1;
688 return b;
690 if (b.cls == float_class_zero) {
691 return a;
693 } else {
694 /* Addition */
695 if (a.cls == float_class_normal && b.cls == float_class_normal) {
696 if (a.exp > b.exp) {
697 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
698 } else if (a.exp < b.exp) {
699 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
700 a.exp = b.exp;
702 a.frac += b.frac;
703 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
704 shift64RightJamming(a.frac, 1, &a.frac);
705 a.exp += 1;
707 return a;
709 if (is_nan(a.cls) || is_nan(b.cls)) {
710 return pick_nan(a, b, s);
712 if (a.cls == float_class_inf || b.cls == float_class_zero) {
713 return a;
715 if (b.cls == float_class_inf || a.cls == float_class_zero) {
716 b.sign = b_sign;
717 return b;
720 g_assert_not_reached();
724 * Returns the result of adding or subtracting the floating-point
725 * values `a' and `b'. The operation is performed according to the
726 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
729 float16 __attribute__((flatten)) float16_add(float16 a, float16 b,
730 float_status *status)
732 FloatParts pa = float16_unpack_canonical(a, status);
733 FloatParts pb = float16_unpack_canonical(b, status);
734 FloatParts pr = addsub_floats(pa, pb, false, status);
736 return float16_round_pack_canonical(pr, status);
739 float32 __attribute__((flatten)) float32_add(float32 a, float32 b,
740 float_status *status)
742 FloatParts pa = float32_unpack_canonical(a, status);
743 FloatParts pb = float32_unpack_canonical(b, status);
744 FloatParts pr = addsub_floats(pa, pb, false, status);
746 return float32_round_pack_canonical(pr, status);
749 float64 __attribute__((flatten)) float64_add(float64 a, float64 b,
750 float_status *status)
752 FloatParts pa = float64_unpack_canonical(a, status);
753 FloatParts pb = float64_unpack_canonical(b, status);
754 FloatParts pr = addsub_floats(pa, pb, false, status);
756 return float64_round_pack_canonical(pr, status);
759 float16 __attribute__((flatten)) float16_sub(float16 a, float16 b,
760 float_status *status)
762 FloatParts pa = float16_unpack_canonical(a, status);
763 FloatParts pb = float16_unpack_canonical(b, status);
764 FloatParts pr = addsub_floats(pa, pb, true, status);
766 return float16_round_pack_canonical(pr, status);
769 float32 __attribute__((flatten)) float32_sub(float32 a, float32 b,
770 float_status *status)
772 FloatParts pa = float32_unpack_canonical(a, status);
773 FloatParts pb = float32_unpack_canonical(b, status);
774 FloatParts pr = addsub_floats(pa, pb, true, status);
776 return float32_round_pack_canonical(pr, status);
779 float64 __attribute__((flatten)) float64_sub(float64 a, float64 b,
780 float_status *status)
782 FloatParts pa = float64_unpack_canonical(a, status);
783 FloatParts pb = float64_unpack_canonical(b, status);
784 FloatParts pr = addsub_floats(pa, pb, true, status);
786 return float64_round_pack_canonical(pr, status);
790 * Returns the result of multiplying the floating-point values `a' and
791 * `b'. The operation is performed according to the IEC/IEEE Standard
792 * for Binary Floating-Point Arithmetic.
795 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
797 bool sign = a.sign ^ b.sign;
799 if (a.cls == float_class_normal && b.cls == float_class_normal) {
800 uint64_t hi, lo;
801 int exp = a.exp + b.exp;
803 mul64To128(a.frac, b.frac, &hi, &lo);
804 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
805 if (lo & DECOMPOSED_OVERFLOW_BIT) {
806 shift64RightJamming(lo, 1, &lo);
807 exp += 1;
810 /* Re-use a */
811 a.exp = exp;
812 a.sign = sign;
813 a.frac = lo;
814 return a;
816 /* handle all the NaN cases */
817 if (is_nan(a.cls) || is_nan(b.cls)) {
818 return pick_nan(a, b, s);
820 /* Inf * Zero == NaN */
821 if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
822 (a.cls == float_class_zero && b.cls == float_class_inf)) {
823 s->float_exception_flags |= float_flag_invalid;
824 return parts_default_nan(s);
826 /* Multiply by 0 or Inf */
827 if (a.cls == float_class_inf || a.cls == float_class_zero) {
828 a.sign = sign;
829 return a;
831 if (b.cls == float_class_inf || b.cls == float_class_zero) {
832 b.sign = sign;
833 return b;
835 g_assert_not_reached();
838 float16 __attribute__((flatten)) float16_mul(float16 a, float16 b,
839 float_status *status)
841 FloatParts pa = float16_unpack_canonical(a, status);
842 FloatParts pb = float16_unpack_canonical(b, status);
843 FloatParts pr = mul_floats(pa, pb, status);
845 return float16_round_pack_canonical(pr, status);
848 float32 __attribute__((flatten)) float32_mul(float32 a, float32 b,
849 float_status *status)
851 FloatParts pa = float32_unpack_canonical(a, status);
852 FloatParts pb = float32_unpack_canonical(b, status);
853 FloatParts pr = mul_floats(pa, pb, status);
855 return float32_round_pack_canonical(pr, status);
858 float64 __attribute__((flatten)) float64_mul(float64 a, float64 b,
859 float_status *status)
861 FloatParts pa = float64_unpack_canonical(a, status);
862 FloatParts pb = float64_unpack_canonical(b, status);
863 FloatParts pr = mul_floats(pa, pb, status);
865 return float64_round_pack_canonical(pr, status);
869 * Returns the result of multiplying the floating-point values `a' and
870 * `b' then adding 'c', with no intermediate rounding step after the
871 * multiplication. The operation is performed according to the
872 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
873 * The flags argument allows the caller to select negation of the
874 * addend, the intermediate product, or the final result. (The
875 * difference between this and having the caller do a separate
876 * negation is that negating externally will flip the sign bit on
877 * NaNs.)
880 static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
881 int flags, float_status *s)
883 bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
884 ((1 << float_class_inf) | (1 << float_class_zero));
885 bool p_sign;
886 bool sign_flip = flags & float_muladd_negate_result;
887 FloatClass p_class;
888 uint64_t hi, lo;
889 int p_exp;
891 /* It is implementation-defined whether the cases of (0,inf,qnan)
892 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
893 * they return if they do), so we have to hand this information
894 * off to the target-specific pick-a-NaN routine.
896 if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
897 return pick_nan_muladd(a, b, c, inf_zero, s);
900 if (inf_zero) {
901 s->float_exception_flags |= float_flag_invalid;
902 return parts_default_nan(s);
905 if (flags & float_muladd_negate_c) {
906 c.sign ^= 1;
909 p_sign = a.sign ^ b.sign;
911 if (flags & float_muladd_negate_product) {
912 p_sign ^= 1;
915 if (a.cls == float_class_inf || b.cls == float_class_inf) {
916 p_class = float_class_inf;
917 } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
918 p_class = float_class_zero;
919 } else {
920 p_class = float_class_normal;
923 if (c.cls == float_class_inf) {
924 if (p_class == float_class_inf && p_sign != c.sign) {
925 s->float_exception_flags |= float_flag_invalid;
926 return parts_default_nan(s);
927 } else {
928 a.cls = float_class_inf;
929 a.sign = c.sign ^ sign_flip;
930 return a;
934 if (p_class == float_class_inf) {
935 a.cls = float_class_inf;
936 a.sign = p_sign ^ sign_flip;
937 return a;
940 if (p_class == float_class_zero) {
941 if (c.cls == float_class_zero) {
942 if (p_sign != c.sign) {
943 p_sign = s->float_rounding_mode == float_round_down;
945 c.sign = p_sign;
946 } else if (flags & float_muladd_halve_result) {
947 c.exp -= 1;
949 c.sign ^= sign_flip;
950 return c;
953 /* a & b should be normals now... */
954 assert(a.cls == float_class_normal &&
955 b.cls == float_class_normal);
957 p_exp = a.exp + b.exp;
959 /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
960 * result.
962 mul64To128(a.frac, b.frac, &hi, &lo);
963 /* binary point now at bit 124 */
965 /* check for overflow */
966 if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
967 shift128RightJamming(hi, lo, 1, &hi, &lo);
968 p_exp += 1;
971 /* + add/sub */
972 if (c.cls == float_class_zero) {
973 /* move binary point back to 62 */
974 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
975 } else {
976 int exp_diff = p_exp - c.exp;
977 if (p_sign == c.sign) {
978 /* Addition */
979 if (exp_diff <= 0) {
980 shift128RightJamming(hi, lo,
981 DECOMPOSED_BINARY_POINT - exp_diff,
982 &hi, &lo);
983 lo += c.frac;
984 p_exp = c.exp;
985 } else {
986 uint64_t c_hi, c_lo;
987 /* shift c to the same binary point as the product (124) */
988 c_hi = c.frac >> 2;
989 c_lo = 0;
990 shift128RightJamming(c_hi, c_lo,
991 exp_diff,
992 &c_hi, &c_lo);
993 add128(hi, lo, c_hi, c_lo, &hi, &lo);
994 /* move binary point back to 62 */
995 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
998 if (lo & DECOMPOSED_OVERFLOW_BIT) {
999 shift64RightJamming(lo, 1, &lo);
1000 p_exp += 1;
1003 } else {
1004 /* Subtraction */
1005 uint64_t c_hi, c_lo;
1006 /* make C binary point match product at bit 124 */
1007 c_hi = c.frac >> 2;
1008 c_lo = 0;
1010 if (exp_diff <= 0) {
1011 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
1012 if (exp_diff == 0
1014 (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
1015 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1016 } else {
1017 sub128(c_hi, c_lo, hi, lo, &hi, &lo);
1018 p_sign ^= 1;
1019 p_exp = c.exp;
1021 } else {
1022 shift128RightJamming(c_hi, c_lo,
1023 exp_diff,
1024 &c_hi, &c_lo);
1025 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1028 if (hi == 0 && lo == 0) {
1029 a.cls = float_class_zero;
1030 a.sign = s->float_rounding_mode == float_round_down;
1031 a.sign ^= sign_flip;
1032 return a;
1033 } else {
1034 int shift;
1035 if (hi != 0) {
1036 shift = clz64(hi);
1037 } else {
1038 shift = clz64(lo) + 64;
1040 /* Normalizing to a binary point of 124 is the
1041 correct adjust for the exponent. However since we're
1042 shifting, we might as well put the binary point back
1043 at 62 where we really want it. Therefore shift as
1044 if we're leaving 1 bit at the top of the word, but
1045 adjust the exponent as if we're leaving 3 bits. */
1046 shift -= 1;
1047 if (shift >= 64) {
1048 lo = lo << (shift - 64);
1049 } else {
1050 hi = (hi << shift) | (lo >> (64 - shift));
1051 lo = hi | ((lo << shift) != 0);
1053 p_exp -= shift - 2;
1058 if (flags & float_muladd_halve_result) {
1059 p_exp -= 1;
1062 /* finally prepare our result */
1063 a.cls = float_class_normal;
1064 a.sign = p_sign ^ sign_flip;
1065 a.exp = p_exp;
1066 a.frac = lo;
1068 return a;
1071 float16 __attribute__((flatten)) float16_muladd(float16 a, float16 b, float16 c,
1072 int flags, float_status *status)
1074 FloatParts pa = float16_unpack_canonical(a, status);
1075 FloatParts pb = float16_unpack_canonical(b, status);
1076 FloatParts pc = float16_unpack_canonical(c, status);
1077 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1079 return float16_round_pack_canonical(pr, status);
1082 float32 __attribute__((flatten)) float32_muladd(float32 a, float32 b, float32 c,
1083 int flags, float_status *status)
1085 FloatParts pa = float32_unpack_canonical(a, status);
1086 FloatParts pb = float32_unpack_canonical(b, status);
1087 FloatParts pc = float32_unpack_canonical(c, status);
1088 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1090 return float32_round_pack_canonical(pr, status);
1093 float64 __attribute__((flatten)) float64_muladd(float64 a, float64 b, float64 c,
1094 int flags, float_status *status)
1096 FloatParts pa = float64_unpack_canonical(a, status);
1097 FloatParts pb = float64_unpack_canonical(b, status);
1098 FloatParts pc = float64_unpack_canonical(c, status);
1099 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1101 return float64_round_pack_canonical(pr, status);
1105 * Returns the result of dividing the floating-point value `a' by the
1106 * corresponding value `b'. The operation is performed according to
1107 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1110 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
1112 bool sign = a.sign ^ b.sign;
1114 if (a.cls == float_class_normal && b.cls == float_class_normal) {
1115 uint64_t temp_lo, temp_hi;
1116 int exp = a.exp - b.exp;
1117 if (a.frac < b.frac) {
1118 exp -= 1;
1119 shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1,
1120 &temp_hi, &temp_lo);
1121 } else {
1122 shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT,
1123 &temp_hi, &temp_lo);
1125 /* LSB of quot is set if inexact which roundandpack will use
1126 * to set flags. Yet again we re-use a for the result */
1127 a.frac = div128To64(temp_lo, temp_hi, b.frac);
1128 a.sign = sign;
1129 a.exp = exp;
1130 return a;
1132 /* handle all the NaN cases */
1133 if (is_nan(a.cls) || is_nan(b.cls)) {
1134 return pick_nan(a, b, s);
1136 /* 0/0 or Inf/Inf */
1137 if (a.cls == b.cls
1139 (a.cls == float_class_inf || a.cls == float_class_zero)) {
1140 s->float_exception_flags |= float_flag_invalid;
1141 return parts_default_nan(s);
1143 /* Inf / x or 0 / x */
1144 if (a.cls == float_class_inf || a.cls == float_class_zero) {
1145 a.sign = sign;
1146 return a;
1148 /* Div 0 => Inf */
1149 if (b.cls == float_class_zero) {
1150 s->float_exception_flags |= float_flag_divbyzero;
1151 a.cls = float_class_inf;
1152 a.sign = sign;
1153 return a;
1155 /* Div by Inf */
1156 if (b.cls == float_class_inf) {
1157 a.cls = float_class_zero;
1158 a.sign = sign;
1159 return a;
1161 g_assert_not_reached();
1164 float16 float16_div(float16 a, float16 b, float_status *status)
1166 FloatParts pa = float16_unpack_canonical(a, status);
1167 FloatParts pb = float16_unpack_canonical(b, status);
1168 FloatParts pr = div_floats(pa, pb, status);
1170 return float16_round_pack_canonical(pr, status);
1173 float32 float32_div(float32 a, float32 b, float_status *status)
1175 FloatParts pa = float32_unpack_canonical(a, status);
1176 FloatParts pb = float32_unpack_canonical(b, status);
1177 FloatParts pr = div_floats(pa, pb, status);
1179 return float32_round_pack_canonical(pr, status);
1182 float64 float64_div(float64 a, float64 b, float_status *status)
1184 FloatParts pa = float64_unpack_canonical(a, status);
1185 FloatParts pb = float64_unpack_canonical(b, status);
1186 FloatParts pr = div_floats(pa, pb, status);
1188 return float64_round_pack_canonical(pr, status);
1192 * Float to Float conversions
1194 * Returns the result of converting one float format to another. The
1195 * conversion is performed according to the IEC/IEEE Standard for
1196 * Binary Floating-Point Arithmetic.
1198 * The float_to_float helper only needs to take care of raising
1199 * invalid exceptions and handling the conversion on NaNs.
1202 static FloatParts float_to_float(FloatParts a, const FloatFmt *dstf,
1203 float_status *s)
1205 if (dstf->arm_althp) {
1206 switch (a.cls) {
1207 case float_class_qnan:
1208 case float_class_snan:
1209 /* There is no NaN in the destination format. Raise Invalid
1210 * and return a zero with the sign of the input NaN.
1212 s->float_exception_flags |= float_flag_invalid;
1213 a.cls = float_class_zero;
1214 a.frac = 0;
1215 a.exp = 0;
1216 break;
1218 case float_class_inf:
1219 /* There is no Inf in the destination format. Raise Invalid
1220 * and return the maximum normal with the correct sign.
1222 s->float_exception_flags |= float_flag_invalid;
1223 a.cls = float_class_normal;
1224 a.exp = dstf->exp_max;
1225 a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift;
1226 break;
1228 default:
1229 break;
1231 } else if (is_nan(a.cls)) {
1232 if (is_snan(a.cls)) {
1233 s->float_exception_flags |= float_flag_invalid;
1234 a = parts_silence_nan(a, s);
1236 if (s->default_nan_mode) {
1237 return parts_default_nan(s);
1240 return a;
1243 float32 float16_to_float32(float16 a, bool ieee, float_status *s)
1245 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1246 FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1247 FloatParts pr = float_to_float(p, &float32_params, s);
1248 return float32_round_pack_canonical(pr, s);
1251 float64 float16_to_float64(float16 a, bool ieee, float_status *s)
1253 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1254 FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1255 FloatParts pr = float_to_float(p, &float64_params, s);
1256 return float64_round_pack_canonical(pr, s);
1259 float16 float32_to_float16(float32 a, bool ieee, float_status *s)
1261 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1262 FloatParts p = float32_unpack_canonical(a, s);
1263 FloatParts pr = float_to_float(p, fmt16, s);
1264 return float16a_round_pack_canonical(pr, s, fmt16);
1267 float64 float32_to_float64(float32 a, float_status *s)
1269 FloatParts p = float32_unpack_canonical(a, s);
1270 FloatParts pr = float_to_float(p, &float64_params, s);
1271 return float64_round_pack_canonical(pr, s);
1274 float16 float64_to_float16(float64 a, bool ieee, float_status *s)
1276 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1277 FloatParts p = float64_unpack_canonical(a, s);
1278 FloatParts pr = float_to_float(p, fmt16, s);
1279 return float16a_round_pack_canonical(pr, s, fmt16);
1282 float32 float64_to_float32(float64 a, float_status *s)
1284 FloatParts p = float64_unpack_canonical(a, s);
1285 FloatParts pr = float_to_float(p, &float32_params, s);
1286 return float32_round_pack_canonical(pr, s);
1290 * Rounds the floating-point value `a' to an integer, and returns the
1291 * result as a floating-point value. The operation is performed
1292 * according to the IEC/IEEE Standard for Binary Floating-Point
1293 * Arithmetic.
1296 static FloatParts round_to_int(FloatParts a, int rounding_mode, float_status *s)
1298 if (is_nan(a.cls)) {
1299 return return_nan(a, s);
1302 switch (a.cls) {
1303 case float_class_zero:
1304 case float_class_inf:
1305 case float_class_qnan:
1306 /* already "integral" */
1307 break;
1308 case float_class_normal:
1309 if (a.exp >= DECOMPOSED_BINARY_POINT) {
1310 /* already integral */
1311 break;
1313 if (a.exp < 0) {
1314 bool one;
1315 /* all fractional */
1316 s->float_exception_flags |= float_flag_inexact;
1317 switch (rounding_mode) {
1318 case float_round_nearest_even:
1319 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
1320 break;
1321 case float_round_ties_away:
1322 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
1323 break;
1324 case float_round_to_zero:
1325 one = false;
1326 break;
1327 case float_round_up:
1328 one = !a.sign;
1329 break;
1330 case float_round_down:
1331 one = a.sign;
1332 break;
1333 default:
1334 g_assert_not_reached();
1337 if (one) {
1338 a.frac = DECOMPOSED_IMPLICIT_BIT;
1339 a.exp = 0;
1340 } else {
1341 a.cls = float_class_zero;
1343 } else {
1344 uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
1345 uint64_t frac_lsbm1 = frac_lsb >> 1;
1346 uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
1347 uint64_t rnd_mask = rnd_even_mask >> 1;
1348 uint64_t inc;
1350 switch (rounding_mode) {
1351 case float_round_nearest_even:
1352 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
1353 break;
1354 case float_round_ties_away:
1355 inc = frac_lsbm1;
1356 break;
1357 case float_round_to_zero:
1358 inc = 0;
1359 break;
1360 case float_round_up:
1361 inc = a.sign ? 0 : rnd_mask;
1362 break;
1363 case float_round_down:
1364 inc = a.sign ? rnd_mask : 0;
1365 break;
1366 default:
1367 g_assert_not_reached();
1370 if (a.frac & rnd_mask) {
1371 s->float_exception_flags |= float_flag_inexact;
1372 a.frac += inc;
1373 a.frac &= ~rnd_mask;
1374 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
1375 a.frac >>= 1;
1376 a.exp++;
1380 break;
1381 default:
1382 g_assert_not_reached();
1384 return a;
1387 float16 float16_round_to_int(float16 a, float_status *s)
1389 FloatParts pa = float16_unpack_canonical(a, s);
1390 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1391 return float16_round_pack_canonical(pr, s);
1394 float32 float32_round_to_int(float32 a, float_status *s)
1396 FloatParts pa = float32_unpack_canonical(a, s);
1397 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1398 return float32_round_pack_canonical(pr, s);
1401 float64 float64_round_to_int(float64 a, float_status *s)
1403 FloatParts pa = float64_unpack_canonical(a, s);
1404 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1405 return float64_round_pack_canonical(pr, s);
1408 float64 float64_trunc_to_int(float64 a, float_status *s)
1410 FloatParts pa = float64_unpack_canonical(a, s);
1411 FloatParts pr = round_to_int(pa, float_round_to_zero, s);
1412 return float64_round_pack_canonical(pr, s);
1416 * Returns the result of converting the floating-point value `a' to
1417 * the two's complement integer format. The conversion is performed
1418 * according to the IEC/IEEE Standard for Binary Floating-Point
1419 * Arithmetic---which means in particular that the conversion is
1420 * rounded according to the current rounding mode. If `a' is a NaN,
1421 * the largest positive integer is returned. Otherwise, if the
1422 * conversion overflows, the largest integer with the same sign as `a'
1423 * is returned.
1426 static int64_t round_to_int_and_pack(FloatParts in, int rmode,
1427 int64_t min, int64_t max,
1428 float_status *s)
1430 uint64_t r;
1431 int orig_flags = get_float_exception_flags(s);
1432 FloatParts p = round_to_int(in, rmode, s);
1434 switch (p.cls) {
1435 case float_class_snan:
1436 case float_class_qnan:
1437 s->float_exception_flags = orig_flags | float_flag_invalid;
1438 return max;
1439 case float_class_inf:
1440 s->float_exception_flags = orig_flags | float_flag_invalid;
1441 return p.sign ? min : max;
1442 case float_class_zero:
1443 return 0;
1444 case float_class_normal:
1445 if (p.exp < DECOMPOSED_BINARY_POINT) {
1446 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1447 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1448 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1449 } else {
1450 r = UINT64_MAX;
1452 if (p.sign) {
1453 if (r <= -(uint64_t) min) {
1454 return -r;
1455 } else {
1456 s->float_exception_flags = orig_flags | float_flag_invalid;
1457 return min;
1459 } else {
1460 if (r <= max) {
1461 return r;
1462 } else {
1463 s->float_exception_flags = orig_flags | float_flag_invalid;
1464 return max;
1467 default:
1468 g_assert_not_reached();
1472 #define FLOAT_TO_INT(fsz, isz) \
1473 int ## isz ## _t float ## fsz ## _to_int ## isz(float ## fsz a, \
1474 float_status *s) \
1476 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1477 return round_to_int_and_pack(p, s->float_rounding_mode, \
1478 INT ## isz ## _MIN, INT ## isz ## _MAX,\
1479 s); \
1482 int ## isz ## _t float ## fsz ## _to_int ## isz ## _round_to_zero \
1483 (float ## fsz a, float_status *s) \
1485 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1486 return round_to_int_and_pack(p, float_round_to_zero, \
1487 INT ## isz ## _MIN, INT ## isz ## _MAX,\
1488 s); \
1491 FLOAT_TO_INT(16, 16)
1492 FLOAT_TO_INT(16, 32)
1493 FLOAT_TO_INT(16, 64)
1495 FLOAT_TO_INT(32, 16)
1496 FLOAT_TO_INT(32, 32)
1497 FLOAT_TO_INT(32, 64)
1499 FLOAT_TO_INT(64, 16)
1500 FLOAT_TO_INT(64, 32)
1501 FLOAT_TO_INT(64, 64)
1503 #undef FLOAT_TO_INT
1506 * Returns the result of converting the floating-point value `a' to
1507 * the unsigned integer format. The conversion is performed according
1508 * to the IEC/IEEE Standard for Binary Floating-Point
1509 * Arithmetic---which means in particular that the conversion is
1510 * rounded according to the current rounding mode. If `a' is a NaN,
1511 * the largest unsigned integer is returned. Otherwise, if the
1512 * conversion overflows, the largest unsigned integer is returned. If
1513 * the 'a' is negative, the result is rounded and zero is returned;
1514 * values that do not round to zero will raise the inexact exception
1515 * flag.
1518 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, uint64_t max,
1519 float_status *s)
1521 int orig_flags = get_float_exception_flags(s);
1522 FloatParts p = round_to_int(in, rmode, s);
1524 switch (p.cls) {
1525 case float_class_snan:
1526 case float_class_qnan:
1527 s->float_exception_flags = orig_flags | float_flag_invalid;
1528 return max;
1529 case float_class_inf:
1530 s->float_exception_flags = orig_flags | float_flag_invalid;
1531 return p.sign ? 0 : max;
1532 case float_class_zero:
1533 return 0;
1534 case float_class_normal:
1536 uint64_t r;
1537 if (p.sign) {
1538 s->float_exception_flags = orig_flags | float_flag_invalid;
1539 return 0;
1542 if (p.exp < DECOMPOSED_BINARY_POINT) {
1543 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1544 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1545 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1546 } else {
1547 s->float_exception_flags = orig_flags | float_flag_invalid;
1548 return max;
1551 /* For uint64 this will never trip, but if p.exp is too large
1552 * to shift a decomposed fraction we shall have exited via the
1553 * 3rd leg above.
1555 if (r > max) {
1556 s->float_exception_flags = orig_flags | float_flag_invalid;
1557 return max;
1558 } else {
1559 return r;
1562 default:
1563 g_assert_not_reached();
1567 #define FLOAT_TO_UINT(fsz, isz) \
1568 uint ## isz ## _t float ## fsz ## _to_uint ## isz(float ## fsz a, \
1569 float_status *s) \
1571 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1572 return round_to_uint_and_pack(p, s->float_rounding_mode, \
1573 UINT ## isz ## _MAX, s); \
1576 uint ## isz ## _t float ## fsz ## _to_uint ## isz ## _round_to_zero \
1577 (float ## fsz a, float_status *s) \
1579 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1580 return round_to_uint_and_pack(p, float_round_to_zero, \
1581 UINT ## isz ## _MAX, s); \
1584 FLOAT_TO_UINT(16, 16)
1585 FLOAT_TO_UINT(16, 32)
1586 FLOAT_TO_UINT(16, 64)
1588 FLOAT_TO_UINT(32, 16)
1589 FLOAT_TO_UINT(32, 32)
1590 FLOAT_TO_UINT(32, 64)
1592 FLOAT_TO_UINT(64, 16)
1593 FLOAT_TO_UINT(64, 32)
1594 FLOAT_TO_UINT(64, 64)
1596 #undef FLOAT_TO_UINT
1599 * Integer to float conversions
1601 * Returns the result of converting the two's complement integer `a'
1602 * to the floating-point format. The conversion is performed according
1603 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1606 static FloatParts int_to_float(int64_t a, float_status *status)
1608 FloatParts r = {};
1609 if (a == 0) {
1610 r.cls = float_class_zero;
1611 r.sign = false;
1612 } else if (a == (1ULL << 63)) {
1613 r.cls = float_class_normal;
1614 r.sign = true;
1615 r.frac = DECOMPOSED_IMPLICIT_BIT;
1616 r.exp = 63;
1617 } else {
1618 uint64_t f;
1619 if (a < 0) {
1620 f = -a;
1621 r.sign = true;
1622 } else {
1623 f = a;
1624 r.sign = false;
1626 int shift = clz64(f) - 1;
1627 r.cls = float_class_normal;
1628 r.exp = (DECOMPOSED_BINARY_POINT - shift);
1629 r.frac = f << shift;
1632 return r;
1635 float16 int64_to_float16(int64_t a, float_status *status)
1637 FloatParts pa = int_to_float(a, status);
1638 return float16_round_pack_canonical(pa, status);
1641 float16 int32_to_float16(int32_t a, float_status *status)
1643 return int64_to_float16(a, status);
1646 float16 int16_to_float16(int16_t a, float_status *status)
1648 return int64_to_float16(a, status);
1651 float32 int64_to_float32(int64_t a, float_status *status)
1653 FloatParts pa = int_to_float(a, status);
1654 return float32_round_pack_canonical(pa, status);
1657 float32 int32_to_float32(int32_t a, float_status *status)
1659 return int64_to_float32(a, status);
1662 float32 int16_to_float32(int16_t a, float_status *status)
1664 return int64_to_float32(a, status);
1667 float64 int64_to_float64(int64_t a, float_status *status)
1669 FloatParts pa = int_to_float(a, status);
1670 return float64_round_pack_canonical(pa, status);
1673 float64 int32_to_float64(int32_t a, float_status *status)
1675 return int64_to_float64(a, status);
1678 float64 int16_to_float64(int16_t a, float_status *status)
1680 return int64_to_float64(a, status);
1685 * Unsigned Integer to float conversions
1687 * Returns the result of converting the unsigned integer `a' to the
1688 * floating-point format. The conversion is performed according to the
1689 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1692 static FloatParts uint_to_float(uint64_t a, float_status *status)
1694 FloatParts r = { .sign = false};
1696 if (a == 0) {
1697 r.cls = float_class_zero;
1698 } else {
1699 int spare_bits = clz64(a) - 1;
1700 r.cls = float_class_normal;
1701 r.exp = DECOMPOSED_BINARY_POINT - spare_bits;
1702 if (spare_bits < 0) {
1703 shift64RightJamming(a, -spare_bits, &a);
1704 r.frac = a;
1705 } else {
1706 r.frac = a << spare_bits;
1710 return r;
1713 float16 uint64_to_float16(uint64_t a, float_status *status)
1715 FloatParts pa = uint_to_float(a, status);
1716 return float16_round_pack_canonical(pa, status);
1719 float16 uint32_to_float16(uint32_t a, float_status *status)
1721 return uint64_to_float16(a, status);
1724 float16 uint16_to_float16(uint16_t a, float_status *status)
1726 return uint64_to_float16(a, status);
1729 float32 uint64_to_float32(uint64_t a, float_status *status)
1731 FloatParts pa = uint_to_float(a, status);
1732 return float32_round_pack_canonical(pa, status);
1735 float32 uint32_to_float32(uint32_t a, float_status *status)
1737 return uint64_to_float32(a, status);
1740 float32 uint16_to_float32(uint16_t a, float_status *status)
1742 return uint64_to_float32(a, status);
1745 float64 uint64_to_float64(uint64_t a, float_status *status)
1747 FloatParts pa = uint_to_float(a, status);
1748 return float64_round_pack_canonical(pa, status);
1751 float64 uint32_to_float64(uint32_t a, float_status *status)
1753 return uint64_to_float64(a, status);
1756 float64 uint16_to_float64(uint16_t a, float_status *status)
1758 return uint64_to_float64(a, status);
1761 /* Float Min/Max */
1762 /* min() and max() functions. These can't be implemented as
1763 * 'compare and pick one input' because that would mishandle
1764 * NaNs and +0 vs -0.
1766 * minnum() and maxnum() functions. These are similar to the min()
1767 * and max() functions but if one of the arguments is a QNaN and
1768 * the other is numerical then the numerical argument is returned.
1769 * SNaNs will get quietened before being returned.
1770 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
1771 * and maxNum() operations. min() and max() are the typical min/max
1772 * semantics provided by many CPUs which predate that specification.
1774 * minnummag() and maxnummag() functions correspond to minNumMag()
1775 * and minNumMag() from the IEEE-754 2008.
1777 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin,
1778 bool ieee, bool ismag, float_status *s)
1780 if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
1781 if (ieee) {
1782 /* Takes two floating-point values `a' and `b', one of
1783 * which is a NaN, and returns the appropriate NaN
1784 * result. If either `a' or `b' is a signaling NaN,
1785 * the invalid exception is raised.
1787 if (is_snan(a.cls) || is_snan(b.cls)) {
1788 return pick_nan(a, b, s);
1789 } else if (is_nan(a.cls) && !is_nan(b.cls)) {
1790 return b;
1791 } else if (is_nan(b.cls) && !is_nan(a.cls)) {
1792 return a;
1795 return pick_nan(a, b, s);
1796 } else {
1797 int a_exp, b_exp;
1799 switch (a.cls) {
1800 case float_class_normal:
1801 a_exp = a.exp;
1802 break;
1803 case float_class_inf:
1804 a_exp = INT_MAX;
1805 break;
1806 case float_class_zero:
1807 a_exp = INT_MIN;
1808 break;
1809 default:
1810 g_assert_not_reached();
1811 break;
1813 switch (b.cls) {
1814 case float_class_normal:
1815 b_exp = b.exp;
1816 break;
1817 case float_class_inf:
1818 b_exp = INT_MAX;
1819 break;
1820 case float_class_zero:
1821 b_exp = INT_MIN;
1822 break;
1823 default:
1824 g_assert_not_reached();
1825 break;
1828 if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
1829 bool a_less = a_exp < b_exp;
1830 if (a_exp == b_exp) {
1831 a_less = a.frac < b.frac;
1833 return a_less ^ ismin ? b : a;
1836 if (a.sign == b.sign) {
1837 bool a_less = a_exp < b_exp;
1838 if (a_exp == b_exp) {
1839 a_less = a.frac < b.frac;
1841 return a.sign ^ a_less ^ ismin ? b : a;
1842 } else {
1843 return a.sign ^ ismin ? b : a;
1848 #define MINMAX(sz, name, ismin, isiee, ismag) \
1849 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \
1850 float_status *s) \
1852 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
1853 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
1854 FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \
1856 return float ## sz ## _round_pack_canonical(pr, s); \
1859 MINMAX(16, min, true, false, false)
1860 MINMAX(16, minnum, true, true, false)
1861 MINMAX(16, minnummag, true, true, true)
1862 MINMAX(16, max, false, false, false)
1863 MINMAX(16, maxnum, false, true, false)
1864 MINMAX(16, maxnummag, false, true, true)
1866 MINMAX(32, min, true, false, false)
1867 MINMAX(32, minnum, true, true, false)
1868 MINMAX(32, minnummag, true, true, true)
1869 MINMAX(32, max, false, false, false)
1870 MINMAX(32, maxnum, false, true, false)
1871 MINMAX(32, maxnummag, false, true, true)
1873 MINMAX(64, min, true, false, false)
1874 MINMAX(64, minnum, true, true, false)
1875 MINMAX(64, minnummag, true, true, true)
1876 MINMAX(64, max, false, false, false)
1877 MINMAX(64, maxnum, false, true, false)
1878 MINMAX(64, maxnummag, false, true, true)
1880 #undef MINMAX
1882 /* Floating point compare */
1883 static int compare_floats(FloatParts a, FloatParts b, bool is_quiet,
1884 float_status *s)
1886 if (is_nan(a.cls) || is_nan(b.cls)) {
1887 if (!is_quiet ||
1888 a.cls == float_class_snan ||
1889 b.cls == float_class_snan) {
1890 s->float_exception_flags |= float_flag_invalid;
1892 return float_relation_unordered;
1895 if (a.cls == float_class_zero) {
1896 if (b.cls == float_class_zero) {
1897 return float_relation_equal;
1899 return b.sign ? float_relation_greater : float_relation_less;
1900 } else if (b.cls == float_class_zero) {
1901 return a.sign ? float_relation_less : float_relation_greater;
1904 /* The only really important thing about infinity is its sign. If
1905 * both are infinities the sign marks the smallest of the two.
1907 if (a.cls == float_class_inf) {
1908 if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
1909 return float_relation_equal;
1911 return a.sign ? float_relation_less : float_relation_greater;
1912 } else if (b.cls == float_class_inf) {
1913 return b.sign ? float_relation_greater : float_relation_less;
1916 if (a.sign != b.sign) {
1917 return a.sign ? float_relation_less : float_relation_greater;
1920 if (a.exp == b.exp) {
1921 if (a.frac == b.frac) {
1922 return float_relation_equal;
1924 if (a.sign) {
1925 return a.frac > b.frac ?
1926 float_relation_less : float_relation_greater;
1927 } else {
1928 return a.frac > b.frac ?
1929 float_relation_greater : float_relation_less;
1931 } else {
1932 if (a.sign) {
1933 return a.exp > b.exp ? float_relation_less : float_relation_greater;
1934 } else {
1935 return a.exp > b.exp ? float_relation_greater : float_relation_less;
1940 #define COMPARE(sz) \
1941 int float ## sz ## _compare(float ## sz a, float ## sz b, \
1942 float_status *s) \
1944 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
1945 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
1946 return compare_floats(pa, pb, false, s); \
1948 int float ## sz ## _compare_quiet(float ## sz a, float ## sz b, \
1949 float_status *s) \
1951 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
1952 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
1953 return compare_floats(pa, pb, true, s); \
1956 COMPARE(16)
1957 COMPARE(32)
1958 COMPARE(64)
1960 #undef COMPARE
1962 /* Multiply A by 2 raised to the power N. */
1963 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s)
1965 if (unlikely(is_nan(a.cls))) {
1966 return return_nan(a, s);
1968 if (a.cls == float_class_normal) {
1969 /* The largest float type (even though not supported by FloatParts)
1970 * is float128, which has a 15 bit exponent. Bounding N to 16 bits
1971 * still allows rounding to infinity, without allowing overflow
1972 * within the int32_t that backs FloatParts.exp.
1974 n = MIN(MAX(n, -0x10000), 0x10000);
1975 a.exp += n;
1977 return a;
1980 float16 float16_scalbn(float16 a, int n, float_status *status)
1982 FloatParts pa = float16_unpack_canonical(a, status);
1983 FloatParts pr = scalbn_decomposed(pa, n, status);
1984 return float16_round_pack_canonical(pr, status);
1987 float32 float32_scalbn(float32 a, int n, float_status *status)
1989 FloatParts pa = float32_unpack_canonical(a, status);
1990 FloatParts pr = scalbn_decomposed(pa, n, status);
1991 return float32_round_pack_canonical(pr, status);
1994 float64 float64_scalbn(float64 a, int n, float_status *status)
1996 FloatParts pa = float64_unpack_canonical(a, status);
1997 FloatParts pr = scalbn_decomposed(pa, n, status);
1998 return float64_round_pack_canonical(pr, status);
2002 * Square Root
2004 * The old softfloat code did an approximation step before zeroing in
2005 * on the final result. However for simpleness we just compute the
2006 * square root by iterating down from the implicit bit to enough extra
2007 * bits to ensure we get a correctly rounded result.
2009 * This does mean however the calculation is slower than before,
2010 * especially for 64 bit floats.
2013 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p)
2015 uint64_t a_frac, r_frac, s_frac;
2016 int bit, last_bit;
2018 if (is_nan(a.cls)) {
2019 return return_nan(a, s);
2021 if (a.cls == float_class_zero) {
2022 return a; /* sqrt(+-0) = +-0 */
2024 if (a.sign) {
2025 s->float_exception_flags |= float_flag_invalid;
2026 return parts_default_nan(s);
2028 if (a.cls == float_class_inf) {
2029 return a; /* sqrt(+inf) = +inf */
2032 assert(a.cls == float_class_normal);
2034 /* We need two overflow bits at the top. Adding room for that is a
2035 * right shift. If the exponent is odd, we can discard the low bit
2036 * by multiplying the fraction by 2; that's a left shift. Combine
2037 * those and we shift right if the exponent is even.
2039 a_frac = a.frac;
2040 if (!(a.exp & 1)) {
2041 a_frac >>= 1;
2043 a.exp >>= 1;
2045 /* Bit-by-bit computation of sqrt. */
2046 r_frac = 0;
2047 s_frac = 0;
2049 /* Iterate from implicit bit down to the 3 extra bits to compute a
2050 * properly rounded result. Remember we've inserted one more bit
2051 * at the top, so these positions are one less.
2053 bit = DECOMPOSED_BINARY_POINT - 1;
2054 last_bit = MAX(p->frac_shift - 4, 0);
2055 do {
2056 uint64_t q = 1ULL << bit;
2057 uint64_t t_frac = s_frac + q;
2058 if (t_frac <= a_frac) {
2059 s_frac = t_frac + q;
2060 a_frac -= t_frac;
2061 r_frac += q;
2063 a_frac <<= 1;
2064 } while (--bit >= last_bit);
2066 /* Undo the right shift done above. If there is any remaining
2067 * fraction, the result is inexact. Set the sticky bit.
2069 a.frac = (r_frac << 1) + (a_frac != 0);
2071 return a;
2074 float16 __attribute__((flatten)) float16_sqrt(float16 a, float_status *status)
2076 FloatParts pa = float16_unpack_canonical(a, status);
2077 FloatParts pr = sqrt_float(pa, status, &float16_params);
2078 return float16_round_pack_canonical(pr, status);
2081 float32 __attribute__((flatten)) float32_sqrt(float32 a, float_status *status)
2083 FloatParts pa = float32_unpack_canonical(a, status);
2084 FloatParts pr = sqrt_float(pa, status, &float32_params);
2085 return float32_round_pack_canonical(pr, status);
2088 float64 __attribute__((flatten)) float64_sqrt(float64 a, float_status *status)
2090 FloatParts pa = float64_unpack_canonical(a, status);
2091 FloatParts pr = sqrt_float(pa, status, &float64_params);
2092 return float64_round_pack_canonical(pr, status);
2095 /*----------------------------------------------------------------------------
2096 | The pattern for a default generated NaN.
2097 *----------------------------------------------------------------------------*/
2099 float16 float16_default_nan(float_status *status)
2101 FloatParts p = parts_default_nan(status);
2102 p.frac >>= float16_params.frac_shift;
2103 return float16_pack_raw(p);
2106 float32 float32_default_nan(float_status *status)
2108 FloatParts p = parts_default_nan(status);
2109 p.frac >>= float32_params.frac_shift;
2110 return float32_pack_raw(p);
2113 float64 float64_default_nan(float_status *status)
2115 FloatParts p = parts_default_nan(status);
2116 p.frac >>= float64_params.frac_shift;
2117 return float64_pack_raw(p);
2120 float128 float128_default_nan(float_status *status)
2122 FloatParts p = parts_default_nan(status);
2123 float128 r;
2125 /* Extrapolate from the choices made by parts_default_nan to fill
2126 * in the quad-floating format. If the low bit is set, assume we
2127 * want to set all non-snan bits.
2129 r.low = -(p.frac & 1);
2130 r.high = p.frac >> (DECOMPOSED_BINARY_POINT - 48);
2131 r.high |= LIT64(0x7FFF000000000000);
2132 r.high |= (uint64_t)p.sign << 63;
2134 return r;
2137 /*----------------------------------------------------------------------------
2138 | Returns a quiet NaN from a signalling NaN for the floating point value `a'.
2139 *----------------------------------------------------------------------------*/
2141 float16 float16_silence_nan(float16 a, float_status *status)
2143 FloatParts p = float16_unpack_raw(a);
2144 p.frac <<= float16_params.frac_shift;
2145 p = parts_silence_nan(p, status);
2146 p.frac >>= float16_params.frac_shift;
2147 return float16_pack_raw(p);
2150 float32 float32_silence_nan(float32 a, float_status *status)
2152 FloatParts p = float32_unpack_raw(a);
2153 p.frac <<= float32_params.frac_shift;
2154 p = parts_silence_nan(p, status);
2155 p.frac >>= float32_params.frac_shift;
2156 return float32_pack_raw(p);
2159 float64 float64_silence_nan(float64 a, float_status *status)
2161 FloatParts p = float64_unpack_raw(a);
2162 p.frac <<= float64_params.frac_shift;
2163 p = parts_silence_nan(p, status);
2164 p.frac >>= float64_params.frac_shift;
2165 return float64_pack_raw(p);
2168 /*----------------------------------------------------------------------------
2169 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
2170 | and 7, and returns the properly rounded 32-bit integer corresponding to the
2171 | input. If `zSign' is 1, the input is negated before being converted to an
2172 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
2173 | is simply rounded to an integer, with the inexact exception raised if the
2174 | input cannot be represented exactly as an integer. However, if the fixed-
2175 | point input is too large, the invalid exception is raised and the largest
2176 | positive or negative integer is returned.
2177 *----------------------------------------------------------------------------*/
2179 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
2181 int8_t roundingMode;
2182 flag roundNearestEven;
2183 int8_t roundIncrement, roundBits;
2184 int32_t z;
2186 roundingMode = status->float_rounding_mode;
2187 roundNearestEven = ( roundingMode == float_round_nearest_even );
2188 switch (roundingMode) {
2189 case float_round_nearest_even:
2190 case float_round_ties_away:
2191 roundIncrement = 0x40;
2192 break;
2193 case float_round_to_zero:
2194 roundIncrement = 0;
2195 break;
2196 case float_round_up:
2197 roundIncrement = zSign ? 0 : 0x7f;
2198 break;
2199 case float_round_down:
2200 roundIncrement = zSign ? 0x7f : 0;
2201 break;
2202 default:
2203 abort();
2205 roundBits = absZ & 0x7F;
2206 absZ = ( absZ + roundIncrement )>>7;
2207 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2208 z = absZ;
2209 if ( zSign ) z = - z;
2210 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
2211 float_raise(float_flag_invalid, status);
2212 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2214 if (roundBits) {
2215 status->float_exception_flags |= float_flag_inexact;
2217 return z;
2221 /*----------------------------------------------------------------------------
2222 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2223 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2224 | and returns the properly rounded 64-bit integer corresponding to the input.
2225 | If `zSign' is 1, the input is negated before being converted to an integer.
2226 | Ordinarily, the fixed-point input is simply rounded to an integer, with
2227 | the inexact exception raised if the input cannot be represented exactly as
2228 | an integer. However, if the fixed-point input is too large, the invalid
2229 | exception is raised and the largest positive or negative integer is
2230 | returned.
2231 *----------------------------------------------------------------------------*/
2233 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
2234 float_status *status)
2236 int8_t roundingMode;
2237 flag roundNearestEven, increment;
2238 int64_t z;
2240 roundingMode = status->float_rounding_mode;
2241 roundNearestEven = ( roundingMode == float_round_nearest_even );
2242 switch (roundingMode) {
2243 case float_round_nearest_even:
2244 case float_round_ties_away:
2245 increment = ((int64_t) absZ1 < 0);
2246 break;
2247 case float_round_to_zero:
2248 increment = 0;
2249 break;
2250 case float_round_up:
2251 increment = !zSign && absZ1;
2252 break;
2253 case float_round_down:
2254 increment = zSign && absZ1;
2255 break;
2256 default:
2257 abort();
2259 if ( increment ) {
2260 ++absZ0;
2261 if ( absZ0 == 0 ) goto overflow;
2262 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
2264 z = absZ0;
2265 if ( zSign ) z = - z;
2266 if ( z && ( ( z < 0 ) ^ zSign ) ) {
2267 overflow:
2268 float_raise(float_flag_invalid, status);
2269 return
2270 zSign ? (int64_t) LIT64( 0x8000000000000000 )
2271 : LIT64( 0x7FFFFFFFFFFFFFFF );
2273 if (absZ1) {
2274 status->float_exception_flags |= float_flag_inexact;
2276 return z;
2280 /*----------------------------------------------------------------------------
2281 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2282 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2283 | and returns the properly rounded 64-bit unsigned integer corresponding to the
2284 | input. Ordinarily, the fixed-point input is simply rounded to an integer,
2285 | with the inexact exception raised if the input cannot be represented exactly
2286 | as an integer. However, if the fixed-point input is too large, the invalid
2287 | exception is raised and the largest unsigned integer is returned.
2288 *----------------------------------------------------------------------------*/
2290 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
2291 uint64_t absZ1, float_status *status)
2293 int8_t roundingMode;
2294 flag roundNearestEven, increment;
2296 roundingMode = status->float_rounding_mode;
2297 roundNearestEven = (roundingMode == float_round_nearest_even);
2298 switch (roundingMode) {
2299 case float_round_nearest_even:
2300 case float_round_ties_away:
2301 increment = ((int64_t)absZ1 < 0);
2302 break;
2303 case float_round_to_zero:
2304 increment = 0;
2305 break;
2306 case float_round_up:
2307 increment = !zSign && absZ1;
2308 break;
2309 case float_round_down:
2310 increment = zSign && absZ1;
2311 break;
2312 default:
2313 abort();
2315 if (increment) {
2316 ++absZ0;
2317 if (absZ0 == 0) {
2318 float_raise(float_flag_invalid, status);
2319 return LIT64(0xFFFFFFFFFFFFFFFF);
2321 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
2324 if (zSign && absZ0) {
2325 float_raise(float_flag_invalid, status);
2326 return 0;
2329 if (absZ1) {
2330 status->float_exception_flags |= float_flag_inexact;
2332 return absZ0;
2335 /*----------------------------------------------------------------------------
2336 | If `a' is denormal and we are in flush-to-zero mode then set the
2337 | input-denormal exception and return zero. Otherwise just return the value.
2338 *----------------------------------------------------------------------------*/
2339 float32 float32_squash_input_denormal(float32 a, float_status *status)
2341 if (status->flush_inputs_to_zero) {
2342 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
2343 float_raise(float_flag_input_denormal, status);
2344 return make_float32(float32_val(a) & 0x80000000);
2347 return a;
2350 /*----------------------------------------------------------------------------
2351 | Normalizes the subnormal single-precision floating-point value represented
2352 | by the denormalized significand `aSig'. The normalized exponent and
2353 | significand are stored at the locations pointed to by `zExpPtr' and
2354 | `zSigPtr', respectively.
2355 *----------------------------------------------------------------------------*/
2357 static void
2358 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
2360 int8_t shiftCount;
2362 shiftCount = countLeadingZeros32( aSig ) - 8;
2363 *zSigPtr = aSig<<shiftCount;
2364 *zExpPtr = 1 - shiftCount;
2368 /*----------------------------------------------------------------------------
2369 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2370 | and significand `zSig', and returns the proper single-precision floating-
2371 | point value corresponding to the abstract input. Ordinarily, the abstract
2372 | value is simply rounded and packed into the single-precision format, with
2373 | the inexact exception raised if the abstract input cannot be represented
2374 | exactly. However, if the abstract value is too large, the overflow and
2375 | inexact exceptions are raised and an infinity or maximal finite value is
2376 | returned. If the abstract value is too small, the input value is rounded to
2377 | a subnormal number, and the underflow and inexact exceptions are raised if
2378 | the abstract input cannot be represented exactly as a subnormal single-
2379 | precision floating-point number.
2380 | The input significand `zSig' has its binary point between bits 30
2381 | and 29, which is 7 bits to the left of the usual location. This shifted
2382 | significand must be normalized or smaller. If `zSig' is not normalized,
2383 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2384 | and it must not require rounding. In the usual case that `zSig' is
2385 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2386 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2387 | Binary Floating-Point Arithmetic.
2388 *----------------------------------------------------------------------------*/
2390 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
2391 float_status *status)
2393 int8_t roundingMode;
2394 flag roundNearestEven;
2395 int8_t roundIncrement, roundBits;
2396 flag isTiny;
2398 roundingMode = status->float_rounding_mode;
2399 roundNearestEven = ( roundingMode == float_round_nearest_even );
2400 switch (roundingMode) {
2401 case float_round_nearest_even:
2402 case float_round_ties_away:
2403 roundIncrement = 0x40;
2404 break;
2405 case float_round_to_zero:
2406 roundIncrement = 0;
2407 break;
2408 case float_round_up:
2409 roundIncrement = zSign ? 0 : 0x7f;
2410 break;
2411 case float_round_down:
2412 roundIncrement = zSign ? 0x7f : 0;
2413 break;
2414 default:
2415 abort();
2416 break;
2418 roundBits = zSig & 0x7F;
2419 if ( 0xFD <= (uint16_t) zExp ) {
2420 if ( ( 0xFD < zExp )
2421 || ( ( zExp == 0xFD )
2422 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
2424 float_raise(float_flag_overflow | float_flag_inexact, status);
2425 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
2427 if ( zExp < 0 ) {
2428 if (status->flush_to_zero) {
2429 float_raise(float_flag_output_denormal, status);
2430 return packFloat32(zSign, 0, 0);
2432 isTiny =
2433 (status->float_detect_tininess
2434 == float_tininess_before_rounding)
2435 || ( zExp < -1 )
2436 || ( zSig + roundIncrement < 0x80000000 );
2437 shift32RightJamming( zSig, - zExp, &zSig );
2438 zExp = 0;
2439 roundBits = zSig & 0x7F;
2440 if (isTiny && roundBits) {
2441 float_raise(float_flag_underflow, status);
2445 if (roundBits) {
2446 status->float_exception_flags |= float_flag_inexact;
2448 zSig = ( zSig + roundIncrement )>>7;
2449 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2450 if ( zSig == 0 ) zExp = 0;
2451 return packFloat32( zSign, zExp, zSig );
2455 /*----------------------------------------------------------------------------
2456 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2457 | and significand `zSig', and returns the proper single-precision floating-
2458 | point value corresponding to the abstract input. This routine is just like
2459 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
2460 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2461 | floating-point exponent.
2462 *----------------------------------------------------------------------------*/
2464 static float32
2465 normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
2466 float_status *status)
2468 int8_t shiftCount;
2470 shiftCount = countLeadingZeros32( zSig ) - 1;
2471 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
2472 status);
2476 /*----------------------------------------------------------------------------
2477 | If `a' is denormal and we are in flush-to-zero mode then set the
2478 | input-denormal exception and return zero. Otherwise just return the value.
2479 *----------------------------------------------------------------------------*/
2480 float64 float64_squash_input_denormal(float64 a, float_status *status)
2482 if (status->flush_inputs_to_zero) {
2483 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
2484 float_raise(float_flag_input_denormal, status);
2485 return make_float64(float64_val(a) & (1ULL << 63));
2488 return a;
2491 /*----------------------------------------------------------------------------
2492 | Normalizes the subnormal double-precision floating-point value represented
2493 | by the denormalized significand `aSig'. The normalized exponent and
2494 | significand are stored at the locations pointed to by `zExpPtr' and
2495 | `zSigPtr', respectively.
2496 *----------------------------------------------------------------------------*/
2498 static void
2499 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
2501 int8_t shiftCount;
2503 shiftCount = countLeadingZeros64( aSig ) - 11;
2504 *zSigPtr = aSig<<shiftCount;
2505 *zExpPtr = 1 - shiftCount;
2509 /*----------------------------------------------------------------------------
2510 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2511 | double-precision floating-point value, returning the result. After being
2512 | shifted into the proper positions, the three fields are simply added
2513 | together to form the result. This means that any integer portion of `zSig'
2514 | will be added into the exponent. Since a properly normalized significand
2515 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2516 | than the desired result exponent whenever `zSig' is a complete, normalized
2517 | significand.
2518 *----------------------------------------------------------------------------*/
2520 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
2523 return make_float64(
2524 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
2528 /*----------------------------------------------------------------------------
2529 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2530 | and significand `zSig', and returns the proper double-precision floating-
2531 | point value corresponding to the abstract input. Ordinarily, the abstract
2532 | value is simply rounded and packed into the double-precision format, with
2533 | the inexact exception raised if the abstract input cannot be represented
2534 | exactly. However, if the abstract value is too large, the overflow and
2535 | inexact exceptions are raised and an infinity or maximal finite value is
2536 | returned. If the abstract value is too small, the input value is rounded to
2537 | a subnormal number, and the underflow and inexact exceptions are raised if
2538 | the abstract input cannot be represented exactly as a subnormal double-
2539 | precision floating-point number.
2540 | The input significand `zSig' has its binary point between bits 62
2541 | and 61, which is 10 bits to the left of the usual location. This shifted
2542 | significand must be normalized or smaller. If `zSig' is not normalized,
2543 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2544 | and it must not require rounding. In the usual case that `zSig' is
2545 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2546 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2547 | Binary Floating-Point Arithmetic.
2548 *----------------------------------------------------------------------------*/
2550 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2551 float_status *status)
2553 int8_t roundingMode;
2554 flag roundNearestEven;
2555 int roundIncrement, roundBits;
2556 flag isTiny;
2558 roundingMode = status->float_rounding_mode;
2559 roundNearestEven = ( roundingMode == float_round_nearest_even );
2560 switch (roundingMode) {
2561 case float_round_nearest_even:
2562 case float_round_ties_away:
2563 roundIncrement = 0x200;
2564 break;
2565 case float_round_to_zero:
2566 roundIncrement = 0;
2567 break;
2568 case float_round_up:
2569 roundIncrement = zSign ? 0 : 0x3ff;
2570 break;
2571 case float_round_down:
2572 roundIncrement = zSign ? 0x3ff : 0;
2573 break;
2574 case float_round_to_odd:
2575 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2576 break;
2577 default:
2578 abort();
2580 roundBits = zSig & 0x3FF;
2581 if ( 0x7FD <= (uint16_t) zExp ) {
2582 if ( ( 0x7FD < zExp )
2583 || ( ( zExp == 0x7FD )
2584 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
2586 bool overflow_to_inf = roundingMode != float_round_to_odd &&
2587 roundIncrement != 0;
2588 float_raise(float_flag_overflow | float_flag_inexact, status);
2589 return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
2591 if ( zExp < 0 ) {
2592 if (status->flush_to_zero) {
2593 float_raise(float_flag_output_denormal, status);
2594 return packFloat64(zSign, 0, 0);
2596 isTiny =
2597 (status->float_detect_tininess
2598 == float_tininess_before_rounding)
2599 || ( zExp < -1 )
2600 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
2601 shift64RightJamming( zSig, - zExp, &zSig );
2602 zExp = 0;
2603 roundBits = zSig & 0x3FF;
2604 if (isTiny && roundBits) {
2605 float_raise(float_flag_underflow, status);
2607 if (roundingMode == float_round_to_odd) {
2609 * For round-to-odd case, the roundIncrement depends on
2610 * zSig which just changed.
2612 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2616 if (roundBits) {
2617 status->float_exception_flags |= float_flag_inexact;
2619 zSig = ( zSig + roundIncrement )>>10;
2620 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
2621 if ( zSig == 0 ) zExp = 0;
2622 return packFloat64( zSign, zExp, zSig );
2626 /*----------------------------------------------------------------------------
2627 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2628 | and significand `zSig', and returns the proper double-precision floating-
2629 | point value corresponding to the abstract input. This routine is just like
2630 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
2631 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2632 | floating-point exponent.
2633 *----------------------------------------------------------------------------*/
2635 static float64
2636 normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2637 float_status *status)
2639 int8_t shiftCount;
2641 shiftCount = countLeadingZeros64( zSig ) - 1;
2642 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
2643 status);
2647 /*----------------------------------------------------------------------------
2648 | Normalizes the subnormal extended double-precision floating-point value
2649 | represented by the denormalized significand `aSig'. The normalized exponent
2650 | and significand are stored at the locations pointed to by `zExpPtr' and
2651 | `zSigPtr', respectively.
2652 *----------------------------------------------------------------------------*/
2654 void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
2655 uint64_t *zSigPtr)
2657 int8_t shiftCount;
2659 shiftCount = countLeadingZeros64( aSig );
2660 *zSigPtr = aSig<<shiftCount;
2661 *zExpPtr = 1 - shiftCount;
2664 /*----------------------------------------------------------------------------
2665 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2666 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
2667 | and returns the proper extended double-precision floating-point value
2668 | corresponding to the abstract input. Ordinarily, the abstract value is
2669 | rounded and packed into the extended double-precision format, with the
2670 | inexact exception raised if the abstract input cannot be represented
2671 | exactly. However, if the abstract value is too large, the overflow and
2672 | inexact exceptions are raised and an infinity or maximal finite value is
2673 | returned. If the abstract value is too small, the input value is rounded to
2674 | a subnormal number, and the underflow and inexact exceptions are raised if
2675 | the abstract input cannot be represented exactly as a subnormal extended
2676 | double-precision floating-point number.
2677 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
2678 | number of bits as single or double precision, respectively. Otherwise, the
2679 | result is rounded to the full precision of the extended double-precision
2680 | format.
2681 | The input significand must be normalized or smaller. If the input
2682 | significand is not normalized, `zExp' must be 0; in that case, the result
2683 | returned is a subnormal number, and it must not require rounding. The
2684 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
2685 | Floating-Point Arithmetic.
2686 *----------------------------------------------------------------------------*/
2688 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
2689 int32_t zExp, uint64_t zSig0, uint64_t zSig1,
2690 float_status *status)
2692 int8_t roundingMode;
2693 flag roundNearestEven, increment, isTiny;
2694 int64_t roundIncrement, roundMask, roundBits;
2696 roundingMode = status->float_rounding_mode;
2697 roundNearestEven = ( roundingMode == float_round_nearest_even );
2698 if ( roundingPrecision == 80 ) goto precision80;
2699 if ( roundingPrecision == 64 ) {
2700 roundIncrement = LIT64( 0x0000000000000400 );
2701 roundMask = LIT64( 0x00000000000007FF );
2703 else if ( roundingPrecision == 32 ) {
2704 roundIncrement = LIT64( 0x0000008000000000 );
2705 roundMask = LIT64( 0x000000FFFFFFFFFF );
2707 else {
2708 goto precision80;
2710 zSig0 |= ( zSig1 != 0 );
2711 switch (roundingMode) {
2712 case float_round_nearest_even:
2713 case float_round_ties_away:
2714 break;
2715 case float_round_to_zero:
2716 roundIncrement = 0;
2717 break;
2718 case float_round_up:
2719 roundIncrement = zSign ? 0 : roundMask;
2720 break;
2721 case float_round_down:
2722 roundIncrement = zSign ? roundMask : 0;
2723 break;
2724 default:
2725 abort();
2727 roundBits = zSig0 & roundMask;
2728 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
2729 if ( ( 0x7FFE < zExp )
2730 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
2732 goto overflow;
2734 if ( zExp <= 0 ) {
2735 if (status->flush_to_zero) {
2736 float_raise(float_flag_output_denormal, status);
2737 return packFloatx80(zSign, 0, 0);
2739 isTiny =
2740 (status->float_detect_tininess
2741 == float_tininess_before_rounding)
2742 || ( zExp < 0 )
2743 || ( zSig0 <= zSig0 + roundIncrement );
2744 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
2745 zExp = 0;
2746 roundBits = zSig0 & roundMask;
2747 if (isTiny && roundBits) {
2748 float_raise(float_flag_underflow, status);
2750 if (roundBits) {
2751 status->float_exception_flags |= float_flag_inexact;
2753 zSig0 += roundIncrement;
2754 if ( (int64_t) zSig0 < 0 ) zExp = 1;
2755 roundIncrement = roundMask + 1;
2756 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
2757 roundMask |= roundIncrement;
2759 zSig0 &= ~ roundMask;
2760 return packFloatx80( zSign, zExp, zSig0 );
2763 if (roundBits) {
2764 status->float_exception_flags |= float_flag_inexact;
2766 zSig0 += roundIncrement;
2767 if ( zSig0 < roundIncrement ) {
2768 ++zExp;
2769 zSig0 = LIT64( 0x8000000000000000 );
2771 roundIncrement = roundMask + 1;
2772 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
2773 roundMask |= roundIncrement;
2775 zSig0 &= ~ roundMask;
2776 if ( zSig0 == 0 ) zExp = 0;
2777 return packFloatx80( zSign, zExp, zSig0 );
2778 precision80:
2779 switch (roundingMode) {
2780 case float_round_nearest_even:
2781 case float_round_ties_away:
2782 increment = ((int64_t)zSig1 < 0);
2783 break;
2784 case float_round_to_zero:
2785 increment = 0;
2786 break;
2787 case float_round_up:
2788 increment = !zSign && zSig1;
2789 break;
2790 case float_round_down:
2791 increment = zSign && zSig1;
2792 break;
2793 default:
2794 abort();
2796 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
2797 if ( ( 0x7FFE < zExp )
2798 || ( ( zExp == 0x7FFE )
2799 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
2800 && increment
2803 roundMask = 0;
2804 overflow:
2805 float_raise(float_flag_overflow | float_flag_inexact, status);
2806 if ( ( roundingMode == float_round_to_zero )
2807 || ( zSign && ( roundingMode == float_round_up ) )
2808 || ( ! zSign && ( roundingMode == float_round_down ) )
2810 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
2812 return packFloatx80(zSign,
2813 floatx80_infinity_high,
2814 floatx80_infinity_low);
2816 if ( zExp <= 0 ) {
2817 isTiny =
2818 (status->float_detect_tininess
2819 == float_tininess_before_rounding)
2820 || ( zExp < 0 )
2821 || ! increment
2822 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
2823 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
2824 zExp = 0;
2825 if (isTiny && zSig1) {
2826 float_raise(float_flag_underflow, status);
2828 if (zSig1) {
2829 status->float_exception_flags |= float_flag_inexact;
2831 switch (roundingMode) {
2832 case float_round_nearest_even:
2833 case float_round_ties_away:
2834 increment = ((int64_t)zSig1 < 0);
2835 break;
2836 case float_round_to_zero:
2837 increment = 0;
2838 break;
2839 case float_round_up:
2840 increment = !zSign && zSig1;
2841 break;
2842 case float_round_down:
2843 increment = zSign && zSig1;
2844 break;
2845 default:
2846 abort();
2848 if ( increment ) {
2849 ++zSig0;
2850 zSig0 &=
2851 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
2852 if ( (int64_t) zSig0 < 0 ) zExp = 1;
2854 return packFloatx80( zSign, zExp, zSig0 );
2857 if (zSig1) {
2858 status->float_exception_flags |= float_flag_inexact;
2860 if ( increment ) {
2861 ++zSig0;
2862 if ( zSig0 == 0 ) {
2863 ++zExp;
2864 zSig0 = LIT64( 0x8000000000000000 );
2866 else {
2867 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
2870 else {
2871 if ( zSig0 == 0 ) zExp = 0;
2873 return packFloatx80( zSign, zExp, zSig0 );
2877 /*----------------------------------------------------------------------------
2878 | Takes an abstract floating-point value having sign `zSign', exponent
2879 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
2880 | and returns the proper extended double-precision floating-point value
2881 | corresponding to the abstract input. This routine is just like
2882 | `roundAndPackFloatx80' except that the input significand does not have to be
2883 | normalized.
2884 *----------------------------------------------------------------------------*/
2886 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
2887 flag zSign, int32_t zExp,
2888 uint64_t zSig0, uint64_t zSig1,
2889 float_status *status)
2891 int8_t shiftCount;
2893 if ( zSig0 == 0 ) {
2894 zSig0 = zSig1;
2895 zSig1 = 0;
2896 zExp -= 64;
2898 shiftCount = countLeadingZeros64( zSig0 );
2899 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
2900 zExp -= shiftCount;
2901 return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
2902 zSig0, zSig1, status);
2906 /*----------------------------------------------------------------------------
2907 | Returns the least-significant 64 fraction bits of the quadruple-precision
2908 | floating-point value `a'.
2909 *----------------------------------------------------------------------------*/
2911 static inline uint64_t extractFloat128Frac1( float128 a )
2914 return a.low;
2918 /*----------------------------------------------------------------------------
2919 | Returns the most-significant 48 fraction bits of the quadruple-precision
2920 | floating-point value `a'.
2921 *----------------------------------------------------------------------------*/
2923 static inline uint64_t extractFloat128Frac0( float128 a )
2926 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
2930 /*----------------------------------------------------------------------------
2931 | Returns the exponent bits of the quadruple-precision floating-point value
2932 | `a'.
2933 *----------------------------------------------------------------------------*/
2935 static inline int32_t extractFloat128Exp( float128 a )
2938 return ( a.high>>48 ) & 0x7FFF;
2942 /*----------------------------------------------------------------------------
2943 | Returns the sign bit of the quadruple-precision floating-point value `a'.
2944 *----------------------------------------------------------------------------*/
2946 static inline flag extractFloat128Sign( float128 a )
2949 return a.high>>63;
2953 /*----------------------------------------------------------------------------
2954 | Normalizes the subnormal quadruple-precision floating-point value
2955 | represented by the denormalized significand formed by the concatenation of
2956 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
2957 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
2958 | significand are stored at the location pointed to by `zSig0Ptr', and the
2959 | least significant 64 bits of the normalized significand are stored at the
2960 | location pointed to by `zSig1Ptr'.
2961 *----------------------------------------------------------------------------*/
2963 static void
2964 normalizeFloat128Subnormal(
2965 uint64_t aSig0,
2966 uint64_t aSig1,
2967 int32_t *zExpPtr,
2968 uint64_t *zSig0Ptr,
2969 uint64_t *zSig1Ptr
2972 int8_t shiftCount;
2974 if ( aSig0 == 0 ) {
2975 shiftCount = countLeadingZeros64( aSig1 ) - 15;
2976 if ( shiftCount < 0 ) {
2977 *zSig0Ptr = aSig1>>( - shiftCount );
2978 *zSig1Ptr = aSig1<<( shiftCount & 63 );
2980 else {
2981 *zSig0Ptr = aSig1<<shiftCount;
2982 *zSig1Ptr = 0;
2984 *zExpPtr = - shiftCount - 63;
2986 else {
2987 shiftCount = countLeadingZeros64( aSig0 ) - 15;
2988 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
2989 *zExpPtr = 1 - shiftCount;
2994 /*----------------------------------------------------------------------------
2995 | Packs the sign `zSign', the exponent `zExp', and the significand formed
2996 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
2997 | floating-point value, returning the result. After being shifted into the
2998 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
2999 | added together to form the most significant 32 bits of the result. This
3000 | means that any integer portion of `zSig0' will be added into the exponent.
3001 | Since a properly normalized significand will have an integer portion equal
3002 | to 1, the `zExp' input should be 1 less than the desired result exponent
3003 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
3004 | significand.
3005 *----------------------------------------------------------------------------*/
3007 static inline float128
3008 packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
3010 float128 z;
3012 z.low = zSig1;
3013 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
3014 return z;
3018 /*----------------------------------------------------------------------------
3019 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3020 | and extended significand formed by the concatenation of `zSig0', `zSig1',
3021 | and `zSig2', and returns the proper quadruple-precision floating-point value
3022 | corresponding to the abstract input. Ordinarily, the abstract value is
3023 | simply rounded and packed into the quadruple-precision format, with the
3024 | inexact exception raised if the abstract input cannot be represented
3025 | exactly. However, if the abstract value is too large, the overflow and
3026 | inexact exceptions are raised and an infinity or maximal finite value is
3027 | returned. If the abstract value is too small, the input value is rounded to
3028 | a subnormal number, and the underflow and inexact exceptions are raised if
3029 | the abstract input cannot be represented exactly as a subnormal quadruple-
3030 | precision floating-point number.
3031 | The input significand must be normalized or smaller. If the input
3032 | significand is not normalized, `zExp' must be 0; in that case, the result
3033 | returned is a subnormal number, and it must not require rounding. In the
3034 | usual case that the input significand is normalized, `zExp' must be 1 less
3035 | than the ``true'' floating-point exponent. The handling of underflow and
3036 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3037 *----------------------------------------------------------------------------*/
3039 static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
3040 uint64_t zSig0, uint64_t zSig1,
3041 uint64_t zSig2, float_status *status)
3043 int8_t roundingMode;
3044 flag roundNearestEven, increment, isTiny;
3046 roundingMode = status->float_rounding_mode;
3047 roundNearestEven = ( roundingMode == float_round_nearest_even );
3048 switch (roundingMode) {
3049 case float_round_nearest_even:
3050 case float_round_ties_away:
3051 increment = ((int64_t)zSig2 < 0);
3052 break;
3053 case float_round_to_zero:
3054 increment = 0;
3055 break;
3056 case float_round_up:
3057 increment = !zSign && zSig2;
3058 break;
3059 case float_round_down:
3060 increment = zSign && zSig2;
3061 break;
3062 case float_round_to_odd:
3063 increment = !(zSig1 & 0x1) && zSig2;
3064 break;
3065 default:
3066 abort();
3068 if ( 0x7FFD <= (uint32_t) zExp ) {
3069 if ( ( 0x7FFD < zExp )
3070 || ( ( zExp == 0x7FFD )
3071 && eq128(
3072 LIT64( 0x0001FFFFFFFFFFFF ),
3073 LIT64( 0xFFFFFFFFFFFFFFFF ),
3074 zSig0,
3075 zSig1
3077 && increment
3080 float_raise(float_flag_overflow | float_flag_inexact, status);
3081 if ( ( roundingMode == float_round_to_zero )
3082 || ( zSign && ( roundingMode == float_round_up ) )
3083 || ( ! zSign && ( roundingMode == float_round_down ) )
3084 || (roundingMode == float_round_to_odd)
3086 return
3087 packFloat128(
3088 zSign,
3089 0x7FFE,
3090 LIT64( 0x0000FFFFFFFFFFFF ),
3091 LIT64( 0xFFFFFFFFFFFFFFFF )
3094 return packFloat128( zSign, 0x7FFF, 0, 0 );
3096 if ( zExp < 0 ) {
3097 if (status->flush_to_zero) {
3098 float_raise(float_flag_output_denormal, status);
3099 return packFloat128(zSign, 0, 0, 0);
3101 isTiny =
3102 (status->float_detect_tininess
3103 == float_tininess_before_rounding)
3104 || ( zExp < -1 )
3105 || ! increment
3106 || lt128(
3107 zSig0,
3108 zSig1,
3109 LIT64( 0x0001FFFFFFFFFFFF ),
3110 LIT64( 0xFFFFFFFFFFFFFFFF )
3112 shift128ExtraRightJamming(
3113 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
3114 zExp = 0;
3115 if (isTiny && zSig2) {
3116 float_raise(float_flag_underflow, status);
3118 switch (roundingMode) {
3119 case float_round_nearest_even:
3120 case float_round_ties_away:
3121 increment = ((int64_t)zSig2 < 0);
3122 break;
3123 case float_round_to_zero:
3124 increment = 0;
3125 break;
3126 case float_round_up:
3127 increment = !zSign && zSig2;
3128 break;
3129 case float_round_down:
3130 increment = zSign && zSig2;
3131 break;
3132 case float_round_to_odd:
3133 increment = !(zSig1 & 0x1) && zSig2;
3134 break;
3135 default:
3136 abort();
3140 if (zSig2) {
3141 status->float_exception_flags |= float_flag_inexact;
3143 if ( increment ) {
3144 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
3145 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
3147 else {
3148 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
3150 return packFloat128( zSign, zExp, zSig0, zSig1 );
3154 /*----------------------------------------------------------------------------
3155 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3156 | and significand formed by the concatenation of `zSig0' and `zSig1', and
3157 | returns the proper quadruple-precision floating-point value corresponding
3158 | to the abstract input. This routine is just like `roundAndPackFloat128'
3159 | except that the input significand has fewer bits and does not have to be
3160 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
3161 | point exponent.
3162 *----------------------------------------------------------------------------*/
3164 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
3165 uint64_t zSig0, uint64_t zSig1,
3166 float_status *status)
3168 int8_t shiftCount;
3169 uint64_t zSig2;
3171 if ( zSig0 == 0 ) {
3172 zSig0 = zSig1;
3173 zSig1 = 0;
3174 zExp -= 64;
3176 shiftCount = countLeadingZeros64( zSig0 ) - 15;
3177 if ( 0 <= shiftCount ) {
3178 zSig2 = 0;
3179 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3181 else {
3182 shift128ExtraRightJamming(
3183 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
3185 zExp -= shiftCount;
3186 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
3191 /*----------------------------------------------------------------------------
3192 | Returns the result of converting the 32-bit two's complement integer `a'
3193 | to the extended double-precision floating-point format. The conversion
3194 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3195 | Arithmetic.
3196 *----------------------------------------------------------------------------*/
3198 floatx80 int32_to_floatx80(int32_t a, float_status *status)
3200 flag zSign;
3201 uint32_t absA;
3202 int8_t shiftCount;
3203 uint64_t zSig;
3205 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3206 zSign = ( a < 0 );
3207 absA = zSign ? - a : a;
3208 shiftCount = countLeadingZeros32( absA ) + 32;
3209 zSig = absA;
3210 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
3214 /*----------------------------------------------------------------------------
3215 | Returns the result of converting the 32-bit two's complement integer `a' to
3216 | the quadruple-precision floating-point format. The conversion is performed
3217 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3218 *----------------------------------------------------------------------------*/
3220 float128 int32_to_float128(int32_t a, float_status *status)
3222 flag zSign;
3223 uint32_t absA;
3224 int8_t shiftCount;
3225 uint64_t zSig0;
3227 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3228 zSign = ( a < 0 );
3229 absA = zSign ? - a : a;
3230 shiftCount = countLeadingZeros32( absA ) + 17;
3231 zSig0 = absA;
3232 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
3236 /*----------------------------------------------------------------------------
3237 | Returns the result of converting the 64-bit two's complement integer `a'
3238 | to the extended double-precision floating-point format. The conversion
3239 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3240 | Arithmetic.
3241 *----------------------------------------------------------------------------*/
3243 floatx80 int64_to_floatx80(int64_t a, float_status *status)
3245 flag zSign;
3246 uint64_t absA;
3247 int8_t shiftCount;
3249 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3250 zSign = ( a < 0 );
3251 absA = zSign ? - a : a;
3252 shiftCount = countLeadingZeros64( absA );
3253 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
3257 /*----------------------------------------------------------------------------
3258 | Returns the result of converting the 64-bit two's complement integer `a' to
3259 | the quadruple-precision floating-point format. The conversion is performed
3260 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3261 *----------------------------------------------------------------------------*/
3263 float128 int64_to_float128(int64_t a, float_status *status)
3265 flag zSign;
3266 uint64_t absA;
3267 int8_t shiftCount;
3268 int32_t zExp;
3269 uint64_t zSig0, zSig1;
3271 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3272 zSign = ( a < 0 );
3273 absA = zSign ? - a : a;
3274 shiftCount = countLeadingZeros64( absA ) + 49;
3275 zExp = 0x406E - shiftCount;
3276 if ( 64 <= shiftCount ) {
3277 zSig1 = 0;
3278 zSig0 = absA;
3279 shiftCount -= 64;
3281 else {
3282 zSig1 = absA;
3283 zSig0 = 0;
3285 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3286 return packFloat128( zSign, zExp, zSig0, zSig1 );
3290 /*----------------------------------------------------------------------------
3291 | Returns the result of converting the 64-bit unsigned integer `a'
3292 | to the quadruple-precision floating-point format. The conversion is performed
3293 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3294 *----------------------------------------------------------------------------*/
3296 float128 uint64_to_float128(uint64_t a, float_status *status)
3298 if (a == 0) {
3299 return float128_zero;
3301 return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status);
3304 /*----------------------------------------------------------------------------
3305 | Returns the result of converting the single-precision floating-point value
3306 | `a' to the extended double-precision floating-point format. The conversion
3307 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3308 | Arithmetic.
3309 *----------------------------------------------------------------------------*/
3311 floatx80 float32_to_floatx80(float32 a, float_status *status)
3313 flag aSign;
3314 int aExp;
3315 uint32_t aSig;
3317 a = float32_squash_input_denormal(a, status);
3318 aSig = extractFloat32Frac( a );
3319 aExp = extractFloat32Exp( a );
3320 aSign = extractFloat32Sign( a );
3321 if ( aExp == 0xFF ) {
3322 if (aSig) {
3323 return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
3325 return packFloatx80(aSign,
3326 floatx80_infinity_high,
3327 floatx80_infinity_low);
3329 if ( aExp == 0 ) {
3330 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3331 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3333 aSig |= 0x00800000;
3334 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
3338 /*----------------------------------------------------------------------------
3339 | Returns the result of converting the single-precision floating-point value
3340 | `a' to the double-precision floating-point format. The conversion is
3341 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3342 | Arithmetic.
3343 *----------------------------------------------------------------------------*/
3345 float128 float32_to_float128(float32 a, float_status *status)
3347 flag aSign;
3348 int aExp;
3349 uint32_t aSig;
3351 a = float32_squash_input_denormal(a, status);
3352 aSig = extractFloat32Frac( a );
3353 aExp = extractFloat32Exp( a );
3354 aSign = extractFloat32Sign( a );
3355 if ( aExp == 0xFF ) {
3356 if (aSig) {
3357 return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
3359 return packFloat128( aSign, 0x7FFF, 0, 0 );
3361 if ( aExp == 0 ) {
3362 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3363 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3364 --aExp;
3366 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
3370 /*----------------------------------------------------------------------------
3371 | Returns the remainder of the single-precision floating-point value `a'
3372 | with respect to the corresponding value `b'. The operation is performed
3373 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3374 *----------------------------------------------------------------------------*/
3376 float32 float32_rem(float32 a, float32 b, float_status *status)
3378 flag aSign, zSign;
3379 int aExp, bExp, expDiff;
3380 uint32_t aSig, bSig;
3381 uint32_t q;
3382 uint64_t aSig64, bSig64, q64;
3383 uint32_t alternateASig;
3384 int32_t sigMean;
3385 a = float32_squash_input_denormal(a, status);
3386 b = float32_squash_input_denormal(b, status);
3388 aSig = extractFloat32Frac( a );
3389 aExp = extractFloat32Exp( a );
3390 aSign = extractFloat32Sign( a );
3391 bSig = extractFloat32Frac( b );
3392 bExp = extractFloat32Exp( b );
3393 if ( aExp == 0xFF ) {
3394 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
3395 return propagateFloat32NaN(a, b, status);
3397 float_raise(float_flag_invalid, status);
3398 return float32_default_nan(status);
3400 if ( bExp == 0xFF ) {
3401 if (bSig) {
3402 return propagateFloat32NaN(a, b, status);
3404 return a;
3406 if ( bExp == 0 ) {
3407 if ( bSig == 0 ) {
3408 float_raise(float_flag_invalid, status);
3409 return float32_default_nan(status);
3411 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
3413 if ( aExp == 0 ) {
3414 if ( aSig == 0 ) return a;
3415 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3417 expDiff = aExp - bExp;
3418 aSig |= 0x00800000;
3419 bSig |= 0x00800000;
3420 if ( expDiff < 32 ) {
3421 aSig <<= 8;
3422 bSig <<= 8;
3423 if ( expDiff < 0 ) {
3424 if ( expDiff < -1 ) return a;
3425 aSig >>= 1;
3427 q = ( bSig <= aSig );
3428 if ( q ) aSig -= bSig;
3429 if ( 0 < expDiff ) {
3430 q = ( ( (uint64_t) aSig )<<32 ) / bSig;
3431 q >>= 32 - expDiff;
3432 bSig >>= 2;
3433 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3435 else {
3436 aSig >>= 2;
3437 bSig >>= 2;
3440 else {
3441 if ( bSig <= aSig ) aSig -= bSig;
3442 aSig64 = ( (uint64_t) aSig )<<40;
3443 bSig64 = ( (uint64_t) bSig )<<40;
3444 expDiff -= 64;
3445 while ( 0 < expDiff ) {
3446 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3447 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3448 aSig64 = - ( ( bSig * q64 )<<38 );
3449 expDiff -= 62;
3451 expDiff += 64;
3452 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3453 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3454 q = q64>>( 64 - expDiff );
3455 bSig <<= 6;
3456 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
3458 do {
3459 alternateASig = aSig;
3460 ++q;
3461 aSig -= bSig;
3462 } while ( 0 <= (int32_t) aSig );
3463 sigMean = aSig + alternateASig;
3464 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3465 aSig = alternateASig;
3467 zSign = ( (int32_t) aSig < 0 );
3468 if ( zSign ) aSig = - aSig;
3469 return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
3474 /*----------------------------------------------------------------------------
3475 | Returns the binary exponential of the single-precision floating-point value
3476 | `a'. The operation is performed according to the IEC/IEEE Standard for
3477 | Binary Floating-Point Arithmetic.
3479 | Uses the following identities:
3481 | 1. -------------------------------------------------------------------------
3482 | x x*ln(2)
3483 | 2 = e
3485 | 2. -------------------------------------------------------------------------
3486 | 2 3 4 5 n
3487 | x x x x x x x
3488 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
3489 | 1! 2! 3! 4! 5! n!
3490 *----------------------------------------------------------------------------*/
3492 static const float64 float32_exp2_coefficients[15] =
3494 const_float64( 0x3ff0000000000000ll ), /* 1 */
3495 const_float64( 0x3fe0000000000000ll ), /* 2 */
3496 const_float64( 0x3fc5555555555555ll ), /* 3 */
3497 const_float64( 0x3fa5555555555555ll ), /* 4 */
3498 const_float64( 0x3f81111111111111ll ), /* 5 */
3499 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */
3500 const_float64( 0x3f2a01a01a01a01all ), /* 7 */
3501 const_float64( 0x3efa01a01a01a01all ), /* 8 */
3502 const_float64( 0x3ec71de3a556c734ll ), /* 9 */
3503 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
3504 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
3505 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
3506 const_float64( 0x3de6124613a86d09ll ), /* 13 */
3507 const_float64( 0x3da93974a8c07c9dll ), /* 14 */
3508 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
3511 float32 float32_exp2(float32 a, float_status *status)
3513 flag aSign;
3514 int aExp;
3515 uint32_t aSig;
3516 float64 r, x, xn;
3517 int i;
3518 a = float32_squash_input_denormal(a, status);
3520 aSig = extractFloat32Frac( a );
3521 aExp = extractFloat32Exp( a );
3522 aSign = extractFloat32Sign( a );
3524 if ( aExp == 0xFF) {
3525 if (aSig) {
3526 return propagateFloat32NaN(a, float32_zero, status);
3528 return (aSign) ? float32_zero : a;
3530 if (aExp == 0) {
3531 if (aSig == 0) return float32_one;
3534 float_raise(float_flag_inexact, status);
3536 /* ******************************* */
3537 /* using float64 for approximation */
3538 /* ******************************* */
3539 x = float32_to_float64(a, status);
3540 x = float64_mul(x, float64_ln2, status);
3542 xn = x;
3543 r = float64_one;
3544 for (i = 0 ; i < 15 ; i++) {
3545 float64 f;
3547 f = float64_mul(xn, float32_exp2_coefficients[i], status);
3548 r = float64_add(r, f, status);
3550 xn = float64_mul(xn, x, status);
3553 return float64_to_float32(r, status);
3556 /*----------------------------------------------------------------------------
3557 | Returns the binary log of the single-precision floating-point value `a'.
3558 | The operation is performed according to the IEC/IEEE Standard for Binary
3559 | Floating-Point Arithmetic.
3560 *----------------------------------------------------------------------------*/
3561 float32 float32_log2(float32 a, float_status *status)
3563 flag aSign, zSign;
3564 int aExp;
3565 uint32_t aSig, zSig, i;
3567 a = float32_squash_input_denormal(a, status);
3568 aSig = extractFloat32Frac( a );
3569 aExp = extractFloat32Exp( a );
3570 aSign = extractFloat32Sign( a );
3572 if ( aExp == 0 ) {
3573 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
3574 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3576 if ( aSign ) {
3577 float_raise(float_flag_invalid, status);
3578 return float32_default_nan(status);
3580 if ( aExp == 0xFF ) {
3581 if (aSig) {
3582 return propagateFloat32NaN(a, float32_zero, status);
3584 return a;
3587 aExp -= 0x7F;
3588 aSig |= 0x00800000;
3589 zSign = aExp < 0;
3590 zSig = aExp << 23;
3592 for (i = 1 << 22; i > 0; i >>= 1) {
3593 aSig = ( (uint64_t)aSig * aSig ) >> 23;
3594 if ( aSig & 0x01000000 ) {
3595 aSig >>= 1;
3596 zSig |= i;
3600 if ( zSign )
3601 zSig = -zSig;
3603 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
3606 /*----------------------------------------------------------------------------
3607 | Returns 1 if the single-precision floating-point value `a' is equal to
3608 | the corresponding value `b', and 0 otherwise. The invalid exception is
3609 | raised if either operand is a NaN. Otherwise, the comparison is performed
3610 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3611 *----------------------------------------------------------------------------*/
3613 int float32_eq(float32 a, float32 b, float_status *status)
3615 uint32_t av, bv;
3616 a = float32_squash_input_denormal(a, status);
3617 b = float32_squash_input_denormal(b, status);
3619 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3620 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3622 float_raise(float_flag_invalid, status);
3623 return 0;
3625 av = float32_val(a);
3626 bv = float32_val(b);
3627 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3630 /*----------------------------------------------------------------------------
3631 | Returns 1 if the single-precision floating-point value `a' is less than
3632 | or equal to the corresponding value `b', and 0 otherwise. The invalid
3633 | exception is raised if either operand is a NaN. The comparison is performed
3634 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3635 *----------------------------------------------------------------------------*/
3637 int float32_le(float32 a, float32 b, float_status *status)
3639 flag aSign, bSign;
3640 uint32_t av, bv;
3641 a = float32_squash_input_denormal(a, status);
3642 b = float32_squash_input_denormal(b, status);
3644 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3645 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3647 float_raise(float_flag_invalid, status);
3648 return 0;
3650 aSign = extractFloat32Sign( a );
3651 bSign = extractFloat32Sign( b );
3652 av = float32_val(a);
3653 bv = float32_val(b);
3654 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3655 return ( av == bv ) || ( aSign ^ ( av < bv ) );
3659 /*----------------------------------------------------------------------------
3660 | Returns 1 if the single-precision floating-point value `a' is less than
3661 | the corresponding value `b', and 0 otherwise. The invalid exception is
3662 | raised if either operand is a NaN. The comparison is performed according
3663 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3664 *----------------------------------------------------------------------------*/
3666 int float32_lt(float32 a, float32 b, float_status *status)
3668 flag aSign, bSign;
3669 uint32_t av, bv;
3670 a = float32_squash_input_denormal(a, status);
3671 b = float32_squash_input_denormal(b, status);
3673 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3674 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3676 float_raise(float_flag_invalid, status);
3677 return 0;
3679 aSign = extractFloat32Sign( a );
3680 bSign = extractFloat32Sign( b );
3681 av = float32_val(a);
3682 bv = float32_val(b);
3683 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3684 return ( av != bv ) && ( aSign ^ ( av < bv ) );
3688 /*----------------------------------------------------------------------------
3689 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3690 | be compared, and 0 otherwise. The invalid exception is raised if either
3691 | operand is a NaN. The comparison is performed according to the IEC/IEEE
3692 | Standard for Binary Floating-Point Arithmetic.
3693 *----------------------------------------------------------------------------*/
3695 int float32_unordered(float32 a, float32 b, float_status *status)
3697 a = float32_squash_input_denormal(a, status);
3698 b = float32_squash_input_denormal(b, status);
3700 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3701 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3703 float_raise(float_flag_invalid, status);
3704 return 1;
3706 return 0;
3709 /*----------------------------------------------------------------------------
3710 | Returns 1 if the single-precision floating-point value `a' is equal to
3711 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3712 | exception. The comparison is performed according to the IEC/IEEE Standard
3713 | for Binary Floating-Point Arithmetic.
3714 *----------------------------------------------------------------------------*/
3716 int float32_eq_quiet(float32 a, float32 b, float_status *status)
3718 a = float32_squash_input_denormal(a, status);
3719 b = float32_squash_input_denormal(b, status);
3721 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3722 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3724 if (float32_is_signaling_nan(a, status)
3725 || float32_is_signaling_nan(b, status)) {
3726 float_raise(float_flag_invalid, status);
3728 return 0;
3730 return ( float32_val(a) == float32_val(b) ) ||
3731 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
3734 /*----------------------------------------------------------------------------
3735 | Returns 1 if the single-precision floating-point value `a' is less than or
3736 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3737 | cause an exception. Otherwise, the comparison is performed according to the
3738 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3739 *----------------------------------------------------------------------------*/
3741 int float32_le_quiet(float32 a, float32 b, float_status *status)
3743 flag aSign, bSign;
3744 uint32_t av, bv;
3745 a = float32_squash_input_denormal(a, status);
3746 b = float32_squash_input_denormal(b, status);
3748 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3749 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3751 if (float32_is_signaling_nan(a, status)
3752 || float32_is_signaling_nan(b, status)) {
3753 float_raise(float_flag_invalid, status);
3755 return 0;
3757 aSign = extractFloat32Sign( a );
3758 bSign = extractFloat32Sign( b );
3759 av = float32_val(a);
3760 bv = float32_val(b);
3761 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3762 return ( av == bv ) || ( aSign ^ ( av < bv ) );
3766 /*----------------------------------------------------------------------------
3767 | Returns 1 if the single-precision floating-point value `a' is less than
3768 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3769 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
3770 | Standard for Binary Floating-Point Arithmetic.
3771 *----------------------------------------------------------------------------*/
3773 int float32_lt_quiet(float32 a, float32 b, float_status *status)
3775 flag aSign, bSign;
3776 uint32_t av, bv;
3777 a = float32_squash_input_denormal(a, status);
3778 b = float32_squash_input_denormal(b, status);
3780 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3781 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3783 if (float32_is_signaling_nan(a, status)
3784 || float32_is_signaling_nan(b, status)) {
3785 float_raise(float_flag_invalid, status);
3787 return 0;
3789 aSign = extractFloat32Sign( a );
3790 bSign = extractFloat32Sign( b );
3791 av = float32_val(a);
3792 bv = float32_val(b);
3793 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3794 return ( av != bv ) && ( aSign ^ ( av < bv ) );
3798 /*----------------------------------------------------------------------------
3799 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3800 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
3801 | comparison is performed according to the IEC/IEEE Standard for Binary
3802 | Floating-Point Arithmetic.
3803 *----------------------------------------------------------------------------*/
3805 int float32_unordered_quiet(float32 a, float32 b, float_status *status)
3807 a = float32_squash_input_denormal(a, status);
3808 b = float32_squash_input_denormal(b, status);
3810 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3811 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3813 if (float32_is_signaling_nan(a, status)
3814 || float32_is_signaling_nan(b, status)) {
3815 float_raise(float_flag_invalid, status);
3817 return 1;
3819 return 0;
3822 /*----------------------------------------------------------------------------
3823 | If `a' is denormal and we are in flush-to-zero mode then set the
3824 | input-denormal exception and return zero. Otherwise just return the value.
3825 *----------------------------------------------------------------------------*/
3826 float16 float16_squash_input_denormal(float16 a, float_status *status)
3828 if (status->flush_inputs_to_zero) {
3829 if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) {
3830 float_raise(float_flag_input_denormal, status);
3831 return make_float16(float16_val(a) & 0x8000);
3834 return a;
3837 /*----------------------------------------------------------------------------
3838 | Returns the result of converting the double-precision floating-point value
3839 | `a' to the extended double-precision floating-point format. The conversion
3840 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3841 | Arithmetic.
3842 *----------------------------------------------------------------------------*/
3844 floatx80 float64_to_floatx80(float64 a, float_status *status)
3846 flag aSign;
3847 int aExp;
3848 uint64_t aSig;
3850 a = float64_squash_input_denormal(a, status);
3851 aSig = extractFloat64Frac( a );
3852 aExp = extractFloat64Exp( a );
3853 aSign = extractFloat64Sign( a );
3854 if ( aExp == 0x7FF ) {
3855 if (aSig) {
3856 return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
3858 return packFloatx80(aSign,
3859 floatx80_infinity_high,
3860 floatx80_infinity_low);
3862 if ( aExp == 0 ) {
3863 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3864 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3866 return
3867 packFloatx80(
3868 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
3872 /*----------------------------------------------------------------------------
3873 | Returns the result of converting the double-precision floating-point value
3874 | `a' to the quadruple-precision floating-point format. The conversion is
3875 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3876 | Arithmetic.
3877 *----------------------------------------------------------------------------*/
3879 float128 float64_to_float128(float64 a, float_status *status)
3881 flag aSign;
3882 int aExp;
3883 uint64_t aSig, zSig0, zSig1;
3885 a = float64_squash_input_denormal(a, status);
3886 aSig = extractFloat64Frac( a );
3887 aExp = extractFloat64Exp( a );
3888 aSign = extractFloat64Sign( a );
3889 if ( aExp == 0x7FF ) {
3890 if (aSig) {
3891 return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
3893 return packFloat128( aSign, 0x7FFF, 0, 0 );
3895 if ( aExp == 0 ) {
3896 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3897 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3898 --aExp;
3900 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
3901 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
3906 /*----------------------------------------------------------------------------
3907 | Returns the remainder of the double-precision floating-point value `a'
3908 | with respect to the corresponding value `b'. The operation is performed
3909 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3910 *----------------------------------------------------------------------------*/
3912 float64 float64_rem(float64 a, float64 b, float_status *status)
3914 flag aSign, zSign;
3915 int aExp, bExp, expDiff;
3916 uint64_t aSig, bSig;
3917 uint64_t q, alternateASig;
3918 int64_t sigMean;
3920 a = float64_squash_input_denormal(a, status);
3921 b = float64_squash_input_denormal(b, status);
3922 aSig = extractFloat64Frac( a );
3923 aExp = extractFloat64Exp( a );
3924 aSign = extractFloat64Sign( a );
3925 bSig = extractFloat64Frac( b );
3926 bExp = extractFloat64Exp( b );
3927 if ( aExp == 0x7FF ) {
3928 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3929 return propagateFloat64NaN(a, b, status);
3931 float_raise(float_flag_invalid, status);
3932 return float64_default_nan(status);
3934 if ( bExp == 0x7FF ) {
3935 if (bSig) {
3936 return propagateFloat64NaN(a, b, status);
3938 return a;
3940 if ( bExp == 0 ) {
3941 if ( bSig == 0 ) {
3942 float_raise(float_flag_invalid, status);
3943 return float64_default_nan(status);
3945 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3947 if ( aExp == 0 ) {
3948 if ( aSig == 0 ) return a;
3949 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3951 expDiff = aExp - bExp;
3952 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3953 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3954 if ( expDiff < 0 ) {
3955 if ( expDiff < -1 ) return a;
3956 aSig >>= 1;
3958 q = ( bSig <= aSig );
3959 if ( q ) aSig -= bSig;
3960 expDiff -= 64;
3961 while ( 0 < expDiff ) {
3962 q = estimateDiv128To64( aSig, 0, bSig );
3963 q = ( 2 < q ) ? q - 2 : 0;
3964 aSig = - ( ( bSig>>2 ) * q );
3965 expDiff -= 62;
3967 expDiff += 64;
3968 if ( 0 < expDiff ) {
3969 q = estimateDiv128To64( aSig, 0, bSig );
3970 q = ( 2 < q ) ? q - 2 : 0;
3971 q >>= 64 - expDiff;
3972 bSig >>= 2;
3973 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3975 else {
3976 aSig >>= 2;
3977 bSig >>= 2;
3979 do {
3980 alternateASig = aSig;
3981 ++q;
3982 aSig -= bSig;
3983 } while ( 0 <= (int64_t) aSig );
3984 sigMean = aSig + alternateASig;
3985 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3986 aSig = alternateASig;
3988 zSign = ( (int64_t) aSig < 0 );
3989 if ( zSign ) aSig = - aSig;
3990 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
3994 /*----------------------------------------------------------------------------
3995 | Returns the binary log of the double-precision floating-point value `a'.
3996 | The operation is performed according to the IEC/IEEE Standard for Binary
3997 | Floating-Point Arithmetic.
3998 *----------------------------------------------------------------------------*/
3999 float64 float64_log2(float64 a, float_status *status)
4001 flag aSign, zSign;
4002 int aExp;
4003 uint64_t aSig, aSig0, aSig1, zSig, i;
4004 a = float64_squash_input_denormal(a, status);
4006 aSig = extractFloat64Frac( a );
4007 aExp = extractFloat64Exp( a );
4008 aSign = extractFloat64Sign( a );
4010 if ( aExp == 0 ) {
4011 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4012 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4014 if ( aSign ) {
4015 float_raise(float_flag_invalid, status);
4016 return float64_default_nan(status);
4018 if ( aExp == 0x7FF ) {
4019 if (aSig) {
4020 return propagateFloat64NaN(a, float64_zero, status);
4022 return a;
4025 aExp -= 0x3FF;
4026 aSig |= LIT64( 0x0010000000000000 );
4027 zSign = aExp < 0;
4028 zSig = (uint64_t)aExp << 52;
4029 for (i = 1LL << 51; i > 0; i >>= 1) {
4030 mul64To128( aSig, aSig, &aSig0, &aSig1 );
4031 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4032 if ( aSig & LIT64( 0x0020000000000000 ) ) {
4033 aSig >>= 1;
4034 zSig |= i;
4038 if ( zSign )
4039 zSig = -zSig;
4040 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
4043 /*----------------------------------------------------------------------------
4044 | Returns 1 if the double-precision floating-point value `a' is equal to the
4045 | corresponding value `b', and 0 otherwise. The invalid exception is raised
4046 | if either operand is a NaN. Otherwise, the comparison is performed
4047 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4048 *----------------------------------------------------------------------------*/
4050 int float64_eq(float64 a, float64 b, float_status *status)
4052 uint64_t av, bv;
4053 a = float64_squash_input_denormal(a, status);
4054 b = float64_squash_input_denormal(b, status);
4056 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4057 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4059 float_raise(float_flag_invalid, status);
4060 return 0;
4062 av = float64_val(a);
4063 bv = float64_val(b);
4064 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4068 /*----------------------------------------------------------------------------
4069 | Returns 1 if the double-precision floating-point value `a' is less than or
4070 | equal to the corresponding value `b', and 0 otherwise. The invalid
4071 | exception is raised if either operand is a NaN. The comparison is performed
4072 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4073 *----------------------------------------------------------------------------*/
4075 int float64_le(float64 a, float64 b, float_status *status)
4077 flag aSign, bSign;
4078 uint64_t av, bv;
4079 a = float64_squash_input_denormal(a, status);
4080 b = float64_squash_input_denormal(b, status);
4082 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4083 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4085 float_raise(float_flag_invalid, status);
4086 return 0;
4088 aSign = extractFloat64Sign( a );
4089 bSign = extractFloat64Sign( b );
4090 av = float64_val(a);
4091 bv = float64_val(b);
4092 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4093 return ( av == bv ) || ( aSign ^ ( av < bv ) );
4097 /*----------------------------------------------------------------------------
4098 | Returns 1 if the double-precision floating-point value `a' is less than
4099 | the corresponding value `b', and 0 otherwise. The invalid exception is
4100 | raised if either operand is a NaN. The comparison is performed according
4101 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4102 *----------------------------------------------------------------------------*/
4104 int float64_lt(float64 a, float64 b, float_status *status)
4106 flag aSign, bSign;
4107 uint64_t av, bv;
4109 a = float64_squash_input_denormal(a, status);
4110 b = float64_squash_input_denormal(b, status);
4111 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4112 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4114 float_raise(float_flag_invalid, status);
4115 return 0;
4117 aSign = extractFloat64Sign( a );
4118 bSign = extractFloat64Sign( b );
4119 av = float64_val(a);
4120 bv = float64_val(b);
4121 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4122 return ( av != bv ) && ( aSign ^ ( av < bv ) );
4126 /*----------------------------------------------------------------------------
4127 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4128 | be compared, and 0 otherwise. The invalid exception is raised if either
4129 | operand is a NaN. The comparison is performed according to the IEC/IEEE
4130 | Standard for Binary Floating-Point Arithmetic.
4131 *----------------------------------------------------------------------------*/
4133 int float64_unordered(float64 a, float64 b, float_status *status)
4135 a = float64_squash_input_denormal(a, status);
4136 b = float64_squash_input_denormal(b, status);
4138 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4139 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4141 float_raise(float_flag_invalid, status);
4142 return 1;
4144 return 0;
4147 /*----------------------------------------------------------------------------
4148 | Returns 1 if the double-precision floating-point value `a' is equal to the
4149 | corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4150 | exception.The comparison is performed according to the IEC/IEEE Standard
4151 | for Binary Floating-Point Arithmetic.
4152 *----------------------------------------------------------------------------*/
4154 int float64_eq_quiet(float64 a, float64 b, float_status *status)
4156 uint64_t av, bv;
4157 a = float64_squash_input_denormal(a, status);
4158 b = float64_squash_input_denormal(b, status);
4160 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4161 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4163 if (float64_is_signaling_nan(a, status)
4164 || float64_is_signaling_nan(b, status)) {
4165 float_raise(float_flag_invalid, status);
4167 return 0;
4169 av = float64_val(a);
4170 bv = float64_val(b);
4171 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4175 /*----------------------------------------------------------------------------
4176 | Returns 1 if the double-precision floating-point value `a' is less than or
4177 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4178 | cause an exception. Otherwise, the comparison is performed according to the
4179 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4180 *----------------------------------------------------------------------------*/
4182 int float64_le_quiet(float64 a, float64 b, float_status *status)
4184 flag aSign, bSign;
4185 uint64_t av, bv;
4186 a = float64_squash_input_denormal(a, status);
4187 b = float64_squash_input_denormal(b, status);
4189 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4190 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4192 if (float64_is_signaling_nan(a, status)
4193 || float64_is_signaling_nan(b, status)) {
4194 float_raise(float_flag_invalid, status);
4196 return 0;
4198 aSign = extractFloat64Sign( a );
4199 bSign = extractFloat64Sign( b );
4200 av = float64_val(a);
4201 bv = float64_val(b);
4202 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4203 return ( av == bv ) || ( aSign ^ ( av < bv ) );
4207 /*----------------------------------------------------------------------------
4208 | Returns 1 if the double-precision floating-point value `a' is less than
4209 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4210 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
4211 | Standard for Binary Floating-Point Arithmetic.
4212 *----------------------------------------------------------------------------*/
4214 int float64_lt_quiet(float64 a, float64 b, float_status *status)
4216 flag aSign, bSign;
4217 uint64_t av, bv;
4218 a = float64_squash_input_denormal(a, status);
4219 b = float64_squash_input_denormal(b, status);
4221 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4222 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4224 if (float64_is_signaling_nan(a, status)
4225 || float64_is_signaling_nan(b, status)) {
4226 float_raise(float_flag_invalid, status);
4228 return 0;
4230 aSign = extractFloat64Sign( a );
4231 bSign = extractFloat64Sign( b );
4232 av = float64_val(a);
4233 bv = float64_val(b);
4234 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4235 return ( av != bv ) && ( aSign ^ ( av < bv ) );
4239 /*----------------------------------------------------------------------------
4240 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4241 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
4242 | comparison is performed according to the IEC/IEEE Standard for Binary
4243 | Floating-Point Arithmetic.
4244 *----------------------------------------------------------------------------*/
4246 int float64_unordered_quiet(float64 a, float64 b, float_status *status)
4248 a = float64_squash_input_denormal(a, status);
4249 b = float64_squash_input_denormal(b, status);
4251 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4252 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4254 if (float64_is_signaling_nan(a, status)
4255 || float64_is_signaling_nan(b, status)) {
4256 float_raise(float_flag_invalid, status);
4258 return 1;
4260 return 0;
4263 /*----------------------------------------------------------------------------
4264 | Returns the result of converting the extended double-precision floating-
4265 | point value `a' to the 32-bit two's complement integer format. The
4266 | conversion is performed according to the IEC/IEEE Standard for Binary
4267 | Floating-Point Arithmetic---which means in particular that the conversion
4268 | is rounded according to the current rounding mode. If `a' is a NaN, the
4269 | largest positive integer is returned. Otherwise, if the conversion
4270 | overflows, the largest integer with the same sign as `a' is returned.
4271 *----------------------------------------------------------------------------*/
4273 int32_t floatx80_to_int32(floatx80 a, float_status *status)
4275 flag aSign;
4276 int32_t aExp, shiftCount;
4277 uint64_t aSig;
4279 if (floatx80_invalid_encoding(a)) {
4280 float_raise(float_flag_invalid, status);
4281 return 1 << 31;
4283 aSig = extractFloatx80Frac( a );
4284 aExp = extractFloatx80Exp( a );
4285 aSign = extractFloatx80Sign( a );
4286 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4287 shiftCount = 0x4037 - aExp;
4288 if ( shiftCount <= 0 ) shiftCount = 1;
4289 shift64RightJamming( aSig, shiftCount, &aSig );
4290 return roundAndPackInt32(aSign, aSig, status);
4294 /*----------------------------------------------------------------------------
4295 | Returns the result of converting the extended double-precision floating-
4296 | point value `a' to the 32-bit two's complement integer format. The
4297 | conversion is performed according to the IEC/IEEE Standard for Binary
4298 | Floating-Point Arithmetic, except that the conversion is always rounded
4299 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4300 | Otherwise, if the conversion overflows, the largest integer with the same
4301 | sign as `a' is returned.
4302 *----------------------------------------------------------------------------*/
4304 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
4306 flag aSign;
4307 int32_t aExp, shiftCount;
4308 uint64_t aSig, savedASig;
4309 int32_t z;
4311 if (floatx80_invalid_encoding(a)) {
4312 float_raise(float_flag_invalid, status);
4313 return 1 << 31;
4315 aSig = extractFloatx80Frac( a );
4316 aExp = extractFloatx80Exp( a );
4317 aSign = extractFloatx80Sign( a );
4318 if ( 0x401E < aExp ) {
4319 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4320 goto invalid;
4322 else if ( aExp < 0x3FFF ) {
4323 if (aExp || aSig) {
4324 status->float_exception_flags |= float_flag_inexact;
4326 return 0;
4328 shiftCount = 0x403E - aExp;
4329 savedASig = aSig;
4330 aSig >>= shiftCount;
4331 z = aSig;
4332 if ( aSign ) z = - z;
4333 if ( ( z < 0 ) ^ aSign ) {
4334 invalid:
4335 float_raise(float_flag_invalid, status);
4336 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4338 if ( ( aSig<<shiftCount ) != savedASig ) {
4339 status->float_exception_flags |= float_flag_inexact;
4341 return z;
4345 /*----------------------------------------------------------------------------
4346 | Returns the result of converting the extended double-precision floating-
4347 | point value `a' to the 64-bit two's complement integer format. The
4348 | conversion is performed according to the IEC/IEEE Standard for Binary
4349 | Floating-Point Arithmetic---which means in particular that the conversion
4350 | is rounded according to the current rounding mode. If `a' is a NaN,
4351 | the largest positive integer is returned. Otherwise, if the conversion
4352 | overflows, the largest integer with the same sign as `a' is returned.
4353 *----------------------------------------------------------------------------*/
4355 int64_t floatx80_to_int64(floatx80 a, float_status *status)
4357 flag aSign;
4358 int32_t aExp, shiftCount;
4359 uint64_t aSig, aSigExtra;
4361 if (floatx80_invalid_encoding(a)) {
4362 float_raise(float_flag_invalid, status);
4363 return 1ULL << 63;
4365 aSig = extractFloatx80Frac( a );
4366 aExp = extractFloatx80Exp( a );
4367 aSign = extractFloatx80Sign( a );
4368 shiftCount = 0x403E - aExp;
4369 if ( shiftCount <= 0 ) {
4370 if ( shiftCount ) {
4371 float_raise(float_flag_invalid, status);
4372 if (!aSign || floatx80_is_any_nan(a)) {
4373 return LIT64( 0x7FFFFFFFFFFFFFFF );
4375 return (int64_t) LIT64( 0x8000000000000000 );
4377 aSigExtra = 0;
4379 else {
4380 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4382 return roundAndPackInt64(aSign, aSig, aSigExtra, status);
4386 /*----------------------------------------------------------------------------
4387 | Returns the result of converting the extended double-precision floating-
4388 | point value `a' to the 64-bit two's complement integer format. The
4389 | conversion is performed according to the IEC/IEEE Standard for Binary
4390 | Floating-Point Arithmetic, except that the conversion is always rounded
4391 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4392 | Otherwise, if the conversion overflows, the largest integer with the same
4393 | sign as `a' is returned.
4394 *----------------------------------------------------------------------------*/
4396 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
4398 flag aSign;
4399 int32_t aExp, shiftCount;
4400 uint64_t aSig;
4401 int64_t z;
4403 if (floatx80_invalid_encoding(a)) {
4404 float_raise(float_flag_invalid, status);
4405 return 1ULL << 63;
4407 aSig = extractFloatx80Frac( a );
4408 aExp = extractFloatx80Exp( a );
4409 aSign = extractFloatx80Sign( a );
4410 shiftCount = aExp - 0x403E;
4411 if ( 0 <= shiftCount ) {
4412 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4413 if ( ( a.high != 0xC03E ) || aSig ) {
4414 float_raise(float_flag_invalid, status);
4415 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4416 return LIT64( 0x7FFFFFFFFFFFFFFF );
4419 return (int64_t) LIT64( 0x8000000000000000 );
4421 else if ( aExp < 0x3FFF ) {
4422 if (aExp | aSig) {
4423 status->float_exception_flags |= float_flag_inexact;
4425 return 0;
4427 z = aSig>>( - shiftCount );
4428 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4429 status->float_exception_flags |= float_flag_inexact;
4431 if ( aSign ) z = - z;
4432 return z;
4436 /*----------------------------------------------------------------------------
4437 | Returns the result of converting the extended double-precision floating-
4438 | point value `a' to the single-precision floating-point format. The
4439 | conversion is performed according to the IEC/IEEE Standard for Binary
4440 | Floating-Point Arithmetic.
4441 *----------------------------------------------------------------------------*/
4443 float32 floatx80_to_float32(floatx80 a, float_status *status)
4445 flag aSign;
4446 int32_t aExp;
4447 uint64_t aSig;
4449 if (floatx80_invalid_encoding(a)) {
4450 float_raise(float_flag_invalid, status);
4451 return float32_default_nan(status);
4453 aSig = extractFloatx80Frac( a );
4454 aExp = extractFloatx80Exp( a );
4455 aSign = extractFloatx80Sign( a );
4456 if ( aExp == 0x7FFF ) {
4457 if ( (uint64_t) ( aSig<<1 ) ) {
4458 return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
4460 return packFloat32( aSign, 0xFF, 0 );
4462 shift64RightJamming( aSig, 33, &aSig );
4463 if ( aExp || aSig ) aExp -= 0x3F81;
4464 return roundAndPackFloat32(aSign, aExp, aSig, status);
4468 /*----------------------------------------------------------------------------
4469 | Returns the result of converting the extended double-precision floating-
4470 | point value `a' to the double-precision floating-point format. The
4471 | conversion is performed according to the IEC/IEEE Standard for Binary
4472 | Floating-Point Arithmetic.
4473 *----------------------------------------------------------------------------*/
4475 float64 floatx80_to_float64(floatx80 a, float_status *status)
4477 flag aSign;
4478 int32_t aExp;
4479 uint64_t aSig, zSig;
4481 if (floatx80_invalid_encoding(a)) {
4482 float_raise(float_flag_invalid, status);
4483 return float64_default_nan(status);
4485 aSig = extractFloatx80Frac( a );
4486 aExp = extractFloatx80Exp( a );
4487 aSign = extractFloatx80Sign( a );
4488 if ( aExp == 0x7FFF ) {
4489 if ( (uint64_t) ( aSig<<1 ) ) {
4490 return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
4492 return packFloat64( aSign, 0x7FF, 0 );
4494 shift64RightJamming( aSig, 1, &zSig );
4495 if ( aExp || aSig ) aExp -= 0x3C01;
4496 return roundAndPackFloat64(aSign, aExp, zSig, status);
4500 /*----------------------------------------------------------------------------
4501 | Returns the result of converting the extended double-precision floating-
4502 | point value `a' to the quadruple-precision floating-point format. The
4503 | conversion is performed according to the IEC/IEEE Standard for Binary
4504 | Floating-Point Arithmetic.
4505 *----------------------------------------------------------------------------*/
4507 float128 floatx80_to_float128(floatx80 a, float_status *status)
4509 flag aSign;
4510 int aExp;
4511 uint64_t aSig, zSig0, zSig1;
4513 if (floatx80_invalid_encoding(a)) {
4514 float_raise(float_flag_invalid, status);
4515 return float128_default_nan(status);
4517 aSig = extractFloatx80Frac( a );
4518 aExp = extractFloatx80Exp( a );
4519 aSign = extractFloatx80Sign( a );
4520 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4521 return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
4523 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4524 return packFloat128( aSign, aExp, zSig0, zSig1 );
4528 /*----------------------------------------------------------------------------
4529 | Rounds the extended double-precision floating-point value `a'
4530 | to the precision provided by floatx80_rounding_precision and returns the
4531 | result as an extended double-precision floating-point value.
4532 | The operation is performed according to the IEC/IEEE Standard for Binary
4533 | Floating-Point Arithmetic.
4534 *----------------------------------------------------------------------------*/
4536 floatx80 floatx80_round(floatx80 a, float_status *status)
4538 return roundAndPackFloatx80(status->floatx80_rounding_precision,
4539 extractFloatx80Sign(a),
4540 extractFloatx80Exp(a),
4541 extractFloatx80Frac(a), 0, status);
4544 /*----------------------------------------------------------------------------
4545 | Rounds the extended double-precision floating-point value `a' to an integer,
4546 | and returns the result as an extended quadruple-precision floating-point
4547 | value. The operation is performed according to the IEC/IEEE Standard for
4548 | Binary Floating-Point Arithmetic.
4549 *----------------------------------------------------------------------------*/
4551 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
4553 flag aSign;
4554 int32_t aExp;
4555 uint64_t lastBitMask, roundBitsMask;
4556 floatx80 z;
4558 if (floatx80_invalid_encoding(a)) {
4559 float_raise(float_flag_invalid, status);
4560 return floatx80_default_nan(status);
4562 aExp = extractFloatx80Exp( a );
4563 if ( 0x403E <= aExp ) {
4564 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4565 return propagateFloatx80NaN(a, a, status);
4567 return a;
4569 if ( aExp < 0x3FFF ) {
4570 if ( ( aExp == 0 )
4571 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4572 return a;
4574 status->float_exception_flags |= float_flag_inexact;
4575 aSign = extractFloatx80Sign( a );
4576 switch (status->float_rounding_mode) {
4577 case float_round_nearest_even:
4578 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4580 return
4581 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4583 break;
4584 case float_round_ties_away:
4585 if (aExp == 0x3FFE) {
4586 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
4588 break;
4589 case float_round_down:
4590 return
4591 aSign ?
4592 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4593 : packFloatx80( 0, 0, 0 );
4594 case float_round_up:
4595 return
4596 aSign ? packFloatx80( 1, 0, 0 )
4597 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4599 return packFloatx80( aSign, 0, 0 );
4601 lastBitMask = 1;
4602 lastBitMask <<= 0x403E - aExp;
4603 roundBitsMask = lastBitMask - 1;
4604 z = a;
4605 switch (status->float_rounding_mode) {
4606 case float_round_nearest_even:
4607 z.low += lastBitMask>>1;
4608 if ((z.low & roundBitsMask) == 0) {
4609 z.low &= ~lastBitMask;
4611 break;
4612 case float_round_ties_away:
4613 z.low += lastBitMask >> 1;
4614 break;
4615 case float_round_to_zero:
4616 break;
4617 case float_round_up:
4618 if (!extractFloatx80Sign(z)) {
4619 z.low += roundBitsMask;
4621 break;
4622 case float_round_down:
4623 if (extractFloatx80Sign(z)) {
4624 z.low += roundBitsMask;
4626 break;
4627 default:
4628 abort();
4630 z.low &= ~ roundBitsMask;
4631 if ( z.low == 0 ) {
4632 ++z.high;
4633 z.low = LIT64( 0x8000000000000000 );
4635 if (z.low != a.low) {
4636 status->float_exception_flags |= float_flag_inexact;
4638 return z;
4642 /*----------------------------------------------------------------------------
4643 | Returns the result of adding the absolute values of the extended double-
4644 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
4645 | negated before being returned. `zSign' is ignored if the result is a NaN.
4646 | The addition is performed according to the IEC/IEEE Standard for Binary
4647 | Floating-Point Arithmetic.
4648 *----------------------------------------------------------------------------*/
4650 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4651 float_status *status)
4653 int32_t aExp, bExp, zExp;
4654 uint64_t aSig, bSig, zSig0, zSig1;
4655 int32_t expDiff;
4657 aSig = extractFloatx80Frac( a );
4658 aExp = extractFloatx80Exp( a );
4659 bSig = extractFloatx80Frac( b );
4660 bExp = extractFloatx80Exp( b );
4661 expDiff = aExp - bExp;
4662 if ( 0 < expDiff ) {
4663 if ( aExp == 0x7FFF ) {
4664 if ((uint64_t)(aSig << 1)) {
4665 return propagateFloatx80NaN(a, b, status);
4667 return a;
4669 if ( bExp == 0 ) --expDiff;
4670 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4671 zExp = aExp;
4673 else if ( expDiff < 0 ) {
4674 if ( bExp == 0x7FFF ) {
4675 if ((uint64_t)(bSig << 1)) {
4676 return propagateFloatx80NaN(a, b, status);
4678 return packFloatx80(zSign,
4679 floatx80_infinity_high,
4680 floatx80_infinity_low);
4682 if ( aExp == 0 ) ++expDiff;
4683 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4684 zExp = bExp;
4686 else {
4687 if ( aExp == 0x7FFF ) {
4688 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4689 return propagateFloatx80NaN(a, b, status);
4691 return a;
4693 zSig1 = 0;
4694 zSig0 = aSig + bSig;
4695 if ( aExp == 0 ) {
4696 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4697 goto roundAndPack;
4699 zExp = aExp;
4700 goto shiftRight1;
4702 zSig0 = aSig + bSig;
4703 if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
4704 shiftRight1:
4705 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4706 zSig0 |= LIT64( 0x8000000000000000 );
4707 ++zExp;
4708 roundAndPack:
4709 return roundAndPackFloatx80(status->floatx80_rounding_precision,
4710 zSign, zExp, zSig0, zSig1, status);
4713 /*----------------------------------------------------------------------------
4714 | Returns the result of subtracting the absolute values of the extended
4715 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
4716 | difference is negated before being returned. `zSign' is ignored if the
4717 | result is a NaN. The subtraction is performed according to the IEC/IEEE
4718 | Standard for Binary Floating-Point Arithmetic.
4719 *----------------------------------------------------------------------------*/
4721 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4722 float_status *status)
4724 int32_t aExp, bExp, zExp;
4725 uint64_t aSig, bSig, zSig0, zSig1;
4726 int32_t expDiff;
4728 aSig = extractFloatx80Frac( a );
4729 aExp = extractFloatx80Exp( a );
4730 bSig = extractFloatx80Frac( b );
4731 bExp = extractFloatx80Exp( b );
4732 expDiff = aExp - bExp;
4733 if ( 0 < expDiff ) goto aExpBigger;
4734 if ( expDiff < 0 ) goto bExpBigger;
4735 if ( aExp == 0x7FFF ) {
4736 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4737 return propagateFloatx80NaN(a, b, status);
4739 float_raise(float_flag_invalid, status);
4740 return floatx80_default_nan(status);
4742 if ( aExp == 0 ) {
4743 aExp = 1;
4744 bExp = 1;
4746 zSig1 = 0;
4747 if ( bSig < aSig ) goto aBigger;
4748 if ( aSig < bSig ) goto bBigger;
4749 return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
4750 bExpBigger:
4751 if ( bExp == 0x7FFF ) {
4752 if ((uint64_t)(bSig << 1)) {
4753 return propagateFloatx80NaN(a, b, status);
4755 return packFloatx80(zSign ^ 1, floatx80_infinity_high,
4756 floatx80_infinity_low);
4758 if ( aExp == 0 ) ++expDiff;
4759 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4760 bBigger:
4761 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4762 zExp = bExp;
4763 zSign ^= 1;
4764 goto normalizeRoundAndPack;
4765 aExpBigger:
4766 if ( aExp == 0x7FFF ) {
4767 if ((uint64_t)(aSig << 1)) {
4768 return propagateFloatx80NaN(a, b, status);
4770 return a;
4772 if ( bExp == 0 ) --expDiff;
4773 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4774 aBigger:
4775 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4776 zExp = aExp;
4777 normalizeRoundAndPack:
4778 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
4779 zSign, zExp, zSig0, zSig1, status);
4782 /*----------------------------------------------------------------------------
4783 | Returns the result of adding the extended double-precision floating-point
4784 | values `a' and `b'. The operation is performed according to the IEC/IEEE
4785 | Standard for Binary Floating-Point Arithmetic.
4786 *----------------------------------------------------------------------------*/
4788 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
4790 flag aSign, bSign;
4792 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
4793 float_raise(float_flag_invalid, status);
4794 return floatx80_default_nan(status);
4796 aSign = extractFloatx80Sign( a );
4797 bSign = extractFloatx80Sign( b );
4798 if ( aSign == bSign ) {
4799 return addFloatx80Sigs(a, b, aSign, status);
4801 else {
4802 return subFloatx80Sigs(a, b, aSign, status);
4807 /*----------------------------------------------------------------------------
4808 | Returns the result of subtracting the extended double-precision floating-
4809 | point values `a' and `b'. The operation is performed according to the
4810 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4811 *----------------------------------------------------------------------------*/
4813 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
4815 flag aSign, bSign;
4817 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
4818 float_raise(float_flag_invalid, status);
4819 return floatx80_default_nan(status);
4821 aSign = extractFloatx80Sign( a );
4822 bSign = extractFloatx80Sign( b );
4823 if ( aSign == bSign ) {
4824 return subFloatx80Sigs(a, b, aSign, status);
4826 else {
4827 return addFloatx80Sigs(a, b, aSign, status);
4832 /*----------------------------------------------------------------------------
4833 | Returns the result of multiplying the extended double-precision floating-
4834 | point values `a' and `b'. The operation is performed according to the
4835 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4836 *----------------------------------------------------------------------------*/
4838 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
4840 flag aSign, bSign, zSign;
4841 int32_t aExp, bExp, zExp;
4842 uint64_t aSig, bSig, zSig0, zSig1;
4844 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
4845 float_raise(float_flag_invalid, status);
4846 return floatx80_default_nan(status);
4848 aSig = extractFloatx80Frac( a );
4849 aExp = extractFloatx80Exp( a );
4850 aSign = extractFloatx80Sign( a );
4851 bSig = extractFloatx80Frac( b );
4852 bExp = extractFloatx80Exp( b );
4853 bSign = extractFloatx80Sign( b );
4854 zSign = aSign ^ bSign;
4855 if ( aExp == 0x7FFF ) {
4856 if ( (uint64_t) ( aSig<<1 )
4857 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
4858 return propagateFloatx80NaN(a, b, status);
4860 if ( ( bExp | bSig ) == 0 ) goto invalid;
4861 return packFloatx80(zSign, floatx80_infinity_high,
4862 floatx80_infinity_low);
4864 if ( bExp == 0x7FFF ) {
4865 if ((uint64_t)(bSig << 1)) {
4866 return propagateFloatx80NaN(a, b, status);
4868 if ( ( aExp | aSig ) == 0 ) {
4869 invalid:
4870 float_raise(float_flag_invalid, status);
4871 return floatx80_default_nan(status);
4873 return packFloatx80(zSign, floatx80_infinity_high,
4874 floatx80_infinity_low);
4876 if ( aExp == 0 ) {
4877 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4878 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4880 if ( bExp == 0 ) {
4881 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
4882 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4884 zExp = aExp + bExp - 0x3FFE;
4885 mul64To128( aSig, bSig, &zSig0, &zSig1 );
4886 if ( 0 < (int64_t) zSig0 ) {
4887 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
4888 --zExp;
4890 return roundAndPackFloatx80(status->floatx80_rounding_precision,
4891 zSign, zExp, zSig0, zSig1, status);
4894 /*----------------------------------------------------------------------------
4895 | Returns the result of dividing the extended double-precision floating-point
4896 | value `a' by the corresponding value `b'. The operation is performed
4897 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4898 *----------------------------------------------------------------------------*/
4900 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
4902 flag aSign, bSign, zSign;
4903 int32_t aExp, bExp, zExp;
4904 uint64_t aSig, bSig, zSig0, zSig1;
4905 uint64_t rem0, rem1, rem2, term0, term1, term2;
4907 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
4908 float_raise(float_flag_invalid, status);
4909 return floatx80_default_nan(status);
4911 aSig = extractFloatx80Frac( a );
4912 aExp = extractFloatx80Exp( a );
4913 aSign = extractFloatx80Sign( a );
4914 bSig = extractFloatx80Frac( b );
4915 bExp = extractFloatx80Exp( b );
4916 bSign = extractFloatx80Sign( b );
4917 zSign = aSign ^ bSign;
4918 if ( aExp == 0x7FFF ) {
4919 if ((uint64_t)(aSig << 1)) {
4920 return propagateFloatx80NaN(a, b, status);
4922 if ( bExp == 0x7FFF ) {
4923 if ((uint64_t)(bSig << 1)) {
4924 return propagateFloatx80NaN(a, b, status);
4926 goto invalid;
4928 return packFloatx80(zSign, floatx80_infinity_high,
4929 floatx80_infinity_low);
4931 if ( bExp == 0x7FFF ) {
4932 if ((uint64_t)(bSig << 1)) {
4933 return propagateFloatx80NaN(a, b, status);
4935 return packFloatx80( zSign, 0, 0 );
4937 if ( bExp == 0 ) {
4938 if ( bSig == 0 ) {
4939 if ( ( aExp | aSig ) == 0 ) {
4940 invalid:
4941 float_raise(float_flag_invalid, status);
4942 return floatx80_default_nan(status);
4944 float_raise(float_flag_divbyzero, status);
4945 return packFloatx80(zSign, floatx80_infinity_high,
4946 floatx80_infinity_low);
4948 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4950 if ( aExp == 0 ) {
4951 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4952 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4954 zExp = aExp - bExp + 0x3FFE;
4955 rem1 = 0;
4956 if ( bSig <= aSig ) {
4957 shift128Right( aSig, 0, 1, &aSig, &rem1 );
4958 ++zExp;
4960 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
4961 mul64To128( bSig, zSig0, &term0, &term1 );
4962 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
4963 while ( (int64_t) rem0 < 0 ) {
4964 --zSig0;
4965 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4967 zSig1 = estimateDiv128To64( rem1, 0, bSig );
4968 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
4969 mul64To128( bSig, zSig1, &term1, &term2 );
4970 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4971 while ( (int64_t) rem1 < 0 ) {
4972 --zSig1;
4973 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
4975 zSig1 |= ( ( rem1 | rem2 ) != 0 );
4977 return roundAndPackFloatx80(status->floatx80_rounding_precision,
4978 zSign, zExp, zSig0, zSig1, status);
4981 /*----------------------------------------------------------------------------
4982 | Returns the remainder of the extended double-precision floating-point value
4983 | `a' with respect to the corresponding value `b'. The operation is performed
4984 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4985 *----------------------------------------------------------------------------*/
4987 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
4989 flag aSign, zSign;
4990 int32_t aExp, bExp, expDiff;
4991 uint64_t aSig0, aSig1, bSig;
4992 uint64_t q, term0, term1, alternateASig0, alternateASig1;
4994 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
4995 float_raise(float_flag_invalid, status);
4996 return floatx80_default_nan(status);
4998 aSig0 = extractFloatx80Frac( a );
4999 aExp = extractFloatx80Exp( a );
5000 aSign = extractFloatx80Sign( a );
5001 bSig = extractFloatx80Frac( b );
5002 bExp = extractFloatx80Exp( b );
5003 if ( aExp == 0x7FFF ) {
5004 if ( (uint64_t) ( aSig0<<1 )
5005 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5006 return propagateFloatx80NaN(a, b, status);
5008 goto invalid;
5010 if ( bExp == 0x7FFF ) {
5011 if ((uint64_t)(bSig << 1)) {
5012 return propagateFloatx80NaN(a, b, status);
5014 return a;
5016 if ( bExp == 0 ) {
5017 if ( bSig == 0 ) {
5018 invalid:
5019 float_raise(float_flag_invalid, status);
5020 return floatx80_default_nan(status);
5022 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5024 if ( aExp == 0 ) {
5025 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5026 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5028 bSig |= LIT64( 0x8000000000000000 );
5029 zSign = aSign;
5030 expDiff = aExp - bExp;
5031 aSig1 = 0;
5032 if ( expDiff < 0 ) {
5033 if ( expDiff < -1 ) return a;
5034 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5035 expDiff = 0;
5037 q = ( bSig <= aSig0 );
5038 if ( q ) aSig0 -= bSig;
5039 expDiff -= 64;
5040 while ( 0 < expDiff ) {
5041 q = estimateDiv128To64( aSig0, aSig1, bSig );
5042 q = ( 2 < q ) ? q - 2 : 0;
5043 mul64To128( bSig, q, &term0, &term1 );
5044 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5045 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5046 expDiff -= 62;
5048 expDiff += 64;
5049 if ( 0 < expDiff ) {
5050 q = estimateDiv128To64( aSig0, aSig1, bSig );
5051 q = ( 2 < q ) ? q - 2 : 0;
5052 q >>= 64 - expDiff;
5053 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5054 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5055 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5056 while ( le128( term0, term1, aSig0, aSig1 ) ) {
5057 ++q;
5058 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5061 else {
5062 term1 = 0;
5063 term0 = bSig;
5065 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5066 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5067 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5068 && ( q & 1 ) )
5070 aSig0 = alternateASig0;
5071 aSig1 = alternateASig1;
5072 zSign = ! zSign;
5074 return
5075 normalizeRoundAndPackFloatx80(
5076 80, zSign, bExp + expDiff, aSig0, aSig1, status);
5080 /*----------------------------------------------------------------------------
5081 | Returns the square root of the extended double-precision floating-point
5082 | value `a'. The operation is performed according to the IEC/IEEE Standard
5083 | for Binary Floating-Point Arithmetic.
5084 *----------------------------------------------------------------------------*/
5086 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
5088 flag aSign;
5089 int32_t aExp, zExp;
5090 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5091 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5093 if (floatx80_invalid_encoding(a)) {
5094 float_raise(float_flag_invalid, status);
5095 return floatx80_default_nan(status);
5097 aSig0 = extractFloatx80Frac( a );
5098 aExp = extractFloatx80Exp( a );
5099 aSign = extractFloatx80Sign( a );
5100 if ( aExp == 0x7FFF ) {
5101 if ((uint64_t)(aSig0 << 1)) {
5102 return propagateFloatx80NaN(a, a, status);
5104 if ( ! aSign ) return a;
5105 goto invalid;
5107 if ( aSign ) {
5108 if ( ( aExp | aSig0 ) == 0 ) return a;
5109 invalid:
5110 float_raise(float_flag_invalid, status);
5111 return floatx80_default_nan(status);
5113 if ( aExp == 0 ) {
5114 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5115 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5117 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5118 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5119 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5120 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5121 doubleZSig0 = zSig0<<1;
5122 mul64To128( zSig0, zSig0, &term0, &term1 );
5123 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5124 while ( (int64_t) rem0 < 0 ) {
5125 --zSig0;
5126 doubleZSig0 -= 2;
5127 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5129 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5130 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5131 if ( zSig1 == 0 ) zSig1 = 1;
5132 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5133 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5134 mul64To128( zSig1, zSig1, &term2, &term3 );
5135 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5136 while ( (int64_t) rem1 < 0 ) {
5137 --zSig1;
5138 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5139 term3 |= 1;
5140 term2 |= doubleZSig0;
5141 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5143 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5145 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5146 zSig0 |= doubleZSig0;
5147 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5148 0, zExp, zSig0, zSig1, status);
5151 /*----------------------------------------------------------------------------
5152 | Returns 1 if the extended double-precision floating-point value `a' is equal
5153 | to the corresponding value `b', and 0 otherwise. The invalid exception is
5154 | raised if either operand is a NaN. Otherwise, the comparison is performed
5155 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5156 *----------------------------------------------------------------------------*/
5158 int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
5161 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5162 || (extractFloatx80Exp(a) == 0x7FFF
5163 && (uint64_t) (extractFloatx80Frac(a) << 1))
5164 || (extractFloatx80Exp(b) == 0x7FFF
5165 && (uint64_t) (extractFloatx80Frac(b) << 1))
5167 float_raise(float_flag_invalid, status);
5168 return 0;
5170 return
5171 ( a.low == b.low )
5172 && ( ( a.high == b.high )
5173 || ( ( a.low == 0 )
5174 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5179 /*----------------------------------------------------------------------------
5180 | Returns 1 if the extended double-precision floating-point value `a' is
5181 | less than or equal to the corresponding value `b', and 0 otherwise. The
5182 | invalid exception is raised if either operand is a NaN. The comparison is
5183 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5184 | Arithmetic.
5185 *----------------------------------------------------------------------------*/
5187 int floatx80_le(floatx80 a, floatx80 b, float_status *status)
5189 flag aSign, bSign;
5191 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5192 || (extractFloatx80Exp(a) == 0x7FFF
5193 && (uint64_t) (extractFloatx80Frac(a) << 1))
5194 || (extractFloatx80Exp(b) == 0x7FFF
5195 && (uint64_t) (extractFloatx80Frac(b) << 1))
5197 float_raise(float_flag_invalid, status);
5198 return 0;
5200 aSign = extractFloatx80Sign( a );
5201 bSign = extractFloatx80Sign( b );
5202 if ( aSign != bSign ) {
5203 return
5204 aSign
5205 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5206 == 0 );
5208 return
5209 aSign ? le128( b.high, b.low, a.high, a.low )
5210 : le128( a.high, a.low, b.high, b.low );
5214 /*----------------------------------------------------------------------------
5215 | Returns 1 if the extended double-precision floating-point value `a' is
5216 | less than the corresponding value `b', and 0 otherwise. The invalid
5217 | exception is raised if either operand is a NaN. The comparison is performed
5218 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5219 *----------------------------------------------------------------------------*/
5221 int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
5223 flag aSign, bSign;
5225 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5226 || (extractFloatx80Exp(a) == 0x7FFF
5227 && (uint64_t) (extractFloatx80Frac(a) << 1))
5228 || (extractFloatx80Exp(b) == 0x7FFF
5229 && (uint64_t) (extractFloatx80Frac(b) << 1))
5231 float_raise(float_flag_invalid, status);
5232 return 0;
5234 aSign = extractFloatx80Sign( a );
5235 bSign = extractFloatx80Sign( b );
5236 if ( aSign != bSign ) {
5237 return
5238 aSign
5239 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5240 != 0 );
5242 return
5243 aSign ? lt128( b.high, b.low, a.high, a.low )
5244 : lt128( a.high, a.low, b.high, b.low );
5248 /*----------------------------------------------------------------------------
5249 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5250 | cannot be compared, and 0 otherwise. The invalid exception is raised if
5251 | either operand is a NaN. The comparison is performed according to the
5252 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5253 *----------------------------------------------------------------------------*/
5254 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
5256 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5257 || (extractFloatx80Exp(a) == 0x7FFF
5258 && (uint64_t) (extractFloatx80Frac(a) << 1))
5259 || (extractFloatx80Exp(b) == 0x7FFF
5260 && (uint64_t) (extractFloatx80Frac(b) << 1))
5262 float_raise(float_flag_invalid, status);
5263 return 1;
5265 return 0;
5268 /*----------------------------------------------------------------------------
5269 | Returns 1 if the extended double-precision floating-point value `a' is
5270 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5271 | cause an exception. The comparison is performed according to the IEC/IEEE
5272 | Standard for Binary Floating-Point Arithmetic.
5273 *----------------------------------------------------------------------------*/
5275 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
5278 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5279 float_raise(float_flag_invalid, status);
5280 return 0;
5282 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5283 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5284 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5285 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5287 if (floatx80_is_signaling_nan(a, status)
5288 || floatx80_is_signaling_nan(b, status)) {
5289 float_raise(float_flag_invalid, status);
5291 return 0;
5293 return
5294 ( a.low == b.low )
5295 && ( ( a.high == b.high )
5296 || ( ( a.low == 0 )
5297 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5302 /*----------------------------------------------------------------------------
5303 | Returns 1 if the extended double-precision floating-point value `a' is less
5304 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
5305 | do not cause an exception. Otherwise, the comparison is performed according
5306 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5307 *----------------------------------------------------------------------------*/
5309 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
5311 flag aSign, bSign;
5313 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5314 float_raise(float_flag_invalid, status);
5315 return 0;
5317 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5318 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5319 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5320 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5322 if (floatx80_is_signaling_nan(a, status)
5323 || floatx80_is_signaling_nan(b, status)) {
5324 float_raise(float_flag_invalid, status);
5326 return 0;
5328 aSign = extractFloatx80Sign( a );
5329 bSign = extractFloatx80Sign( b );
5330 if ( aSign != bSign ) {
5331 return
5332 aSign
5333 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5334 == 0 );
5336 return
5337 aSign ? le128( b.high, b.low, a.high, a.low )
5338 : le128( a.high, a.low, b.high, b.low );
5342 /*----------------------------------------------------------------------------
5343 | Returns 1 if the extended double-precision floating-point value `a' is less
5344 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
5345 | an exception. Otherwise, the comparison is performed according to the
5346 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5347 *----------------------------------------------------------------------------*/
5349 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
5351 flag aSign, bSign;
5353 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5354 float_raise(float_flag_invalid, status);
5355 return 0;
5357 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5358 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5359 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5360 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5362 if (floatx80_is_signaling_nan(a, status)
5363 || floatx80_is_signaling_nan(b, status)) {
5364 float_raise(float_flag_invalid, status);
5366 return 0;
5368 aSign = extractFloatx80Sign( a );
5369 bSign = extractFloatx80Sign( b );
5370 if ( aSign != bSign ) {
5371 return
5372 aSign
5373 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5374 != 0 );
5376 return
5377 aSign ? lt128( b.high, b.low, a.high, a.low )
5378 : lt128( a.high, a.low, b.high, b.low );
5382 /*----------------------------------------------------------------------------
5383 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5384 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
5385 | The comparison is performed according to the IEC/IEEE Standard for Binary
5386 | Floating-Point Arithmetic.
5387 *----------------------------------------------------------------------------*/
5388 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
5390 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5391 float_raise(float_flag_invalid, status);
5392 return 1;
5394 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5395 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5396 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5397 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5399 if (floatx80_is_signaling_nan(a, status)
5400 || floatx80_is_signaling_nan(b, status)) {
5401 float_raise(float_flag_invalid, status);
5403 return 1;
5405 return 0;
5408 /*----------------------------------------------------------------------------
5409 | Returns the result of converting the quadruple-precision floating-point
5410 | value `a' to the 32-bit two's complement integer format. The conversion
5411 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5412 | Arithmetic---which means in particular that the conversion is rounded
5413 | according to the current rounding mode. If `a' is a NaN, the largest
5414 | positive integer is returned. Otherwise, if the conversion overflows, the
5415 | largest integer with the same sign as `a' is returned.
5416 *----------------------------------------------------------------------------*/
5418 int32_t float128_to_int32(float128 a, float_status *status)
5420 flag aSign;
5421 int32_t aExp, shiftCount;
5422 uint64_t aSig0, aSig1;
5424 aSig1 = extractFloat128Frac1( a );
5425 aSig0 = extractFloat128Frac0( a );
5426 aExp = extractFloat128Exp( a );
5427 aSign = extractFloat128Sign( a );
5428 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5429 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5430 aSig0 |= ( aSig1 != 0 );
5431 shiftCount = 0x4028 - aExp;
5432 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5433 return roundAndPackInt32(aSign, aSig0, status);
5437 /*----------------------------------------------------------------------------
5438 | Returns the result of converting the quadruple-precision floating-point
5439 | value `a' to the 32-bit two's complement integer format. The conversion
5440 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5441 | Arithmetic, except that the conversion is always rounded toward zero. If
5442 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
5443 | conversion overflows, the largest integer with the same sign as `a' is
5444 | returned.
5445 *----------------------------------------------------------------------------*/
5447 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
5449 flag aSign;
5450 int32_t aExp, shiftCount;
5451 uint64_t aSig0, aSig1, savedASig;
5452 int32_t z;
5454 aSig1 = extractFloat128Frac1( a );
5455 aSig0 = extractFloat128Frac0( a );
5456 aExp = extractFloat128Exp( a );
5457 aSign = extractFloat128Sign( a );
5458 aSig0 |= ( aSig1 != 0 );
5459 if ( 0x401E < aExp ) {
5460 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5461 goto invalid;
5463 else if ( aExp < 0x3FFF ) {
5464 if (aExp || aSig0) {
5465 status->float_exception_flags |= float_flag_inexact;
5467 return 0;
5469 aSig0 |= LIT64( 0x0001000000000000 );
5470 shiftCount = 0x402F - aExp;
5471 savedASig = aSig0;
5472 aSig0 >>= shiftCount;
5473 z = aSig0;
5474 if ( aSign ) z = - z;
5475 if ( ( z < 0 ) ^ aSign ) {
5476 invalid:
5477 float_raise(float_flag_invalid, status);
5478 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5480 if ( ( aSig0<<shiftCount ) != savedASig ) {
5481 status->float_exception_flags |= float_flag_inexact;
5483 return z;
5487 /*----------------------------------------------------------------------------
5488 | Returns the result of converting the quadruple-precision floating-point
5489 | value `a' to the 64-bit two's complement integer format. The conversion
5490 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5491 | Arithmetic---which means in particular that the conversion is rounded
5492 | according to the current rounding mode. If `a' is a NaN, the largest
5493 | positive integer is returned. Otherwise, if the conversion overflows, the
5494 | largest integer with the same sign as `a' is returned.
5495 *----------------------------------------------------------------------------*/
5497 int64_t float128_to_int64(float128 a, float_status *status)
5499 flag aSign;
5500 int32_t aExp, shiftCount;
5501 uint64_t aSig0, aSig1;
5503 aSig1 = extractFloat128Frac1( a );
5504 aSig0 = extractFloat128Frac0( a );
5505 aExp = extractFloat128Exp( a );
5506 aSign = extractFloat128Sign( a );
5507 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5508 shiftCount = 0x402F - aExp;
5509 if ( shiftCount <= 0 ) {
5510 if ( 0x403E < aExp ) {
5511 float_raise(float_flag_invalid, status);
5512 if ( ! aSign
5513 || ( ( aExp == 0x7FFF )
5514 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5517 return LIT64( 0x7FFFFFFFFFFFFFFF );
5519 return (int64_t) LIT64( 0x8000000000000000 );
5521 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5523 else {
5524 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5526 return roundAndPackInt64(aSign, aSig0, aSig1, status);
5530 /*----------------------------------------------------------------------------
5531 | Returns the result of converting the quadruple-precision floating-point
5532 | value `a' to the 64-bit two's complement integer format. The conversion
5533 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5534 | Arithmetic, except that the conversion is always rounded toward zero.
5535 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
5536 | the conversion overflows, the largest integer with the same sign as `a' is
5537 | returned.
5538 *----------------------------------------------------------------------------*/
5540 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
5542 flag aSign;
5543 int32_t aExp, shiftCount;
5544 uint64_t aSig0, aSig1;
5545 int64_t z;
5547 aSig1 = extractFloat128Frac1( a );
5548 aSig0 = extractFloat128Frac0( a );
5549 aExp = extractFloat128Exp( a );
5550 aSign = extractFloat128Sign( a );
5551 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5552 shiftCount = aExp - 0x402F;
5553 if ( 0 < shiftCount ) {
5554 if ( 0x403E <= aExp ) {
5555 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5556 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
5557 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5558 if (aSig1) {
5559 status->float_exception_flags |= float_flag_inexact;
5562 else {
5563 float_raise(float_flag_invalid, status);
5564 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5565 return LIT64( 0x7FFFFFFFFFFFFFFF );
5568 return (int64_t) LIT64( 0x8000000000000000 );
5570 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
5571 if ( (uint64_t) ( aSig1<<shiftCount ) ) {
5572 status->float_exception_flags |= float_flag_inexact;
5575 else {
5576 if ( aExp < 0x3FFF ) {
5577 if ( aExp | aSig0 | aSig1 ) {
5578 status->float_exception_flags |= float_flag_inexact;
5580 return 0;
5582 z = aSig0>>( - shiftCount );
5583 if ( aSig1
5584 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
5585 status->float_exception_flags |= float_flag_inexact;
5588 if ( aSign ) z = - z;
5589 return z;
5593 /*----------------------------------------------------------------------------
5594 | Returns the result of converting the quadruple-precision floating-point value
5595 | `a' to the 64-bit unsigned integer format. The conversion is
5596 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5597 | Arithmetic---which means in particular that the conversion is rounded
5598 | according to the current rounding mode. If `a' is a NaN, the largest
5599 | positive integer is returned. If the conversion overflows, the
5600 | largest unsigned integer is returned. If 'a' is negative, the value is
5601 | rounded and zero is returned; negative values that do not round to zero
5602 | will raise the inexact exception.
5603 *----------------------------------------------------------------------------*/
5605 uint64_t float128_to_uint64(float128 a, float_status *status)
5607 flag aSign;
5608 int aExp;
5609 int shiftCount;
5610 uint64_t aSig0, aSig1;
5612 aSig0 = extractFloat128Frac0(a);
5613 aSig1 = extractFloat128Frac1(a);
5614 aExp = extractFloat128Exp(a);
5615 aSign = extractFloat128Sign(a);
5616 if (aSign && (aExp > 0x3FFE)) {
5617 float_raise(float_flag_invalid, status);
5618 if (float128_is_any_nan(a)) {
5619 return LIT64(0xFFFFFFFFFFFFFFFF);
5620 } else {
5621 return 0;
5624 if (aExp) {
5625 aSig0 |= LIT64(0x0001000000000000);
5627 shiftCount = 0x402F - aExp;
5628 if (shiftCount <= 0) {
5629 if (0x403E < aExp) {
5630 float_raise(float_flag_invalid, status);
5631 return LIT64(0xFFFFFFFFFFFFFFFF);
5633 shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
5634 } else {
5635 shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
5637 return roundAndPackUint64(aSign, aSig0, aSig1, status);
5640 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
5642 uint64_t v;
5643 signed char current_rounding_mode = status->float_rounding_mode;
5645 set_float_rounding_mode(float_round_to_zero, status);
5646 v = float128_to_uint64(a, status);
5647 set_float_rounding_mode(current_rounding_mode, status);
5649 return v;
5652 /*----------------------------------------------------------------------------
5653 | Returns the result of converting the quadruple-precision floating-point
5654 | value `a' to the 32-bit unsigned integer format. The conversion
5655 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5656 | Arithmetic except that the conversion is always rounded toward zero.
5657 | If `a' is a NaN, the largest positive integer is returned. Otherwise,
5658 | if the conversion overflows, the largest unsigned integer is returned.
5659 | If 'a' is negative, the value is rounded and zero is returned; negative
5660 | values that do not round to zero will raise the inexact exception.
5661 *----------------------------------------------------------------------------*/
5663 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
5665 uint64_t v;
5666 uint32_t res;
5667 int old_exc_flags = get_float_exception_flags(status);
5669 v = float128_to_uint64_round_to_zero(a, status);
5670 if (v > 0xffffffff) {
5671 res = 0xffffffff;
5672 } else {
5673 return v;
5675 set_float_exception_flags(old_exc_flags, status);
5676 float_raise(float_flag_invalid, status);
5677 return res;
5680 /*----------------------------------------------------------------------------
5681 | Returns the result of converting the quadruple-precision floating-point
5682 | value `a' to the single-precision floating-point format. The conversion
5683 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5684 | Arithmetic.
5685 *----------------------------------------------------------------------------*/
5687 float32 float128_to_float32(float128 a, float_status *status)
5689 flag aSign;
5690 int32_t aExp;
5691 uint64_t aSig0, aSig1;
5692 uint32_t zSig;
5694 aSig1 = extractFloat128Frac1( a );
5695 aSig0 = extractFloat128Frac0( a );
5696 aExp = extractFloat128Exp( a );
5697 aSign = extractFloat128Sign( a );
5698 if ( aExp == 0x7FFF ) {
5699 if ( aSig0 | aSig1 ) {
5700 return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
5702 return packFloat32( aSign, 0xFF, 0 );
5704 aSig0 |= ( aSig1 != 0 );
5705 shift64RightJamming( aSig0, 18, &aSig0 );
5706 zSig = aSig0;
5707 if ( aExp || zSig ) {
5708 zSig |= 0x40000000;
5709 aExp -= 0x3F81;
5711 return roundAndPackFloat32(aSign, aExp, zSig, status);
5715 /*----------------------------------------------------------------------------
5716 | Returns the result of converting the quadruple-precision floating-point
5717 | value `a' to the double-precision floating-point format. The conversion
5718 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5719 | Arithmetic.
5720 *----------------------------------------------------------------------------*/
5722 float64 float128_to_float64(float128 a, float_status *status)
5724 flag aSign;
5725 int32_t aExp;
5726 uint64_t aSig0, aSig1;
5728 aSig1 = extractFloat128Frac1( a );
5729 aSig0 = extractFloat128Frac0( a );
5730 aExp = extractFloat128Exp( a );
5731 aSign = extractFloat128Sign( a );
5732 if ( aExp == 0x7FFF ) {
5733 if ( aSig0 | aSig1 ) {
5734 return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
5736 return packFloat64( aSign, 0x7FF, 0 );
5738 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5739 aSig0 |= ( aSig1 != 0 );
5740 if ( aExp || aSig0 ) {
5741 aSig0 |= LIT64( 0x4000000000000000 );
5742 aExp -= 0x3C01;
5744 return roundAndPackFloat64(aSign, aExp, aSig0, status);
5748 /*----------------------------------------------------------------------------
5749 | Returns the result of converting the quadruple-precision floating-point
5750 | value `a' to the extended double-precision floating-point format. The
5751 | conversion is performed according to the IEC/IEEE Standard for Binary
5752 | Floating-Point Arithmetic.
5753 *----------------------------------------------------------------------------*/
5755 floatx80 float128_to_floatx80(float128 a, float_status *status)
5757 flag aSign;
5758 int32_t aExp;
5759 uint64_t aSig0, aSig1;
5761 aSig1 = extractFloat128Frac1( a );
5762 aSig0 = extractFloat128Frac0( a );
5763 aExp = extractFloat128Exp( a );
5764 aSign = extractFloat128Sign( a );
5765 if ( aExp == 0x7FFF ) {
5766 if ( aSig0 | aSig1 ) {
5767 return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
5769 return packFloatx80(aSign, floatx80_infinity_high,
5770 floatx80_infinity_low);
5772 if ( aExp == 0 ) {
5773 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5774 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5776 else {
5777 aSig0 |= LIT64( 0x0001000000000000 );
5779 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
5780 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
5784 /*----------------------------------------------------------------------------
5785 | Rounds the quadruple-precision floating-point value `a' to an integer, and
5786 | returns the result as a quadruple-precision floating-point value. The
5787 | operation is performed according to the IEC/IEEE Standard for Binary
5788 | Floating-Point Arithmetic.
5789 *----------------------------------------------------------------------------*/
5791 float128 float128_round_to_int(float128 a, float_status *status)
5793 flag aSign;
5794 int32_t aExp;
5795 uint64_t lastBitMask, roundBitsMask;
5796 float128 z;
5798 aExp = extractFloat128Exp( a );
5799 if ( 0x402F <= aExp ) {
5800 if ( 0x406F <= aExp ) {
5801 if ( ( aExp == 0x7FFF )
5802 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
5804 return propagateFloat128NaN(a, a, status);
5806 return a;
5808 lastBitMask = 1;
5809 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
5810 roundBitsMask = lastBitMask - 1;
5811 z = a;
5812 switch (status->float_rounding_mode) {
5813 case float_round_nearest_even:
5814 if ( lastBitMask ) {
5815 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
5816 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
5818 else {
5819 if ( (int64_t) z.low < 0 ) {
5820 ++z.high;
5821 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
5824 break;
5825 case float_round_ties_away:
5826 if (lastBitMask) {
5827 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
5828 } else {
5829 if ((int64_t) z.low < 0) {
5830 ++z.high;
5833 break;
5834 case float_round_to_zero:
5835 break;
5836 case float_round_up:
5837 if (!extractFloat128Sign(z)) {
5838 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
5840 break;
5841 case float_round_down:
5842 if (extractFloat128Sign(z)) {
5843 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
5845 break;
5846 default:
5847 abort();
5849 z.low &= ~ roundBitsMask;
5851 else {
5852 if ( aExp < 0x3FFF ) {
5853 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
5854 status->float_exception_flags |= float_flag_inexact;
5855 aSign = extractFloat128Sign( a );
5856 switch (status->float_rounding_mode) {
5857 case float_round_nearest_even:
5858 if ( ( aExp == 0x3FFE )
5859 && ( extractFloat128Frac0( a )
5860 | extractFloat128Frac1( a ) )
5862 return packFloat128( aSign, 0x3FFF, 0, 0 );
5864 break;
5865 case float_round_ties_away:
5866 if (aExp == 0x3FFE) {
5867 return packFloat128(aSign, 0x3FFF, 0, 0);
5869 break;
5870 case float_round_down:
5871 return
5872 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
5873 : packFloat128( 0, 0, 0, 0 );
5874 case float_round_up:
5875 return
5876 aSign ? packFloat128( 1, 0, 0, 0 )
5877 : packFloat128( 0, 0x3FFF, 0, 0 );
5879 return packFloat128( aSign, 0, 0, 0 );
5881 lastBitMask = 1;
5882 lastBitMask <<= 0x402F - aExp;
5883 roundBitsMask = lastBitMask - 1;
5884 z.low = 0;
5885 z.high = a.high;
5886 switch (status->float_rounding_mode) {
5887 case float_round_nearest_even:
5888 z.high += lastBitMask>>1;
5889 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
5890 z.high &= ~ lastBitMask;
5892 break;
5893 case float_round_ties_away:
5894 z.high += lastBitMask>>1;
5895 break;
5896 case float_round_to_zero:
5897 break;
5898 case float_round_up:
5899 if (!extractFloat128Sign(z)) {
5900 z.high |= ( a.low != 0 );
5901 z.high += roundBitsMask;
5903 break;
5904 case float_round_down:
5905 if (extractFloat128Sign(z)) {
5906 z.high |= (a.low != 0);
5907 z.high += roundBitsMask;
5909 break;
5910 default:
5911 abort();
5913 z.high &= ~ roundBitsMask;
5915 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
5916 status->float_exception_flags |= float_flag_inexact;
5918 return z;
5922 /*----------------------------------------------------------------------------
5923 | Returns the result of adding the absolute values of the quadruple-precision
5924 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
5925 | before being returned. `zSign' is ignored if the result is a NaN.
5926 | The addition is performed according to the IEC/IEEE Standard for Binary
5927 | Floating-Point Arithmetic.
5928 *----------------------------------------------------------------------------*/
5930 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
5931 float_status *status)
5933 int32_t aExp, bExp, zExp;
5934 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5935 int32_t expDiff;
5937 aSig1 = extractFloat128Frac1( a );
5938 aSig0 = extractFloat128Frac0( a );
5939 aExp = extractFloat128Exp( a );
5940 bSig1 = extractFloat128Frac1( b );
5941 bSig0 = extractFloat128Frac0( b );
5942 bExp = extractFloat128Exp( b );
5943 expDiff = aExp - bExp;
5944 if ( 0 < expDiff ) {
5945 if ( aExp == 0x7FFF ) {
5946 if (aSig0 | aSig1) {
5947 return propagateFloat128NaN(a, b, status);
5949 return a;
5951 if ( bExp == 0 ) {
5952 --expDiff;
5954 else {
5955 bSig0 |= LIT64( 0x0001000000000000 );
5957 shift128ExtraRightJamming(
5958 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
5959 zExp = aExp;
5961 else if ( expDiff < 0 ) {
5962 if ( bExp == 0x7FFF ) {
5963 if (bSig0 | bSig1) {
5964 return propagateFloat128NaN(a, b, status);
5966 return packFloat128( zSign, 0x7FFF, 0, 0 );
5968 if ( aExp == 0 ) {
5969 ++expDiff;
5971 else {
5972 aSig0 |= LIT64( 0x0001000000000000 );
5974 shift128ExtraRightJamming(
5975 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
5976 zExp = bExp;
5978 else {
5979 if ( aExp == 0x7FFF ) {
5980 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5981 return propagateFloat128NaN(a, b, status);
5983 return a;
5985 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5986 if ( aExp == 0 ) {
5987 if (status->flush_to_zero) {
5988 if (zSig0 | zSig1) {
5989 float_raise(float_flag_output_denormal, status);
5991 return packFloat128(zSign, 0, 0, 0);
5993 return packFloat128( zSign, 0, zSig0, zSig1 );
5995 zSig2 = 0;
5996 zSig0 |= LIT64( 0x0002000000000000 );
5997 zExp = aExp;
5998 goto shiftRight1;
6000 aSig0 |= LIT64( 0x0001000000000000 );
6001 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6002 --zExp;
6003 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6004 ++zExp;
6005 shiftRight1:
6006 shift128ExtraRightJamming(
6007 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6008 roundAndPack:
6009 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6013 /*----------------------------------------------------------------------------
6014 | Returns the result of subtracting the absolute values of the quadruple-
6015 | precision floating-point values `a' and `b'. If `zSign' is 1, the
6016 | difference is negated before being returned. `zSign' is ignored if the
6017 | result is a NaN. The subtraction is performed according to the IEC/IEEE
6018 | Standard for Binary Floating-Point Arithmetic.
6019 *----------------------------------------------------------------------------*/
6021 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
6022 float_status *status)
6024 int32_t aExp, bExp, zExp;
6025 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
6026 int32_t expDiff;
6028 aSig1 = extractFloat128Frac1( a );
6029 aSig0 = extractFloat128Frac0( a );
6030 aExp = extractFloat128Exp( a );
6031 bSig1 = extractFloat128Frac1( b );
6032 bSig0 = extractFloat128Frac0( b );
6033 bExp = extractFloat128Exp( b );
6034 expDiff = aExp - bExp;
6035 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6036 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6037 if ( 0 < expDiff ) goto aExpBigger;
6038 if ( expDiff < 0 ) goto bExpBigger;
6039 if ( aExp == 0x7FFF ) {
6040 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6041 return propagateFloat128NaN(a, b, status);
6043 float_raise(float_flag_invalid, status);
6044 return float128_default_nan(status);
6046 if ( aExp == 0 ) {
6047 aExp = 1;
6048 bExp = 1;
6050 if ( bSig0 < aSig0 ) goto aBigger;
6051 if ( aSig0 < bSig0 ) goto bBigger;
6052 if ( bSig1 < aSig1 ) goto aBigger;
6053 if ( aSig1 < bSig1 ) goto bBigger;
6054 return packFloat128(status->float_rounding_mode == float_round_down,
6055 0, 0, 0);
6056 bExpBigger:
6057 if ( bExp == 0x7FFF ) {
6058 if (bSig0 | bSig1) {
6059 return propagateFloat128NaN(a, b, status);
6061 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6063 if ( aExp == 0 ) {
6064 ++expDiff;
6066 else {
6067 aSig0 |= LIT64( 0x4000000000000000 );
6069 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6070 bSig0 |= LIT64( 0x4000000000000000 );
6071 bBigger:
6072 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6073 zExp = bExp;
6074 zSign ^= 1;
6075 goto normalizeRoundAndPack;
6076 aExpBigger:
6077 if ( aExp == 0x7FFF ) {
6078 if (aSig0 | aSig1) {
6079 return propagateFloat128NaN(a, b, status);
6081 return a;
6083 if ( bExp == 0 ) {
6084 --expDiff;
6086 else {
6087 bSig0 |= LIT64( 0x4000000000000000 );
6089 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6090 aSig0 |= LIT64( 0x4000000000000000 );
6091 aBigger:
6092 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6093 zExp = aExp;
6094 normalizeRoundAndPack:
6095 --zExp;
6096 return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
6097 status);
6101 /*----------------------------------------------------------------------------
6102 | Returns the result of adding the quadruple-precision floating-point values
6103 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
6104 | for Binary Floating-Point Arithmetic.
6105 *----------------------------------------------------------------------------*/
6107 float128 float128_add(float128 a, float128 b, float_status *status)
6109 flag aSign, bSign;
6111 aSign = extractFloat128Sign( a );
6112 bSign = extractFloat128Sign( b );
6113 if ( aSign == bSign ) {
6114 return addFloat128Sigs(a, b, aSign, status);
6116 else {
6117 return subFloat128Sigs(a, b, aSign, status);
6122 /*----------------------------------------------------------------------------
6123 | Returns the result of subtracting the quadruple-precision floating-point
6124 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6125 | Standard for Binary Floating-Point Arithmetic.
6126 *----------------------------------------------------------------------------*/
6128 float128 float128_sub(float128 a, float128 b, float_status *status)
6130 flag aSign, bSign;
6132 aSign = extractFloat128Sign( a );
6133 bSign = extractFloat128Sign( b );
6134 if ( aSign == bSign ) {
6135 return subFloat128Sigs(a, b, aSign, status);
6137 else {
6138 return addFloat128Sigs(a, b, aSign, status);
6143 /*----------------------------------------------------------------------------
6144 | Returns the result of multiplying the quadruple-precision floating-point
6145 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6146 | Standard for Binary Floating-Point Arithmetic.
6147 *----------------------------------------------------------------------------*/
6149 float128 float128_mul(float128 a, float128 b, float_status *status)
6151 flag aSign, bSign, zSign;
6152 int32_t aExp, bExp, zExp;
6153 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6155 aSig1 = extractFloat128Frac1( a );
6156 aSig0 = extractFloat128Frac0( a );
6157 aExp = extractFloat128Exp( a );
6158 aSign = extractFloat128Sign( a );
6159 bSig1 = extractFloat128Frac1( b );
6160 bSig0 = extractFloat128Frac0( b );
6161 bExp = extractFloat128Exp( b );
6162 bSign = extractFloat128Sign( b );
6163 zSign = aSign ^ bSign;
6164 if ( aExp == 0x7FFF ) {
6165 if ( ( aSig0 | aSig1 )
6166 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6167 return propagateFloat128NaN(a, b, status);
6169 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6170 return packFloat128( zSign, 0x7FFF, 0, 0 );
6172 if ( bExp == 0x7FFF ) {
6173 if (bSig0 | bSig1) {
6174 return propagateFloat128NaN(a, b, status);
6176 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6177 invalid:
6178 float_raise(float_flag_invalid, status);
6179 return float128_default_nan(status);
6181 return packFloat128( zSign, 0x7FFF, 0, 0 );
6183 if ( aExp == 0 ) {
6184 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6185 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6187 if ( bExp == 0 ) {
6188 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6189 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6191 zExp = aExp + bExp - 0x4000;
6192 aSig0 |= LIT64( 0x0001000000000000 );
6193 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6194 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6195 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6196 zSig2 |= ( zSig3 != 0 );
6197 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6198 shift128ExtraRightJamming(
6199 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6200 ++zExp;
6202 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6206 /*----------------------------------------------------------------------------
6207 | Returns the result of dividing the quadruple-precision floating-point value
6208 | `a' by the corresponding value `b'. The operation is performed according to
6209 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6210 *----------------------------------------------------------------------------*/
6212 float128 float128_div(float128 a, float128 b, float_status *status)
6214 flag aSign, bSign, zSign;
6215 int32_t aExp, bExp, zExp;
6216 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6217 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6219 aSig1 = extractFloat128Frac1( a );
6220 aSig0 = extractFloat128Frac0( a );
6221 aExp = extractFloat128Exp( a );
6222 aSign = extractFloat128Sign( a );
6223 bSig1 = extractFloat128Frac1( b );
6224 bSig0 = extractFloat128Frac0( b );
6225 bExp = extractFloat128Exp( b );
6226 bSign = extractFloat128Sign( b );
6227 zSign = aSign ^ bSign;
6228 if ( aExp == 0x7FFF ) {
6229 if (aSig0 | aSig1) {
6230 return propagateFloat128NaN(a, b, status);
6232 if ( bExp == 0x7FFF ) {
6233 if (bSig0 | bSig1) {
6234 return propagateFloat128NaN(a, b, status);
6236 goto invalid;
6238 return packFloat128( zSign, 0x7FFF, 0, 0 );
6240 if ( bExp == 0x7FFF ) {
6241 if (bSig0 | bSig1) {
6242 return propagateFloat128NaN(a, b, status);
6244 return packFloat128( zSign, 0, 0, 0 );
6246 if ( bExp == 0 ) {
6247 if ( ( bSig0 | bSig1 ) == 0 ) {
6248 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6249 invalid:
6250 float_raise(float_flag_invalid, status);
6251 return float128_default_nan(status);
6253 float_raise(float_flag_divbyzero, status);
6254 return packFloat128( zSign, 0x7FFF, 0, 0 );
6256 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6258 if ( aExp == 0 ) {
6259 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6260 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6262 zExp = aExp - bExp + 0x3FFD;
6263 shortShift128Left(
6264 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6265 shortShift128Left(
6266 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6267 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6268 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6269 ++zExp;
6271 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6272 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6273 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
6274 while ( (int64_t) rem0 < 0 ) {
6275 --zSig0;
6276 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6278 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6279 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6280 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6281 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
6282 while ( (int64_t) rem1 < 0 ) {
6283 --zSig1;
6284 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6286 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6288 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6289 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6293 /*----------------------------------------------------------------------------
6294 | Returns the remainder of the quadruple-precision floating-point value `a'
6295 | with respect to the corresponding value `b'. The operation is performed
6296 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6297 *----------------------------------------------------------------------------*/
6299 float128 float128_rem(float128 a, float128 b, float_status *status)
6301 flag aSign, zSign;
6302 int32_t aExp, bExp, expDiff;
6303 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6304 uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6305 int64_t sigMean0;
6307 aSig1 = extractFloat128Frac1( a );
6308 aSig0 = extractFloat128Frac0( a );
6309 aExp = extractFloat128Exp( a );
6310 aSign = extractFloat128Sign( a );
6311 bSig1 = extractFloat128Frac1( b );
6312 bSig0 = extractFloat128Frac0( b );
6313 bExp = extractFloat128Exp( b );
6314 if ( aExp == 0x7FFF ) {
6315 if ( ( aSig0 | aSig1 )
6316 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6317 return propagateFloat128NaN(a, b, status);
6319 goto invalid;
6321 if ( bExp == 0x7FFF ) {
6322 if (bSig0 | bSig1) {
6323 return propagateFloat128NaN(a, b, status);
6325 return a;
6327 if ( bExp == 0 ) {
6328 if ( ( bSig0 | bSig1 ) == 0 ) {
6329 invalid:
6330 float_raise(float_flag_invalid, status);
6331 return float128_default_nan(status);
6333 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6335 if ( aExp == 0 ) {
6336 if ( ( aSig0 | aSig1 ) == 0 ) return a;
6337 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6339 expDiff = aExp - bExp;
6340 if ( expDiff < -1 ) return a;
6341 shortShift128Left(
6342 aSig0 | LIT64( 0x0001000000000000 ),
6343 aSig1,
6344 15 - ( expDiff < 0 ),
6345 &aSig0,
6346 &aSig1
6348 shortShift128Left(
6349 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6350 q = le128( bSig0, bSig1, aSig0, aSig1 );
6351 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6352 expDiff -= 64;
6353 while ( 0 < expDiff ) {
6354 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6355 q = ( 4 < q ) ? q - 4 : 0;
6356 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6357 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6358 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6359 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6360 expDiff -= 61;
6362 if ( -64 < expDiff ) {
6363 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6364 q = ( 4 < q ) ? q - 4 : 0;
6365 q >>= - expDiff;
6366 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6367 expDiff += 52;
6368 if ( expDiff < 0 ) {
6369 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6371 else {
6372 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6374 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6375 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6377 else {
6378 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6379 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6381 do {
6382 alternateASig0 = aSig0;
6383 alternateASig1 = aSig1;
6384 ++q;
6385 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6386 } while ( 0 <= (int64_t) aSig0 );
6387 add128(
6388 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6389 if ( ( sigMean0 < 0 )
6390 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6391 aSig0 = alternateASig0;
6392 aSig1 = alternateASig1;
6394 zSign = ( (int64_t) aSig0 < 0 );
6395 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6396 return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
6397 status);
6400 /*----------------------------------------------------------------------------
6401 | Returns the square root of the quadruple-precision floating-point value `a'.
6402 | The operation is performed according to the IEC/IEEE Standard for Binary
6403 | Floating-Point Arithmetic.
6404 *----------------------------------------------------------------------------*/
6406 float128 float128_sqrt(float128 a, float_status *status)
6408 flag aSign;
6409 int32_t aExp, zExp;
6410 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6411 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6413 aSig1 = extractFloat128Frac1( a );
6414 aSig0 = extractFloat128Frac0( a );
6415 aExp = extractFloat128Exp( a );
6416 aSign = extractFloat128Sign( a );
6417 if ( aExp == 0x7FFF ) {
6418 if (aSig0 | aSig1) {
6419 return propagateFloat128NaN(a, a, status);
6421 if ( ! aSign ) return a;
6422 goto invalid;
6424 if ( aSign ) {
6425 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6426 invalid:
6427 float_raise(float_flag_invalid, status);
6428 return float128_default_nan(status);
6430 if ( aExp == 0 ) {
6431 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6432 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6434 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6435 aSig0 |= LIT64( 0x0001000000000000 );
6436 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6437 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6438 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6439 doubleZSig0 = zSig0<<1;
6440 mul64To128( zSig0, zSig0, &term0, &term1 );
6441 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6442 while ( (int64_t) rem0 < 0 ) {
6443 --zSig0;
6444 doubleZSig0 -= 2;
6445 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6447 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6448 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6449 if ( zSig1 == 0 ) zSig1 = 1;
6450 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6451 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6452 mul64To128( zSig1, zSig1, &term2, &term3 );
6453 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6454 while ( (int64_t) rem1 < 0 ) {
6455 --zSig1;
6456 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6457 term3 |= 1;
6458 term2 |= doubleZSig0;
6459 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6461 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6463 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6464 return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
6468 /*----------------------------------------------------------------------------
6469 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6470 | the corresponding value `b', and 0 otherwise. The invalid exception is
6471 | raised if either operand is a NaN. Otherwise, the comparison is performed
6472 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6473 *----------------------------------------------------------------------------*/
6475 int float128_eq(float128 a, float128 b, float_status *status)
6478 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6479 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6480 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6481 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6483 float_raise(float_flag_invalid, status);
6484 return 0;
6486 return
6487 ( a.low == b.low )
6488 && ( ( a.high == b.high )
6489 || ( ( a.low == 0 )
6490 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6495 /*----------------------------------------------------------------------------
6496 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6497 | or equal to the corresponding value `b', and 0 otherwise. The invalid
6498 | exception is raised if either operand is a NaN. The comparison is performed
6499 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6500 *----------------------------------------------------------------------------*/
6502 int float128_le(float128 a, float128 b, float_status *status)
6504 flag aSign, bSign;
6506 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6507 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6508 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6509 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6511 float_raise(float_flag_invalid, status);
6512 return 0;
6514 aSign = extractFloat128Sign( a );
6515 bSign = extractFloat128Sign( b );
6516 if ( aSign != bSign ) {
6517 return
6518 aSign
6519 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6520 == 0 );
6522 return
6523 aSign ? le128( b.high, b.low, a.high, a.low )
6524 : le128( a.high, a.low, b.high, b.low );
6528 /*----------------------------------------------------------------------------
6529 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6530 | the corresponding value `b', and 0 otherwise. The invalid exception is
6531 | raised if either operand is a NaN. The comparison is performed according
6532 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6533 *----------------------------------------------------------------------------*/
6535 int float128_lt(float128 a, float128 b, float_status *status)
6537 flag aSign, bSign;
6539 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6540 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6541 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6542 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6544 float_raise(float_flag_invalid, status);
6545 return 0;
6547 aSign = extractFloat128Sign( a );
6548 bSign = extractFloat128Sign( b );
6549 if ( aSign != bSign ) {
6550 return
6551 aSign
6552 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6553 != 0 );
6555 return
6556 aSign ? lt128( b.high, b.low, a.high, a.low )
6557 : lt128( a.high, a.low, b.high, b.low );
6561 /*----------------------------------------------------------------------------
6562 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6563 | be compared, and 0 otherwise. The invalid exception is raised if either
6564 | operand is a NaN. The comparison is performed according to the IEC/IEEE
6565 | Standard for Binary Floating-Point Arithmetic.
6566 *----------------------------------------------------------------------------*/
6568 int float128_unordered(float128 a, float128 b, float_status *status)
6570 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6571 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6572 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6573 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6575 float_raise(float_flag_invalid, status);
6576 return 1;
6578 return 0;
6581 /*----------------------------------------------------------------------------
6582 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6583 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6584 | exception. The comparison is performed according to the IEC/IEEE Standard
6585 | for Binary Floating-Point Arithmetic.
6586 *----------------------------------------------------------------------------*/
6588 int float128_eq_quiet(float128 a, float128 b, float_status *status)
6591 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6592 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6593 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6594 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6596 if (float128_is_signaling_nan(a, status)
6597 || float128_is_signaling_nan(b, status)) {
6598 float_raise(float_flag_invalid, status);
6600 return 0;
6602 return
6603 ( a.low == b.low )
6604 && ( ( a.high == b.high )
6605 || ( ( a.low == 0 )
6606 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6611 /*----------------------------------------------------------------------------
6612 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6613 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
6614 | cause an exception. Otherwise, the comparison is performed according to the
6615 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6616 *----------------------------------------------------------------------------*/
6618 int float128_le_quiet(float128 a, float128 b, float_status *status)
6620 flag aSign, bSign;
6622 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6623 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6624 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6625 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6627 if (float128_is_signaling_nan(a, status)
6628 || float128_is_signaling_nan(b, status)) {
6629 float_raise(float_flag_invalid, status);
6631 return 0;
6633 aSign = extractFloat128Sign( a );
6634 bSign = extractFloat128Sign( b );
6635 if ( aSign != bSign ) {
6636 return
6637 aSign
6638 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6639 == 0 );
6641 return
6642 aSign ? le128( b.high, b.low, a.high, a.low )
6643 : le128( a.high, a.low, b.high, b.low );
6647 /*----------------------------------------------------------------------------
6648 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6649 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6650 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
6651 | Standard for Binary Floating-Point Arithmetic.
6652 *----------------------------------------------------------------------------*/
6654 int float128_lt_quiet(float128 a, float128 b, float_status *status)
6656 flag aSign, bSign;
6658 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6659 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6660 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6661 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6663 if (float128_is_signaling_nan(a, status)
6664 || float128_is_signaling_nan(b, status)) {
6665 float_raise(float_flag_invalid, status);
6667 return 0;
6669 aSign = extractFloat128Sign( a );
6670 bSign = extractFloat128Sign( b );
6671 if ( aSign != bSign ) {
6672 return
6673 aSign
6674 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6675 != 0 );
6677 return
6678 aSign ? lt128( b.high, b.low, a.high, a.low )
6679 : lt128( a.high, a.low, b.high, b.low );
6683 /*----------------------------------------------------------------------------
6684 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6685 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
6686 | comparison is performed according to the IEC/IEEE Standard for Binary
6687 | Floating-Point Arithmetic.
6688 *----------------------------------------------------------------------------*/
6690 int float128_unordered_quiet(float128 a, float128 b, float_status *status)
6692 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6693 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6694 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6695 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6697 if (float128_is_signaling_nan(a, status)
6698 || float128_is_signaling_nan(b, status)) {
6699 float_raise(float_flag_invalid, status);
6701 return 1;
6703 return 0;
6706 static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
6707 int is_quiet, float_status *status)
6709 flag aSign, bSign;
6711 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6712 float_raise(float_flag_invalid, status);
6713 return float_relation_unordered;
6715 if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
6716 ( extractFloatx80Frac( a )<<1 ) ) ||
6717 ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
6718 ( extractFloatx80Frac( b )<<1 ) )) {
6719 if (!is_quiet ||
6720 floatx80_is_signaling_nan(a, status) ||
6721 floatx80_is_signaling_nan(b, status)) {
6722 float_raise(float_flag_invalid, status);
6724 return float_relation_unordered;
6726 aSign = extractFloatx80Sign( a );
6727 bSign = extractFloatx80Sign( b );
6728 if ( aSign != bSign ) {
6730 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
6731 ( ( a.low | b.low ) == 0 ) ) {
6732 /* zero case */
6733 return float_relation_equal;
6734 } else {
6735 return 1 - (2 * aSign);
6737 } else {
6738 if (a.low == b.low && a.high == b.high) {
6739 return float_relation_equal;
6740 } else {
6741 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6746 int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
6748 return floatx80_compare_internal(a, b, 0, status);
6751 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
6753 return floatx80_compare_internal(a, b, 1, status);
6756 static inline int float128_compare_internal(float128 a, float128 b,
6757 int is_quiet, float_status *status)
6759 flag aSign, bSign;
6761 if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
6762 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
6763 ( ( extractFloat128Exp( b ) == 0x7fff ) &&
6764 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
6765 if (!is_quiet ||
6766 float128_is_signaling_nan(a, status) ||
6767 float128_is_signaling_nan(b, status)) {
6768 float_raise(float_flag_invalid, status);
6770 return float_relation_unordered;
6772 aSign = extractFloat128Sign( a );
6773 bSign = extractFloat128Sign( b );
6774 if ( aSign != bSign ) {
6775 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
6776 /* zero case */
6777 return float_relation_equal;
6778 } else {
6779 return 1 - (2 * aSign);
6781 } else {
6782 if (a.low == b.low && a.high == b.high) {
6783 return float_relation_equal;
6784 } else {
6785 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6790 int float128_compare(float128 a, float128 b, float_status *status)
6792 return float128_compare_internal(a, b, 0, status);
6795 int float128_compare_quiet(float128 a, float128 b, float_status *status)
6797 return float128_compare_internal(a, b, 1, status);
6800 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
6802 flag aSign;
6803 int32_t aExp;
6804 uint64_t aSig;
6806 if (floatx80_invalid_encoding(a)) {
6807 float_raise(float_flag_invalid, status);
6808 return floatx80_default_nan(status);
6810 aSig = extractFloatx80Frac( a );
6811 aExp = extractFloatx80Exp( a );
6812 aSign = extractFloatx80Sign( a );
6814 if ( aExp == 0x7FFF ) {
6815 if ( aSig<<1 ) {
6816 return propagateFloatx80NaN(a, a, status);
6818 return a;
6821 if (aExp == 0) {
6822 if (aSig == 0) {
6823 return a;
6825 aExp++;
6828 if (n > 0x10000) {
6829 n = 0x10000;
6830 } else if (n < -0x10000) {
6831 n = -0x10000;
6834 aExp += n;
6835 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
6836 aSign, aExp, aSig, 0, status);
6839 float128 float128_scalbn(float128 a, int n, float_status *status)
6841 flag aSign;
6842 int32_t aExp;
6843 uint64_t aSig0, aSig1;
6845 aSig1 = extractFloat128Frac1( a );
6846 aSig0 = extractFloat128Frac0( a );
6847 aExp = extractFloat128Exp( a );
6848 aSign = extractFloat128Sign( a );
6849 if ( aExp == 0x7FFF ) {
6850 if ( aSig0 | aSig1 ) {
6851 return propagateFloat128NaN(a, a, status);
6853 return a;
6855 if (aExp != 0) {
6856 aSig0 |= LIT64( 0x0001000000000000 );
6857 } else if (aSig0 == 0 && aSig1 == 0) {
6858 return a;
6859 } else {
6860 aExp++;
6863 if (n > 0x10000) {
6864 n = 0x10000;
6865 } else if (n < -0x10000) {
6866 n = -0x10000;
6869 aExp += n - 1;
6870 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
6871 , status);