softfloat: Add scaling int-to-float routines
[qemu.git] / fpu / softfloat.c
blob12f373cbade368266c553f3cab1f1dff0c7f66a1
1 /*
2 * QEMU float support
4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
10 * the BSD license
11 * GPL-v2-or-later
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
44 ===============================================================================
47 /* BSD licensing:
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
85 #include "qemu/osdep.h"
86 #include "qemu/bitops.h"
87 #include "fpu/softfloat.h"
89 /* We only need stdlib for abort() */
91 /*----------------------------------------------------------------------------
92 | Primitive arithmetic functions, including multi-word arithmetic, and
93 | division and square root approximations. (Can be specialized to target if
94 | desired.)
95 *----------------------------------------------------------------------------*/
96 #include "fpu/softfloat-macros.h"
98 /*----------------------------------------------------------------------------
99 | Returns the fraction bits of the half-precision floating-point value `a'.
100 *----------------------------------------------------------------------------*/
102 static inline uint32_t extractFloat16Frac(float16 a)
104 return float16_val(a) & 0x3ff;
107 /*----------------------------------------------------------------------------
108 | Returns the exponent bits of the half-precision floating-point value `a'.
109 *----------------------------------------------------------------------------*/
111 static inline int extractFloat16Exp(float16 a)
113 return (float16_val(a) >> 10) & 0x1f;
116 /*----------------------------------------------------------------------------
117 | Returns the fraction bits of the single-precision floating-point value `a'.
118 *----------------------------------------------------------------------------*/
120 static inline uint32_t extractFloat32Frac(float32 a)
122 return float32_val(a) & 0x007FFFFF;
125 /*----------------------------------------------------------------------------
126 | Returns the exponent bits of the single-precision floating-point value `a'.
127 *----------------------------------------------------------------------------*/
129 static inline int extractFloat32Exp(float32 a)
131 return (float32_val(a) >> 23) & 0xFF;
134 /*----------------------------------------------------------------------------
135 | Returns the sign bit of the single-precision floating-point value `a'.
136 *----------------------------------------------------------------------------*/
138 static inline flag extractFloat32Sign(float32 a)
140 return float32_val(a) >> 31;
143 /*----------------------------------------------------------------------------
144 | Returns the fraction bits of the double-precision floating-point value `a'.
145 *----------------------------------------------------------------------------*/
147 static inline uint64_t extractFloat64Frac(float64 a)
149 return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF);
152 /*----------------------------------------------------------------------------
153 | Returns the exponent bits of the double-precision floating-point value `a'.
154 *----------------------------------------------------------------------------*/
156 static inline int extractFloat64Exp(float64 a)
158 return (float64_val(a) >> 52) & 0x7FF;
161 /*----------------------------------------------------------------------------
162 | Returns the sign bit of the double-precision floating-point value `a'.
163 *----------------------------------------------------------------------------*/
165 static inline flag extractFloat64Sign(float64 a)
167 return float64_val(a) >> 63;
171 * Classify a floating point number. Everything above float_class_qnan
172 * is a NaN so cls >= float_class_qnan is any NaN.
175 typedef enum __attribute__ ((__packed__)) {
176 float_class_unclassified,
177 float_class_zero,
178 float_class_normal,
179 float_class_inf,
180 float_class_qnan, /* all NaNs from here */
181 float_class_snan,
182 } FloatClass;
184 /* Simple helpers for checking if, or what kind of, NaN we have */
185 static inline __attribute__((unused)) bool is_nan(FloatClass c)
187 return unlikely(c >= float_class_qnan);
190 static inline __attribute__((unused)) bool is_snan(FloatClass c)
192 return c == float_class_snan;
195 static inline __attribute__((unused)) bool is_qnan(FloatClass c)
197 return c == float_class_qnan;
201 * Structure holding all of the decomposed parts of a float. The
202 * exponent is unbiased and the fraction is normalized. All
203 * calculations are done with a 64 bit fraction and then rounded as
204 * appropriate for the final format.
206 * Thanks to the packed FloatClass a decent compiler should be able to
207 * fit the whole structure into registers and avoid using the stack
208 * for parameter passing.
211 typedef struct {
212 uint64_t frac;
213 int32_t exp;
214 FloatClass cls;
215 bool sign;
216 } FloatParts;
218 #define DECOMPOSED_BINARY_POINT (64 - 2)
219 #define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT)
220 #define DECOMPOSED_OVERFLOW_BIT (DECOMPOSED_IMPLICIT_BIT << 1)
222 /* Structure holding all of the relevant parameters for a format.
223 * exp_size: the size of the exponent field
224 * exp_bias: the offset applied to the exponent field
225 * exp_max: the maximum normalised exponent
226 * frac_size: the size of the fraction field
227 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
228 * The following are computed based the size of fraction
229 * frac_lsb: least significant bit of fraction
230 * frac_lsbm1: the bit below the least significant bit (for rounding)
231 * round_mask/roundeven_mask: masks used for rounding
232 * The following optional modifiers are available:
233 * arm_althp: handle ARM Alternative Half Precision
235 typedef struct {
236 int exp_size;
237 int exp_bias;
238 int exp_max;
239 int frac_size;
240 int frac_shift;
241 uint64_t frac_lsb;
242 uint64_t frac_lsbm1;
243 uint64_t round_mask;
244 uint64_t roundeven_mask;
245 bool arm_althp;
246 } FloatFmt;
248 /* Expand fields based on the size of exponent and fraction */
249 #define FLOAT_PARAMS(E, F) \
250 .exp_size = E, \
251 .exp_bias = ((1 << E) - 1) >> 1, \
252 .exp_max = (1 << E) - 1, \
253 .frac_size = F, \
254 .frac_shift = DECOMPOSED_BINARY_POINT - F, \
255 .frac_lsb = 1ull << (DECOMPOSED_BINARY_POINT - F), \
256 .frac_lsbm1 = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1), \
257 .round_mask = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1, \
258 .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1
260 static const FloatFmt float16_params = {
261 FLOAT_PARAMS(5, 10)
264 static const FloatFmt float16_params_ahp = {
265 FLOAT_PARAMS(5, 10),
266 .arm_althp = true
269 static const FloatFmt float32_params = {
270 FLOAT_PARAMS(8, 23)
273 static const FloatFmt float64_params = {
274 FLOAT_PARAMS(11, 52)
277 /* Unpack a float to parts, but do not canonicalize. */
278 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
280 const int sign_pos = fmt.frac_size + fmt.exp_size;
282 return (FloatParts) {
283 .cls = float_class_unclassified,
284 .sign = extract64(raw, sign_pos, 1),
285 .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
286 .frac = extract64(raw, 0, fmt.frac_size),
290 static inline FloatParts float16_unpack_raw(float16 f)
292 return unpack_raw(float16_params, f);
295 static inline FloatParts float32_unpack_raw(float32 f)
297 return unpack_raw(float32_params, f);
300 static inline FloatParts float64_unpack_raw(float64 f)
302 return unpack_raw(float64_params, f);
305 /* Pack a float from parts, but do not canonicalize. */
306 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
308 const int sign_pos = fmt.frac_size + fmt.exp_size;
309 uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
310 return deposit64(ret, sign_pos, 1, p.sign);
313 static inline float16 float16_pack_raw(FloatParts p)
315 return make_float16(pack_raw(float16_params, p));
318 static inline float32 float32_pack_raw(FloatParts p)
320 return make_float32(pack_raw(float32_params, p));
323 static inline float64 float64_pack_raw(FloatParts p)
325 return make_float64(pack_raw(float64_params, p));
328 /*----------------------------------------------------------------------------
329 | Functions and definitions to determine: (1) whether tininess for underflow
330 | is detected before or after rounding by default, (2) what (if anything)
331 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
332 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
333 | are propagated from function inputs to output. These details are target-
334 | specific.
335 *----------------------------------------------------------------------------*/
336 #include "softfloat-specialize.h"
338 /* Canonicalize EXP and FRAC, setting CLS. */
339 static FloatParts canonicalize(FloatParts part, const FloatFmt *parm,
340 float_status *status)
342 if (part.exp == parm->exp_max && !parm->arm_althp) {
343 if (part.frac == 0) {
344 part.cls = float_class_inf;
345 } else {
346 part.frac <<= parm->frac_shift;
347 part.cls = (parts_is_snan_frac(part.frac, status)
348 ? float_class_snan : float_class_qnan);
350 } else if (part.exp == 0) {
351 if (likely(part.frac == 0)) {
352 part.cls = float_class_zero;
353 } else if (status->flush_inputs_to_zero) {
354 float_raise(float_flag_input_denormal, status);
355 part.cls = float_class_zero;
356 part.frac = 0;
357 } else {
358 int shift = clz64(part.frac) - 1;
359 part.cls = float_class_normal;
360 part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
361 part.frac <<= shift;
363 } else {
364 part.cls = float_class_normal;
365 part.exp -= parm->exp_bias;
366 part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
368 return part;
371 /* Round and uncanonicalize a floating-point number by parts. There
372 * are FRAC_SHIFT bits that may require rounding at the bottom of the
373 * fraction; these bits will be removed. The exponent will be biased
374 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
377 static FloatParts round_canonical(FloatParts p, float_status *s,
378 const FloatFmt *parm)
380 const uint64_t frac_lsbm1 = parm->frac_lsbm1;
381 const uint64_t round_mask = parm->round_mask;
382 const uint64_t roundeven_mask = parm->roundeven_mask;
383 const int exp_max = parm->exp_max;
384 const int frac_shift = parm->frac_shift;
385 uint64_t frac, inc;
386 int exp, flags = 0;
387 bool overflow_norm;
389 frac = p.frac;
390 exp = p.exp;
392 switch (p.cls) {
393 case float_class_normal:
394 switch (s->float_rounding_mode) {
395 case float_round_nearest_even:
396 overflow_norm = false;
397 inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
398 break;
399 case float_round_ties_away:
400 overflow_norm = false;
401 inc = frac_lsbm1;
402 break;
403 case float_round_to_zero:
404 overflow_norm = true;
405 inc = 0;
406 break;
407 case float_round_up:
408 inc = p.sign ? 0 : round_mask;
409 overflow_norm = p.sign;
410 break;
411 case float_round_down:
412 inc = p.sign ? round_mask : 0;
413 overflow_norm = !p.sign;
414 break;
415 default:
416 g_assert_not_reached();
419 exp += parm->exp_bias;
420 if (likely(exp > 0)) {
421 if (frac & round_mask) {
422 flags |= float_flag_inexact;
423 frac += inc;
424 if (frac & DECOMPOSED_OVERFLOW_BIT) {
425 frac >>= 1;
426 exp++;
429 frac >>= frac_shift;
431 if (parm->arm_althp) {
432 /* ARM Alt HP eschews Inf and NaN for a wider exponent. */
433 if (unlikely(exp > exp_max)) {
434 /* Overflow. Return the maximum normal. */
435 flags = float_flag_invalid;
436 exp = exp_max;
437 frac = -1;
439 } else if (unlikely(exp >= exp_max)) {
440 flags |= float_flag_overflow | float_flag_inexact;
441 if (overflow_norm) {
442 exp = exp_max - 1;
443 frac = -1;
444 } else {
445 p.cls = float_class_inf;
446 goto do_inf;
449 } else if (s->flush_to_zero) {
450 flags |= float_flag_output_denormal;
451 p.cls = float_class_zero;
452 goto do_zero;
453 } else {
454 bool is_tiny = (s->float_detect_tininess
455 == float_tininess_before_rounding)
456 || (exp < 0)
457 || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);
459 shift64RightJamming(frac, 1 - exp, &frac);
460 if (frac & round_mask) {
461 /* Need to recompute round-to-even. */
462 if (s->float_rounding_mode == float_round_nearest_even) {
463 inc = ((frac & roundeven_mask) != frac_lsbm1
464 ? frac_lsbm1 : 0);
466 flags |= float_flag_inexact;
467 frac += inc;
470 exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
471 frac >>= frac_shift;
473 if (is_tiny && (flags & float_flag_inexact)) {
474 flags |= float_flag_underflow;
476 if (exp == 0 && frac == 0) {
477 p.cls = float_class_zero;
480 break;
482 case float_class_zero:
483 do_zero:
484 exp = 0;
485 frac = 0;
486 break;
488 case float_class_inf:
489 do_inf:
490 assert(!parm->arm_althp);
491 exp = exp_max;
492 frac = 0;
493 break;
495 case float_class_qnan:
496 case float_class_snan:
497 assert(!parm->arm_althp);
498 exp = exp_max;
499 frac >>= parm->frac_shift;
500 break;
502 default:
503 g_assert_not_reached();
506 float_raise(flags, s);
507 p.exp = exp;
508 p.frac = frac;
509 return p;
512 /* Explicit FloatFmt version */
513 static FloatParts float16a_unpack_canonical(float16 f, float_status *s,
514 const FloatFmt *params)
516 return canonicalize(float16_unpack_raw(f), params, s);
519 static FloatParts float16_unpack_canonical(float16 f, float_status *s)
521 return float16a_unpack_canonical(f, s, &float16_params);
524 static float16 float16a_round_pack_canonical(FloatParts p, float_status *s,
525 const FloatFmt *params)
527 return float16_pack_raw(round_canonical(p, s, params));
530 static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
532 return float16a_round_pack_canonical(p, s, &float16_params);
535 static FloatParts float32_unpack_canonical(float32 f, float_status *s)
537 return canonicalize(float32_unpack_raw(f), &float32_params, s);
540 static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
542 return float32_pack_raw(round_canonical(p, s, &float32_params));
545 static FloatParts float64_unpack_canonical(float64 f, float_status *s)
547 return canonicalize(float64_unpack_raw(f), &float64_params, s);
550 static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
552 return float64_pack_raw(round_canonical(p, s, &float64_params));
555 static FloatParts return_nan(FloatParts a, float_status *s)
557 switch (a.cls) {
558 case float_class_snan:
559 s->float_exception_flags |= float_flag_invalid;
560 a = parts_silence_nan(a, s);
561 /* fall through */
562 case float_class_qnan:
563 if (s->default_nan_mode) {
564 return parts_default_nan(s);
566 break;
568 default:
569 g_assert_not_reached();
571 return a;
574 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
576 if (is_snan(a.cls) || is_snan(b.cls)) {
577 s->float_exception_flags |= float_flag_invalid;
580 if (s->default_nan_mode) {
581 return parts_default_nan(s);
582 } else {
583 if (pickNaN(a.cls, b.cls,
584 a.frac > b.frac ||
585 (a.frac == b.frac && a.sign < b.sign))) {
586 a = b;
588 if (is_snan(a.cls)) {
589 return parts_silence_nan(a, s);
592 return a;
595 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
596 bool inf_zero, float_status *s)
598 int which;
600 if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
601 s->float_exception_flags |= float_flag_invalid;
604 which = pickNaNMulAdd(a.cls, b.cls, c.cls, inf_zero, s);
606 if (s->default_nan_mode) {
607 /* Note that this check is after pickNaNMulAdd so that function
608 * has an opportunity to set the Invalid flag.
610 which = 3;
613 switch (which) {
614 case 0:
615 break;
616 case 1:
617 a = b;
618 break;
619 case 2:
620 a = c;
621 break;
622 case 3:
623 return parts_default_nan(s);
624 default:
625 g_assert_not_reached();
628 if (is_snan(a.cls)) {
629 return parts_silence_nan(a, s);
631 return a;
635 * Returns the result of adding or subtracting the values of the
636 * floating-point values `a' and `b'. The operation is performed
637 * according to the IEC/IEEE Standard for Binary Floating-Point
638 * Arithmetic.
641 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
642 float_status *s)
644 bool a_sign = a.sign;
645 bool b_sign = b.sign ^ subtract;
647 if (a_sign != b_sign) {
648 /* Subtraction */
650 if (a.cls == float_class_normal && b.cls == float_class_normal) {
651 if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
652 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
653 a.frac = a.frac - b.frac;
654 } else {
655 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
656 a.frac = b.frac - a.frac;
657 a.exp = b.exp;
658 a_sign ^= 1;
661 if (a.frac == 0) {
662 a.cls = float_class_zero;
663 a.sign = s->float_rounding_mode == float_round_down;
664 } else {
665 int shift = clz64(a.frac) - 1;
666 a.frac = a.frac << shift;
667 a.exp = a.exp - shift;
668 a.sign = a_sign;
670 return a;
672 if (is_nan(a.cls) || is_nan(b.cls)) {
673 return pick_nan(a, b, s);
675 if (a.cls == float_class_inf) {
676 if (b.cls == float_class_inf) {
677 float_raise(float_flag_invalid, s);
678 return parts_default_nan(s);
680 return a;
682 if (a.cls == float_class_zero && b.cls == float_class_zero) {
683 a.sign = s->float_rounding_mode == float_round_down;
684 return a;
686 if (a.cls == float_class_zero || b.cls == float_class_inf) {
687 b.sign = a_sign ^ 1;
688 return b;
690 if (b.cls == float_class_zero) {
691 return a;
693 } else {
694 /* Addition */
695 if (a.cls == float_class_normal && b.cls == float_class_normal) {
696 if (a.exp > b.exp) {
697 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
698 } else if (a.exp < b.exp) {
699 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
700 a.exp = b.exp;
702 a.frac += b.frac;
703 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
704 shift64RightJamming(a.frac, 1, &a.frac);
705 a.exp += 1;
707 return a;
709 if (is_nan(a.cls) || is_nan(b.cls)) {
710 return pick_nan(a, b, s);
712 if (a.cls == float_class_inf || b.cls == float_class_zero) {
713 return a;
715 if (b.cls == float_class_inf || a.cls == float_class_zero) {
716 b.sign = b_sign;
717 return b;
720 g_assert_not_reached();
724 * Returns the result of adding or subtracting the floating-point
725 * values `a' and `b'. The operation is performed according to the
726 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
729 float16 __attribute__((flatten)) float16_add(float16 a, float16 b,
730 float_status *status)
732 FloatParts pa = float16_unpack_canonical(a, status);
733 FloatParts pb = float16_unpack_canonical(b, status);
734 FloatParts pr = addsub_floats(pa, pb, false, status);
736 return float16_round_pack_canonical(pr, status);
739 float32 __attribute__((flatten)) float32_add(float32 a, float32 b,
740 float_status *status)
742 FloatParts pa = float32_unpack_canonical(a, status);
743 FloatParts pb = float32_unpack_canonical(b, status);
744 FloatParts pr = addsub_floats(pa, pb, false, status);
746 return float32_round_pack_canonical(pr, status);
749 float64 __attribute__((flatten)) float64_add(float64 a, float64 b,
750 float_status *status)
752 FloatParts pa = float64_unpack_canonical(a, status);
753 FloatParts pb = float64_unpack_canonical(b, status);
754 FloatParts pr = addsub_floats(pa, pb, false, status);
756 return float64_round_pack_canonical(pr, status);
759 float16 __attribute__((flatten)) float16_sub(float16 a, float16 b,
760 float_status *status)
762 FloatParts pa = float16_unpack_canonical(a, status);
763 FloatParts pb = float16_unpack_canonical(b, status);
764 FloatParts pr = addsub_floats(pa, pb, true, status);
766 return float16_round_pack_canonical(pr, status);
769 float32 __attribute__((flatten)) float32_sub(float32 a, float32 b,
770 float_status *status)
772 FloatParts pa = float32_unpack_canonical(a, status);
773 FloatParts pb = float32_unpack_canonical(b, status);
774 FloatParts pr = addsub_floats(pa, pb, true, status);
776 return float32_round_pack_canonical(pr, status);
779 float64 __attribute__((flatten)) float64_sub(float64 a, float64 b,
780 float_status *status)
782 FloatParts pa = float64_unpack_canonical(a, status);
783 FloatParts pb = float64_unpack_canonical(b, status);
784 FloatParts pr = addsub_floats(pa, pb, true, status);
786 return float64_round_pack_canonical(pr, status);
790 * Returns the result of multiplying the floating-point values `a' and
791 * `b'. The operation is performed according to the IEC/IEEE Standard
792 * for Binary Floating-Point Arithmetic.
795 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
797 bool sign = a.sign ^ b.sign;
799 if (a.cls == float_class_normal && b.cls == float_class_normal) {
800 uint64_t hi, lo;
801 int exp = a.exp + b.exp;
803 mul64To128(a.frac, b.frac, &hi, &lo);
804 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
805 if (lo & DECOMPOSED_OVERFLOW_BIT) {
806 shift64RightJamming(lo, 1, &lo);
807 exp += 1;
810 /* Re-use a */
811 a.exp = exp;
812 a.sign = sign;
813 a.frac = lo;
814 return a;
816 /* handle all the NaN cases */
817 if (is_nan(a.cls) || is_nan(b.cls)) {
818 return pick_nan(a, b, s);
820 /* Inf * Zero == NaN */
821 if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
822 (a.cls == float_class_zero && b.cls == float_class_inf)) {
823 s->float_exception_flags |= float_flag_invalid;
824 return parts_default_nan(s);
826 /* Multiply by 0 or Inf */
827 if (a.cls == float_class_inf || a.cls == float_class_zero) {
828 a.sign = sign;
829 return a;
831 if (b.cls == float_class_inf || b.cls == float_class_zero) {
832 b.sign = sign;
833 return b;
835 g_assert_not_reached();
838 float16 __attribute__((flatten)) float16_mul(float16 a, float16 b,
839 float_status *status)
841 FloatParts pa = float16_unpack_canonical(a, status);
842 FloatParts pb = float16_unpack_canonical(b, status);
843 FloatParts pr = mul_floats(pa, pb, status);
845 return float16_round_pack_canonical(pr, status);
848 float32 __attribute__((flatten)) float32_mul(float32 a, float32 b,
849 float_status *status)
851 FloatParts pa = float32_unpack_canonical(a, status);
852 FloatParts pb = float32_unpack_canonical(b, status);
853 FloatParts pr = mul_floats(pa, pb, status);
855 return float32_round_pack_canonical(pr, status);
858 float64 __attribute__((flatten)) float64_mul(float64 a, float64 b,
859 float_status *status)
861 FloatParts pa = float64_unpack_canonical(a, status);
862 FloatParts pb = float64_unpack_canonical(b, status);
863 FloatParts pr = mul_floats(pa, pb, status);
865 return float64_round_pack_canonical(pr, status);
869 * Returns the result of multiplying the floating-point values `a' and
870 * `b' then adding 'c', with no intermediate rounding step after the
871 * multiplication. The operation is performed according to the
872 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
873 * The flags argument allows the caller to select negation of the
874 * addend, the intermediate product, or the final result. (The
875 * difference between this and having the caller do a separate
876 * negation is that negating externally will flip the sign bit on
877 * NaNs.)
880 static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
881 int flags, float_status *s)
883 bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
884 ((1 << float_class_inf) | (1 << float_class_zero));
885 bool p_sign;
886 bool sign_flip = flags & float_muladd_negate_result;
887 FloatClass p_class;
888 uint64_t hi, lo;
889 int p_exp;
891 /* It is implementation-defined whether the cases of (0,inf,qnan)
892 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
893 * they return if they do), so we have to hand this information
894 * off to the target-specific pick-a-NaN routine.
896 if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
897 return pick_nan_muladd(a, b, c, inf_zero, s);
900 if (inf_zero) {
901 s->float_exception_flags |= float_flag_invalid;
902 return parts_default_nan(s);
905 if (flags & float_muladd_negate_c) {
906 c.sign ^= 1;
909 p_sign = a.sign ^ b.sign;
911 if (flags & float_muladd_negate_product) {
912 p_sign ^= 1;
915 if (a.cls == float_class_inf || b.cls == float_class_inf) {
916 p_class = float_class_inf;
917 } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
918 p_class = float_class_zero;
919 } else {
920 p_class = float_class_normal;
923 if (c.cls == float_class_inf) {
924 if (p_class == float_class_inf && p_sign != c.sign) {
925 s->float_exception_flags |= float_flag_invalid;
926 return parts_default_nan(s);
927 } else {
928 a.cls = float_class_inf;
929 a.sign = c.sign ^ sign_flip;
930 return a;
934 if (p_class == float_class_inf) {
935 a.cls = float_class_inf;
936 a.sign = p_sign ^ sign_flip;
937 return a;
940 if (p_class == float_class_zero) {
941 if (c.cls == float_class_zero) {
942 if (p_sign != c.sign) {
943 p_sign = s->float_rounding_mode == float_round_down;
945 c.sign = p_sign;
946 } else if (flags & float_muladd_halve_result) {
947 c.exp -= 1;
949 c.sign ^= sign_flip;
950 return c;
953 /* a & b should be normals now... */
954 assert(a.cls == float_class_normal &&
955 b.cls == float_class_normal);
957 p_exp = a.exp + b.exp;
959 /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
960 * result.
962 mul64To128(a.frac, b.frac, &hi, &lo);
963 /* binary point now at bit 124 */
965 /* check for overflow */
966 if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
967 shift128RightJamming(hi, lo, 1, &hi, &lo);
968 p_exp += 1;
971 /* + add/sub */
972 if (c.cls == float_class_zero) {
973 /* move binary point back to 62 */
974 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
975 } else {
976 int exp_diff = p_exp - c.exp;
977 if (p_sign == c.sign) {
978 /* Addition */
979 if (exp_diff <= 0) {
980 shift128RightJamming(hi, lo,
981 DECOMPOSED_BINARY_POINT - exp_diff,
982 &hi, &lo);
983 lo += c.frac;
984 p_exp = c.exp;
985 } else {
986 uint64_t c_hi, c_lo;
987 /* shift c to the same binary point as the product (124) */
988 c_hi = c.frac >> 2;
989 c_lo = 0;
990 shift128RightJamming(c_hi, c_lo,
991 exp_diff,
992 &c_hi, &c_lo);
993 add128(hi, lo, c_hi, c_lo, &hi, &lo);
994 /* move binary point back to 62 */
995 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
998 if (lo & DECOMPOSED_OVERFLOW_BIT) {
999 shift64RightJamming(lo, 1, &lo);
1000 p_exp += 1;
1003 } else {
1004 /* Subtraction */
1005 uint64_t c_hi, c_lo;
1006 /* make C binary point match product at bit 124 */
1007 c_hi = c.frac >> 2;
1008 c_lo = 0;
1010 if (exp_diff <= 0) {
1011 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
1012 if (exp_diff == 0
1014 (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
1015 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1016 } else {
1017 sub128(c_hi, c_lo, hi, lo, &hi, &lo);
1018 p_sign ^= 1;
1019 p_exp = c.exp;
1021 } else {
1022 shift128RightJamming(c_hi, c_lo,
1023 exp_diff,
1024 &c_hi, &c_lo);
1025 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1028 if (hi == 0 && lo == 0) {
1029 a.cls = float_class_zero;
1030 a.sign = s->float_rounding_mode == float_round_down;
1031 a.sign ^= sign_flip;
1032 return a;
1033 } else {
1034 int shift;
1035 if (hi != 0) {
1036 shift = clz64(hi);
1037 } else {
1038 shift = clz64(lo) + 64;
1040 /* Normalizing to a binary point of 124 is the
1041 correct adjust for the exponent. However since we're
1042 shifting, we might as well put the binary point back
1043 at 62 where we really want it. Therefore shift as
1044 if we're leaving 1 bit at the top of the word, but
1045 adjust the exponent as if we're leaving 3 bits. */
1046 shift -= 1;
1047 if (shift >= 64) {
1048 lo = lo << (shift - 64);
1049 } else {
1050 hi = (hi << shift) | (lo >> (64 - shift));
1051 lo = hi | ((lo << shift) != 0);
1053 p_exp -= shift - 2;
1058 if (flags & float_muladd_halve_result) {
1059 p_exp -= 1;
1062 /* finally prepare our result */
1063 a.cls = float_class_normal;
1064 a.sign = p_sign ^ sign_flip;
1065 a.exp = p_exp;
1066 a.frac = lo;
1068 return a;
1071 float16 __attribute__((flatten)) float16_muladd(float16 a, float16 b, float16 c,
1072 int flags, float_status *status)
1074 FloatParts pa = float16_unpack_canonical(a, status);
1075 FloatParts pb = float16_unpack_canonical(b, status);
1076 FloatParts pc = float16_unpack_canonical(c, status);
1077 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1079 return float16_round_pack_canonical(pr, status);
1082 float32 __attribute__((flatten)) float32_muladd(float32 a, float32 b, float32 c,
1083 int flags, float_status *status)
1085 FloatParts pa = float32_unpack_canonical(a, status);
1086 FloatParts pb = float32_unpack_canonical(b, status);
1087 FloatParts pc = float32_unpack_canonical(c, status);
1088 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1090 return float32_round_pack_canonical(pr, status);
1093 float64 __attribute__((flatten)) float64_muladd(float64 a, float64 b, float64 c,
1094 int flags, float_status *status)
1096 FloatParts pa = float64_unpack_canonical(a, status);
1097 FloatParts pb = float64_unpack_canonical(b, status);
1098 FloatParts pc = float64_unpack_canonical(c, status);
1099 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1101 return float64_round_pack_canonical(pr, status);
1105 * Returns the result of dividing the floating-point value `a' by the
1106 * corresponding value `b'. The operation is performed according to
1107 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1110 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
1112 bool sign = a.sign ^ b.sign;
1114 if (a.cls == float_class_normal && b.cls == float_class_normal) {
1115 uint64_t temp_lo, temp_hi;
1116 int exp = a.exp - b.exp;
1117 if (a.frac < b.frac) {
1118 exp -= 1;
1119 shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1,
1120 &temp_hi, &temp_lo);
1121 } else {
1122 shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT,
1123 &temp_hi, &temp_lo);
1125 /* LSB of quot is set if inexact which roundandpack will use
1126 * to set flags. Yet again we re-use a for the result */
1127 a.frac = div128To64(temp_lo, temp_hi, b.frac);
1128 a.sign = sign;
1129 a.exp = exp;
1130 return a;
1132 /* handle all the NaN cases */
1133 if (is_nan(a.cls) || is_nan(b.cls)) {
1134 return pick_nan(a, b, s);
1136 /* 0/0 or Inf/Inf */
1137 if (a.cls == b.cls
1139 (a.cls == float_class_inf || a.cls == float_class_zero)) {
1140 s->float_exception_flags |= float_flag_invalid;
1141 return parts_default_nan(s);
1143 /* Inf / x or 0 / x */
1144 if (a.cls == float_class_inf || a.cls == float_class_zero) {
1145 a.sign = sign;
1146 return a;
1148 /* Div 0 => Inf */
1149 if (b.cls == float_class_zero) {
1150 s->float_exception_flags |= float_flag_divbyzero;
1151 a.cls = float_class_inf;
1152 a.sign = sign;
1153 return a;
1155 /* Div by Inf */
1156 if (b.cls == float_class_inf) {
1157 a.cls = float_class_zero;
1158 a.sign = sign;
1159 return a;
1161 g_assert_not_reached();
1164 float16 float16_div(float16 a, float16 b, float_status *status)
1166 FloatParts pa = float16_unpack_canonical(a, status);
1167 FloatParts pb = float16_unpack_canonical(b, status);
1168 FloatParts pr = div_floats(pa, pb, status);
1170 return float16_round_pack_canonical(pr, status);
1173 float32 float32_div(float32 a, float32 b, float_status *status)
1175 FloatParts pa = float32_unpack_canonical(a, status);
1176 FloatParts pb = float32_unpack_canonical(b, status);
1177 FloatParts pr = div_floats(pa, pb, status);
1179 return float32_round_pack_canonical(pr, status);
1182 float64 float64_div(float64 a, float64 b, float_status *status)
1184 FloatParts pa = float64_unpack_canonical(a, status);
1185 FloatParts pb = float64_unpack_canonical(b, status);
1186 FloatParts pr = div_floats(pa, pb, status);
1188 return float64_round_pack_canonical(pr, status);
1192 * Float to Float conversions
1194 * Returns the result of converting one float format to another. The
1195 * conversion is performed according to the IEC/IEEE Standard for
1196 * Binary Floating-Point Arithmetic.
1198 * The float_to_float helper only needs to take care of raising
1199 * invalid exceptions and handling the conversion on NaNs.
1202 static FloatParts float_to_float(FloatParts a, const FloatFmt *dstf,
1203 float_status *s)
1205 if (dstf->arm_althp) {
1206 switch (a.cls) {
1207 case float_class_qnan:
1208 case float_class_snan:
1209 /* There is no NaN in the destination format. Raise Invalid
1210 * and return a zero with the sign of the input NaN.
1212 s->float_exception_flags |= float_flag_invalid;
1213 a.cls = float_class_zero;
1214 a.frac = 0;
1215 a.exp = 0;
1216 break;
1218 case float_class_inf:
1219 /* There is no Inf in the destination format. Raise Invalid
1220 * and return the maximum normal with the correct sign.
1222 s->float_exception_flags |= float_flag_invalid;
1223 a.cls = float_class_normal;
1224 a.exp = dstf->exp_max;
1225 a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift;
1226 break;
1228 default:
1229 break;
1231 } else if (is_nan(a.cls)) {
1232 if (is_snan(a.cls)) {
1233 s->float_exception_flags |= float_flag_invalid;
1234 a = parts_silence_nan(a, s);
1236 if (s->default_nan_mode) {
1237 return parts_default_nan(s);
1240 return a;
1243 float32 float16_to_float32(float16 a, bool ieee, float_status *s)
1245 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1246 FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1247 FloatParts pr = float_to_float(p, &float32_params, s);
1248 return float32_round_pack_canonical(pr, s);
1251 float64 float16_to_float64(float16 a, bool ieee, float_status *s)
1253 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1254 FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1255 FloatParts pr = float_to_float(p, &float64_params, s);
1256 return float64_round_pack_canonical(pr, s);
1259 float16 float32_to_float16(float32 a, bool ieee, float_status *s)
1261 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1262 FloatParts p = float32_unpack_canonical(a, s);
1263 FloatParts pr = float_to_float(p, fmt16, s);
1264 return float16a_round_pack_canonical(pr, s, fmt16);
1267 float64 float32_to_float64(float32 a, float_status *s)
1269 FloatParts p = float32_unpack_canonical(a, s);
1270 FloatParts pr = float_to_float(p, &float64_params, s);
1271 return float64_round_pack_canonical(pr, s);
1274 float16 float64_to_float16(float64 a, bool ieee, float_status *s)
1276 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1277 FloatParts p = float64_unpack_canonical(a, s);
1278 FloatParts pr = float_to_float(p, fmt16, s);
1279 return float16a_round_pack_canonical(pr, s, fmt16);
1282 float32 float64_to_float32(float64 a, float_status *s)
1284 FloatParts p = float64_unpack_canonical(a, s);
1285 FloatParts pr = float_to_float(p, &float32_params, s);
1286 return float32_round_pack_canonical(pr, s);
1290 * Rounds the floating-point value `a' to an integer, and returns the
1291 * result as a floating-point value. The operation is performed
1292 * according to the IEC/IEEE Standard for Binary Floating-Point
1293 * Arithmetic.
1296 static FloatParts round_to_int(FloatParts a, int rounding_mode, float_status *s)
1298 if (is_nan(a.cls)) {
1299 return return_nan(a, s);
1302 switch (a.cls) {
1303 case float_class_zero:
1304 case float_class_inf:
1305 case float_class_qnan:
1306 /* already "integral" */
1307 break;
1308 case float_class_normal:
1309 if (a.exp >= DECOMPOSED_BINARY_POINT) {
1310 /* already integral */
1311 break;
1313 if (a.exp < 0) {
1314 bool one;
1315 /* all fractional */
1316 s->float_exception_flags |= float_flag_inexact;
1317 switch (rounding_mode) {
1318 case float_round_nearest_even:
1319 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
1320 break;
1321 case float_round_ties_away:
1322 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
1323 break;
1324 case float_round_to_zero:
1325 one = false;
1326 break;
1327 case float_round_up:
1328 one = !a.sign;
1329 break;
1330 case float_round_down:
1331 one = a.sign;
1332 break;
1333 default:
1334 g_assert_not_reached();
1337 if (one) {
1338 a.frac = DECOMPOSED_IMPLICIT_BIT;
1339 a.exp = 0;
1340 } else {
1341 a.cls = float_class_zero;
1343 } else {
1344 uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
1345 uint64_t frac_lsbm1 = frac_lsb >> 1;
1346 uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
1347 uint64_t rnd_mask = rnd_even_mask >> 1;
1348 uint64_t inc;
1350 switch (rounding_mode) {
1351 case float_round_nearest_even:
1352 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
1353 break;
1354 case float_round_ties_away:
1355 inc = frac_lsbm1;
1356 break;
1357 case float_round_to_zero:
1358 inc = 0;
1359 break;
1360 case float_round_up:
1361 inc = a.sign ? 0 : rnd_mask;
1362 break;
1363 case float_round_down:
1364 inc = a.sign ? rnd_mask : 0;
1365 break;
1366 default:
1367 g_assert_not_reached();
1370 if (a.frac & rnd_mask) {
1371 s->float_exception_flags |= float_flag_inexact;
1372 a.frac += inc;
1373 a.frac &= ~rnd_mask;
1374 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
1375 a.frac >>= 1;
1376 a.exp++;
1380 break;
1381 default:
1382 g_assert_not_reached();
1384 return a;
1387 float16 float16_round_to_int(float16 a, float_status *s)
1389 FloatParts pa = float16_unpack_canonical(a, s);
1390 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1391 return float16_round_pack_canonical(pr, s);
1394 float32 float32_round_to_int(float32 a, float_status *s)
1396 FloatParts pa = float32_unpack_canonical(a, s);
1397 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1398 return float32_round_pack_canonical(pr, s);
1401 float64 float64_round_to_int(float64 a, float_status *s)
1403 FloatParts pa = float64_unpack_canonical(a, s);
1404 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1405 return float64_round_pack_canonical(pr, s);
1408 float64 float64_trunc_to_int(float64 a, float_status *s)
1410 FloatParts pa = float64_unpack_canonical(a, s);
1411 FloatParts pr = round_to_int(pa, float_round_to_zero, s);
1412 return float64_round_pack_canonical(pr, s);
1416 * Returns the result of converting the floating-point value `a' to
1417 * the two's complement integer format. The conversion is performed
1418 * according to the IEC/IEEE Standard for Binary Floating-Point
1419 * Arithmetic---which means in particular that the conversion is
1420 * rounded according to the current rounding mode. If `a' is a NaN,
1421 * the largest positive integer is returned. Otherwise, if the
1422 * conversion overflows, the largest integer with the same sign as `a'
1423 * is returned.
1426 static int64_t round_to_int_and_pack(FloatParts in, int rmode,
1427 int64_t min, int64_t max,
1428 float_status *s)
1430 uint64_t r;
1431 int orig_flags = get_float_exception_flags(s);
1432 FloatParts p = round_to_int(in, rmode, s);
1434 switch (p.cls) {
1435 case float_class_snan:
1436 case float_class_qnan:
1437 s->float_exception_flags = orig_flags | float_flag_invalid;
1438 return max;
1439 case float_class_inf:
1440 s->float_exception_flags = orig_flags | float_flag_invalid;
1441 return p.sign ? min : max;
1442 case float_class_zero:
1443 return 0;
1444 case float_class_normal:
1445 if (p.exp < DECOMPOSED_BINARY_POINT) {
1446 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1447 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1448 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1449 } else {
1450 r = UINT64_MAX;
1452 if (p.sign) {
1453 if (r <= -(uint64_t) min) {
1454 return -r;
1455 } else {
1456 s->float_exception_flags = orig_flags | float_flag_invalid;
1457 return min;
1459 } else {
1460 if (r <= max) {
1461 return r;
1462 } else {
1463 s->float_exception_flags = orig_flags | float_flag_invalid;
1464 return max;
1467 default:
1468 g_assert_not_reached();
1472 #define FLOAT_TO_INT(fsz, isz) \
1473 int ## isz ## _t float ## fsz ## _to_int ## isz(float ## fsz a, \
1474 float_status *s) \
1476 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1477 return round_to_int_and_pack(p, s->float_rounding_mode, \
1478 INT ## isz ## _MIN, INT ## isz ## _MAX,\
1479 s); \
1482 int ## isz ## _t float ## fsz ## _to_int ## isz ## _round_to_zero \
1483 (float ## fsz a, float_status *s) \
1485 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1486 return round_to_int_and_pack(p, float_round_to_zero, \
1487 INT ## isz ## _MIN, INT ## isz ## _MAX,\
1488 s); \
1491 FLOAT_TO_INT(16, 16)
1492 FLOAT_TO_INT(16, 32)
1493 FLOAT_TO_INT(16, 64)
1495 FLOAT_TO_INT(32, 16)
1496 FLOAT_TO_INT(32, 32)
1497 FLOAT_TO_INT(32, 64)
1499 FLOAT_TO_INT(64, 16)
1500 FLOAT_TO_INT(64, 32)
1501 FLOAT_TO_INT(64, 64)
1503 #undef FLOAT_TO_INT
1506 * Returns the result of converting the floating-point value `a' to
1507 * the unsigned integer format. The conversion is performed according
1508 * to the IEC/IEEE Standard for Binary Floating-Point
1509 * Arithmetic---which means in particular that the conversion is
1510 * rounded according to the current rounding mode. If `a' is a NaN,
1511 * the largest unsigned integer is returned. Otherwise, if the
1512 * conversion overflows, the largest unsigned integer is returned. If
1513 * the 'a' is negative, the result is rounded and zero is returned;
1514 * values that do not round to zero will raise the inexact exception
1515 * flag.
1518 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, uint64_t max,
1519 float_status *s)
1521 int orig_flags = get_float_exception_flags(s);
1522 FloatParts p = round_to_int(in, rmode, s);
1524 switch (p.cls) {
1525 case float_class_snan:
1526 case float_class_qnan:
1527 s->float_exception_flags = orig_flags | float_flag_invalid;
1528 return max;
1529 case float_class_inf:
1530 s->float_exception_flags = orig_flags | float_flag_invalid;
1531 return p.sign ? 0 : max;
1532 case float_class_zero:
1533 return 0;
1534 case float_class_normal:
1536 uint64_t r;
1537 if (p.sign) {
1538 s->float_exception_flags = orig_flags | float_flag_invalid;
1539 return 0;
1542 if (p.exp < DECOMPOSED_BINARY_POINT) {
1543 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1544 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1545 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1546 } else {
1547 s->float_exception_flags = orig_flags | float_flag_invalid;
1548 return max;
1551 /* For uint64 this will never trip, but if p.exp is too large
1552 * to shift a decomposed fraction we shall have exited via the
1553 * 3rd leg above.
1555 if (r > max) {
1556 s->float_exception_flags = orig_flags | float_flag_invalid;
1557 return max;
1558 } else {
1559 return r;
1562 default:
1563 g_assert_not_reached();
1567 #define FLOAT_TO_UINT(fsz, isz) \
1568 uint ## isz ## _t float ## fsz ## _to_uint ## isz(float ## fsz a, \
1569 float_status *s) \
1571 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1572 return round_to_uint_and_pack(p, s->float_rounding_mode, \
1573 UINT ## isz ## _MAX, s); \
1576 uint ## isz ## _t float ## fsz ## _to_uint ## isz ## _round_to_zero \
1577 (float ## fsz a, float_status *s) \
1579 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1580 return round_to_uint_and_pack(p, float_round_to_zero, \
1581 UINT ## isz ## _MAX, s); \
1584 FLOAT_TO_UINT(16, 16)
1585 FLOAT_TO_UINT(16, 32)
1586 FLOAT_TO_UINT(16, 64)
1588 FLOAT_TO_UINT(32, 16)
1589 FLOAT_TO_UINT(32, 32)
1590 FLOAT_TO_UINT(32, 64)
1592 FLOAT_TO_UINT(64, 16)
1593 FLOAT_TO_UINT(64, 32)
1594 FLOAT_TO_UINT(64, 64)
1596 #undef FLOAT_TO_UINT
1599 * Integer to float conversions
1601 * Returns the result of converting the two's complement integer `a'
1602 * to the floating-point format. The conversion is performed according
1603 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1606 static FloatParts int_to_float(int64_t a, int scale, float_status *status)
1608 FloatParts r = { .sign = false };
1610 if (a == 0) {
1611 r.cls = float_class_zero;
1612 } else {
1613 uint64_t f = a;
1614 int shift;
1616 r.cls = float_class_normal;
1617 if (a < 0) {
1618 f = -f;
1619 r.sign = true;
1621 shift = clz64(f) - 1;
1622 scale = MIN(MAX(scale, -0x10000), 0x10000);
1624 r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
1625 r.frac = (shift < 0 ? DECOMPOSED_IMPLICIT_BIT : f << shift);
1628 return r;
1631 float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status)
1633 FloatParts pa = int_to_float(a, scale, status);
1634 return float16_round_pack_canonical(pa, status);
1637 float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status)
1639 return int64_to_float16_scalbn(a, scale, status);
1642 float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status)
1644 return int64_to_float16_scalbn(a, scale, status);
1647 float16 int64_to_float16(int64_t a, float_status *status)
1649 return int64_to_float16_scalbn(a, 0, status);
1652 float16 int32_to_float16(int32_t a, float_status *status)
1654 return int64_to_float16_scalbn(a, 0, status);
1657 float16 int16_to_float16(int16_t a, float_status *status)
1659 return int64_to_float16_scalbn(a, 0, status);
1662 float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status)
1664 FloatParts pa = int_to_float(a, scale, status);
1665 return float32_round_pack_canonical(pa, status);
1668 float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status)
1670 return int64_to_float32_scalbn(a, scale, status);
1673 float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status)
1675 return int64_to_float32_scalbn(a, scale, status);
1678 float32 int64_to_float32(int64_t a, float_status *status)
1680 return int64_to_float32_scalbn(a, 0, status);
1683 float32 int32_to_float32(int32_t a, float_status *status)
1685 return int64_to_float32_scalbn(a, 0, status);
1688 float32 int16_to_float32(int16_t a, float_status *status)
1690 return int64_to_float32_scalbn(a, 0, status);
1693 float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status)
1695 FloatParts pa = int_to_float(a, scale, status);
1696 return float64_round_pack_canonical(pa, status);
1699 float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status)
1701 return int64_to_float64_scalbn(a, scale, status);
1704 float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status)
1706 return int64_to_float64_scalbn(a, scale, status);
1709 float64 int64_to_float64(int64_t a, float_status *status)
1711 return int64_to_float64_scalbn(a, 0, status);
1714 float64 int32_to_float64(int32_t a, float_status *status)
1716 return int64_to_float64_scalbn(a, 0, status);
1719 float64 int16_to_float64(int16_t a, float_status *status)
1721 return int64_to_float64_scalbn(a, 0, status);
1726 * Unsigned Integer to float conversions
1728 * Returns the result of converting the unsigned integer `a' to the
1729 * floating-point format. The conversion is performed according to the
1730 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1733 static FloatParts uint_to_float(uint64_t a, int scale, float_status *status)
1735 FloatParts r = { .sign = false };
1737 if (a == 0) {
1738 r.cls = float_class_zero;
1739 } else {
1740 scale = MIN(MAX(scale, -0x10000), 0x10000);
1741 r.cls = float_class_normal;
1742 if ((int64_t)a < 0) {
1743 r.exp = DECOMPOSED_BINARY_POINT + 1 + scale;
1744 shift64RightJamming(a, 1, &a);
1745 r.frac = a;
1746 } else {
1747 int shift = clz64(a) - 1;
1748 r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
1749 r.frac = a << shift;
1753 return r;
1756 float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status)
1758 FloatParts pa = uint_to_float(a, scale, status);
1759 return float16_round_pack_canonical(pa, status);
1762 float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status)
1764 return uint64_to_float16_scalbn(a, scale, status);
1767 float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status)
1769 return uint64_to_float16_scalbn(a, scale, status);
1772 float16 uint64_to_float16(uint64_t a, float_status *status)
1774 return uint64_to_float16_scalbn(a, 0, status);
1777 float16 uint32_to_float16(uint32_t a, float_status *status)
1779 return uint64_to_float16_scalbn(a, 0, status);
1782 float16 uint16_to_float16(uint16_t a, float_status *status)
1784 return uint64_to_float16_scalbn(a, 0, status);
1787 float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status)
1789 FloatParts pa = uint_to_float(a, scale, status);
1790 return float32_round_pack_canonical(pa, status);
1793 float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status)
1795 return uint64_to_float32_scalbn(a, scale, status);
1798 float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status)
1800 return uint64_to_float32_scalbn(a, scale, status);
1803 float32 uint64_to_float32(uint64_t a, float_status *status)
1805 return uint64_to_float32_scalbn(a, 0, status);
1808 float32 uint32_to_float32(uint32_t a, float_status *status)
1810 return uint64_to_float32_scalbn(a, 0, status);
1813 float32 uint16_to_float32(uint16_t a, float_status *status)
1815 return uint64_to_float32_scalbn(a, 0, status);
1818 float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status)
1820 FloatParts pa = uint_to_float(a, scale, status);
1821 return float64_round_pack_canonical(pa, status);
1824 float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status)
1826 return uint64_to_float64_scalbn(a, scale, status);
1829 float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status)
1831 return uint64_to_float64_scalbn(a, scale, status);
1834 float64 uint64_to_float64(uint64_t a, float_status *status)
1836 return uint64_to_float64_scalbn(a, 0, status);
1839 float64 uint32_to_float64(uint32_t a, float_status *status)
1841 return uint64_to_float64_scalbn(a, 0, status);
1844 float64 uint16_to_float64(uint16_t a, float_status *status)
1846 return uint64_to_float64_scalbn(a, 0, status);
1849 /* Float Min/Max */
1850 /* min() and max() functions. These can't be implemented as
1851 * 'compare and pick one input' because that would mishandle
1852 * NaNs and +0 vs -0.
1854 * minnum() and maxnum() functions. These are similar to the min()
1855 * and max() functions but if one of the arguments is a QNaN and
1856 * the other is numerical then the numerical argument is returned.
1857 * SNaNs will get quietened before being returned.
1858 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
1859 * and maxNum() operations. min() and max() are the typical min/max
1860 * semantics provided by many CPUs which predate that specification.
1862 * minnummag() and maxnummag() functions correspond to minNumMag()
1863 * and minNumMag() from the IEEE-754 2008.
1865 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin,
1866 bool ieee, bool ismag, float_status *s)
1868 if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
1869 if (ieee) {
1870 /* Takes two floating-point values `a' and `b', one of
1871 * which is a NaN, and returns the appropriate NaN
1872 * result. If either `a' or `b' is a signaling NaN,
1873 * the invalid exception is raised.
1875 if (is_snan(a.cls) || is_snan(b.cls)) {
1876 return pick_nan(a, b, s);
1877 } else if (is_nan(a.cls) && !is_nan(b.cls)) {
1878 return b;
1879 } else if (is_nan(b.cls) && !is_nan(a.cls)) {
1880 return a;
1883 return pick_nan(a, b, s);
1884 } else {
1885 int a_exp, b_exp;
1887 switch (a.cls) {
1888 case float_class_normal:
1889 a_exp = a.exp;
1890 break;
1891 case float_class_inf:
1892 a_exp = INT_MAX;
1893 break;
1894 case float_class_zero:
1895 a_exp = INT_MIN;
1896 break;
1897 default:
1898 g_assert_not_reached();
1899 break;
1901 switch (b.cls) {
1902 case float_class_normal:
1903 b_exp = b.exp;
1904 break;
1905 case float_class_inf:
1906 b_exp = INT_MAX;
1907 break;
1908 case float_class_zero:
1909 b_exp = INT_MIN;
1910 break;
1911 default:
1912 g_assert_not_reached();
1913 break;
1916 if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
1917 bool a_less = a_exp < b_exp;
1918 if (a_exp == b_exp) {
1919 a_less = a.frac < b.frac;
1921 return a_less ^ ismin ? b : a;
1924 if (a.sign == b.sign) {
1925 bool a_less = a_exp < b_exp;
1926 if (a_exp == b_exp) {
1927 a_less = a.frac < b.frac;
1929 return a.sign ^ a_less ^ ismin ? b : a;
1930 } else {
1931 return a.sign ^ ismin ? b : a;
1936 #define MINMAX(sz, name, ismin, isiee, ismag) \
1937 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \
1938 float_status *s) \
1940 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
1941 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
1942 FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \
1944 return float ## sz ## _round_pack_canonical(pr, s); \
1947 MINMAX(16, min, true, false, false)
1948 MINMAX(16, minnum, true, true, false)
1949 MINMAX(16, minnummag, true, true, true)
1950 MINMAX(16, max, false, false, false)
1951 MINMAX(16, maxnum, false, true, false)
1952 MINMAX(16, maxnummag, false, true, true)
1954 MINMAX(32, min, true, false, false)
1955 MINMAX(32, minnum, true, true, false)
1956 MINMAX(32, minnummag, true, true, true)
1957 MINMAX(32, max, false, false, false)
1958 MINMAX(32, maxnum, false, true, false)
1959 MINMAX(32, maxnummag, false, true, true)
1961 MINMAX(64, min, true, false, false)
1962 MINMAX(64, minnum, true, true, false)
1963 MINMAX(64, minnummag, true, true, true)
1964 MINMAX(64, max, false, false, false)
1965 MINMAX(64, maxnum, false, true, false)
1966 MINMAX(64, maxnummag, false, true, true)
1968 #undef MINMAX
1970 /* Floating point compare */
1971 static int compare_floats(FloatParts a, FloatParts b, bool is_quiet,
1972 float_status *s)
1974 if (is_nan(a.cls) || is_nan(b.cls)) {
1975 if (!is_quiet ||
1976 a.cls == float_class_snan ||
1977 b.cls == float_class_snan) {
1978 s->float_exception_flags |= float_flag_invalid;
1980 return float_relation_unordered;
1983 if (a.cls == float_class_zero) {
1984 if (b.cls == float_class_zero) {
1985 return float_relation_equal;
1987 return b.sign ? float_relation_greater : float_relation_less;
1988 } else if (b.cls == float_class_zero) {
1989 return a.sign ? float_relation_less : float_relation_greater;
1992 /* The only really important thing about infinity is its sign. If
1993 * both are infinities the sign marks the smallest of the two.
1995 if (a.cls == float_class_inf) {
1996 if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
1997 return float_relation_equal;
1999 return a.sign ? float_relation_less : float_relation_greater;
2000 } else if (b.cls == float_class_inf) {
2001 return b.sign ? float_relation_greater : float_relation_less;
2004 if (a.sign != b.sign) {
2005 return a.sign ? float_relation_less : float_relation_greater;
2008 if (a.exp == b.exp) {
2009 if (a.frac == b.frac) {
2010 return float_relation_equal;
2012 if (a.sign) {
2013 return a.frac > b.frac ?
2014 float_relation_less : float_relation_greater;
2015 } else {
2016 return a.frac > b.frac ?
2017 float_relation_greater : float_relation_less;
2019 } else {
2020 if (a.sign) {
2021 return a.exp > b.exp ? float_relation_less : float_relation_greater;
2022 } else {
2023 return a.exp > b.exp ? float_relation_greater : float_relation_less;
2028 #define COMPARE(sz) \
2029 int float ## sz ## _compare(float ## sz a, float ## sz b, \
2030 float_status *s) \
2032 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
2033 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
2034 return compare_floats(pa, pb, false, s); \
2036 int float ## sz ## _compare_quiet(float ## sz a, float ## sz b, \
2037 float_status *s) \
2039 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
2040 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
2041 return compare_floats(pa, pb, true, s); \
2044 COMPARE(16)
2045 COMPARE(32)
2046 COMPARE(64)
2048 #undef COMPARE
2050 /* Multiply A by 2 raised to the power N. */
2051 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s)
2053 if (unlikely(is_nan(a.cls))) {
2054 return return_nan(a, s);
2056 if (a.cls == float_class_normal) {
2057 /* The largest float type (even though not supported by FloatParts)
2058 * is float128, which has a 15 bit exponent. Bounding N to 16 bits
2059 * still allows rounding to infinity, without allowing overflow
2060 * within the int32_t that backs FloatParts.exp.
2062 n = MIN(MAX(n, -0x10000), 0x10000);
2063 a.exp += n;
2065 return a;
2068 float16 float16_scalbn(float16 a, int n, float_status *status)
2070 FloatParts pa = float16_unpack_canonical(a, status);
2071 FloatParts pr = scalbn_decomposed(pa, n, status);
2072 return float16_round_pack_canonical(pr, status);
2075 float32 float32_scalbn(float32 a, int n, float_status *status)
2077 FloatParts pa = float32_unpack_canonical(a, status);
2078 FloatParts pr = scalbn_decomposed(pa, n, status);
2079 return float32_round_pack_canonical(pr, status);
2082 float64 float64_scalbn(float64 a, int n, float_status *status)
2084 FloatParts pa = float64_unpack_canonical(a, status);
2085 FloatParts pr = scalbn_decomposed(pa, n, status);
2086 return float64_round_pack_canonical(pr, status);
2090 * Square Root
2092 * The old softfloat code did an approximation step before zeroing in
2093 * on the final result. However for simpleness we just compute the
2094 * square root by iterating down from the implicit bit to enough extra
2095 * bits to ensure we get a correctly rounded result.
2097 * This does mean however the calculation is slower than before,
2098 * especially for 64 bit floats.
2101 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p)
2103 uint64_t a_frac, r_frac, s_frac;
2104 int bit, last_bit;
2106 if (is_nan(a.cls)) {
2107 return return_nan(a, s);
2109 if (a.cls == float_class_zero) {
2110 return a; /* sqrt(+-0) = +-0 */
2112 if (a.sign) {
2113 s->float_exception_flags |= float_flag_invalid;
2114 return parts_default_nan(s);
2116 if (a.cls == float_class_inf) {
2117 return a; /* sqrt(+inf) = +inf */
2120 assert(a.cls == float_class_normal);
2122 /* We need two overflow bits at the top. Adding room for that is a
2123 * right shift. If the exponent is odd, we can discard the low bit
2124 * by multiplying the fraction by 2; that's a left shift. Combine
2125 * those and we shift right if the exponent is even.
2127 a_frac = a.frac;
2128 if (!(a.exp & 1)) {
2129 a_frac >>= 1;
2131 a.exp >>= 1;
2133 /* Bit-by-bit computation of sqrt. */
2134 r_frac = 0;
2135 s_frac = 0;
2137 /* Iterate from implicit bit down to the 3 extra bits to compute a
2138 * properly rounded result. Remember we've inserted one more bit
2139 * at the top, so these positions are one less.
2141 bit = DECOMPOSED_BINARY_POINT - 1;
2142 last_bit = MAX(p->frac_shift - 4, 0);
2143 do {
2144 uint64_t q = 1ULL << bit;
2145 uint64_t t_frac = s_frac + q;
2146 if (t_frac <= a_frac) {
2147 s_frac = t_frac + q;
2148 a_frac -= t_frac;
2149 r_frac += q;
2151 a_frac <<= 1;
2152 } while (--bit >= last_bit);
2154 /* Undo the right shift done above. If there is any remaining
2155 * fraction, the result is inexact. Set the sticky bit.
2157 a.frac = (r_frac << 1) + (a_frac != 0);
2159 return a;
2162 float16 __attribute__((flatten)) float16_sqrt(float16 a, float_status *status)
2164 FloatParts pa = float16_unpack_canonical(a, status);
2165 FloatParts pr = sqrt_float(pa, status, &float16_params);
2166 return float16_round_pack_canonical(pr, status);
2169 float32 __attribute__((flatten)) float32_sqrt(float32 a, float_status *status)
2171 FloatParts pa = float32_unpack_canonical(a, status);
2172 FloatParts pr = sqrt_float(pa, status, &float32_params);
2173 return float32_round_pack_canonical(pr, status);
2176 float64 __attribute__((flatten)) float64_sqrt(float64 a, float_status *status)
2178 FloatParts pa = float64_unpack_canonical(a, status);
2179 FloatParts pr = sqrt_float(pa, status, &float64_params);
2180 return float64_round_pack_canonical(pr, status);
2183 /*----------------------------------------------------------------------------
2184 | The pattern for a default generated NaN.
2185 *----------------------------------------------------------------------------*/
2187 float16 float16_default_nan(float_status *status)
2189 FloatParts p = parts_default_nan(status);
2190 p.frac >>= float16_params.frac_shift;
2191 return float16_pack_raw(p);
2194 float32 float32_default_nan(float_status *status)
2196 FloatParts p = parts_default_nan(status);
2197 p.frac >>= float32_params.frac_shift;
2198 return float32_pack_raw(p);
2201 float64 float64_default_nan(float_status *status)
2203 FloatParts p = parts_default_nan(status);
2204 p.frac >>= float64_params.frac_shift;
2205 return float64_pack_raw(p);
2208 float128 float128_default_nan(float_status *status)
2210 FloatParts p = parts_default_nan(status);
2211 float128 r;
2213 /* Extrapolate from the choices made by parts_default_nan to fill
2214 * in the quad-floating format. If the low bit is set, assume we
2215 * want to set all non-snan bits.
2217 r.low = -(p.frac & 1);
2218 r.high = p.frac >> (DECOMPOSED_BINARY_POINT - 48);
2219 r.high |= LIT64(0x7FFF000000000000);
2220 r.high |= (uint64_t)p.sign << 63;
2222 return r;
2225 /*----------------------------------------------------------------------------
2226 | Returns a quiet NaN from a signalling NaN for the floating point value `a'.
2227 *----------------------------------------------------------------------------*/
2229 float16 float16_silence_nan(float16 a, float_status *status)
2231 FloatParts p = float16_unpack_raw(a);
2232 p.frac <<= float16_params.frac_shift;
2233 p = parts_silence_nan(p, status);
2234 p.frac >>= float16_params.frac_shift;
2235 return float16_pack_raw(p);
2238 float32 float32_silence_nan(float32 a, float_status *status)
2240 FloatParts p = float32_unpack_raw(a);
2241 p.frac <<= float32_params.frac_shift;
2242 p = parts_silence_nan(p, status);
2243 p.frac >>= float32_params.frac_shift;
2244 return float32_pack_raw(p);
2247 float64 float64_silence_nan(float64 a, float_status *status)
2249 FloatParts p = float64_unpack_raw(a);
2250 p.frac <<= float64_params.frac_shift;
2251 p = parts_silence_nan(p, status);
2252 p.frac >>= float64_params.frac_shift;
2253 return float64_pack_raw(p);
2256 /*----------------------------------------------------------------------------
2257 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
2258 | and 7, and returns the properly rounded 32-bit integer corresponding to the
2259 | input. If `zSign' is 1, the input is negated before being converted to an
2260 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
2261 | is simply rounded to an integer, with the inexact exception raised if the
2262 | input cannot be represented exactly as an integer. However, if the fixed-
2263 | point input is too large, the invalid exception is raised and the largest
2264 | positive or negative integer is returned.
2265 *----------------------------------------------------------------------------*/
2267 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
2269 int8_t roundingMode;
2270 flag roundNearestEven;
2271 int8_t roundIncrement, roundBits;
2272 int32_t z;
2274 roundingMode = status->float_rounding_mode;
2275 roundNearestEven = ( roundingMode == float_round_nearest_even );
2276 switch (roundingMode) {
2277 case float_round_nearest_even:
2278 case float_round_ties_away:
2279 roundIncrement = 0x40;
2280 break;
2281 case float_round_to_zero:
2282 roundIncrement = 0;
2283 break;
2284 case float_round_up:
2285 roundIncrement = zSign ? 0 : 0x7f;
2286 break;
2287 case float_round_down:
2288 roundIncrement = zSign ? 0x7f : 0;
2289 break;
2290 default:
2291 abort();
2293 roundBits = absZ & 0x7F;
2294 absZ = ( absZ + roundIncrement )>>7;
2295 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2296 z = absZ;
2297 if ( zSign ) z = - z;
2298 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
2299 float_raise(float_flag_invalid, status);
2300 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2302 if (roundBits) {
2303 status->float_exception_flags |= float_flag_inexact;
2305 return z;
2309 /*----------------------------------------------------------------------------
2310 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2311 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2312 | and returns the properly rounded 64-bit integer corresponding to the input.
2313 | If `zSign' is 1, the input is negated before being converted to an integer.
2314 | Ordinarily, the fixed-point input is simply rounded to an integer, with
2315 | the inexact exception raised if the input cannot be represented exactly as
2316 | an integer. However, if the fixed-point input is too large, the invalid
2317 | exception is raised and the largest positive or negative integer is
2318 | returned.
2319 *----------------------------------------------------------------------------*/
2321 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
2322 float_status *status)
2324 int8_t roundingMode;
2325 flag roundNearestEven, increment;
2326 int64_t z;
2328 roundingMode = status->float_rounding_mode;
2329 roundNearestEven = ( roundingMode == float_round_nearest_even );
2330 switch (roundingMode) {
2331 case float_round_nearest_even:
2332 case float_round_ties_away:
2333 increment = ((int64_t) absZ1 < 0);
2334 break;
2335 case float_round_to_zero:
2336 increment = 0;
2337 break;
2338 case float_round_up:
2339 increment = !zSign && absZ1;
2340 break;
2341 case float_round_down:
2342 increment = zSign && absZ1;
2343 break;
2344 default:
2345 abort();
2347 if ( increment ) {
2348 ++absZ0;
2349 if ( absZ0 == 0 ) goto overflow;
2350 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
2352 z = absZ0;
2353 if ( zSign ) z = - z;
2354 if ( z && ( ( z < 0 ) ^ zSign ) ) {
2355 overflow:
2356 float_raise(float_flag_invalid, status);
2357 return
2358 zSign ? (int64_t) LIT64( 0x8000000000000000 )
2359 : LIT64( 0x7FFFFFFFFFFFFFFF );
2361 if (absZ1) {
2362 status->float_exception_flags |= float_flag_inexact;
2364 return z;
2368 /*----------------------------------------------------------------------------
2369 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2370 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2371 | and returns the properly rounded 64-bit unsigned integer corresponding to the
2372 | input. Ordinarily, the fixed-point input is simply rounded to an integer,
2373 | with the inexact exception raised if the input cannot be represented exactly
2374 | as an integer. However, if the fixed-point input is too large, the invalid
2375 | exception is raised and the largest unsigned integer is returned.
2376 *----------------------------------------------------------------------------*/
2378 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
2379 uint64_t absZ1, float_status *status)
2381 int8_t roundingMode;
2382 flag roundNearestEven, increment;
2384 roundingMode = status->float_rounding_mode;
2385 roundNearestEven = (roundingMode == float_round_nearest_even);
2386 switch (roundingMode) {
2387 case float_round_nearest_even:
2388 case float_round_ties_away:
2389 increment = ((int64_t)absZ1 < 0);
2390 break;
2391 case float_round_to_zero:
2392 increment = 0;
2393 break;
2394 case float_round_up:
2395 increment = !zSign && absZ1;
2396 break;
2397 case float_round_down:
2398 increment = zSign && absZ1;
2399 break;
2400 default:
2401 abort();
2403 if (increment) {
2404 ++absZ0;
2405 if (absZ0 == 0) {
2406 float_raise(float_flag_invalid, status);
2407 return LIT64(0xFFFFFFFFFFFFFFFF);
2409 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
2412 if (zSign && absZ0) {
2413 float_raise(float_flag_invalid, status);
2414 return 0;
2417 if (absZ1) {
2418 status->float_exception_flags |= float_flag_inexact;
2420 return absZ0;
2423 /*----------------------------------------------------------------------------
2424 | If `a' is denormal and we are in flush-to-zero mode then set the
2425 | input-denormal exception and return zero. Otherwise just return the value.
2426 *----------------------------------------------------------------------------*/
2427 float32 float32_squash_input_denormal(float32 a, float_status *status)
2429 if (status->flush_inputs_to_zero) {
2430 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
2431 float_raise(float_flag_input_denormal, status);
2432 return make_float32(float32_val(a) & 0x80000000);
2435 return a;
2438 /*----------------------------------------------------------------------------
2439 | Normalizes the subnormal single-precision floating-point value represented
2440 | by the denormalized significand `aSig'. The normalized exponent and
2441 | significand are stored at the locations pointed to by `zExpPtr' and
2442 | `zSigPtr', respectively.
2443 *----------------------------------------------------------------------------*/
2445 static void
2446 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
2448 int8_t shiftCount;
2450 shiftCount = countLeadingZeros32( aSig ) - 8;
2451 *zSigPtr = aSig<<shiftCount;
2452 *zExpPtr = 1 - shiftCount;
2456 /*----------------------------------------------------------------------------
2457 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2458 | and significand `zSig', and returns the proper single-precision floating-
2459 | point value corresponding to the abstract input. Ordinarily, the abstract
2460 | value is simply rounded and packed into the single-precision format, with
2461 | the inexact exception raised if the abstract input cannot be represented
2462 | exactly. However, if the abstract value is too large, the overflow and
2463 | inexact exceptions are raised and an infinity or maximal finite value is
2464 | returned. If the abstract value is too small, the input value is rounded to
2465 | a subnormal number, and the underflow and inexact exceptions are raised if
2466 | the abstract input cannot be represented exactly as a subnormal single-
2467 | precision floating-point number.
2468 | The input significand `zSig' has its binary point between bits 30
2469 | and 29, which is 7 bits to the left of the usual location. This shifted
2470 | significand must be normalized or smaller. If `zSig' is not normalized,
2471 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2472 | and it must not require rounding. In the usual case that `zSig' is
2473 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2474 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2475 | Binary Floating-Point Arithmetic.
2476 *----------------------------------------------------------------------------*/
2478 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
2479 float_status *status)
2481 int8_t roundingMode;
2482 flag roundNearestEven;
2483 int8_t roundIncrement, roundBits;
2484 flag isTiny;
2486 roundingMode = status->float_rounding_mode;
2487 roundNearestEven = ( roundingMode == float_round_nearest_even );
2488 switch (roundingMode) {
2489 case float_round_nearest_even:
2490 case float_round_ties_away:
2491 roundIncrement = 0x40;
2492 break;
2493 case float_round_to_zero:
2494 roundIncrement = 0;
2495 break;
2496 case float_round_up:
2497 roundIncrement = zSign ? 0 : 0x7f;
2498 break;
2499 case float_round_down:
2500 roundIncrement = zSign ? 0x7f : 0;
2501 break;
2502 default:
2503 abort();
2504 break;
2506 roundBits = zSig & 0x7F;
2507 if ( 0xFD <= (uint16_t) zExp ) {
2508 if ( ( 0xFD < zExp )
2509 || ( ( zExp == 0xFD )
2510 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
2512 float_raise(float_flag_overflow | float_flag_inexact, status);
2513 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
2515 if ( zExp < 0 ) {
2516 if (status->flush_to_zero) {
2517 float_raise(float_flag_output_denormal, status);
2518 return packFloat32(zSign, 0, 0);
2520 isTiny =
2521 (status->float_detect_tininess
2522 == float_tininess_before_rounding)
2523 || ( zExp < -1 )
2524 || ( zSig + roundIncrement < 0x80000000 );
2525 shift32RightJamming( zSig, - zExp, &zSig );
2526 zExp = 0;
2527 roundBits = zSig & 0x7F;
2528 if (isTiny && roundBits) {
2529 float_raise(float_flag_underflow, status);
2533 if (roundBits) {
2534 status->float_exception_flags |= float_flag_inexact;
2536 zSig = ( zSig + roundIncrement )>>7;
2537 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2538 if ( zSig == 0 ) zExp = 0;
2539 return packFloat32( zSign, zExp, zSig );
2543 /*----------------------------------------------------------------------------
2544 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2545 | and significand `zSig', and returns the proper single-precision floating-
2546 | point value corresponding to the abstract input. This routine is just like
2547 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
2548 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2549 | floating-point exponent.
2550 *----------------------------------------------------------------------------*/
2552 static float32
2553 normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
2554 float_status *status)
2556 int8_t shiftCount;
2558 shiftCount = countLeadingZeros32( zSig ) - 1;
2559 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
2560 status);
2564 /*----------------------------------------------------------------------------
2565 | If `a' is denormal and we are in flush-to-zero mode then set the
2566 | input-denormal exception and return zero. Otherwise just return the value.
2567 *----------------------------------------------------------------------------*/
2568 float64 float64_squash_input_denormal(float64 a, float_status *status)
2570 if (status->flush_inputs_to_zero) {
2571 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
2572 float_raise(float_flag_input_denormal, status);
2573 return make_float64(float64_val(a) & (1ULL << 63));
2576 return a;
2579 /*----------------------------------------------------------------------------
2580 | Normalizes the subnormal double-precision floating-point value represented
2581 | by the denormalized significand `aSig'. The normalized exponent and
2582 | significand are stored at the locations pointed to by `zExpPtr' and
2583 | `zSigPtr', respectively.
2584 *----------------------------------------------------------------------------*/
2586 static void
2587 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
2589 int8_t shiftCount;
2591 shiftCount = countLeadingZeros64( aSig ) - 11;
2592 *zSigPtr = aSig<<shiftCount;
2593 *zExpPtr = 1 - shiftCount;
2597 /*----------------------------------------------------------------------------
2598 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2599 | double-precision floating-point value, returning the result. After being
2600 | shifted into the proper positions, the three fields are simply added
2601 | together to form the result. This means that any integer portion of `zSig'
2602 | will be added into the exponent. Since a properly normalized significand
2603 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2604 | than the desired result exponent whenever `zSig' is a complete, normalized
2605 | significand.
2606 *----------------------------------------------------------------------------*/
2608 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
2611 return make_float64(
2612 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
2616 /*----------------------------------------------------------------------------
2617 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2618 | and significand `zSig', and returns the proper double-precision floating-
2619 | point value corresponding to the abstract input. Ordinarily, the abstract
2620 | value is simply rounded and packed into the double-precision format, with
2621 | the inexact exception raised if the abstract input cannot be represented
2622 | exactly. However, if the abstract value is too large, the overflow and
2623 | inexact exceptions are raised and an infinity or maximal finite value is
2624 | returned. If the abstract value is too small, the input value is rounded to
2625 | a subnormal number, and the underflow and inexact exceptions are raised if
2626 | the abstract input cannot be represented exactly as a subnormal double-
2627 | precision floating-point number.
2628 | The input significand `zSig' has its binary point between bits 62
2629 | and 61, which is 10 bits to the left of the usual location. This shifted
2630 | significand must be normalized or smaller. If `zSig' is not normalized,
2631 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2632 | and it must not require rounding. In the usual case that `zSig' is
2633 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2634 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2635 | Binary Floating-Point Arithmetic.
2636 *----------------------------------------------------------------------------*/
2638 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2639 float_status *status)
2641 int8_t roundingMode;
2642 flag roundNearestEven;
2643 int roundIncrement, roundBits;
2644 flag isTiny;
2646 roundingMode = status->float_rounding_mode;
2647 roundNearestEven = ( roundingMode == float_round_nearest_even );
2648 switch (roundingMode) {
2649 case float_round_nearest_even:
2650 case float_round_ties_away:
2651 roundIncrement = 0x200;
2652 break;
2653 case float_round_to_zero:
2654 roundIncrement = 0;
2655 break;
2656 case float_round_up:
2657 roundIncrement = zSign ? 0 : 0x3ff;
2658 break;
2659 case float_round_down:
2660 roundIncrement = zSign ? 0x3ff : 0;
2661 break;
2662 case float_round_to_odd:
2663 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2664 break;
2665 default:
2666 abort();
2668 roundBits = zSig & 0x3FF;
2669 if ( 0x7FD <= (uint16_t) zExp ) {
2670 if ( ( 0x7FD < zExp )
2671 || ( ( zExp == 0x7FD )
2672 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
2674 bool overflow_to_inf = roundingMode != float_round_to_odd &&
2675 roundIncrement != 0;
2676 float_raise(float_flag_overflow | float_flag_inexact, status);
2677 return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
2679 if ( zExp < 0 ) {
2680 if (status->flush_to_zero) {
2681 float_raise(float_flag_output_denormal, status);
2682 return packFloat64(zSign, 0, 0);
2684 isTiny =
2685 (status->float_detect_tininess
2686 == float_tininess_before_rounding)
2687 || ( zExp < -1 )
2688 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
2689 shift64RightJamming( zSig, - zExp, &zSig );
2690 zExp = 0;
2691 roundBits = zSig & 0x3FF;
2692 if (isTiny && roundBits) {
2693 float_raise(float_flag_underflow, status);
2695 if (roundingMode == float_round_to_odd) {
2697 * For round-to-odd case, the roundIncrement depends on
2698 * zSig which just changed.
2700 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2704 if (roundBits) {
2705 status->float_exception_flags |= float_flag_inexact;
2707 zSig = ( zSig + roundIncrement )>>10;
2708 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
2709 if ( zSig == 0 ) zExp = 0;
2710 return packFloat64( zSign, zExp, zSig );
2714 /*----------------------------------------------------------------------------
2715 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2716 | and significand `zSig', and returns the proper double-precision floating-
2717 | point value corresponding to the abstract input. This routine is just like
2718 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
2719 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2720 | floating-point exponent.
2721 *----------------------------------------------------------------------------*/
2723 static float64
2724 normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2725 float_status *status)
2727 int8_t shiftCount;
2729 shiftCount = countLeadingZeros64( zSig ) - 1;
2730 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
2731 status);
2735 /*----------------------------------------------------------------------------
2736 | Normalizes the subnormal extended double-precision floating-point value
2737 | represented by the denormalized significand `aSig'. The normalized exponent
2738 | and significand are stored at the locations pointed to by `zExpPtr' and
2739 | `zSigPtr', respectively.
2740 *----------------------------------------------------------------------------*/
2742 void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
2743 uint64_t *zSigPtr)
2745 int8_t shiftCount;
2747 shiftCount = countLeadingZeros64( aSig );
2748 *zSigPtr = aSig<<shiftCount;
2749 *zExpPtr = 1 - shiftCount;
2752 /*----------------------------------------------------------------------------
2753 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2754 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
2755 | and returns the proper extended double-precision floating-point value
2756 | corresponding to the abstract input. Ordinarily, the abstract value is
2757 | rounded and packed into the extended double-precision format, with the
2758 | inexact exception raised if the abstract input cannot be represented
2759 | exactly. However, if the abstract value is too large, the overflow and
2760 | inexact exceptions are raised and an infinity or maximal finite value is
2761 | returned. If the abstract value is too small, the input value is rounded to
2762 | a subnormal number, and the underflow and inexact exceptions are raised if
2763 | the abstract input cannot be represented exactly as a subnormal extended
2764 | double-precision floating-point number.
2765 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
2766 | number of bits as single or double precision, respectively. Otherwise, the
2767 | result is rounded to the full precision of the extended double-precision
2768 | format.
2769 | The input significand must be normalized or smaller. If the input
2770 | significand is not normalized, `zExp' must be 0; in that case, the result
2771 | returned is a subnormal number, and it must not require rounding. The
2772 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
2773 | Floating-Point Arithmetic.
2774 *----------------------------------------------------------------------------*/
2776 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
2777 int32_t zExp, uint64_t zSig0, uint64_t zSig1,
2778 float_status *status)
2780 int8_t roundingMode;
2781 flag roundNearestEven, increment, isTiny;
2782 int64_t roundIncrement, roundMask, roundBits;
2784 roundingMode = status->float_rounding_mode;
2785 roundNearestEven = ( roundingMode == float_round_nearest_even );
2786 if ( roundingPrecision == 80 ) goto precision80;
2787 if ( roundingPrecision == 64 ) {
2788 roundIncrement = LIT64( 0x0000000000000400 );
2789 roundMask = LIT64( 0x00000000000007FF );
2791 else if ( roundingPrecision == 32 ) {
2792 roundIncrement = LIT64( 0x0000008000000000 );
2793 roundMask = LIT64( 0x000000FFFFFFFFFF );
2795 else {
2796 goto precision80;
2798 zSig0 |= ( zSig1 != 0 );
2799 switch (roundingMode) {
2800 case float_round_nearest_even:
2801 case float_round_ties_away:
2802 break;
2803 case float_round_to_zero:
2804 roundIncrement = 0;
2805 break;
2806 case float_round_up:
2807 roundIncrement = zSign ? 0 : roundMask;
2808 break;
2809 case float_round_down:
2810 roundIncrement = zSign ? roundMask : 0;
2811 break;
2812 default:
2813 abort();
2815 roundBits = zSig0 & roundMask;
2816 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
2817 if ( ( 0x7FFE < zExp )
2818 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
2820 goto overflow;
2822 if ( zExp <= 0 ) {
2823 if (status->flush_to_zero) {
2824 float_raise(float_flag_output_denormal, status);
2825 return packFloatx80(zSign, 0, 0);
2827 isTiny =
2828 (status->float_detect_tininess
2829 == float_tininess_before_rounding)
2830 || ( zExp < 0 )
2831 || ( zSig0 <= zSig0 + roundIncrement );
2832 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
2833 zExp = 0;
2834 roundBits = zSig0 & roundMask;
2835 if (isTiny && roundBits) {
2836 float_raise(float_flag_underflow, status);
2838 if (roundBits) {
2839 status->float_exception_flags |= float_flag_inexact;
2841 zSig0 += roundIncrement;
2842 if ( (int64_t) zSig0 < 0 ) zExp = 1;
2843 roundIncrement = roundMask + 1;
2844 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
2845 roundMask |= roundIncrement;
2847 zSig0 &= ~ roundMask;
2848 return packFloatx80( zSign, zExp, zSig0 );
2851 if (roundBits) {
2852 status->float_exception_flags |= float_flag_inexact;
2854 zSig0 += roundIncrement;
2855 if ( zSig0 < roundIncrement ) {
2856 ++zExp;
2857 zSig0 = LIT64( 0x8000000000000000 );
2859 roundIncrement = roundMask + 1;
2860 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
2861 roundMask |= roundIncrement;
2863 zSig0 &= ~ roundMask;
2864 if ( zSig0 == 0 ) zExp = 0;
2865 return packFloatx80( zSign, zExp, zSig0 );
2866 precision80:
2867 switch (roundingMode) {
2868 case float_round_nearest_even:
2869 case float_round_ties_away:
2870 increment = ((int64_t)zSig1 < 0);
2871 break;
2872 case float_round_to_zero:
2873 increment = 0;
2874 break;
2875 case float_round_up:
2876 increment = !zSign && zSig1;
2877 break;
2878 case float_round_down:
2879 increment = zSign && zSig1;
2880 break;
2881 default:
2882 abort();
2884 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
2885 if ( ( 0x7FFE < zExp )
2886 || ( ( zExp == 0x7FFE )
2887 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
2888 && increment
2891 roundMask = 0;
2892 overflow:
2893 float_raise(float_flag_overflow | float_flag_inexact, status);
2894 if ( ( roundingMode == float_round_to_zero )
2895 || ( zSign && ( roundingMode == float_round_up ) )
2896 || ( ! zSign && ( roundingMode == float_round_down ) )
2898 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
2900 return packFloatx80(zSign,
2901 floatx80_infinity_high,
2902 floatx80_infinity_low);
2904 if ( zExp <= 0 ) {
2905 isTiny =
2906 (status->float_detect_tininess
2907 == float_tininess_before_rounding)
2908 || ( zExp < 0 )
2909 || ! increment
2910 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
2911 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
2912 zExp = 0;
2913 if (isTiny && zSig1) {
2914 float_raise(float_flag_underflow, status);
2916 if (zSig1) {
2917 status->float_exception_flags |= float_flag_inexact;
2919 switch (roundingMode) {
2920 case float_round_nearest_even:
2921 case float_round_ties_away:
2922 increment = ((int64_t)zSig1 < 0);
2923 break;
2924 case float_round_to_zero:
2925 increment = 0;
2926 break;
2927 case float_round_up:
2928 increment = !zSign && zSig1;
2929 break;
2930 case float_round_down:
2931 increment = zSign && zSig1;
2932 break;
2933 default:
2934 abort();
2936 if ( increment ) {
2937 ++zSig0;
2938 zSig0 &=
2939 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
2940 if ( (int64_t) zSig0 < 0 ) zExp = 1;
2942 return packFloatx80( zSign, zExp, zSig0 );
2945 if (zSig1) {
2946 status->float_exception_flags |= float_flag_inexact;
2948 if ( increment ) {
2949 ++zSig0;
2950 if ( zSig0 == 0 ) {
2951 ++zExp;
2952 zSig0 = LIT64( 0x8000000000000000 );
2954 else {
2955 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
2958 else {
2959 if ( zSig0 == 0 ) zExp = 0;
2961 return packFloatx80( zSign, zExp, zSig0 );
2965 /*----------------------------------------------------------------------------
2966 | Takes an abstract floating-point value having sign `zSign', exponent
2967 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
2968 | and returns the proper extended double-precision floating-point value
2969 | corresponding to the abstract input. This routine is just like
2970 | `roundAndPackFloatx80' except that the input significand does not have to be
2971 | normalized.
2972 *----------------------------------------------------------------------------*/
2974 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
2975 flag zSign, int32_t zExp,
2976 uint64_t zSig0, uint64_t zSig1,
2977 float_status *status)
2979 int8_t shiftCount;
2981 if ( zSig0 == 0 ) {
2982 zSig0 = zSig1;
2983 zSig1 = 0;
2984 zExp -= 64;
2986 shiftCount = countLeadingZeros64( zSig0 );
2987 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
2988 zExp -= shiftCount;
2989 return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
2990 zSig0, zSig1, status);
2994 /*----------------------------------------------------------------------------
2995 | Returns the least-significant 64 fraction bits of the quadruple-precision
2996 | floating-point value `a'.
2997 *----------------------------------------------------------------------------*/
2999 static inline uint64_t extractFloat128Frac1( float128 a )
3002 return a.low;
3006 /*----------------------------------------------------------------------------
3007 | Returns the most-significant 48 fraction bits of the quadruple-precision
3008 | floating-point value `a'.
3009 *----------------------------------------------------------------------------*/
3011 static inline uint64_t extractFloat128Frac0( float128 a )
3014 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
3018 /*----------------------------------------------------------------------------
3019 | Returns the exponent bits of the quadruple-precision floating-point value
3020 | `a'.
3021 *----------------------------------------------------------------------------*/
3023 static inline int32_t extractFloat128Exp( float128 a )
3026 return ( a.high>>48 ) & 0x7FFF;
3030 /*----------------------------------------------------------------------------
3031 | Returns the sign bit of the quadruple-precision floating-point value `a'.
3032 *----------------------------------------------------------------------------*/
3034 static inline flag extractFloat128Sign( float128 a )
3037 return a.high>>63;
3041 /*----------------------------------------------------------------------------
3042 | Normalizes the subnormal quadruple-precision floating-point value
3043 | represented by the denormalized significand formed by the concatenation of
3044 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
3045 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
3046 | significand are stored at the location pointed to by `zSig0Ptr', and the
3047 | least significant 64 bits of the normalized significand are stored at the
3048 | location pointed to by `zSig1Ptr'.
3049 *----------------------------------------------------------------------------*/
3051 static void
3052 normalizeFloat128Subnormal(
3053 uint64_t aSig0,
3054 uint64_t aSig1,
3055 int32_t *zExpPtr,
3056 uint64_t *zSig0Ptr,
3057 uint64_t *zSig1Ptr
3060 int8_t shiftCount;
3062 if ( aSig0 == 0 ) {
3063 shiftCount = countLeadingZeros64( aSig1 ) - 15;
3064 if ( shiftCount < 0 ) {
3065 *zSig0Ptr = aSig1>>( - shiftCount );
3066 *zSig1Ptr = aSig1<<( shiftCount & 63 );
3068 else {
3069 *zSig0Ptr = aSig1<<shiftCount;
3070 *zSig1Ptr = 0;
3072 *zExpPtr = - shiftCount - 63;
3074 else {
3075 shiftCount = countLeadingZeros64( aSig0 ) - 15;
3076 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
3077 *zExpPtr = 1 - shiftCount;
3082 /*----------------------------------------------------------------------------
3083 | Packs the sign `zSign', the exponent `zExp', and the significand formed
3084 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
3085 | floating-point value, returning the result. After being shifted into the
3086 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
3087 | added together to form the most significant 32 bits of the result. This
3088 | means that any integer portion of `zSig0' will be added into the exponent.
3089 | Since a properly normalized significand will have an integer portion equal
3090 | to 1, the `zExp' input should be 1 less than the desired result exponent
3091 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
3092 | significand.
3093 *----------------------------------------------------------------------------*/
3095 static inline float128
3096 packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
3098 float128 z;
3100 z.low = zSig1;
3101 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
3102 return z;
3106 /*----------------------------------------------------------------------------
3107 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3108 | and extended significand formed by the concatenation of `zSig0', `zSig1',
3109 | and `zSig2', and returns the proper quadruple-precision floating-point value
3110 | corresponding to the abstract input. Ordinarily, the abstract value is
3111 | simply rounded and packed into the quadruple-precision format, with the
3112 | inexact exception raised if the abstract input cannot be represented
3113 | exactly. However, if the abstract value is too large, the overflow and
3114 | inexact exceptions are raised and an infinity or maximal finite value is
3115 | returned. If the abstract value is too small, the input value is rounded to
3116 | a subnormal number, and the underflow and inexact exceptions are raised if
3117 | the abstract input cannot be represented exactly as a subnormal quadruple-
3118 | precision floating-point number.
3119 | The input significand must be normalized or smaller. If the input
3120 | significand is not normalized, `zExp' must be 0; in that case, the result
3121 | returned is a subnormal number, and it must not require rounding. In the
3122 | usual case that the input significand is normalized, `zExp' must be 1 less
3123 | than the ``true'' floating-point exponent. The handling of underflow and
3124 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3125 *----------------------------------------------------------------------------*/
3127 static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
3128 uint64_t zSig0, uint64_t zSig1,
3129 uint64_t zSig2, float_status *status)
3131 int8_t roundingMode;
3132 flag roundNearestEven, increment, isTiny;
3134 roundingMode = status->float_rounding_mode;
3135 roundNearestEven = ( roundingMode == float_round_nearest_even );
3136 switch (roundingMode) {
3137 case float_round_nearest_even:
3138 case float_round_ties_away:
3139 increment = ((int64_t)zSig2 < 0);
3140 break;
3141 case float_round_to_zero:
3142 increment = 0;
3143 break;
3144 case float_round_up:
3145 increment = !zSign && zSig2;
3146 break;
3147 case float_round_down:
3148 increment = zSign && zSig2;
3149 break;
3150 case float_round_to_odd:
3151 increment = !(zSig1 & 0x1) && zSig2;
3152 break;
3153 default:
3154 abort();
3156 if ( 0x7FFD <= (uint32_t) zExp ) {
3157 if ( ( 0x7FFD < zExp )
3158 || ( ( zExp == 0x7FFD )
3159 && eq128(
3160 LIT64( 0x0001FFFFFFFFFFFF ),
3161 LIT64( 0xFFFFFFFFFFFFFFFF ),
3162 zSig0,
3163 zSig1
3165 && increment
3168 float_raise(float_flag_overflow | float_flag_inexact, status);
3169 if ( ( roundingMode == float_round_to_zero )
3170 || ( zSign && ( roundingMode == float_round_up ) )
3171 || ( ! zSign && ( roundingMode == float_round_down ) )
3172 || (roundingMode == float_round_to_odd)
3174 return
3175 packFloat128(
3176 zSign,
3177 0x7FFE,
3178 LIT64( 0x0000FFFFFFFFFFFF ),
3179 LIT64( 0xFFFFFFFFFFFFFFFF )
3182 return packFloat128( zSign, 0x7FFF, 0, 0 );
3184 if ( zExp < 0 ) {
3185 if (status->flush_to_zero) {
3186 float_raise(float_flag_output_denormal, status);
3187 return packFloat128(zSign, 0, 0, 0);
3189 isTiny =
3190 (status->float_detect_tininess
3191 == float_tininess_before_rounding)
3192 || ( zExp < -1 )
3193 || ! increment
3194 || lt128(
3195 zSig0,
3196 zSig1,
3197 LIT64( 0x0001FFFFFFFFFFFF ),
3198 LIT64( 0xFFFFFFFFFFFFFFFF )
3200 shift128ExtraRightJamming(
3201 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
3202 zExp = 0;
3203 if (isTiny && zSig2) {
3204 float_raise(float_flag_underflow, status);
3206 switch (roundingMode) {
3207 case float_round_nearest_even:
3208 case float_round_ties_away:
3209 increment = ((int64_t)zSig2 < 0);
3210 break;
3211 case float_round_to_zero:
3212 increment = 0;
3213 break;
3214 case float_round_up:
3215 increment = !zSign && zSig2;
3216 break;
3217 case float_round_down:
3218 increment = zSign && zSig2;
3219 break;
3220 case float_round_to_odd:
3221 increment = !(zSig1 & 0x1) && zSig2;
3222 break;
3223 default:
3224 abort();
3228 if (zSig2) {
3229 status->float_exception_flags |= float_flag_inexact;
3231 if ( increment ) {
3232 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
3233 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
3235 else {
3236 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
3238 return packFloat128( zSign, zExp, zSig0, zSig1 );
3242 /*----------------------------------------------------------------------------
3243 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3244 | and significand formed by the concatenation of `zSig0' and `zSig1', and
3245 | returns the proper quadruple-precision floating-point value corresponding
3246 | to the abstract input. This routine is just like `roundAndPackFloat128'
3247 | except that the input significand has fewer bits and does not have to be
3248 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
3249 | point exponent.
3250 *----------------------------------------------------------------------------*/
3252 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
3253 uint64_t zSig0, uint64_t zSig1,
3254 float_status *status)
3256 int8_t shiftCount;
3257 uint64_t zSig2;
3259 if ( zSig0 == 0 ) {
3260 zSig0 = zSig1;
3261 zSig1 = 0;
3262 zExp -= 64;
3264 shiftCount = countLeadingZeros64( zSig0 ) - 15;
3265 if ( 0 <= shiftCount ) {
3266 zSig2 = 0;
3267 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3269 else {
3270 shift128ExtraRightJamming(
3271 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
3273 zExp -= shiftCount;
3274 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
3279 /*----------------------------------------------------------------------------
3280 | Returns the result of converting the 32-bit two's complement integer `a'
3281 | to the extended double-precision floating-point format. The conversion
3282 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3283 | Arithmetic.
3284 *----------------------------------------------------------------------------*/
3286 floatx80 int32_to_floatx80(int32_t a, float_status *status)
3288 flag zSign;
3289 uint32_t absA;
3290 int8_t shiftCount;
3291 uint64_t zSig;
3293 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3294 zSign = ( a < 0 );
3295 absA = zSign ? - a : a;
3296 shiftCount = countLeadingZeros32( absA ) + 32;
3297 zSig = absA;
3298 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
3302 /*----------------------------------------------------------------------------
3303 | Returns the result of converting the 32-bit two's complement integer `a' to
3304 | the quadruple-precision floating-point format. The conversion is performed
3305 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3306 *----------------------------------------------------------------------------*/
3308 float128 int32_to_float128(int32_t a, float_status *status)
3310 flag zSign;
3311 uint32_t absA;
3312 int8_t shiftCount;
3313 uint64_t zSig0;
3315 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3316 zSign = ( a < 0 );
3317 absA = zSign ? - a : a;
3318 shiftCount = countLeadingZeros32( absA ) + 17;
3319 zSig0 = absA;
3320 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
3324 /*----------------------------------------------------------------------------
3325 | Returns the result of converting the 64-bit two's complement integer `a'
3326 | to the extended double-precision floating-point format. The conversion
3327 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3328 | Arithmetic.
3329 *----------------------------------------------------------------------------*/
3331 floatx80 int64_to_floatx80(int64_t a, float_status *status)
3333 flag zSign;
3334 uint64_t absA;
3335 int8_t shiftCount;
3337 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3338 zSign = ( a < 0 );
3339 absA = zSign ? - a : a;
3340 shiftCount = countLeadingZeros64( absA );
3341 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
3345 /*----------------------------------------------------------------------------
3346 | Returns the result of converting the 64-bit two's complement integer `a' to
3347 | the quadruple-precision floating-point format. The conversion is performed
3348 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3349 *----------------------------------------------------------------------------*/
3351 float128 int64_to_float128(int64_t a, float_status *status)
3353 flag zSign;
3354 uint64_t absA;
3355 int8_t shiftCount;
3356 int32_t zExp;
3357 uint64_t zSig0, zSig1;
3359 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3360 zSign = ( a < 0 );
3361 absA = zSign ? - a : a;
3362 shiftCount = countLeadingZeros64( absA ) + 49;
3363 zExp = 0x406E - shiftCount;
3364 if ( 64 <= shiftCount ) {
3365 zSig1 = 0;
3366 zSig0 = absA;
3367 shiftCount -= 64;
3369 else {
3370 zSig1 = absA;
3371 zSig0 = 0;
3373 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3374 return packFloat128( zSign, zExp, zSig0, zSig1 );
3378 /*----------------------------------------------------------------------------
3379 | Returns the result of converting the 64-bit unsigned integer `a'
3380 | to the quadruple-precision floating-point format. The conversion is performed
3381 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3382 *----------------------------------------------------------------------------*/
3384 float128 uint64_to_float128(uint64_t a, float_status *status)
3386 if (a == 0) {
3387 return float128_zero;
3389 return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status);
3392 /*----------------------------------------------------------------------------
3393 | Returns the result of converting the single-precision floating-point value
3394 | `a' to the extended double-precision floating-point format. The conversion
3395 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3396 | Arithmetic.
3397 *----------------------------------------------------------------------------*/
3399 floatx80 float32_to_floatx80(float32 a, float_status *status)
3401 flag aSign;
3402 int aExp;
3403 uint32_t aSig;
3405 a = float32_squash_input_denormal(a, status);
3406 aSig = extractFloat32Frac( a );
3407 aExp = extractFloat32Exp( a );
3408 aSign = extractFloat32Sign( a );
3409 if ( aExp == 0xFF ) {
3410 if (aSig) {
3411 return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
3413 return packFloatx80(aSign,
3414 floatx80_infinity_high,
3415 floatx80_infinity_low);
3417 if ( aExp == 0 ) {
3418 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3419 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3421 aSig |= 0x00800000;
3422 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
3426 /*----------------------------------------------------------------------------
3427 | Returns the result of converting the single-precision floating-point value
3428 | `a' to the double-precision floating-point format. The conversion is
3429 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3430 | Arithmetic.
3431 *----------------------------------------------------------------------------*/
3433 float128 float32_to_float128(float32 a, float_status *status)
3435 flag aSign;
3436 int aExp;
3437 uint32_t aSig;
3439 a = float32_squash_input_denormal(a, status);
3440 aSig = extractFloat32Frac( a );
3441 aExp = extractFloat32Exp( a );
3442 aSign = extractFloat32Sign( a );
3443 if ( aExp == 0xFF ) {
3444 if (aSig) {
3445 return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
3447 return packFloat128( aSign, 0x7FFF, 0, 0 );
3449 if ( aExp == 0 ) {
3450 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3451 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3452 --aExp;
3454 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
3458 /*----------------------------------------------------------------------------
3459 | Returns the remainder of the single-precision floating-point value `a'
3460 | with respect to the corresponding value `b'. The operation is performed
3461 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3462 *----------------------------------------------------------------------------*/
3464 float32 float32_rem(float32 a, float32 b, float_status *status)
3466 flag aSign, zSign;
3467 int aExp, bExp, expDiff;
3468 uint32_t aSig, bSig;
3469 uint32_t q;
3470 uint64_t aSig64, bSig64, q64;
3471 uint32_t alternateASig;
3472 int32_t sigMean;
3473 a = float32_squash_input_denormal(a, status);
3474 b = float32_squash_input_denormal(b, status);
3476 aSig = extractFloat32Frac( a );
3477 aExp = extractFloat32Exp( a );
3478 aSign = extractFloat32Sign( a );
3479 bSig = extractFloat32Frac( b );
3480 bExp = extractFloat32Exp( b );
3481 if ( aExp == 0xFF ) {
3482 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
3483 return propagateFloat32NaN(a, b, status);
3485 float_raise(float_flag_invalid, status);
3486 return float32_default_nan(status);
3488 if ( bExp == 0xFF ) {
3489 if (bSig) {
3490 return propagateFloat32NaN(a, b, status);
3492 return a;
3494 if ( bExp == 0 ) {
3495 if ( bSig == 0 ) {
3496 float_raise(float_flag_invalid, status);
3497 return float32_default_nan(status);
3499 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
3501 if ( aExp == 0 ) {
3502 if ( aSig == 0 ) return a;
3503 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3505 expDiff = aExp - bExp;
3506 aSig |= 0x00800000;
3507 bSig |= 0x00800000;
3508 if ( expDiff < 32 ) {
3509 aSig <<= 8;
3510 bSig <<= 8;
3511 if ( expDiff < 0 ) {
3512 if ( expDiff < -1 ) return a;
3513 aSig >>= 1;
3515 q = ( bSig <= aSig );
3516 if ( q ) aSig -= bSig;
3517 if ( 0 < expDiff ) {
3518 q = ( ( (uint64_t) aSig )<<32 ) / bSig;
3519 q >>= 32 - expDiff;
3520 bSig >>= 2;
3521 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3523 else {
3524 aSig >>= 2;
3525 bSig >>= 2;
3528 else {
3529 if ( bSig <= aSig ) aSig -= bSig;
3530 aSig64 = ( (uint64_t) aSig )<<40;
3531 bSig64 = ( (uint64_t) bSig )<<40;
3532 expDiff -= 64;
3533 while ( 0 < expDiff ) {
3534 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3535 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3536 aSig64 = - ( ( bSig * q64 )<<38 );
3537 expDiff -= 62;
3539 expDiff += 64;
3540 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3541 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3542 q = q64>>( 64 - expDiff );
3543 bSig <<= 6;
3544 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
3546 do {
3547 alternateASig = aSig;
3548 ++q;
3549 aSig -= bSig;
3550 } while ( 0 <= (int32_t) aSig );
3551 sigMean = aSig + alternateASig;
3552 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3553 aSig = alternateASig;
3555 zSign = ( (int32_t) aSig < 0 );
3556 if ( zSign ) aSig = - aSig;
3557 return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
3562 /*----------------------------------------------------------------------------
3563 | Returns the binary exponential of the single-precision floating-point value
3564 | `a'. The operation is performed according to the IEC/IEEE Standard for
3565 | Binary Floating-Point Arithmetic.
3567 | Uses the following identities:
3569 | 1. -------------------------------------------------------------------------
3570 | x x*ln(2)
3571 | 2 = e
3573 | 2. -------------------------------------------------------------------------
3574 | 2 3 4 5 n
3575 | x x x x x x x
3576 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
3577 | 1! 2! 3! 4! 5! n!
3578 *----------------------------------------------------------------------------*/
3580 static const float64 float32_exp2_coefficients[15] =
3582 const_float64( 0x3ff0000000000000ll ), /* 1 */
3583 const_float64( 0x3fe0000000000000ll ), /* 2 */
3584 const_float64( 0x3fc5555555555555ll ), /* 3 */
3585 const_float64( 0x3fa5555555555555ll ), /* 4 */
3586 const_float64( 0x3f81111111111111ll ), /* 5 */
3587 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */
3588 const_float64( 0x3f2a01a01a01a01all ), /* 7 */
3589 const_float64( 0x3efa01a01a01a01all ), /* 8 */
3590 const_float64( 0x3ec71de3a556c734ll ), /* 9 */
3591 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
3592 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
3593 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
3594 const_float64( 0x3de6124613a86d09ll ), /* 13 */
3595 const_float64( 0x3da93974a8c07c9dll ), /* 14 */
3596 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
3599 float32 float32_exp2(float32 a, float_status *status)
3601 flag aSign;
3602 int aExp;
3603 uint32_t aSig;
3604 float64 r, x, xn;
3605 int i;
3606 a = float32_squash_input_denormal(a, status);
3608 aSig = extractFloat32Frac( a );
3609 aExp = extractFloat32Exp( a );
3610 aSign = extractFloat32Sign( a );
3612 if ( aExp == 0xFF) {
3613 if (aSig) {
3614 return propagateFloat32NaN(a, float32_zero, status);
3616 return (aSign) ? float32_zero : a;
3618 if (aExp == 0) {
3619 if (aSig == 0) return float32_one;
3622 float_raise(float_flag_inexact, status);
3624 /* ******************************* */
3625 /* using float64 for approximation */
3626 /* ******************************* */
3627 x = float32_to_float64(a, status);
3628 x = float64_mul(x, float64_ln2, status);
3630 xn = x;
3631 r = float64_one;
3632 for (i = 0 ; i < 15 ; i++) {
3633 float64 f;
3635 f = float64_mul(xn, float32_exp2_coefficients[i], status);
3636 r = float64_add(r, f, status);
3638 xn = float64_mul(xn, x, status);
3641 return float64_to_float32(r, status);
3644 /*----------------------------------------------------------------------------
3645 | Returns the binary log of the single-precision floating-point value `a'.
3646 | The operation is performed according to the IEC/IEEE Standard for Binary
3647 | Floating-Point Arithmetic.
3648 *----------------------------------------------------------------------------*/
3649 float32 float32_log2(float32 a, float_status *status)
3651 flag aSign, zSign;
3652 int aExp;
3653 uint32_t aSig, zSig, i;
3655 a = float32_squash_input_denormal(a, status);
3656 aSig = extractFloat32Frac( a );
3657 aExp = extractFloat32Exp( a );
3658 aSign = extractFloat32Sign( a );
3660 if ( aExp == 0 ) {
3661 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
3662 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3664 if ( aSign ) {
3665 float_raise(float_flag_invalid, status);
3666 return float32_default_nan(status);
3668 if ( aExp == 0xFF ) {
3669 if (aSig) {
3670 return propagateFloat32NaN(a, float32_zero, status);
3672 return a;
3675 aExp -= 0x7F;
3676 aSig |= 0x00800000;
3677 zSign = aExp < 0;
3678 zSig = aExp << 23;
3680 for (i = 1 << 22; i > 0; i >>= 1) {
3681 aSig = ( (uint64_t)aSig * aSig ) >> 23;
3682 if ( aSig & 0x01000000 ) {
3683 aSig >>= 1;
3684 zSig |= i;
3688 if ( zSign )
3689 zSig = -zSig;
3691 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
3694 /*----------------------------------------------------------------------------
3695 | Returns 1 if the single-precision floating-point value `a' is equal to
3696 | the corresponding value `b', and 0 otherwise. The invalid exception is
3697 | raised if either operand is a NaN. Otherwise, the comparison is performed
3698 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3699 *----------------------------------------------------------------------------*/
3701 int float32_eq(float32 a, float32 b, float_status *status)
3703 uint32_t av, bv;
3704 a = float32_squash_input_denormal(a, status);
3705 b = float32_squash_input_denormal(b, status);
3707 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3708 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3710 float_raise(float_flag_invalid, status);
3711 return 0;
3713 av = float32_val(a);
3714 bv = float32_val(b);
3715 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3718 /*----------------------------------------------------------------------------
3719 | Returns 1 if the single-precision floating-point value `a' is less than
3720 | or equal to the corresponding value `b', and 0 otherwise. The invalid
3721 | exception is raised if either operand is a NaN. The comparison is performed
3722 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3723 *----------------------------------------------------------------------------*/
3725 int float32_le(float32 a, float32 b, float_status *status)
3727 flag aSign, bSign;
3728 uint32_t av, bv;
3729 a = float32_squash_input_denormal(a, status);
3730 b = float32_squash_input_denormal(b, status);
3732 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3733 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3735 float_raise(float_flag_invalid, status);
3736 return 0;
3738 aSign = extractFloat32Sign( a );
3739 bSign = extractFloat32Sign( b );
3740 av = float32_val(a);
3741 bv = float32_val(b);
3742 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3743 return ( av == bv ) || ( aSign ^ ( av < bv ) );
3747 /*----------------------------------------------------------------------------
3748 | Returns 1 if the single-precision floating-point value `a' is less than
3749 | the corresponding value `b', and 0 otherwise. The invalid exception is
3750 | raised if either operand is a NaN. The comparison is performed according
3751 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3752 *----------------------------------------------------------------------------*/
3754 int float32_lt(float32 a, float32 b, float_status *status)
3756 flag aSign, bSign;
3757 uint32_t av, bv;
3758 a = float32_squash_input_denormal(a, status);
3759 b = float32_squash_input_denormal(b, status);
3761 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3762 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3764 float_raise(float_flag_invalid, status);
3765 return 0;
3767 aSign = extractFloat32Sign( a );
3768 bSign = extractFloat32Sign( b );
3769 av = float32_val(a);
3770 bv = float32_val(b);
3771 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3772 return ( av != bv ) && ( aSign ^ ( av < bv ) );
3776 /*----------------------------------------------------------------------------
3777 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3778 | be compared, and 0 otherwise. The invalid exception is raised if either
3779 | operand is a NaN. The comparison is performed according to the IEC/IEEE
3780 | Standard for Binary Floating-Point Arithmetic.
3781 *----------------------------------------------------------------------------*/
3783 int float32_unordered(float32 a, float32 b, float_status *status)
3785 a = float32_squash_input_denormal(a, status);
3786 b = float32_squash_input_denormal(b, status);
3788 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3789 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3791 float_raise(float_flag_invalid, status);
3792 return 1;
3794 return 0;
3797 /*----------------------------------------------------------------------------
3798 | Returns 1 if the single-precision floating-point value `a' is equal to
3799 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3800 | exception. The comparison is performed according to the IEC/IEEE Standard
3801 | for Binary Floating-Point Arithmetic.
3802 *----------------------------------------------------------------------------*/
3804 int float32_eq_quiet(float32 a, float32 b, float_status *status)
3806 a = float32_squash_input_denormal(a, status);
3807 b = float32_squash_input_denormal(b, status);
3809 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3810 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3812 if (float32_is_signaling_nan(a, status)
3813 || float32_is_signaling_nan(b, status)) {
3814 float_raise(float_flag_invalid, status);
3816 return 0;
3818 return ( float32_val(a) == float32_val(b) ) ||
3819 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
3822 /*----------------------------------------------------------------------------
3823 | Returns 1 if the single-precision floating-point value `a' is less than or
3824 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3825 | cause an exception. Otherwise, the comparison is performed according to the
3826 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3827 *----------------------------------------------------------------------------*/
3829 int float32_le_quiet(float32 a, float32 b, float_status *status)
3831 flag aSign, bSign;
3832 uint32_t av, bv;
3833 a = float32_squash_input_denormal(a, status);
3834 b = float32_squash_input_denormal(b, status);
3836 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3837 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3839 if (float32_is_signaling_nan(a, status)
3840 || float32_is_signaling_nan(b, status)) {
3841 float_raise(float_flag_invalid, status);
3843 return 0;
3845 aSign = extractFloat32Sign( a );
3846 bSign = extractFloat32Sign( b );
3847 av = float32_val(a);
3848 bv = float32_val(b);
3849 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3850 return ( av == bv ) || ( aSign ^ ( av < bv ) );
3854 /*----------------------------------------------------------------------------
3855 | Returns 1 if the single-precision floating-point value `a' is less than
3856 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3857 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
3858 | Standard for Binary Floating-Point Arithmetic.
3859 *----------------------------------------------------------------------------*/
3861 int float32_lt_quiet(float32 a, float32 b, float_status *status)
3863 flag aSign, bSign;
3864 uint32_t av, bv;
3865 a = float32_squash_input_denormal(a, status);
3866 b = float32_squash_input_denormal(b, status);
3868 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3869 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3871 if (float32_is_signaling_nan(a, status)
3872 || float32_is_signaling_nan(b, status)) {
3873 float_raise(float_flag_invalid, status);
3875 return 0;
3877 aSign = extractFloat32Sign( a );
3878 bSign = extractFloat32Sign( b );
3879 av = float32_val(a);
3880 bv = float32_val(b);
3881 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3882 return ( av != bv ) && ( aSign ^ ( av < bv ) );
3886 /*----------------------------------------------------------------------------
3887 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3888 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
3889 | comparison is performed according to the IEC/IEEE Standard for Binary
3890 | Floating-Point Arithmetic.
3891 *----------------------------------------------------------------------------*/
3893 int float32_unordered_quiet(float32 a, float32 b, float_status *status)
3895 a = float32_squash_input_denormal(a, status);
3896 b = float32_squash_input_denormal(b, status);
3898 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3899 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3901 if (float32_is_signaling_nan(a, status)
3902 || float32_is_signaling_nan(b, status)) {
3903 float_raise(float_flag_invalid, status);
3905 return 1;
3907 return 0;
3910 /*----------------------------------------------------------------------------
3911 | If `a' is denormal and we are in flush-to-zero mode then set the
3912 | input-denormal exception and return zero. Otherwise just return the value.
3913 *----------------------------------------------------------------------------*/
3914 float16 float16_squash_input_denormal(float16 a, float_status *status)
3916 if (status->flush_inputs_to_zero) {
3917 if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) {
3918 float_raise(float_flag_input_denormal, status);
3919 return make_float16(float16_val(a) & 0x8000);
3922 return a;
3925 /*----------------------------------------------------------------------------
3926 | Returns the result of converting the double-precision floating-point value
3927 | `a' to the extended double-precision floating-point format. The conversion
3928 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3929 | Arithmetic.
3930 *----------------------------------------------------------------------------*/
3932 floatx80 float64_to_floatx80(float64 a, float_status *status)
3934 flag aSign;
3935 int aExp;
3936 uint64_t aSig;
3938 a = float64_squash_input_denormal(a, status);
3939 aSig = extractFloat64Frac( a );
3940 aExp = extractFloat64Exp( a );
3941 aSign = extractFloat64Sign( a );
3942 if ( aExp == 0x7FF ) {
3943 if (aSig) {
3944 return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
3946 return packFloatx80(aSign,
3947 floatx80_infinity_high,
3948 floatx80_infinity_low);
3950 if ( aExp == 0 ) {
3951 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3952 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3954 return
3955 packFloatx80(
3956 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
3960 /*----------------------------------------------------------------------------
3961 | Returns the result of converting the double-precision floating-point value
3962 | `a' to the quadruple-precision floating-point format. The conversion is
3963 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3964 | Arithmetic.
3965 *----------------------------------------------------------------------------*/
3967 float128 float64_to_float128(float64 a, float_status *status)
3969 flag aSign;
3970 int aExp;
3971 uint64_t aSig, zSig0, zSig1;
3973 a = float64_squash_input_denormal(a, status);
3974 aSig = extractFloat64Frac( a );
3975 aExp = extractFloat64Exp( a );
3976 aSign = extractFloat64Sign( a );
3977 if ( aExp == 0x7FF ) {
3978 if (aSig) {
3979 return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
3981 return packFloat128( aSign, 0x7FFF, 0, 0 );
3983 if ( aExp == 0 ) {
3984 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3985 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3986 --aExp;
3988 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
3989 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
3994 /*----------------------------------------------------------------------------
3995 | Returns the remainder of the double-precision floating-point value `a'
3996 | with respect to the corresponding value `b'. The operation is performed
3997 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3998 *----------------------------------------------------------------------------*/
4000 float64 float64_rem(float64 a, float64 b, float_status *status)
4002 flag aSign, zSign;
4003 int aExp, bExp, expDiff;
4004 uint64_t aSig, bSig;
4005 uint64_t q, alternateASig;
4006 int64_t sigMean;
4008 a = float64_squash_input_denormal(a, status);
4009 b = float64_squash_input_denormal(b, status);
4010 aSig = extractFloat64Frac( a );
4011 aExp = extractFloat64Exp( a );
4012 aSign = extractFloat64Sign( a );
4013 bSig = extractFloat64Frac( b );
4014 bExp = extractFloat64Exp( b );
4015 if ( aExp == 0x7FF ) {
4016 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4017 return propagateFloat64NaN(a, b, status);
4019 float_raise(float_flag_invalid, status);
4020 return float64_default_nan(status);
4022 if ( bExp == 0x7FF ) {
4023 if (bSig) {
4024 return propagateFloat64NaN(a, b, status);
4026 return a;
4028 if ( bExp == 0 ) {
4029 if ( bSig == 0 ) {
4030 float_raise(float_flag_invalid, status);
4031 return float64_default_nan(status);
4033 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4035 if ( aExp == 0 ) {
4036 if ( aSig == 0 ) return a;
4037 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4039 expDiff = aExp - bExp;
4040 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
4041 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4042 if ( expDiff < 0 ) {
4043 if ( expDiff < -1 ) return a;
4044 aSig >>= 1;
4046 q = ( bSig <= aSig );
4047 if ( q ) aSig -= bSig;
4048 expDiff -= 64;
4049 while ( 0 < expDiff ) {
4050 q = estimateDiv128To64( aSig, 0, bSig );
4051 q = ( 2 < q ) ? q - 2 : 0;
4052 aSig = - ( ( bSig>>2 ) * q );
4053 expDiff -= 62;
4055 expDiff += 64;
4056 if ( 0 < expDiff ) {
4057 q = estimateDiv128To64( aSig, 0, bSig );
4058 q = ( 2 < q ) ? q - 2 : 0;
4059 q >>= 64 - expDiff;
4060 bSig >>= 2;
4061 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4063 else {
4064 aSig >>= 2;
4065 bSig >>= 2;
4067 do {
4068 alternateASig = aSig;
4069 ++q;
4070 aSig -= bSig;
4071 } while ( 0 <= (int64_t) aSig );
4072 sigMean = aSig + alternateASig;
4073 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4074 aSig = alternateASig;
4076 zSign = ( (int64_t) aSig < 0 );
4077 if ( zSign ) aSig = - aSig;
4078 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
4082 /*----------------------------------------------------------------------------
4083 | Returns the binary log of the double-precision floating-point value `a'.
4084 | The operation is performed according to the IEC/IEEE Standard for Binary
4085 | Floating-Point Arithmetic.
4086 *----------------------------------------------------------------------------*/
4087 float64 float64_log2(float64 a, float_status *status)
4089 flag aSign, zSign;
4090 int aExp;
4091 uint64_t aSig, aSig0, aSig1, zSig, i;
4092 a = float64_squash_input_denormal(a, status);
4094 aSig = extractFloat64Frac( a );
4095 aExp = extractFloat64Exp( a );
4096 aSign = extractFloat64Sign( a );
4098 if ( aExp == 0 ) {
4099 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4100 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4102 if ( aSign ) {
4103 float_raise(float_flag_invalid, status);
4104 return float64_default_nan(status);
4106 if ( aExp == 0x7FF ) {
4107 if (aSig) {
4108 return propagateFloat64NaN(a, float64_zero, status);
4110 return a;
4113 aExp -= 0x3FF;
4114 aSig |= LIT64( 0x0010000000000000 );
4115 zSign = aExp < 0;
4116 zSig = (uint64_t)aExp << 52;
4117 for (i = 1LL << 51; i > 0; i >>= 1) {
4118 mul64To128( aSig, aSig, &aSig0, &aSig1 );
4119 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4120 if ( aSig & LIT64( 0x0020000000000000 ) ) {
4121 aSig >>= 1;
4122 zSig |= i;
4126 if ( zSign )
4127 zSig = -zSig;
4128 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
4131 /*----------------------------------------------------------------------------
4132 | Returns 1 if the double-precision floating-point value `a' is equal to the
4133 | corresponding value `b', and 0 otherwise. The invalid exception is raised
4134 | if either operand is a NaN. Otherwise, the comparison is performed
4135 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4136 *----------------------------------------------------------------------------*/
4138 int float64_eq(float64 a, float64 b, float_status *status)
4140 uint64_t av, bv;
4141 a = float64_squash_input_denormal(a, status);
4142 b = float64_squash_input_denormal(b, status);
4144 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4145 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4147 float_raise(float_flag_invalid, status);
4148 return 0;
4150 av = float64_val(a);
4151 bv = float64_val(b);
4152 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4156 /*----------------------------------------------------------------------------
4157 | Returns 1 if the double-precision floating-point value `a' is less than or
4158 | equal to the corresponding value `b', and 0 otherwise. The invalid
4159 | exception is raised if either operand is a NaN. The comparison is performed
4160 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4161 *----------------------------------------------------------------------------*/
4163 int float64_le(float64 a, float64 b, float_status *status)
4165 flag aSign, bSign;
4166 uint64_t av, bv;
4167 a = float64_squash_input_denormal(a, status);
4168 b = float64_squash_input_denormal(b, status);
4170 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4171 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4173 float_raise(float_flag_invalid, status);
4174 return 0;
4176 aSign = extractFloat64Sign( a );
4177 bSign = extractFloat64Sign( b );
4178 av = float64_val(a);
4179 bv = float64_val(b);
4180 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4181 return ( av == bv ) || ( aSign ^ ( av < bv ) );
4185 /*----------------------------------------------------------------------------
4186 | Returns 1 if the double-precision floating-point value `a' is less than
4187 | the corresponding value `b', and 0 otherwise. The invalid exception is
4188 | raised if either operand is a NaN. The comparison is performed according
4189 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4190 *----------------------------------------------------------------------------*/
4192 int float64_lt(float64 a, float64 b, float_status *status)
4194 flag aSign, bSign;
4195 uint64_t av, bv;
4197 a = float64_squash_input_denormal(a, status);
4198 b = float64_squash_input_denormal(b, status);
4199 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4200 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4202 float_raise(float_flag_invalid, status);
4203 return 0;
4205 aSign = extractFloat64Sign( a );
4206 bSign = extractFloat64Sign( b );
4207 av = float64_val(a);
4208 bv = float64_val(b);
4209 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4210 return ( av != bv ) && ( aSign ^ ( av < bv ) );
4214 /*----------------------------------------------------------------------------
4215 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4216 | be compared, and 0 otherwise. The invalid exception is raised if either
4217 | operand is a NaN. The comparison is performed according to the IEC/IEEE
4218 | Standard for Binary Floating-Point Arithmetic.
4219 *----------------------------------------------------------------------------*/
4221 int float64_unordered(float64 a, float64 b, float_status *status)
4223 a = float64_squash_input_denormal(a, status);
4224 b = float64_squash_input_denormal(b, status);
4226 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4227 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4229 float_raise(float_flag_invalid, status);
4230 return 1;
4232 return 0;
4235 /*----------------------------------------------------------------------------
4236 | Returns 1 if the double-precision floating-point value `a' is equal to the
4237 | corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4238 | exception.The comparison is performed according to the IEC/IEEE Standard
4239 | for Binary Floating-Point Arithmetic.
4240 *----------------------------------------------------------------------------*/
4242 int float64_eq_quiet(float64 a, float64 b, float_status *status)
4244 uint64_t av, bv;
4245 a = float64_squash_input_denormal(a, status);
4246 b = float64_squash_input_denormal(b, status);
4248 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4249 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4251 if (float64_is_signaling_nan(a, status)
4252 || float64_is_signaling_nan(b, status)) {
4253 float_raise(float_flag_invalid, status);
4255 return 0;
4257 av = float64_val(a);
4258 bv = float64_val(b);
4259 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4263 /*----------------------------------------------------------------------------
4264 | Returns 1 if the double-precision floating-point value `a' is less than or
4265 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4266 | cause an exception. Otherwise, the comparison is performed according to the
4267 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4268 *----------------------------------------------------------------------------*/
4270 int float64_le_quiet(float64 a, float64 b, float_status *status)
4272 flag aSign, bSign;
4273 uint64_t av, bv;
4274 a = float64_squash_input_denormal(a, status);
4275 b = float64_squash_input_denormal(b, status);
4277 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4278 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4280 if (float64_is_signaling_nan(a, status)
4281 || float64_is_signaling_nan(b, status)) {
4282 float_raise(float_flag_invalid, status);
4284 return 0;
4286 aSign = extractFloat64Sign( a );
4287 bSign = extractFloat64Sign( b );
4288 av = float64_val(a);
4289 bv = float64_val(b);
4290 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4291 return ( av == bv ) || ( aSign ^ ( av < bv ) );
4295 /*----------------------------------------------------------------------------
4296 | Returns 1 if the double-precision floating-point value `a' is less than
4297 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4298 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
4299 | Standard for Binary Floating-Point Arithmetic.
4300 *----------------------------------------------------------------------------*/
4302 int float64_lt_quiet(float64 a, float64 b, float_status *status)
4304 flag aSign, bSign;
4305 uint64_t av, bv;
4306 a = float64_squash_input_denormal(a, status);
4307 b = float64_squash_input_denormal(b, status);
4309 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4310 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4312 if (float64_is_signaling_nan(a, status)
4313 || float64_is_signaling_nan(b, status)) {
4314 float_raise(float_flag_invalid, status);
4316 return 0;
4318 aSign = extractFloat64Sign( a );
4319 bSign = extractFloat64Sign( b );
4320 av = float64_val(a);
4321 bv = float64_val(b);
4322 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4323 return ( av != bv ) && ( aSign ^ ( av < bv ) );
4327 /*----------------------------------------------------------------------------
4328 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4329 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
4330 | comparison is performed according to the IEC/IEEE Standard for Binary
4331 | Floating-Point Arithmetic.
4332 *----------------------------------------------------------------------------*/
4334 int float64_unordered_quiet(float64 a, float64 b, float_status *status)
4336 a = float64_squash_input_denormal(a, status);
4337 b = float64_squash_input_denormal(b, status);
4339 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4340 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4342 if (float64_is_signaling_nan(a, status)
4343 || float64_is_signaling_nan(b, status)) {
4344 float_raise(float_flag_invalid, status);
4346 return 1;
4348 return 0;
4351 /*----------------------------------------------------------------------------
4352 | Returns the result of converting the extended double-precision floating-
4353 | point value `a' to the 32-bit two's complement integer format. The
4354 | conversion is performed according to the IEC/IEEE Standard for Binary
4355 | Floating-Point Arithmetic---which means in particular that the conversion
4356 | is rounded according to the current rounding mode. If `a' is a NaN, the
4357 | largest positive integer is returned. Otherwise, if the conversion
4358 | overflows, the largest integer with the same sign as `a' is returned.
4359 *----------------------------------------------------------------------------*/
4361 int32_t floatx80_to_int32(floatx80 a, float_status *status)
4363 flag aSign;
4364 int32_t aExp, shiftCount;
4365 uint64_t aSig;
4367 if (floatx80_invalid_encoding(a)) {
4368 float_raise(float_flag_invalid, status);
4369 return 1 << 31;
4371 aSig = extractFloatx80Frac( a );
4372 aExp = extractFloatx80Exp( a );
4373 aSign = extractFloatx80Sign( a );
4374 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4375 shiftCount = 0x4037 - aExp;
4376 if ( shiftCount <= 0 ) shiftCount = 1;
4377 shift64RightJamming( aSig, shiftCount, &aSig );
4378 return roundAndPackInt32(aSign, aSig, status);
4382 /*----------------------------------------------------------------------------
4383 | Returns the result of converting the extended double-precision floating-
4384 | point value `a' to the 32-bit two's complement integer format. The
4385 | conversion is performed according to the IEC/IEEE Standard for Binary
4386 | Floating-Point Arithmetic, except that the conversion is always rounded
4387 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4388 | Otherwise, if the conversion overflows, the largest integer with the same
4389 | sign as `a' is returned.
4390 *----------------------------------------------------------------------------*/
4392 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
4394 flag aSign;
4395 int32_t aExp, shiftCount;
4396 uint64_t aSig, savedASig;
4397 int32_t z;
4399 if (floatx80_invalid_encoding(a)) {
4400 float_raise(float_flag_invalid, status);
4401 return 1 << 31;
4403 aSig = extractFloatx80Frac( a );
4404 aExp = extractFloatx80Exp( a );
4405 aSign = extractFloatx80Sign( a );
4406 if ( 0x401E < aExp ) {
4407 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4408 goto invalid;
4410 else if ( aExp < 0x3FFF ) {
4411 if (aExp || aSig) {
4412 status->float_exception_flags |= float_flag_inexact;
4414 return 0;
4416 shiftCount = 0x403E - aExp;
4417 savedASig = aSig;
4418 aSig >>= shiftCount;
4419 z = aSig;
4420 if ( aSign ) z = - z;
4421 if ( ( z < 0 ) ^ aSign ) {
4422 invalid:
4423 float_raise(float_flag_invalid, status);
4424 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4426 if ( ( aSig<<shiftCount ) != savedASig ) {
4427 status->float_exception_flags |= float_flag_inexact;
4429 return z;
4433 /*----------------------------------------------------------------------------
4434 | Returns the result of converting the extended double-precision floating-
4435 | point value `a' to the 64-bit two's complement integer format. The
4436 | conversion is performed according to the IEC/IEEE Standard for Binary
4437 | Floating-Point Arithmetic---which means in particular that the conversion
4438 | is rounded according to the current rounding mode. If `a' is a NaN,
4439 | the largest positive integer is returned. Otherwise, if the conversion
4440 | overflows, the largest integer with the same sign as `a' is returned.
4441 *----------------------------------------------------------------------------*/
4443 int64_t floatx80_to_int64(floatx80 a, float_status *status)
4445 flag aSign;
4446 int32_t aExp, shiftCount;
4447 uint64_t aSig, aSigExtra;
4449 if (floatx80_invalid_encoding(a)) {
4450 float_raise(float_flag_invalid, status);
4451 return 1ULL << 63;
4453 aSig = extractFloatx80Frac( a );
4454 aExp = extractFloatx80Exp( a );
4455 aSign = extractFloatx80Sign( a );
4456 shiftCount = 0x403E - aExp;
4457 if ( shiftCount <= 0 ) {
4458 if ( shiftCount ) {
4459 float_raise(float_flag_invalid, status);
4460 if (!aSign || floatx80_is_any_nan(a)) {
4461 return LIT64( 0x7FFFFFFFFFFFFFFF );
4463 return (int64_t) LIT64( 0x8000000000000000 );
4465 aSigExtra = 0;
4467 else {
4468 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4470 return roundAndPackInt64(aSign, aSig, aSigExtra, status);
4474 /*----------------------------------------------------------------------------
4475 | Returns the result of converting the extended double-precision floating-
4476 | point value `a' to the 64-bit two's complement integer format. The
4477 | conversion is performed according to the IEC/IEEE Standard for Binary
4478 | Floating-Point Arithmetic, except that the conversion is always rounded
4479 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4480 | Otherwise, if the conversion overflows, the largest integer with the same
4481 | sign as `a' is returned.
4482 *----------------------------------------------------------------------------*/
4484 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
4486 flag aSign;
4487 int32_t aExp, shiftCount;
4488 uint64_t aSig;
4489 int64_t z;
4491 if (floatx80_invalid_encoding(a)) {
4492 float_raise(float_flag_invalid, status);
4493 return 1ULL << 63;
4495 aSig = extractFloatx80Frac( a );
4496 aExp = extractFloatx80Exp( a );
4497 aSign = extractFloatx80Sign( a );
4498 shiftCount = aExp - 0x403E;
4499 if ( 0 <= shiftCount ) {
4500 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4501 if ( ( a.high != 0xC03E ) || aSig ) {
4502 float_raise(float_flag_invalid, status);
4503 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4504 return LIT64( 0x7FFFFFFFFFFFFFFF );
4507 return (int64_t) LIT64( 0x8000000000000000 );
4509 else if ( aExp < 0x3FFF ) {
4510 if (aExp | aSig) {
4511 status->float_exception_flags |= float_flag_inexact;
4513 return 0;
4515 z = aSig>>( - shiftCount );
4516 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4517 status->float_exception_flags |= float_flag_inexact;
4519 if ( aSign ) z = - z;
4520 return z;
4524 /*----------------------------------------------------------------------------
4525 | Returns the result of converting the extended double-precision floating-
4526 | point value `a' to the single-precision floating-point format. The
4527 | conversion is performed according to the IEC/IEEE Standard for Binary
4528 | Floating-Point Arithmetic.
4529 *----------------------------------------------------------------------------*/
4531 float32 floatx80_to_float32(floatx80 a, float_status *status)
4533 flag aSign;
4534 int32_t aExp;
4535 uint64_t aSig;
4537 if (floatx80_invalid_encoding(a)) {
4538 float_raise(float_flag_invalid, status);
4539 return float32_default_nan(status);
4541 aSig = extractFloatx80Frac( a );
4542 aExp = extractFloatx80Exp( a );
4543 aSign = extractFloatx80Sign( a );
4544 if ( aExp == 0x7FFF ) {
4545 if ( (uint64_t) ( aSig<<1 ) ) {
4546 return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
4548 return packFloat32( aSign, 0xFF, 0 );
4550 shift64RightJamming( aSig, 33, &aSig );
4551 if ( aExp || aSig ) aExp -= 0x3F81;
4552 return roundAndPackFloat32(aSign, aExp, aSig, status);
4556 /*----------------------------------------------------------------------------
4557 | Returns the result of converting the extended double-precision floating-
4558 | point value `a' to the double-precision floating-point format. The
4559 | conversion is performed according to the IEC/IEEE Standard for Binary
4560 | Floating-Point Arithmetic.
4561 *----------------------------------------------------------------------------*/
4563 float64 floatx80_to_float64(floatx80 a, float_status *status)
4565 flag aSign;
4566 int32_t aExp;
4567 uint64_t aSig, zSig;
4569 if (floatx80_invalid_encoding(a)) {
4570 float_raise(float_flag_invalid, status);
4571 return float64_default_nan(status);
4573 aSig = extractFloatx80Frac( a );
4574 aExp = extractFloatx80Exp( a );
4575 aSign = extractFloatx80Sign( a );
4576 if ( aExp == 0x7FFF ) {
4577 if ( (uint64_t) ( aSig<<1 ) ) {
4578 return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
4580 return packFloat64( aSign, 0x7FF, 0 );
4582 shift64RightJamming( aSig, 1, &zSig );
4583 if ( aExp || aSig ) aExp -= 0x3C01;
4584 return roundAndPackFloat64(aSign, aExp, zSig, status);
4588 /*----------------------------------------------------------------------------
4589 | Returns the result of converting the extended double-precision floating-
4590 | point value `a' to the quadruple-precision floating-point format. The
4591 | conversion is performed according to the IEC/IEEE Standard for Binary
4592 | Floating-Point Arithmetic.
4593 *----------------------------------------------------------------------------*/
4595 float128 floatx80_to_float128(floatx80 a, float_status *status)
4597 flag aSign;
4598 int aExp;
4599 uint64_t aSig, zSig0, zSig1;
4601 if (floatx80_invalid_encoding(a)) {
4602 float_raise(float_flag_invalid, status);
4603 return float128_default_nan(status);
4605 aSig = extractFloatx80Frac( a );
4606 aExp = extractFloatx80Exp( a );
4607 aSign = extractFloatx80Sign( a );
4608 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4609 return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
4611 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4612 return packFloat128( aSign, aExp, zSig0, zSig1 );
4616 /*----------------------------------------------------------------------------
4617 | Rounds the extended double-precision floating-point value `a'
4618 | to the precision provided by floatx80_rounding_precision and returns the
4619 | result as an extended double-precision floating-point value.
4620 | The operation is performed according to the IEC/IEEE Standard for Binary
4621 | Floating-Point Arithmetic.
4622 *----------------------------------------------------------------------------*/
4624 floatx80 floatx80_round(floatx80 a, float_status *status)
4626 return roundAndPackFloatx80(status->floatx80_rounding_precision,
4627 extractFloatx80Sign(a),
4628 extractFloatx80Exp(a),
4629 extractFloatx80Frac(a), 0, status);
4632 /*----------------------------------------------------------------------------
4633 | Rounds the extended double-precision floating-point value `a' to an integer,
4634 | and returns the result as an extended quadruple-precision floating-point
4635 | value. The operation is performed according to the IEC/IEEE Standard for
4636 | Binary Floating-Point Arithmetic.
4637 *----------------------------------------------------------------------------*/
4639 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
4641 flag aSign;
4642 int32_t aExp;
4643 uint64_t lastBitMask, roundBitsMask;
4644 floatx80 z;
4646 if (floatx80_invalid_encoding(a)) {
4647 float_raise(float_flag_invalid, status);
4648 return floatx80_default_nan(status);
4650 aExp = extractFloatx80Exp( a );
4651 if ( 0x403E <= aExp ) {
4652 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4653 return propagateFloatx80NaN(a, a, status);
4655 return a;
4657 if ( aExp < 0x3FFF ) {
4658 if ( ( aExp == 0 )
4659 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4660 return a;
4662 status->float_exception_flags |= float_flag_inexact;
4663 aSign = extractFloatx80Sign( a );
4664 switch (status->float_rounding_mode) {
4665 case float_round_nearest_even:
4666 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4668 return
4669 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4671 break;
4672 case float_round_ties_away:
4673 if (aExp == 0x3FFE) {
4674 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
4676 break;
4677 case float_round_down:
4678 return
4679 aSign ?
4680 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4681 : packFloatx80( 0, 0, 0 );
4682 case float_round_up:
4683 return
4684 aSign ? packFloatx80( 1, 0, 0 )
4685 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4687 return packFloatx80( aSign, 0, 0 );
4689 lastBitMask = 1;
4690 lastBitMask <<= 0x403E - aExp;
4691 roundBitsMask = lastBitMask - 1;
4692 z = a;
4693 switch (status->float_rounding_mode) {
4694 case float_round_nearest_even:
4695 z.low += lastBitMask>>1;
4696 if ((z.low & roundBitsMask) == 0) {
4697 z.low &= ~lastBitMask;
4699 break;
4700 case float_round_ties_away:
4701 z.low += lastBitMask >> 1;
4702 break;
4703 case float_round_to_zero:
4704 break;
4705 case float_round_up:
4706 if (!extractFloatx80Sign(z)) {
4707 z.low += roundBitsMask;
4709 break;
4710 case float_round_down:
4711 if (extractFloatx80Sign(z)) {
4712 z.low += roundBitsMask;
4714 break;
4715 default:
4716 abort();
4718 z.low &= ~ roundBitsMask;
4719 if ( z.low == 0 ) {
4720 ++z.high;
4721 z.low = LIT64( 0x8000000000000000 );
4723 if (z.low != a.low) {
4724 status->float_exception_flags |= float_flag_inexact;
4726 return z;
4730 /*----------------------------------------------------------------------------
4731 | Returns the result of adding the absolute values of the extended double-
4732 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
4733 | negated before being returned. `zSign' is ignored if the result is a NaN.
4734 | The addition is performed according to the IEC/IEEE Standard for Binary
4735 | Floating-Point Arithmetic.
4736 *----------------------------------------------------------------------------*/
4738 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4739 float_status *status)
4741 int32_t aExp, bExp, zExp;
4742 uint64_t aSig, bSig, zSig0, zSig1;
4743 int32_t expDiff;
4745 aSig = extractFloatx80Frac( a );
4746 aExp = extractFloatx80Exp( a );
4747 bSig = extractFloatx80Frac( b );
4748 bExp = extractFloatx80Exp( b );
4749 expDiff = aExp - bExp;
4750 if ( 0 < expDiff ) {
4751 if ( aExp == 0x7FFF ) {
4752 if ((uint64_t)(aSig << 1)) {
4753 return propagateFloatx80NaN(a, b, status);
4755 return a;
4757 if ( bExp == 0 ) --expDiff;
4758 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4759 zExp = aExp;
4761 else if ( expDiff < 0 ) {
4762 if ( bExp == 0x7FFF ) {
4763 if ((uint64_t)(bSig << 1)) {
4764 return propagateFloatx80NaN(a, b, status);
4766 return packFloatx80(zSign,
4767 floatx80_infinity_high,
4768 floatx80_infinity_low);
4770 if ( aExp == 0 ) ++expDiff;
4771 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4772 zExp = bExp;
4774 else {
4775 if ( aExp == 0x7FFF ) {
4776 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4777 return propagateFloatx80NaN(a, b, status);
4779 return a;
4781 zSig1 = 0;
4782 zSig0 = aSig + bSig;
4783 if ( aExp == 0 ) {
4784 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4785 goto roundAndPack;
4787 zExp = aExp;
4788 goto shiftRight1;
4790 zSig0 = aSig + bSig;
4791 if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
4792 shiftRight1:
4793 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4794 zSig0 |= LIT64( 0x8000000000000000 );
4795 ++zExp;
4796 roundAndPack:
4797 return roundAndPackFloatx80(status->floatx80_rounding_precision,
4798 zSign, zExp, zSig0, zSig1, status);
4801 /*----------------------------------------------------------------------------
4802 | Returns the result of subtracting the absolute values of the extended
4803 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
4804 | difference is negated before being returned. `zSign' is ignored if the
4805 | result is a NaN. The subtraction is performed according to the IEC/IEEE
4806 | Standard for Binary Floating-Point Arithmetic.
4807 *----------------------------------------------------------------------------*/
4809 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4810 float_status *status)
4812 int32_t aExp, bExp, zExp;
4813 uint64_t aSig, bSig, zSig0, zSig1;
4814 int32_t expDiff;
4816 aSig = extractFloatx80Frac( a );
4817 aExp = extractFloatx80Exp( a );
4818 bSig = extractFloatx80Frac( b );
4819 bExp = extractFloatx80Exp( b );
4820 expDiff = aExp - bExp;
4821 if ( 0 < expDiff ) goto aExpBigger;
4822 if ( expDiff < 0 ) goto bExpBigger;
4823 if ( aExp == 0x7FFF ) {
4824 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4825 return propagateFloatx80NaN(a, b, status);
4827 float_raise(float_flag_invalid, status);
4828 return floatx80_default_nan(status);
4830 if ( aExp == 0 ) {
4831 aExp = 1;
4832 bExp = 1;
4834 zSig1 = 0;
4835 if ( bSig < aSig ) goto aBigger;
4836 if ( aSig < bSig ) goto bBigger;
4837 return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
4838 bExpBigger:
4839 if ( bExp == 0x7FFF ) {
4840 if ((uint64_t)(bSig << 1)) {
4841 return propagateFloatx80NaN(a, b, status);
4843 return packFloatx80(zSign ^ 1, floatx80_infinity_high,
4844 floatx80_infinity_low);
4846 if ( aExp == 0 ) ++expDiff;
4847 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4848 bBigger:
4849 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4850 zExp = bExp;
4851 zSign ^= 1;
4852 goto normalizeRoundAndPack;
4853 aExpBigger:
4854 if ( aExp == 0x7FFF ) {
4855 if ((uint64_t)(aSig << 1)) {
4856 return propagateFloatx80NaN(a, b, status);
4858 return a;
4860 if ( bExp == 0 ) --expDiff;
4861 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4862 aBigger:
4863 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4864 zExp = aExp;
4865 normalizeRoundAndPack:
4866 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
4867 zSign, zExp, zSig0, zSig1, status);
4870 /*----------------------------------------------------------------------------
4871 | Returns the result of adding the extended double-precision floating-point
4872 | values `a' and `b'. The operation is performed according to the IEC/IEEE
4873 | Standard for Binary Floating-Point Arithmetic.
4874 *----------------------------------------------------------------------------*/
4876 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
4878 flag aSign, bSign;
4880 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
4881 float_raise(float_flag_invalid, status);
4882 return floatx80_default_nan(status);
4884 aSign = extractFloatx80Sign( a );
4885 bSign = extractFloatx80Sign( b );
4886 if ( aSign == bSign ) {
4887 return addFloatx80Sigs(a, b, aSign, status);
4889 else {
4890 return subFloatx80Sigs(a, b, aSign, status);
4895 /*----------------------------------------------------------------------------
4896 | Returns the result of subtracting the extended double-precision floating-
4897 | point values `a' and `b'. The operation is performed according to the
4898 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4899 *----------------------------------------------------------------------------*/
4901 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
4903 flag aSign, bSign;
4905 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
4906 float_raise(float_flag_invalid, status);
4907 return floatx80_default_nan(status);
4909 aSign = extractFloatx80Sign( a );
4910 bSign = extractFloatx80Sign( b );
4911 if ( aSign == bSign ) {
4912 return subFloatx80Sigs(a, b, aSign, status);
4914 else {
4915 return addFloatx80Sigs(a, b, aSign, status);
4920 /*----------------------------------------------------------------------------
4921 | Returns the result of multiplying the extended double-precision floating-
4922 | point values `a' and `b'. The operation is performed according to the
4923 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4924 *----------------------------------------------------------------------------*/
4926 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
4928 flag aSign, bSign, zSign;
4929 int32_t aExp, bExp, zExp;
4930 uint64_t aSig, bSig, zSig0, zSig1;
4932 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
4933 float_raise(float_flag_invalid, status);
4934 return floatx80_default_nan(status);
4936 aSig = extractFloatx80Frac( a );
4937 aExp = extractFloatx80Exp( a );
4938 aSign = extractFloatx80Sign( a );
4939 bSig = extractFloatx80Frac( b );
4940 bExp = extractFloatx80Exp( b );
4941 bSign = extractFloatx80Sign( b );
4942 zSign = aSign ^ bSign;
4943 if ( aExp == 0x7FFF ) {
4944 if ( (uint64_t) ( aSig<<1 )
4945 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
4946 return propagateFloatx80NaN(a, b, status);
4948 if ( ( bExp | bSig ) == 0 ) goto invalid;
4949 return packFloatx80(zSign, floatx80_infinity_high,
4950 floatx80_infinity_low);
4952 if ( bExp == 0x7FFF ) {
4953 if ((uint64_t)(bSig << 1)) {
4954 return propagateFloatx80NaN(a, b, status);
4956 if ( ( aExp | aSig ) == 0 ) {
4957 invalid:
4958 float_raise(float_flag_invalid, status);
4959 return floatx80_default_nan(status);
4961 return packFloatx80(zSign, floatx80_infinity_high,
4962 floatx80_infinity_low);
4964 if ( aExp == 0 ) {
4965 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4966 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4968 if ( bExp == 0 ) {
4969 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
4970 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4972 zExp = aExp + bExp - 0x3FFE;
4973 mul64To128( aSig, bSig, &zSig0, &zSig1 );
4974 if ( 0 < (int64_t) zSig0 ) {
4975 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
4976 --zExp;
4978 return roundAndPackFloatx80(status->floatx80_rounding_precision,
4979 zSign, zExp, zSig0, zSig1, status);
4982 /*----------------------------------------------------------------------------
4983 | Returns the result of dividing the extended double-precision floating-point
4984 | value `a' by the corresponding value `b'. The operation is performed
4985 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4986 *----------------------------------------------------------------------------*/
4988 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
4990 flag aSign, bSign, zSign;
4991 int32_t aExp, bExp, zExp;
4992 uint64_t aSig, bSig, zSig0, zSig1;
4993 uint64_t rem0, rem1, rem2, term0, term1, term2;
4995 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
4996 float_raise(float_flag_invalid, status);
4997 return floatx80_default_nan(status);
4999 aSig = extractFloatx80Frac( a );
5000 aExp = extractFloatx80Exp( a );
5001 aSign = extractFloatx80Sign( a );
5002 bSig = extractFloatx80Frac( b );
5003 bExp = extractFloatx80Exp( b );
5004 bSign = extractFloatx80Sign( b );
5005 zSign = aSign ^ bSign;
5006 if ( aExp == 0x7FFF ) {
5007 if ((uint64_t)(aSig << 1)) {
5008 return propagateFloatx80NaN(a, b, status);
5010 if ( bExp == 0x7FFF ) {
5011 if ((uint64_t)(bSig << 1)) {
5012 return propagateFloatx80NaN(a, b, status);
5014 goto invalid;
5016 return packFloatx80(zSign, floatx80_infinity_high,
5017 floatx80_infinity_low);
5019 if ( bExp == 0x7FFF ) {
5020 if ((uint64_t)(bSig << 1)) {
5021 return propagateFloatx80NaN(a, b, status);
5023 return packFloatx80( zSign, 0, 0 );
5025 if ( bExp == 0 ) {
5026 if ( bSig == 0 ) {
5027 if ( ( aExp | aSig ) == 0 ) {
5028 invalid:
5029 float_raise(float_flag_invalid, status);
5030 return floatx80_default_nan(status);
5032 float_raise(float_flag_divbyzero, status);
5033 return packFloatx80(zSign, floatx80_infinity_high,
5034 floatx80_infinity_low);
5036 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5038 if ( aExp == 0 ) {
5039 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5040 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5042 zExp = aExp - bExp + 0x3FFE;
5043 rem1 = 0;
5044 if ( bSig <= aSig ) {
5045 shift128Right( aSig, 0, 1, &aSig, &rem1 );
5046 ++zExp;
5048 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5049 mul64To128( bSig, zSig0, &term0, &term1 );
5050 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
5051 while ( (int64_t) rem0 < 0 ) {
5052 --zSig0;
5053 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5055 zSig1 = estimateDiv128To64( rem1, 0, bSig );
5056 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
5057 mul64To128( bSig, zSig1, &term1, &term2 );
5058 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5059 while ( (int64_t) rem1 < 0 ) {
5060 --zSig1;
5061 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5063 zSig1 |= ( ( rem1 | rem2 ) != 0 );
5065 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5066 zSign, zExp, zSig0, zSig1, status);
5069 /*----------------------------------------------------------------------------
5070 | Returns the remainder of the extended double-precision floating-point value
5071 | `a' with respect to the corresponding value `b'. The operation is performed
5072 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5073 *----------------------------------------------------------------------------*/
5075 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
5077 flag aSign, zSign;
5078 int32_t aExp, bExp, expDiff;
5079 uint64_t aSig0, aSig1, bSig;
5080 uint64_t q, term0, term1, alternateASig0, alternateASig1;
5082 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5083 float_raise(float_flag_invalid, status);
5084 return floatx80_default_nan(status);
5086 aSig0 = extractFloatx80Frac( a );
5087 aExp = extractFloatx80Exp( a );
5088 aSign = extractFloatx80Sign( a );
5089 bSig = extractFloatx80Frac( b );
5090 bExp = extractFloatx80Exp( b );
5091 if ( aExp == 0x7FFF ) {
5092 if ( (uint64_t) ( aSig0<<1 )
5093 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5094 return propagateFloatx80NaN(a, b, status);
5096 goto invalid;
5098 if ( bExp == 0x7FFF ) {
5099 if ((uint64_t)(bSig << 1)) {
5100 return propagateFloatx80NaN(a, b, status);
5102 return a;
5104 if ( bExp == 0 ) {
5105 if ( bSig == 0 ) {
5106 invalid:
5107 float_raise(float_flag_invalid, status);
5108 return floatx80_default_nan(status);
5110 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5112 if ( aExp == 0 ) {
5113 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5114 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5116 bSig |= LIT64( 0x8000000000000000 );
5117 zSign = aSign;
5118 expDiff = aExp - bExp;
5119 aSig1 = 0;
5120 if ( expDiff < 0 ) {
5121 if ( expDiff < -1 ) return a;
5122 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5123 expDiff = 0;
5125 q = ( bSig <= aSig0 );
5126 if ( q ) aSig0 -= bSig;
5127 expDiff -= 64;
5128 while ( 0 < expDiff ) {
5129 q = estimateDiv128To64( aSig0, aSig1, bSig );
5130 q = ( 2 < q ) ? q - 2 : 0;
5131 mul64To128( bSig, q, &term0, &term1 );
5132 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5133 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5134 expDiff -= 62;
5136 expDiff += 64;
5137 if ( 0 < expDiff ) {
5138 q = estimateDiv128To64( aSig0, aSig1, bSig );
5139 q = ( 2 < q ) ? q - 2 : 0;
5140 q >>= 64 - expDiff;
5141 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5142 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5143 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5144 while ( le128( term0, term1, aSig0, aSig1 ) ) {
5145 ++q;
5146 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5149 else {
5150 term1 = 0;
5151 term0 = bSig;
5153 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5154 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5155 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5156 && ( q & 1 ) )
5158 aSig0 = alternateASig0;
5159 aSig1 = alternateASig1;
5160 zSign = ! zSign;
5162 return
5163 normalizeRoundAndPackFloatx80(
5164 80, zSign, bExp + expDiff, aSig0, aSig1, status);
5168 /*----------------------------------------------------------------------------
5169 | Returns the square root of the extended double-precision floating-point
5170 | value `a'. The operation is performed according to the IEC/IEEE Standard
5171 | for Binary Floating-Point Arithmetic.
5172 *----------------------------------------------------------------------------*/
5174 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
5176 flag aSign;
5177 int32_t aExp, zExp;
5178 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5179 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5181 if (floatx80_invalid_encoding(a)) {
5182 float_raise(float_flag_invalid, status);
5183 return floatx80_default_nan(status);
5185 aSig0 = extractFloatx80Frac( a );
5186 aExp = extractFloatx80Exp( a );
5187 aSign = extractFloatx80Sign( a );
5188 if ( aExp == 0x7FFF ) {
5189 if ((uint64_t)(aSig0 << 1)) {
5190 return propagateFloatx80NaN(a, a, status);
5192 if ( ! aSign ) return a;
5193 goto invalid;
5195 if ( aSign ) {
5196 if ( ( aExp | aSig0 ) == 0 ) return a;
5197 invalid:
5198 float_raise(float_flag_invalid, status);
5199 return floatx80_default_nan(status);
5201 if ( aExp == 0 ) {
5202 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5203 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5205 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5206 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5207 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5208 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5209 doubleZSig0 = zSig0<<1;
5210 mul64To128( zSig0, zSig0, &term0, &term1 );
5211 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5212 while ( (int64_t) rem0 < 0 ) {
5213 --zSig0;
5214 doubleZSig0 -= 2;
5215 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5217 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5218 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5219 if ( zSig1 == 0 ) zSig1 = 1;
5220 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5221 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5222 mul64To128( zSig1, zSig1, &term2, &term3 );
5223 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5224 while ( (int64_t) rem1 < 0 ) {
5225 --zSig1;
5226 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5227 term3 |= 1;
5228 term2 |= doubleZSig0;
5229 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5231 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5233 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5234 zSig0 |= doubleZSig0;
5235 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5236 0, zExp, zSig0, zSig1, status);
5239 /*----------------------------------------------------------------------------
5240 | Returns 1 if the extended double-precision floating-point value `a' is equal
5241 | to the corresponding value `b', and 0 otherwise. The invalid exception is
5242 | raised if either operand is a NaN. Otherwise, the comparison is performed
5243 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5244 *----------------------------------------------------------------------------*/
5246 int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
5249 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5250 || (extractFloatx80Exp(a) == 0x7FFF
5251 && (uint64_t) (extractFloatx80Frac(a) << 1))
5252 || (extractFloatx80Exp(b) == 0x7FFF
5253 && (uint64_t) (extractFloatx80Frac(b) << 1))
5255 float_raise(float_flag_invalid, status);
5256 return 0;
5258 return
5259 ( a.low == b.low )
5260 && ( ( a.high == b.high )
5261 || ( ( a.low == 0 )
5262 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5267 /*----------------------------------------------------------------------------
5268 | Returns 1 if the extended double-precision floating-point value `a' is
5269 | less than or equal to the corresponding value `b', and 0 otherwise. The
5270 | invalid exception is raised if either operand is a NaN. The comparison is
5271 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5272 | Arithmetic.
5273 *----------------------------------------------------------------------------*/
5275 int floatx80_le(floatx80 a, floatx80 b, float_status *status)
5277 flag aSign, bSign;
5279 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5280 || (extractFloatx80Exp(a) == 0x7FFF
5281 && (uint64_t) (extractFloatx80Frac(a) << 1))
5282 || (extractFloatx80Exp(b) == 0x7FFF
5283 && (uint64_t) (extractFloatx80Frac(b) << 1))
5285 float_raise(float_flag_invalid, status);
5286 return 0;
5288 aSign = extractFloatx80Sign( a );
5289 bSign = extractFloatx80Sign( b );
5290 if ( aSign != bSign ) {
5291 return
5292 aSign
5293 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5294 == 0 );
5296 return
5297 aSign ? le128( b.high, b.low, a.high, a.low )
5298 : le128( a.high, a.low, b.high, b.low );
5302 /*----------------------------------------------------------------------------
5303 | Returns 1 if the extended double-precision floating-point value `a' is
5304 | less than the corresponding value `b', and 0 otherwise. The invalid
5305 | exception is raised if either operand is a NaN. The comparison is performed
5306 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5307 *----------------------------------------------------------------------------*/
5309 int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
5311 flag aSign, bSign;
5313 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5314 || (extractFloatx80Exp(a) == 0x7FFF
5315 && (uint64_t) (extractFloatx80Frac(a) << 1))
5316 || (extractFloatx80Exp(b) == 0x7FFF
5317 && (uint64_t) (extractFloatx80Frac(b) << 1))
5319 float_raise(float_flag_invalid, status);
5320 return 0;
5322 aSign = extractFloatx80Sign( a );
5323 bSign = extractFloatx80Sign( b );
5324 if ( aSign != bSign ) {
5325 return
5326 aSign
5327 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5328 != 0 );
5330 return
5331 aSign ? lt128( b.high, b.low, a.high, a.low )
5332 : lt128( a.high, a.low, b.high, b.low );
5336 /*----------------------------------------------------------------------------
5337 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5338 | cannot be compared, and 0 otherwise. The invalid exception is raised if
5339 | either operand is a NaN. The comparison is performed according to the
5340 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5341 *----------------------------------------------------------------------------*/
5342 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
5344 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5345 || (extractFloatx80Exp(a) == 0x7FFF
5346 && (uint64_t) (extractFloatx80Frac(a) << 1))
5347 || (extractFloatx80Exp(b) == 0x7FFF
5348 && (uint64_t) (extractFloatx80Frac(b) << 1))
5350 float_raise(float_flag_invalid, status);
5351 return 1;
5353 return 0;
5356 /*----------------------------------------------------------------------------
5357 | Returns 1 if the extended double-precision floating-point value `a' is
5358 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5359 | cause an exception. The comparison is performed according to the IEC/IEEE
5360 | Standard for Binary Floating-Point Arithmetic.
5361 *----------------------------------------------------------------------------*/
5363 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
5366 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5367 float_raise(float_flag_invalid, status);
5368 return 0;
5370 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5371 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5372 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5373 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5375 if (floatx80_is_signaling_nan(a, status)
5376 || floatx80_is_signaling_nan(b, status)) {
5377 float_raise(float_flag_invalid, status);
5379 return 0;
5381 return
5382 ( a.low == b.low )
5383 && ( ( a.high == b.high )
5384 || ( ( a.low == 0 )
5385 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5390 /*----------------------------------------------------------------------------
5391 | Returns 1 if the extended double-precision floating-point value `a' is less
5392 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
5393 | do not cause an exception. Otherwise, the comparison is performed according
5394 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5395 *----------------------------------------------------------------------------*/
5397 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
5399 flag aSign, bSign;
5401 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5402 float_raise(float_flag_invalid, status);
5403 return 0;
5405 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5406 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5407 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5408 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5410 if (floatx80_is_signaling_nan(a, status)
5411 || floatx80_is_signaling_nan(b, status)) {
5412 float_raise(float_flag_invalid, status);
5414 return 0;
5416 aSign = extractFloatx80Sign( a );
5417 bSign = extractFloatx80Sign( b );
5418 if ( aSign != bSign ) {
5419 return
5420 aSign
5421 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5422 == 0 );
5424 return
5425 aSign ? le128( b.high, b.low, a.high, a.low )
5426 : le128( a.high, a.low, b.high, b.low );
5430 /*----------------------------------------------------------------------------
5431 | Returns 1 if the extended double-precision floating-point value `a' is less
5432 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
5433 | an exception. Otherwise, the comparison is performed according to the
5434 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5435 *----------------------------------------------------------------------------*/
5437 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
5439 flag aSign, bSign;
5441 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5442 float_raise(float_flag_invalid, status);
5443 return 0;
5445 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5446 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5447 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5448 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5450 if (floatx80_is_signaling_nan(a, status)
5451 || floatx80_is_signaling_nan(b, status)) {
5452 float_raise(float_flag_invalid, status);
5454 return 0;
5456 aSign = extractFloatx80Sign( a );
5457 bSign = extractFloatx80Sign( b );
5458 if ( aSign != bSign ) {
5459 return
5460 aSign
5461 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5462 != 0 );
5464 return
5465 aSign ? lt128( b.high, b.low, a.high, a.low )
5466 : lt128( a.high, a.low, b.high, b.low );
5470 /*----------------------------------------------------------------------------
5471 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5472 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
5473 | The comparison is performed according to the IEC/IEEE Standard for Binary
5474 | Floating-Point Arithmetic.
5475 *----------------------------------------------------------------------------*/
5476 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
5478 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5479 float_raise(float_flag_invalid, status);
5480 return 1;
5482 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5483 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5484 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5485 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5487 if (floatx80_is_signaling_nan(a, status)
5488 || floatx80_is_signaling_nan(b, status)) {
5489 float_raise(float_flag_invalid, status);
5491 return 1;
5493 return 0;
5496 /*----------------------------------------------------------------------------
5497 | Returns the result of converting the quadruple-precision floating-point
5498 | value `a' to the 32-bit two's complement integer format. The conversion
5499 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5500 | Arithmetic---which means in particular that the conversion is rounded
5501 | according to the current rounding mode. If `a' is a NaN, the largest
5502 | positive integer is returned. Otherwise, if the conversion overflows, the
5503 | largest integer with the same sign as `a' is returned.
5504 *----------------------------------------------------------------------------*/
5506 int32_t float128_to_int32(float128 a, float_status *status)
5508 flag aSign;
5509 int32_t aExp, shiftCount;
5510 uint64_t aSig0, aSig1;
5512 aSig1 = extractFloat128Frac1( a );
5513 aSig0 = extractFloat128Frac0( a );
5514 aExp = extractFloat128Exp( a );
5515 aSign = extractFloat128Sign( a );
5516 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5517 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5518 aSig0 |= ( aSig1 != 0 );
5519 shiftCount = 0x4028 - aExp;
5520 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5521 return roundAndPackInt32(aSign, aSig0, status);
5525 /*----------------------------------------------------------------------------
5526 | Returns the result of converting the quadruple-precision floating-point
5527 | value `a' to the 32-bit two's complement integer format. The conversion
5528 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5529 | Arithmetic, except that the conversion is always rounded toward zero. If
5530 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
5531 | conversion overflows, the largest integer with the same sign as `a' is
5532 | returned.
5533 *----------------------------------------------------------------------------*/
5535 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
5537 flag aSign;
5538 int32_t aExp, shiftCount;
5539 uint64_t aSig0, aSig1, savedASig;
5540 int32_t z;
5542 aSig1 = extractFloat128Frac1( a );
5543 aSig0 = extractFloat128Frac0( a );
5544 aExp = extractFloat128Exp( a );
5545 aSign = extractFloat128Sign( a );
5546 aSig0 |= ( aSig1 != 0 );
5547 if ( 0x401E < aExp ) {
5548 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5549 goto invalid;
5551 else if ( aExp < 0x3FFF ) {
5552 if (aExp || aSig0) {
5553 status->float_exception_flags |= float_flag_inexact;
5555 return 0;
5557 aSig0 |= LIT64( 0x0001000000000000 );
5558 shiftCount = 0x402F - aExp;
5559 savedASig = aSig0;
5560 aSig0 >>= shiftCount;
5561 z = aSig0;
5562 if ( aSign ) z = - z;
5563 if ( ( z < 0 ) ^ aSign ) {
5564 invalid:
5565 float_raise(float_flag_invalid, status);
5566 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5568 if ( ( aSig0<<shiftCount ) != savedASig ) {
5569 status->float_exception_flags |= float_flag_inexact;
5571 return z;
5575 /*----------------------------------------------------------------------------
5576 | Returns the result of converting the quadruple-precision floating-point
5577 | value `a' to the 64-bit two's complement integer format. The conversion
5578 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5579 | Arithmetic---which means in particular that the conversion is rounded
5580 | according to the current rounding mode. If `a' is a NaN, the largest
5581 | positive integer is returned. Otherwise, if the conversion overflows, the
5582 | largest integer with the same sign as `a' is returned.
5583 *----------------------------------------------------------------------------*/
5585 int64_t float128_to_int64(float128 a, float_status *status)
5587 flag aSign;
5588 int32_t aExp, shiftCount;
5589 uint64_t aSig0, aSig1;
5591 aSig1 = extractFloat128Frac1( a );
5592 aSig0 = extractFloat128Frac0( a );
5593 aExp = extractFloat128Exp( a );
5594 aSign = extractFloat128Sign( a );
5595 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5596 shiftCount = 0x402F - aExp;
5597 if ( shiftCount <= 0 ) {
5598 if ( 0x403E < aExp ) {
5599 float_raise(float_flag_invalid, status);
5600 if ( ! aSign
5601 || ( ( aExp == 0x7FFF )
5602 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5605 return LIT64( 0x7FFFFFFFFFFFFFFF );
5607 return (int64_t) LIT64( 0x8000000000000000 );
5609 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5611 else {
5612 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5614 return roundAndPackInt64(aSign, aSig0, aSig1, status);
5618 /*----------------------------------------------------------------------------
5619 | Returns the result of converting the quadruple-precision floating-point
5620 | value `a' to the 64-bit two's complement integer format. The conversion
5621 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5622 | Arithmetic, except that the conversion is always rounded toward zero.
5623 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
5624 | the conversion overflows, the largest integer with the same sign as `a' is
5625 | returned.
5626 *----------------------------------------------------------------------------*/
5628 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
5630 flag aSign;
5631 int32_t aExp, shiftCount;
5632 uint64_t aSig0, aSig1;
5633 int64_t z;
5635 aSig1 = extractFloat128Frac1( a );
5636 aSig0 = extractFloat128Frac0( a );
5637 aExp = extractFloat128Exp( a );
5638 aSign = extractFloat128Sign( a );
5639 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5640 shiftCount = aExp - 0x402F;
5641 if ( 0 < shiftCount ) {
5642 if ( 0x403E <= aExp ) {
5643 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5644 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
5645 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5646 if (aSig1) {
5647 status->float_exception_flags |= float_flag_inexact;
5650 else {
5651 float_raise(float_flag_invalid, status);
5652 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5653 return LIT64( 0x7FFFFFFFFFFFFFFF );
5656 return (int64_t) LIT64( 0x8000000000000000 );
5658 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
5659 if ( (uint64_t) ( aSig1<<shiftCount ) ) {
5660 status->float_exception_flags |= float_flag_inexact;
5663 else {
5664 if ( aExp < 0x3FFF ) {
5665 if ( aExp | aSig0 | aSig1 ) {
5666 status->float_exception_flags |= float_flag_inexact;
5668 return 0;
5670 z = aSig0>>( - shiftCount );
5671 if ( aSig1
5672 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
5673 status->float_exception_flags |= float_flag_inexact;
5676 if ( aSign ) z = - z;
5677 return z;
5681 /*----------------------------------------------------------------------------
5682 | Returns the result of converting the quadruple-precision floating-point value
5683 | `a' to the 64-bit unsigned integer format. The conversion is
5684 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5685 | Arithmetic---which means in particular that the conversion is rounded
5686 | according to the current rounding mode. If `a' is a NaN, the largest
5687 | positive integer is returned. If the conversion overflows, the
5688 | largest unsigned integer is returned. If 'a' is negative, the value is
5689 | rounded and zero is returned; negative values that do not round to zero
5690 | will raise the inexact exception.
5691 *----------------------------------------------------------------------------*/
5693 uint64_t float128_to_uint64(float128 a, float_status *status)
5695 flag aSign;
5696 int aExp;
5697 int shiftCount;
5698 uint64_t aSig0, aSig1;
5700 aSig0 = extractFloat128Frac0(a);
5701 aSig1 = extractFloat128Frac1(a);
5702 aExp = extractFloat128Exp(a);
5703 aSign = extractFloat128Sign(a);
5704 if (aSign && (aExp > 0x3FFE)) {
5705 float_raise(float_flag_invalid, status);
5706 if (float128_is_any_nan(a)) {
5707 return LIT64(0xFFFFFFFFFFFFFFFF);
5708 } else {
5709 return 0;
5712 if (aExp) {
5713 aSig0 |= LIT64(0x0001000000000000);
5715 shiftCount = 0x402F - aExp;
5716 if (shiftCount <= 0) {
5717 if (0x403E < aExp) {
5718 float_raise(float_flag_invalid, status);
5719 return LIT64(0xFFFFFFFFFFFFFFFF);
5721 shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
5722 } else {
5723 shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
5725 return roundAndPackUint64(aSign, aSig0, aSig1, status);
5728 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
5730 uint64_t v;
5731 signed char current_rounding_mode = status->float_rounding_mode;
5733 set_float_rounding_mode(float_round_to_zero, status);
5734 v = float128_to_uint64(a, status);
5735 set_float_rounding_mode(current_rounding_mode, status);
5737 return v;
5740 /*----------------------------------------------------------------------------
5741 | Returns the result of converting the quadruple-precision floating-point
5742 | value `a' to the 32-bit unsigned integer format. The conversion
5743 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5744 | Arithmetic except that the conversion is always rounded toward zero.
5745 | If `a' is a NaN, the largest positive integer is returned. Otherwise,
5746 | if the conversion overflows, the largest unsigned integer is returned.
5747 | If 'a' is negative, the value is rounded and zero is returned; negative
5748 | values that do not round to zero will raise the inexact exception.
5749 *----------------------------------------------------------------------------*/
5751 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
5753 uint64_t v;
5754 uint32_t res;
5755 int old_exc_flags = get_float_exception_flags(status);
5757 v = float128_to_uint64_round_to_zero(a, status);
5758 if (v > 0xffffffff) {
5759 res = 0xffffffff;
5760 } else {
5761 return v;
5763 set_float_exception_flags(old_exc_flags, status);
5764 float_raise(float_flag_invalid, status);
5765 return res;
5768 /*----------------------------------------------------------------------------
5769 | Returns the result of converting the quadruple-precision floating-point
5770 | value `a' to the single-precision floating-point format. The conversion
5771 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5772 | Arithmetic.
5773 *----------------------------------------------------------------------------*/
5775 float32 float128_to_float32(float128 a, float_status *status)
5777 flag aSign;
5778 int32_t aExp;
5779 uint64_t aSig0, aSig1;
5780 uint32_t zSig;
5782 aSig1 = extractFloat128Frac1( a );
5783 aSig0 = extractFloat128Frac0( a );
5784 aExp = extractFloat128Exp( a );
5785 aSign = extractFloat128Sign( a );
5786 if ( aExp == 0x7FFF ) {
5787 if ( aSig0 | aSig1 ) {
5788 return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
5790 return packFloat32( aSign, 0xFF, 0 );
5792 aSig0 |= ( aSig1 != 0 );
5793 shift64RightJamming( aSig0, 18, &aSig0 );
5794 zSig = aSig0;
5795 if ( aExp || zSig ) {
5796 zSig |= 0x40000000;
5797 aExp -= 0x3F81;
5799 return roundAndPackFloat32(aSign, aExp, zSig, status);
5803 /*----------------------------------------------------------------------------
5804 | Returns the result of converting the quadruple-precision floating-point
5805 | value `a' to the double-precision floating-point format. The conversion
5806 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5807 | Arithmetic.
5808 *----------------------------------------------------------------------------*/
5810 float64 float128_to_float64(float128 a, float_status *status)
5812 flag aSign;
5813 int32_t aExp;
5814 uint64_t aSig0, aSig1;
5816 aSig1 = extractFloat128Frac1( a );
5817 aSig0 = extractFloat128Frac0( a );
5818 aExp = extractFloat128Exp( a );
5819 aSign = extractFloat128Sign( a );
5820 if ( aExp == 0x7FFF ) {
5821 if ( aSig0 | aSig1 ) {
5822 return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
5824 return packFloat64( aSign, 0x7FF, 0 );
5826 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5827 aSig0 |= ( aSig1 != 0 );
5828 if ( aExp || aSig0 ) {
5829 aSig0 |= LIT64( 0x4000000000000000 );
5830 aExp -= 0x3C01;
5832 return roundAndPackFloat64(aSign, aExp, aSig0, status);
5836 /*----------------------------------------------------------------------------
5837 | Returns the result of converting the quadruple-precision floating-point
5838 | value `a' to the extended double-precision floating-point format. The
5839 | conversion is performed according to the IEC/IEEE Standard for Binary
5840 | Floating-Point Arithmetic.
5841 *----------------------------------------------------------------------------*/
5843 floatx80 float128_to_floatx80(float128 a, float_status *status)
5845 flag aSign;
5846 int32_t aExp;
5847 uint64_t aSig0, aSig1;
5849 aSig1 = extractFloat128Frac1( a );
5850 aSig0 = extractFloat128Frac0( a );
5851 aExp = extractFloat128Exp( a );
5852 aSign = extractFloat128Sign( a );
5853 if ( aExp == 0x7FFF ) {
5854 if ( aSig0 | aSig1 ) {
5855 return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
5857 return packFloatx80(aSign, floatx80_infinity_high,
5858 floatx80_infinity_low);
5860 if ( aExp == 0 ) {
5861 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5862 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5864 else {
5865 aSig0 |= LIT64( 0x0001000000000000 );
5867 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
5868 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
5872 /*----------------------------------------------------------------------------
5873 | Rounds the quadruple-precision floating-point value `a' to an integer, and
5874 | returns the result as a quadruple-precision floating-point value. The
5875 | operation is performed according to the IEC/IEEE Standard for Binary
5876 | Floating-Point Arithmetic.
5877 *----------------------------------------------------------------------------*/
5879 float128 float128_round_to_int(float128 a, float_status *status)
5881 flag aSign;
5882 int32_t aExp;
5883 uint64_t lastBitMask, roundBitsMask;
5884 float128 z;
5886 aExp = extractFloat128Exp( a );
5887 if ( 0x402F <= aExp ) {
5888 if ( 0x406F <= aExp ) {
5889 if ( ( aExp == 0x7FFF )
5890 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
5892 return propagateFloat128NaN(a, a, status);
5894 return a;
5896 lastBitMask = 1;
5897 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
5898 roundBitsMask = lastBitMask - 1;
5899 z = a;
5900 switch (status->float_rounding_mode) {
5901 case float_round_nearest_even:
5902 if ( lastBitMask ) {
5903 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
5904 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
5906 else {
5907 if ( (int64_t) z.low < 0 ) {
5908 ++z.high;
5909 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
5912 break;
5913 case float_round_ties_away:
5914 if (lastBitMask) {
5915 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
5916 } else {
5917 if ((int64_t) z.low < 0) {
5918 ++z.high;
5921 break;
5922 case float_round_to_zero:
5923 break;
5924 case float_round_up:
5925 if (!extractFloat128Sign(z)) {
5926 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
5928 break;
5929 case float_round_down:
5930 if (extractFloat128Sign(z)) {
5931 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
5933 break;
5934 default:
5935 abort();
5937 z.low &= ~ roundBitsMask;
5939 else {
5940 if ( aExp < 0x3FFF ) {
5941 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
5942 status->float_exception_flags |= float_flag_inexact;
5943 aSign = extractFloat128Sign( a );
5944 switch (status->float_rounding_mode) {
5945 case float_round_nearest_even:
5946 if ( ( aExp == 0x3FFE )
5947 && ( extractFloat128Frac0( a )
5948 | extractFloat128Frac1( a ) )
5950 return packFloat128( aSign, 0x3FFF, 0, 0 );
5952 break;
5953 case float_round_ties_away:
5954 if (aExp == 0x3FFE) {
5955 return packFloat128(aSign, 0x3FFF, 0, 0);
5957 break;
5958 case float_round_down:
5959 return
5960 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
5961 : packFloat128( 0, 0, 0, 0 );
5962 case float_round_up:
5963 return
5964 aSign ? packFloat128( 1, 0, 0, 0 )
5965 : packFloat128( 0, 0x3FFF, 0, 0 );
5967 return packFloat128( aSign, 0, 0, 0 );
5969 lastBitMask = 1;
5970 lastBitMask <<= 0x402F - aExp;
5971 roundBitsMask = lastBitMask - 1;
5972 z.low = 0;
5973 z.high = a.high;
5974 switch (status->float_rounding_mode) {
5975 case float_round_nearest_even:
5976 z.high += lastBitMask>>1;
5977 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
5978 z.high &= ~ lastBitMask;
5980 break;
5981 case float_round_ties_away:
5982 z.high += lastBitMask>>1;
5983 break;
5984 case float_round_to_zero:
5985 break;
5986 case float_round_up:
5987 if (!extractFloat128Sign(z)) {
5988 z.high |= ( a.low != 0 );
5989 z.high += roundBitsMask;
5991 break;
5992 case float_round_down:
5993 if (extractFloat128Sign(z)) {
5994 z.high |= (a.low != 0);
5995 z.high += roundBitsMask;
5997 break;
5998 default:
5999 abort();
6001 z.high &= ~ roundBitsMask;
6003 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
6004 status->float_exception_flags |= float_flag_inexact;
6006 return z;
6010 /*----------------------------------------------------------------------------
6011 | Returns the result of adding the absolute values of the quadruple-precision
6012 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
6013 | before being returned. `zSign' is ignored if the result is a NaN.
6014 | The addition is performed according to the IEC/IEEE Standard for Binary
6015 | Floating-Point Arithmetic.
6016 *----------------------------------------------------------------------------*/
6018 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
6019 float_status *status)
6021 int32_t aExp, bExp, zExp;
6022 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6023 int32_t expDiff;
6025 aSig1 = extractFloat128Frac1( a );
6026 aSig0 = extractFloat128Frac0( a );
6027 aExp = extractFloat128Exp( a );
6028 bSig1 = extractFloat128Frac1( b );
6029 bSig0 = extractFloat128Frac0( b );
6030 bExp = extractFloat128Exp( b );
6031 expDiff = aExp - bExp;
6032 if ( 0 < expDiff ) {
6033 if ( aExp == 0x7FFF ) {
6034 if (aSig0 | aSig1) {
6035 return propagateFloat128NaN(a, b, status);
6037 return a;
6039 if ( bExp == 0 ) {
6040 --expDiff;
6042 else {
6043 bSig0 |= LIT64( 0x0001000000000000 );
6045 shift128ExtraRightJamming(
6046 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6047 zExp = aExp;
6049 else if ( expDiff < 0 ) {
6050 if ( bExp == 0x7FFF ) {
6051 if (bSig0 | bSig1) {
6052 return propagateFloat128NaN(a, b, status);
6054 return packFloat128( zSign, 0x7FFF, 0, 0 );
6056 if ( aExp == 0 ) {
6057 ++expDiff;
6059 else {
6060 aSig0 |= LIT64( 0x0001000000000000 );
6062 shift128ExtraRightJamming(
6063 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6064 zExp = bExp;
6066 else {
6067 if ( aExp == 0x7FFF ) {
6068 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6069 return propagateFloat128NaN(a, b, status);
6071 return a;
6073 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6074 if ( aExp == 0 ) {
6075 if (status->flush_to_zero) {
6076 if (zSig0 | zSig1) {
6077 float_raise(float_flag_output_denormal, status);
6079 return packFloat128(zSign, 0, 0, 0);
6081 return packFloat128( zSign, 0, zSig0, zSig1 );
6083 zSig2 = 0;
6084 zSig0 |= LIT64( 0x0002000000000000 );
6085 zExp = aExp;
6086 goto shiftRight1;
6088 aSig0 |= LIT64( 0x0001000000000000 );
6089 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6090 --zExp;
6091 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6092 ++zExp;
6093 shiftRight1:
6094 shift128ExtraRightJamming(
6095 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6096 roundAndPack:
6097 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6101 /*----------------------------------------------------------------------------
6102 | Returns the result of subtracting the absolute values of the quadruple-
6103 | precision floating-point values `a' and `b'. If `zSign' is 1, the
6104 | difference is negated before being returned. `zSign' is ignored if the
6105 | result is a NaN. The subtraction is performed according to the IEC/IEEE
6106 | Standard for Binary Floating-Point Arithmetic.
6107 *----------------------------------------------------------------------------*/
6109 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
6110 float_status *status)
6112 int32_t aExp, bExp, zExp;
6113 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
6114 int32_t expDiff;
6116 aSig1 = extractFloat128Frac1( a );
6117 aSig0 = extractFloat128Frac0( a );
6118 aExp = extractFloat128Exp( a );
6119 bSig1 = extractFloat128Frac1( b );
6120 bSig0 = extractFloat128Frac0( b );
6121 bExp = extractFloat128Exp( b );
6122 expDiff = aExp - bExp;
6123 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6124 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6125 if ( 0 < expDiff ) goto aExpBigger;
6126 if ( expDiff < 0 ) goto bExpBigger;
6127 if ( aExp == 0x7FFF ) {
6128 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6129 return propagateFloat128NaN(a, b, status);
6131 float_raise(float_flag_invalid, status);
6132 return float128_default_nan(status);
6134 if ( aExp == 0 ) {
6135 aExp = 1;
6136 bExp = 1;
6138 if ( bSig0 < aSig0 ) goto aBigger;
6139 if ( aSig0 < bSig0 ) goto bBigger;
6140 if ( bSig1 < aSig1 ) goto aBigger;
6141 if ( aSig1 < bSig1 ) goto bBigger;
6142 return packFloat128(status->float_rounding_mode == float_round_down,
6143 0, 0, 0);
6144 bExpBigger:
6145 if ( bExp == 0x7FFF ) {
6146 if (bSig0 | bSig1) {
6147 return propagateFloat128NaN(a, b, status);
6149 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6151 if ( aExp == 0 ) {
6152 ++expDiff;
6154 else {
6155 aSig0 |= LIT64( 0x4000000000000000 );
6157 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6158 bSig0 |= LIT64( 0x4000000000000000 );
6159 bBigger:
6160 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6161 zExp = bExp;
6162 zSign ^= 1;
6163 goto normalizeRoundAndPack;
6164 aExpBigger:
6165 if ( aExp == 0x7FFF ) {
6166 if (aSig0 | aSig1) {
6167 return propagateFloat128NaN(a, b, status);
6169 return a;
6171 if ( bExp == 0 ) {
6172 --expDiff;
6174 else {
6175 bSig0 |= LIT64( 0x4000000000000000 );
6177 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6178 aSig0 |= LIT64( 0x4000000000000000 );
6179 aBigger:
6180 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6181 zExp = aExp;
6182 normalizeRoundAndPack:
6183 --zExp;
6184 return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
6185 status);
6189 /*----------------------------------------------------------------------------
6190 | Returns the result of adding the quadruple-precision floating-point values
6191 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
6192 | for Binary Floating-Point Arithmetic.
6193 *----------------------------------------------------------------------------*/
6195 float128 float128_add(float128 a, float128 b, float_status *status)
6197 flag aSign, bSign;
6199 aSign = extractFloat128Sign( a );
6200 bSign = extractFloat128Sign( b );
6201 if ( aSign == bSign ) {
6202 return addFloat128Sigs(a, b, aSign, status);
6204 else {
6205 return subFloat128Sigs(a, b, aSign, status);
6210 /*----------------------------------------------------------------------------
6211 | Returns the result of subtracting the quadruple-precision floating-point
6212 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6213 | Standard for Binary Floating-Point Arithmetic.
6214 *----------------------------------------------------------------------------*/
6216 float128 float128_sub(float128 a, float128 b, float_status *status)
6218 flag aSign, bSign;
6220 aSign = extractFloat128Sign( a );
6221 bSign = extractFloat128Sign( b );
6222 if ( aSign == bSign ) {
6223 return subFloat128Sigs(a, b, aSign, status);
6225 else {
6226 return addFloat128Sigs(a, b, aSign, status);
6231 /*----------------------------------------------------------------------------
6232 | Returns the result of multiplying the quadruple-precision floating-point
6233 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6234 | Standard for Binary Floating-Point Arithmetic.
6235 *----------------------------------------------------------------------------*/
6237 float128 float128_mul(float128 a, float128 b, float_status *status)
6239 flag aSign, bSign, zSign;
6240 int32_t aExp, bExp, zExp;
6241 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6243 aSig1 = extractFloat128Frac1( a );
6244 aSig0 = extractFloat128Frac0( a );
6245 aExp = extractFloat128Exp( a );
6246 aSign = extractFloat128Sign( a );
6247 bSig1 = extractFloat128Frac1( b );
6248 bSig0 = extractFloat128Frac0( b );
6249 bExp = extractFloat128Exp( b );
6250 bSign = extractFloat128Sign( b );
6251 zSign = aSign ^ bSign;
6252 if ( aExp == 0x7FFF ) {
6253 if ( ( aSig0 | aSig1 )
6254 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6255 return propagateFloat128NaN(a, b, status);
6257 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6258 return packFloat128( zSign, 0x7FFF, 0, 0 );
6260 if ( bExp == 0x7FFF ) {
6261 if (bSig0 | bSig1) {
6262 return propagateFloat128NaN(a, b, status);
6264 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6265 invalid:
6266 float_raise(float_flag_invalid, status);
6267 return float128_default_nan(status);
6269 return packFloat128( zSign, 0x7FFF, 0, 0 );
6271 if ( aExp == 0 ) {
6272 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6273 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6275 if ( bExp == 0 ) {
6276 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6277 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6279 zExp = aExp + bExp - 0x4000;
6280 aSig0 |= LIT64( 0x0001000000000000 );
6281 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6282 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6283 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6284 zSig2 |= ( zSig3 != 0 );
6285 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6286 shift128ExtraRightJamming(
6287 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6288 ++zExp;
6290 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6294 /*----------------------------------------------------------------------------
6295 | Returns the result of dividing the quadruple-precision floating-point value
6296 | `a' by the corresponding value `b'. The operation is performed according to
6297 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6298 *----------------------------------------------------------------------------*/
6300 float128 float128_div(float128 a, float128 b, float_status *status)
6302 flag aSign, bSign, zSign;
6303 int32_t aExp, bExp, zExp;
6304 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6305 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6307 aSig1 = extractFloat128Frac1( a );
6308 aSig0 = extractFloat128Frac0( a );
6309 aExp = extractFloat128Exp( a );
6310 aSign = extractFloat128Sign( a );
6311 bSig1 = extractFloat128Frac1( b );
6312 bSig0 = extractFloat128Frac0( b );
6313 bExp = extractFloat128Exp( b );
6314 bSign = extractFloat128Sign( b );
6315 zSign = aSign ^ bSign;
6316 if ( aExp == 0x7FFF ) {
6317 if (aSig0 | aSig1) {
6318 return propagateFloat128NaN(a, b, status);
6320 if ( bExp == 0x7FFF ) {
6321 if (bSig0 | bSig1) {
6322 return propagateFloat128NaN(a, b, status);
6324 goto invalid;
6326 return packFloat128( zSign, 0x7FFF, 0, 0 );
6328 if ( bExp == 0x7FFF ) {
6329 if (bSig0 | bSig1) {
6330 return propagateFloat128NaN(a, b, status);
6332 return packFloat128( zSign, 0, 0, 0 );
6334 if ( bExp == 0 ) {
6335 if ( ( bSig0 | bSig1 ) == 0 ) {
6336 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6337 invalid:
6338 float_raise(float_flag_invalid, status);
6339 return float128_default_nan(status);
6341 float_raise(float_flag_divbyzero, status);
6342 return packFloat128( zSign, 0x7FFF, 0, 0 );
6344 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6346 if ( aExp == 0 ) {
6347 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6348 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6350 zExp = aExp - bExp + 0x3FFD;
6351 shortShift128Left(
6352 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6353 shortShift128Left(
6354 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6355 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6356 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6357 ++zExp;
6359 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6360 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6361 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
6362 while ( (int64_t) rem0 < 0 ) {
6363 --zSig0;
6364 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6366 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6367 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6368 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6369 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
6370 while ( (int64_t) rem1 < 0 ) {
6371 --zSig1;
6372 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6374 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6376 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6377 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6381 /*----------------------------------------------------------------------------
6382 | Returns the remainder of the quadruple-precision floating-point value `a'
6383 | with respect to the corresponding value `b'. The operation is performed
6384 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6385 *----------------------------------------------------------------------------*/
6387 float128 float128_rem(float128 a, float128 b, float_status *status)
6389 flag aSign, zSign;
6390 int32_t aExp, bExp, expDiff;
6391 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6392 uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6393 int64_t sigMean0;
6395 aSig1 = extractFloat128Frac1( a );
6396 aSig0 = extractFloat128Frac0( a );
6397 aExp = extractFloat128Exp( a );
6398 aSign = extractFloat128Sign( a );
6399 bSig1 = extractFloat128Frac1( b );
6400 bSig0 = extractFloat128Frac0( b );
6401 bExp = extractFloat128Exp( b );
6402 if ( aExp == 0x7FFF ) {
6403 if ( ( aSig0 | aSig1 )
6404 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6405 return propagateFloat128NaN(a, b, status);
6407 goto invalid;
6409 if ( bExp == 0x7FFF ) {
6410 if (bSig0 | bSig1) {
6411 return propagateFloat128NaN(a, b, status);
6413 return a;
6415 if ( bExp == 0 ) {
6416 if ( ( bSig0 | bSig1 ) == 0 ) {
6417 invalid:
6418 float_raise(float_flag_invalid, status);
6419 return float128_default_nan(status);
6421 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6423 if ( aExp == 0 ) {
6424 if ( ( aSig0 | aSig1 ) == 0 ) return a;
6425 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6427 expDiff = aExp - bExp;
6428 if ( expDiff < -1 ) return a;
6429 shortShift128Left(
6430 aSig0 | LIT64( 0x0001000000000000 ),
6431 aSig1,
6432 15 - ( expDiff < 0 ),
6433 &aSig0,
6434 &aSig1
6436 shortShift128Left(
6437 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6438 q = le128( bSig0, bSig1, aSig0, aSig1 );
6439 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6440 expDiff -= 64;
6441 while ( 0 < expDiff ) {
6442 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6443 q = ( 4 < q ) ? q - 4 : 0;
6444 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6445 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6446 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6447 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6448 expDiff -= 61;
6450 if ( -64 < expDiff ) {
6451 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6452 q = ( 4 < q ) ? q - 4 : 0;
6453 q >>= - expDiff;
6454 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6455 expDiff += 52;
6456 if ( expDiff < 0 ) {
6457 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6459 else {
6460 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6462 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6463 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6465 else {
6466 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6467 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6469 do {
6470 alternateASig0 = aSig0;
6471 alternateASig1 = aSig1;
6472 ++q;
6473 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6474 } while ( 0 <= (int64_t) aSig0 );
6475 add128(
6476 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6477 if ( ( sigMean0 < 0 )
6478 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6479 aSig0 = alternateASig0;
6480 aSig1 = alternateASig1;
6482 zSign = ( (int64_t) aSig0 < 0 );
6483 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6484 return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
6485 status);
6488 /*----------------------------------------------------------------------------
6489 | Returns the square root of the quadruple-precision floating-point value `a'.
6490 | The operation is performed according to the IEC/IEEE Standard for Binary
6491 | Floating-Point Arithmetic.
6492 *----------------------------------------------------------------------------*/
6494 float128 float128_sqrt(float128 a, float_status *status)
6496 flag aSign;
6497 int32_t aExp, zExp;
6498 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6499 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6501 aSig1 = extractFloat128Frac1( a );
6502 aSig0 = extractFloat128Frac0( a );
6503 aExp = extractFloat128Exp( a );
6504 aSign = extractFloat128Sign( a );
6505 if ( aExp == 0x7FFF ) {
6506 if (aSig0 | aSig1) {
6507 return propagateFloat128NaN(a, a, status);
6509 if ( ! aSign ) return a;
6510 goto invalid;
6512 if ( aSign ) {
6513 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6514 invalid:
6515 float_raise(float_flag_invalid, status);
6516 return float128_default_nan(status);
6518 if ( aExp == 0 ) {
6519 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6520 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6522 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6523 aSig0 |= LIT64( 0x0001000000000000 );
6524 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6525 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6526 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6527 doubleZSig0 = zSig0<<1;
6528 mul64To128( zSig0, zSig0, &term0, &term1 );
6529 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6530 while ( (int64_t) rem0 < 0 ) {
6531 --zSig0;
6532 doubleZSig0 -= 2;
6533 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6535 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6536 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6537 if ( zSig1 == 0 ) zSig1 = 1;
6538 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6539 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6540 mul64To128( zSig1, zSig1, &term2, &term3 );
6541 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6542 while ( (int64_t) rem1 < 0 ) {
6543 --zSig1;
6544 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6545 term3 |= 1;
6546 term2 |= doubleZSig0;
6547 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6549 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6551 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6552 return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
6556 /*----------------------------------------------------------------------------
6557 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6558 | the corresponding value `b', and 0 otherwise. The invalid exception is
6559 | raised if either operand is a NaN. Otherwise, the comparison is performed
6560 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6561 *----------------------------------------------------------------------------*/
6563 int float128_eq(float128 a, float128 b, float_status *status)
6566 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6567 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6568 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6569 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6571 float_raise(float_flag_invalid, status);
6572 return 0;
6574 return
6575 ( a.low == b.low )
6576 && ( ( a.high == b.high )
6577 || ( ( a.low == 0 )
6578 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6583 /*----------------------------------------------------------------------------
6584 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6585 | or equal to the corresponding value `b', and 0 otherwise. The invalid
6586 | exception is raised if either operand is a NaN. The comparison is performed
6587 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6588 *----------------------------------------------------------------------------*/
6590 int float128_le(float128 a, float128 b, float_status *status)
6592 flag aSign, bSign;
6594 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6595 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6596 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6597 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6599 float_raise(float_flag_invalid, status);
6600 return 0;
6602 aSign = extractFloat128Sign( a );
6603 bSign = extractFloat128Sign( b );
6604 if ( aSign != bSign ) {
6605 return
6606 aSign
6607 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6608 == 0 );
6610 return
6611 aSign ? le128( b.high, b.low, a.high, a.low )
6612 : le128( a.high, a.low, b.high, b.low );
6616 /*----------------------------------------------------------------------------
6617 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6618 | the corresponding value `b', and 0 otherwise. The invalid exception is
6619 | raised if either operand is a NaN. The comparison is performed according
6620 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6621 *----------------------------------------------------------------------------*/
6623 int float128_lt(float128 a, float128 b, float_status *status)
6625 flag aSign, bSign;
6627 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6628 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6629 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6630 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6632 float_raise(float_flag_invalid, status);
6633 return 0;
6635 aSign = extractFloat128Sign( a );
6636 bSign = extractFloat128Sign( b );
6637 if ( aSign != bSign ) {
6638 return
6639 aSign
6640 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6641 != 0 );
6643 return
6644 aSign ? lt128( b.high, b.low, a.high, a.low )
6645 : lt128( a.high, a.low, b.high, b.low );
6649 /*----------------------------------------------------------------------------
6650 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6651 | be compared, and 0 otherwise. The invalid exception is raised if either
6652 | operand is a NaN. The comparison is performed according to the IEC/IEEE
6653 | Standard for Binary Floating-Point Arithmetic.
6654 *----------------------------------------------------------------------------*/
6656 int float128_unordered(float128 a, float128 b, float_status *status)
6658 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6659 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6660 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6661 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6663 float_raise(float_flag_invalid, status);
6664 return 1;
6666 return 0;
6669 /*----------------------------------------------------------------------------
6670 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6671 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6672 | exception. The comparison is performed according to the IEC/IEEE Standard
6673 | for Binary Floating-Point Arithmetic.
6674 *----------------------------------------------------------------------------*/
6676 int float128_eq_quiet(float128 a, float128 b, float_status *status)
6679 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6680 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6681 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6682 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6684 if (float128_is_signaling_nan(a, status)
6685 || float128_is_signaling_nan(b, status)) {
6686 float_raise(float_flag_invalid, status);
6688 return 0;
6690 return
6691 ( a.low == b.low )
6692 && ( ( a.high == b.high )
6693 || ( ( a.low == 0 )
6694 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6699 /*----------------------------------------------------------------------------
6700 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6701 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
6702 | cause an exception. Otherwise, the comparison is performed according to the
6703 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6704 *----------------------------------------------------------------------------*/
6706 int float128_le_quiet(float128 a, float128 b, float_status *status)
6708 flag aSign, bSign;
6710 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6711 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6712 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6713 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6715 if (float128_is_signaling_nan(a, status)
6716 || float128_is_signaling_nan(b, status)) {
6717 float_raise(float_flag_invalid, status);
6719 return 0;
6721 aSign = extractFloat128Sign( a );
6722 bSign = extractFloat128Sign( b );
6723 if ( aSign != bSign ) {
6724 return
6725 aSign
6726 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6727 == 0 );
6729 return
6730 aSign ? le128( b.high, b.low, a.high, a.low )
6731 : le128( a.high, a.low, b.high, b.low );
6735 /*----------------------------------------------------------------------------
6736 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6737 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6738 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
6739 | Standard for Binary Floating-Point Arithmetic.
6740 *----------------------------------------------------------------------------*/
6742 int float128_lt_quiet(float128 a, float128 b, float_status *status)
6744 flag aSign, bSign;
6746 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6747 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6748 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6749 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6751 if (float128_is_signaling_nan(a, status)
6752 || float128_is_signaling_nan(b, status)) {
6753 float_raise(float_flag_invalid, status);
6755 return 0;
6757 aSign = extractFloat128Sign( a );
6758 bSign = extractFloat128Sign( b );
6759 if ( aSign != bSign ) {
6760 return
6761 aSign
6762 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6763 != 0 );
6765 return
6766 aSign ? lt128( b.high, b.low, a.high, a.low )
6767 : lt128( a.high, a.low, b.high, b.low );
6771 /*----------------------------------------------------------------------------
6772 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6773 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
6774 | comparison is performed according to the IEC/IEEE Standard for Binary
6775 | Floating-Point Arithmetic.
6776 *----------------------------------------------------------------------------*/
6778 int float128_unordered_quiet(float128 a, float128 b, float_status *status)
6780 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6781 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6782 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6783 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6785 if (float128_is_signaling_nan(a, status)
6786 || float128_is_signaling_nan(b, status)) {
6787 float_raise(float_flag_invalid, status);
6789 return 1;
6791 return 0;
6794 static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
6795 int is_quiet, float_status *status)
6797 flag aSign, bSign;
6799 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6800 float_raise(float_flag_invalid, status);
6801 return float_relation_unordered;
6803 if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
6804 ( extractFloatx80Frac( a )<<1 ) ) ||
6805 ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
6806 ( extractFloatx80Frac( b )<<1 ) )) {
6807 if (!is_quiet ||
6808 floatx80_is_signaling_nan(a, status) ||
6809 floatx80_is_signaling_nan(b, status)) {
6810 float_raise(float_flag_invalid, status);
6812 return float_relation_unordered;
6814 aSign = extractFloatx80Sign( a );
6815 bSign = extractFloatx80Sign( b );
6816 if ( aSign != bSign ) {
6818 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
6819 ( ( a.low | b.low ) == 0 ) ) {
6820 /* zero case */
6821 return float_relation_equal;
6822 } else {
6823 return 1 - (2 * aSign);
6825 } else {
6826 if (a.low == b.low && a.high == b.high) {
6827 return float_relation_equal;
6828 } else {
6829 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6834 int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
6836 return floatx80_compare_internal(a, b, 0, status);
6839 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
6841 return floatx80_compare_internal(a, b, 1, status);
6844 static inline int float128_compare_internal(float128 a, float128 b,
6845 int is_quiet, float_status *status)
6847 flag aSign, bSign;
6849 if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
6850 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
6851 ( ( extractFloat128Exp( b ) == 0x7fff ) &&
6852 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
6853 if (!is_quiet ||
6854 float128_is_signaling_nan(a, status) ||
6855 float128_is_signaling_nan(b, status)) {
6856 float_raise(float_flag_invalid, status);
6858 return float_relation_unordered;
6860 aSign = extractFloat128Sign( a );
6861 bSign = extractFloat128Sign( b );
6862 if ( aSign != bSign ) {
6863 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
6864 /* zero case */
6865 return float_relation_equal;
6866 } else {
6867 return 1 - (2 * aSign);
6869 } else {
6870 if (a.low == b.low && a.high == b.high) {
6871 return float_relation_equal;
6872 } else {
6873 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6878 int float128_compare(float128 a, float128 b, float_status *status)
6880 return float128_compare_internal(a, b, 0, status);
6883 int float128_compare_quiet(float128 a, float128 b, float_status *status)
6885 return float128_compare_internal(a, b, 1, status);
6888 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
6890 flag aSign;
6891 int32_t aExp;
6892 uint64_t aSig;
6894 if (floatx80_invalid_encoding(a)) {
6895 float_raise(float_flag_invalid, status);
6896 return floatx80_default_nan(status);
6898 aSig = extractFloatx80Frac( a );
6899 aExp = extractFloatx80Exp( a );
6900 aSign = extractFloatx80Sign( a );
6902 if ( aExp == 0x7FFF ) {
6903 if ( aSig<<1 ) {
6904 return propagateFloatx80NaN(a, a, status);
6906 return a;
6909 if (aExp == 0) {
6910 if (aSig == 0) {
6911 return a;
6913 aExp++;
6916 if (n > 0x10000) {
6917 n = 0x10000;
6918 } else if (n < -0x10000) {
6919 n = -0x10000;
6922 aExp += n;
6923 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
6924 aSign, aExp, aSig, 0, status);
6927 float128 float128_scalbn(float128 a, int n, float_status *status)
6929 flag aSign;
6930 int32_t aExp;
6931 uint64_t aSig0, aSig1;
6933 aSig1 = extractFloat128Frac1( a );
6934 aSig0 = extractFloat128Frac0( a );
6935 aExp = extractFloat128Exp( a );
6936 aSign = extractFloat128Sign( a );
6937 if ( aExp == 0x7FFF ) {
6938 if ( aSig0 | aSig1 ) {
6939 return propagateFloat128NaN(a, a, status);
6941 return a;
6943 if (aExp != 0) {
6944 aSig0 |= LIT64( 0x0001000000000000 );
6945 } else if (aSig0 == 0 && aSig1 == 0) {
6946 return a;
6947 } else {
6948 aExp++;
6951 if (n > 0x10000) {
6952 n = 0x10000;
6953 } else if (n < -0x10000) {
6954 n = -0x10000;
6957 aExp += n - 1;
6958 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
6959 , status);