pc-dimm: factor out MemoryDevice interface
[qemu/ar7.git] / fpu / softfloat.c
blob70e0c40a1c0b56361673704a31588639d45cdc05
1 /*
2 * QEMU float support
4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
10 * the BSD license
11 * GPL-v2-or-later
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
44 ===============================================================================
47 /* BSD licensing:
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
85 #include "qemu/osdep.h"
86 #include "qemu/bitops.h"
87 #include "fpu/softfloat.h"
89 /* We only need stdlib for abort() */
91 /*----------------------------------------------------------------------------
92 | Primitive arithmetic functions, including multi-word arithmetic, and
93 | division and square root approximations. (Can be specialized to target if
94 | desired.)
95 *----------------------------------------------------------------------------*/
96 #include "fpu/softfloat-macros.h"
98 /*----------------------------------------------------------------------------
99 | Functions and definitions to determine: (1) whether tininess for underflow
100 | is detected before or after rounding by default, (2) what (if anything)
101 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
102 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
103 | are propagated from function inputs to output. These details are target-
104 | specific.
105 *----------------------------------------------------------------------------*/
106 #include "softfloat-specialize.h"
108 /*----------------------------------------------------------------------------
109 | Returns the fraction bits of the half-precision floating-point value `a'.
110 *----------------------------------------------------------------------------*/
112 static inline uint32_t extractFloat16Frac(float16 a)
114 return float16_val(a) & 0x3ff;
117 /*----------------------------------------------------------------------------
118 | Returns the exponent bits of the half-precision floating-point value `a'.
119 *----------------------------------------------------------------------------*/
121 static inline int extractFloat16Exp(float16 a)
123 return (float16_val(a) >> 10) & 0x1f;
126 /*----------------------------------------------------------------------------
127 | Returns the sign bit of the single-precision floating-point value `a'.
128 *----------------------------------------------------------------------------*/
130 static inline flag extractFloat16Sign(float16 a)
132 return float16_val(a)>>15;
135 /*----------------------------------------------------------------------------
136 | Returns the fraction bits of the single-precision floating-point value `a'.
137 *----------------------------------------------------------------------------*/
139 static inline uint32_t extractFloat32Frac(float32 a)
141 return float32_val(a) & 0x007FFFFF;
144 /*----------------------------------------------------------------------------
145 | Returns the exponent bits of the single-precision floating-point value `a'.
146 *----------------------------------------------------------------------------*/
148 static inline int extractFloat32Exp(float32 a)
150 return (float32_val(a) >> 23) & 0xFF;
153 /*----------------------------------------------------------------------------
154 | Returns the sign bit of the single-precision floating-point value `a'.
155 *----------------------------------------------------------------------------*/
157 static inline flag extractFloat32Sign(float32 a)
159 return float32_val(a) >> 31;
162 /*----------------------------------------------------------------------------
163 | Returns the fraction bits of the double-precision floating-point value `a'.
164 *----------------------------------------------------------------------------*/
166 static inline uint64_t extractFloat64Frac(float64 a)
168 return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF);
171 /*----------------------------------------------------------------------------
172 | Returns the exponent bits of the double-precision floating-point value `a'.
173 *----------------------------------------------------------------------------*/
175 static inline int extractFloat64Exp(float64 a)
177 return (float64_val(a) >> 52) & 0x7FF;
180 /*----------------------------------------------------------------------------
181 | Returns the sign bit of the double-precision floating-point value `a'.
182 *----------------------------------------------------------------------------*/
184 static inline flag extractFloat64Sign(float64 a)
186 return float64_val(a) >> 63;
190 * Classify a floating point number. Everything above float_class_qnan
191 * is a NaN so cls >= float_class_qnan is any NaN.
194 typedef enum __attribute__ ((__packed__)) {
195 float_class_unclassified,
196 float_class_zero,
197 float_class_normal,
198 float_class_inf,
199 float_class_qnan, /* all NaNs from here */
200 float_class_snan,
201 float_class_dnan,
202 float_class_msnan, /* maybe silenced */
203 } FloatClass;
206 * Structure holding all of the decomposed parts of a float. The
207 * exponent is unbiased and the fraction is normalized. All
208 * calculations are done with a 64 bit fraction and then rounded as
209 * appropriate for the final format.
211 * Thanks to the packed FloatClass a decent compiler should be able to
212 * fit the whole structure into registers and avoid using the stack
213 * for parameter passing.
216 typedef struct {
217 uint64_t frac;
218 int32_t exp;
219 FloatClass cls;
220 bool sign;
221 } FloatParts;
223 #define DECOMPOSED_BINARY_POINT (64 - 2)
224 #define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT)
225 #define DECOMPOSED_OVERFLOW_BIT (DECOMPOSED_IMPLICIT_BIT << 1)
227 /* Structure holding all of the relevant parameters for a format.
228 * exp_size: the size of the exponent field
229 * exp_bias: the offset applied to the exponent field
230 * exp_max: the maximum normalised exponent
231 * frac_size: the size of the fraction field
232 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
233 * The following are computed based the size of fraction
234 * frac_lsb: least significant bit of fraction
235 * fram_lsbm1: the bit bellow the least significant bit (for rounding)
236 * round_mask/roundeven_mask: masks used for rounding
238 typedef struct {
239 int exp_size;
240 int exp_bias;
241 int exp_max;
242 int frac_size;
243 int frac_shift;
244 uint64_t frac_lsb;
245 uint64_t frac_lsbm1;
246 uint64_t round_mask;
247 uint64_t roundeven_mask;
248 } FloatFmt;
250 /* Expand fields based on the size of exponent and fraction */
251 #define FLOAT_PARAMS(E, F) \
252 .exp_size = E, \
253 .exp_bias = ((1 << E) - 1) >> 1, \
254 .exp_max = (1 << E) - 1, \
255 .frac_size = F, \
256 .frac_shift = DECOMPOSED_BINARY_POINT - F, \
257 .frac_lsb = 1ull << (DECOMPOSED_BINARY_POINT - F), \
258 .frac_lsbm1 = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1), \
259 .round_mask = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1, \
260 .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1
262 static const FloatFmt float16_params = {
263 FLOAT_PARAMS(5, 10)
266 static const FloatFmt float32_params = {
267 FLOAT_PARAMS(8, 23)
270 static const FloatFmt float64_params = {
271 FLOAT_PARAMS(11, 52)
274 /* Unpack a float to parts, but do not canonicalize. */
275 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
277 const int sign_pos = fmt.frac_size + fmt.exp_size;
279 return (FloatParts) {
280 .cls = float_class_unclassified,
281 .sign = extract64(raw, sign_pos, 1),
282 .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
283 .frac = extract64(raw, 0, fmt.frac_size),
287 static inline FloatParts float16_unpack_raw(float16 f)
289 return unpack_raw(float16_params, f);
292 static inline FloatParts float32_unpack_raw(float32 f)
294 return unpack_raw(float32_params, f);
297 static inline FloatParts float64_unpack_raw(float64 f)
299 return unpack_raw(float64_params, f);
302 /* Pack a float from parts, but do not canonicalize. */
303 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
305 const int sign_pos = fmt.frac_size + fmt.exp_size;
306 uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
307 return deposit64(ret, sign_pos, 1, p.sign);
310 static inline float16 float16_pack_raw(FloatParts p)
312 return make_float16(pack_raw(float16_params, p));
315 static inline float32 float32_pack_raw(FloatParts p)
317 return make_float32(pack_raw(float32_params, p));
320 static inline float64 float64_pack_raw(FloatParts p)
322 return make_float64(pack_raw(float64_params, p));
325 /* Canonicalize EXP and FRAC, setting CLS. */
326 static FloatParts canonicalize(FloatParts part, const FloatFmt *parm,
327 float_status *status)
329 if (part.exp == parm->exp_max) {
330 if (part.frac == 0) {
331 part.cls = float_class_inf;
332 } else {
333 #ifdef NO_SIGNALING_NANS
334 part.cls = float_class_qnan;
335 #else
336 int64_t msb = part.frac << (parm->frac_shift + 2);
337 if ((msb < 0) == status->snan_bit_is_one) {
338 part.cls = float_class_snan;
339 } else {
340 part.cls = float_class_qnan;
342 #endif
344 } else if (part.exp == 0) {
345 if (likely(part.frac == 0)) {
346 part.cls = float_class_zero;
347 } else if (status->flush_inputs_to_zero) {
348 float_raise(float_flag_input_denormal, status);
349 part.cls = float_class_zero;
350 part.frac = 0;
351 } else {
352 int shift = clz64(part.frac) - 1;
353 part.cls = float_class_normal;
354 part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
355 part.frac <<= shift;
357 } else {
358 part.cls = float_class_normal;
359 part.exp -= parm->exp_bias;
360 part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
362 return part;
365 /* Round and uncanonicalize a floating-point number by parts. There
366 * are FRAC_SHIFT bits that may require rounding at the bottom of the
367 * fraction; these bits will be removed. The exponent will be biased
368 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
371 static FloatParts round_canonical(FloatParts p, float_status *s,
372 const FloatFmt *parm)
374 const uint64_t frac_lsbm1 = parm->frac_lsbm1;
375 const uint64_t round_mask = parm->round_mask;
376 const uint64_t roundeven_mask = parm->roundeven_mask;
377 const int exp_max = parm->exp_max;
378 const int frac_shift = parm->frac_shift;
379 uint64_t frac, inc;
380 int exp, flags = 0;
381 bool overflow_norm;
383 frac = p.frac;
384 exp = p.exp;
386 switch (p.cls) {
387 case float_class_normal:
388 switch (s->float_rounding_mode) {
389 case float_round_nearest_even:
390 overflow_norm = false;
391 inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
392 break;
393 case float_round_ties_away:
394 overflow_norm = false;
395 inc = frac_lsbm1;
396 break;
397 case float_round_to_zero:
398 overflow_norm = true;
399 inc = 0;
400 break;
401 case float_round_up:
402 inc = p.sign ? 0 : round_mask;
403 overflow_norm = p.sign;
404 break;
405 case float_round_down:
406 inc = p.sign ? round_mask : 0;
407 overflow_norm = !p.sign;
408 break;
409 default:
410 g_assert_not_reached();
413 exp += parm->exp_bias;
414 if (likely(exp > 0)) {
415 if (frac & round_mask) {
416 flags |= float_flag_inexact;
417 frac += inc;
418 if (frac & DECOMPOSED_OVERFLOW_BIT) {
419 frac >>= 1;
420 exp++;
423 frac >>= frac_shift;
425 if (unlikely(exp >= exp_max)) {
426 flags |= float_flag_overflow | float_flag_inexact;
427 if (overflow_norm) {
428 exp = exp_max - 1;
429 frac = -1;
430 } else {
431 p.cls = float_class_inf;
432 goto do_inf;
435 } else if (s->flush_to_zero) {
436 flags |= float_flag_output_denormal;
437 p.cls = float_class_zero;
438 goto do_zero;
439 } else {
440 bool is_tiny = (s->float_detect_tininess
441 == float_tininess_before_rounding)
442 || (exp < 0)
443 || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);
445 shift64RightJamming(frac, 1 - exp, &frac);
446 if (frac & round_mask) {
447 /* Need to recompute round-to-even. */
448 if (s->float_rounding_mode == float_round_nearest_even) {
449 inc = ((frac & roundeven_mask) != frac_lsbm1
450 ? frac_lsbm1 : 0);
452 flags |= float_flag_inexact;
453 frac += inc;
456 exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
457 frac >>= frac_shift;
459 if (is_tiny && (flags & float_flag_inexact)) {
460 flags |= float_flag_underflow;
462 if (exp == 0 && frac == 0) {
463 p.cls = float_class_zero;
466 break;
468 case float_class_zero:
469 do_zero:
470 exp = 0;
471 frac = 0;
472 break;
474 case float_class_inf:
475 do_inf:
476 exp = exp_max;
477 frac = 0;
478 break;
480 case float_class_qnan:
481 case float_class_snan:
482 exp = exp_max;
483 break;
485 default:
486 g_assert_not_reached();
489 float_raise(flags, s);
490 p.exp = exp;
491 p.frac = frac;
492 return p;
495 static FloatParts float16_unpack_canonical(float16 f, float_status *s)
497 return canonicalize(float16_unpack_raw(f), &float16_params, s);
500 static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
502 switch (p.cls) {
503 case float_class_dnan:
504 return float16_default_nan(s);
505 case float_class_msnan:
506 return float16_maybe_silence_nan(float16_pack_raw(p), s);
507 default:
508 p = round_canonical(p, s, &float16_params);
509 return float16_pack_raw(p);
513 static FloatParts float32_unpack_canonical(float32 f, float_status *s)
515 return canonicalize(float32_unpack_raw(f), &float32_params, s);
518 static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
520 switch (p.cls) {
521 case float_class_dnan:
522 return float32_default_nan(s);
523 case float_class_msnan:
524 return float32_maybe_silence_nan(float32_pack_raw(p), s);
525 default:
526 p = round_canonical(p, s, &float32_params);
527 return float32_pack_raw(p);
531 static FloatParts float64_unpack_canonical(float64 f, float_status *s)
533 return canonicalize(float64_unpack_raw(f), &float64_params, s);
536 static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
538 switch (p.cls) {
539 case float_class_dnan:
540 return float64_default_nan(s);
541 case float_class_msnan:
542 return float64_maybe_silence_nan(float64_pack_raw(p), s);
543 default:
544 p = round_canonical(p, s, &float64_params);
545 return float64_pack_raw(p);
549 /* Simple helpers for checking if what NaN we have */
550 static bool is_nan(FloatClass c)
552 return unlikely(c >= float_class_qnan);
554 static bool is_snan(FloatClass c)
556 return c == float_class_snan;
558 static bool is_qnan(FloatClass c)
560 return c == float_class_qnan;
563 static FloatParts return_nan(FloatParts a, float_status *s)
565 switch (a.cls) {
566 case float_class_snan:
567 s->float_exception_flags |= float_flag_invalid;
568 a.cls = float_class_msnan;
569 /* fall through */
570 case float_class_qnan:
571 if (s->default_nan_mode) {
572 a.cls = float_class_dnan;
574 break;
576 default:
577 g_assert_not_reached();
579 return a;
582 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
584 if (is_snan(a.cls) || is_snan(b.cls)) {
585 s->float_exception_flags |= float_flag_invalid;
588 if (s->default_nan_mode) {
589 a.cls = float_class_dnan;
590 } else {
591 if (pickNaN(is_qnan(a.cls), is_snan(a.cls),
592 is_qnan(b.cls), is_snan(b.cls),
593 a.frac > b.frac ||
594 (a.frac == b.frac && a.sign < b.sign))) {
595 a = b;
597 a.cls = float_class_msnan;
599 return a;
602 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
603 bool inf_zero, float_status *s)
605 if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
606 s->float_exception_flags |= float_flag_invalid;
609 if (s->default_nan_mode) {
610 a.cls = float_class_dnan;
611 } else {
612 switch (pickNaNMulAdd(is_qnan(a.cls), is_snan(a.cls),
613 is_qnan(b.cls), is_snan(b.cls),
614 is_qnan(c.cls), is_snan(c.cls),
615 inf_zero, s)) {
616 case 0:
617 break;
618 case 1:
619 a = b;
620 break;
621 case 2:
622 a = c;
623 break;
624 case 3:
625 a.cls = float_class_dnan;
626 return a;
627 default:
628 g_assert_not_reached();
631 a.cls = float_class_msnan;
633 return a;
637 * Returns the result of adding or subtracting the values of the
638 * floating-point values `a' and `b'. The operation is performed
639 * according to the IEC/IEEE Standard for Binary Floating-Point
640 * Arithmetic.
643 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
644 float_status *s)
646 bool a_sign = a.sign;
647 bool b_sign = b.sign ^ subtract;
649 if (a_sign != b_sign) {
650 /* Subtraction */
652 if (a.cls == float_class_normal && b.cls == float_class_normal) {
653 if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
654 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
655 a.frac = a.frac - b.frac;
656 } else {
657 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
658 a.frac = b.frac - a.frac;
659 a.exp = b.exp;
660 a_sign ^= 1;
663 if (a.frac == 0) {
664 a.cls = float_class_zero;
665 a.sign = s->float_rounding_mode == float_round_down;
666 } else {
667 int shift = clz64(a.frac) - 1;
668 a.frac = a.frac << shift;
669 a.exp = a.exp - shift;
670 a.sign = a_sign;
672 return a;
674 if (is_nan(a.cls) || is_nan(b.cls)) {
675 return pick_nan(a, b, s);
677 if (a.cls == float_class_inf) {
678 if (b.cls == float_class_inf) {
679 float_raise(float_flag_invalid, s);
680 a.cls = float_class_dnan;
682 return a;
684 if (a.cls == float_class_zero && b.cls == float_class_zero) {
685 a.sign = s->float_rounding_mode == float_round_down;
686 return a;
688 if (a.cls == float_class_zero || b.cls == float_class_inf) {
689 b.sign = a_sign ^ 1;
690 return b;
692 if (b.cls == float_class_zero) {
693 return a;
695 } else {
696 /* Addition */
697 if (a.cls == float_class_normal && b.cls == float_class_normal) {
698 if (a.exp > b.exp) {
699 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
700 } else if (a.exp < b.exp) {
701 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
702 a.exp = b.exp;
704 a.frac += b.frac;
705 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
706 a.frac >>= 1;
707 a.exp += 1;
709 return a;
711 if (is_nan(a.cls) || is_nan(b.cls)) {
712 return pick_nan(a, b, s);
714 if (a.cls == float_class_inf || b.cls == float_class_zero) {
715 return a;
717 if (b.cls == float_class_inf || a.cls == float_class_zero) {
718 b.sign = b_sign;
719 return b;
722 g_assert_not_reached();
726 * Returns the result of adding or subtracting the floating-point
727 * values `a' and `b'. The operation is performed according to the
728 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
731 float16 __attribute__((flatten)) float16_add(float16 a, float16 b,
732 float_status *status)
734 FloatParts pa = float16_unpack_canonical(a, status);
735 FloatParts pb = float16_unpack_canonical(b, status);
736 FloatParts pr = addsub_floats(pa, pb, false, status);
738 return float16_round_pack_canonical(pr, status);
741 float32 __attribute__((flatten)) float32_add(float32 a, float32 b,
742 float_status *status)
744 FloatParts pa = float32_unpack_canonical(a, status);
745 FloatParts pb = float32_unpack_canonical(b, status);
746 FloatParts pr = addsub_floats(pa, pb, false, status);
748 return float32_round_pack_canonical(pr, status);
751 float64 __attribute__((flatten)) float64_add(float64 a, float64 b,
752 float_status *status)
754 FloatParts pa = float64_unpack_canonical(a, status);
755 FloatParts pb = float64_unpack_canonical(b, status);
756 FloatParts pr = addsub_floats(pa, pb, false, status);
758 return float64_round_pack_canonical(pr, status);
761 float16 __attribute__((flatten)) float16_sub(float16 a, float16 b,
762 float_status *status)
764 FloatParts pa = float16_unpack_canonical(a, status);
765 FloatParts pb = float16_unpack_canonical(b, status);
766 FloatParts pr = addsub_floats(pa, pb, true, status);
768 return float16_round_pack_canonical(pr, status);
771 float32 __attribute__((flatten)) float32_sub(float32 a, float32 b,
772 float_status *status)
774 FloatParts pa = float32_unpack_canonical(a, status);
775 FloatParts pb = float32_unpack_canonical(b, status);
776 FloatParts pr = addsub_floats(pa, pb, true, status);
778 return float32_round_pack_canonical(pr, status);
781 float64 __attribute__((flatten)) float64_sub(float64 a, float64 b,
782 float_status *status)
784 FloatParts pa = float64_unpack_canonical(a, status);
785 FloatParts pb = float64_unpack_canonical(b, status);
786 FloatParts pr = addsub_floats(pa, pb, true, status);
788 return float64_round_pack_canonical(pr, status);
792 * Returns the result of multiplying the floating-point values `a' and
793 * `b'. The operation is performed according to the IEC/IEEE Standard
794 * for Binary Floating-Point Arithmetic.
797 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
799 bool sign = a.sign ^ b.sign;
801 if (a.cls == float_class_normal && b.cls == float_class_normal) {
802 uint64_t hi, lo;
803 int exp = a.exp + b.exp;
805 mul64To128(a.frac, b.frac, &hi, &lo);
806 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
807 if (lo & DECOMPOSED_OVERFLOW_BIT) {
808 shift64RightJamming(lo, 1, &lo);
809 exp += 1;
812 /* Re-use a */
813 a.exp = exp;
814 a.sign = sign;
815 a.frac = lo;
816 return a;
818 /* handle all the NaN cases */
819 if (is_nan(a.cls) || is_nan(b.cls)) {
820 return pick_nan(a, b, s);
822 /* Inf * Zero == NaN */
823 if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
824 (a.cls == float_class_zero && b.cls == float_class_inf)) {
825 s->float_exception_flags |= float_flag_invalid;
826 a.cls = float_class_dnan;
827 a.sign = sign;
828 return a;
830 /* Multiply by 0 or Inf */
831 if (a.cls == float_class_inf || a.cls == float_class_zero) {
832 a.sign = sign;
833 return a;
835 if (b.cls == float_class_inf || b.cls == float_class_zero) {
836 b.sign = sign;
837 return b;
839 g_assert_not_reached();
842 float16 __attribute__((flatten)) float16_mul(float16 a, float16 b,
843 float_status *status)
845 FloatParts pa = float16_unpack_canonical(a, status);
846 FloatParts pb = float16_unpack_canonical(b, status);
847 FloatParts pr = mul_floats(pa, pb, status);
849 return float16_round_pack_canonical(pr, status);
852 float32 __attribute__((flatten)) float32_mul(float32 a, float32 b,
853 float_status *status)
855 FloatParts pa = float32_unpack_canonical(a, status);
856 FloatParts pb = float32_unpack_canonical(b, status);
857 FloatParts pr = mul_floats(pa, pb, status);
859 return float32_round_pack_canonical(pr, status);
862 float64 __attribute__((flatten)) float64_mul(float64 a, float64 b,
863 float_status *status)
865 FloatParts pa = float64_unpack_canonical(a, status);
866 FloatParts pb = float64_unpack_canonical(b, status);
867 FloatParts pr = mul_floats(pa, pb, status);
869 return float64_round_pack_canonical(pr, status);
873 * Returns the result of multiplying the floating-point values `a' and
874 * `b' then adding 'c', with no intermediate rounding step after the
875 * multiplication. The operation is performed according to the
876 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
877 * The flags argument allows the caller to select negation of the
878 * addend, the intermediate product, or the final result. (The
879 * difference between this and having the caller do a separate
880 * negation is that negating externally will flip the sign bit on
881 * NaNs.)
884 static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
885 int flags, float_status *s)
887 bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
888 ((1 << float_class_inf) | (1 << float_class_zero));
889 bool p_sign;
890 bool sign_flip = flags & float_muladd_negate_result;
891 FloatClass p_class;
892 uint64_t hi, lo;
893 int p_exp;
895 /* It is implementation-defined whether the cases of (0,inf,qnan)
896 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
897 * they return if they do), so we have to hand this information
898 * off to the target-specific pick-a-NaN routine.
900 if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
901 return pick_nan_muladd(a, b, c, inf_zero, s);
904 if (inf_zero) {
905 s->float_exception_flags |= float_flag_invalid;
906 a.cls = float_class_dnan;
907 return a;
910 if (flags & float_muladd_negate_c) {
911 c.sign ^= 1;
914 p_sign = a.sign ^ b.sign;
916 if (flags & float_muladd_negate_product) {
917 p_sign ^= 1;
920 if (a.cls == float_class_inf || b.cls == float_class_inf) {
921 p_class = float_class_inf;
922 } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
923 p_class = float_class_zero;
924 } else {
925 p_class = float_class_normal;
928 if (c.cls == float_class_inf) {
929 if (p_class == float_class_inf && p_sign != c.sign) {
930 s->float_exception_flags |= float_flag_invalid;
931 a.cls = float_class_dnan;
932 } else {
933 a.cls = float_class_inf;
934 a.sign = c.sign ^ sign_flip;
936 return a;
939 if (p_class == float_class_inf) {
940 a.cls = float_class_inf;
941 a.sign = p_sign ^ sign_flip;
942 return a;
945 if (p_class == float_class_zero) {
946 if (c.cls == float_class_zero) {
947 if (p_sign != c.sign) {
948 p_sign = s->float_rounding_mode == float_round_down;
950 c.sign = p_sign;
951 } else if (flags & float_muladd_halve_result) {
952 c.exp -= 1;
954 c.sign ^= sign_flip;
955 return c;
958 /* a & b should be normals now... */
959 assert(a.cls == float_class_normal &&
960 b.cls == float_class_normal);
962 p_exp = a.exp + b.exp;
964 /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
965 * result.
967 mul64To128(a.frac, b.frac, &hi, &lo);
968 /* binary point now at bit 124 */
970 /* check for overflow */
971 if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
972 shift128RightJamming(hi, lo, 1, &hi, &lo);
973 p_exp += 1;
976 /* + add/sub */
977 if (c.cls == float_class_zero) {
978 /* move binary point back to 62 */
979 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
980 } else {
981 int exp_diff = p_exp - c.exp;
982 if (p_sign == c.sign) {
983 /* Addition */
984 if (exp_diff <= 0) {
985 shift128RightJamming(hi, lo,
986 DECOMPOSED_BINARY_POINT - exp_diff,
987 &hi, &lo);
988 lo += c.frac;
989 p_exp = c.exp;
990 } else {
991 uint64_t c_hi, c_lo;
992 /* shift c to the same binary point as the product (124) */
993 c_hi = c.frac >> 2;
994 c_lo = 0;
995 shift128RightJamming(c_hi, c_lo,
996 exp_diff,
997 &c_hi, &c_lo);
998 add128(hi, lo, c_hi, c_lo, &hi, &lo);
999 /* move binary point back to 62 */
1000 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1003 if (lo & DECOMPOSED_OVERFLOW_BIT) {
1004 shift64RightJamming(lo, 1, &lo);
1005 p_exp += 1;
1008 } else {
1009 /* Subtraction */
1010 uint64_t c_hi, c_lo;
1011 /* make C binary point match product at bit 124 */
1012 c_hi = c.frac >> 2;
1013 c_lo = 0;
1015 if (exp_diff <= 0) {
1016 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
1017 if (exp_diff == 0
1019 (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
1020 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1021 } else {
1022 sub128(c_hi, c_lo, hi, lo, &hi, &lo);
1023 p_sign ^= 1;
1024 p_exp = c.exp;
1026 } else {
1027 shift128RightJamming(c_hi, c_lo,
1028 exp_diff,
1029 &c_hi, &c_lo);
1030 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1033 if (hi == 0 && lo == 0) {
1034 a.cls = float_class_zero;
1035 a.sign = s->float_rounding_mode == float_round_down;
1036 a.sign ^= sign_flip;
1037 return a;
1038 } else {
1039 int shift;
1040 if (hi != 0) {
1041 shift = clz64(hi);
1042 } else {
1043 shift = clz64(lo) + 64;
1045 /* Normalizing to a binary point of 124 is the
1046 correct adjust for the exponent. However since we're
1047 shifting, we might as well put the binary point back
1048 at 62 where we really want it. Therefore shift as
1049 if we're leaving 1 bit at the top of the word, but
1050 adjust the exponent as if we're leaving 3 bits. */
1051 shift -= 1;
1052 if (shift >= 64) {
1053 lo = lo << (shift - 64);
1054 } else {
1055 hi = (hi << shift) | (lo >> (64 - shift));
1056 lo = hi | ((lo << shift) != 0);
1058 p_exp -= shift - 2;
1063 if (flags & float_muladd_halve_result) {
1064 p_exp -= 1;
1067 /* finally prepare our result */
1068 a.cls = float_class_normal;
1069 a.sign = p_sign ^ sign_flip;
1070 a.exp = p_exp;
1071 a.frac = lo;
1073 return a;
1076 float16 __attribute__((flatten)) float16_muladd(float16 a, float16 b, float16 c,
1077 int flags, float_status *status)
1079 FloatParts pa = float16_unpack_canonical(a, status);
1080 FloatParts pb = float16_unpack_canonical(b, status);
1081 FloatParts pc = float16_unpack_canonical(c, status);
1082 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1084 return float16_round_pack_canonical(pr, status);
1087 float32 __attribute__((flatten)) float32_muladd(float32 a, float32 b, float32 c,
1088 int flags, float_status *status)
1090 FloatParts pa = float32_unpack_canonical(a, status);
1091 FloatParts pb = float32_unpack_canonical(b, status);
1092 FloatParts pc = float32_unpack_canonical(c, status);
1093 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1095 return float32_round_pack_canonical(pr, status);
1098 float64 __attribute__((flatten)) float64_muladd(float64 a, float64 b, float64 c,
1099 int flags, float_status *status)
1101 FloatParts pa = float64_unpack_canonical(a, status);
1102 FloatParts pb = float64_unpack_canonical(b, status);
1103 FloatParts pc = float64_unpack_canonical(c, status);
1104 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1106 return float64_round_pack_canonical(pr, status);
1110 * Returns the result of dividing the floating-point value `a' by the
1111 * corresponding value `b'. The operation is performed according to
1112 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1115 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
1117 bool sign = a.sign ^ b.sign;
1119 if (a.cls == float_class_normal && b.cls == float_class_normal) {
1120 uint64_t temp_lo, temp_hi;
1121 int exp = a.exp - b.exp;
1122 if (a.frac < b.frac) {
1123 exp -= 1;
1124 shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1,
1125 &temp_hi, &temp_lo);
1126 } else {
1127 shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT,
1128 &temp_hi, &temp_lo);
1130 /* LSB of quot is set if inexact which roundandpack will use
1131 * to set flags. Yet again we re-use a for the result */
1132 a.frac = div128To64(temp_lo, temp_hi, b.frac);
1133 a.sign = sign;
1134 a.exp = exp;
1135 return a;
1137 /* handle all the NaN cases */
1138 if (is_nan(a.cls) || is_nan(b.cls)) {
1139 return pick_nan(a, b, s);
1141 /* 0/0 or Inf/Inf */
1142 if (a.cls == b.cls
1144 (a.cls == float_class_inf || a.cls == float_class_zero)) {
1145 s->float_exception_flags |= float_flag_invalid;
1146 a.cls = float_class_dnan;
1147 return a;
1149 /* Inf / x or 0 / x */
1150 if (a.cls == float_class_inf || a.cls == float_class_zero) {
1151 a.sign = sign;
1152 return a;
1154 /* Div 0 => Inf */
1155 if (b.cls == float_class_zero) {
1156 s->float_exception_flags |= float_flag_divbyzero;
1157 a.cls = float_class_inf;
1158 a.sign = sign;
1159 return a;
1161 /* Div by Inf */
1162 if (b.cls == float_class_inf) {
1163 a.cls = float_class_zero;
1164 a.sign = sign;
1165 return a;
1167 g_assert_not_reached();
1170 float16 float16_div(float16 a, float16 b, float_status *status)
1172 FloatParts pa = float16_unpack_canonical(a, status);
1173 FloatParts pb = float16_unpack_canonical(b, status);
1174 FloatParts pr = div_floats(pa, pb, status);
1176 return float16_round_pack_canonical(pr, status);
1179 float32 float32_div(float32 a, float32 b, float_status *status)
1181 FloatParts pa = float32_unpack_canonical(a, status);
1182 FloatParts pb = float32_unpack_canonical(b, status);
1183 FloatParts pr = div_floats(pa, pb, status);
1185 return float32_round_pack_canonical(pr, status);
1188 float64 float64_div(float64 a, float64 b, float_status *status)
1190 FloatParts pa = float64_unpack_canonical(a, status);
1191 FloatParts pb = float64_unpack_canonical(b, status);
1192 FloatParts pr = div_floats(pa, pb, status);
1194 return float64_round_pack_canonical(pr, status);
1198 * Rounds the floating-point value `a' to an integer, and returns the
1199 * result as a floating-point value. The operation is performed
1200 * according to the IEC/IEEE Standard for Binary Floating-Point
1201 * Arithmetic.
1204 static FloatParts round_to_int(FloatParts a, int rounding_mode, float_status *s)
1206 if (is_nan(a.cls)) {
1207 return return_nan(a, s);
1210 switch (a.cls) {
1211 case float_class_zero:
1212 case float_class_inf:
1213 case float_class_qnan:
1214 /* already "integral" */
1215 break;
1216 case float_class_normal:
1217 if (a.exp >= DECOMPOSED_BINARY_POINT) {
1218 /* already integral */
1219 break;
1221 if (a.exp < 0) {
1222 bool one;
1223 /* all fractional */
1224 s->float_exception_flags |= float_flag_inexact;
1225 switch (rounding_mode) {
1226 case float_round_nearest_even:
1227 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
1228 break;
1229 case float_round_ties_away:
1230 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
1231 break;
1232 case float_round_to_zero:
1233 one = false;
1234 break;
1235 case float_round_up:
1236 one = !a.sign;
1237 break;
1238 case float_round_down:
1239 one = a.sign;
1240 break;
1241 default:
1242 g_assert_not_reached();
1245 if (one) {
1246 a.frac = DECOMPOSED_IMPLICIT_BIT;
1247 a.exp = 0;
1248 } else {
1249 a.cls = float_class_zero;
1251 } else {
1252 uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
1253 uint64_t frac_lsbm1 = frac_lsb >> 1;
1254 uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
1255 uint64_t rnd_mask = rnd_even_mask >> 1;
1256 uint64_t inc;
1258 switch (rounding_mode) {
1259 case float_round_nearest_even:
1260 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
1261 break;
1262 case float_round_ties_away:
1263 inc = frac_lsbm1;
1264 break;
1265 case float_round_to_zero:
1266 inc = 0;
1267 break;
1268 case float_round_up:
1269 inc = a.sign ? 0 : rnd_mask;
1270 break;
1271 case float_round_down:
1272 inc = a.sign ? rnd_mask : 0;
1273 break;
1274 default:
1275 g_assert_not_reached();
1278 if (a.frac & rnd_mask) {
1279 s->float_exception_flags |= float_flag_inexact;
1280 a.frac += inc;
1281 a.frac &= ~rnd_mask;
1282 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
1283 a.frac >>= 1;
1284 a.exp++;
1288 break;
1289 default:
1290 g_assert_not_reached();
1292 return a;
1295 float16 float16_round_to_int(float16 a, float_status *s)
1297 FloatParts pa = float16_unpack_canonical(a, s);
1298 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1299 return float16_round_pack_canonical(pr, s);
1302 float32 float32_round_to_int(float32 a, float_status *s)
1304 FloatParts pa = float32_unpack_canonical(a, s);
1305 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1306 return float32_round_pack_canonical(pr, s);
1309 float64 float64_round_to_int(float64 a, float_status *s)
1311 FloatParts pa = float64_unpack_canonical(a, s);
1312 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1313 return float64_round_pack_canonical(pr, s);
1316 float64 float64_trunc_to_int(float64 a, float_status *s)
1318 FloatParts pa = float64_unpack_canonical(a, s);
1319 FloatParts pr = round_to_int(pa, float_round_to_zero, s);
1320 return float64_round_pack_canonical(pr, s);
1324 * Returns the result of converting the floating-point value `a' to
1325 * the two's complement integer format. The conversion is performed
1326 * according to the IEC/IEEE Standard for Binary Floating-Point
1327 * Arithmetic---which means in particular that the conversion is
1328 * rounded according to the current rounding mode. If `a' is a NaN,
1329 * the largest positive integer is returned. Otherwise, if the
1330 * conversion overflows, the largest integer with the same sign as `a'
1331 * is returned.
1334 static int64_t round_to_int_and_pack(FloatParts in, int rmode,
1335 int64_t min, int64_t max,
1336 float_status *s)
1338 uint64_t r;
1339 int orig_flags = get_float_exception_flags(s);
1340 FloatParts p = round_to_int(in, rmode, s);
1342 switch (p.cls) {
1343 case float_class_snan:
1344 case float_class_qnan:
1345 case float_class_dnan:
1346 case float_class_msnan:
1347 s->float_exception_flags = orig_flags | float_flag_invalid;
1348 return max;
1349 case float_class_inf:
1350 s->float_exception_flags = orig_flags | float_flag_invalid;
1351 return p.sign ? min : max;
1352 case float_class_zero:
1353 return 0;
1354 case float_class_normal:
1355 if (p.exp < DECOMPOSED_BINARY_POINT) {
1356 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1357 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1358 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1359 } else {
1360 r = UINT64_MAX;
1362 if (p.sign) {
1363 if (r < -(uint64_t) min) {
1364 return -r;
1365 } else {
1366 s->float_exception_flags = orig_flags | float_flag_invalid;
1367 return min;
1369 } else {
1370 if (r < max) {
1371 return r;
1372 } else {
1373 s->float_exception_flags = orig_flags | float_flag_invalid;
1374 return max;
1377 default:
1378 g_assert_not_reached();
1382 #define FLOAT_TO_INT(fsz, isz) \
1383 int ## isz ## _t float ## fsz ## _to_int ## isz(float ## fsz a, \
1384 float_status *s) \
1386 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1387 return round_to_int_and_pack(p, s->float_rounding_mode, \
1388 INT ## isz ## _MIN, INT ## isz ## _MAX,\
1389 s); \
1392 int ## isz ## _t float ## fsz ## _to_int ## isz ## _round_to_zero \
1393 (float ## fsz a, float_status *s) \
1395 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1396 return round_to_int_and_pack(p, float_round_to_zero, \
1397 INT ## isz ## _MIN, INT ## isz ## _MAX,\
1398 s); \
1401 FLOAT_TO_INT(16, 16)
1402 FLOAT_TO_INT(16, 32)
1403 FLOAT_TO_INT(16, 64)
1405 FLOAT_TO_INT(32, 16)
1406 FLOAT_TO_INT(32, 32)
1407 FLOAT_TO_INT(32, 64)
1409 FLOAT_TO_INT(64, 16)
1410 FLOAT_TO_INT(64, 32)
1411 FLOAT_TO_INT(64, 64)
1413 #undef FLOAT_TO_INT
1416 * Returns the result of converting the floating-point value `a' to
1417 * the unsigned integer format. The conversion is performed according
1418 * to the IEC/IEEE Standard for Binary Floating-Point
1419 * Arithmetic---which means in particular that the conversion is
1420 * rounded according to the current rounding mode. If `a' is a NaN,
1421 * the largest unsigned integer is returned. Otherwise, if the
1422 * conversion overflows, the largest unsigned integer is returned. If
1423 * the 'a' is negative, the result is rounded and zero is returned;
1424 * values that do not round to zero will raise the inexact exception
1425 * flag.
1428 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, uint64_t max,
1429 float_status *s)
1431 int orig_flags = get_float_exception_flags(s);
1432 FloatParts p = round_to_int(in, rmode, s);
1434 switch (p.cls) {
1435 case float_class_snan:
1436 case float_class_qnan:
1437 case float_class_dnan:
1438 case float_class_msnan:
1439 s->float_exception_flags = orig_flags | float_flag_invalid;
1440 return max;
1441 case float_class_inf:
1442 s->float_exception_flags = orig_flags | float_flag_invalid;
1443 return p.sign ? 0 : max;
1444 case float_class_zero:
1445 return 0;
1446 case float_class_normal:
1448 uint64_t r;
1449 if (p.sign) {
1450 s->float_exception_flags = orig_flags | float_flag_invalid;
1451 return 0;
1454 if (p.exp < DECOMPOSED_BINARY_POINT) {
1455 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1456 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1457 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1458 } else {
1459 s->float_exception_flags = orig_flags | float_flag_invalid;
1460 return max;
1463 /* For uint64 this will never trip, but if p.exp is too large
1464 * to shift a decomposed fraction we shall have exited via the
1465 * 3rd leg above.
1467 if (r > max) {
1468 s->float_exception_flags = orig_flags | float_flag_invalid;
1469 return max;
1470 } else {
1471 return r;
1474 default:
1475 g_assert_not_reached();
1479 #define FLOAT_TO_UINT(fsz, isz) \
1480 uint ## isz ## _t float ## fsz ## _to_uint ## isz(float ## fsz a, \
1481 float_status *s) \
1483 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1484 return round_to_uint_and_pack(p, s->float_rounding_mode, \
1485 UINT ## isz ## _MAX, s); \
1488 uint ## isz ## _t float ## fsz ## _to_uint ## isz ## _round_to_zero \
1489 (float ## fsz a, float_status *s) \
1491 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1492 return round_to_uint_and_pack(p, float_round_to_zero, \
1493 UINT ## isz ## _MAX, s); \
1496 FLOAT_TO_UINT(16, 16)
1497 FLOAT_TO_UINT(16, 32)
1498 FLOAT_TO_UINT(16, 64)
1500 FLOAT_TO_UINT(32, 16)
1501 FLOAT_TO_UINT(32, 32)
1502 FLOAT_TO_UINT(32, 64)
1504 FLOAT_TO_UINT(64, 16)
1505 FLOAT_TO_UINT(64, 32)
1506 FLOAT_TO_UINT(64, 64)
1508 #undef FLOAT_TO_UINT
1511 * Integer to float conversions
1513 * Returns the result of converting the two's complement integer `a'
1514 * to the floating-point format. The conversion is performed according
1515 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1518 static FloatParts int_to_float(int64_t a, float_status *status)
1520 FloatParts r;
1521 if (a == 0) {
1522 r.cls = float_class_zero;
1523 r.sign = false;
1524 } else if (a == (1ULL << 63)) {
1525 r.cls = float_class_normal;
1526 r.sign = true;
1527 r.frac = DECOMPOSED_IMPLICIT_BIT;
1528 r.exp = 63;
1529 } else {
1530 uint64_t f;
1531 if (a < 0) {
1532 f = -a;
1533 r.sign = true;
1534 } else {
1535 f = a;
1536 r.sign = false;
1538 int shift = clz64(f) - 1;
1539 r.cls = float_class_normal;
1540 r.exp = (DECOMPOSED_BINARY_POINT - shift);
1541 r.frac = f << shift;
1544 return r;
1547 float16 int64_to_float16(int64_t a, float_status *status)
1549 FloatParts pa = int_to_float(a, status);
1550 return float16_round_pack_canonical(pa, status);
1553 float16 int32_to_float16(int32_t a, float_status *status)
1555 return int64_to_float16(a, status);
1558 float16 int16_to_float16(int16_t a, float_status *status)
1560 return int64_to_float16(a, status);
1563 float32 int64_to_float32(int64_t a, float_status *status)
1565 FloatParts pa = int_to_float(a, status);
1566 return float32_round_pack_canonical(pa, status);
1569 float32 int32_to_float32(int32_t a, float_status *status)
1571 return int64_to_float32(a, status);
1574 float32 int16_to_float32(int16_t a, float_status *status)
1576 return int64_to_float32(a, status);
1579 float64 int64_to_float64(int64_t a, float_status *status)
1581 FloatParts pa = int_to_float(a, status);
1582 return float64_round_pack_canonical(pa, status);
1585 float64 int32_to_float64(int32_t a, float_status *status)
1587 return int64_to_float64(a, status);
1590 float64 int16_to_float64(int16_t a, float_status *status)
1592 return int64_to_float64(a, status);
1597 * Unsigned Integer to float conversions
1599 * Returns the result of converting the unsigned integer `a' to the
1600 * floating-point format. The conversion is performed according to the
1601 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1604 static FloatParts uint_to_float(uint64_t a, float_status *status)
1606 FloatParts r = { .sign = false};
1608 if (a == 0) {
1609 r.cls = float_class_zero;
1610 } else {
1611 int spare_bits = clz64(a) - 1;
1612 r.cls = float_class_normal;
1613 r.exp = DECOMPOSED_BINARY_POINT - spare_bits;
1614 if (spare_bits < 0) {
1615 shift64RightJamming(a, -spare_bits, &a);
1616 r.frac = a;
1617 } else {
1618 r.frac = a << spare_bits;
1622 return r;
1625 float16 uint64_to_float16(uint64_t a, float_status *status)
1627 FloatParts pa = uint_to_float(a, status);
1628 return float16_round_pack_canonical(pa, status);
1631 float16 uint32_to_float16(uint32_t a, float_status *status)
1633 return uint64_to_float16(a, status);
1636 float16 uint16_to_float16(uint16_t a, float_status *status)
1638 return uint64_to_float16(a, status);
1641 float32 uint64_to_float32(uint64_t a, float_status *status)
1643 FloatParts pa = uint_to_float(a, status);
1644 return float32_round_pack_canonical(pa, status);
1647 float32 uint32_to_float32(uint32_t a, float_status *status)
1649 return uint64_to_float32(a, status);
1652 float32 uint16_to_float32(uint16_t a, float_status *status)
1654 return uint64_to_float32(a, status);
1657 float64 uint64_to_float64(uint64_t a, float_status *status)
1659 FloatParts pa = uint_to_float(a, status);
1660 return float64_round_pack_canonical(pa, status);
1663 float64 uint32_to_float64(uint32_t a, float_status *status)
1665 return uint64_to_float64(a, status);
1668 float64 uint16_to_float64(uint16_t a, float_status *status)
1670 return uint64_to_float64(a, status);
1673 /* Float Min/Max */
1674 /* min() and max() functions. These can't be implemented as
1675 * 'compare and pick one input' because that would mishandle
1676 * NaNs and +0 vs -0.
1678 * minnum() and maxnum() functions. These are similar to the min()
1679 * and max() functions but if one of the arguments is a QNaN and
1680 * the other is numerical then the numerical argument is returned.
1681 * SNaNs will get quietened before being returned.
1682 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
1683 * and maxNum() operations. min() and max() are the typical min/max
1684 * semantics provided by many CPUs which predate that specification.
1686 * minnummag() and maxnummag() functions correspond to minNumMag()
1687 * and minNumMag() from the IEEE-754 2008.
1689 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin,
1690 bool ieee, bool ismag, float_status *s)
1692 if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
1693 if (ieee) {
1694 /* Takes two floating-point values `a' and `b', one of
1695 * which is a NaN, and returns the appropriate NaN
1696 * result. If either `a' or `b' is a signaling NaN,
1697 * the invalid exception is raised.
1699 if (is_snan(a.cls) || is_snan(b.cls)) {
1700 return pick_nan(a, b, s);
1701 } else if (is_nan(a.cls) && !is_nan(b.cls)) {
1702 return b;
1703 } else if (is_nan(b.cls) && !is_nan(a.cls)) {
1704 return a;
1707 return pick_nan(a, b, s);
1708 } else {
1709 int a_exp, b_exp;
1711 switch (a.cls) {
1712 case float_class_normal:
1713 a_exp = a.exp;
1714 break;
1715 case float_class_inf:
1716 a_exp = INT_MAX;
1717 break;
1718 case float_class_zero:
1719 a_exp = INT_MIN;
1720 break;
1721 default:
1722 g_assert_not_reached();
1723 break;
1725 switch (b.cls) {
1726 case float_class_normal:
1727 b_exp = b.exp;
1728 break;
1729 case float_class_inf:
1730 b_exp = INT_MAX;
1731 break;
1732 case float_class_zero:
1733 b_exp = INT_MIN;
1734 break;
1735 default:
1736 g_assert_not_reached();
1737 break;
1740 if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
1741 bool a_less = a_exp < b_exp;
1742 if (a_exp == b_exp) {
1743 a_less = a.frac < b.frac;
1745 return a_less ^ ismin ? b : a;
1748 if (a.sign == b.sign) {
1749 bool a_less = a_exp < b_exp;
1750 if (a_exp == b_exp) {
1751 a_less = a.frac < b.frac;
1753 return a.sign ^ a_less ^ ismin ? b : a;
1754 } else {
1755 return a.sign ^ ismin ? b : a;
1760 #define MINMAX(sz, name, ismin, isiee, ismag) \
1761 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \
1762 float_status *s) \
1764 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
1765 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
1766 FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \
1768 return float ## sz ## _round_pack_canonical(pr, s); \
1771 MINMAX(16, min, true, false, false)
1772 MINMAX(16, minnum, true, true, false)
1773 MINMAX(16, minnummag, true, true, true)
1774 MINMAX(16, max, false, false, false)
1775 MINMAX(16, maxnum, false, true, false)
1776 MINMAX(16, maxnummag, false, true, true)
1778 MINMAX(32, min, true, false, false)
1779 MINMAX(32, minnum, true, true, false)
1780 MINMAX(32, minnummag, true, true, true)
1781 MINMAX(32, max, false, false, false)
1782 MINMAX(32, maxnum, false, true, false)
1783 MINMAX(32, maxnummag, false, true, true)
1785 MINMAX(64, min, true, false, false)
1786 MINMAX(64, minnum, true, true, false)
1787 MINMAX(64, minnummag, true, true, true)
1788 MINMAX(64, max, false, false, false)
1789 MINMAX(64, maxnum, false, true, false)
1790 MINMAX(64, maxnummag, false, true, true)
1792 #undef MINMAX
1794 /* Floating point compare */
1795 static int compare_floats(FloatParts a, FloatParts b, bool is_quiet,
1796 float_status *s)
1798 if (is_nan(a.cls) || is_nan(b.cls)) {
1799 if (!is_quiet ||
1800 a.cls == float_class_snan ||
1801 b.cls == float_class_snan) {
1802 s->float_exception_flags |= float_flag_invalid;
1804 return float_relation_unordered;
1807 if (a.cls == float_class_zero) {
1808 if (b.cls == float_class_zero) {
1809 return float_relation_equal;
1811 return b.sign ? float_relation_greater : float_relation_less;
1812 } else if (b.cls == float_class_zero) {
1813 return a.sign ? float_relation_less : float_relation_greater;
1816 /* The only really important thing about infinity is its sign. If
1817 * both are infinities the sign marks the smallest of the two.
1819 if (a.cls == float_class_inf) {
1820 if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
1821 return float_relation_equal;
1823 return a.sign ? float_relation_less : float_relation_greater;
1824 } else if (b.cls == float_class_inf) {
1825 return b.sign ? float_relation_greater : float_relation_less;
1828 if (a.sign != b.sign) {
1829 return a.sign ? float_relation_less : float_relation_greater;
1832 if (a.exp == b.exp) {
1833 if (a.frac == b.frac) {
1834 return float_relation_equal;
1836 if (a.sign) {
1837 return a.frac > b.frac ?
1838 float_relation_less : float_relation_greater;
1839 } else {
1840 return a.frac > b.frac ?
1841 float_relation_greater : float_relation_less;
1843 } else {
1844 if (a.sign) {
1845 return a.exp > b.exp ? float_relation_less : float_relation_greater;
1846 } else {
1847 return a.exp > b.exp ? float_relation_greater : float_relation_less;
1852 #define COMPARE(sz) \
1853 int float ## sz ## _compare(float ## sz a, float ## sz b, \
1854 float_status *s) \
1856 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
1857 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
1858 return compare_floats(pa, pb, false, s); \
1860 int float ## sz ## _compare_quiet(float ## sz a, float ## sz b, \
1861 float_status *s) \
1863 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
1864 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
1865 return compare_floats(pa, pb, true, s); \
1868 COMPARE(16)
1869 COMPARE(32)
1870 COMPARE(64)
1872 #undef COMPARE
1874 /* Multiply A by 2 raised to the power N. */
1875 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s)
1877 if (unlikely(is_nan(a.cls))) {
1878 return return_nan(a, s);
1880 if (a.cls == float_class_normal) {
1881 /* The largest float type (even though not supported by FloatParts)
1882 * is float128, which has a 15 bit exponent. Bounding N to 16 bits
1883 * still allows rounding to infinity, without allowing overflow
1884 * within the int32_t that backs FloatParts.exp.
1886 n = MIN(MAX(n, -0x10000), 0x10000);
1887 a.exp += n;
1889 return a;
1892 float16 float16_scalbn(float16 a, int n, float_status *status)
1894 FloatParts pa = float16_unpack_canonical(a, status);
1895 FloatParts pr = scalbn_decomposed(pa, n, status);
1896 return float16_round_pack_canonical(pr, status);
1899 float32 float32_scalbn(float32 a, int n, float_status *status)
1901 FloatParts pa = float32_unpack_canonical(a, status);
1902 FloatParts pr = scalbn_decomposed(pa, n, status);
1903 return float32_round_pack_canonical(pr, status);
1906 float64 float64_scalbn(float64 a, int n, float_status *status)
1908 FloatParts pa = float64_unpack_canonical(a, status);
1909 FloatParts pr = scalbn_decomposed(pa, n, status);
1910 return float64_round_pack_canonical(pr, status);
1914 * Square Root
1916 * The old softfloat code did an approximation step before zeroing in
1917 * on the final result. However for simpleness we just compute the
1918 * square root by iterating down from the implicit bit to enough extra
1919 * bits to ensure we get a correctly rounded result.
1921 * This does mean however the calculation is slower than before,
1922 * especially for 64 bit floats.
1925 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p)
1927 uint64_t a_frac, r_frac, s_frac;
1928 int bit, last_bit;
1930 if (is_nan(a.cls)) {
1931 return return_nan(a, s);
1933 if (a.cls == float_class_zero) {
1934 return a; /* sqrt(+-0) = +-0 */
1936 if (a.sign) {
1937 s->float_exception_flags |= float_flag_invalid;
1938 a.cls = float_class_dnan;
1939 return a;
1941 if (a.cls == float_class_inf) {
1942 return a; /* sqrt(+inf) = +inf */
1945 assert(a.cls == float_class_normal);
1947 /* We need two overflow bits at the top. Adding room for that is a
1948 * right shift. If the exponent is odd, we can discard the low bit
1949 * by multiplying the fraction by 2; that's a left shift. Combine
1950 * those and we shift right if the exponent is even.
1952 a_frac = a.frac;
1953 if (!(a.exp & 1)) {
1954 a_frac >>= 1;
1956 a.exp >>= 1;
1958 /* Bit-by-bit computation of sqrt. */
1959 r_frac = 0;
1960 s_frac = 0;
1962 /* Iterate from implicit bit down to the 3 extra bits to compute a
1963 * properly rounded result. Remember we've inserted one more bit
1964 * at the top, so these positions are one less.
1966 bit = DECOMPOSED_BINARY_POINT - 1;
1967 last_bit = MAX(p->frac_shift - 4, 0);
1968 do {
1969 uint64_t q = 1ULL << bit;
1970 uint64_t t_frac = s_frac + q;
1971 if (t_frac <= a_frac) {
1972 s_frac = t_frac + q;
1973 a_frac -= t_frac;
1974 r_frac += q;
1976 a_frac <<= 1;
1977 } while (--bit >= last_bit);
1979 /* Undo the right shift done above. If there is any remaining
1980 * fraction, the result is inexact. Set the sticky bit.
1982 a.frac = (r_frac << 1) + (a_frac != 0);
1984 return a;
1987 float16 __attribute__((flatten)) float16_sqrt(float16 a, float_status *status)
1989 FloatParts pa = float16_unpack_canonical(a, status);
1990 FloatParts pr = sqrt_float(pa, status, &float16_params);
1991 return float16_round_pack_canonical(pr, status);
1994 float32 __attribute__((flatten)) float32_sqrt(float32 a, float_status *status)
1996 FloatParts pa = float32_unpack_canonical(a, status);
1997 FloatParts pr = sqrt_float(pa, status, &float32_params);
1998 return float32_round_pack_canonical(pr, status);
2001 float64 __attribute__((flatten)) float64_sqrt(float64 a, float_status *status)
2003 FloatParts pa = float64_unpack_canonical(a, status);
2004 FloatParts pr = sqrt_float(pa, status, &float64_params);
2005 return float64_round_pack_canonical(pr, status);
2009 /*----------------------------------------------------------------------------
2010 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
2011 | and 7, and returns the properly rounded 32-bit integer corresponding to the
2012 | input. If `zSign' is 1, the input is negated before being converted to an
2013 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
2014 | is simply rounded to an integer, with the inexact exception raised if the
2015 | input cannot be represented exactly as an integer. However, if the fixed-
2016 | point input is too large, the invalid exception is raised and the largest
2017 | positive or negative integer is returned.
2018 *----------------------------------------------------------------------------*/
2020 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
2022 int8_t roundingMode;
2023 flag roundNearestEven;
2024 int8_t roundIncrement, roundBits;
2025 int32_t z;
2027 roundingMode = status->float_rounding_mode;
2028 roundNearestEven = ( roundingMode == float_round_nearest_even );
2029 switch (roundingMode) {
2030 case float_round_nearest_even:
2031 case float_round_ties_away:
2032 roundIncrement = 0x40;
2033 break;
2034 case float_round_to_zero:
2035 roundIncrement = 0;
2036 break;
2037 case float_round_up:
2038 roundIncrement = zSign ? 0 : 0x7f;
2039 break;
2040 case float_round_down:
2041 roundIncrement = zSign ? 0x7f : 0;
2042 break;
2043 default:
2044 abort();
2046 roundBits = absZ & 0x7F;
2047 absZ = ( absZ + roundIncrement )>>7;
2048 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2049 z = absZ;
2050 if ( zSign ) z = - z;
2051 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
2052 float_raise(float_flag_invalid, status);
2053 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2055 if (roundBits) {
2056 status->float_exception_flags |= float_flag_inexact;
2058 return z;
2062 /*----------------------------------------------------------------------------
2063 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2064 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2065 | and returns the properly rounded 64-bit integer corresponding to the input.
2066 | If `zSign' is 1, the input is negated before being converted to an integer.
2067 | Ordinarily, the fixed-point input is simply rounded to an integer, with
2068 | the inexact exception raised if the input cannot be represented exactly as
2069 | an integer. However, if the fixed-point input is too large, the invalid
2070 | exception is raised and the largest positive or negative integer is
2071 | returned.
2072 *----------------------------------------------------------------------------*/
2074 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
2075 float_status *status)
2077 int8_t roundingMode;
2078 flag roundNearestEven, increment;
2079 int64_t z;
2081 roundingMode = status->float_rounding_mode;
2082 roundNearestEven = ( roundingMode == float_round_nearest_even );
2083 switch (roundingMode) {
2084 case float_round_nearest_even:
2085 case float_round_ties_away:
2086 increment = ((int64_t) absZ1 < 0);
2087 break;
2088 case float_round_to_zero:
2089 increment = 0;
2090 break;
2091 case float_round_up:
2092 increment = !zSign && absZ1;
2093 break;
2094 case float_round_down:
2095 increment = zSign && absZ1;
2096 break;
2097 default:
2098 abort();
2100 if ( increment ) {
2101 ++absZ0;
2102 if ( absZ0 == 0 ) goto overflow;
2103 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
2105 z = absZ0;
2106 if ( zSign ) z = - z;
2107 if ( z && ( ( z < 0 ) ^ zSign ) ) {
2108 overflow:
2109 float_raise(float_flag_invalid, status);
2110 return
2111 zSign ? (int64_t) LIT64( 0x8000000000000000 )
2112 : LIT64( 0x7FFFFFFFFFFFFFFF );
2114 if (absZ1) {
2115 status->float_exception_flags |= float_flag_inexact;
2117 return z;
2121 /*----------------------------------------------------------------------------
2122 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2123 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2124 | and returns the properly rounded 64-bit unsigned integer corresponding to the
2125 | input. Ordinarily, the fixed-point input is simply rounded to an integer,
2126 | with the inexact exception raised if the input cannot be represented exactly
2127 | as an integer. However, if the fixed-point input is too large, the invalid
2128 | exception is raised and the largest unsigned integer is returned.
2129 *----------------------------------------------------------------------------*/
2131 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
2132 uint64_t absZ1, float_status *status)
2134 int8_t roundingMode;
2135 flag roundNearestEven, increment;
2137 roundingMode = status->float_rounding_mode;
2138 roundNearestEven = (roundingMode == float_round_nearest_even);
2139 switch (roundingMode) {
2140 case float_round_nearest_even:
2141 case float_round_ties_away:
2142 increment = ((int64_t)absZ1 < 0);
2143 break;
2144 case float_round_to_zero:
2145 increment = 0;
2146 break;
2147 case float_round_up:
2148 increment = !zSign && absZ1;
2149 break;
2150 case float_round_down:
2151 increment = zSign && absZ1;
2152 break;
2153 default:
2154 abort();
2156 if (increment) {
2157 ++absZ0;
2158 if (absZ0 == 0) {
2159 float_raise(float_flag_invalid, status);
2160 return LIT64(0xFFFFFFFFFFFFFFFF);
2162 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
2165 if (zSign && absZ0) {
2166 float_raise(float_flag_invalid, status);
2167 return 0;
2170 if (absZ1) {
2171 status->float_exception_flags |= float_flag_inexact;
2173 return absZ0;
2176 /*----------------------------------------------------------------------------
2177 | If `a' is denormal and we are in flush-to-zero mode then set the
2178 | input-denormal exception and return zero. Otherwise just return the value.
2179 *----------------------------------------------------------------------------*/
2180 float32 float32_squash_input_denormal(float32 a, float_status *status)
2182 if (status->flush_inputs_to_zero) {
2183 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
2184 float_raise(float_flag_input_denormal, status);
2185 return make_float32(float32_val(a) & 0x80000000);
2188 return a;
2191 /*----------------------------------------------------------------------------
2192 | Normalizes the subnormal single-precision floating-point value represented
2193 | by the denormalized significand `aSig'. The normalized exponent and
2194 | significand are stored at the locations pointed to by `zExpPtr' and
2195 | `zSigPtr', respectively.
2196 *----------------------------------------------------------------------------*/
2198 static void
2199 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
2201 int8_t shiftCount;
2203 shiftCount = countLeadingZeros32( aSig ) - 8;
2204 *zSigPtr = aSig<<shiftCount;
2205 *zExpPtr = 1 - shiftCount;
2209 /*----------------------------------------------------------------------------
2210 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2211 | and significand `zSig', and returns the proper single-precision floating-
2212 | point value corresponding to the abstract input. Ordinarily, the abstract
2213 | value is simply rounded and packed into the single-precision format, with
2214 | the inexact exception raised if the abstract input cannot be represented
2215 | exactly. However, if the abstract value is too large, the overflow and
2216 | inexact exceptions are raised and an infinity or maximal finite value is
2217 | returned. If the abstract value is too small, the input value is rounded to
2218 | a subnormal number, and the underflow and inexact exceptions are raised if
2219 | the abstract input cannot be represented exactly as a subnormal single-
2220 | precision floating-point number.
2221 | The input significand `zSig' has its binary point between bits 30
2222 | and 29, which is 7 bits to the left of the usual location. This shifted
2223 | significand must be normalized or smaller. If `zSig' is not normalized,
2224 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2225 | and it must not require rounding. In the usual case that `zSig' is
2226 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2227 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2228 | Binary Floating-Point Arithmetic.
2229 *----------------------------------------------------------------------------*/
2231 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
2232 float_status *status)
2234 int8_t roundingMode;
2235 flag roundNearestEven;
2236 int8_t roundIncrement, roundBits;
2237 flag isTiny;
2239 roundingMode = status->float_rounding_mode;
2240 roundNearestEven = ( roundingMode == float_round_nearest_even );
2241 switch (roundingMode) {
2242 case float_round_nearest_even:
2243 case float_round_ties_away:
2244 roundIncrement = 0x40;
2245 break;
2246 case float_round_to_zero:
2247 roundIncrement = 0;
2248 break;
2249 case float_round_up:
2250 roundIncrement = zSign ? 0 : 0x7f;
2251 break;
2252 case float_round_down:
2253 roundIncrement = zSign ? 0x7f : 0;
2254 break;
2255 default:
2256 abort();
2257 break;
2259 roundBits = zSig & 0x7F;
2260 if ( 0xFD <= (uint16_t) zExp ) {
2261 if ( ( 0xFD < zExp )
2262 || ( ( zExp == 0xFD )
2263 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
2265 float_raise(float_flag_overflow | float_flag_inexact, status);
2266 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
2268 if ( zExp < 0 ) {
2269 if (status->flush_to_zero) {
2270 float_raise(float_flag_output_denormal, status);
2271 return packFloat32(zSign, 0, 0);
2273 isTiny =
2274 (status->float_detect_tininess
2275 == float_tininess_before_rounding)
2276 || ( zExp < -1 )
2277 || ( zSig + roundIncrement < 0x80000000 );
2278 shift32RightJamming( zSig, - zExp, &zSig );
2279 zExp = 0;
2280 roundBits = zSig & 0x7F;
2281 if (isTiny && roundBits) {
2282 float_raise(float_flag_underflow, status);
2286 if (roundBits) {
2287 status->float_exception_flags |= float_flag_inexact;
2289 zSig = ( zSig + roundIncrement )>>7;
2290 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2291 if ( zSig == 0 ) zExp = 0;
2292 return packFloat32( zSign, zExp, zSig );
2296 /*----------------------------------------------------------------------------
2297 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2298 | and significand `zSig', and returns the proper single-precision floating-
2299 | point value corresponding to the abstract input. This routine is just like
2300 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
2301 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2302 | floating-point exponent.
2303 *----------------------------------------------------------------------------*/
2305 static float32
2306 normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
2307 float_status *status)
2309 int8_t shiftCount;
2311 shiftCount = countLeadingZeros32( zSig ) - 1;
2312 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
2313 status);
2317 /*----------------------------------------------------------------------------
2318 | If `a' is denormal and we are in flush-to-zero mode then set the
2319 | input-denormal exception and return zero. Otherwise just return the value.
2320 *----------------------------------------------------------------------------*/
2321 float64 float64_squash_input_denormal(float64 a, float_status *status)
2323 if (status->flush_inputs_to_zero) {
2324 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
2325 float_raise(float_flag_input_denormal, status);
2326 return make_float64(float64_val(a) & (1ULL << 63));
2329 return a;
2332 /*----------------------------------------------------------------------------
2333 | Normalizes the subnormal double-precision floating-point value represented
2334 | by the denormalized significand `aSig'. The normalized exponent and
2335 | significand are stored at the locations pointed to by `zExpPtr' and
2336 | `zSigPtr', respectively.
2337 *----------------------------------------------------------------------------*/
2339 static void
2340 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
2342 int8_t shiftCount;
2344 shiftCount = countLeadingZeros64( aSig ) - 11;
2345 *zSigPtr = aSig<<shiftCount;
2346 *zExpPtr = 1 - shiftCount;
2350 /*----------------------------------------------------------------------------
2351 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2352 | double-precision floating-point value, returning the result. After being
2353 | shifted into the proper positions, the three fields are simply added
2354 | together to form the result. This means that any integer portion of `zSig'
2355 | will be added into the exponent. Since a properly normalized significand
2356 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2357 | than the desired result exponent whenever `zSig' is a complete, normalized
2358 | significand.
2359 *----------------------------------------------------------------------------*/
2361 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
2364 return make_float64(
2365 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
2369 /*----------------------------------------------------------------------------
2370 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2371 | and significand `zSig', and returns the proper double-precision floating-
2372 | point value corresponding to the abstract input. Ordinarily, the abstract
2373 | value is simply rounded and packed into the double-precision format, with
2374 | the inexact exception raised if the abstract input cannot be represented
2375 | exactly. However, if the abstract value is too large, the overflow and
2376 | inexact exceptions are raised and an infinity or maximal finite value is
2377 | returned. If the abstract value is too small, the input value is rounded to
2378 | a subnormal number, and the underflow and inexact exceptions are raised if
2379 | the abstract input cannot be represented exactly as a subnormal double-
2380 | precision floating-point number.
2381 | The input significand `zSig' has its binary point between bits 62
2382 | and 61, which is 10 bits to the left of the usual location. This shifted
2383 | significand must be normalized or smaller. If `zSig' is not normalized,
2384 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2385 | and it must not require rounding. In the usual case that `zSig' is
2386 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2387 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2388 | Binary Floating-Point Arithmetic.
2389 *----------------------------------------------------------------------------*/
2391 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2392 float_status *status)
2394 int8_t roundingMode;
2395 flag roundNearestEven;
2396 int roundIncrement, roundBits;
2397 flag isTiny;
2399 roundingMode = status->float_rounding_mode;
2400 roundNearestEven = ( roundingMode == float_round_nearest_even );
2401 switch (roundingMode) {
2402 case float_round_nearest_even:
2403 case float_round_ties_away:
2404 roundIncrement = 0x200;
2405 break;
2406 case float_round_to_zero:
2407 roundIncrement = 0;
2408 break;
2409 case float_round_up:
2410 roundIncrement = zSign ? 0 : 0x3ff;
2411 break;
2412 case float_round_down:
2413 roundIncrement = zSign ? 0x3ff : 0;
2414 break;
2415 case float_round_to_odd:
2416 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2417 break;
2418 default:
2419 abort();
2421 roundBits = zSig & 0x3FF;
2422 if ( 0x7FD <= (uint16_t) zExp ) {
2423 if ( ( 0x7FD < zExp )
2424 || ( ( zExp == 0x7FD )
2425 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
2427 bool overflow_to_inf = roundingMode != float_round_to_odd &&
2428 roundIncrement != 0;
2429 float_raise(float_flag_overflow | float_flag_inexact, status);
2430 return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
2432 if ( zExp < 0 ) {
2433 if (status->flush_to_zero) {
2434 float_raise(float_flag_output_denormal, status);
2435 return packFloat64(zSign, 0, 0);
2437 isTiny =
2438 (status->float_detect_tininess
2439 == float_tininess_before_rounding)
2440 || ( zExp < -1 )
2441 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
2442 shift64RightJamming( zSig, - zExp, &zSig );
2443 zExp = 0;
2444 roundBits = zSig & 0x3FF;
2445 if (isTiny && roundBits) {
2446 float_raise(float_flag_underflow, status);
2448 if (roundingMode == float_round_to_odd) {
2450 * For round-to-odd case, the roundIncrement depends on
2451 * zSig which just changed.
2453 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2457 if (roundBits) {
2458 status->float_exception_flags |= float_flag_inexact;
2460 zSig = ( zSig + roundIncrement )>>10;
2461 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
2462 if ( zSig == 0 ) zExp = 0;
2463 return packFloat64( zSign, zExp, zSig );
2467 /*----------------------------------------------------------------------------
2468 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2469 | and significand `zSig', and returns the proper double-precision floating-
2470 | point value corresponding to the abstract input. This routine is just like
2471 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
2472 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2473 | floating-point exponent.
2474 *----------------------------------------------------------------------------*/
2476 static float64
2477 normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2478 float_status *status)
2480 int8_t shiftCount;
2482 shiftCount = countLeadingZeros64( zSig ) - 1;
2483 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
2484 status);
2488 /*----------------------------------------------------------------------------
2489 | Normalizes the subnormal extended double-precision floating-point value
2490 | represented by the denormalized significand `aSig'. The normalized exponent
2491 | and significand are stored at the locations pointed to by `zExpPtr' and
2492 | `zSigPtr', respectively.
2493 *----------------------------------------------------------------------------*/
2495 void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
2496 uint64_t *zSigPtr)
2498 int8_t shiftCount;
2500 shiftCount = countLeadingZeros64( aSig );
2501 *zSigPtr = aSig<<shiftCount;
2502 *zExpPtr = 1 - shiftCount;
2505 /*----------------------------------------------------------------------------
2506 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2507 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
2508 | and returns the proper extended double-precision floating-point value
2509 | corresponding to the abstract input. Ordinarily, the abstract value is
2510 | rounded and packed into the extended double-precision format, with the
2511 | inexact exception raised if the abstract input cannot be represented
2512 | exactly. However, if the abstract value is too large, the overflow and
2513 | inexact exceptions are raised and an infinity or maximal finite value is
2514 | returned. If the abstract value is too small, the input value is rounded to
2515 | a subnormal number, and the underflow and inexact exceptions are raised if
2516 | the abstract input cannot be represented exactly as a subnormal extended
2517 | double-precision floating-point number.
2518 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
2519 | number of bits as single or double precision, respectively. Otherwise, the
2520 | result is rounded to the full precision of the extended double-precision
2521 | format.
2522 | The input significand must be normalized or smaller. If the input
2523 | significand is not normalized, `zExp' must be 0; in that case, the result
2524 | returned is a subnormal number, and it must not require rounding. The
2525 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
2526 | Floating-Point Arithmetic.
2527 *----------------------------------------------------------------------------*/
2529 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
2530 int32_t zExp, uint64_t zSig0, uint64_t zSig1,
2531 float_status *status)
2533 int8_t roundingMode;
2534 flag roundNearestEven, increment, isTiny;
2535 int64_t roundIncrement, roundMask, roundBits;
2537 roundingMode = status->float_rounding_mode;
2538 roundNearestEven = ( roundingMode == float_round_nearest_even );
2539 if ( roundingPrecision == 80 ) goto precision80;
2540 if ( roundingPrecision == 64 ) {
2541 roundIncrement = LIT64( 0x0000000000000400 );
2542 roundMask = LIT64( 0x00000000000007FF );
2544 else if ( roundingPrecision == 32 ) {
2545 roundIncrement = LIT64( 0x0000008000000000 );
2546 roundMask = LIT64( 0x000000FFFFFFFFFF );
2548 else {
2549 goto precision80;
2551 zSig0 |= ( zSig1 != 0 );
2552 switch (roundingMode) {
2553 case float_round_nearest_even:
2554 case float_round_ties_away:
2555 break;
2556 case float_round_to_zero:
2557 roundIncrement = 0;
2558 break;
2559 case float_round_up:
2560 roundIncrement = zSign ? 0 : roundMask;
2561 break;
2562 case float_round_down:
2563 roundIncrement = zSign ? roundMask : 0;
2564 break;
2565 default:
2566 abort();
2568 roundBits = zSig0 & roundMask;
2569 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
2570 if ( ( 0x7FFE < zExp )
2571 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
2573 goto overflow;
2575 if ( zExp <= 0 ) {
2576 if (status->flush_to_zero) {
2577 float_raise(float_flag_output_denormal, status);
2578 return packFloatx80(zSign, 0, 0);
2580 isTiny =
2581 (status->float_detect_tininess
2582 == float_tininess_before_rounding)
2583 || ( zExp < 0 )
2584 || ( zSig0 <= zSig0 + roundIncrement );
2585 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
2586 zExp = 0;
2587 roundBits = zSig0 & roundMask;
2588 if (isTiny && roundBits) {
2589 float_raise(float_flag_underflow, status);
2591 if (roundBits) {
2592 status->float_exception_flags |= float_flag_inexact;
2594 zSig0 += roundIncrement;
2595 if ( (int64_t) zSig0 < 0 ) zExp = 1;
2596 roundIncrement = roundMask + 1;
2597 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
2598 roundMask |= roundIncrement;
2600 zSig0 &= ~ roundMask;
2601 return packFloatx80( zSign, zExp, zSig0 );
2604 if (roundBits) {
2605 status->float_exception_flags |= float_flag_inexact;
2607 zSig0 += roundIncrement;
2608 if ( zSig0 < roundIncrement ) {
2609 ++zExp;
2610 zSig0 = LIT64( 0x8000000000000000 );
2612 roundIncrement = roundMask + 1;
2613 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
2614 roundMask |= roundIncrement;
2616 zSig0 &= ~ roundMask;
2617 if ( zSig0 == 0 ) zExp = 0;
2618 return packFloatx80( zSign, zExp, zSig0 );
2619 precision80:
2620 switch (roundingMode) {
2621 case float_round_nearest_even:
2622 case float_round_ties_away:
2623 increment = ((int64_t)zSig1 < 0);
2624 break;
2625 case float_round_to_zero:
2626 increment = 0;
2627 break;
2628 case float_round_up:
2629 increment = !zSign && zSig1;
2630 break;
2631 case float_round_down:
2632 increment = zSign && zSig1;
2633 break;
2634 default:
2635 abort();
2637 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
2638 if ( ( 0x7FFE < zExp )
2639 || ( ( zExp == 0x7FFE )
2640 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
2641 && increment
2644 roundMask = 0;
2645 overflow:
2646 float_raise(float_flag_overflow | float_flag_inexact, status);
2647 if ( ( roundingMode == float_round_to_zero )
2648 || ( zSign && ( roundingMode == float_round_up ) )
2649 || ( ! zSign && ( roundingMode == float_round_down ) )
2651 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
2653 return packFloatx80(zSign,
2654 floatx80_infinity_high,
2655 floatx80_infinity_low);
2657 if ( zExp <= 0 ) {
2658 isTiny =
2659 (status->float_detect_tininess
2660 == float_tininess_before_rounding)
2661 || ( zExp < 0 )
2662 || ! increment
2663 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
2664 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
2665 zExp = 0;
2666 if (isTiny && zSig1) {
2667 float_raise(float_flag_underflow, status);
2669 if (zSig1) {
2670 status->float_exception_flags |= float_flag_inexact;
2672 switch (roundingMode) {
2673 case float_round_nearest_even:
2674 case float_round_ties_away:
2675 increment = ((int64_t)zSig1 < 0);
2676 break;
2677 case float_round_to_zero:
2678 increment = 0;
2679 break;
2680 case float_round_up:
2681 increment = !zSign && zSig1;
2682 break;
2683 case float_round_down:
2684 increment = zSign && zSig1;
2685 break;
2686 default:
2687 abort();
2689 if ( increment ) {
2690 ++zSig0;
2691 zSig0 &=
2692 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
2693 if ( (int64_t) zSig0 < 0 ) zExp = 1;
2695 return packFloatx80( zSign, zExp, zSig0 );
2698 if (zSig1) {
2699 status->float_exception_flags |= float_flag_inexact;
2701 if ( increment ) {
2702 ++zSig0;
2703 if ( zSig0 == 0 ) {
2704 ++zExp;
2705 zSig0 = LIT64( 0x8000000000000000 );
2707 else {
2708 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
2711 else {
2712 if ( zSig0 == 0 ) zExp = 0;
2714 return packFloatx80( zSign, zExp, zSig0 );
2718 /*----------------------------------------------------------------------------
2719 | Takes an abstract floating-point value having sign `zSign', exponent
2720 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
2721 | and returns the proper extended double-precision floating-point value
2722 | corresponding to the abstract input. This routine is just like
2723 | `roundAndPackFloatx80' except that the input significand does not have to be
2724 | normalized.
2725 *----------------------------------------------------------------------------*/
2727 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
2728 flag zSign, int32_t zExp,
2729 uint64_t zSig0, uint64_t zSig1,
2730 float_status *status)
2732 int8_t shiftCount;
2734 if ( zSig0 == 0 ) {
2735 zSig0 = zSig1;
2736 zSig1 = 0;
2737 zExp -= 64;
2739 shiftCount = countLeadingZeros64( zSig0 );
2740 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
2741 zExp -= shiftCount;
2742 return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
2743 zSig0, zSig1, status);
2747 /*----------------------------------------------------------------------------
2748 | Returns the least-significant 64 fraction bits of the quadruple-precision
2749 | floating-point value `a'.
2750 *----------------------------------------------------------------------------*/
2752 static inline uint64_t extractFloat128Frac1( float128 a )
2755 return a.low;
2759 /*----------------------------------------------------------------------------
2760 | Returns the most-significant 48 fraction bits of the quadruple-precision
2761 | floating-point value `a'.
2762 *----------------------------------------------------------------------------*/
2764 static inline uint64_t extractFloat128Frac0( float128 a )
2767 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
2771 /*----------------------------------------------------------------------------
2772 | Returns the exponent bits of the quadruple-precision floating-point value
2773 | `a'.
2774 *----------------------------------------------------------------------------*/
2776 static inline int32_t extractFloat128Exp( float128 a )
2779 return ( a.high>>48 ) & 0x7FFF;
2783 /*----------------------------------------------------------------------------
2784 | Returns the sign bit of the quadruple-precision floating-point value `a'.
2785 *----------------------------------------------------------------------------*/
2787 static inline flag extractFloat128Sign( float128 a )
2790 return a.high>>63;
2794 /*----------------------------------------------------------------------------
2795 | Normalizes the subnormal quadruple-precision floating-point value
2796 | represented by the denormalized significand formed by the concatenation of
2797 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
2798 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
2799 | significand are stored at the location pointed to by `zSig0Ptr', and the
2800 | least significant 64 bits of the normalized significand are stored at the
2801 | location pointed to by `zSig1Ptr'.
2802 *----------------------------------------------------------------------------*/
2804 static void
2805 normalizeFloat128Subnormal(
2806 uint64_t aSig0,
2807 uint64_t aSig1,
2808 int32_t *zExpPtr,
2809 uint64_t *zSig0Ptr,
2810 uint64_t *zSig1Ptr
2813 int8_t shiftCount;
2815 if ( aSig0 == 0 ) {
2816 shiftCount = countLeadingZeros64( aSig1 ) - 15;
2817 if ( shiftCount < 0 ) {
2818 *zSig0Ptr = aSig1>>( - shiftCount );
2819 *zSig1Ptr = aSig1<<( shiftCount & 63 );
2821 else {
2822 *zSig0Ptr = aSig1<<shiftCount;
2823 *zSig1Ptr = 0;
2825 *zExpPtr = - shiftCount - 63;
2827 else {
2828 shiftCount = countLeadingZeros64( aSig0 ) - 15;
2829 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
2830 *zExpPtr = 1 - shiftCount;
2835 /*----------------------------------------------------------------------------
2836 | Packs the sign `zSign', the exponent `zExp', and the significand formed
2837 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
2838 | floating-point value, returning the result. After being shifted into the
2839 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
2840 | added together to form the most significant 32 bits of the result. This
2841 | means that any integer portion of `zSig0' will be added into the exponent.
2842 | Since a properly normalized significand will have an integer portion equal
2843 | to 1, the `zExp' input should be 1 less than the desired result exponent
2844 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
2845 | significand.
2846 *----------------------------------------------------------------------------*/
2848 static inline float128
2849 packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
2851 float128 z;
2853 z.low = zSig1;
2854 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
2855 return z;
2859 /*----------------------------------------------------------------------------
2860 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2861 | and extended significand formed by the concatenation of `zSig0', `zSig1',
2862 | and `zSig2', and returns the proper quadruple-precision floating-point value
2863 | corresponding to the abstract input. Ordinarily, the abstract value is
2864 | simply rounded and packed into the quadruple-precision format, with the
2865 | inexact exception raised if the abstract input cannot be represented
2866 | exactly. However, if the abstract value is too large, the overflow and
2867 | inexact exceptions are raised and an infinity or maximal finite value is
2868 | returned. If the abstract value is too small, the input value is rounded to
2869 | a subnormal number, and the underflow and inexact exceptions are raised if
2870 | the abstract input cannot be represented exactly as a subnormal quadruple-
2871 | precision floating-point number.
2872 | The input significand must be normalized or smaller. If the input
2873 | significand is not normalized, `zExp' must be 0; in that case, the result
2874 | returned is a subnormal number, and it must not require rounding. In the
2875 | usual case that the input significand is normalized, `zExp' must be 1 less
2876 | than the ``true'' floating-point exponent. The handling of underflow and
2877 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2878 *----------------------------------------------------------------------------*/
2880 static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
2881 uint64_t zSig0, uint64_t zSig1,
2882 uint64_t zSig2, float_status *status)
2884 int8_t roundingMode;
2885 flag roundNearestEven, increment, isTiny;
2887 roundingMode = status->float_rounding_mode;
2888 roundNearestEven = ( roundingMode == float_round_nearest_even );
2889 switch (roundingMode) {
2890 case float_round_nearest_even:
2891 case float_round_ties_away:
2892 increment = ((int64_t)zSig2 < 0);
2893 break;
2894 case float_round_to_zero:
2895 increment = 0;
2896 break;
2897 case float_round_up:
2898 increment = !zSign && zSig2;
2899 break;
2900 case float_round_down:
2901 increment = zSign && zSig2;
2902 break;
2903 case float_round_to_odd:
2904 increment = !(zSig1 & 0x1) && zSig2;
2905 break;
2906 default:
2907 abort();
2909 if ( 0x7FFD <= (uint32_t) zExp ) {
2910 if ( ( 0x7FFD < zExp )
2911 || ( ( zExp == 0x7FFD )
2912 && eq128(
2913 LIT64( 0x0001FFFFFFFFFFFF ),
2914 LIT64( 0xFFFFFFFFFFFFFFFF ),
2915 zSig0,
2916 zSig1
2918 && increment
2921 float_raise(float_flag_overflow | float_flag_inexact, status);
2922 if ( ( roundingMode == float_round_to_zero )
2923 || ( zSign && ( roundingMode == float_round_up ) )
2924 || ( ! zSign && ( roundingMode == float_round_down ) )
2925 || (roundingMode == float_round_to_odd)
2927 return
2928 packFloat128(
2929 zSign,
2930 0x7FFE,
2931 LIT64( 0x0000FFFFFFFFFFFF ),
2932 LIT64( 0xFFFFFFFFFFFFFFFF )
2935 return packFloat128( zSign, 0x7FFF, 0, 0 );
2937 if ( zExp < 0 ) {
2938 if (status->flush_to_zero) {
2939 float_raise(float_flag_output_denormal, status);
2940 return packFloat128(zSign, 0, 0, 0);
2942 isTiny =
2943 (status->float_detect_tininess
2944 == float_tininess_before_rounding)
2945 || ( zExp < -1 )
2946 || ! increment
2947 || lt128(
2948 zSig0,
2949 zSig1,
2950 LIT64( 0x0001FFFFFFFFFFFF ),
2951 LIT64( 0xFFFFFFFFFFFFFFFF )
2953 shift128ExtraRightJamming(
2954 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
2955 zExp = 0;
2956 if (isTiny && zSig2) {
2957 float_raise(float_flag_underflow, status);
2959 switch (roundingMode) {
2960 case float_round_nearest_even:
2961 case float_round_ties_away:
2962 increment = ((int64_t)zSig2 < 0);
2963 break;
2964 case float_round_to_zero:
2965 increment = 0;
2966 break;
2967 case float_round_up:
2968 increment = !zSign && zSig2;
2969 break;
2970 case float_round_down:
2971 increment = zSign && zSig2;
2972 break;
2973 case float_round_to_odd:
2974 increment = !(zSig1 & 0x1) && zSig2;
2975 break;
2976 default:
2977 abort();
2981 if (zSig2) {
2982 status->float_exception_flags |= float_flag_inexact;
2984 if ( increment ) {
2985 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
2986 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
2988 else {
2989 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
2991 return packFloat128( zSign, zExp, zSig0, zSig1 );
2995 /*----------------------------------------------------------------------------
2996 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2997 | and significand formed by the concatenation of `zSig0' and `zSig1', and
2998 | returns the proper quadruple-precision floating-point value corresponding
2999 | to the abstract input. This routine is just like `roundAndPackFloat128'
3000 | except that the input significand has fewer bits and does not have to be
3001 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
3002 | point exponent.
3003 *----------------------------------------------------------------------------*/
3005 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
3006 uint64_t zSig0, uint64_t zSig1,
3007 float_status *status)
3009 int8_t shiftCount;
3010 uint64_t zSig2;
3012 if ( zSig0 == 0 ) {
3013 zSig0 = zSig1;
3014 zSig1 = 0;
3015 zExp -= 64;
3017 shiftCount = countLeadingZeros64( zSig0 ) - 15;
3018 if ( 0 <= shiftCount ) {
3019 zSig2 = 0;
3020 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3022 else {
3023 shift128ExtraRightJamming(
3024 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
3026 zExp -= shiftCount;
3027 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
3032 /*----------------------------------------------------------------------------
3033 | Returns the result of converting the 32-bit two's complement integer `a'
3034 | to the extended double-precision floating-point format. The conversion
3035 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3036 | Arithmetic.
3037 *----------------------------------------------------------------------------*/
3039 floatx80 int32_to_floatx80(int32_t a, float_status *status)
3041 flag zSign;
3042 uint32_t absA;
3043 int8_t shiftCount;
3044 uint64_t zSig;
3046 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3047 zSign = ( a < 0 );
3048 absA = zSign ? - a : a;
3049 shiftCount = countLeadingZeros32( absA ) + 32;
3050 zSig = absA;
3051 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
3055 /*----------------------------------------------------------------------------
3056 | Returns the result of converting the 32-bit two's complement integer `a' to
3057 | the quadruple-precision floating-point format. The conversion is performed
3058 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3059 *----------------------------------------------------------------------------*/
3061 float128 int32_to_float128(int32_t a, float_status *status)
3063 flag zSign;
3064 uint32_t absA;
3065 int8_t shiftCount;
3066 uint64_t zSig0;
3068 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3069 zSign = ( a < 0 );
3070 absA = zSign ? - a : a;
3071 shiftCount = countLeadingZeros32( absA ) + 17;
3072 zSig0 = absA;
3073 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
3077 /*----------------------------------------------------------------------------
3078 | Returns the result of converting the 64-bit two's complement integer `a'
3079 | to the extended double-precision floating-point format. The conversion
3080 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3081 | Arithmetic.
3082 *----------------------------------------------------------------------------*/
3084 floatx80 int64_to_floatx80(int64_t a, float_status *status)
3086 flag zSign;
3087 uint64_t absA;
3088 int8_t shiftCount;
3090 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3091 zSign = ( a < 0 );
3092 absA = zSign ? - a : a;
3093 shiftCount = countLeadingZeros64( absA );
3094 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
3098 /*----------------------------------------------------------------------------
3099 | Returns the result of converting the 64-bit two's complement integer `a' to
3100 | the quadruple-precision floating-point format. The conversion is performed
3101 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3102 *----------------------------------------------------------------------------*/
3104 float128 int64_to_float128(int64_t a, float_status *status)
3106 flag zSign;
3107 uint64_t absA;
3108 int8_t shiftCount;
3109 int32_t zExp;
3110 uint64_t zSig0, zSig1;
3112 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3113 zSign = ( a < 0 );
3114 absA = zSign ? - a : a;
3115 shiftCount = countLeadingZeros64( absA ) + 49;
3116 zExp = 0x406E - shiftCount;
3117 if ( 64 <= shiftCount ) {
3118 zSig1 = 0;
3119 zSig0 = absA;
3120 shiftCount -= 64;
3122 else {
3123 zSig1 = absA;
3124 zSig0 = 0;
3126 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3127 return packFloat128( zSign, zExp, zSig0, zSig1 );
3131 /*----------------------------------------------------------------------------
3132 | Returns the result of converting the 64-bit unsigned integer `a'
3133 | to the quadruple-precision floating-point format. The conversion is performed
3134 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3135 *----------------------------------------------------------------------------*/
3137 float128 uint64_to_float128(uint64_t a, float_status *status)
3139 if (a == 0) {
3140 return float128_zero;
3142 return normalizeRoundAndPackFloat128(0, 0x406E, a, 0, status);
3148 /*----------------------------------------------------------------------------
3149 | Returns the result of converting the single-precision floating-point value
3150 | `a' to the double-precision floating-point format. The conversion is
3151 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3152 | Arithmetic.
3153 *----------------------------------------------------------------------------*/
3155 float64 float32_to_float64(float32 a, float_status *status)
3157 flag aSign;
3158 int aExp;
3159 uint32_t aSig;
3160 a = float32_squash_input_denormal(a, status);
3162 aSig = extractFloat32Frac( a );
3163 aExp = extractFloat32Exp( a );
3164 aSign = extractFloat32Sign( a );
3165 if ( aExp == 0xFF ) {
3166 if (aSig) {
3167 return commonNaNToFloat64(float32ToCommonNaN(a, status), status);
3169 return packFloat64( aSign, 0x7FF, 0 );
3171 if ( aExp == 0 ) {
3172 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
3173 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3174 --aExp;
3176 return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
3180 /*----------------------------------------------------------------------------
3181 | Returns the result of converting the single-precision floating-point value
3182 | `a' to the extended double-precision floating-point format. The conversion
3183 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3184 | Arithmetic.
3185 *----------------------------------------------------------------------------*/
3187 floatx80 float32_to_floatx80(float32 a, float_status *status)
3189 flag aSign;
3190 int aExp;
3191 uint32_t aSig;
3193 a = float32_squash_input_denormal(a, status);
3194 aSig = extractFloat32Frac( a );
3195 aExp = extractFloat32Exp( a );
3196 aSign = extractFloat32Sign( a );
3197 if ( aExp == 0xFF ) {
3198 if (aSig) {
3199 return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
3201 return packFloatx80(aSign,
3202 floatx80_infinity_high,
3203 floatx80_infinity_low);
3205 if ( aExp == 0 ) {
3206 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3207 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3209 aSig |= 0x00800000;
3210 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
3214 /*----------------------------------------------------------------------------
3215 | Returns the result of converting the single-precision floating-point value
3216 | `a' to the double-precision floating-point format. The conversion is
3217 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3218 | Arithmetic.
3219 *----------------------------------------------------------------------------*/
3221 float128 float32_to_float128(float32 a, float_status *status)
3223 flag aSign;
3224 int aExp;
3225 uint32_t aSig;
3227 a = float32_squash_input_denormal(a, status);
3228 aSig = extractFloat32Frac( a );
3229 aExp = extractFloat32Exp( a );
3230 aSign = extractFloat32Sign( a );
3231 if ( aExp == 0xFF ) {
3232 if (aSig) {
3233 return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
3235 return packFloat128( aSign, 0x7FFF, 0, 0 );
3237 if ( aExp == 0 ) {
3238 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3239 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3240 --aExp;
3242 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
3246 /*----------------------------------------------------------------------------
3247 | Returns the remainder of the single-precision floating-point value `a'
3248 | with respect to the corresponding value `b'. The operation is performed
3249 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3250 *----------------------------------------------------------------------------*/
3252 float32 float32_rem(float32 a, float32 b, float_status *status)
3254 flag aSign, zSign;
3255 int aExp, bExp, expDiff;
3256 uint32_t aSig, bSig;
3257 uint32_t q;
3258 uint64_t aSig64, bSig64, q64;
3259 uint32_t alternateASig;
3260 int32_t sigMean;
3261 a = float32_squash_input_denormal(a, status);
3262 b = float32_squash_input_denormal(b, status);
3264 aSig = extractFloat32Frac( a );
3265 aExp = extractFloat32Exp( a );
3266 aSign = extractFloat32Sign( a );
3267 bSig = extractFloat32Frac( b );
3268 bExp = extractFloat32Exp( b );
3269 if ( aExp == 0xFF ) {
3270 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
3271 return propagateFloat32NaN(a, b, status);
3273 float_raise(float_flag_invalid, status);
3274 return float32_default_nan(status);
3276 if ( bExp == 0xFF ) {
3277 if (bSig) {
3278 return propagateFloat32NaN(a, b, status);
3280 return a;
3282 if ( bExp == 0 ) {
3283 if ( bSig == 0 ) {
3284 float_raise(float_flag_invalid, status);
3285 return float32_default_nan(status);
3287 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
3289 if ( aExp == 0 ) {
3290 if ( aSig == 0 ) return a;
3291 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3293 expDiff = aExp - bExp;
3294 aSig |= 0x00800000;
3295 bSig |= 0x00800000;
3296 if ( expDiff < 32 ) {
3297 aSig <<= 8;
3298 bSig <<= 8;
3299 if ( expDiff < 0 ) {
3300 if ( expDiff < -1 ) return a;
3301 aSig >>= 1;
3303 q = ( bSig <= aSig );
3304 if ( q ) aSig -= bSig;
3305 if ( 0 < expDiff ) {
3306 q = ( ( (uint64_t) aSig )<<32 ) / bSig;
3307 q >>= 32 - expDiff;
3308 bSig >>= 2;
3309 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3311 else {
3312 aSig >>= 2;
3313 bSig >>= 2;
3316 else {
3317 if ( bSig <= aSig ) aSig -= bSig;
3318 aSig64 = ( (uint64_t) aSig )<<40;
3319 bSig64 = ( (uint64_t) bSig )<<40;
3320 expDiff -= 64;
3321 while ( 0 < expDiff ) {
3322 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3323 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3324 aSig64 = - ( ( bSig * q64 )<<38 );
3325 expDiff -= 62;
3327 expDiff += 64;
3328 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3329 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3330 q = q64>>( 64 - expDiff );
3331 bSig <<= 6;
3332 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
3334 do {
3335 alternateASig = aSig;
3336 ++q;
3337 aSig -= bSig;
3338 } while ( 0 <= (int32_t) aSig );
3339 sigMean = aSig + alternateASig;
3340 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3341 aSig = alternateASig;
3343 zSign = ( (int32_t) aSig < 0 );
3344 if ( zSign ) aSig = - aSig;
3345 return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
3350 /*----------------------------------------------------------------------------
3351 | Returns the binary exponential of the single-precision floating-point value
3352 | `a'. The operation is performed according to the IEC/IEEE Standard for
3353 | Binary Floating-Point Arithmetic.
3355 | Uses the following identities:
3357 | 1. -------------------------------------------------------------------------
3358 | x x*ln(2)
3359 | 2 = e
3361 | 2. -------------------------------------------------------------------------
3362 | 2 3 4 5 n
3363 | x x x x x x x
3364 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
3365 | 1! 2! 3! 4! 5! n!
3366 *----------------------------------------------------------------------------*/
3368 static const float64 float32_exp2_coefficients[15] =
3370 const_float64( 0x3ff0000000000000ll ), /* 1 */
3371 const_float64( 0x3fe0000000000000ll ), /* 2 */
3372 const_float64( 0x3fc5555555555555ll ), /* 3 */
3373 const_float64( 0x3fa5555555555555ll ), /* 4 */
3374 const_float64( 0x3f81111111111111ll ), /* 5 */
3375 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */
3376 const_float64( 0x3f2a01a01a01a01all ), /* 7 */
3377 const_float64( 0x3efa01a01a01a01all ), /* 8 */
3378 const_float64( 0x3ec71de3a556c734ll ), /* 9 */
3379 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
3380 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
3381 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
3382 const_float64( 0x3de6124613a86d09ll ), /* 13 */
3383 const_float64( 0x3da93974a8c07c9dll ), /* 14 */
3384 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
3387 float32 float32_exp2(float32 a, float_status *status)
3389 flag aSign;
3390 int aExp;
3391 uint32_t aSig;
3392 float64 r, x, xn;
3393 int i;
3394 a = float32_squash_input_denormal(a, status);
3396 aSig = extractFloat32Frac( a );
3397 aExp = extractFloat32Exp( a );
3398 aSign = extractFloat32Sign( a );
3400 if ( aExp == 0xFF) {
3401 if (aSig) {
3402 return propagateFloat32NaN(a, float32_zero, status);
3404 return (aSign) ? float32_zero : a;
3406 if (aExp == 0) {
3407 if (aSig == 0) return float32_one;
3410 float_raise(float_flag_inexact, status);
3412 /* ******************************* */
3413 /* using float64 for approximation */
3414 /* ******************************* */
3415 x = float32_to_float64(a, status);
3416 x = float64_mul(x, float64_ln2, status);
3418 xn = x;
3419 r = float64_one;
3420 for (i = 0 ; i < 15 ; i++) {
3421 float64 f;
3423 f = float64_mul(xn, float32_exp2_coefficients[i], status);
3424 r = float64_add(r, f, status);
3426 xn = float64_mul(xn, x, status);
3429 return float64_to_float32(r, status);
3432 /*----------------------------------------------------------------------------
3433 | Returns the binary log of the single-precision floating-point value `a'.
3434 | The operation is performed according to the IEC/IEEE Standard for Binary
3435 | Floating-Point Arithmetic.
3436 *----------------------------------------------------------------------------*/
3437 float32 float32_log2(float32 a, float_status *status)
3439 flag aSign, zSign;
3440 int aExp;
3441 uint32_t aSig, zSig, i;
3443 a = float32_squash_input_denormal(a, status);
3444 aSig = extractFloat32Frac( a );
3445 aExp = extractFloat32Exp( a );
3446 aSign = extractFloat32Sign( a );
3448 if ( aExp == 0 ) {
3449 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
3450 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3452 if ( aSign ) {
3453 float_raise(float_flag_invalid, status);
3454 return float32_default_nan(status);
3456 if ( aExp == 0xFF ) {
3457 if (aSig) {
3458 return propagateFloat32NaN(a, float32_zero, status);
3460 return a;
3463 aExp -= 0x7F;
3464 aSig |= 0x00800000;
3465 zSign = aExp < 0;
3466 zSig = aExp << 23;
3468 for (i = 1 << 22; i > 0; i >>= 1) {
3469 aSig = ( (uint64_t)aSig * aSig ) >> 23;
3470 if ( aSig & 0x01000000 ) {
3471 aSig >>= 1;
3472 zSig |= i;
3476 if ( zSign )
3477 zSig = -zSig;
3479 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
3482 /*----------------------------------------------------------------------------
3483 | Returns 1 if the single-precision floating-point value `a' is equal to
3484 | the corresponding value `b', and 0 otherwise. The invalid exception is
3485 | raised if either operand is a NaN. Otherwise, the comparison is performed
3486 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3487 *----------------------------------------------------------------------------*/
3489 int float32_eq(float32 a, float32 b, float_status *status)
3491 uint32_t av, bv;
3492 a = float32_squash_input_denormal(a, status);
3493 b = float32_squash_input_denormal(b, status);
3495 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3496 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3498 float_raise(float_flag_invalid, status);
3499 return 0;
3501 av = float32_val(a);
3502 bv = float32_val(b);
3503 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3506 /*----------------------------------------------------------------------------
3507 | Returns 1 if the single-precision floating-point value `a' is less than
3508 | or equal to the corresponding value `b', and 0 otherwise. The invalid
3509 | exception is raised if either operand is a NaN. The comparison is performed
3510 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3511 *----------------------------------------------------------------------------*/
3513 int float32_le(float32 a, float32 b, float_status *status)
3515 flag aSign, bSign;
3516 uint32_t av, bv;
3517 a = float32_squash_input_denormal(a, status);
3518 b = float32_squash_input_denormal(b, status);
3520 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3521 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3523 float_raise(float_flag_invalid, status);
3524 return 0;
3526 aSign = extractFloat32Sign( a );
3527 bSign = extractFloat32Sign( b );
3528 av = float32_val(a);
3529 bv = float32_val(b);
3530 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3531 return ( av == bv ) || ( aSign ^ ( av < bv ) );
3535 /*----------------------------------------------------------------------------
3536 | Returns 1 if the single-precision floating-point value `a' is less than
3537 | the corresponding value `b', and 0 otherwise. The invalid exception is
3538 | raised if either operand is a NaN. The comparison is performed according
3539 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3540 *----------------------------------------------------------------------------*/
3542 int float32_lt(float32 a, float32 b, float_status *status)
3544 flag aSign, bSign;
3545 uint32_t av, bv;
3546 a = float32_squash_input_denormal(a, status);
3547 b = float32_squash_input_denormal(b, status);
3549 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3550 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3552 float_raise(float_flag_invalid, status);
3553 return 0;
3555 aSign = extractFloat32Sign( a );
3556 bSign = extractFloat32Sign( b );
3557 av = float32_val(a);
3558 bv = float32_val(b);
3559 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3560 return ( av != bv ) && ( aSign ^ ( av < bv ) );
3564 /*----------------------------------------------------------------------------
3565 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3566 | be compared, and 0 otherwise. The invalid exception is raised if either
3567 | operand is a NaN. The comparison is performed according to the IEC/IEEE
3568 | Standard for Binary Floating-Point Arithmetic.
3569 *----------------------------------------------------------------------------*/
3571 int float32_unordered(float32 a, float32 b, float_status *status)
3573 a = float32_squash_input_denormal(a, status);
3574 b = float32_squash_input_denormal(b, status);
3576 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3577 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3579 float_raise(float_flag_invalid, status);
3580 return 1;
3582 return 0;
3585 /*----------------------------------------------------------------------------
3586 | Returns 1 if the single-precision floating-point value `a' is equal to
3587 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3588 | exception. The comparison is performed according to the IEC/IEEE Standard
3589 | for Binary Floating-Point Arithmetic.
3590 *----------------------------------------------------------------------------*/
3592 int float32_eq_quiet(float32 a, float32 b, float_status *status)
3594 a = float32_squash_input_denormal(a, status);
3595 b = float32_squash_input_denormal(b, status);
3597 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3598 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3600 if (float32_is_signaling_nan(a, status)
3601 || float32_is_signaling_nan(b, status)) {
3602 float_raise(float_flag_invalid, status);
3604 return 0;
3606 return ( float32_val(a) == float32_val(b) ) ||
3607 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
3610 /*----------------------------------------------------------------------------
3611 | Returns 1 if the single-precision floating-point value `a' is less than or
3612 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3613 | cause an exception. Otherwise, the comparison is performed according to the
3614 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3615 *----------------------------------------------------------------------------*/
3617 int float32_le_quiet(float32 a, float32 b, float_status *status)
3619 flag aSign, bSign;
3620 uint32_t av, bv;
3621 a = float32_squash_input_denormal(a, status);
3622 b = float32_squash_input_denormal(b, status);
3624 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3625 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3627 if (float32_is_signaling_nan(a, status)
3628 || float32_is_signaling_nan(b, status)) {
3629 float_raise(float_flag_invalid, status);
3631 return 0;
3633 aSign = extractFloat32Sign( a );
3634 bSign = extractFloat32Sign( b );
3635 av = float32_val(a);
3636 bv = float32_val(b);
3637 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3638 return ( av == bv ) || ( aSign ^ ( av < bv ) );
3642 /*----------------------------------------------------------------------------
3643 | Returns 1 if the single-precision floating-point value `a' is less than
3644 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3645 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
3646 | Standard for Binary Floating-Point Arithmetic.
3647 *----------------------------------------------------------------------------*/
3649 int float32_lt_quiet(float32 a, float32 b, float_status *status)
3651 flag aSign, bSign;
3652 uint32_t av, bv;
3653 a = float32_squash_input_denormal(a, status);
3654 b = float32_squash_input_denormal(b, status);
3656 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3657 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3659 if (float32_is_signaling_nan(a, status)
3660 || float32_is_signaling_nan(b, status)) {
3661 float_raise(float_flag_invalid, status);
3663 return 0;
3665 aSign = extractFloat32Sign( a );
3666 bSign = extractFloat32Sign( b );
3667 av = float32_val(a);
3668 bv = float32_val(b);
3669 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3670 return ( av != bv ) && ( aSign ^ ( av < bv ) );
3674 /*----------------------------------------------------------------------------
3675 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3676 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
3677 | comparison is performed according to the IEC/IEEE Standard for Binary
3678 | Floating-Point Arithmetic.
3679 *----------------------------------------------------------------------------*/
3681 int float32_unordered_quiet(float32 a, float32 b, float_status *status)
3683 a = float32_squash_input_denormal(a, status);
3684 b = float32_squash_input_denormal(b, status);
3686 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3687 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3689 if (float32_is_signaling_nan(a, status)
3690 || float32_is_signaling_nan(b, status)) {
3691 float_raise(float_flag_invalid, status);
3693 return 1;
3695 return 0;
3699 /*----------------------------------------------------------------------------
3700 | Returns the result of converting the double-precision floating-point value
3701 | `a' to the single-precision floating-point format. The conversion is
3702 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3703 | Arithmetic.
3704 *----------------------------------------------------------------------------*/
3706 float32 float64_to_float32(float64 a, float_status *status)
3708 flag aSign;
3709 int aExp;
3710 uint64_t aSig;
3711 uint32_t zSig;
3712 a = float64_squash_input_denormal(a, status);
3714 aSig = extractFloat64Frac( a );
3715 aExp = extractFloat64Exp( a );
3716 aSign = extractFloat64Sign( a );
3717 if ( aExp == 0x7FF ) {
3718 if (aSig) {
3719 return commonNaNToFloat32(float64ToCommonNaN(a, status), status);
3721 return packFloat32( aSign, 0xFF, 0 );
3723 shift64RightJamming( aSig, 22, &aSig );
3724 zSig = aSig;
3725 if ( aExp || zSig ) {
3726 zSig |= 0x40000000;
3727 aExp -= 0x381;
3729 return roundAndPackFloat32(aSign, aExp, zSig, status);
3734 /*----------------------------------------------------------------------------
3735 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3736 | half-precision floating-point value, returning the result. After being
3737 | shifted into the proper positions, the three fields are simply added
3738 | together to form the result. This means that any integer portion of `zSig'
3739 | will be added into the exponent. Since a properly normalized significand
3740 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3741 | than the desired result exponent whenever `zSig' is a complete, normalized
3742 | significand.
3743 *----------------------------------------------------------------------------*/
3744 static float16 packFloat16(flag zSign, int zExp, uint16_t zSig)
3746 return make_float16(
3747 (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
3750 /*----------------------------------------------------------------------------
3751 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3752 | and significand `zSig', and returns the proper half-precision floating-
3753 | point value corresponding to the abstract input. Ordinarily, the abstract
3754 | value is simply rounded and packed into the half-precision format, with
3755 | the inexact exception raised if the abstract input cannot be represented
3756 | exactly. However, if the abstract value is too large, the overflow and
3757 | inexact exceptions are raised and an infinity or maximal finite value is
3758 | returned. If the abstract value is too small, the input value is rounded to
3759 | a subnormal number, and the underflow and inexact exceptions are raised if
3760 | the abstract input cannot be represented exactly as a subnormal half-
3761 | precision floating-point number.
3762 | The `ieee' flag indicates whether to use IEEE standard half precision, or
3763 | ARM-style "alternative representation", which omits the NaN and Inf
3764 | encodings in order to raise the maximum representable exponent by one.
3765 | The input significand `zSig' has its binary point between bits 22
3766 | and 23, which is 13 bits to the left of the usual location. This shifted
3767 | significand must be normalized or smaller. If `zSig' is not normalized,
3768 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3769 | and it must not require rounding. In the usual case that `zSig' is
3770 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3771 | Note the slightly odd position of the binary point in zSig compared with the
3772 | other roundAndPackFloat functions. This should probably be fixed if we
3773 | need to implement more float16 routines than just conversion.
3774 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3775 | Binary Floating-Point Arithmetic.
3776 *----------------------------------------------------------------------------*/
3778 static float16 roundAndPackFloat16(flag zSign, int zExp,
3779 uint32_t zSig, flag ieee,
3780 float_status *status)
3782 int maxexp = ieee ? 29 : 30;
3783 uint32_t mask;
3784 uint32_t increment;
3785 bool rounding_bumps_exp;
3786 bool is_tiny = false;
3788 /* Calculate the mask of bits of the mantissa which are not
3789 * representable in half-precision and will be lost.
3791 if (zExp < 1) {
3792 /* Will be denormal in halfprec */
3793 mask = 0x00ffffff;
3794 if (zExp >= -11) {
3795 mask >>= 11 + zExp;
3797 } else {
3798 /* Normal number in halfprec */
3799 mask = 0x00001fff;
3802 switch (status->float_rounding_mode) {
3803 case float_round_nearest_even:
3804 increment = (mask + 1) >> 1;
3805 if ((zSig & mask) == increment) {
3806 increment = zSig & (increment << 1);
3808 break;
3809 case float_round_ties_away:
3810 increment = (mask + 1) >> 1;
3811 break;
3812 case float_round_up:
3813 increment = zSign ? 0 : mask;
3814 break;
3815 case float_round_down:
3816 increment = zSign ? mask : 0;
3817 break;
3818 default: /* round_to_zero */
3819 increment = 0;
3820 break;
3823 rounding_bumps_exp = (zSig + increment >= 0x01000000);
3825 if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) {
3826 if (ieee) {
3827 float_raise(float_flag_overflow | float_flag_inexact, status);
3828 return packFloat16(zSign, 0x1f, 0);
3829 } else {
3830 float_raise(float_flag_invalid, status);
3831 return packFloat16(zSign, 0x1f, 0x3ff);
3835 if (zExp < 0) {
3836 /* Note that flush-to-zero does not affect half-precision results */
3837 is_tiny =
3838 (status->float_detect_tininess == float_tininess_before_rounding)
3839 || (zExp < -1)
3840 || (!rounding_bumps_exp);
3842 if (zSig & mask) {
3843 float_raise(float_flag_inexact, status);
3844 if (is_tiny) {
3845 float_raise(float_flag_underflow, status);
3849 zSig += increment;
3850 if (rounding_bumps_exp) {
3851 zSig >>= 1;
3852 zExp++;
3855 if (zExp < -10) {
3856 return packFloat16(zSign, 0, 0);
3858 if (zExp < 0) {
3859 zSig >>= -zExp;
3860 zExp = 0;
3862 return packFloat16(zSign, zExp, zSig >> 13);
3865 /*----------------------------------------------------------------------------
3866 | If `a' is denormal and we are in flush-to-zero mode then set the
3867 | input-denormal exception and return zero. Otherwise just return the value.
3868 *----------------------------------------------------------------------------*/
3869 float16 float16_squash_input_denormal(float16 a, float_status *status)
3871 if (status->flush_inputs_to_zero) {
3872 if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) {
3873 float_raise(float_flag_input_denormal, status);
3874 return make_float16(float16_val(a) & 0x8000);
3877 return a;
3880 static void normalizeFloat16Subnormal(uint32_t aSig, int *zExpPtr,
3881 uint32_t *zSigPtr)
3883 int8_t shiftCount = countLeadingZeros32(aSig) - 21;
3884 *zSigPtr = aSig << shiftCount;
3885 *zExpPtr = 1 - shiftCount;
3888 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
3889 The latter gains extra exponent range by omitting the NaN/Inf encodings. */
3891 float32 float16_to_float32(float16 a, flag ieee, float_status *status)
3893 flag aSign;
3894 int aExp;
3895 uint32_t aSig;
3897 aSign = extractFloat16Sign(a);
3898 aExp = extractFloat16Exp(a);
3899 aSig = extractFloat16Frac(a);
3901 if (aExp == 0x1f && ieee) {
3902 if (aSig) {
3903 return commonNaNToFloat32(float16ToCommonNaN(a, status), status);
3905 return packFloat32(aSign, 0xff, 0);
3907 if (aExp == 0) {
3908 if (aSig == 0) {
3909 return packFloat32(aSign, 0, 0);
3912 normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3913 aExp--;
3915 return packFloat32( aSign, aExp + 0x70, aSig << 13);
3918 float16 float32_to_float16(float32 a, flag ieee, float_status *status)
3920 flag aSign;
3921 int aExp;
3922 uint32_t aSig;
3924 a = float32_squash_input_denormal(a, status);
3926 aSig = extractFloat32Frac( a );
3927 aExp = extractFloat32Exp( a );
3928 aSign = extractFloat32Sign( a );
3929 if ( aExp == 0xFF ) {
3930 if (aSig) {
3931 /* Input is a NaN */
3932 if (!ieee) {
3933 float_raise(float_flag_invalid, status);
3934 return packFloat16(aSign, 0, 0);
3936 return commonNaNToFloat16(
3937 float32ToCommonNaN(a, status), status);
3939 /* Infinity */
3940 if (!ieee) {
3941 float_raise(float_flag_invalid, status);
3942 return packFloat16(aSign, 0x1f, 0x3ff);
3944 return packFloat16(aSign, 0x1f, 0);
3946 if (aExp == 0 && aSig == 0) {
3947 return packFloat16(aSign, 0, 0);
3949 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3950 * even if the input is denormal; however this is harmless because
3951 * the largest possible single-precision denormal is still smaller
3952 * than the smallest representable half-precision denormal, and so we
3953 * will end up ignoring aSig and returning via the "always return zero"
3954 * codepath.
3956 aSig |= 0x00800000;
3957 aExp -= 0x71;
3959 return roundAndPackFloat16(aSign, aExp, aSig, ieee, status);
3962 float64 float16_to_float64(float16 a, flag ieee, float_status *status)
3964 flag aSign;
3965 int aExp;
3966 uint32_t aSig;
3968 aSign = extractFloat16Sign(a);
3969 aExp = extractFloat16Exp(a);
3970 aSig = extractFloat16Frac(a);
3972 if (aExp == 0x1f && ieee) {
3973 if (aSig) {
3974 return commonNaNToFloat64(
3975 float16ToCommonNaN(a, status), status);
3977 return packFloat64(aSign, 0x7ff, 0);
3979 if (aExp == 0) {
3980 if (aSig == 0) {
3981 return packFloat64(aSign, 0, 0);
3984 normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3985 aExp--;
3987 return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42);
3990 float16 float64_to_float16(float64 a, flag ieee, float_status *status)
3992 flag aSign;
3993 int aExp;
3994 uint64_t aSig;
3995 uint32_t zSig;
3997 a = float64_squash_input_denormal(a, status);
3999 aSig = extractFloat64Frac(a);
4000 aExp = extractFloat64Exp(a);
4001 aSign = extractFloat64Sign(a);
4002 if (aExp == 0x7FF) {
4003 if (aSig) {
4004 /* Input is a NaN */
4005 if (!ieee) {
4006 float_raise(float_flag_invalid, status);
4007 return packFloat16(aSign, 0, 0);
4009 return commonNaNToFloat16(
4010 float64ToCommonNaN(a, status), status);
4012 /* Infinity */
4013 if (!ieee) {
4014 float_raise(float_flag_invalid, status);
4015 return packFloat16(aSign, 0x1f, 0x3ff);
4017 return packFloat16(aSign, 0x1f, 0);
4019 shift64RightJamming(aSig, 29, &aSig);
4020 zSig = aSig;
4021 if (aExp == 0 && zSig == 0) {
4022 return packFloat16(aSign, 0, 0);
4024 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
4025 * even if the input is denormal; however this is harmless because
4026 * the largest possible single-precision denormal is still smaller
4027 * than the smallest representable half-precision denormal, and so we
4028 * will end up ignoring aSig and returning via the "always return zero"
4029 * codepath.
4031 zSig |= 0x00800000;
4032 aExp -= 0x3F1;
4034 return roundAndPackFloat16(aSign, aExp, zSig, ieee, status);
4037 /*----------------------------------------------------------------------------
4038 | Returns the result of converting the double-precision floating-point value
4039 | `a' to the extended double-precision floating-point format. The conversion
4040 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4041 | Arithmetic.
4042 *----------------------------------------------------------------------------*/
4044 floatx80 float64_to_floatx80(float64 a, float_status *status)
4046 flag aSign;
4047 int aExp;
4048 uint64_t aSig;
4050 a = float64_squash_input_denormal(a, status);
4051 aSig = extractFloat64Frac( a );
4052 aExp = extractFloat64Exp( a );
4053 aSign = extractFloat64Sign( a );
4054 if ( aExp == 0x7FF ) {
4055 if (aSig) {
4056 return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
4058 return packFloatx80(aSign,
4059 floatx80_infinity_high,
4060 floatx80_infinity_low);
4062 if ( aExp == 0 ) {
4063 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4064 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4066 return
4067 packFloatx80(
4068 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
4072 /*----------------------------------------------------------------------------
4073 | Returns the result of converting the double-precision floating-point value
4074 | `a' to the quadruple-precision floating-point format. The conversion is
4075 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4076 | Arithmetic.
4077 *----------------------------------------------------------------------------*/
4079 float128 float64_to_float128(float64 a, float_status *status)
4081 flag aSign;
4082 int aExp;
4083 uint64_t aSig, zSig0, zSig1;
4085 a = float64_squash_input_denormal(a, status);
4086 aSig = extractFloat64Frac( a );
4087 aExp = extractFloat64Exp( a );
4088 aSign = extractFloat64Sign( a );
4089 if ( aExp == 0x7FF ) {
4090 if (aSig) {
4091 return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
4093 return packFloat128( aSign, 0x7FFF, 0, 0 );
4095 if ( aExp == 0 ) {
4096 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4097 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4098 --aExp;
4100 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
4101 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
4106 /*----------------------------------------------------------------------------
4107 | Returns the remainder of the double-precision floating-point value `a'
4108 | with respect to the corresponding value `b'. The operation is performed
4109 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4110 *----------------------------------------------------------------------------*/
4112 float64 float64_rem(float64 a, float64 b, float_status *status)
4114 flag aSign, zSign;
4115 int aExp, bExp, expDiff;
4116 uint64_t aSig, bSig;
4117 uint64_t q, alternateASig;
4118 int64_t sigMean;
4120 a = float64_squash_input_denormal(a, status);
4121 b = float64_squash_input_denormal(b, status);
4122 aSig = extractFloat64Frac( a );
4123 aExp = extractFloat64Exp( a );
4124 aSign = extractFloat64Sign( a );
4125 bSig = extractFloat64Frac( b );
4126 bExp = extractFloat64Exp( b );
4127 if ( aExp == 0x7FF ) {
4128 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4129 return propagateFloat64NaN(a, b, status);
4131 float_raise(float_flag_invalid, status);
4132 return float64_default_nan(status);
4134 if ( bExp == 0x7FF ) {
4135 if (bSig) {
4136 return propagateFloat64NaN(a, b, status);
4138 return a;
4140 if ( bExp == 0 ) {
4141 if ( bSig == 0 ) {
4142 float_raise(float_flag_invalid, status);
4143 return float64_default_nan(status);
4145 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4147 if ( aExp == 0 ) {
4148 if ( aSig == 0 ) return a;
4149 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4151 expDiff = aExp - bExp;
4152 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
4153 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4154 if ( expDiff < 0 ) {
4155 if ( expDiff < -1 ) return a;
4156 aSig >>= 1;
4158 q = ( bSig <= aSig );
4159 if ( q ) aSig -= bSig;
4160 expDiff -= 64;
4161 while ( 0 < expDiff ) {
4162 q = estimateDiv128To64( aSig, 0, bSig );
4163 q = ( 2 < q ) ? q - 2 : 0;
4164 aSig = - ( ( bSig>>2 ) * q );
4165 expDiff -= 62;
4167 expDiff += 64;
4168 if ( 0 < expDiff ) {
4169 q = estimateDiv128To64( aSig, 0, bSig );
4170 q = ( 2 < q ) ? q - 2 : 0;
4171 q >>= 64 - expDiff;
4172 bSig >>= 2;
4173 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4175 else {
4176 aSig >>= 2;
4177 bSig >>= 2;
4179 do {
4180 alternateASig = aSig;
4181 ++q;
4182 aSig -= bSig;
4183 } while ( 0 <= (int64_t) aSig );
4184 sigMean = aSig + alternateASig;
4185 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4186 aSig = alternateASig;
4188 zSign = ( (int64_t) aSig < 0 );
4189 if ( zSign ) aSig = - aSig;
4190 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
4194 /*----------------------------------------------------------------------------
4195 | Returns the binary log of the double-precision floating-point value `a'.
4196 | The operation is performed according to the IEC/IEEE Standard for Binary
4197 | Floating-Point Arithmetic.
4198 *----------------------------------------------------------------------------*/
4199 float64 float64_log2(float64 a, float_status *status)
4201 flag aSign, zSign;
4202 int aExp;
4203 uint64_t aSig, aSig0, aSig1, zSig, i;
4204 a = float64_squash_input_denormal(a, status);
4206 aSig = extractFloat64Frac( a );
4207 aExp = extractFloat64Exp( a );
4208 aSign = extractFloat64Sign( a );
4210 if ( aExp == 0 ) {
4211 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4212 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4214 if ( aSign ) {
4215 float_raise(float_flag_invalid, status);
4216 return float64_default_nan(status);
4218 if ( aExp == 0x7FF ) {
4219 if (aSig) {
4220 return propagateFloat64NaN(a, float64_zero, status);
4222 return a;
4225 aExp -= 0x3FF;
4226 aSig |= LIT64( 0x0010000000000000 );
4227 zSign = aExp < 0;
4228 zSig = (uint64_t)aExp << 52;
4229 for (i = 1LL << 51; i > 0; i >>= 1) {
4230 mul64To128( aSig, aSig, &aSig0, &aSig1 );
4231 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4232 if ( aSig & LIT64( 0x0020000000000000 ) ) {
4233 aSig >>= 1;
4234 zSig |= i;
4238 if ( zSign )
4239 zSig = -zSig;
4240 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
4243 /*----------------------------------------------------------------------------
4244 | Returns 1 if the double-precision floating-point value `a' is equal to the
4245 | corresponding value `b', and 0 otherwise. The invalid exception is raised
4246 | if either operand is a NaN. Otherwise, the comparison is performed
4247 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4248 *----------------------------------------------------------------------------*/
4250 int float64_eq(float64 a, float64 b, float_status *status)
4252 uint64_t av, bv;
4253 a = float64_squash_input_denormal(a, status);
4254 b = float64_squash_input_denormal(b, status);
4256 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4257 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4259 float_raise(float_flag_invalid, status);
4260 return 0;
4262 av = float64_val(a);
4263 bv = float64_val(b);
4264 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4268 /*----------------------------------------------------------------------------
4269 | Returns 1 if the double-precision floating-point value `a' is less than or
4270 | equal to the corresponding value `b', and 0 otherwise. The invalid
4271 | exception is raised if either operand is a NaN. The comparison is performed
4272 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4273 *----------------------------------------------------------------------------*/
4275 int float64_le(float64 a, float64 b, float_status *status)
4277 flag aSign, bSign;
4278 uint64_t av, bv;
4279 a = float64_squash_input_denormal(a, status);
4280 b = float64_squash_input_denormal(b, status);
4282 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4283 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4285 float_raise(float_flag_invalid, status);
4286 return 0;
4288 aSign = extractFloat64Sign( a );
4289 bSign = extractFloat64Sign( b );
4290 av = float64_val(a);
4291 bv = float64_val(b);
4292 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4293 return ( av == bv ) || ( aSign ^ ( av < bv ) );
4297 /*----------------------------------------------------------------------------
4298 | Returns 1 if the double-precision floating-point value `a' is less than
4299 | the corresponding value `b', and 0 otherwise. The invalid exception is
4300 | raised if either operand is a NaN. The comparison is performed according
4301 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4302 *----------------------------------------------------------------------------*/
4304 int float64_lt(float64 a, float64 b, float_status *status)
4306 flag aSign, bSign;
4307 uint64_t av, bv;
4309 a = float64_squash_input_denormal(a, status);
4310 b = float64_squash_input_denormal(b, status);
4311 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4312 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4314 float_raise(float_flag_invalid, status);
4315 return 0;
4317 aSign = extractFloat64Sign( a );
4318 bSign = extractFloat64Sign( b );
4319 av = float64_val(a);
4320 bv = float64_val(b);
4321 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4322 return ( av != bv ) && ( aSign ^ ( av < bv ) );
4326 /*----------------------------------------------------------------------------
4327 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4328 | be compared, and 0 otherwise. The invalid exception is raised if either
4329 | operand is a NaN. The comparison is performed according to the IEC/IEEE
4330 | Standard for Binary Floating-Point Arithmetic.
4331 *----------------------------------------------------------------------------*/
4333 int float64_unordered(float64 a, float64 b, float_status *status)
4335 a = float64_squash_input_denormal(a, status);
4336 b = float64_squash_input_denormal(b, status);
4338 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4339 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4341 float_raise(float_flag_invalid, status);
4342 return 1;
4344 return 0;
4347 /*----------------------------------------------------------------------------
4348 | Returns 1 if the double-precision floating-point value `a' is equal to the
4349 | corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4350 | exception.The comparison is performed according to the IEC/IEEE Standard
4351 | for Binary Floating-Point Arithmetic.
4352 *----------------------------------------------------------------------------*/
4354 int float64_eq_quiet(float64 a, float64 b, float_status *status)
4356 uint64_t av, bv;
4357 a = float64_squash_input_denormal(a, status);
4358 b = float64_squash_input_denormal(b, status);
4360 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4361 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4363 if (float64_is_signaling_nan(a, status)
4364 || float64_is_signaling_nan(b, status)) {
4365 float_raise(float_flag_invalid, status);
4367 return 0;
4369 av = float64_val(a);
4370 bv = float64_val(b);
4371 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4375 /*----------------------------------------------------------------------------
4376 | Returns 1 if the double-precision floating-point value `a' is less than or
4377 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4378 | cause an exception. Otherwise, the comparison is performed according to the
4379 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4380 *----------------------------------------------------------------------------*/
4382 int float64_le_quiet(float64 a, float64 b, float_status *status)
4384 flag aSign, bSign;
4385 uint64_t av, bv;
4386 a = float64_squash_input_denormal(a, status);
4387 b = float64_squash_input_denormal(b, status);
4389 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4390 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4392 if (float64_is_signaling_nan(a, status)
4393 || float64_is_signaling_nan(b, status)) {
4394 float_raise(float_flag_invalid, status);
4396 return 0;
4398 aSign = extractFloat64Sign( a );
4399 bSign = extractFloat64Sign( b );
4400 av = float64_val(a);
4401 bv = float64_val(b);
4402 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4403 return ( av == bv ) || ( aSign ^ ( av < bv ) );
4407 /*----------------------------------------------------------------------------
4408 | Returns 1 if the double-precision floating-point value `a' is less than
4409 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4410 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
4411 | Standard for Binary Floating-Point Arithmetic.
4412 *----------------------------------------------------------------------------*/
4414 int float64_lt_quiet(float64 a, float64 b, float_status *status)
4416 flag aSign, bSign;
4417 uint64_t av, bv;
4418 a = float64_squash_input_denormal(a, status);
4419 b = float64_squash_input_denormal(b, status);
4421 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4422 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4424 if (float64_is_signaling_nan(a, status)
4425 || float64_is_signaling_nan(b, status)) {
4426 float_raise(float_flag_invalid, status);
4428 return 0;
4430 aSign = extractFloat64Sign( a );
4431 bSign = extractFloat64Sign( b );
4432 av = float64_val(a);
4433 bv = float64_val(b);
4434 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4435 return ( av != bv ) && ( aSign ^ ( av < bv ) );
4439 /*----------------------------------------------------------------------------
4440 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4441 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
4442 | comparison is performed according to the IEC/IEEE Standard for Binary
4443 | Floating-Point Arithmetic.
4444 *----------------------------------------------------------------------------*/
4446 int float64_unordered_quiet(float64 a, float64 b, float_status *status)
4448 a = float64_squash_input_denormal(a, status);
4449 b = float64_squash_input_denormal(b, status);
4451 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4452 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4454 if (float64_is_signaling_nan(a, status)
4455 || float64_is_signaling_nan(b, status)) {
4456 float_raise(float_flag_invalid, status);
4458 return 1;
4460 return 0;
4463 /*----------------------------------------------------------------------------
4464 | Returns the result of converting the extended double-precision floating-
4465 | point value `a' to the 32-bit two's complement integer format. The
4466 | conversion is performed according to the IEC/IEEE Standard for Binary
4467 | Floating-Point Arithmetic---which means in particular that the conversion
4468 | is rounded according to the current rounding mode. If `a' is a NaN, the
4469 | largest positive integer is returned. Otherwise, if the conversion
4470 | overflows, the largest integer with the same sign as `a' is returned.
4471 *----------------------------------------------------------------------------*/
4473 int32_t floatx80_to_int32(floatx80 a, float_status *status)
4475 flag aSign;
4476 int32_t aExp, shiftCount;
4477 uint64_t aSig;
4479 if (floatx80_invalid_encoding(a)) {
4480 float_raise(float_flag_invalid, status);
4481 return 1 << 31;
4483 aSig = extractFloatx80Frac( a );
4484 aExp = extractFloatx80Exp( a );
4485 aSign = extractFloatx80Sign( a );
4486 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4487 shiftCount = 0x4037 - aExp;
4488 if ( shiftCount <= 0 ) shiftCount = 1;
4489 shift64RightJamming( aSig, shiftCount, &aSig );
4490 return roundAndPackInt32(aSign, aSig, status);
4494 /*----------------------------------------------------------------------------
4495 | Returns the result of converting the extended double-precision floating-
4496 | point value `a' to the 32-bit two's complement integer format. The
4497 | conversion is performed according to the IEC/IEEE Standard for Binary
4498 | Floating-Point Arithmetic, except that the conversion is always rounded
4499 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4500 | Otherwise, if the conversion overflows, the largest integer with the same
4501 | sign as `a' is returned.
4502 *----------------------------------------------------------------------------*/
4504 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
4506 flag aSign;
4507 int32_t aExp, shiftCount;
4508 uint64_t aSig, savedASig;
4509 int32_t z;
4511 if (floatx80_invalid_encoding(a)) {
4512 float_raise(float_flag_invalid, status);
4513 return 1 << 31;
4515 aSig = extractFloatx80Frac( a );
4516 aExp = extractFloatx80Exp( a );
4517 aSign = extractFloatx80Sign( a );
4518 if ( 0x401E < aExp ) {
4519 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4520 goto invalid;
4522 else if ( aExp < 0x3FFF ) {
4523 if (aExp || aSig) {
4524 status->float_exception_flags |= float_flag_inexact;
4526 return 0;
4528 shiftCount = 0x403E - aExp;
4529 savedASig = aSig;
4530 aSig >>= shiftCount;
4531 z = aSig;
4532 if ( aSign ) z = - z;
4533 if ( ( z < 0 ) ^ aSign ) {
4534 invalid:
4535 float_raise(float_flag_invalid, status);
4536 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4538 if ( ( aSig<<shiftCount ) != savedASig ) {
4539 status->float_exception_flags |= float_flag_inexact;
4541 return z;
4545 /*----------------------------------------------------------------------------
4546 | Returns the result of converting the extended double-precision floating-
4547 | point value `a' to the 64-bit two's complement integer format. The
4548 | conversion is performed according to the IEC/IEEE Standard for Binary
4549 | Floating-Point Arithmetic---which means in particular that the conversion
4550 | is rounded according to the current rounding mode. If `a' is a NaN,
4551 | the largest positive integer is returned. Otherwise, if the conversion
4552 | overflows, the largest integer with the same sign as `a' is returned.
4553 *----------------------------------------------------------------------------*/
4555 int64_t floatx80_to_int64(floatx80 a, float_status *status)
4557 flag aSign;
4558 int32_t aExp, shiftCount;
4559 uint64_t aSig, aSigExtra;
4561 if (floatx80_invalid_encoding(a)) {
4562 float_raise(float_flag_invalid, status);
4563 return 1ULL << 63;
4565 aSig = extractFloatx80Frac( a );
4566 aExp = extractFloatx80Exp( a );
4567 aSign = extractFloatx80Sign( a );
4568 shiftCount = 0x403E - aExp;
4569 if ( shiftCount <= 0 ) {
4570 if ( shiftCount ) {
4571 float_raise(float_flag_invalid, status);
4572 if (!aSign || floatx80_is_any_nan(a)) {
4573 return LIT64( 0x7FFFFFFFFFFFFFFF );
4575 return (int64_t) LIT64( 0x8000000000000000 );
4577 aSigExtra = 0;
4579 else {
4580 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4582 return roundAndPackInt64(aSign, aSig, aSigExtra, status);
4586 /*----------------------------------------------------------------------------
4587 | Returns the result of converting the extended double-precision floating-
4588 | point value `a' to the 64-bit two's complement integer format. The
4589 | conversion is performed according to the IEC/IEEE Standard for Binary
4590 | Floating-Point Arithmetic, except that the conversion is always rounded
4591 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4592 | Otherwise, if the conversion overflows, the largest integer with the same
4593 | sign as `a' is returned.
4594 *----------------------------------------------------------------------------*/
4596 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
4598 flag aSign;
4599 int32_t aExp, shiftCount;
4600 uint64_t aSig;
4601 int64_t z;
4603 if (floatx80_invalid_encoding(a)) {
4604 float_raise(float_flag_invalid, status);
4605 return 1ULL << 63;
4607 aSig = extractFloatx80Frac( a );
4608 aExp = extractFloatx80Exp( a );
4609 aSign = extractFloatx80Sign( a );
4610 shiftCount = aExp - 0x403E;
4611 if ( 0 <= shiftCount ) {
4612 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4613 if ( ( a.high != 0xC03E ) || aSig ) {
4614 float_raise(float_flag_invalid, status);
4615 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4616 return LIT64( 0x7FFFFFFFFFFFFFFF );
4619 return (int64_t) LIT64( 0x8000000000000000 );
4621 else if ( aExp < 0x3FFF ) {
4622 if (aExp | aSig) {
4623 status->float_exception_flags |= float_flag_inexact;
4625 return 0;
4627 z = aSig>>( - shiftCount );
4628 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4629 status->float_exception_flags |= float_flag_inexact;
4631 if ( aSign ) z = - z;
4632 return z;
4636 /*----------------------------------------------------------------------------
4637 | Returns the result of converting the extended double-precision floating-
4638 | point value `a' to the single-precision floating-point format. The
4639 | conversion is performed according to the IEC/IEEE Standard for Binary
4640 | Floating-Point Arithmetic.
4641 *----------------------------------------------------------------------------*/
4643 float32 floatx80_to_float32(floatx80 a, float_status *status)
4645 flag aSign;
4646 int32_t aExp;
4647 uint64_t aSig;
4649 if (floatx80_invalid_encoding(a)) {
4650 float_raise(float_flag_invalid, status);
4651 return float32_default_nan(status);
4653 aSig = extractFloatx80Frac( a );
4654 aExp = extractFloatx80Exp( a );
4655 aSign = extractFloatx80Sign( a );
4656 if ( aExp == 0x7FFF ) {
4657 if ( (uint64_t) ( aSig<<1 ) ) {
4658 return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
4660 return packFloat32( aSign, 0xFF, 0 );
4662 shift64RightJamming( aSig, 33, &aSig );
4663 if ( aExp || aSig ) aExp -= 0x3F81;
4664 return roundAndPackFloat32(aSign, aExp, aSig, status);
4668 /*----------------------------------------------------------------------------
4669 | Returns the result of converting the extended double-precision floating-
4670 | point value `a' to the double-precision floating-point format. The
4671 | conversion is performed according to the IEC/IEEE Standard for Binary
4672 | Floating-Point Arithmetic.
4673 *----------------------------------------------------------------------------*/
4675 float64 floatx80_to_float64(floatx80 a, float_status *status)
4677 flag aSign;
4678 int32_t aExp;
4679 uint64_t aSig, zSig;
4681 if (floatx80_invalid_encoding(a)) {
4682 float_raise(float_flag_invalid, status);
4683 return float64_default_nan(status);
4685 aSig = extractFloatx80Frac( a );
4686 aExp = extractFloatx80Exp( a );
4687 aSign = extractFloatx80Sign( a );
4688 if ( aExp == 0x7FFF ) {
4689 if ( (uint64_t) ( aSig<<1 ) ) {
4690 return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
4692 return packFloat64( aSign, 0x7FF, 0 );
4694 shift64RightJamming( aSig, 1, &zSig );
4695 if ( aExp || aSig ) aExp -= 0x3C01;
4696 return roundAndPackFloat64(aSign, aExp, zSig, status);
4700 /*----------------------------------------------------------------------------
4701 | Returns the result of converting the extended double-precision floating-
4702 | point value `a' to the quadruple-precision floating-point format. The
4703 | conversion is performed according to the IEC/IEEE Standard for Binary
4704 | Floating-Point Arithmetic.
4705 *----------------------------------------------------------------------------*/
4707 float128 floatx80_to_float128(floatx80 a, float_status *status)
4709 flag aSign;
4710 int aExp;
4711 uint64_t aSig, zSig0, zSig1;
4713 if (floatx80_invalid_encoding(a)) {
4714 float_raise(float_flag_invalid, status);
4715 return float128_default_nan(status);
4717 aSig = extractFloatx80Frac( a );
4718 aExp = extractFloatx80Exp( a );
4719 aSign = extractFloatx80Sign( a );
4720 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4721 return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
4723 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4724 return packFloat128( aSign, aExp, zSig0, zSig1 );
4728 /*----------------------------------------------------------------------------
4729 | Rounds the extended double-precision floating-point value `a'
4730 | to the precision provided by floatx80_rounding_precision and returns the
4731 | result as an extended double-precision floating-point value.
4732 | The operation is performed according to the IEC/IEEE Standard for Binary
4733 | Floating-Point Arithmetic.
4734 *----------------------------------------------------------------------------*/
4736 floatx80 floatx80_round(floatx80 a, float_status *status)
4738 return roundAndPackFloatx80(status->floatx80_rounding_precision,
4739 extractFloatx80Sign(a),
4740 extractFloatx80Exp(a),
4741 extractFloatx80Frac(a), 0, status);
4744 /*----------------------------------------------------------------------------
4745 | Rounds the extended double-precision floating-point value `a' to an integer,
4746 | and returns the result as an extended quadruple-precision floating-point
4747 | value. The operation is performed according to the IEC/IEEE Standard for
4748 | Binary Floating-Point Arithmetic.
4749 *----------------------------------------------------------------------------*/
4751 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
4753 flag aSign;
4754 int32_t aExp;
4755 uint64_t lastBitMask, roundBitsMask;
4756 floatx80 z;
4758 if (floatx80_invalid_encoding(a)) {
4759 float_raise(float_flag_invalid, status);
4760 return floatx80_default_nan(status);
4762 aExp = extractFloatx80Exp( a );
4763 if ( 0x403E <= aExp ) {
4764 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4765 return propagateFloatx80NaN(a, a, status);
4767 return a;
4769 if ( aExp < 0x3FFF ) {
4770 if ( ( aExp == 0 )
4771 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4772 return a;
4774 status->float_exception_flags |= float_flag_inexact;
4775 aSign = extractFloatx80Sign( a );
4776 switch (status->float_rounding_mode) {
4777 case float_round_nearest_even:
4778 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4780 return
4781 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4783 break;
4784 case float_round_ties_away:
4785 if (aExp == 0x3FFE) {
4786 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
4788 break;
4789 case float_round_down:
4790 return
4791 aSign ?
4792 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4793 : packFloatx80( 0, 0, 0 );
4794 case float_round_up:
4795 return
4796 aSign ? packFloatx80( 1, 0, 0 )
4797 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4799 return packFloatx80( aSign, 0, 0 );
4801 lastBitMask = 1;
4802 lastBitMask <<= 0x403E - aExp;
4803 roundBitsMask = lastBitMask - 1;
4804 z = a;
4805 switch (status->float_rounding_mode) {
4806 case float_round_nearest_even:
4807 z.low += lastBitMask>>1;
4808 if ((z.low & roundBitsMask) == 0) {
4809 z.low &= ~lastBitMask;
4811 break;
4812 case float_round_ties_away:
4813 z.low += lastBitMask >> 1;
4814 break;
4815 case float_round_to_zero:
4816 break;
4817 case float_round_up:
4818 if (!extractFloatx80Sign(z)) {
4819 z.low += roundBitsMask;
4821 break;
4822 case float_round_down:
4823 if (extractFloatx80Sign(z)) {
4824 z.low += roundBitsMask;
4826 break;
4827 default:
4828 abort();
4830 z.low &= ~ roundBitsMask;
4831 if ( z.low == 0 ) {
4832 ++z.high;
4833 z.low = LIT64( 0x8000000000000000 );
4835 if (z.low != a.low) {
4836 status->float_exception_flags |= float_flag_inexact;
4838 return z;
4842 /*----------------------------------------------------------------------------
4843 | Returns the result of adding the absolute values of the extended double-
4844 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
4845 | negated before being returned. `zSign' is ignored if the result is a NaN.
4846 | The addition is performed according to the IEC/IEEE Standard for Binary
4847 | Floating-Point Arithmetic.
4848 *----------------------------------------------------------------------------*/
4850 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4851 float_status *status)
4853 int32_t aExp, bExp, zExp;
4854 uint64_t aSig, bSig, zSig0, zSig1;
4855 int32_t expDiff;
4857 aSig = extractFloatx80Frac( a );
4858 aExp = extractFloatx80Exp( a );
4859 bSig = extractFloatx80Frac( b );
4860 bExp = extractFloatx80Exp( b );
4861 expDiff = aExp - bExp;
4862 if ( 0 < expDiff ) {
4863 if ( aExp == 0x7FFF ) {
4864 if ((uint64_t)(aSig << 1)) {
4865 return propagateFloatx80NaN(a, b, status);
4867 return a;
4869 if ( bExp == 0 ) --expDiff;
4870 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4871 zExp = aExp;
4873 else if ( expDiff < 0 ) {
4874 if ( bExp == 0x7FFF ) {
4875 if ((uint64_t)(bSig << 1)) {
4876 return propagateFloatx80NaN(a, b, status);
4878 return packFloatx80(zSign,
4879 floatx80_infinity_high,
4880 floatx80_infinity_low);
4882 if ( aExp == 0 ) ++expDiff;
4883 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4884 zExp = bExp;
4886 else {
4887 if ( aExp == 0x7FFF ) {
4888 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4889 return propagateFloatx80NaN(a, b, status);
4891 return a;
4893 zSig1 = 0;
4894 zSig0 = aSig + bSig;
4895 if ( aExp == 0 ) {
4896 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4897 goto roundAndPack;
4899 zExp = aExp;
4900 goto shiftRight1;
4902 zSig0 = aSig + bSig;
4903 if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
4904 shiftRight1:
4905 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4906 zSig0 |= LIT64( 0x8000000000000000 );
4907 ++zExp;
4908 roundAndPack:
4909 return roundAndPackFloatx80(status->floatx80_rounding_precision,
4910 zSign, zExp, zSig0, zSig1, status);
4913 /*----------------------------------------------------------------------------
4914 | Returns the result of subtracting the absolute values of the extended
4915 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
4916 | difference is negated before being returned. `zSign' is ignored if the
4917 | result is a NaN. The subtraction is performed according to the IEC/IEEE
4918 | Standard for Binary Floating-Point Arithmetic.
4919 *----------------------------------------------------------------------------*/
4921 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4922 float_status *status)
4924 int32_t aExp, bExp, zExp;
4925 uint64_t aSig, bSig, zSig0, zSig1;
4926 int32_t expDiff;
4928 aSig = extractFloatx80Frac( a );
4929 aExp = extractFloatx80Exp( a );
4930 bSig = extractFloatx80Frac( b );
4931 bExp = extractFloatx80Exp( b );
4932 expDiff = aExp - bExp;
4933 if ( 0 < expDiff ) goto aExpBigger;
4934 if ( expDiff < 0 ) goto bExpBigger;
4935 if ( aExp == 0x7FFF ) {
4936 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4937 return propagateFloatx80NaN(a, b, status);
4939 float_raise(float_flag_invalid, status);
4940 return floatx80_default_nan(status);
4942 if ( aExp == 0 ) {
4943 aExp = 1;
4944 bExp = 1;
4946 zSig1 = 0;
4947 if ( bSig < aSig ) goto aBigger;
4948 if ( aSig < bSig ) goto bBigger;
4949 return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
4950 bExpBigger:
4951 if ( bExp == 0x7FFF ) {
4952 if ((uint64_t)(bSig << 1)) {
4953 return propagateFloatx80NaN(a, b, status);
4955 return packFloatx80(zSign ^ 1, floatx80_infinity_high,
4956 floatx80_infinity_low);
4958 if ( aExp == 0 ) ++expDiff;
4959 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4960 bBigger:
4961 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4962 zExp = bExp;
4963 zSign ^= 1;
4964 goto normalizeRoundAndPack;
4965 aExpBigger:
4966 if ( aExp == 0x7FFF ) {
4967 if ((uint64_t)(aSig << 1)) {
4968 return propagateFloatx80NaN(a, b, status);
4970 return a;
4972 if ( bExp == 0 ) --expDiff;
4973 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4974 aBigger:
4975 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4976 zExp = aExp;
4977 normalizeRoundAndPack:
4978 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
4979 zSign, zExp, zSig0, zSig1, status);
4982 /*----------------------------------------------------------------------------
4983 | Returns the result of adding the extended double-precision floating-point
4984 | values `a' and `b'. The operation is performed according to the IEC/IEEE
4985 | Standard for Binary Floating-Point Arithmetic.
4986 *----------------------------------------------------------------------------*/
4988 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
4990 flag aSign, bSign;
4992 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
4993 float_raise(float_flag_invalid, status);
4994 return floatx80_default_nan(status);
4996 aSign = extractFloatx80Sign( a );
4997 bSign = extractFloatx80Sign( b );
4998 if ( aSign == bSign ) {
4999 return addFloatx80Sigs(a, b, aSign, status);
5001 else {
5002 return subFloatx80Sigs(a, b, aSign, status);
5007 /*----------------------------------------------------------------------------
5008 | Returns the result of subtracting the extended double-precision floating-
5009 | point values `a' and `b'. The operation is performed according to the
5010 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5011 *----------------------------------------------------------------------------*/
5013 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5015 flag aSign, bSign;
5017 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5018 float_raise(float_flag_invalid, status);
5019 return floatx80_default_nan(status);
5021 aSign = extractFloatx80Sign( a );
5022 bSign = extractFloatx80Sign( b );
5023 if ( aSign == bSign ) {
5024 return subFloatx80Sigs(a, b, aSign, status);
5026 else {
5027 return addFloatx80Sigs(a, b, aSign, status);
5032 /*----------------------------------------------------------------------------
5033 | Returns the result of multiplying the extended double-precision floating-
5034 | point values `a' and `b'. The operation is performed according to the
5035 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5036 *----------------------------------------------------------------------------*/
5038 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
5040 flag aSign, bSign, zSign;
5041 int32_t aExp, bExp, zExp;
5042 uint64_t aSig, bSig, zSig0, zSig1;
5044 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5045 float_raise(float_flag_invalid, status);
5046 return floatx80_default_nan(status);
5048 aSig = extractFloatx80Frac( a );
5049 aExp = extractFloatx80Exp( a );
5050 aSign = extractFloatx80Sign( a );
5051 bSig = extractFloatx80Frac( b );
5052 bExp = extractFloatx80Exp( b );
5053 bSign = extractFloatx80Sign( b );
5054 zSign = aSign ^ bSign;
5055 if ( aExp == 0x7FFF ) {
5056 if ( (uint64_t) ( aSig<<1 )
5057 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5058 return propagateFloatx80NaN(a, b, status);
5060 if ( ( bExp | bSig ) == 0 ) goto invalid;
5061 return packFloatx80(zSign, floatx80_infinity_high,
5062 floatx80_infinity_low);
5064 if ( bExp == 0x7FFF ) {
5065 if ((uint64_t)(bSig << 1)) {
5066 return propagateFloatx80NaN(a, b, status);
5068 if ( ( aExp | aSig ) == 0 ) {
5069 invalid:
5070 float_raise(float_flag_invalid, status);
5071 return floatx80_default_nan(status);
5073 return packFloatx80(zSign, floatx80_infinity_high,
5074 floatx80_infinity_low);
5076 if ( aExp == 0 ) {
5077 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5078 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5080 if ( bExp == 0 ) {
5081 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5082 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5084 zExp = aExp + bExp - 0x3FFE;
5085 mul64To128( aSig, bSig, &zSig0, &zSig1 );
5086 if ( 0 < (int64_t) zSig0 ) {
5087 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5088 --zExp;
5090 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5091 zSign, zExp, zSig0, zSig1, status);
5094 /*----------------------------------------------------------------------------
5095 | Returns the result of dividing the extended double-precision floating-point
5096 | value `a' by the corresponding value `b'. The operation is performed
5097 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5098 *----------------------------------------------------------------------------*/
5100 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
5102 flag aSign, bSign, zSign;
5103 int32_t aExp, bExp, zExp;
5104 uint64_t aSig, bSig, zSig0, zSig1;
5105 uint64_t rem0, rem1, rem2, term0, term1, term2;
5107 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5108 float_raise(float_flag_invalid, status);
5109 return floatx80_default_nan(status);
5111 aSig = extractFloatx80Frac( a );
5112 aExp = extractFloatx80Exp( a );
5113 aSign = extractFloatx80Sign( a );
5114 bSig = extractFloatx80Frac( b );
5115 bExp = extractFloatx80Exp( b );
5116 bSign = extractFloatx80Sign( b );
5117 zSign = aSign ^ bSign;
5118 if ( aExp == 0x7FFF ) {
5119 if ((uint64_t)(aSig << 1)) {
5120 return propagateFloatx80NaN(a, b, status);
5122 if ( bExp == 0x7FFF ) {
5123 if ((uint64_t)(bSig << 1)) {
5124 return propagateFloatx80NaN(a, b, status);
5126 goto invalid;
5128 return packFloatx80(zSign, floatx80_infinity_high,
5129 floatx80_infinity_low);
5131 if ( bExp == 0x7FFF ) {
5132 if ((uint64_t)(bSig << 1)) {
5133 return propagateFloatx80NaN(a, b, status);
5135 return packFloatx80( zSign, 0, 0 );
5137 if ( bExp == 0 ) {
5138 if ( bSig == 0 ) {
5139 if ( ( aExp | aSig ) == 0 ) {
5140 invalid:
5141 float_raise(float_flag_invalid, status);
5142 return floatx80_default_nan(status);
5144 float_raise(float_flag_divbyzero, status);
5145 return packFloatx80(zSign, floatx80_infinity_high,
5146 floatx80_infinity_low);
5148 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5150 if ( aExp == 0 ) {
5151 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5152 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5154 zExp = aExp - bExp + 0x3FFE;
5155 rem1 = 0;
5156 if ( bSig <= aSig ) {
5157 shift128Right( aSig, 0, 1, &aSig, &rem1 );
5158 ++zExp;
5160 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5161 mul64To128( bSig, zSig0, &term0, &term1 );
5162 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
5163 while ( (int64_t) rem0 < 0 ) {
5164 --zSig0;
5165 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5167 zSig1 = estimateDiv128To64( rem1, 0, bSig );
5168 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
5169 mul64To128( bSig, zSig1, &term1, &term2 );
5170 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5171 while ( (int64_t) rem1 < 0 ) {
5172 --zSig1;
5173 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5175 zSig1 |= ( ( rem1 | rem2 ) != 0 );
5177 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5178 zSign, zExp, zSig0, zSig1, status);
5181 /*----------------------------------------------------------------------------
5182 | Returns the remainder of the extended double-precision floating-point value
5183 | `a' with respect to the corresponding value `b'. The operation is performed
5184 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5185 *----------------------------------------------------------------------------*/
5187 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
5189 flag aSign, zSign;
5190 int32_t aExp, bExp, expDiff;
5191 uint64_t aSig0, aSig1, bSig;
5192 uint64_t q, term0, term1, alternateASig0, alternateASig1;
5194 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5195 float_raise(float_flag_invalid, status);
5196 return floatx80_default_nan(status);
5198 aSig0 = extractFloatx80Frac( a );
5199 aExp = extractFloatx80Exp( a );
5200 aSign = extractFloatx80Sign( a );
5201 bSig = extractFloatx80Frac( b );
5202 bExp = extractFloatx80Exp( b );
5203 if ( aExp == 0x7FFF ) {
5204 if ( (uint64_t) ( aSig0<<1 )
5205 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5206 return propagateFloatx80NaN(a, b, status);
5208 goto invalid;
5210 if ( bExp == 0x7FFF ) {
5211 if ((uint64_t)(bSig << 1)) {
5212 return propagateFloatx80NaN(a, b, status);
5214 return a;
5216 if ( bExp == 0 ) {
5217 if ( bSig == 0 ) {
5218 invalid:
5219 float_raise(float_flag_invalid, status);
5220 return floatx80_default_nan(status);
5222 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5224 if ( aExp == 0 ) {
5225 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5226 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5228 bSig |= LIT64( 0x8000000000000000 );
5229 zSign = aSign;
5230 expDiff = aExp - bExp;
5231 aSig1 = 0;
5232 if ( expDiff < 0 ) {
5233 if ( expDiff < -1 ) return a;
5234 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5235 expDiff = 0;
5237 q = ( bSig <= aSig0 );
5238 if ( q ) aSig0 -= bSig;
5239 expDiff -= 64;
5240 while ( 0 < expDiff ) {
5241 q = estimateDiv128To64( aSig0, aSig1, bSig );
5242 q = ( 2 < q ) ? q - 2 : 0;
5243 mul64To128( bSig, q, &term0, &term1 );
5244 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5245 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5246 expDiff -= 62;
5248 expDiff += 64;
5249 if ( 0 < expDiff ) {
5250 q = estimateDiv128To64( aSig0, aSig1, bSig );
5251 q = ( 2 < q ) ? q - 2 : 0;
5252 q >>= 64 - expDiff;
5253 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5254 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5255 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5256 while ( le128( term0, term1, aSig0, aSig1 ) ) {
5257 ++q;
5258 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5261 else {
5262 term1 = 0;
5263 term0 = bSig;
5265 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5266 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5267 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5268 && ( q & 1 ) )
5270 aSig0 = alternateASig0;
5271 aSig1 = alternateASig1;
5272 zSign = ! zSign;
5274 return
5275 normalizeRoundAndPackFloatx80(
5276 80, zSign, bExp + expDiff, aSig0, aSig1, status);
5280 /*----------------------------------------------------------------------------
5281 | Returns the square root of the extended double-precision floating-point
5282 | value `a'. The operation is performed according to the IEC/IEEE Standard
5283 | for Binary Floating-Point Arithmetic.
5284 *----------------------------------------------------------------------------*/
5286 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
5288 flag aSign;
5289 int32_t aExp, zExp;
5290 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5291 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5293 if (floatx80_invalid_encoding(a)) {
5294 float_raise(float_flag_invalid, status);
5295 return floatx80_default_nan(status);
5297 aSig0 = extractFloatx80Frac( a );
5298 aExp = extractFloatx80Exp( a );
5299 aSign = extractFloatx80Sign( a );
5300 if ( aExp == 0x7FFF ) {
5301 if ((uint64_t)(aSig0 << 1)) {
5302 return propagateFloatx80NaN(a, a, status);
5304 if ( ! aSign ) return a;
5305 goto invalid;
5307 if ( aSign ) {
5308 if ( ( aExp | aSig0 ) == 0 ) return a;
5309 invalid:
5310 float_raise(float_flag_invalid, status);
5311 return floatx80_default_nan(status);
5313 if ( aExp == 0 ) {
5314 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5315 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5317 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5318 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5319 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5320 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5321 doubleZSig0 = zSig0<<1;
5322 mul64To128( zSig0, zSig0, &term0, &term1 );
5323 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5324 while ( (int64_t) rem0 < 0 ) {
5325 --zSig0;
5326 doubleZSig0 -= 2;
5327 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5329 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5330 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5331 if ( zSig1 == 0 ) zSig1 = 1;
5332 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5333 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5334 mul64To128( zSig1, zSig1, &term2, &term3 );
5335 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5336 while ( (int64_t) rem1 < 0 ) {
5337 --zSig1;
5338 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5339 term3 |= 1;
5340 term2 |= doubleZSig0;
5341 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5343 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5345 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5346 zSig0 |= doubleZSig0;
5347 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5348 0, zExp, zSig0, zSig1, status);
5351 /*----------------------------------------------------------------------------
5352 | Returns 1 if the extended double-precision floating-point value `a' is equal
5353 | to the corresponding value `b', and 0 otherwise. The invalid exception is
5354 | raised if either operand is a NaN. Otherwise, the comparison is performed
5355 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5356 *----------------------------------------------------------------------------*/
5358 int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
5361 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5362 || (extractFloatx80Exp(a) == 0x7FFF
5363 && (uint64_t) (extractFloatx80Frac(a) << 1))
5364 || (extractFloatx80Exp(b) == 0x7FFF
5365 && (uint64_t) (extractFloatx80Frac(b) << 1))
5367 float_raise(float_flag_invalid, status);
5368 return 0;
5370 return
5371 ( a.low == b.low )
5372 && ( ( a.high == b.high )
5373 || ( ( a.low == 0 )
5374 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5379 /*----------------------------------------------------------------------------
5380 | Returns 1 if the extended double-precision floating-point value `a' is
5381 | less than or equal to the corresponding value `b', and 0 otherwise. The
5382 | invalid exception is raised if either operand is a NaN. The comparison is
5383 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5384 | Arithmetic.
5385 *----------------------------------------------------------------------------*/
5387 int floatx80_le(floatx80 a, floatx80 b, float_status *status)
5389 flag aSign, bSign;
5391 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5392 || (extractFloatx80Exp(a) == 0x7FFF
5393 && (uint64_t) (extractFloatx80Frac(a) << 1))
5394 || (extractFloatx80Exp(b) == 0x7FFF
5395 && (uint64_t) (extractFloatx80Frac(b) << 1))
5397 float_raise(float_flag_invalid, status);
5398 return 0;
5400 aSign = extractFloatx80Sign( a );
5401 bSign = extractFloatx80Sign( b );
5402 if ( aSign != bSign ) {
5403 return
5404 aSign
5405 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5406 == 0 );
5408 return
5409 aSign ? le128( b.high, b.low, a.high, a.low )
5410 : le128( a.high, a.low, b.high, b.low );
5414 /*----------------------------------------------------------------------------
5415 | Returns 1 if the extended double-precision floating-point value `a' is
5416 | less than the corresponding value `b', and 0 otherwise. The invalid
5417 | exception is raised if either operand is a NaN. The comparison is performed
5418 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5419 *----------------------------------------------------------------------------*/
5421 int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
5423 flag aSign, bSign;
5425 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5426 || (extractFloatx80Exp(a) == 0x7FFF
5427 && (uint64_t) (extractFloatx80Frac(a) << 1))
5428 || (extractFloatx80Exp(b) == 0x7FFF
5429 && (uint64_t) (extractFloatx80Frac(b) << 1))
5431 float_raise(float_flag_invalid, status);
5432 return 0;
5434 aSign = extractFloatx80Sign( a );
5435 bSign = extractFloatx80Sign( b );
5436 if ( aSign != bSign ) {
5437 return
5438 aSign
5439 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5440 != 0 );
5442 return
5443 aSign ? lt128( b.high, b.low, a.high, a.low )
5444 : lt128( a.high, a.low, b.high, b.low );
5448 /*----------------------------------------------------------------------------
5449 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5450 | cannot be compared, and 0 otherwise. The invalid exception is raised if
5451 | either operand is a NaN. The comparison is performed according to the
5452 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5453 *----------------------------------------------------------------------------*/
5454 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
5456 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5457 || (extractFloatx80Exp(a) == 0x7FFF
5458 && (uint64_t) (extractFloatx80Frac(a) << 1))
5459 || (extractFloatx80Exp(b) == 0x7FFF
5460 && (uint64_t) (extractFloatx80Frac(b) << 1))
5462 float_raise(float_flag_invalid, status);
5463 return 1;
5465 return 0;
5468 /*----------------------------------------------------------------------------
5469 | Returns 1 if the extended double-precision floating-point value `a' is
5470 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5471 | cause an exception. The comparison is performed according to the IEC/IEEE
5472 | Standard for Binary Floating-Point Arithmetic.
5473 *----------------------------------------------------------------------------*/
5475 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
5478 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5479 float_raise(float_flag_invalid, status);
5480 return 0;
5482 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5483 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5484 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5485 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5487 if (floatx80_is_signaling_nan(a, status)
5488 || floatx80_is_signaling_nan(b, status)) {
5489 float_raise(float_flag_invalid, status);
5491 return 0;
5493 return
5494 ( a.low == b.low )
5495 && ( ( a.high == b.high )
5496 || ( ( a.low == 0 )
5497 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5502 /*----------------------------------------------------------------------------
5503 | Returns 1 if the extended double-precision floating-point value `a' is less
5504 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
5505 | do not cause an exception. Otherwise, the comparison is performed according
5506 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5507 *----------------------------------------------------------------------------*/
5509 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
5511 flag aSign, bSign;
5513 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5514 float_raise(float_flag_invalid, status);
5515 return 0;
5517 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5518 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5519 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5520 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5522 if (floatx80_is_signaling_nan(a, status)
5523 || floatx80_is_signaling_nan(b, status)) {
5524 float_raise(float_flag_invalid, status);
5526 return 0;
5528 aSign = extractFloatx80Sign( a );
5529 bSign = extractFloatx80Sign( b );
5530 if ( aSign != bSign ) {
5531 return
5532 aSign
5533 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5534 == 0 );
5536 return
5537 aSign ? le128( b.high, b.low, a.high, a.low )
5538 : le128( a.high, a.low, b.high, b.low );
5542 /*----------------------------------------------------------------------------
5543 | Returns 1 if the extended double-precision floating-point value `a' is less
5544 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
5545 | an exception. Otherwise, the comparison is performed according to the
5546 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5547 *----------------------------------------------------------------------------*/
5549 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
5551 flag aSign, bSign;
5553 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5554 float_raise(float_flag_invalid, status);
5555 return 0;
5557 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5558 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5559 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5560 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5562 if (floatx80_is_signaling_nan(a, status)
5563 || floatx80_is_signaling_nan(b, status)) {
5564 float_raise(float_flag_invalid, status);
5566 return 0;
5568 aSign = extractFloatx80Sign( a );
5569 bSign = extractFloatx80Sign( b );
5570 if ( aSign != bSign ) {
5571 return
5572 aSign
5573 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5574 != 0 );
5576 return
5577 aSign ? lt128( b.high, b.low, a.high, a.low )
5578 : lt128( a.high, a.low, b.high, b.low );
5582 /*----------------------------------------------------------------------------
5583 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5584 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
5585 | The comparison is performed according to the IEC/IEEE Standard for Binary
5586 | Floating-Point Arithmetic.
5587 *----------------------------------------------------------------------------*/
5588 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
5590 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5591 float_raise(float_flag_invalid, status);
5592 return 1;
5594 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5595 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5596 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5597 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5599 if (floatx80_is_signaling_nan(a, status)
5600 || floatx80_is_signaling_nan(b, status)) {
5601 float_raise(float_flag_invalid, status);
5603 return 1;
5605 return 0;
5608 /*----------------------------------------------------------------------------
5609 | Returns the result of converting the quadruple-precision floating-point
5610 | value `a' to the 32-bit two's complement integer format. The conversion
5611 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5612 | Arithmetic---which means in particular that the conversion is rounded
5613 | according to the current rounding mode. If `a' is a NaN, the largest
5614 | positive integer is returned. Otherwise, if the conversion overflows, the
5615 | largest integer with the same sign as `a' is returned.
5616 *----------------------------------------------------------------------------*/
5618 int32_t float128_to_int32(float128 a, float_status *status)
5620 flag aSign;
5621 int32_t aExp, shiftCount;
5622 uint64_t aSig0, aSig1;
5624 aSig1 = extractFloat128Frac1( a );
5625 aSig0 = extractFloat128Frac0( a );
5626 aExp = extractFloat128Exp( a );
5627 aSign = extractFloat128Sign( a );
5628 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5629 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5630 aSig0 |= ( aSig1 != 0 );
5631 shiftCount = 0x4028 - aExp;
5632 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5633 return roundAndPackInt32(aSign, aSig0, status);
5637 /*----------------------------------------------------------------------------
5638 | Returns the result of converting the quadruple-precision floating-point
5639 | value `a' to the 32-bit two's complement integer format. The conversion
5640 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5641 | Arithmetic, except that the conversion is always rounded toward zero. If
5642 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
5643 | conversion overflows, the largest integer with the same sign as `a' is
5644 | returned.
5645 *----------------------------------------------------------------------------*/
5647 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
5649 flag aSign;
5650 int32_t aExp, shiftCount;
5651 uint64_t aSig0, aSig1, savedASig;
5652 int32_t z;
5654 aSig1 = extractFloat128Frac1( a );
5655 aSig0 = extractFloat128Frac0( a );
5656 aExp = extractFloat128Exp( a );
5657 aSign = extractFloat128Sign( a );
5658 aSig0 |= ( aSig1 != 0 );
5659 if ( 0x401E < aExp ) {
5660 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5661 goto invalid;
5663 else if ( aExp < 0x3FFF ) {
5664 if (aExp || aSig0) {
5665 status->float_exception_flags |= float_flag_inexact;
5667 return 0;
5669 aSig0 |= LIT64( 0x0001000000000000 );
5670 shiftCount = 0x402F - aExp;
5671 savedASig = aSig0;
5672 aSig0 >>= shiftCount;
5673 z = aSig0;
5674 if ( aSign ) z = - z;
5675 if ( ( z < 0 ) ^ aSign ) {
5676 invalid:
5677 float_raise(float_flag_invalid, status);
5678 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5680 if ( ( aSig0<<shiftCount ) != savedASig ) {
5681 status->float_exception_flags |= float_flag_inexact;
5683 return z;
5687 /*----------------------------------------------------------------------------
5688 | Returns the result of converting the quadruple-precision floating-point
5689 | value `a' to the 64-bit two's complement integer format. The conversion
5690 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5691 | Arithmetic---which means in particular that the conversion is rounded
5692 | according to the current rounding mode. If `a' is a NaN, the largest
5693 | positive integer is returned. Otherwise, if the conversion overflows, the
5694 | largest integer with the same sign as `a' is returned.
5695 *----------------------------------------------------------------------------*/
5697 int64_t float128_to_int64(float128 a, float_status *status)
5699 flag aSign;
5700 int32_t aExp, shiftCount;
5701 uint64_t aSig0, aSig1;
5703 aSig1 = extractFloat128Frac1( a );
5704 aSig0 = extractFloat128Frac0( a );
5705 aExp = extractFloat128Exp( a );
5706 aSign = extractFloat128Sign( a );
5707 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5708 shiftCount = 0x402F - aExp;
5709 if ( shiftCount <= 0 ) {
5710 if ( 0x403E < aExp ) {
5711 float_raise(float_flag_invalid, status);
5712 if ( ! aSign
5713 || ( ( aExp == 0x7FFF )
5714 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5717 return LIT64( 0x7FFFFFFFFFFFFFFF );
5719 return (int64_t) LIT64( 0x8000000000000000 );
5721 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5723 else {
5724 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5726 return roundAndPackInt64(aSign, aSig0, aSig1, status);
5730 /*----------------------------------------------------------------------------
5731 | Returns the result of converting the quadruple-precision floating-point
5732 | value `a' to the 64-bit two's complement integer format. The conversion
5733 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5734 | Arithmetic, except that the conversion is always rounded toward zero.
5735 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
5736 | the conversion overflows, the largest integer with the same sign as `a' is
5737 | returned.
5738 *----------------------------------------------------------------------------*/
5740 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
5742 flag aSign;
5743 int32_t aExp, shiftCount;
5744 uint64_t aSig0, aSig1;
5745 int64_t z;
5747 aSig1 = extractFloat128Frac1( a );
5748 aSig0 = extractFloat128Frac0( a );
5749 aExp = extractFloat128Exp( a );
5750 aSign = extractFloat128Sign( a );
5751 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5752 shiftCount = aExp - 0x402F;
5753 if ( 0 < shiftCount ) {
5754 if ( 0x403E <= aExp ) {
5755 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5756 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
5757 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5758 if (aSig1) {
5759 status->float_exception_flags |= float_flag_inexact;
5762 else {
5763 float_raise(float_flag_invalid, status);
5764 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5765 return LIT64( 0x7FFFFFFFFFFFFFFF );
5768 return (int64_t) LIT64( 0x8000000000000000 );
5770 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
5771 if ( (uint64_t) ( aSig1<<shiftCount ) ) {
5772 status->float_exception_flags |= float_flag_inexact;
5775 else {
5776 if ( aExp < 0x3FFF ) {
5777 if ( aExp | aSig0 | aSig1 ) {
5778 status->float_exception_flags |= float_flag_inexact;
5780 return 0;
5782 z = aSig0>>( - shiftCount );
5783 if ( aSig1
5784 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
5785 status->float_exception_flags |= float_flag_inexact;
5788 if ( aSign ) z = - z;
5789 return z;
5793 /*----------------------------------------------------------------------------
5794 | Returns the result of converting the quadruple-precision floating-point value
5795 | `a' to the 64-bit unsigned integer format. The conversion is
5796 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5797 | Arithmetic---which means in particular that the conversion is rounded
5798 | according to the current rounding mode. If `a' is a NaN, the largest
5799 | positive integer is returned. If the conversion overflows, the
5800 | largest unsigned integer is returned. If 'a' is negative, the value is
5801 | rounded and zero is returned; negative values that do not round to zero
5802 | will raise the inexact exception.
5803 *----------------------------------------------------------------------------*/
5805 uint64_t float128_to_uint64(float128 a, float_status *status)
5807 flag aSign;
5808 int aExp;
5809 int shiftCount;
5810 uint64_t aSig0, aSig1;
5812 aSig0 = extractFloat128Frac0(a);
5813 aSig1 = extractFloat128Frac1(a);
5814 aExp = extractFloat128Exp(a);
5815 aSign = extractFloat128Sign(a);
5816 if (aSign && (aExp > 0x3FFE)) {
5817 float_raise(float_flag_invalid, status);
5818 if (float128_is_any_nan(a)) {
5819 return LIT64(0xFFFFFFFFFFFFFFFF);
5820 } else {
5821 return 0;
5824 if (aExp) {
5825 aSig0 |= LIT64(0x0001000000000000);
5827 shiftCount = 0x402F - aExp;
5828 if (shiftCount <= 0) {
5829 if (0x403E < aExp) {
5830 float_raise(float_flag_invalid, status);
5831 return LIT64(0xFFFFFFFFFFFFFFFF);
5833 shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
5834 } else {
5835 shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
5837 return roundAndPackUint64(aSign, aSig0, aSig1, status);
5840 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
5842 uint64_t v;
5843 signed char current_rounding_mode = status->float_rounding_mode;
5845 set_float_rounding_mode(float_round_to_zero, status);
5846 v = float128_to_uint64(a, status);
5847 set_float_rounding_mode(current_rounding_mode, status);
5849 return v;
5852 /*----------------------------------------------------------------------------
5853 | Returns the result of converting the quadruple-precision floating-point
5854 | value `a' to the 32-bit unsigned integer format. The conversion
5855 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5856 | Arithmetic except that the conversion is always rounded toward zero.
5857 | If `a' is a NaN, the largest positive integer is returned. Otherwise,
5858 | if the conversion overflows, the largest unsigned integer is returned.
5859 | If 'a' is negative, the value is rounded and zero is returned; negative
5860 | values that do not round to zero will raise the inexact exception.
5861 *----------------------------------------------------------------------------*/
5863 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
5865 uint64_t v;
5866 uint32_t res;
5867 int old_exc_flags = get_float_exception_flags(status);
5869 v = float128_to_uint64_round_to_zero(a, status);
5870 if (v > 0xffffffff) {
5871 res = 0xffffffff;
5872 } else {
5873 return v;
5875 set_float_exception_flags(old_exc_flags, status);
5876 float_raise(float_flag_invalid, status);
5877 return res;
5880 /*----------------------------------------------------------------------------
5881 | Returns the result of converting the quadruple-precision floating-point
5882 | value `a' to the single-precision floating-point format. The conversion
5883 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5884 | Arithmetic.
5885 *----------------------------------------------------------------------------*/
5887 float32 float128_to_float32(float128 a, float_status *status)
5889 flag aSign;
5890 int32_t aExp;
5891 uint64_t aSig0, aSig1;
5892 uint32_t zSig;
5894 aSig1 = extractFloat128Frac1( a );
5895 aSig0 = extractFloat128Frac0( a );
5896 aExp = extractFloat128Exp( a );
5897 aSign = extractFloat128Sign( a );
5898 if ( aExp == 0x7FFF ) {
5899 if ( aSig0 | aSig1 ) {
5900 return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
5902 return packFloat32( aSign, 0xFF, 0 );
5904 aSig0 |= ( aSig1 != 0 );
5905 shift64RightJamming( aSig0, 18, &aSig0 );
5906 zSig = aSig0;
5907 if ( aExp || zSig ) {
5908 zSig |= 0x40000000;
5909 aExp -= 0x3F81;
5911 return roundAndPackFloat32(aSign, aExp, zSig, status);
5915 /*----------------------------------------------------------------------------
5916 | Returns the result of converting the quadruple-precision floating-point
5917 | value `a' to the double-precision floating-point format. The conversion
5918 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5919 | Arithmetic.
5920 *----------------------------------------------------------------------------*/
5922 float64 float128_to_float64(float128 a, float_status *status)
5924 flag aSign;
5925 int32_t aExp;
5926 uint64_t aSig0, aSig1;
5928 aSig1 = extractFloat128Frac1( a );
5929 aSig0 = extractFloat128Frac0( a );
5930 aExp = extractFloat128Exp( a );
5931 aSign = extractFloat128Sign( a );
5932 if ( aExp == 0x7FFF ) {
5933 if ( aSig0 | aSig1 ) {
5934 return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
5936 return packFloat64( aSign, 0x7FF, 0 );
5938 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5939 aSig0 |= ( aSig1 != 0 );
5940 if ( aExp || aSig0 ) {
5941 aSig0 |= LIT64( 0x4000000000000000 );
5942 aExp -= 0x3C01;
5944 return roundAndPackFloat64(aSign, aExp, aSig0, status);
5948 /*----------------------------------------------------------------------------
5949 | Returns the result of converting the quadruple-precision floating-point
5950 | value `a' to the extended double-precision floating-point format. The
5951 | conversion is performed according to the IEC/IEEE Standard for Binary
5952 | Floating-Point Arithmetic.
5953 *----------------------------------------------------------------------------*/
5955 floatx80 float128_to_floatx80(float128 a, float_status *status)
5957 flag aSign;
5958 int32_t aExp;
5959 uint64_t aSig0, aSig1;
5961 aSig1 = extractFloat128Frac1( a );
5962 aSig0 = extractFloat128Frac0( a );
5963 aExp = extractFloat128Exp( a );
5964 aSign = extractFloat128Sign( a );
5965 if ( aExp == 0x7FFF ) {
5966 if ( aSig0 | aSig1 ) {
5967 return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
5969 return packFloatx80(aSign, floatx80_infinity_high,
5970 floatx80_infinity_low);
5972 if ( aExp == 0 ) {
5973 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5974 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5976 else {
5977 aSig0 |= LIT64( 0x0001000000000000 );
5979 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
5980 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
5984 /*----------------------------------------------------------------------------
5985 | Rounds the quadruple-precision floating-point value `a' to an integer, and
5986 | returns the result as a quadruple-precision floating-point value. The
5987 | operation is performed according to the IEC/IEEE Standard for Binary
5988 | Floating-Point Arithmetic.
5989 *----------------------------------------------------------------------------*/
5991 float128 float128_round_to_int(float128 a, float_status *status)
5993 flag aSign;
5994 int32_t aExp;
5995 uint64_t lastBitMask, roundBitsMask;
5996 float128 z;
5998 aExp = extractFloat128Exp( a );
5999 if ( 0x402F <= aExp ) {
6000 if ( 0x406F <= aExp ) {
6001 if ( ( aExp == 0x7FFF )
6002 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6004 return propagateFloat128NaN(a, a, status);
6006 return a;
6008 lastBitMask = 1;
6009 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
6010 roundBitsMask = lastBitMask - 1;
6011 z = a;
6012 switch (status->float_rounding_mode) {
6013 case float_round_nearest_even:
6014 if ( lastBitMask ) {
6015 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6016 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6018 else {
6019 if ( (int64_t) z.low < 0 ) {
6020 ++z.high;
6021 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
6024 break;
6025 case float_round_ties_away:
6026 if (lastBitMask) {
6027 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6028 } else {
6029 if ((int64_t) z.low < 0) {
6030 ++z.high;
6033 break;
6034 case float_round_to_zero:
6035 break;
6036 case float_round_up:
6037 if (!extractFloat128Sign(z)) {
6038 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6040 break;
6041 case float_round_down:
6042 if (extractFloat128Sign(z)) {
6043 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6045 break;
6046 default:
6047 abort();
6049 z.low &= ~ roundBitsMask;
6051 else {
6052 if ( aExp < 0x3FFF ) {
6053 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
6054 status->float_exception_flags |= float_flag_inexact;
6055 aSign = extractFloat128Sign( a );
6056 switch (status->float_rounding_mode) {
6057 case float_round_nearest_even:
6058 if ( ( aExp == 0x3FFE )
6059 && ( extractFloat128Frac0( a )
6060 | extractFloat128Frac1( a ) )
6062 return packFloat128( aSign, 0x3FFF, 0, 0 );
6064 break;
6065 case float_round_ties_away:
6066 if (aExp == 0x3FFE) {
6067 return packFloat128(aSign, 0x3FFF, 0, 0);
6069 break;
6070 case float_round_down:
6071 return
6072 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6073 : packFloat128( 0, 0, 0, 0 );
6074 case float_round_up:
6075 return
6076 aSign ? packFloat128( 1, 0, 0, 0 )
6077 : packFloat128( 0, 0x3FFF, 0, 0 );
6079 return packFloat128( aSign, 0, 0, 0 );
6081 lastBitMask = 1;
6082 lastBitMask <<= 0x402F - aExp;
6083 roundBitsMask = lastBitMask - 1;
6084 z.low = 0;
6085 z.high = a.high;
6086 switch (status->float_rounding_mode) {
6087 case float_round_nearest_even:
6088 z.high += lastBitMask>>1;
6089 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6090 z.high &= ~ lastBitMask;
6092 break;
6093 case float_round_ties_away:
6094 z.high += lastBitMask>>1;
6095 break;
6096 case float_round_to_zero:
6097 break;
6098 case float_round_up:
6099 if (!extractFloat128Sign(z)) {
6100 z.high |= ( a.low != 0 );
6101 z.high += roundBitsMask;
6103 break;
6104 case float_round_down:
6105 if (extractFloat128Sign(z)) {
6106 z.high |= (a.low != 0);
6107 z.high += roundBitsMask;
6109 break;
6110 default:
6111 abort();
6113 z.high &= ~ roundBitsMask;
6115 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
6116 status->float_exception_flags |= float_flag_inexact;
6118 return z;
6122 /*----------------------------------------------------------------------------
6123 | Returns the result of adding the absolute values of the quadruple-precision
6124 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
6125 | before being returned. `zSign' is ignored if the result is a NaN.
6126 | The addition is performed according to the IEC/IEEE Standard for Binary
6127 | Floating-Point Arithmetic.
6128 *----------------------------------------------------------------------------*/
6130 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
6131 float_status *status)
6133 int32_t aExp, bExp, zExp;
6134 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6135 int32_t expDiff;
6137 aSig1 = extractFloat128Frac1( a );
6138 aSig0 = extractFloat128Frac0( a );
6139 aExp = extractFloat128Exp( a );
6140 bSig1 = extractFloat128Frac1( b );
6141 bSig0 = extractFloat128Frac0( b );
6142 bExp = extractFloat128Exp( b );
6143 expDiff = aExp - bExp;
6144 if ( 0 < expDiff ) {
6145 if ( aExp == 0x7FFF ) {
6146 if (aSig0 | aSig1) {
6147 return propagateFloat128NaN(a, b, status);
6149 return a;
6151 if ( bExp == 0 ) {
6152 --expDiff;
6154 else {
6155 bSig0 |= LIT64( 0x0001000000000000 );
6157 shift128ExtraRightJamming(
6158 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6159 zExp = aExp;
6161 else if ( expDiff < 0 ) {
6162 if ( bExp == 0x7FFF ) {
6163 if (bSig0 | bSig1) {
6164 return propagateFloat128NaN(a, b, status);
6166 return packFloat128( zSign, 0x7FFF, 0, 0 );
6168 if ( aExp == 0 ) {
6169 ++expDiff;
6171 else {
6172 aSig0 |= LIT64( 0x0001000000000000 );
6174 shift128ExtraRightJamming(
6175 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6176 zExp = bExp;
6178 else {
6179 if ( aExp == 0x7FFF ) {
6180 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6181 return propagateFloat128NaN(a, b, status);
6183 return a;
6185 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6186 if ( aExp == 0 ) {
6187 if (status->flush_to_zero) {
6188 if (zSig0 | zSig1) {
6189 float_raise(float_flag_output_denormal, status);
6191 return packFloat128(zSign, 0, 0, 0);
6193 return packFloat128( zSign, 0, zSig0, zSig1 );
6195 zSig2 = 0;
6196 zSig0 |= LIT64( 0x0002000000000000 );
6197 zExp = aExp;
6198 goto shiftRight1;
6200 aSig0 |= LIT64( 0x0001000000000000 );
6201 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6202 --zExp;
6203 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6204 ++zExp;
6205 shiftRight1:
6206 shift128ExtraRightJamming(
6207 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6208 roundAndPack:
6209 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6213 /*----------------------------------------------------------------------------
6214 | Returns the result of subtracting the absolute values of the quadruple-
6215 | precision floating-point values `a' and `b'. If `zSign' is 1, the
6216 | difference is negated before being returned. `zSign' is ignored if the
6217 | result is a NaN. The subtraction is performed according to the IEC/IEEE
6218 | Standard for Binary Floating-Point Arithmetic.
6219 *----------------------------------------------------------------------------*/
6221 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
6222 float_status *status)
6224 int32_t aExp, bExp, zExp;
6225 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
6226 int32_t expDiff;
6228 aSig1 = extractFloat128Frac1( a );
6229 aSig0 = extractFloat128Frac0( a );
6230 aExp = extractFloat128Exp( a );
6231 bSig1 = extractFloat128Frac1( b );
6232 bSig0 = extractFloat128Frac0( b );
6233 bExp = extractFloat128Exp( b );
6234 expDiff = aExp - bExp;
6235 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6236 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6237 if ( 0 < expDiff ) goto aExpBigger;
6238 if ( expDiff < 0 ) goto bExpBigger;
6239 if ( aExp == 0x7FFF ) {
6240 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6241 return propagateFloat128NaN(a, b, status);
6243 float_raise(float_flag_invalid, status);
6244 return float128_default_nan(status);
6246 if ( aExp == 0 ) {
6247 aExp = 1;
6248 bExp = 1;
6250 if ( bSig0 < aSig0 ) goto aBigger;
6251 if ( aSig0 < bSig0 ) goto bBigger;
6252 if ( bSig1 < aSig1 ) goto aBigger;
6253 if ( aSig1 < bSig1 ) goto bBigger;
6254 return packFloat128(status->float_rounding_mode == float_round_down,
6255 0, 0, 0);
6256 bExpBigger:
6257 if ( bExp == 0x7FFF ) {
6258 if (bSig0 | bSig1) {
6259 return propagateFloat128NaN(a, b, status);
6261 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6263 if ( aExp == 0 ) {
6264 ++expDiff;
6266 else {
6267 aSig0 |= LIT64( 0x4000000000000000 );
6269 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6270 bSig0 |= LIT64( 0x4000000000000000 );
6271 bBigger:
6272 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6273 zExp = bExp;
6274 zSign ^= 1;
6275 goto normalizeRoundAndPack;
6276 aExpBigger:
6277 if ( aExp == 0x7FFF ) {
6278 if (aSig0 | aSig1) {
6279 return propagateFloat128NaN(a, b, status);
6281 return a;
6283 if ( bExp == 0 ) {
6284 --expDiff;
6286 else {
6287 bSig0 |= LIT64( 0x4000000000000000 );
6289 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6290 aSig0 |= LIT64( 0x4000000000000000 );
6291 aBigger:
6292 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6293 zExp = aExp;
6294 normalizeRoundAndPack:
6295 --zExp;
6296 return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
6297 status);
6301 /*----------------------------------------------------------------------------
6302 | Returns the result of adding the quadruple-precision floating-point values
6303 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
6304 | for Binary Floating-Point Arithmetic.
6305 *----------------------------------------------------------------------------*/
6307 float128 float128_add(float128 a, float128 b, float_status *status)
6309 flag aSign, bSign;
6311 aSign = extractFloat128Sign( a );
6312 bSign = extractFloat128Sign( b );
6313 if ( aSign == bSign ) {
6314 return addFloat128Sigs(a, b, aSign, status);
6316 else {
6317 return subFloat128Sigs(a, b, aSign, status);
6322 /*----------------------------------------------------------------------------
6323 | Returns the result of subtracting the quadruple-precision floating-point
6324 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6325 | Standard for Binary Floating-Point Arithmetic.
6326 *----------------------------------------------------------------------------*/
6328 float128 float128_sub(float128 a, float128 b, float_status *status)
6330 flag aSign, bSign;
6332 aSign = extractFloat128Sign( a );
6333 bSign = extractFloat128Sign( b );
6334 if ( aSign == bSign ) {
6335 return subFloat128Sigs(a, b, aSign, status);
6337 else {
6338 return addFloat128Sigs(a, b, aSign, status);
6343 /*----------------------------------------------------------------------------
6344 | Returns the result of multiplying the quadruple-precision floating-point
6345 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6346 | Standard for Binary Floating-Point Arithmetic.
6347 *----------------------------------------------------------------------------*/
6349 float128 float128_mul(float128 a, float128 b, float_status *status)
6351 flag aSign, bSign, zSign;
6352 int32_t aExp, bExp, zExp;
6353 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6355 aSig1 = extractFloat128Frac1( a );
6356 aSig0 = extractFloat128Frac0( a );
6357 aExp = extractFloat128Exp( a );
6358 aSign = extractFloat128Sign( a );
6359 bSig1 = extractFloat128Frac1( b );
6360 bSig0 = extractFloat128Frac0( b );
6361 bExp = extractFloat128Exp( b );
6362 bSign = extractFloat128Sign( b );
6363 zSign = aSign ^ bSign;
6364 if ( aExp == 0x7FFF ) {
6365 if ( ( aSig0 | aSig1 )
6366 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6367 return propagateFloat128NaN(a, b, status);
6369 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6370 return packFloat128( zSign, 0x7FFF, 0, 0 );
6372 if ( bExp == 0x7FFF ) {
6373 if (bSig0 | bSig1) {
6374 return propagateFloat128NaN(a, b, status);
6376 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6377 invalid:
6378 float_raise(float_flag_invalid, status);
6379 return float128_default_nan(status);
6381 return packFloat128( zSign, 0x7FFF, 0, 0 );
6383 if ( aExp == 0 ) {
6384 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6385 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6387 if ( bExp == 0 ) {
6388 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6389 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6391 zExp = aExp + bExp - 0x4000;
6392 aSig0 |= LIT64( 0x0001000000000000 );
6393 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6394 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6395 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6396 zSig2 |= ( zSig3 != 0 );
6397 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6398 shift128ExtraRightJamming(
6399 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6400 ++zExp;
6402 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6406 /*----------------------------------------------------------------------------
6407 | Returns the result of dividing the quadruple-precision floating-point value
6408 | `a' by the corresponding value `b'. The operation is performed according to
6409 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6410 *----------------------------------------------------------------------------*/
6412 float128 float128_div(float128 a, float128 b, float_status *status)
6414 flag aSign, bSign, zSign;
6415 int32_t aExp, bExp, zExp;
6416 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6417 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6419 aSig1 = extractFloat128Frac1( a );
6420 aSig0 = extractFloat128Frac0( a );
6421 aExp = extractFloat128Exp( a );
6422 aSign = extractFloat128Sign( a );
6423 bSig1 = extractFloat128Frac1( b );
6424 bSig0 = extractFloat128Frac0( b );
6425 bExp = extractFloat128Exp( b );
6426 bSign = extractFloat128Sign( b );
6427 zSign = aSign ^ bSign;
6428 if ( aExp == 0x7FFF ) {
6429 if (aSig0 | aSig1) {
6430 return propagateFloat128NaN(a, b, status);
6432 if ( bExp == 0x7FFF ) {
6433 if (bSig0 | bSig1) {
6434 return propagateFloat128NaN(a, b, status);
6436 goto invalid;
6438 return packFloat128( zSign, 0x7FFF, 0, 0 );
6440 if ( bExp == 0x7FFF ) {
6441 if (bSig0 | bSig1) {
6442 return propagateFloat128NaN(a, b, status);
6444 return packFloat128( zSign, 0, 0, 0 );
6446 if ( bExp == 0 ) {
6447 if ( ( bSig0 | bSig1 ) == 0 ) {
6448 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6449 invalid:
6450 float_raise(float_flag_invalid, status);
6451 return float128_default_nan(status);
6453 float_raise(float_flag_divbyzero, status);
6454 return packFloat128( zSign, 0x7FFF, 0, 0 );
6456 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6458 if ( aExp == 0 ) {
6459 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6460 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6462 zExp = aExp - bExp + 0x3FFD;
6463 shortShift128Left(
6464 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6465 shortShift128Left(
6466 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6467 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6468 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6469 ++zExp;
6471 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6472 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6473 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
6474 while ( (int64_t) rem0 < 0 ) {
6475 --zSig0;
6476 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6478 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6479 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6480 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6481 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
6482 while ( (int64_t) rem1 < 0 ) {
6483 --zSig1;
6484 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6486 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6488 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6489 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6493 /*----------------------------------------------------------------------------
6494 | Returns the remainder of the quadruple-precision floating-point value `a'
6495 | with respect to the corresponding value `b'. The operation is performed
6496 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6497 *----------------------------------------------------------------------------*/
6499 float128 float128_rem(float128 a, float128 b, float_status *status)
6501 flag aSign, zSign;
6502 int32_t aExp, bExp, expDiff;
6503 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6504 uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6505 int64_t sigMean0;
6507 aSig1 = extractFloat128Frac1( a );
6508 aSig0 = extractFloat128Frac0( a );
6509 aExp = extractFloat128Exp( a );
6510 aSign = extractFloat128Sign( a );
6511 bSig1 = extractFloat128Frac1( b );
6512 bSig0 = extractFloat128Frac0( b );
6513 bExp = extractFloat128Exp( b );
6514 if ( aExp == 0x7FFF ) {
6515 if ( ( aSig0 | aSig1 )
6516 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6517 return propagateFloat128NaN(a, b, status);
6519 goto invalid;
6521 if ( bExp == 0x7FFF ) {
6522 if (bSig0 | bSig1) {
6523 return propagateFloat128NaN(a, b, status);
6525 return a;
6527 if ( bExp == 0 ) {
6528 if ( ( bSig0 | bSig1 ) == 0 ) {
6529 invalid:
6530 float_raise(float_flag_invalid, status);
6531 return float128_default_nan(status);
6533 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6535 if ( aExp == 0 ) {
6536 if ( ( aSig0 | aSig1 ) == 0 ) return a;
6537 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6539 expDiff = aExp - bExp;
6540 if ( expDiff < -1 ) return a;
6541 shortShift128Left(
6542 aSig0 | LIT64( 0x0001000000000000 ),
6543 aSig1,
6544 15 - ( expDiff < 0 ),
6545 &aSig0,
6546 &aSig1
6548 shortShift128Left(
6549 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6550 q = le128( bSig0, bSig1, aSig0, aSig1 );
6551 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6552 expDiff -= 64;
6553 while ( 0 < expDiff ) {
6554 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6555 q = ( 4 < q ) ? q - 4 : 0;
6556 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6557 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6558 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6559 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6560 expDiff -= 61;
6562 if ( -64 < expDiff ) {
6563 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6564 q = ( 4 < q ) ? q - 4 : 0;
6565 q >>= - expDiff;
6566 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6567 expDiff += 52;
6568 if ( expDiff < 0 ) {
6569 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6571 else {
6572 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6574 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6575 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6577 else {
6578 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6579 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6581 do {
6582 alternateASig0 = aSig0;
6583 alternateASig1 = aSig1;
6584 ++q;
6585 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6586 } while ( 0 <= (int64_t) aSig0 );
6587 add128(
6588 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6589 if ( ( sigMean0 < 0 )
6590 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6591 aSig0 = alternateASig0;
6592 aSig1 = alternateASig1;
6594 zSign = ( (int64_t) aSig0 < 0 );
6595 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6596 return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
6597 status);
6600 /*----------------------------------------------------------------------------
6601 | Returns the square root of the quadruple-precision floating-point value `a'.
6602 | The operation is performed according to the IEC/IEEE Standard for Binary
6603 | Floating-Point Arithmetic.
6604 *----------------------------------------------------------------------------*/
6606 float128 float128_sqrt(float128 a, float_status *status)
6608 flag aSign;
6609 int32_t aExp, zExp;
6610 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6611 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6613 aSig1 = extractFloat128Frac1( a );
6614 aSig0 = extractFloat128Frac0( a );
6615 aExp = extractFloat128Exp( a );
6616 aSign = extractFloat128Sign( a );
6617 if ( aExp == 0x7FFF ) {
6618 if (aSig0 | aSig1) {
6619 return propagateFloat128NaN(a, a, status);
6621 if ( ! aSign ) return a;
6622 goto invalid;
6624 if ( aSign ) {
6625 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6626 invalid:
6627 float_raise(float_flag_invalid, status);
6628 return float128_default_nan(status);
6630 if ( aExp == 0 ) {
6631 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6632 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6634 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6635 aSig0 |= LIT64( 0x0001000000000000 );
6636 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6637 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6638 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6639 doubleZSig0 = zSig0<<1;
6640 mul64To128( zSig0, zSig0, &term0, &term1 );
6641 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6642 while ( (int64_t) rem0 < 0 ) {
6643 --zSig0;
6644 doubleZSig0 -= 2;
6645 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6647 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6648 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6649 if ( zSig1 == 0 ) zSig1 = 1;
6650 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6651 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6652 mul64To128( zSig1, zSig1, &term2, &term3 );
6653 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6654 while ( (int64_t) rem1 < 0 ) {
6655 --zSig1;
6656 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6657 term3 |= 1;
6658 term2 |= doubleZSig0;
6659 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6661 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6663 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6664 return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
6668 /*----------------------------------------------------------------------------
6669 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6670 | the corresponding value `b', and 0 otherwise. The invalid exception is
6671 | raised if either operand is a NaN. Otherwise, the comparison is performed
6672 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6673 *----------------------------------------------------------------------------*/
6675 int float128_eq(float128 a, float128 b, float_status *status)
6678 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6679 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6680 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6681 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6683 float_raise(float_flag_invalid, status);
6684 return 0;
6686 return
6687 ( a.low == b.low )
6688 && ( ( a.high == b.high )
6689 || ( ( a.low == 0 )
6690 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6695 /*----------------------------------------------------------------------------
6696 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6697 | or equal to the corresponding value `b', and 0 otherwise. The invalid
6698 | exception is raised if either operand is a NaN. The comparison is performed
6699 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6700 *----------------------------------------------------------------------------*/
6702 int float128_le(float128 a, float128 b, float_status *status)
6704 flag aSign, bSign;
6706 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6707 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6708 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6709 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6711 float_raise(float_flag_invalid, status);
6712 return 0;
6714 aSign = extractFloat128Sign( a );
6715 bSign = extractFloat128Sign( b );
6716 if ( aSign != bSign ) {
6717 return
6718 aSign
6719 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6720 == 0 );
6722 return
6723 aSign ? le128( b.high, b.low, a.high, a.low )
6724 : le128( a.high, a.low, b.high, b.low );
6728 /*----------------------------------------------------------------------------
6729 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6730 | the corresponding value `b', and 0 otherwise. The invalid exception is
6731 | raised if either operand is a NaN. The comparison is performed according
6732 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6733 *----------------------------------------------------------------------------*/
6735 int float128_lt(float128 a, float128 b, float_status *status)
6737 flag aSign, bSign;
6739 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6740 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6741 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6742 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6744 float_raise(float_flag_invalid, status);
6745 return 0;
6747 aSign = extractFloat128Sign( a );
6748 bSign = extractFloat128Sign( b );
6749 if ( aSign != bSign ) {
6750 return
6751 aSign
6752 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6753 != 0 );
6755 return
6756 aSign ? lt128( b.high, b.low, a.high, a.low )
6757 : lt128( a.high, a.low, b.high, b.low );
6761 /*----------------------------------------------------------------------------
6762 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6763 | be compared, and 0 otherwise. The invalid exception is raised if either
6764 | operand is a NaN. The comparison is performed according to the IEC/IEEE
6765 | Standard for Binary Floating-Point Arithmetic.
6766 *----------------------------------------------------------------------------*/
6768 int float128_unordered(float128 a, float128 b, float_status *status)
6770 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6771 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6772 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6773 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6775 float_raise(float_flag_invalid, status);
6776 return 1;
6778 return 0;
6781 /*----------------------------------------------------------------------------
6782 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6783 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6784 | exception. The comparison is performed according to the IEC/IEEE Standard
6785 | for Binary Floating-Point Arithmetic.
6786 *----------------------------------------------------------------------------*/
6788 int float128_eq_quiet(float128 a, float128 b, float_status *status)
6791 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6792 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6793 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6794 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6796 if (float128_is_signaling_nan(a, status)
6797 || float128_is_signaling_nan(b, status)) {
6798 float_raise(float_flag_invalid, status);
6800 return 0;
6802 return
6803 ( a.low == b.low )
6804 && ( ( a.high == b.high )
6805 || ( ( a.low == 0 )
6806 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6811 /*----------------------------------------------------------------------------
6812 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6813 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
6814 | cause an exception. Otherwise, the comparison is performed according to the
6815 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6816 *----------------------------------------------------------------------------*/
6818 int float128_le_quiet(float128 a, float128 b, float_status *status)
6820 flag aSign, bSign;
6822 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6823 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6824 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6825 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6827 if (float128_is_signaling_nan(a, status)
6828 || float128_is_signaling_nan(b, status)) {
6829 float_raise(float_flag_invalid, status);
6831 return 0;
6833 aSign = extractFloat128Sign( a );
6834 bSign = extractFloat128Sign( b );
6835 if ( aSign != bSign ) {
6836 return
6837 aSign
6838 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6839 == 0 );
6841 return
6842 aSign ? le128( b.high, b.low, a.high, a.low )
6843 : le128( a.high, a.low, b.high, b.low );
6847 /*----------------------------------------------------------------------------
6848 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6849 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6850 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
6851 | Standard for Binary Floating-Point Arithmetic.
6852 *----------------------------------------------------------------------------*/
6854 int float128_lt_quiet(float128 a, float128 b, float_status *status)
6856 flag aSign, bSign;
6858 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6859 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6860 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6861 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6863 if (float128_is_signaling_nan(a, status)
6864 || float128_is_signaling_nan(b, status)) {
6865 float_raise(float_flag_invalid, status);
6867 return 0;
6869 aSign = extractFloat128Sign( a );
6870 bSign = extractFloat128Sign( b );
6871 if ( aSign != bSign ) {
6872 return
6873 aSign
6874 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6875 != 0 );
6877 return
6878 aSign ? lt128( b.high, b.low, a.high, a.low )
6879 : lt128( a.high, a.low, b.high, b.low );
6883 /*----------------------------------------------------------------------------
6884 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6885 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
6886 | comparison is performed according to the IEC/IEEE Standard for Binary
6887 | Floating-Point Arithmetic.
6888 *----------------------------------------------------------------------------*/
6890 int float128_unordered_quiet(float128 a, float128 b, float_status *status)
6892 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6893 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6894 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6895 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6897 if (float128_is_signaling_nan(a, status)
6898 || float128_is_signaling_nan(b, status)) {
6899 float_raise(float_flag_invalid, status);
6901 return 1;
6903 return 0;
6906 static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
6907 int is_quiet, float_status *status)
6909 flag aSign, bSign;
6911 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6912 float_raise(float_flag_invalid, status);
6913 return float_relation_unordered;
6915 if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
6916 ( extractFloatx80Frac( a )<<1 ) ) ||
6917 ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
6918 ( extractFloatx80Frac( b )<<1 ) )) {
6919 if (!is_quiet ||
6920 floatx80_is_signaling_nan(a, status) ||
6921 floatx80_is_signaling_nan(b, status)) {
6922 float_raise(float_flag_invalid, status);
6924 return float_relation_unordered;
6926 aSign = extractFloatx80Sign( a );
6927 bSign = extractFloatx80Sign( b );
6928 if ( aSign != bSign ) {
6930 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
6931 ( ( a.low | b.low ) == 0 ) ) {
6932 /* zero case */
6933 return float_relation_equal;
6934 } else {
6935 return 1 - (2 * aSign);
6937 } else {
6938 if (a.low == b.low && a.high == b.high) {
6939 return float_relation_equal;
6940 } else {
6941 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6946 int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
6948 return floatx80_compare_internal(a, b, 0, status);
6951 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
6953 return floatx80_compare_internal(a, b, 1, status);
6956 static inline int float128_compare_internal(float128 a, float128 b,
6957 int is_quiet, float_status *status)
6959 flag aSign, bSign;
6961 if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
6962 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
6963 ( ( extractFloat128Exp( b ) == 0x7fff ) &&
6964 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
6965 if (!is_quiet ||
6966 float128_is_signaling_nan(a, status) ||
6967 float128_is_signaling_nan(b, status)) {
6968 float_raise(float_flag_invalid, status);
6970 return float_relation_unordered;
6972 aSign = extractFloat128Sign( a );
6973 bSign = extractFloat128Sign( b );
6974 if ( aSign != bSign ) {
6975 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
6976 /* zero case */
6977 return float_relation_equal;
6978 } else {
6979 return 1 - (2 * aSign);
6981 } else {
6982 if (a.low == b.low && a.high == b.high) {
6983 return float_relation_equal;
6984 } else {
6985 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6990 int float128_compare(float128 a, float128 b, float_status *status)
6992 return float128_compare_internal(a, b, 0, status);
6995 int float128_compare_quiet(float128 a, float128 b, float_status *status)
6997 return float128_compare_internal(a, b, 1, status);
7000 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7002 flag aSign;
7003 int32_t aExp;
7004 uint64_t aSig;
7006 if (floatx80_invalid_encoding(a)) {
7007 float_raise(float_flag_invalid, status);
7008 return floatx80_default_nan(status);
7010 aSig = extractFloatx80Frac( a );
7011 aExp = extractFloatx80Exp( a );
7012 aSign = extractFloatx80Sign( a );
7014 if ( aExp == 0x7FFF ) {
7015 if ( aSig<<1 ) {
7016 return propagateFloatx80NaN(a, a, status);
7018 return a;
7021 if (aExp == 0) {
7022 if (aSig == 0) {
7023 return a;
7025 aExp++;
7028 if (n > 0x10000) {
7029 n = 0x10000;
7030 } else if (n < -0x10000) {
7031 n = -0x10000;
7034 aExp += n;
7035 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7036 aSign, aExp, aSig, 0, status);
7039 float128 float128_scalbn(float128 a, int n, float_status *status)
7041 flag aSign;
7042 int32_t aExp;
7043 uint64_t aSig0, aSig1;
7045 aSig1 = extractFloat128Frac1( a );
7046 aSig0 = extractFloat128Frac0( a );
7047 aExp = extractFloat128Exp( a );
7048 aSign = extractFloat128Sign( a );
7049 if ( aExp == 0x7FFF ) {
7050 if ( aSig0 | aSig1 ) {
7051 return propagateFloat128NaN(a, a, status);
7053 return a;
7055 if (aExp != 0) {
7056 aSig0 |= LIT64( 0x0001000000000000 );
7057 } else if (aSig0 == 0 && aSig1 == 0) {
7058 return a;
7059 } else {
7060 aExp++;
7063 if (n > 0x10000) {
7064 n = 0x10000;
7065 } else if (n < -0x10000) {
7066 n = -0x10000;
7069 aExp += n - 1;
7070 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7071 , status);