sheepdog: Switch to .bdrv_co_block_status()
[qemu/ar7.git] / fpu / softfloat.c
blobe7fb0d357a5b857c8e9038bc5ece05daedd7d0a6
1 /*
2 * QEMU float support
4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
10 * the BSD license
11 * GPL-v2-or-later
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
44 ===============================================================================
47 /* BSD licensing:
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
85 #include "qemu/osdep.h"
86 #include "qemu/bitops.h"
87 #include "fpu/softfloat.h"
89 /* We only need stdlib for abort() */
91 /*----------------------------------------------------------------------------
92 | Primitive arithmetic functions, including multi-word arithmetic, and
93 | division and square root approximations. (Can be specialized to target if
94 | desired.)
95 *----------------------------------------------------------------------------*/
96 #include "softfloat-macros.h"
98 /*----------------------------------------------------------------------------
99 | Functions and definitions to determine: (1) whether tininess for underflow
100 | is detected before or after rounding by default, (2) what (if anything)
101 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
102 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
103 | are propagated from function inputs to output. These details are target-
104 | specific.
105 *----------------------------------------------------------------------------*/
106 #include "softfloat-specialize.h"
108 /*----------------------------------------------------------------------------
109 | Returns the fraction bits of the half-precision floating-point value `a'.
110 *----------------------------------------------------------------------------*/
112 static inline uint32_t extractFloat16Frac(float16 a)
114 return float16_val(a) & 0x3ff;
117 /*----------------------------------------------------------------------------
118 | Returns the exponent bits of the half-precision floating-point value `a'.
119 *----------------------------------------------------------------------------*/
121 static inline int extractFloat16Exp(float16 a)
123 return (float16_val(a) >> 10) & 0x1f;
126 /*----------------------------------------------------------------------------
127 | Returns the sign bit of the single-precision floating-point value `a'.
128 *----------------------------------------------------------------------------*/
130 static inline flag extractFloat16Sign(float16 a)
132 return float16_val(a)>>15;
135 /*----------------------------------------------------------------------------
136 | Returns the fraction bits of the single-precision floating-point value `a'.
137 *----------------------------------------------------------------------------*/
139 static inline uint32_t extractFloat32Frac(float32 a)
141 return float32_val(a) & 0x007FFFFF;
144 /*----------------------------------------------------------------------------
145 | Returns the exponent bits of the single-precision floating-point value `a'.
146 *----------------------------------------------------------------------------*/
148 static inline int extractFloat32Exp(float32 a)
150 return (float32_val(a) >> 23) & 0xFF;
153 /*----------------------------------------------------------------------------
154 | Returns the sign bit of the single-precision floating-point value `a'.
155 *----------------------------------------------------------------------------*/
157 static inline flag extractFloat32Sign(float32 a)
159 return float32_val(a) >> 31;
162 /*----------------------------------------------------------------------------
163 | Returns the fraction bits of the double-precision floating-point value `a'.
164 *----------------------------------------------------------------------------*/
166 static inline uint64_t extractFloat64Frac(float64 a)
168 return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF);
171 /*----------------------------------------------------------------------------
172 | Returns the exponent bits of the double-precision floating-point value `a'.
173 *----------------------------------------------------------------------------*/
175 static inline int extractFloat64Exp(float64 a)
177 return (float64_val(a) >> 52) & 0x7FF;
180 /*----------------------------------------------------------------------------
181 | Returns the sign bit of the double-precision floating-point value `a'.
182 *----------------------------------------------------------------------------*/
184 static inline flag extractFloat64Sign(float64 a)
186 return float64_val(a) >> 63;
190 * Classify a floating point number. Everything above float_class_qnan
191 * is a NaN so cls >= float_class_qnan is any NaN.
194 typedef enum __attribute__ ((__packed__)) {
195 float_class_unclassified,
196 float_class_zero,
197 float_class_normal,
198 float_class_inf,
199 float_class_qnan, /* all NaNs from here */
200 float_class_snan,
201 float_class_dnan,
202 float_class_msnan, /* maybe silenced */
203 } FloatClass;
206 * Structure holding all of the decomposed parts of a float. The
207 * exponent is unbiased and the fraction is normalized. All
208 * calculations are done with a 64 bit fraction and then rounded as
209 * appropriate for the final format.
211 * Thanks to the packed FloatClass a decent compiler should be able to
212 * fit the whole structure into registers and avoid using the stack
213 * for parameter passing.
216 typedef struct {
217 uint64_t frac;
218 int32_t exp;
219 FloatClass cls;
220 bool sign;
221 } FloatParts;
223 #define DECOMPOSED_BINARY_POINT (64 - 2)
224 #define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT)
225 #define DECOMPOSED_OVERFLOW_BIT (DECOMPOSED_IMPLICIT_BIT << 1)
227 /* Structure holding all of the relevant parameters for a format.
228 * exp_size: the size of the exponent field
229 * exp_bias: the offset applied to the exponent field
230 * exp_max: the maximum normalised exponent
231 * frac_size: the size of the fraction field
232 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
233 * The following are computed based the size of fraction
234 * frac_lsb: least significant bit of fraction
235 * fram_lsbm1: the bit bellow the least significant bit (for rounding)
236 * round_mask/roundeven_mask: masks used for rounding
238 typedef struct {
239 int exp_size;
240 int exp_bias;
241 int exp_max;
242 int frac_size;
243 int frac_shift;
244 uint64_t frac_lsb;
245 uint64_t frac_lsbm1;
246 uint64_t round_mask;
247 uint64_t roundeven_mask;
248 } FloatFmt;
250 /* Expand fields based on the size of exponent and fraction */
251 #define FLOAT_PARAMS(E, F) \
252 .exp_size = E, \
253 .exp_bias = ((1 << E) - 1) >> 1, \
254 .exp_max = (1 << E) - 1, \
255 .frac_size = F, \
256 .frac_shift = DECOMPOSED_BINARY_POINT - F, \
257 .frac_lsb = 1ull << (DECOMPOSED_BINARY_POINT - F), \
258 .frac_lsbm1 = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1), \
259 .round_mask = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1, \
260 .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1
262 static const FloatFmt float16_params = {
263 FLOAT_PARAMS(5, 10)
266 static const FloatFmt float32_params = {
267 FLOAT_PARAMS(8, 23)
270 static const FloatFmt float64_params = {
271 FLOAT_PARAMS(11, 52)
274 /* Unpack a float to parts, but do not canonicalize. */
275 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
277 const int sign_pos = fmt.frac_size + fmt.exp_size;
279 return (FloatParts) {
280 .cls = float_class_unclassified,
281 .sign = extract64(raw, sign_pos, 1),
282 .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
283 .frac = extract64(raw, 0, fmt.frac_size),
287 static inline FloatParts float16_unpack_raw(float16 f)
289 return unpack_raw(float16_params, f);
292 static inline FloatParts float32_unpack_raw(float32 f)
294 return unpack_raw(float32_params, f);
297 static inline FloatParts float64_unpack_raw(float64 f)
299 return unpack_raw(float64_params, f);
302 /* Pack a float from parts, but do not canonicalize. */
303 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
305 const int sign_pos = fmt.frac_size + fmt.exp_size;
306 uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
307 return deposit64(ret, sign_pos, 1, p.sign);
310 static inline float16 float16_pack_raw(FloatParts p)
312 return make_float16(pack_raw(float16_params, p));
315 static inline float32 float32_pack_raw(FloatParts p)
317 return make_float32(pack_raw(float32_params, p));
320 static inline float64 float64_pack_raw(FloatParts p)
322 return make_float64(pack_raw(float64_params, p));
325 /* Canonicalize EXP and FRAC, setting CLS. */
326 static FloatParts canonicalize(FloatParts part, const FloatFmt *parm,
327 float_status *status)
329 if (part.exp == parm->exp_max) {
330 if (part.frac == 0) {
331 part.cls = float_class_inf;
332 } else {
333 #ifdef NO_SIGNALING_NANS
334 part.cls = float_class_qnan;
335 #else
336 int64_t msb = part.frac << (parm->frac_shift + 2);
337 if ((msb < 0) == status->snan_bit_is_one) {
338 part.cls = float_class_snan;
339 } else {
340 part.cls = float_class_qnan;
342 #endif
344 } else if (part.exp == 0) {
345 if (likely(part.frac == 0)) {
346 part.cls = float_class_zero;
347 } else if (status->flush_inputs_to_zero) {
348 float_raise(float_flag_input_denormal, status);
349 part.cls = float_class_zero;
350 part.frac = 0;
351 } else {
352 int shift = clz64(part.frac) - 1;
353 part.cls = float_class_normal;
354 part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
355 part.frac <<= shift;
357 } else {
358 part.cls = float_class_normal;
359 part.exp -= parm->exp_bias;
360 part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
362 return part;
365 /* Round and uncanonicalize a floating-point number by parts. There
366 * are FRAC_SHIFT bits that may require rounding at the bottom of the
367 * fraction; these bits will be removed. The exponent will be biased
368 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
371 static FloatParts round_canonical(FloatParts p, float_status *s,
372 const FloatFmt *parm)
374 const uint64_t frac_lsbm1 = parm->frac_lsbm1;
375 const uint64_t round_mask = parm->round_mask;
376 const uint64_t roundeven_mask = parm->roundeven_mask;
377 const int exp_max = parm->exp_max;
378 const int frac_shift = parm->frac_shift;
379 uint64_t frac, inc;
380 int exp, flags = 0;
381 bool overflow_norm;
383 frac = p.frac;
384 exp = p.exp;
386 switch (p.cls) {
387 case float_class_normal:
388 switch (s->float_rounding_mode) {
389 case float_round_nearest_even:
390 overflow_norm = false;
391 inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
392 break;
393 case float_round_ties_away:
394 overflow_norm = false;
395 inc = frac_lsbm1;
396 break;
397 case float_round_to_zero:
398 overflow_norm = true;
399 inc = 0;
400 break;
401 case float_round_up:
402 inc = p.sign ? 0 : round_mask;
403 overflow_norm = p.sign;
404 break;
405 case float_round_down:
406 inc = p.sign ? round_mask : 0;
407 overflow_norm = !p.sign;
408 break;
409 default:
410 g_assert_not_reached();
413 exp += parm->exp_bias;
414 if (likely(exp > 0)) {
415 if (frac & round_mask) {
416 flags |= float_flag_inexact;
417 frac += inc;
418 if (frac & DECOMPOSED_OVERFLOW_BIT) {
419 frac >>= 1;
420 exp++;
423 frac >>= frac_shift;
425 if (unlikely(exp >= exp_max)) {
426 flags |= float_flag_overflow | float_flag_inexact;
427 if (overflow_norm) {
428 exp = exp_max - 1;
429 frac = -1;
430 } else {
431 p.cls = float_class_inf;
432 goto do_inf;
435 } else if (s->flush_to_zero) {
436 flags |= float_flag_output_denormal;
437 p.cls = float_class_zero;
438 goto do_zero;
439 } else {
440 bool is_tiny = (s->float_detect_tininess
441 == float_tininess_before_rounding)
442 || (exp < 0)
443 || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);
445 shift64RightJamming(frac, 1 - exp, &frac);
446 if (frac & round_mask) {
447 /* Need to recompute round-to-even. */
448 if (s->float_rounding_mode == float_round_nearest_even) {
449 inc = ((frac & roundeven_mask) != frac_lsbm1
450 ? frac_lsbm1 : 0);
452 flags |= float_flag_inexact;
453 frac += inc;
456 exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
457 frac >>= frac_shift;
459 if (is_tiny && (flags & float_flag_inexact)) {
460 flags |= float_flag_underflow;
462 if (exp == 0 && frac == 0) {
463 p.cls = float_class_zero;
466 break;
468 case float_class_zero:
469 do_zero:
470 exp = 0;
471 frac = 0;
472 break;
474 case float_class_inf:
475 do_inf:
476 exp = exp_max;
477 frac = 0;
478 break;
480 case float_class_qnan:
481 case float_class_snan:
482 exp = exp_max;
483 break;
485 default:
486 g_assert_not_reached();
489 float_raise(flags, s);
490 p.exp = exp;
491 p.frac = frac;
492 return p;
495 static FloatParts float16_unpack_canonical(float16 f, float_status *s)
497 return canonicalize(float16_unpack_raw(f), &float16_params, s);
500 static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
502 switch (p.cls) {
503 case float_class_dnan:
504 return float16_default_nan(s);
505 case float_class_msnan:
506 return float16_maybe_silence_nan(float16_pack_raw(p), s);
507 default:
508 p = round_canonical(p, s, &float16_params);
509 return float16_pack_raw(p);
513 static FloatParts float32_unpack_canonical(float32 f, float_status *s)
515 return canonicalize(float32_unpack_raw(f), &float32_params, s);
518 static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
520 switch (p.cls) {
521 case float_class_dnan:
522 return float32_default_nan(s);
523 case float_class_msnan:
524 return float32_maybe_silence_nan(float32_pack_raw(p), s);
525 default:
526 p = round_canonical(p, s, &float32_params);
527 return float32_pack_raw(p);
531 static FloatParts float64_unpack_canonical(float64 f, float_status *s)
533 return canonicalize(float64_unpack_raw(f), &float64_params, s);
536 static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
538 switch (p.cls) {
539 case float_class_dnan:
540 return float64_default_nan(s);
541 case float_class_msnan:
542 return float64_maybe_silence_nan(float64_pack_raw(p), s);
543 default:
544 p = round_canonical(p, s, &float64_params);
545 return float64_pack_raw(p);
549 /* Simple helpers for checking if what NaN we have */
550 static bool is_nan(FloatClass c)
552 return unlikely(c >= float_class_qnan);
554 static bool is_snan(FloatClass c)
556 return c == float_class_snan;
558 static bool is_qnan(FloatClass c)
560 return c == float_class_qnan;
563 static FloatParts return_nan(FloatParts a, float_status *s)
565 switch (a.cls) {
566 case float_class_snan:
567 s->float_exception_flags |= float_flag_invalid;
568 a.cls = float_class_msnan;
569 /* fall through */
570 case float_class_qnan:
571 if (s->default_nan_mode) {
572 a.cls = float_class_dnan;
574 break;
576 default:
577 g_assert_not_reached();
579 return a;
582 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
584 if (is_snan(a.cls) || is_snan(b.cls)) {
585 s->float_exception_flags |= float_flag_invalid;
588 if (s->default_nan_mode) {
589 a.cls = float_class_dnan;
590 } else {
591 if (pickNaN(is_qnan(a.cls), is_snan(a.cls),
592 is_qnan(b.cls), is_snan(b.cls),
593 a.frac > b.frac ||
594 (a.frac == b.frac && a.sign < b.sign))) {
595 a = b;
597 a.cls = float_class_msnan;
599 return a;
602 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
603 bool inf_zero, float_status *s)
605 if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
606 s->float_exception_flags |= float_flag_invalid;
609 if (s->default_nan_mode) {
610 a.cls = float_class_dnan;
611 } else {
612 switch (pickNaNMulAdd(is_qnan(a.cls), is_snan(a.cls),
613 is_qnan(b.cls), is_snan(b.cls),
614 is_qnan(c.cls), is_snan(c.cls),
615 inf_zero, s)) {
616 case 0:
617 break;
618 case 1:
619 a = b;
620 break;
621 case 2:
622 a = c;
623 break;
624 case 3:
625 a.cls = float_class_dnan;
626 return a;
627 default:
628 g_assert_not_reached();
631 a.cls = float_class_msnan;
633 return a;
637 * Returns the result of adding or subtracting the values of the
638 * floating-point values `a' and `b'. The operation is performed
639 * according to the IEC/IEEE Standard for Binary Floating-Point
640 * Arithmetic.
643 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
644 float_status *s)
646 bool a_sign = a.sign;
647 bool b_sign = b.sign ^ subtract;
649 if (a_sign != b_sign) {
650 /* Subtraction */
652 if (a.cls == float_class_normal && b.cls == float_class_normal) {
653 if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
654 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
655 a.frac = a.frac - b.frac;
656 } else {
657 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
658 a.frac = b.frac - a.frac;
659 a.exp = b.exp;
660 a_sign ^= 1;
663 if (a.frac == 0) {
664 a.cls = float_class_zero;
665 a.sign = s->float_rounding_mode == float_round_down;
666 } else {
667 int shift = clz64(a.frac) - 1;
668 a.frac = a.frac << shift;
669 a.exp = a.exp - shift;
670 a.sign = a_sign;
672 return a;
674 if (is_nan(a.cls) || is_nan(b.cls)) {
675 return pick_nan(a, b, s);
677 if (a.cls == float_class_inf) {
678 if (b.cls == float_class_inf) {
679 float_raise(float_flag_invalid, s);
680 a.cls = float_class_dnan;
682 return a;
684 if (a.cls == float_class_zero && b.cls == float_class_zero) {
685 a.sign = s->float_rounding_mode == float_round_down;
686 return a;
688 if (a.cls == float_class_zero || b.cls == float_class_inf) {
689 b.sign = a_sign ^ 1;
690 return b;
692 if (b.cls == float_class_zero) {
693 return a;
695 } else {
696 /* Addition */
697 if (a.cls == float_class_normal && b.cls == float_class_normal) {
698 if (a.exp > b.exp) {
699 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
700 } else if (a.exp < b.exp) {
701 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
702 a.exp = b.exp;
704 a.frac += b.frac;
705 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
706 a.frac >>= 1;
707 a.exp += 1;
709 return a;
711 if (is_nan(a.cls) || is_nan(b.cls)) {
712 return pick_nan(a, b, s);
714 if (a.cls == float_class_inf || b.cls == float_class_zero) {
715 return a;
717 if (b.cls == float_class_inf || a.cls == float_class_zero) {
718 b.sign = b_sign;
719 return b;
722 g_assert_not_reached();
726 * Returns the result of adding or subtracting the floating-point
727 * values `a' and `b'. The operation is performed according to the
728 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
731 float16 __attribute__((flatten)) float16_add(float16 a, float16 b,
732 float_status *status)
734 FloatParts pa = float16_unpack_canonical(a, status);
735 FloatParts pb = float16_unpack_canonical(b, status);
736 FloatParts pr = addsub_floats(pa, pb, false, status);
738 return float16_round_pack_canonical(pr, status);
741 float32 __attribute__((flatten)) float32_add(float32 a, float32 b,
742 float_status *status)
744 FloatParts pa = float32_unpack_canonical(a, status);
745 FloatParts pb = float32_unpack_canonical(b, status);
746 FloatParts pr = addsub_floats(pa, pb, false, status);
748 return float32_round_pack_canonical(pr, status);
751 float64 __attribute__((flatten)) float64_add(float64 a, float64 b,
752 float_status *status)
754 FloatParts pa = float64_unpack_canonical(a, status);
755 FloatParts pb = float64_unpack_canonical(b, status);
756 FloatParts pr = addsub_floats(pa, pb, false, status);
758 return float64_round_pack_canonical(pr, status);
761 float16 __attribute__((flatten)) float16_sub(float16 a, float16 b,
762 float_status *status)
764 FloatParts pa = float16_unpack_canonical(a, status);
765 FloatParts pb = float16_unpack_canonical(b, status);
766 FloatParts pr = addsub_floats(pa, pb, true, status);
768 return float16_round_pack_canonical(pr, status);
771 float32 __attribute__((flatten)) float32_sub(float32 a, float32 b,
772 float_status *status)
774 FloatParts pa = float32_unpack_canonical(a, status);
775 FloatParts pb = float32_unpack_canonical(b, status);
776 FloatParts pr = addsub_floats(pa, pb, true, status);
778 return float32_round_pack_canonical(pr, status);
781 float64 __attribute__((flatten)) float64_sub(float64 a, float64 b,
782 float_status *status)
784 FloatParts pa = float64_unpack_canonical(a, status);
785 FloatParts pb = float64_unpack_canonical(b, status);
786 FloatParts pr = addsub_floats(pa, pb, true, status);
788 return float64_round_pack_canonical(pr, status);
792 * Returns the result of multiplying the floating-point values `a' and
793 * `b'. The operation is performed according to the IEC/IEEE Standard
794 * for Binary Floating-Point Arithmetic.
797 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
799 bool sign = a.sign ^ b.sign;
801 if (a.cls == float_class_normal && b.cls == float_class_normal) {
802 uint64_t hi, lo;
803 int exp = a.exp + b.exp;
805 mul64To128(a.frac, b.frac, &hi, &lo);
806 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
807 if (lo & DECOMPOSED_OVERFLOW_BIT) {
808 shift64RightJamming(lo, 1, &lo);
809 exp += 1;
812 /* Re-use a */
813 a.exp = exp;
814 a.sign = sign;
815 a.frac = lo;
816 return a;
818 /* handle all the NaN cases */
819 if (is_nan(a.cls) || is_nan(b.cls)) {
820 return pick_nan(a, b, s);
822 /* Inf * Zero == NaN */
823 if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
824 (a.cls == float_class_zero && b.cls == float_class_inf)) {
825 s->float_exception_flags |= float_flag_invalid;
826 a.cls = float_class_dnan;
827 a.sign = sign;
828 return a;
830 /* Multiply by 0 or Inf */
831 if (a.cls == float_class_inf || a.cls == float_class_zero) {
832 a.sign = sign;
833 return a;
835 if (b.cls == float_class_inf || b.cls == float_class_zero) {
836 b.sign = sign;
837 return b;
839 g_assert_not_reached();
842 float16 __attribute__((flatten)) float16_mul(float16 a, float16 b,
843 float_status *status)
845 FloatParts pa = float16_unpack_canonical(a, status);
846 FloatParts pb = float16_unpack_canonical(b, status);
847 FloatParts pr = mul_floats(pa, pb, status);
849 return float16_round_pack_canonical(pr, status);
852 float32 __attribute__((flatten)) float32_mul(float32 a, float32 b,
853 float_status *status)
855 FloatParts pa = float32_unpack_canonical(a, status);
856 FloatParts pb = float32_unpack_canonical(b, status);
857 FloatParts pr = mul_floats(pa, pb, status);
859 return float32_round_pack_canonical(pr, status);
862 float64 __attribute__((flatten)) float64_mul(float64 a, float64 b,
863 float_status *status)
865 FloatParts pa = float64_unpack_canonical(a, status);
866 FloatParts pb = float64_unpack_canonical(b, status);
867 FloatParts pr = mul_floats(pa, pb, status);
869 return float64_round_pack_canonical(pr, status);
873 * Returns the result of multiplying the floating-point values `a' and
874 * `b' then adding 'c', with no intermediate rounding step after the
875 * multiplication. The operation is performed according to the
876 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
877 * The flags argument allows the caller to select negation of the
878 * addend, the intermediate product, or the final result. (The
879 * difference between this and having the caller do a separate
880 * negation is that negating externally will flip the sign bit on
881 * NaNs.)
884 static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
885 int flags, float_status *s)
887 bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
888 ((1 << float_class_inf) | (1 << float_class_zero));
889 bool p_sign;
890 bool sign_flip = flags & float_muladd_negate_result;
891 FloatClass p_class;
892 uint64_t hi, lo;
893 int p_exp;
895 /* It is implementation-defined whether the cases of (0,inf,qnan)
896 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
897 * they return if they do), so we have to hand this information
898 * off to the target-specific pick-a-NaN routine.
900 if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
901 return pick_nan_muladd(a, b, c, inf_zero, s);
904 if (inf_zero) {
905 s->float_exception_flags |= float_flag_invalid;
906 a.cls = float_class_dnan;
907 return a;
910 if (flags & float_muladd_negate_c) {
911 c.sign ^= 1;
914 p_sign = a.sign ^ b.sign;
916 if (flags & float_muladd_negate_product) {
917 p_sign ^= 1;
920 if (a.cls == float_class_inf || b.cls == float_class_inf) {
921 p_class = float_class_inf;
922 } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
923 p_class = float_class_zero;
924 } else {
925 p_class = float_class_normal;
928 if (c.cls == float_class_inf) {
929 if (p_class == float_class_inf && p_sign != c.sign) {
930 s->float_exception_flags |= float_flag_invalid;
931 a.cls = float_class_dnan;
932 } else {
933 a.cls = float_class_inf;
934 a.sign = c.sign ^ sign_flip;
936 return a;
939 if (p_class == float_class_inf) {
940 a.cls = float_class_inf;
941 a.sign = p_sign ^ sign_flip;
942 return a;
945 if (p_class == float_class_zero) {
946 if (c.cls == float_class_zero) {
947 if (p_sign != c.sign) {
948 p_sign = s->float_rounding_mode == float_round_down;
950 c.sign = p_sign;
951 } else if (flags & float_muladd_halve_result) {
952 c.exp -= 1;
954 c.sign ^= sign_flip;
955 return c;
958 /* a & b should be normals now... */
959 assert(a.cls == float_class_normal &&
960 b.cls == float_class_normal);
962 p_exp = a.exp + b.exp;
964 /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
965 * result.
967 mul64To128(a.frac, b.frac, &hi, &lo);
968 /* binary point now at bit 124 */
970 /* check for overflow */
971 if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
972 shift128RightJamming(hi, lo, 1, &hi, &lo);
973 p_exp += 1;
976 /* + add/sub */
977 if (c.cls == float_class_zero) {
978 /* move binary point back to 62 */
979 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
980 } else {
981 int exp_diff = p_exp - c.exp;
982 if (p_sign == c.sign) {
983 /* Addition */
984 if (exp_diff <= 0) {
985 shift128RightJamming(hi, lo,
986 DECOMPOSED_BINARY_POINT - exp_diff,
987 &hi, &lo);
988 lo += c.frac;
989 p_exp = c.exp;
990 } else {
991 uint64_t c_hi, c_lo;
992 /* shift c to the same binary point as the product (124) */
993 c_hi = c.frac >> 2;
994 c_lo = 0;
995 shift128RightJamming(c_hi, c_lo,
996 exp_diff,
997 &c_hi, &c_lo);
998 add128(hi, lo, c_hi, c_lo, &hi, &lo);
999 /* move binary point back to 62 */
1000 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1003 if (lo & DECOMPOSED_OVERFLOW_BIT) {
1004 shift64RightJamming(lo, 1, &lo);
1005 p_exp += 1;
1008 } else {
1009 /* Subtraction */
1010 uint64_t c_hi, c_lo;
1011 /* make C binary point match product at bit 124 */
1012 c_hi = c.frac >> 2;
1013 c_lo = 0;
1015 if (exp_diff <= 0) {
1016 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
1017 if (exp_diff == 0
1019 (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
1020 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1021 } else {
1022 sub128(c_hi, c_lo, hi, lo, &hi, &lo);
1023 p_sign ^= 1;
1024 p_exp = c.exp;
1026 } else {
1027 shift128RightJamming(c_hi, c_lo,
1028 exp_diff,
1029 &c_hi, &c_lo);
1030 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1033 if (hi == 0 && lo == 0) {
1034 a.cls = float_class_zero;
1035 a.sign = s->float_rounding_mode == float_round_down;
1036 a.sign ^= sign_flip;
1037 return a;
1038 } else {
1039 int shift;
1040 if (hi != 0) {
1041 shift = clz64(hi);
1042 } else {
1043 shift = clz64(lo) + 64;
1045 /* Normalizing to a binary point of 124 is the
1046 correct adjust for the exponent. However since we're
1047 shifting, we might as well put the binary point back
1048 at 62 where we really want it. Therefore shift as
1049 if we're leaving 1 bit at the top of the word, but
1050 adjust the exponent as if we're leaving 3 bits. */
1051 shift -= 1;
1052 if (shift >= 64) {
1053 lo = lo << (shift - 64);
1054 } else {
1055 hi = (hi << shift) | (lo >> (64 - shift));
1056 lo = hi | ((lo << shift) != 0);
1058 p_exp -= shift - 2;
1063 if (flags & float_muladd_halve_result) {
1064 p_exp -= 1;
1067 /* finally prepare our result */
1068 a.cls = float_class_normal;
1069 a.sign = p_sign ^ sign_flip;
1070 a.exp = p_exp;
1071 a.frac = lo;
1073 return a;
1076 float16 __attribute__((flatten)) float16_muladd(float16 a, float16 b, float16 c,
1077 int flags, float_status *status)
1079 FloatParts pa = float16_unpack_canonical(a, status);
1080 FloatParts pb = float16_unpack_canonical(b, status);
1081 FloatParts pc = float16_unpack_canonical(c, status);
1082 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1084 return float16_round_pack_canonical(pr, status);
1087 float32 __attribute__((flatten)) float32_muladd(float32 a, float32 b, float32 c,
1088 int flags, float_status *status)
1090 FloatParts pa = float32_unpack_canonical(a, status);
1091 FloatParts pb = float32_unpack_canonical(b, status);
1092 FloatParts pc = float32_unpack_canonical(c, status);
1093 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1095 return float32_round_pack_canonical(pr, status);
1098 float64 __attribute__((flatten)) float64_muladd(float64 a, float64 b, float64 c,
1099 int flags, float_status *status)
1101 FloatParts pa = float64_unpack_canonical(a, status);
1102 FloatParts pb = float64_unpack_canonical(b, status);
1103 FloatParts pc = float64_unpack_canonical(c, status);
1104 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1106 return float64_round_pack_canonical(pr, status);
1110 * Returns the result of dividing the floating-point value `a' by the
1111 * corresponding value `b'. The operation is performed according to
1112 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1115 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
1117 bool sign = a.sign ^ b.sign;
1119 if (a.cls == float_class_normal && b.cls == float_class_normal) {
1120 uint64_t temp_lo, temp_hi;
1121 int exp = a.exp - b.exp;
1122 if (a.frac < b.frac) {
1123 exp -= 1;
1124 shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1,
1125 &temp_hi, &temp_lo);
1126 } else {
1127 shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT,
1128 &temp_hi, &temp_lo);
1130 /* LSB of quot is set if inexact which roundandpack will use
1131 * to set flags. Yet again we re-use a for the result */
1132 a.frac = div128To64(temp_lo, temp_hi, b.frac);
1133 a.sign = sign;
1134 a.exp = exp;
1135 return a;
1137 /* handle all the NaN cases */
1138 if (is_nan(a.cls) || is_nan(b.cls)) {
1139 return pick_nan(a, b, s);
1141 /* 0/0 or Inf/Inf */
1142 if (a.cls == b.cls
1144 (a.cls == float_class_inf || a.cls == float_class_zero)) {
1145 s->float_exception_flags |= float_flag_invalid;
1146 a.cls = float_class_dnan;
1147 return a;
1149 /* Div 0 => Inf */
1150 if (b.cls == float_class_zero) {
1151 s->float_exception_flags |= float_flag_divbyzero;
1152 a.cls = float_class_inf;
1153 a.sign = sign;
1154 return a;
1156 /* Inf / x or 0 / x */
1157 if (a.cls == float_class_inf || a.cls == float_class_zero) {
1158 a.sign = sign;
1159 return a;
1161 /* Div by Inf */
1162 if (b.cls == float_class_inf) {
1163 a.cls = float_class_zero;
1164 a.sign = sign;
1165 return a;
1167 g_assert_not_reached();
1170 float16 float16_div(float16 a, float16 b, float_status *status)
1172 FloatParts pa = float16_unpack_canonical(a, status);
1173 FloatParts pb = float16_unpack_canonical(b, status);
1174 FloatParts pr = div_floats(pa, pb, status);
1176 return float16_round_pack_canonical(pr, status);
1179 float32 float32_div(float32 a, float32 b, float_status *status)
1181 FloatParts pa = float32_unpack_canonical(a, status);
1182 FloatParts pb = float32_unpack_canonical(b, status);
1183 FloatParts pr = div_floats(pa, pb, status);
1185 return float32_round_pack_canonical(pr, status);
1188 float64 float64_div(float64 a, float64 b, float_status *status)
1190 FloatParts pa = float64_unpack_canonical(a, status);
1191 FloatParts pb = float64_unpack_canonical(b, status);
1192 FloatParts pr = div_floats(pa, pb, status);
1194 return float64_round_pack_canonical(pr, status);
1198 * Rounds the floating-point value `a' to an integer, and returns the
1199 * result as a floating-point value. The operation is performed
1200 * according to the IEC/IEEE Standard for Binary Floating-Point
1201 * Arithmetic.
1204 static FloatParts round_to_int(FloatParts a, int rounding_mode, float_status *s)
1206 if (is_nan(a.cls)) {
1207 return return_nan(a, s);
1210 switch (a.cls) {
1211 case float_class_zero:
1212 case float_class_inf:
1213 case float_class_qnan:
1214 /* already "integral" */
1215 break;
1216 case float_class_normal:
1217 if (a.exp >= DECOMPOSED_BINARY_POINT) {
1218 /* already integral */
1219 break;
1221 if (a.exp < 0) {
1222 bool one;
1223 /* all fractional */
1224 s->float_exception_flags |= float_flag_inexact;
1225 switch (rounding_mode) {
1226 case float_round_nearest_even:
1227 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
1228 break;
1229 case float_round_ties_away:
1230 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
1231 break;
1232 case float_round_to_zero:
1233 one = false;
1234 break;
1235 case float_round_up:
1236 one = !a.sign;
1237 break;
1238 case float_round_down:
1239 one = a.sign;
1240 break;
1241 default:
1242 g_assert_not_reached();
1245 if (one) {
1246 a.frac = DECOMPOSED_IMPLICIT_BIT;
1247 a.exp = 0;
1248 } else {
1249 a.cls = float_class_zero;
1251 } else {
1252 uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
1253 uint64_t frac_lsbm1 = frac_lsb >> 1;
1254 uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
1255 uint64_t rnd_mask = rnd_even_mask >> 1;
1256 uint64_t inc;
1258 switch (rounding_mode) {
1259 case float_round_nearest_even:
1260 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
1261 break;
1262 case float_round_ties_away:
1263 inc = frac_lsbm1;
1264 break;
1265 case float_round_to_zero:
1266 inc = 0;
1267 break;
1268 case float_round_up:
1269 inc = a.sign ? 0 : rnd_mask;
1270 break;
1271 case float_round_down:
1272 inc = a.sign ? rnd_mask : 0;
1273 break;
1274 default:
1275 g_assert_not_reached();
1278 if (a.frac & rnd_mask) {
1279 s->float_exception_flags |= float_flag_inexact;
1280 a.frac += inc;
1281 a.frac &= ~rnd_mask;
1282 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
1283 a.frac >>= 1;
1284 a.exp++;
1288 break;
1289 default:
1290 g_assert_not_reached();
1292 return a;
1295 float16 float16_round_to_int(float16 a, float_status *s)
1297 FloatParts pa = float16_unpack_canonical(a, s);
1298 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1299 return float16_round_pack_canonical(pr, s);
1302 float32 float32_round_to_int(float32 a, float_status *s)
1304 FloatParts pa = float32_unpack_canonical(a, s);
1305 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1306 return float32_round_pack_canonical(pr, s);
1309 float64 float64_round_to_int(float64 a, float_status *s)
1311 FloatParts pa = float64_unpack_canonical(a, s);
1312 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1313 return float64_round_pack_canonical(pr, s);
1316 float64 float64_trunc_to_int(float64 a, float_status *s)
1318 FloatParts pa = float64_unpack_canonical(a, s);
1319 FloatParts pr = round_to_int(pa, float_round_to_zero, s);
1320 return float64_round_pack_canonical(pr, s);
1324 * Returns the result of converting the floating-point value `a' to
1325 * the two's complement integer format. The conversion is performed
1326 * according to the IEC/IEEE Standard for Binary Floating-Point
1327 * Arithmetic---which means in particular that the conversion is
1328 * rounded according to the current rounding mode. If `a' is a NaN,
1329 * the largest positive integer is returned. Otherwise, if the
1330 * conversion overflows, the largest integer with the same sign as `a'
1331 * is returned.
1334 static int64_t round_to_int_and_pack(FloatParts in, int rmode,
1335 int64_t min, int64_t max,
1336 float_status *s)
1338 uint64_t r;
1339 int orig_flags = get_float_exception_flags(s);
1340 FloatParts p = round_to_int(in, rmode, s);
1342 switch (p.cls) {
1343 case float_class_snan:
1344 case float_class_qnan:
1345 return max;
1346 case float_class_inf:
1347 return p.sign ? min : max;
1348 case float_class_zero:
1349 return 0;
1350 case float_class_normal:
1351 if (p.exp < DECOMPOSED_BINARY_POINT) {
1352 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1353 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1354 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1355 } else {
1356 r = UINT64_MAX;
1358 if (p.sign) {
1359 if (r < -(uint64_t) min) {
1360 return -r;
1361 } else {
1362 s->float_exception_flags = orig_flags | float_flag_invalid;
1363 return min;
1365 } else {
1366 if (r < max) {
1367 return r;
1368 } else {
1369 s->float_exception_flags = orig_flags | float_flag_invalid;
1370 return max;
1373 default:
1374 g_assert_not_reached();
1378 #define FLOAT_TO_INT(fsz, isz) \
1379 int ## isz ## _t float ## fsz ## _to_int ## isz(float ## fsz a, \
1380 float_status *s) \
1382 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1383 return round_to_int_and_pack(p, s->float_rounding_mode, \
1384 INT ## isz ## _MIN, INT ## isz ## _MAX,\
1385 s); \
1388 int ## isz ## _t float ## fsz ## _to_int ## isz ## _round_to_zero \
1389 (float ## fsz a, float_status *s) \
1391 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1392 return round_to_int_and_pack(p, float_round_to_zero, \
1393 INT ## isz ## _MIN, INT ## isz ## _MAX,\
1394 s); \
1397 FLOAT_TO_INT(16, 16)
1398 FLOAT_TO_INT(16, 32)
1399 FLOAT_TO_INT(16, 64)
1401 FLOAT_TO_INT(32, 16)
1402 FLOAT_TO_INT(32, 32)
1403 FLOAT_TO_INT(32, 64)
1405 FLOAT_TO_INT(64, 16)
1406 FLOAT_TO_INT(64, 32)
1407 FLOAT_TO_INT(64, 64)
1409 #undef FLOAT_TO_INT
1412 * Returns the result of converting the floating-point value `a' to
1413 * the unsigned integer format. The conversion is performed according
1414 * to the IEC/IEEE Standard for Binary Floating-Point
1415 * Arithmetic---which means in particular that the conversion is
1416 * rounded according to the current rounding mode. If `a' is a NaN,
1417 * the largest unsigned integer is returned. Otherwise, if the
1418 * conversion overflows, the largest unsigned integer is returned. If
1419 * the 'a' is negative, the result is rounded and zero is returned;
1420 * values that do not round to zero will raise the inexact exception
1421 * flag.
1424 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, uint64_t max,
1425 float_status *s)
1427 int orig_flags = get_float_exception_flags(s);
1428 FloatParts p = round_to_int(in, rmode, s);
1430 switch (p.cls) {
1431 case float_class_snan:
1432 case float_class_qnan:
1433 s->float_exception_flags = orig_flags | float_flag_invalid;
1434 return max;
1435 case float_class_inf:
1436 return p.sign ? 0 : max;
1437 case float_class_zero:
1438 return 0;
1439 case float_class_normal:
1441 uint64_t r;
1442 if (p.sign) {
1443 s->float_exception_flags = orig_flags | float_flag_invalid;
1444 return 0;
1447 if (p.exp < DECOMPOSED_BINARY_POINT) {
1448 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1449 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1450 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1451 } else {
1452 s->float_exception_flags = orig_flags | float_flag_invalid;
1453 return max;
1456 /* For uint64 this will never trip, but if p.exp is too large
1457 * to shift a decomposed fraction we shall have exited via the
1458 * 3rd leg above.
1460 if (r > max) {
1461 s->float_exception_flags = orig_flags | float_flag_invalid;
1462 return max;
1463 } else {
1464 return r;
1467 default:
1468 g_assert_not_reached();
1472 #define FLOAT_TO_UINT(fsz, isz) \
1473 uint ## isz ## _t float ## fsz ## _to_uint ## isz(float ## fsz a, \
1474 float_status *s) \
1476 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1477 return round_to_uint_and_pack(p, s->float_rounding_mode, \
1478 UINT ## isz ## _MAX, s); \
1481 uint ## isz ## _t float ## fsz ## _to_uint ## isz ## _round_to_zero \
1482 (float ## fsz a, float_status *s) \
1484 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1485 return round_to_uint_and_pack(p, s->float_rounding_mode, \
1486 UINT ## isz ## _MAX, s); \
1489 FLOAT_TO_UINT(16, 16)
1490 FLOAT_TO_UINT(16, 32)
1491 FLOAT_TO_UINT(16, 64)
1493 FLOAT_TO_UINT(32, 16)
1494 FLOAT_TO_UINT(32, 32)
1495 FLOAT_TO_UINT(32, 64)
1497 FLOAT_TO_UINT(64, 16)
1498 FLOAT_TO_UINT(64, 32)
1499 FLOAT_TO_UINT(64, 64)
1501 #undef FLOAT_TO_UINT
1504 * Integer to float conversions
1506 * Returns the result of converting the two's complement integer `a'
1507 * to the floating-point format. The conversion is performed according
1508 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1511 static FloatParts int_to_float(int64_t a, float_status *status)
1513 FloatParts r;
1514 if (a == 0) {
1515 r.cls = float_class_zero;
1516 r.sign = false;
1517 } else if (a == (1ULL << 63)) {
1518 r.cls = float_class_normal;
1519 r.sign = true;
1520 r.frac = DECOMPOSED_IMPLICIT_BIT;
1521 r.exp = 63;
1522 } else {
1523 uint64_t f;
1524 if (a < 0) {
1525 f = -a;
1526 r.sign = true;
1527 } else {
1528 f = a;
1529 r.sign = false;
1531 int shift = clz64(f) - 1;
1532 r.cls = float_class_normal;
1533 r.exp = (DECOMPOSED_BINARY_POINT - shift);
1534 r.frac = f << shift;
1537 return r;
1540 float16 int64_to_float16(int64_t a, float_status *status)
1542 FloatParts pa = int_to_float(a, status);
1543 return float16_round_pack_canonical(pa, status);
1546 float16 int32_to_float16(int32_t a, float_status *status)
1548 return int64_to_float16(a, status);
1551 float16 int16_to_float16(int16_t a, float_status *status)
1553 return int64_to_float16(a, status);
1556 float32 int64_to_float32(int64_t a, float_status *status)
1558 FloatParts pa = int_to_float(a, status);
1559 return float32_round_pack_canonical(pa, status);
1562 float32 int32_to_float32(int32_t a, float_status *status)
1564 return int64_to_float32(a, status);
1567 float32 int16_to_float32(int16_t a, float_status *status)
1569 return int64_to_float32(a, status);
1572 float64 int64_to_float64(int64_t a, float_status *status)
1574 FloatParts pa = int_to_float(a, status);
1575 return float64_round_pack_canonical(pa, status);
1578 float64 int32_to_float64(int32_t a, float_status *status)
1580 return int64_to_float64(a, status);
1583 float64 int16_to_float64(int16_t a, float_status *status)
1585 return int64_to_float64(a, status);
1590 * Unsigned Integer to float conversions
1592 * Returns the result of converting the unsigned integer `a' to the
1593 * floating-point format. The conversion is performed according to the
1594 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1597 static FloatParts uint_to_float(uint64_t a, float_status *status)
1599 FloatParts r = { .sign = false};
1601 if (a == 0) {
1602 r.cls = float_class_zero;
1603 } else {
1604 int spare_bits = clz64(a) - 1;
1605 r.cls = float_class_normal;
1606 r.exp = DECOMPOSED_BINARY_POINT - spare_bits;
1607 if (spare_bits < 0) {
1608 shift64RightJamming(a, -spare_bits, &a);
1609 r.frac = a;
1610 } else {
1611 r.frac = a << spare_bits;
1615 return r;
1618 float16 uint64_to_float16(uint64_t a, float_status *status)
1620 FloatParts pa = uint_to_float(a, status);
1621 return float16_round_pack_canonical(pa, status);
1624 float16 uint32_to_float16(uint32_t a, float_status *status)
1626 return uint64_to_float16(a, status);
1629 float16 uint16_to_float16(uint16_t a, float_status *status)
1631 return uint64_to_float16(a, status);
1634 float32 uint64_to_float32(uint64_t a, float_status *status)
1636 FloatParts pa = uint_to_float(a, status);
1637 return float32_round_pack_canonical(pa, status);
1640 float32 uint32_to_float32(uint32_t a, float_status *status)
1642 return uint64_to_float32(a, status);
1645 float32 uint16_to_float32(uint16_t a, float_status *status)
1647 return uint64_to_float32(a, status);
1650 float64 uint64_to_float64(uint64_t a, float_status *status)
1652 FloatParts pa = uint_to_float(a, status);
1653 return float64_round_pack_canonical(pa, status);
1656 float64 uint32_to_float64(uint32_t a, float_status *status)
1658 return uint64_to_float64(a, status);
1661 float64 uint16_to_float64(uint16_t a, float_status *status)
1663 return uint64_to_float64(a, status);
1666 /* Float Min/Max */
1667 /* min() and max() functions. These can't be implemented as
1668 * 'compare and pick one input' because that would mishandle
1669 * NaNs and +0 vs -0.
1671 * minnum() and maxnum() functions. These are similar to the min()
1672 * and max() functions but if one of the arguments is a QNaN and
1673 * the other is numerical then the numerical argument is returned.
1674 * SNaNs will get quietened before being returned.
1675 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
1676 * and maxNum() operations. min() and max() are the typical min/max
1677 * semantics provided by many CPUs which predate that specification.
1679 * minnummag() and maxnummag() functions correspond to minNumMag()
1680 * and minNumMag() from the IEEE-754 2008.
1682 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin,
1683 bool ieee, bool ismag, float_status *s)
1685 if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
1686 if (ieee) {
1687 /* Takes two floating-point values `a' and `b', one of
1688 * which is a NaN, and returns the appropriate NaN
1689 * result. If either `a' or `b' is a signaling NaN,
1690 * the invalid exception is raised.
1692 if (is_snan(a.cls) || is_snan(b.cls)) {
1693 return pick_nan(a, b, s);
1694 } else if (is_nan(a.cls) && !is_nan(b.cls)) {
1695 return b;
1696 } else if (is_nan(b.cls) && !is_nan(a.cls)) {
1697 return a;
1700 return pick_nan(a, b, s);
1701 } else {
1702 int a_exp, b_exp;
1703 bool a_sign, b_sign;
1705 switch (a.cls) {
1706 case float_class_normal:
1707 a_exp = a.exp;
1708 break;
1709 case float_class_inf:
1710 a_exp = INT_MAX;
1711 break;
1712 case float_class_zero:
1713 a_exp = INT_MIN;
1714 break;
1715 default:
1716 g_assert_not_reached();
1717 break;
1719 switch (b.cls) {
1720 case float_class_normal:
1721 b_exp = b.exp;
1722 break;
1723 case float_class_inf:
1724 b_exp = INT_MAX;
1725 break;
1726 case float_class_zero:
1727 b_exp = INT_MIN;
1728 break;
1729 default:
1730 g_assert_not_reached();
1731 break;
1734 a_sign = a.sign;
1735 b_sign = b.sign;
1736 if (ismag) {
1737 a_sign = b_sign = 0;
1740 if (a_sign == b_sign) {
1741 bool a_less = a_exp < b_exp;
1742 if (a_exp == b_exp) {
1743 a_less = a.frac < b.frac;
1745 return a_sign ^ a_less ^ ismin ? b : a;
1746 } else {
1747 return a_sign ^ ismin ? b : a;
1752 #define MINMAX(sz, name, ismin, isiee, ismag) \
1753 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \
1754 float_status *s) \
1756 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
1757 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
1758 FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \
1760 return float ## sz ## _round_pack_canonical(pr, s); \
1763 MINMAX(16, min, true, false, false)
1764 MINMAX(16, minnum, true, true, false)
1765 MINMAX(16, minnummag, true, true, true)
1766 MINMAX(16, max, false, false, false)
1767 MINMAX(16, maxnum, false, true, false)
1768 MINMAX(16, maxnummag, false, true, true)
1770 MINMAX(32, min, true, false, false)
1771 MINMAX(32, minnum, true, true, false)
1772 MINMAX(32, minnummag, true, true, true)
1773 MINMAX(32, max, false, false, false)
1774 MINMAX(32, maxnum, false, true, false)
1775 MINMAX(32, maxnummag, false, true, true)
1777 MINMAX(64, min, true, false, false)
1778 MINMAX(64, minnum, true, true, false)
1779 MINMAX(64, minnummag, true, true, true)
1780 MINMAX(64, max, false, false, false)
1781 MINMAX(64, maxnum, false, true, false)
1782 MINMAX(64, maxnummag, false, true, true)
1784 #undef MINMAX
1786 /* Floating point compare */
1787 static int compare_floats(FloatParts a, FloatParts b, bool is_quiet,
1788 float_status *s)
1790 if (is_nan(a.cls) || is_nan(b.cls)) {
1791 if (!is_quiet ||
1792 a.cls == float_class_snan ||
1793 b.cls == float_class_snan) {
1794 s->float_exception_flags |= float_flag_invalid;
1796 return float_relation_unordered;
1799 if (a.cls == float_class_zero) {
1800 if (b.cls == float_class_zero) {
1801 return float_relation_equal;
1803 return b.sign ? float_relation_greater : float_relation_less;
1804 } else if (b.cls == float_class_zero) {
1805 return a.sign ? float_relation_less : float_relation_greater;
1808 /* The only really important thing about infinity is its sign. If
1809 * both are infinities the sign marks the smallest of the two.
1811 if (a.cls == float_class_inf) {
1812 if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
1813 return float_relation_equal;
1815 return a.sign ? float_relation_less : float_relation_greater;
1816 } else if (b.cls == float_class_inf) {
1817 return b.sign ? float_relation_greater : float_relation_less;
1820 if (a.sign != b.sign) {
1821 return a.sign ? float_relation_less : float_relation_greater;
1824 if (a.exp == b.exp) {
1825 if (a.frac == b.frac) {
1826 return float_relation_equal;
1828 if (a.sign) {
1829 return a.frac > b.frac ?
1830 float_relation_less : float_relation_greater;
1831 } else {
1832 return a.frac > b.frac ?
1833 float_relation_greater : float_relation_less;
1835 } else {
1836 if (a.sign) {
1837 return a.exp > b.exp ? float_relation_less : float_relation_greater;
1838 } else {
1839 return a.exp > b.exp ? float_relation_greater : float_relation_less;
1844 #define COMPARE(sz) \
1845 int float ## sz ## _compare(float ## sz a, float ## sz b, \
1846 float_status *s) \
1848 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
1849 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
1850 return compare_floats(pa, pb, false, s); \
1852 int float ## sz ## _compare_quiet(float ## sz a, float ## sz b, \
1853 float_status *s) \
1855 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
1856 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
1857 return compare_floats(pa, pb, true, s); \
1860 COMPARE(16)
1861 COMPARE(32)
1862 COMPARE(64)
1864 #undef COMPARE
1866 /* Multiply A by 2 raised to the power N. */
1867 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s)
1869 if (unlikely(is_nan(a.cls))) {
1870 return return_nan(a, s);
1872 if (a.cls == float_class_normal) {
1873 a.exp += n;
1875 return a;
1878 float16 float16_scalbn(float16 a, int n, float_status *status)
1880 FloatParts pa = float16_unpack_canonical(a, status);
1881 FloatParts pr = scalbn_decomposed(pa, n, status);
1882 return float16_round_pack_canonical(pr, status);
1885 float32 float32_scalbn(float32 a, int n, float_status *status)
1887 FloatParts pa = float32_unpack_canonical(a, status);
1888 FloatParts pr = scalbn_decomposed(pa, n, status);
1889 return float32_round_pack_canonical(pr, status);
1892 float64 float64_scalbn(float64 a, int n, float_status *status)
1894 FloatParts pa = float64_unpack_canonical(a, status);
1895 FloatParts pr = scalbn_decomposed(pa, n, status);
1896 return float64_round_pack_canonical(pr, status);
1900 * Square Root
1902 * The old softfloat code did an approximation step before zeroing in
1903 * on the final result. However for simpleness we just compute the
1904 * square root by iterating down from the implicit bit to enough extra
1905 * bits to ensure we get a correctly rounded result.
1907 * This does mean however the calculation is slower than before,
1908 * especially for 64 bit floats.
1911 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p)
1913 uint64_t a_frac, r_frac, s_frac;
1914 int bit, last_bit;
1916 if (is_nan(a.cls)) {
1917 return return_nan(a, s);
1919 if (a.cls == float_class_zero) {
1920 return a; /* sqrt(+-0) = +-0 */
1922 if (a.sign) {
1923 s->float_exception_flags |= float_flag_invalid;
1924 a.cls = float_class_dnan;
1925 return a;
1927 if (a.cls == float_class_inf) {
1928 return a; /* sqrt(+inf) = +inf */
1931 assert(a.cls == float_class_normal);
1933 /* We need two overflow bits at the top. Adding room for that is a
1934 * right shift. If the exponent is odd, we can discard the low bit
1935 * by multiplying the fraction by 2; that's a left shift. Combine
1936 * those and we shift right if the exponent is even.
1938 a_frac = a.frac;
1939 if (!(a.exp & 1)) {
1940 a_frac >>= 1;
1942 a.exp >>= 1;
1944 /* Bit-by-bit computation of sqrt. */
1945 r_frac = 0;
1946 s_frac = 0;
1948 /* Iterate from implicit bit down to the 3 extra bits to compute a
1949 * properly rounded result. Remember we've inserted one more bit
1950 * at the top, so these positions are one less.
1952 bit = DECOMPOSED_BINARY_POINT - 1;
1953 last_bit = MAX(p->frac_shift - 4, 0);
1954 do {
1955 uint64_t q = 1ULL << bit;
1956 uint64_t t_frac = s_frac + q;
1957 if (t_frac <= a_frac) {
1958 s_frac = t_frac + q;
1959 a_frac -= t_frac;
1960 r_frac += q;
1962 a_frac <<= 1;
1963 } while (--bit >= last_bit);
1965 /* Undo the right shift done above. If there is any remaining
1966 * fraction, the result is inexact. Set the sticky bit.
1968 a.frac = (r_frac << 1) + (a_frac != 0);
1970 return a;
1973 float16 __attribute__((flatten)) float16_sqrt(float16 a, float_status *status)
1975 FloatParts pa = float16_unpack_canonical(a, status);
1976 FloatParts pr = sqrt_float(pa, status, &float16_params);
1977 return float16_round_pack_canonical(pr, status);
1980 float32 __attribute__((flatten)) float32_sqrt(float32 a, float_status *status)
1982 FloatParts pa = float32_unpack_canonical(a, status);
1983 FloatParts pr = sqrt_float(pa, status, &float32_params);
1984 return float32_round_pack_canonical(pr, status);
1987 float64 __attribute__((flatten)) float64_sqrt(float64 a, float_status *status)
1989 FloatParts pa = float64_unpack_canonical(a, status);
1990 FloatParts pr = sqrt_float(pa, status, &float64_params);
1991 return float64_round_pack_canonical(pr, status);
1995 /*----------------------------------------------------------------------------
1996 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
1997 | and 7, and returns the properly rounded 32-bit integer corresponding to the
1998 | input. If `zSign' is 1, the input is negated before being converted to an
1999 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
2000 | is simply rounded to an integer, with the inexact exception raised if the
2001 | input cannot be represented exactly as an integer. However, if the fixed-
2002 | point input is too large, the invalid exception is raised and the largest
2003 | positive or negative integer is returned.
2004 *----------------------------------------------------------------------------*/
2006 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
2008 int8_t roundingMode;
2009 flag roundNearestEven;
2010 int8_t roundIncrement, roundBits;
2011 int32_t z;
2013 roundingMode = status->float_rounding_mode;
2014 roundNearestEven = ( roundingMode == float_round_nearest_even );
2015 switch (roundingMode) {
2016 case float_round_nearest_even:
2017 case float_round_ties_away:
2018 roundIncrement = 0x40;
2019 break;
2020 case float_round_to_zero:
2021 roundIncrement = 0;
2022 break;
2023 case float_round_up:
2024 roundIncrement = zSign ? 0 : 0x7f;
2025 break;
2026 case float_round_down:
2027 roundIncrement = zSign ? 0x7f : 0;
2028 break;
2029 default:
2030 abort();
2032 roundBits = absZ & 0x7F;
2033 absZ = ( absZ + roundIncrement )>>7;
2034 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2035 z = absZ;
2036 if ( zSign ) z = - z;
2037 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
2038 float_raise(float_flag_invalid, status);
2039 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2041 if (roundBits) {
2042 status->float_exception_flags |= float_flag_inexact;
2044 return z;
2048 /*----------------------------------------------------------------------------
2049 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2050 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2051 | and returns the properly rounded 64-bit integer corresponding to the input.
2052 | If `zSign' is 1, the input is negated before being converted to an integer.
2053 | Ordinarily, the fixed-point input is simply rounded to an integer, with
2054 | the inexact exception raised if the input cannot be represented exactly as
2055 | an integer. However, if the fixed-point input is too large, the invalid
2056 | exception is raised and the largest positive or negative integer is
2057 | returned.
2058 *----------------------------------------------------------------------------*/
2060 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
2061 float_status *status)
2063 int8_t roundingMode;
2064 flag roundNearestEven, increment;
2065 int64_t z;
2067 roundingMode = status->float_rounding_mode;
2068 roundNearestEven = ( roundingMode == float_round_nearest_even );
2069 switch (roundingMode) {
2070 case float_round_nearest_even:
2071 case float_round_ties_away:
2072 increment = ((int64_t) absZ1 < 0);
2073 break;
2074 case float_round_to_zero:
2075 increment = 0;
2076 break;
2077 case float_round_up:
2078 increment = !zSign && absZ1;
2079 break;
2080 case float_round_down:
2081 increment = zSign && absZ1;
2082 break;
2083 default:
2084 abort();
2086 if ( increment ) {
2087 ++absZ0;
2088 if ( absZ0 == 0 ) goto overflow;
2089 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
2091 z = absZ0;
2092 if ( zSign ) z = - z;
2093 if ( z && ( ( z < 0 ) ^ zSign ) ) {
2094 overflow:
2095 float_raise(float_flag_invalid, status);
2096 return
2097 zSign ? (int64_t) LIT64( 0x8000000000000000 )
2098 : LIT64( 0x7FFFFFFFFFFFFFFF );
2100 if (absZ1) {
2101 status->float_exception_flags |= float_flag_inexact;
2103 return z;
2107 /*----------------------------------------------------------------------------
2108 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2109 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2110 | and returns the properly rounded 64-bit unsigned integer corresponding to the
2111 | input. Ordinarily, the fixed-point input is simply rounded to an integer,
2112 | with the inexact exception raised if the input cannot be represented exactly
2113 | as an integer. However, if the fixed-point input is too large, the invalid
2114 | exception is raised and the largest unsigned integer is returned.
2115 *----------------------------------------------------------------------------*/
2117 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
2118 uint64_t absZ1, float_status *status)
2120 int8_t roundingMode;
2121 flag roundNearestEven, increment;
2123 roundingMode = status->float_rounding_mode;
2124 roundNearestEven = (roundingMode == float_round_nearest_even);
2125 switch (roundingMode) {
2126 case float_round_nearest_even:
2127 case float_round_ties_away:
2128 increment = ((int64_t)absZ1 < 0);
2129 break;
2130 case float_round_to_zero:
2131 increment = 0;
2132 break;
2133 case float_round_up:
2134 increment = !zSign && absZ1;
2135 break;
2136 case float_round_down:
2137 increment = zSign && absZ1;
2138 break;
2139 default:
2140 abort();
2142 if (increment) {
2143 ++absZ0;
2144 if (absZ0 == 0) {
2145 float_raise(float_flag_invalid, status);
2146 return LIT64(0xFFFFFFFFFFFFFFFF);
2148 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
2151 if (zSign && absZ0) {
2152 float_raise(float_flag_invalid, status);
2153 return 0;
2156 if (absZ1) {
2157 status->float_exception_flags |= float_flag_inexact;
2159 return absZ0;
2162 /*----------------------------------------------------------------------------
2163 | If `a' is denormal and we are in flush-to-zero mode then set the
2164 | input-denormal exception and return zero. Otherwise just return the value.
2165 *----------------------------------------------------------------------------*/
2166 float32 float32_squash_input_denormal(float32 a, float_status *status)
2168 if (status->flush_inputs_to_zero) {
2169 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
2170 float_raise(float_flag_input_denormal, status);
2171 return make_float32(float32_val(a) & 0x80000000);
2174 return a;
2177 /*----------------------------------------------------------------------------
2178 | Normalizes the subnormal single-precision floating-point value represented
2179 | by the denormalized significand `aSig'. The normalized exponent and
2180 | significand are stored at the locations pointed to by `zExpPtr' and
2181 | `zSigPtr', respectively.
2182 *----------------------------------------------------------------------------*/
2184 static void
2185 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
2187 int8_t shiftCount;
2189 shiftCount = countLeadingZeros32( aSig ) - 8;
2190 *zSigPtr = aSig<<shiftCount;
2191 *zExpPtr = 1 - shiftCount;
2195 /*----------------------------------------------------------------------------
2196 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2197 | single-precision floating-point value, returning the result. After being
2198 | shifted into the proper positions, the three fields are simply added
2199 | together to form the result. This means that any integer portion of `zSig'
2200 | will be added into the exponent. Since a properly normalized significand
2201 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2202 | than the desired result exponent whenever `zSig' is a complete, normalized
2203 | significand.
2204 *----------------------------------------------------------------------------*/
2206 static inline float32 packFloat32(flag zSign, int zExp, uint32_t zSig)
2209 return make_float32(
2210 ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
2214 /*----------------------------------------------------------------------------
2215 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2216 | and significand `zSig', and returns the proper single-precision floating-
2217 | point value corresponding to the abstract input. Ordinarily, the abstract
2218 | value is simply rounded and packed into the single-precision format, with
2219 | the inexact exception raised if the abstract input cannot be represented
2220 | exactly. However, if the abstract value is too large, the overflow and
2221 | inexact exceptions are raised and an infinity or maximal finite value is
2222 | returned. If the abstract value is too small, the input value is rounded to
2223 | a subnormal number, and the underflow and inexact exceptions are raised if
2224 | the abstract input cannot be represented exactly as a subnormal single-
2225 | precision floating-point number.
2226 | The input significand `zSig' has its binary point between bits 30
2227 | and 29, which is 7 bits to the left of the usual location. This shifted
2228 | significand must be normalized or smaller. If `zSig' is not normalized,
2229 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2230 | and it must not require rounding. In the usual case that `zSig' is
2231 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2232 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2233 | Binary Floating-Point Arithmetic.
2234 *----------------------------------------------------------------------------*/
2236 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
2237 float_status *status)
2239 int8_t roundingMode;
2240 flag roundNearestEven;
2241 int8_t roundIncrement, roundBits;
2242 flag isTiny;
2244 roundingMode = status->float_rounding_mode;
2245 roundNearestEven = ( roundingMode == float_round_nearest_even );
2246 switch (roundingMode) {
2247 case float_round_nearest_even:
2248 case float_round_ties_away:
2249 roundIncrement = 0x40;
2250 break;
2251 case float_round_to_zero:
2252 roundIncrement = 0;
2253 break;
2254 case float_round_up:
2255 roundIncrement = zSign ? 0 : 0x7f;
2256 break;
2257 case float_round_down:
2258 roundIncrement = zSign ? 0x7f : 0;
2259 break;
2260 default:
2261 abort();
2262 break;
2264 roundBits = zSig & 0x7F;
2265 if ( 0xFD <= (uint16_t) zExp ) {
2266 if ( ( 0xFD < zExp )
2267 || ( ( zExp == 0xFD )
2268 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
2270 float_raise(float_flag_overflow | float_flag_inexact, status);
2271 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
2273 if ( zExp < 0 ) {
2274 if (status->flush_to_zero) {
2275 float_raise(float_flag_output_denormal, status);
2276 return packFloat32(zSign, 0, 0);
2278 isTiny =
2279 (status->float_detect_tininess
2280 == float_tininess_before_rounding)
2281 || ( zExp < -1 )
2282 || ( zSig + roundIncrement < 0x80000000 );
2283 shift32RightJamming( zSig, - zExp, &zSig );
2284 zExp = 0;
2285 roundBits = zSig & 0x7F;
2286 if (isTiny && roundBits) {
2287 float_raise(float_flag_underflow, status);
2291 if (roundBits) {
2292 status->float_exception_flags |= float_flag_inexact;
2294 zSig = ( zSig + roundIncrement )>>7;
2295 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2296 if ( zSig == 0 ) zExp = 0;
2297 return packFloat32( zSign, zExp, zSig );
2301 /*----------------------------------------------------------------------------
2302 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2303 | and significand `zSig', and returns the proper single-precision floating-
2304 | point value corresponding to the abstract input. This routine is just like
2305 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
2306 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2307 | floating-point exponent.
2308 *----------------------------------------------------------------------------*/
2310 static float32
2311 normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
2312 float_status *status)
2314 int8_t shiftCount;
2316 shiftCount = countLeadingZeros32( zSig ) - 1;
2317 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
2318 status);
2322 /*----------------------------------------------------------------------------
2323 | If `a' is denormal and we are in flush-to-zero mode then set the
2324 | input-denormal exception and return zero. Otherwise just return the value.
2325 *----------------------------------------------------------------------------*/
2326 float64 float64_squash_input_denormal(float64 a, float_status *status)
2328 if (status->flush_inputs_to_zero) {
2329 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
2330 float_raise(float_flag_input_denormal, status);
2331 return make_float64(float64_val(a) & (1ULL << 63));
2334 return a;
2337 /*----------------------------------------------------------------------------
2338 | Normalizes the subnormal double-precision floating-point value represented
2339 | by the denormalized significand `aSig'. The normalized exponent and
2340 | significand are stored at the locations pointed to by `zExpPtr' and
2341 | `zSigPtr', respectively.
2342 *----------------------------------------------------------------------------*/
2344 static void
2345 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
2347 int8_t shiftCount;
2349 shiftCount = countLeadingZeros64( aSig ) - 11;
2350 *zSigPtr = aSig<<shiftCount;
2351 *zExpPtr = 1 - shiftCount;
2355 /*----------------------------------------------------------------------------
2356 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2357 | double-precision floating-point value, returning the result. After being
2358 | shifted into the proper positions, the three fields are simply added
2359 | together to form the result. This means that any integer portion of `zSig'
2360 | will be added into the exponent. Since a properly normalized significand
2361 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2362 | than the desired result exponent whenever `zSig' is a complete, normalized
2363 | significand.
2364 *----------------------------------------------------------------------------*/
2366 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
2369 return make_float64(
2370 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
2374 /*----------------------------------------------------------------------------
2375 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2376 | and significand `zSig', and returns the proper double-precision floating-
2377 | point value corresponding to the abstract input. Ordinarily, the abstract
2378 | value is simply rounded and packed into the double-precision format, with
2379 | the inexact exception raised if the abstract input cannot be represented
2380 | exactly. However, if the abstract value is too large, the overflow and
2381 | inexact exceptions are raised and an infinity or maximal finite value is
2382 | returned. If the abstract value is too small, the input value is rounded to
2383 | a subnormal number, and the underflow and inexact exceptions are raised if
2384 | the abstract input cannot be represented exactly as a subnormal double-
2385 | precision floating-point number.
2386 | The input significand `zSig' has its binary point between bits 62
2387 | and 61, which is 10 bits to the left of the usual location. This shifted
2388 | significand must be normalized or smaller. If `zSig' is not normalized,
2389 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2390 | and it must not require rounding. In the usual case that `zSig' is
2391 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2392 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2393 | Binary Floating-Point Arithmetic.
2394 *----------------------------------------------------------------------------*/
2396 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2397 float_status *status)
2399 int8_t roundingMode;
2400 flag roundNearestEven;
2401 int roundIncrement, roundBits;
2402 flag isTiny;
2404 roundingMode = status->float_rounding_mode;
2405 roundNearestEven = ( roundingMode == float_round_nearest_even );
2406 switch (roundingMode) {
2407 case float_round_nearest_even:
2408 case float_round_ties_away:
2409 roundIncrement = 0x200;
2410 break;
2411 case float_round_to_zero:
2412 roundIncrement = 0;
2413 break;
2414 case float_round_up:
2415 roundIncrement = zSign ? 0 : 0x3ff;
2416 break;
2417 case float_round_down:
2418 roundIncrement = zSign ? 0x3ff : 0;
2419 break;
2420 case float_round_to_odd:
2421 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2422 break;
2423 default:
2424 abort();
2426 roundBits = zSig & 0x3FF;
2427 if ( 0x7FD <= (uint16_t) zExp ) {
2428 if ( ( 0x7FD < zExp )
2429 || ( ( zExp == 0x7FD )
2430 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
2432 bool overflow_to_inf = roundingMode != float_round_to_odd &&
2433 roundIncrement != 0;
2434 float_raise(float_flag_overflow | float_flag_inexact, status);
2435 return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
2437 if ( zExp < 0 ) {
2438 if (status->flush_to_zero) {
2439 float_raise(float_flag_output_denormal, status);
2440 return packFloat64(zSign, 0, 0);
2442 isTiny =
2443 (status->float_detect_tininess
2444 == float_tininess_before_rounding)
2445 || ( zExp < -1 )
2446 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
2447 shift64RightJamming( zSig, - zExp, &zSig );
2448 zExp = 0;
2449 roundBits = zSig & 0x3FF;
2450 if (isTiny && roundBits) {
2451 float_raise(float_flag_underflow, status);
2453 if (roundingMode == float_round_to_odd) {
2455 * For round-to-odd case, the roundIncrement depends on
2456 * zSig which just changed.
2458 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2462 if (roundBits) {
2463 status->float_exception_flags |= float_flag_inexact;
2465 zSig = ( zSig + roundIncrement )>>10;
2466 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
2467 if ( zSig == 0 ) zExp = 0;
2468 return packFloat64( zSign, zExp, zSig );
2472 /*----------------------------------------------------------------------------
2473 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2474 | and significand `zSig', and returns the proper double-precision floating-
2475 | point value corresponding to the abstract input. This routine is just like
2476 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
2477 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2478 | floating-point exponent.
2479 *----------------------------------------------------------------------------*/
2481 static float64
2482 normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2483 float_status *status)
2485 int8_t shiftCount;
2487 shiftCount = countLeadingZeros64( zSig ) - 1;
2488 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
2489 status);
2493 /*----------------------------------------------------------------------------
2494 | Returns the fraction bits of the extended double-precision floating-point
2495 | value `a'.
2496 *----------------------------------------------------------------------------*/
2498 static inline uint64_t extractFloatx80Frac( floatx80 a )
2501 return a.low;
2505 /*----------------------------------------------------------------------------
2506 | Returns the exponent bits of the extended double-precision floating-point
2507 | value `a'.
2508 *----------------------------------------------------------------------------*/
2510 static inline int32_t extractFloatx80Exp( floatx80 a )
2513 return a.high & 0x7FFF;
2517 /*----------------------------------------------------------------------------
2518 | Returns the sign bit of the extended double-precision floating-point value
2519 | `a'.
2520 *----------------------------------------------------------------------------*/
2522 static inline flag extractFloatx80Sign( floatx80 a )
2525 return a.high>>15;
2529 /*----------------------------------------------------------------------------
2530 | Normalizes the subnormal extended double-precision floating-point value
2531 | represented by the denormalized significand `aSig'. The normalized exponent
2532 | and significand are stored at the locations pointed to by `zExpPtr' and
2533 | `zSigPtr', respectively.
2534 *----------------------------------------------------------------------------*/
2536 static void
2537 normalizeFloatx80Subnormal( uint64_t aSig, int32_t *zExpPtr, uint64_t *zSigPtr )
2539 int8_t shiftCount;
2541 shiftCount = countLeadingZeros64( aSig );
2542 *zSigPtr = aSig<<shiftCount;
2543 *zExpPtr = 1 - shiftCount;
2547 /*----------------------------------------------------------------------------
2548 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
2549 | extended double-precision floating-point value, returning the result.
2550 *----------------------------------------------------------------------------*/
2552 static inline floatx80 packFloatx80( flag zSign, int32_t zExp, uint64_t zSig )
2554 floatx80 z;
2556 z.low = zSig;
2557 z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
2558 return z;
2562 /*----------------------------------------------------------------------------
2563 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2564 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
2565 | and returns the proper extended double-precision floating-point value
2566 | corresponding to the abstract input. Ordinarily, the abstract value is
2567 | rounded and packed into the extended double-precision format, with the
2568 | inexact exception raised if the abstract input cannot be represented
2569 | exactly. However, if the abstract value is too large, the overflow and
2570 | inexact exceptions are raised and an infinity or maximal finite value is
2571 | returned. If the abstract value is too small, the input value is rounded to
2572 | a subnormal number, and the underflow and inexact exceptions are raised if
2573 | the abstract input cannot be represented exactly as a subnormal extended
2574 | double-precision floating-point number.
2575 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
2576 | number of bits as single or double precision, respectively. Otherwise, the
2577 | result is rounded to the full precision of the extended double-precision
2578 | format.
2579 | The input significand must be normalized or smaller. If the input
2580 | significand is not normalized, `zExp' must be 0; in that case, the result
2581 | returned is a subnormal number, and it must not require rounding. The
2582 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
2583 | Floating-Point Arithmetic.
2584 *----------------------------------------------------------------------------*/
2586 static floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
2587 int32_t zExp, uint64_t zSig0, uint64_t zSig1,
2588 float_status *status)
2590 int8_t roundingMode;
2591 flag roundNearestEven, increment, isTiny;
2592 int64_t roundIncrement, roundMask, roundBits;
2594 roundingMode = status->float_rounding_mode;
2595 roundNearestEven = ( roundingMode == float_round_nearest_even );
2596 if ( roundingPrecision == 80 ) goto precision80;
2597 if ( roundingPrecision == 64 ) {
2598 roundIncrement = LIT64( 0x0000000000000400 );
2599 roundMask = LIT64( 0x00000000000007FF );
2601 else if ( roundingPrecision == 32 ) {
2602 roundIncrement = LIT64( 0x0000008000000000 );
2603 roundMask = LIT64( 0x000000FFFFFFFFFF );
2605 else {
2606 goto precision80;
2608 zSig0 |= ( zSig1 != 0 );
2609 switch (roundingMode) {
2610 case float_round_nearest_even:
2611 case float_round_ties_away:
2612 break;
2613 case float_round_to_zero:
2614 roundIncrement = 0;
2615 break;
2616 case float_round_up:
2617 roundIncrement = zSign ? 0 : roundMask;
2618 break;
2619 case float_round_down:
2620 roundIncrement = zSign ? roundMask : 0;
2621 break;
2622 default:
2623 abort();
2625 roundBits = zSig0 & roundMask;
2626 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
2627 if ( ( 0x7FFE < zExp )
2628 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
2630 goto overflow;
2632 if ( zExp <= 0 ) {
2633 if (status->flush_to_zero) {
2634 float_raise(float_flag_output_denormal, status);
2635 return packFloatx80(zSign, 0, 0);
2637 isTiny =
2638 (status->float_detect_tininess
2639 == float_tininess_before_rounding)
2640 || ( zExp < 0 )
2641 || ( zSig0 <= zSig0 + roundIncrement );
2642 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
2643 zExp = 0;
2644 roundBits = zSig0 & roundMask;
2645 if (isTiny && roundBits) {
2646 float_raise(float_flag_underflow, status);
2648 if (roundBits) {
2649 status->float_exception_flags |= float_flag_inexact;
2651 zSig0 += roundIncrement;
2652 if ( (int64_t) zSig0 < 0 ) zExp = 1;
2653 roundIncrement = roundMask + 1;
2654 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
2655 roundMask |= roundIncrement;
2657 zSig0 &= ~ roundMask;
2658 return packFloatx80( zSign, zExp, zSig0 );
2661 if (roundBits) {
2662 status->float_exception_flags |= float_flag_inexact;
2664 zSig0 += roundIncrement;
2665 if ( zSig0 < roundIncrement ) {
2666 ++zExp;
2667 zSig0 = LIT64( 0x8000000000000000 );
2669 roundIncrement = roundMask + 1;
2670 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
2671 roundMask |= roundIncrement;
2673 zSig0 &= ~ roundMask;
2674 if ( zSig0 == 0 ) zExp = 0;
2675 return packFloatx80( zSign, zExp, zSig0 );
2676 precision80:
2677 switch (roundingMode) {
2678 case float_round_nearest_even:
2679 case float_round_ties_away:
2680 increment = ((int64_t)zSig1 < 0);
2681 break;
2682 case float_round_to_zero:
2683 increment = 0;
2684 break;
2685 case float_round_up:
2686 increment = !zSign && zSig1;
2687 break;
2688 case float_round_down:
2689 increment = zSign && zSig1;
2690 break;
2691 default:
2692 abort();
2694 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
2695 if ( ( 0x7FFE < zExp )
2696 || ( ( zExp == 0x7FFE )
2697 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
2698 && increment
2701 roundMask = 0;
2702 overflow:
2703 float_raise(float_flag_overflow | float_flag_inexact, status);
2704 if ( ( roundingMode == float_round_to_zero )
2705 || ( zSign && ( roundingMode == float_round_up ) )
2706 || ( ! zSign && ( roundingMode == float_round_down ) )
2708 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
2710 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2712 if ( zExp <= 0 ) {
2713 isTiny =
2714 (status->float_detect_tininess
2715 == float_tininess_before_rounding)
2716 || ( zExp < 0 )
2717 || ! increment
2718 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
2719 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
2720 zExp = 0;
2721 if (isTiny && zSig1) {
2722 float_raise(float_flag_underflow, status);
2724 if (zSig1) {
2725 status->float_exception_flags |= float_flag_inexact;
2727 switch (roundingMode) {
2728 case float_round_nearest_even:
2729 case float_round_ties_away:
2730 increment = ((int64_t)zSig1 < 0);
2731 break;
2732 case float_round_to_zero:
2733 increment = 0;
2734 break;
2735 case float_round_up:
2736 increment = !zSign && zSig1;
2737 break;
2738 case float_round_down:
2739 increment = zSign && zSig1;
2740 break;
2741 default:
2742 abort();
2744 if ( increment ) {
2745 ++zSig0;
2746 zSig0 &=
2747 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
2748 if ( (int64_t) zSig0 < 0 ) zExp = 1;
2750 return packFloatx80( zSign, zExp, zSig0 );
2753 if (zSig1) {
2754 status->float_exception_flags |= float_flag_inexact;
2756 if ( increment ) {
2757 ++zSig0;
2758 if ( zSig0 == 0 ) {
2759 ++zExp;
2760 zSig0 = LIT64( 0x8000000000000000 );
2762 else {
2763 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
2766 else {
2767 if ( zSig0 == 0 ) zExp = 0;
2769 return packFloatx80( zSign, zExp, zSig0 );
2773 /*----------------------------------------------------------------------------
2774 | Takes an abstract floating-point value having sign `zSign', exponent
2775 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
2776 | and returns the proper extended double-precision floating-point value
2777 | corresponding to the abstract input. This routine is just like
2778 | `roundAndPackFloatx80' except that the input significand does not have to be
2779 | normalized.
2780 *----------------------------------------------------------------------------*/
2782 static floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
2783 flag zSign, int32_t zExp,
2784 uint64_t zSig0, uint64_t zSig1,
2785 float_status *status)
2787 int8_t shiftCount;
2789 if ( zSig0 == 0 ) {
2790 zSig0 = zSig1;
2791 zSig1 = 0;
2792 zExp -= 64;
2794 shiftCount = countLeadingZeros64( zSig0 );
2795 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
2796 zExp -= shiftCount;
2797 return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
2798 zSig0, zSig1, status);
2802 /*----------------------------------------------------------------------------
2803 | Returns the least-significant 64 fraction bits of the quadruple-precision
2804 | floating-point value `a'.
2805 *----------------------------------------------------------------------------*/
2807 static inline uint64_t extractFloat128Frac1( float128 a )
2810 return a.low;
2814 /*----------------------------------------------------------------------------
2815 | Returns the most-significant 48 fraction bits of the quadruple-precision
2816 | floating-point value `a'.
2817 *----------------------------------------------------------------------------*/
2819 static inline uint64_t extractFloat128Frac0( float128 a )
2822 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
2826 /*----------------------------------------------------------------------------
2827 | Returns the exponent bits of the quadruple-precision floating-point value
2828 | `a'.
2829 *----------------------------------------------------------------------------*/
2831 static inline int32_t extractFloat128Exp( float128 a )
2834 return ( a.high>>48 ) & 0x7FFF;
2838 /*----------------------------------------------------------------------------
2839 | Returns the sign bit of the quadruple-precision floating-point value `a'.
2840 *----------------------------------------------------------------------------*/
2842 static inline flag extractFloat128Sign( float128 a )
2845 return a.high>>63;
2849 /*----------------------------------------------------------------------------
2850 | Normalizes the subnormal quadruple-precision floating-point value
2851 | represented by the denormalized significand formed by the concatenation of
2852 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
2853 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
2854 | significand are stored at the location pointed to by `zSig0Ptr', and the
2855 | least significant 64 bits of the normalized significand are stored at the
2856 | location pointed to by `zSig1Ptr'.
2857 *----------------------------------------------------------------------------*/
2859 static void
2860 normalizeFloat128Subnormal(
2861 uint64_t aSig0,
2862 uint64_t aSig1,
2863 int32_t *zExpPtr,
2864 uint64_t *zSig0Ptr,
2865 uint64_t *zSig1Ptr
2868 int8_t shiftCount;
2870 if ( aSig0 == 0 ) {
2871 shiftCount = countLeadingZeros64( aSig1 ) - 15;
2872 if ( shiftCount < 0 ) {
2873 *zSig0Ptr = aSig1>>( - shiftCount );
2874 *zSig1Ptr = aSig1<<( shiftCount & 63 );
2876 else {
2877 *zSig0Ptr = aSig1<<shiftCount;
2878 *zSig1Ptr = 0;
2880 *zExpPtr = - shiftCount - 63;
2882 else {
2883 shiftCount = countLeadingZeros64( aSig0 ) - 15;
2884 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
2885 *zExpPtr = 1 - shiftCount;
2890 /*----------------------------------------------------------------------------
2891 | Packs the sign `zSign', the exponent `zExp', and the significand formed
2892 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
2893 | floating-point value, returning the result. After being shifted into the
2894 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
2895 | added together to form the most significant 32 bits of the result. This
2896 | means that any integer portion of `zSig0' will be added into the exponent.
2897 | Since a properly normalized significand will have an integer portion equal
2898 | to 1, the `zExp' input should be 1 less than the desired result exponent
2899 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
2900 | significand.
2901 *----------------------------------------------------------------------------*/
2903 static inline float128
2904 packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
2906 float128 z;
2908 z.low = zSig1;
2909 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
2910 return z;
2914 /*----------------------------------------------------------------------------
2915 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2916 | and extended significand formed by the concatenation of `zSig0', `zSig1',
2917 | and `zSig2', and returns the proper quadruple-precision floating-point value
2918 | corresponding to the abstract input. Ordinarily, the abstract value is
2919 | simply rounded and packed into the quadruple-precision format, with the
2920 | inexact exception raised if the abstract input cannot be represented
2921 | exactly. However, if the abstract value is too large, the overflow and
2922 | inexact exceptions are raised and an infinity or maximal finite value is
2923 | returned. If the abstract value is too small, the input value is rounded to
2924 | a subnormal number, and the underflow and inexact exceptions are raised if
2925 | the abstract input cannot be represented exactly as a subnormal quadruple-
2926 | precision floating-point number.
2927 | The input significand must be normalized or smaller. If the input
2928 | significand is not normalized, `zExp' must be 0; in that case, the result
2929 | returned is a subnormal number, and it must not require rounding. In the
2930 | usual case that the input significand is normalized, `zExp' must be 1 less
2931 | than the ``true'' floating-point exponent. The handling of underflow and
2932 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2933 *----------------------------------------------------------------------------*/
2935 static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
2936 uint64_t zSig0, uint64_t zSig1,
2937 uint64_t zSig2, float_status *status)
2939 int8_t roundingMode;
2940 flag roundNearestEven, increment, isTiny;
2942 roundingMode = status->float_rounding_mode;
2943 roundNearestEven = ( roundingMode == float_round_nearest_even );
2944 switch (roundingMode) {
2945 case float_round_nearest_even:
2946 case float_round_ties_away:
2947 increment = ((int64_t)zSig2 < 0);
2948 break;
2949 case float_round_to_zero:
2950 increment = 0;
2951 break;
2952 case float_round_up:
2953 increment = !zSign && zSig2;
2954 break;
2955 case float_round_down:
2956 increment = zSign && zSig2;
2957 break;
2958 case float_round_to_odd:
2959 increment = !(zSig1 & 0x1) && zSig2;
2960 break;
2961 default:
2962 abort();
2964 if ( 0x7FFD <= (uint32_t) zExp ) {
2965 if ( ( 0x7FFD < zExp )
2966 || ( ( zExp == 0x7FFD )
2967 && eq128(
2968 LIT64( 0x0001FFFFFFFFFFFF ),
2969 LIT64( 0xFFFFFFFFFFFFFFFF ),
2970 zSig0,
2971 zSig1
2973 && increment
2976 float_raise(float_flag_overflow | float_flag_inexact, status);
2977 if ( ( roundingMode == float_round_to_zero )
2978 || ( zSign && ( roundingMode == float_round_up ) )
2979 || ( ! zSign && ( roundingMode == float_round_down ) )
2980 || (roundingMode == float_round_to_odd)
2982 return
2983 packFloat128(
2984 zSign,
2985 0x7FFE,
2986 LIT64( 0x0000FFFFFFFFFFFF ),
2987 LIT64( 0xFFFFFFFFFFFFFFFF )
2990 return packFloat128( zSign, 0x7FFF, 0, 0 );
2992 if ( zExp < 0 ) {
2993 if (status->flush_to_zero) {
2994 float_raise(float_flag_output_denormal, status);
2995 return packFloat128(zSign, 0, 0, 0);
2997 isTiny =
2998 (status->float_detect_tininess
2999 == float_tininess_before_rounding)
3000 || ( zExp < -1 )
3001 || ! increment
3002 || lt128(
3003 zSig0,
3004 zSig1,
3005 LIT64( 0x0001FFFFFFFFFFFF ),
3006 LIT64( 0xFFFFFFFFFFFFFFFF )
3008 shift128ExtraRightJamming(
3009 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
3010 zExp = 0;
3011 if (isTiny && zSig2) {
3012 float_raise(float_flag_underflow, status);
3014 switch (roundingMode) {
3015 case float_round_nearest_even:
3016 case float_round_ties_away:
3017 increment = ((int64_t)zSig2 < 0);
3018 break;
3019 case float_round_to_zero:
3020 increment = 0;
3021 break;
3022 case float_round_up:
3023 increment = !zSign && zSig2;
3024 break;
3025 case float_round_down:
3026 increment = zSign && zSig2;
3027 break;
3028 case float_round_to_odd:
3029 increment = !(zSig1 & 0x1) && zSig2;
3030 break;
3031 default:
3032 abort();
3036 if (zSig2) {
3037 status->float_exception_flags |= float_flag_inexact;
3039 if ( increment ) {
3040 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
3041 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
3043 else {
3044 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
3046 return packFloat128( zSign, zExp, zSig0, zSig1 );
3050 /*----------------------------------------------------------------------------
3051 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3052 | and significand formed by the concatenation of `zSig0' and `zSig1', and
3053 | returns the proper quadruple-precision floating-point value corresponding
3054 | to the abstract input. This routine is just like `roundAndPackFloat128'
3055 | except that the input significand has fewer bits and does not have to be
3056 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
3057 | point exponent.
3058 *----------------------------------------------------------------------------*/
3060 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
3061 uint64_t zSig0, uint64_t zSig1,
3062 float_status *status)
3064 int8_t shiftCount;
3065 uint64_t zSig2;
3067 if ( zSig0 == 0 ) {
3068 zSig0 = zSig1;
3069 zSig1 = 0;
3070 zExp -= 64;
3072 shiftCount = countLeadingZeros64( zSig0 ) - 15;
3073 if ( 0 <= shiftCount ) {
3074 zSig2 = 0;
3075 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3077 else {
3078 shift128ExtraRightJamming(
3079 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
3081 zExp -= shiftCount;
3082 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
3087 /*----------------------------------------------------------------------------
3088 | Returns the result of converting the 32-bit two's complement integer `a'
3089 | to the extended double-precision floating-point format. The conversion
3090 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3091 | Arithmetic.
3092 *----------------------------------------------------------------------------*/
3094 floatx80 int32_to_floatx80(int32_t a, float_status *status)
3096 flag zSign;
3097 uint32_t absA;
3098 int8_t shiftCount;
3099 uint64_t zSig;
3101 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3102 zSign = ( a < 0 );
3103 absA = zSign ? - a : a;
3104 shiftCount = countLeadingZeros32( absA ) + 32;
3105 zSig = absA;
3106 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
3110 /*----------------------------------------------------------------------------
3111 | Returns the result of converting the 32-bit two's complement integer `a' to
3112 | the quadruple-precision floating-point format. The conversion is performed
3113 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3114 *----------------------------------------------------------------------------*/
3116 float128 int32_to_float128(int32_t a, float_status *status)
3118 flag zSign;
3119 uint32_t absA;
3120 int8_t shiftCount;
3121 uint64_t zSig0;
3123 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3124 zSign = ( a < 0 );
3125 absA = zSign ? - a : a;
3126 shiftCount = countLeadingZeros32( absA ) + 17;
3127 zSig0 = absA;
3128 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
3132 /*----------------------------------------------------------------------------
3133 | Returns the result of converting the 64-bit two's complement integer `a'
3134 | to the extended double-precision floating-point format. The conversion
3135 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3136 | Arithmetic.
3137 *----------------------------------------------------------------------------*/
3139 floatx80 int64_to_floatx80(int64_t a, float_status *status)
3141 flag zSign;
3142 uint64_t absA;
3143 int8_t shiftCount;
3145 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3146 zSign = ( a < 0 );
3147 absA = zSign ? - a : a;
3148 shiftCount = countLeadingZeros64( absA );
3149 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
3153 /*----------------------------------------------------------------------------
3154 | Returns the result of converting the 64-bit two's complement integer `a' to
3155 | the quadruple-precision floating-point format. The conversion is performed
3156 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3157 *----------------------------------------------------------------------------*/
3159 float128 int64_to_float128(int64_t a, float_status *status)
3161 flag zSign;
3162 uint64_t absA;
3163 int8_t shiftCount;
3164 int32_t zExp;
3165 uint64_t zSig0, zSig1;
3167 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3168 zSign = ( a < 0 );
3169 absA = zSign ? - a : a;
3170 shiftCount = countLeadingZeros64( absA ) + 49;
3171 zExp = 0x406E - shiftCount;
3172 if ( 64 <= shiftCount ) {
3173 zSig1 = 0;
3174 zSig0 = absA;
3175 shiftCount -= 64;
3177 else {
3178 zSig1 = absA;
3179 zSig0 = 0;
3181 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3182 return packFloat128( zSign, zExp, zSig0, zSig1 );
3186 /*----------------------------------------------------------------------------
3187 | Returns the result of converting the 64-bit unsigned integer `a'
3188 | to the quadruple-precision floating-point format. The conversion is performed
3189 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3190 *----------------------------------------------------------------------------*/
3192 float128 uint64_to_float128(uint64_t a, float_status *status)
3194 if (a == 0) {
3195 return float128_zero;
3197 return normalizeRoundAndPackFloat128(0, 0x406E, a, 0, status);
3203 /*----------------------------------------------------------------------------
3204 | Returns the result of converting the single-precision floating-point value
3205 | `a' to the double-precision floating-point format. The conversion is
3206 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3207 | Arithmetic.
3208 *----------------------------------------------------------------------------*/
3210 float64 float32_to_float64(float32 a, float_status *status)
3212 flag aSign;
3213 int aExp;
3214 uint32_t aSig;
3215 a = float32_squash_input_denormal(a, status);
3217 aSig = extractFloat32Frac( a );
3218 aExp = extractFloat32Exp( a );
3219 aSign = extractFloat32Sign( a );
3220 if ( aExp == 0xFF ) {
3221 if (aSig) {
3222 return commonNaNToFloat64(float32ToCommonNaN(a, status), status);
3224 return packFloat64( aSign, 0x7FF, 0 );
3226 if ( aExp == 0 ) {
3227 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
3228 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3229 --aExp;
3231 return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
3235 /*----------------------------------------------------------------------------
3236 | Returns the result of converting the single-precision floating-point value
3237 | `a' to the extended double-precision floating-point format. The conversion
3238 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3239 | Arithmetic.
3240 *----------------------------------------------------------------------------*/
3242 floatx80 float32_to_floatx80(float32 a, float_status *status)
3244 flag aSign;
3245 int aExp;
3246 uint32_t aSig;
3248 a = float32_squash_input_denormal(a, status);
3249 aSig = extractFloat32Frac( a );
3250 aExp = extractFloat32Exp( a );
3251 aSign = extractFloat32Sign( a );
3252 if ( aExp == 0xFF ) {
3253 if (aSig) {
3254 return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
3256 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3258 if ( aExp == 0 ) {
3259 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3260 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3262 aSig |= 0x00800000;
3263 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
3267 /*----------------------------------------------------------------------------
3268 | Returns the result of converting the single-precision floating-point value
3269 | `a' to the double-precision floating-point format. The conversion is
3270 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3271 | Arithmetic.
3272 *----------------------------------------------------------------------------*/
3274 float128 float32_to_float128(float32 a, float_status *status)
3276 flag aSign;
3277 int aExp;
3278 uint32_t aSig;
3280 a = float32_squash_input_denormal(a, status);
3281 aSig = extractFloat32Frac( a );
3282 aExp = extractFloat32Exp( a );
3283 aSign = extractFloat32Sign( a );
3284 if ( aExp == 0xFF ) {
3285 if (aSig) {
3286 return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
3288 return packFloat128( aSign, 0x7FFF, 0, 0 );
3290 if ( aExp == 0 ) {
3291 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3292 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3293 --aExp;
3295 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
3299 /*----------------------------------------------------------------------------
3300 | Returns the remainder of the single-precision floating-point value `a'
3301 | with respect to the corresponding value `b'. The operation is performed
3302 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3303 *----------------------------------------------------------------------------*/
3305 float32 float32_rem(float32 a, float32 b, float_status *status)
3307 flag aSign, zSign;
3308 int aExp, bExp, expDiff;
3309 uint32_t aSig, bSig;
3310 uint32_t q;
3311 uint64_t aSig64, bSig64, q64;
3312 uint32_t alternateASig;
3313 int32_t sigMean;
3314 a = float32_squash_input_denormal(a, status);
3315 b = float32_squash_input_denormal(b, status);
3317 aSig = extractFloat32Frac( a );
3318 aExp = extractFloat32Exp( a );
3319 aSign = extractFloat32Sign( a );
3320 bSig = extractFloat32Frac( b );
3321 bExp = extractFloat32Exp( b );
3322 if ( aExp == 0xFF ) {
3323 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
3324 return propagateFloat32NaN(a, b, status);
3326 float_raise(float_flag_invalid, status);
3327 return float32_default_nan(status);
3329 if ( bExp == 0xFF ) {
3330 if (bSig) {
3331 return propagateFloat32NaN(a, b, status);
3333 return a;
3335 if ( bExp == 0 ) {
3336 if ( bSig == 0 ) {
3337 float_raise(float_flag_invalid, status);
3338 return float32_default_nan(status);
3340 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
3342 if ( aExp == 0 ) {
3343 if ( aSig == 0 ) return a;
3344 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3346 expDiff = aExp - bExp;
3347 aSig |= 0x00800000;
3348 bSig |= 0x00800000;
3349 if ( expDiff < 32 ) {
3350 aSig <<= 8;
3351 bSig <<= 8;
3352 if ( expDiff < 0 ) {
3353 if ( expDiff < -1 ) return a;
3354 aSig >>= 1;
3356 q = ( bSig <= aSig );
3357 if ( q ) aSig -= bSig;
3358 if ( 0 < expDiff ) {
3359 q = ( ( (uint64_t) aSig )<<32 ) / bSig;
3360 q >>= 32 - expDiff;
3361 bSig >>= 2;
3362 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3364 else {
3365 aSig >>= 2;
3366 bSig >>= 2;
3369 else {
3370 if ( bSig <= aSig ) aSig -= bSig;
3371 aSig64 = ( (uint64_t) aSig )<<40;
3372 bSig64 = ( (uint64_t) bSig )<<40;
3373 expDiff -= 64;
3374 while ( 0 < expDiff ) {
3375 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3376 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3377 aSig64 = - ( ( bSig * q64 )<<38 );
3378 expDiff -= 62;
3380 expDiff += 64;
3381 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3382 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3383 q = q64>>( 64 - expDiff );
3384 bSig <<= 6;
3385 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
3387 do {
3388 alternateASig = aSig;
3389 ++q;
3390 aSig -= bSig;
3391 } while ( 0 <= (int32_t) aSig );
3392 sigMean = aSig + alternateASig;
3393 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3394 aSig = alternateASig;
3396 zSign = ( (int32_t) aSig < 0 );
3397 if ( zSign ) aSig = - aSig;
3398 return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
3403 /*----------------------------------------------------------------------------
3404 | Returns the binary exponential of the single-precision floating-point value
3405 | `a'. The operation is performed according to the IEC/IEEE Standard for
3406 | Binary Floating-Point Arithmetic.
3408 | Uses the following identities:
3410 | 1. -------------------------------------------------------------------------
3411 | x x*ln(2)
3412 | 2 = e
3414 | 2. -------------------------------------------------------------------------
3415 | 2 3 4 5 n
3416 | x x x x x x x
3417 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
3418 | 1! 2! 3! 4! 5! n!
3419 *----------------------------------------------------------------------------*/
3421 static const float64 float32_exp2_coefficients[15] =
3423 const_float64( 0x3ff0000000000000ll ), /* 1 */
3424 const_float64( 0x3fe0000000000000ll ), /* 2 */
3425 const_float64( 0x3fc5555555555555ll ), /* 3 */
3426 const_float64( 0x3fa5555555555555ll ), /* 4 */
3427 const_float64( 0x3f81111111111111ll ), /* 5 */
3428 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */
3429 const_float64( 0x3f2a01a01a01a01all ), /* 7 */
3430 const_float64( 0x3efa01a01a01a01all ), /* 8 */
3431 const_float64( 0x3ec71de3a556c734ll ), /* 9 */
3432 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
3433 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
3434 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
3435 const_float64( 0x3de6124613a86d09ll ), /* 13 */
3436 const_float64( 0x3da93974a8c07c9dll ), /* 14 */
3437 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
3440 float32 float32_exp2(float32 a, float_status *status)
3442 flag aSign;
3443 int aExp;
3444 uint32_t aSig;
3445 float64 r, x, xn;
3446 int i;
3447 a = float32_squash_input_denormal(a, status);
3449 aSig = extractFloat32Frac( a );
3450 aExp = extractFloat32Exp( a );
3451 aSign = extractFloat32Sign( a );
3453 if ( aExp == 0xFF) {
3454 if (aSig) {
3455 return propagateFloat32NaN(a, float32_zero, status);
3457 return (aSign) ? float32_zero : a;
3459 if (aExp == 0) {
3460 if (aSig == 0) return float32_one;
3463 float_raise(float_flag_inexact, status);
3465 /* ******************************* */
3466 /* using float64 for approximation */
3467 /* ******************************* */
3468 x = float32_to_float64(a, status);
3469 x = float64_mul(x, float64_ln2, status);
3471 xn = x;
3472 r = float64_one;
3473 for (i = 0 ; i < 15 ; i++) {
3474 float64 f;
3476 f = float64_mul(xn, float32_exp2_coefficients[i], status);
3477 r = float64_add(r, f, status);
3479 xn = float64_mul(xn, x, status);
3482 return float64_to_float32(r, status);
3485 /*----------------------------------------------------------------------------
3486 | Returns the binary log of the single-precision floating-point value `a'.
3487 | The operation is performed according to the IEC/IEEE Standard for Binary
3488 | Floating-Point Arithmetic.
3489 *----------------------------------------------------------------------------*/
3490 float32 float32_log2(float32 a, float_status *status)
3492 flag aSign, zSign;
3493 int aExp;
3494 uint32_t aSig, zSig, i;
3496 a = float32_squash_input_denormal(a, status);
3497 aSig = extractFloat32Frac( a );
3498 aExp = extractFloat32Exp( a );
3499 aSign = extractFloat32Sign( a );
3501 if ( aExp == 0 ) {
3502 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
3503 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3505 if ( aSign ) {
3506 float_raise(float_flag_invalid, status);
3507 return float32_default_nan(status);
3509 if ( aExp == 0xFF ) {
3510 if (aSig) {
3511 return propagateFloat32NaN(a, float32_zero, status);
3513 return a;
3516 aExp -= 0x7F;
3517 aSig |= 0x00800000;
3518 zSign = aExp < 0;
3519 zSig = aExp << 23;
3521 for (i = 1 << 22; i > 0; i >>= 1) {
3522 aSig = ( (uint64_t)aSig * aSig ) >> 23;
3523 if ( aSig & 0x01000000 ) {
3524 aSig >>= 1;
3525 zSig |= i;
3529 if ( zSign )
3530 zSig = -zSig;
3532 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
3535 /*----------------------------------------------------------------------------
3536 | Returns 1 if the single-precision floating-point value `a' is equal to
3537 | the corresponding value `b', and 0 otherwise. The invalid exception is
3538 | raised if either operand is a NaN. Otherwise, the comparison is performed
3539 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3540 *----------------------------------------------------------------------------*/
3542 int float32_eq(float32 a, float32 b, float_status *status)
3544 uint32_t av, bv;
3545 a = float32_squash_input_denormal(a, status);
3546 b = float32_squash_input_denormal(b, status);
3548 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3549 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3551 float_raise(float_flag_invalid, status);
3552 return 0;
3554 av = float32_val(a);
3555 bv = float32_val(b);
3556 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3559 /*----------------------------------------------------------------------------
3560 | Returns 1 if the single-precision floating-point value `a' is less than
3561 | or equal to the corresponding value `b', and 0 otherwise. The invalid
3562 | exception is raised if either operand is a NaN. The comparison is performed
3563 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3564 *----------------------------------------------------------------------------*/
3566 int float32_le(float32 a, float32 b, float_status *status)
3568 flag aSign, bSign;
3569 uint32_t av, bv;
3570 a = float32_squash_input_denormal(a, status);
3571 b = float32_squash_input_denormal(b, status);
3573 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3574 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3576 float_raise(float_flag_invalid, status);
3577 return 0;
3579 aSign = extractFloat32Sign( a );
3580 bSign = extractFloat32Sign( b );
3581 av = float32_val(a);
3582 bv = float32_val(b);
3583 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3584 return ( av == bv ) || ( aSign ^ ( av < bv ) );
3588 /*----------------------------------------------------------------------------
3589 | Returns 1 if the single-precision floating-point value `a' is less than
3590 | the corresponding value `b', and 0 otherwise. The invalid exception is
3591 | raised if either operand is a NaN. The comparison is performed according
3592 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3593 *----------------------------------------------------------------------------*/
3595 int float32_lt(float32 a, float32 b, float_status *status)
3597 flag aSign, bSign;
3598 uint32_t av, bv;
3599 a = float32_squash_input_denormal(a, status);
3600 b = float32_squash_input_denormal(b, status);
3602 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3603 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3605 float_raise(float_flag_invalid, status);
3606 return 0;
3608 aSign = extractFloat32Sign( a );
3609 bSign = extractFloat32Sign( b );
3610 av = float32_val(a);
3611 bv = float32_val(b);
3612 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3613 return ( av != bv ) && ( aSign ^ ( av < bv ) );
3617 /*----------------------------------------------------------------------------
3618 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3619 | be compared, and 0 otherwise. The invalid exception is raised if either
3620 | operand is a NaN. The comparison is performed according to the IEC/IEEE
3621 | Standard for Binary Floating-Point Arithmetic.
3622 *----------------------------------------------------------------------------*/
3624 int float32_unordered(float32 a, float32 b, float_status *status)
3626 a = float32_squash_input_denormal(a, status);
3627 b = float32_squash_input_denormal(b, status);
3629 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3630 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3632 float_raise(float_flag_invalid, status);
3633 return 1;
3635 return 0;
3638 /*----------------------------------------------------------------------------
3639 | Returns 1 if the single-precision floating-point value `a' is equal to
3640 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3641 | exception. The comparison is performed according to the IEC/IEEE Standard
3642 | for Binary Floating-Point Arithmetic.
3643 *----------------------------------------------------------------------------*/
3645 int float32_eq_quiet(float32 a, float32 b, float_status *status)
3647 a = float32_squash_input_denormal(a, status);
3648 b = float32_squash_input_denormal(b, status);
3650 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3651 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3653 if (float32_is_signaling_nan(a, status)
3654 || float32_is_signaling_nan(b, status)) {
3655 float_raise(float_flag_invalid, status);
3657 return 0;
3659 return ( float32_val(a) == float32_val(b) ) ||
3660 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
3663 /*----------------------------------------------------------------------------
3664 | Returns 1 if the single-precision floating-point value `a' is less than or
3665 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3666 | cause an exception. Otherwise, the comparison is performed according to the
3667 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3668 *----------------------------------------------------------------------------*/
3670 int float32_le_quiet(float32 a, float32 b, float_status *status)
3672 flag aSign, bSign;
3673 uint32_t av, bv;
3674 a = float32_squash_input_denormal(a, status);
3675 b = float32_squash_input_denormal(b, status);
3677 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3678 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3680 if (float32_is_signaling_nan(a, status)
3681 || float32_is_signaling_nan(b, status)) {
3682 float_raise(float_flag_invalid, status);
3684 return 0;
3686 aSign = extractFloat32Sign( a );
3687 bSign = extractFloat32Sign( b );
3688 av = float32_val(a);
3689 bv = float32_val(b);
3690 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3691 return ( av == bv ) || ( aSign ^ ( av < bv ) );
3695 /*----------------------------------------------------------------------------
3696 | Returns 1 if the single-precision floating-point value `a' is less than
3697 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3698 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
3699 | Standard for Binary Floating-Point Arithmetic.
3700 *----------------------------------------------------------------------------*/
3702 int float32_lt_quiet(float32 a, float32 b, float_status *status)
3704 flag aSign, bSign;
3705 uint32_t av, bv;
3706 a = float32_squash_input_denormal(a, status);
3707 b = float32_squash_input_denormal(b, status);
3709 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3710 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3712 if (float32_is_signaling_nan(a, status)
3713 || float32_is_signaling_nan(b, status)) {
3714 float_raise(float_flag_invalid, status);
3716 return 0;
3718 aSign = extractFloat32Sign( a );
3719 bSign = extractFloat32Sign( b );
3720 av = float32_val(a);
3721 bv = float32_val(b);
3722 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3723 return ( av != bv ) && ( aSign ^ ( av < bv ) );
3727 /*----------------------------------------------------------------------------
3728 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3729 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
3730 | comparison is performed according to the IEC/IEEE Standard for Binary
3731 | Floating-Point Arithmetic.
3732 *----------------------------------------------------------------------------*/
3734 int float32_unordered_quiet(float32 a, float32 b, float_status *status)
3736 a = float32_squash_input_denormal(a, status);
3737 b = float32_squash_input_denormal(b, status);
3739 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3740 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3742 if (float32_is_signaling_nan(a, status)
3743 || float32_is_signaling_nan(b, status)) {
3744 float_raise(float_flag_invalid, status);
3746 return 1;
3748 return 0;
3752 /*----------------------------------------------------------------------------
3753 | Returns the result of converting the double-precision floating-point value
3754 | `a' to the single-precision floating-point format. The conversion is
3755 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3756 | Arithmetic.
3757 *----------------------------------------------------------------------------*/
3759 float32 float64_to_float32(float64 a, float_status *status)
3761 flag aSign;
3762 int aExp;
3763 uint64_t aSig;
3764 uint32_t zSig;
3765 a = float64_squash_input_denormal(a, status);
3767 aSig = extractFloat64Frac( a );
3768 aExp = extractFloat64Exp( a );
3769 aSign = extractFloat64Sign( a );
3770 if ( aExp == 0x7FF ) {
3771 if (aSig) {
3772 return commonNaNToFloat32(float64ToCommonNaN(a, status), status);
3774 return packFloat32( aSign, 0xFF, 0 );
3776 shift64RightJamming( aSig, 22, &aSig );
3777 zSig = aSig;
3778 if ( aExp || zSig ) {
3779 zSig |= 0x40000000;
3780 aExp -= 0x381;
3782 return roundAndPackFloat32(aSign, aExp, zSig, status);
3787 /*----------------------------------------------------------------------------
3788 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3789 | half-precision floating-point value, returning the result. After being
3790 | shifted into the proper positions, the three fields are simply added
3791 | together to form the result. This means that any integer portion of `zSig'
3792 | will be added into the exponent. Since a properly normalized significand
3793 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3794 | than the desired result exponent whenever `zSig' is a complete, normalized
3795 | significand.
3796 *----------------------------------------------------------------------------*/
3797 static float16 packFloat16(flag zSign, int zExp, uint16_t zSig)
3799 return make_float16(
3800 (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
3803 /*----------------------------------------------------------------------------
3804 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3805 | and significand `zSig', and returns the proper half-precision floating-
3806 | point value corresponding to the abstract input. Ordinarily, the abstract
3807 | value is simply rounded and packed into the half-precision format, with
3808 | the inexact exception raised if the abstract input cannot be represented
3809 | exactly. However, if the abstract value is too large, the overflow and
3810 | inexact exceptions are raised and an infinity or maximal finite value is
3811 | returned. If the abstract value is too small, the input value is rounded to
3812 | a subnormal number, and the underflow and inexact exceptions are raised if
3813 | the abstract input cannot be represented exactly as a subnormal half-
3814 | precision floating-point number.
3815 | The `ieee' flag indicates whether to use IEEE standard half precision, or
3816 | ARM-style "alternative representation", which omits the NaN and Inf
3817 | encodings in order to raise the maximum representable exponent by one.
3818 | The input significand `zSig' has its binary point between bits 22
3819 | and 23, which is 13 bits to the left of the usual location. This shifted
3820 | significand must be normalized or smaller. If `zSig' is not normalized,
3821 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3822 | and it must not require rounding. In the usual case that `zSig' is
3823 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3824 | Note the slightly odd position of the binary point in zSig compared with the
3825 | other roundAndPackFloat functions. This should probably be fixed if we
3826 | need to implement more float16 routines than just conversion.
3827 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3828 | Binary Floating-Point Arithmetic.
3829 *----------------------------------------------------------------------------*/
3831 static float16 roundAndPackFloat16(flag zSign, int zExp,
3832 uint32_t zSig, flag ieee,
3833 float_status *status)
3835 int maxexp = ieee ? 29 : 30;
3836 uint32_t mask;
3837 uint32_t increment;
3838 bool rounding_bumps_exp;
3839 bool is_tiny = false;
3841 /* Calculate the mask of bits of the mantissa which are not
3842 * representable in half-precision and will be lost.
3844 if (zExp < 1) {
3845 /* Will be denormal in halfprec */
3846 mask = 0x00ffffff;
3847 if (zExp >= -11) {
3848 mask >>= 11 + zExp;
3850 } else {
3851 /* Normal number in halfprec */
3852 mask = 0x00001fff;
3855 switch (status->float_rounding_mode) {
3856 case float_round_nearest_even:
3857 increment = (mask + 1) >> 1;
3858 if ((zSig & mask) == increment) {
3859 increment = zSig & (increment << 1);
3861 break;
3862 case float_round_ties_away:
3863 increment = (mask + 1) >> 1;
3864 break;
3865 case float_round_up:
3866 increment = zSign ? 0 : mask;
3867 break;
3868 case float_round_down:
3869 increment = zSign ? mask : 0;
3870 break;
3871 default: /* round_to_zero */
3872 increment = 0;
3873 break;
3876 rounding_bumps_exp = (zSig + increment >= 0x01000000);
3878 if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) {
3879 if (ieee) {
3880 float_raise(float_flag_overflow | float_flag_inexact, status);
3881 return packFloat16(zSign, 0x1f, 0);
3882 } else {
3883 float_raise(float_flag_invalid, status);
3884 return packFloat16(zSign, 0x1f, 0x3ff);
3888 if (zExp < 0) {
3889 /* Note that flush-to-zero does not affect half-precision results */
3890 is_tiny =
3891 (status->float_detect_tininess == float_tininess_before_rounding)
3892 || (zExp < -1)
3893 || (!rounding_bumps_exp);
3895 if (zSig & mask) {
3896 float_raise(float_flag_inexact, status);
3897 if (is_tiny) {
3898 float_raise(float_flag_underflow, status);
3902 zSig += increment;
3903 if (rounding_bumps_exp) {
3904 zSig >>= 1;
3905 zExp++;
3908 if (zExp < -10) {
3909 return packFloat16(zSign, 0, 0);
3911 if (zExp < 0) {
3912 zSig >>= -zExp;
3913 zExp = 0;
3915 return packFloat16(zSign, zExp, zSig >> 13);
3918 /*----------------------------------------------------------------------------
3919 | If `a' is denormal and we are in flush-to-zero mode then set the
3920 | input-denormal exception and return zero. Otherwise just return the value.
3921 *----------------------------------------------------------------------------*/
3922 float16 float16_squash_input_denormal(float16 a, float_status *status)
3924 if (status->flush_inputs_to_zero) {
3925 if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) {
3926 float_raise(float_flag_input_denormal, status);
3927 return make_float16(float16_val(a) & 0x8000);
3930 return a;
3933 static void normalizeFloat16Subnormal(uint32_t aSig, int *zExpPtr,
3934 uint32_t *zSigPtr)
3936 int8_t shiftCount = countLeadingZeros32(aSig) - 21;
3937 *zSigPtr = aSig << shiftCount;
3938 *zExpPtr = 1 - shiftCount;
3941 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
3942 The latter gains extra exponent range by omitting the NaN/Inf encodings. */
3944 float32 float16_to_float32(float16 a, flag ieee, float_status *status)
3946 flag aSign;
3947 int aExp;
3948 uint32_t aSig;
3950 aSign = extractFloat16Sign(a);
3951 aExp = extractFloat16Exp(a);
3952 aSig = extractFloat16Frac(a);
3954 if (aExp == 0x1f && ieee) {
3955 if (aSig) {
3956 return commonNaNToFloat32(float16ToCommonNaN(a, status), status);
3958 return packFloat32(aSign, 0xff, 0);
3960 if (aExp == 0) {
3961 if (aSig == 0) {
3962 return packFloat32(aSign, 0, 0);
3965 normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3966 aExp--;
3968 return packFloat32( aSign, aExp + 0x70, aSig << 13);
3971 float16 float32_to_float16(float32 a, flag ieee, float_status *status)
3973 flag aSign;
3974 int aExp;
3975 uint32_t aSig;
3977 a = float32_squash_input_denormal(a, status);
3979 aSig = extractFloat32Frac( a );
3980 aExp = extractFloat32Exp( a );
3981 aSign = extractFloat32Sign( a );
3982 if ( aExp == 0xFF ) {
3983 if (aSig) {
3984 /* Input is a NaN */
3985 if (!ieee) {
3986 float_raise(float_flag_invalid, status);
3987 return packFloat16(aSign, 0, 0);
3989 return commonNaNToFloat16(
3990 float32ToCommonNaN(a, status), status);
3992 /* Infinity */
3993 if (!ieee) {
3994 float_raise(float_flag_invalid, status);
3995 return packFloat16(aSign, 0x1f, 0x3ff);
3997 return packFloat16(aSign, 0x1f, 0);
3999 if (aExp == 0 && aSig == 0) {
4000 return packFloat16(aSign, 0, 0);
4002 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
4003 * even if the input is denormal; however this is harmless because
4004 * the largest possible single-precision denormal is still smaller
4005 * than the smallest representable half-precision denormal, and so we
4006 * will end up ignoring aSig and returning via the "always return zero"
4007 * codepath.
4009 aSig |= 0x00800000;
4010 aExp -= 0x71;
4012 return roundAndPackFloat16(aSign, aExp, aSig, ieee, status);
4015 float64 float16_to_float64(float16 a, flag ieee, float_status *status)
4017 flag aSign;
4018 int aExp;
4019 uint32_t aSig;
4021 aSign = extractFloat16Sign(a);
4022 aExp = extractFloat16Exp(a);
4023 aSig = extractFloat16Frac(a);
4025 if (aExp == 0x1f && ieee) {
4026 if (aSig) {
4027 return commonNaNToFloat64(
4028 float16ToCommonNaN(a, status), status);
4030 return packFloat64(aSign, 0x7ff, 0);
4032 if (aExp == 0) {
4033 if (aSig == 0) {
4034 return packFloat64(aSign, 0, 0);
4037 normalizeFloat16Subnormal(aSig, &aExp, &aSig);
4038 aExp--;
4040 return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42);
4043 float16 float64_to_float16(float64 a, flag ieee, float_status *status)
4045 flag aSign;
4046 int aExp;
4047 uint64_t aSig;
4048 uint32_t zSig;
4050 a = float64_squash_input_denormal(a, status);
4052 aSig = extractFloat64Frac(a);
4053 aExp = extractFloat64Exp(a);
4054 aSign = extractFloat64Sign(a);
4055 if (aExp == 0x7FF) {
4056 if (aSig) {
4057 /* Input is a NaN */
4058 if (!ieee) {
4059 float_raise(float_flag_invalid, status);
4060 return packFloat16(aSign, 0, 0);
4062 return commonNaNToFloat16(
4063 float64ToCommonNaN(a, status), status);
4065 /* Infinity */
4066 if (!ieee) {
4067 float_raise(float_flag_invalid, status);
4068 return packFloat16(aSign, 0x1f, 0x3ff);
4070 return packFloat16(aSign, 0x1f, 0);
4072 shift64RightJamming(aSig, 29, &aSig);
4073 zSig = aSig;
4074 if (aExp == 0 && zSig == 0) {
4075 return packFloat16(aSign, 0, 0);
4077 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
4078 * even if the input is denormal; however this is harmless because
4079 * the largest possible single-precision denormal is still smaller
4080 * than the smallest representable half-precision denormal, and so we
4081 * will end up ignoring aSig and returning via the "always return zero"
4082 * codepath.
4084 zSig |= 0x00800000;
4085 aExp -= 0x3F1;
4087 return roundAndPackFloat16(aSign, aExp, zSig, ieee, status);
4090 /*----------------------------------------------------------------------------
4091 | Returns the result of converting the double-precision floating-point value
4092 | `a' to the extended double-precision floating-point format. The conversion
4093 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4094 | Arithmetic.
4095 *----------------------------------------------------------------------------*/
4097 floatx80 float64_to_floatx80(float64 a, float_status *status)
4099 flag aSign;
4100 int aExp;
4101 uint64_t aSig;
4103 a = float64_squash_input_denormal(a, status);
4104 aSig = extractFloat64Frac( a );
4105 aExp = extractFloat64Exp( a );
4106 aSign = extractFloat64Sign( a );
4107 if ( aExp == 0x7FF ) {
4108 if (aSig) {
4109 return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
4111 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4113 if ( aExp == 0 ) {
4114 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4115 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4117 return
4118 packFloatx80(
4119 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
4123 /*----------------------------------------------------------------------------
4124 | Returns the result of converting the double-precision floating-point value
4125 | `a' to the quadruple-precision floating-point format. The conversion is
4126 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4127 | Arithmetic.
4128 *----------------------------------------------------------------------------*/
4130 float128 float64_to_float128(float64 a, float_status *status)
4132 flag aSign;
4133 int aExp;
4134 uint64_t aSig, zSig0, zSig1;
4136 a = float64_squash_input_denormal(a, status);
4137 aSig = extractFloat64Frac( a );
4138 aExp = extractFloat64Exp( a );
4139 aSign = extractFloat64Sign( a );
4140 if ( aExp == 0x7FF ) {
4141 if (aSig) {
4142 return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
4144 return packFloat128( aSign, 0x7FFF, 0, 0 );
4146 if ( aExp == 0 ) {
4147 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4148 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4149 --aExp;
4151 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
4152 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
4157 /*----------------------------------------------------------------------------
4158 | Returns the remainder of the double-precision floating-point value `a'
4159 | with respect to the corresponding value `b'. The operation is performed
4160 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4161 *----------------------------------------------------------------------------*/
4163 float64 float64_rem(float64 a, float64 b, float_status *status)
4165 flag aSign, zSign;
4166 int aExp, bExp, expDiff;
4167 uint64_t aSig, bSig;
4168 uint64_t q, alternateASig;
4169 int64_t sigMean;
4171 a = float64_squash_input_denormal(a, status);
4172 b = float64_squash_input_denormal(b, status);
4173 aSig = extractFloat64Frac( a );
4174 aExp = extractFloat64Exp( a );
4175 aSign = extractFloat64Sign( a );
4176 bSig = extractFloat64Frac( b );
4177 bExp = extractFloat64Exp( b );
4178 if ( aExp == 0x7FF ) {
4179 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4180 return propagateFloat64NaN(a, b, status);
4182 float_raise(float_flag_invalid, status);
4183 return float64_default_nan(status);
4185 if ( bExp == 0x7FF ) {
4186 if (bSig) {
4187 return propagateFloat64NaN(a, b, status);
4189 return a;
4191 if ( bExp == 0 ) {
4192 if ( bSig == 0 ) {
4193 float_raise(float_flag_invalid, status);
4194 return float64_default_nan(status);
4196 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4198 if ( aExp == 0 ) {
4199 if ( aSig == 0 ) return a;
4200 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4202 expDiff = aExp - bExp;
4203 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
4204 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4205 if ( expDiff < 0 ) {
4206 if ( expDiff < -1 ) return a;
4207 aSig >>= 1;
4209 q = ( bSig <= aSig );
4210 if ( q ) aSig -= bSig;
4211 expDiff -= 64;
4212 while ( 0 < expDiff ) {
4213 q = estimateDiv128To64( aSig, 0, bSig );
4214 q = ( 2 < q ) ? q - 2 : 0;
4215 aSig = - ( ( bSig>>2 ) * q );
4216 expDiff -= 62;
4218 expDiff += 64;
4219 if ( 0 < expDiff ) {
4220 q = estimateDiv128To64( aSig, 0, bSig );
4221 q = ( 2 < q ) ? q - 2 : 0;
4222 q >>= 64 - expDiff;
4223 bSig >>= 2;
4224 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4226 else {
4227 aSig >>= 2;
4228 bSig >>= 2;
4230 do {
4231 alternateASig = aSig;
4232 ++q;
4233 aSig -= bSig;
4234 } while ( 0 <= (int64_t) aSig );
4235 sigMean = aSig + alternateASig;
4236 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4237 aSig = alternateASig;
4239 zSign = ( (int64_t) aSig < 0 );
4240 if ( zSign ) aSig = - aSig;
4241 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
4245 /*----------------------------------------------------------------------------
4246 | Returns the binary log of the double-precision floating-point value `a'.
4247 | The operation is performed according to the IEC/IEEE Standard for Binary
4248 | Floating-Point Arithmetic.
4249 *----------------------------------------------------------------------------*/
4250 float64 float64_log2(float64 a, float_status *status)
4252 flag aSign, zSign;
4253 int aExp;
4254 uint64_t aSig, aSig0, aSig1, zSig, i;
4255 a = float64_squash_input_denormal(a, status);
4257 aSig = extractFloat64Frac( a );
4258 aExp = extractFloat64Exp( a );
4259 aSign = extractFloat64Sign( a );
4261 if ( aExp == 0 ) {
4262 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4263 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4265 if ( aSign ) {
4266 float_raise(float_flag_invalid, status);
4267 return float64_default_nan(status);
4269 if ( aExp == 0x7FF ) {
4270 if (aSig) {
4271 return propagateFloat64NaN(a, float64_zero, status);
4273 return a;
4276 aExp -= 0x3FF;
4277 aSig |= LIT64( 0x0010000000000000 );
4278 zSign = aExp < 0;
4279 zSig = (uint64_t)aExp << 52;
4280 for (i = 1LL << 51; i > 0; i >>= 1) {
4281 mul64To128( aSig, aSig, &aSig0, &aSig1 );
4282 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4283 if ( aSig & LIT64( 0x0020000000000000 ) ) {
4284 aSig >>= 1;
4285 zSig |= i;
4289 if ( zSign )
4290 zSig = -zSig;
4291 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
4294 /*----------------------------------------------------------------------------
4295 | Returns 1 if the double-precision floating-point value `a' is equal to the
4296 | corresponding value `b', and 0 otherwise. The invalid exception is raised
4297 | if either operand is a NaN. Otherwise, the comparison is performed
4298 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4299 *----------------------------------------------------------------------------*/
4301 int float64_eq(float64 a, float64 b, float_status *status)
4303 uint64_t av, bv;
4304 a = float64_squash_input_denormal(a, status);
4305 b = float64_squash_input_denormal(b, status);
4307 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4308 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4310 float_raise(float_flag_invalid, status);
4311 return 0;
4313 av = float64_val(a);
4314 bv = float64_val(b);
4315 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4319 /*----------------------------------------------------------------------------
4320 | Returns 1 if the double-precision floating-point value `a' is less than or
4321 | equal to the corresponding value `b', and 0 otherwise. The invalid
4322 | exception is raised if either operand is a NaN. The comparison is performed
4323 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4324 *----------------------------------------------------------------------------*/
4326 int float64_le(float64 a, float64 b, float_status *status)
4328 flag aSign, bSign;
4329 uint64_t av, bv;
4330 a = float64_squash_input_denormal(a, status);
4331 b = float64_squash_input_denormal(b, status);
4333 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4334 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4336 float_raise(float_flag_invalid, status);
4337 return 0;
4339 aSign = extractFloat64Sign( a );
4340 bSign = extractFloat64Sign( b );
4341 av = float64_val(a);
4342 bv = float64_val(b);
4343 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4344 return ( av == bv ) || ( aSign ^ ( av < bv ) );
4348 /*----------------------------------------------------------------------------
4349 | Returns 1 if the double-precision floating-point value `a' is less than
4350 | the corresponding value `b', and 0 otherwise. The invalid exception is
4351 | raised if either operand is a NaN. The comparison is performed according
4352 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4353 *----------------------------------------------------------------------------*/
4355 int float64_lt(float64 a, float64 b, float_status *status)
4357 flag aSign, bSign;
4358 uint64_t av, bv;
4360 a = float64_squash_input_denormal(a, status);
4361 b = float64_squash_input_denormal(b, status);
4362 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4363 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4365 float_raise(float_flag_invalid, status);
4366 return 0;
4368 aSign = extractFloat64Sign( a );
4369 bSign = extractFloat64Sign( b );
4370 av = float64_val(a);
4371 bv = float64_val(b);
4372 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4373 return ( av != bv ) && ( aSign ^ ( av < bv ) );
4377 /*----------------------------------------------------------------------------
4378 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4379 | be compared, and 0 otherwise. The invalid exception is raised if either
4380 | operand is a NaN. The comparison is performed according to the IEC/IEEE
4381 | Standard for Binary Floating-Point Arithmetic.
4382 *----------------------------------------------------------------------------*/
4384 int float64_unordered(float64 a, float64 b, float_status *status)
4386 a = float64_squash_input_denormal(a, status);
4387 b = float64_squash_input_denormal(b, status);
4389 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4390 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4392 float_raise(float_flag_invalid, status);
4393 return 1;
4395 return 0;
4398 /*----------------------------------------------------------------------------
4399 | Returns 1 if the double-precision floating-point value `a' is equal to the
4400 | corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4401 | exception.The comparison is performed according to the IEC/IEEE Standard
4402 | for Binary Floating-Point Arithmetic.
4403 *----------------------------------------------------------------------------*/
4405 int float64_eq_quiet(float64 a, float64 b, float_status *status)
4407 uint64_t av, bv;
4408 a = float64_squash_input_denormal(a, status);
4409 b = float64_squash_input_denormal(b, status);
4411 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4412 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4414 if (float64_is_signaling_nan(a, status)
4415 || float64_is_signaling_nan(b, status)) {
4416 float_raise(float_flag_invalid, status);
4418 return 0;
4420 av = float64_val(a);
4421 bv = float64_val(b);
4422 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4426 /*----------------------------------------------------------------------------
4427 | Returns 1 if the double-precision floating-point value `a' is less than or
4428 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4429 | cause an exception. Otherwise, the comparison is performed according to the
4430 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4431 *----------------------------------------------------------------------------*/
4433 int float64_le_quiet(float64 a, float64 b, float_status *status)
4435 flag aSign, bSign;
4436 uint64_t av, bv;
4437 a = float64_squash_input_denormal(a, status);
4438 b = float64_squash_input_denormal(b, status);
4440 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4441 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4443 if (float64_is_signaling_nan(a, status)
4444 || float64_is_signaling_nan(b, status)) {
4445 float_raise(float_flag_invalid, status);
4447 return 0;
4449 aSign = extractFloat64Sign( a );
4450 bSign = extractFloat64Sign( b );
4451 av = float64_val(a);
4452 bv = float64_val(b);
4453 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4454 return ( av == bv ) || ( aSign ^ ( av < bv ) );
4458 /*----------------------------------------------------------------------------
4459 | Returns 1 if the double-precision floating-point value `a' is less than
4460 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4461 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
4462 | Standard for Binary Floating-Point Arithmetic.
4463 *----------------------------------------------------------------------------*/
4465 int float64_lt_quiet(float64 a, float64 b, float_status *status)
4467 flag aSign, bSign;
4468 uint64_t av, bv;
4469 a = float64_squash_input_denormal(a, status);
4470 b = float64_squash_input_denormal(b, status);
4472 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4473 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4475 if (float64_is_signaling_nan(a, status)
4476 || float64_is_signaling_nan(b, status)) {
4477 float_raise(float_flag_invalid, status);
4479 return 0;
4481 aSign = extractFloat64Sign( a );
4482 bSign = extractFloat64Sign( b );
4483 av = float64_val(a);
4484 bv = float64_val(b);
4485 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4486 return ( av != bv ) && ( aSign ^ ( av < bv ) );
4490 /*----------------------------------------------------------------------------
4491 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4492 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
4493 | comparison is performed according to the IEC/IEEE Standard for Binary
4494 | Floating-Point Arithmetic.
4495 *----------------------------------------------------------------------------*/
4497 int float64_unordered_quiet(float64 a, float64 b, float_status *status)
4499 a = float64_squash_input_denormal(a, status);
4500 b = float64_squash_input_denormal(b, status);
4502 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4503 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4505 if (float64_is_signaling_nan(a, status)
4506 || float64_is_signaling_nan(b, status)) {
4507 float_raise(float_flag_invalid, status);
4509 return 1;
4511 return 0;
4514 /*----------------------------------------------------------------------------
4515 | Returns the result of converting the extended double-precision floating-
4516 | point value `a' to the 32-bit two's complement integer format. The
4517 | conversion is performed according to the IEC/IEEE Standard for Binary
4518 | Floating-Point Arithmetic---which means in particular that the conversion
4519 | is rounded according to the current rounding mode. If `a' is a NaN, the
4520 | largest positive integer is returned. Otherwise, if the conversion
4521 | overflows, the largest integer with the same sign as `a' is returned.
4522 *----------------------------------------------------------------------------*/
4524 int32_t floatx80_to_int32(floatx80 a, float_status *status)
4526 flag aSign;
4527 int32_t aExp, shiftCount;
4528 uint64_t aSig;
4530 if (floatx80_invalid_encoding(a)) {
4531 float_raise(float_flag_invalid, status);
4532 return 1 << 31;
4534 aSig = extractFloatx80Frac( a );
4535 aExp = extractFloatx80Exp( a );
4536 aSign = extractFloatx80Sign( a );
4537 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4538 shiftCount = 0x4037 - aExp;
4539 if ( shiftCount <= 0 ) shiftCount = 1;
4540 shift64RightJamming( aSig, shiftCount, &aSig );
4541 return roundAndPackInt32(aSign, aSig, status);
4545 /*----------------------------------------------------------------------------
4546 | Returns the result of converting the extended double-precision floating-
4547 | point value `a' to the 32-bit two's complement integer format. The
4548 | conversion is performed according to the IEC/IEEE Standard for Binary
4549 | Floating-Point Arithmetic, except that the conversion is always rounded
4550 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4551 | Otherwise, if the conversion overflows, the largest integer with the same
4552 | sign as `a' is returned.
4553 *----------------------------------------------------------------------------*/
4555 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
4557 flag aSign;
4558 int32_t aExp, shiftCount;
4559 uint64_t aSig, savedASig;
4560 int32_t z;
4562 if (floatx80_invalid_encoding(a)) {
4563 float_raise(float_flag_invalid, status);
4564 return 1 << 31;
4566 aSig = extractFloatx80Frac( a );
4567 aExp = extractFloatx80Exp( a );
4568 aSign = extractFloatx80Sign( a );
4569 if ( 0x401E < aExp ) {
4570 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4571 goto invalid;
4573 else if ( aExp < 0x3FFF ) {
4574 if (aExp || aSig) {
4575 status->float_exception_flags |= float_flag_inexact;
4577 return 0;
4579 shiftCount = 0x403E - aExp;
4580 savedASig = aSig;
4581 aSig >>= shiftCount;
4582 z = aSig;
4583 if ( aSign ) z = - z;
4584 if ( ( z < 0 ) ^ aSign ) {
4585 invalid:
4586 float_raise(float_flag_invalid, status);
4587 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4589 if ( ( aSig<<shiftCount ) != savedASig ) {
4590 status->float_exception_flags |= float_flag_inexact;
4592 return z;
4596 /*----------------------------------------------------------------------------
4597 | Returns the result of converting the extended double-precision floating-
4598 | point value `a' to the 64-bit two's complement integer format. The
4599 | conversion is performed according to the IEC/IEEE Standard for Binary
4600 | Floating-Point Arithmetic---which means in particular that the conversion
4601 | is rounded according to the current rounding mode. If `a' is a NaN,
4602 | the largest positive integer is returned. Otherwise, if the conversion
4603 | overflows, the largest integer with the same sign as `a' is returned.
4604 *----------------------------------------------------------------------------*/
4606 int64_t floatx80_to_int64(floatx80 a, float_status *status)
4608 flag aSign;
4609 int32_t aExp, shiftCount;
4610 uint64_t aSig, aSigExtra;
4612 if (floatx80_invalid_encoding(a)) {
4613 float_raise(float_flag_invalid, status);
4614 return 1ULL << 63;
4616 aSig = extractFloatx80Frac( a );
4617 aExp = extractFloatx80Exp( a );
4618 aSign = extractFloatx80Sign( a );
4619 shiftCount = 0x403E - aExp;
4620 if ( shiftCount <= 0 ) {
4621 if ( shiftCount ) {
4622 float_raise(float_flag_invalid, status);
4623 if ( ! aSign
4624 || ( ( aExp == 0x7FFF )
4625 && ( aSig != LIT64( 0x8000000000000000 ) ) )
4627 return LIT64( 0x7FFFFFFFFFFFFFFF );
4629 return (int64_t) LIT64( 0x8000000000000000 );
4631 aSigExtra = 0;
4633 else {
4634 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4636 return roundAndPackInt64(aSign, aSig, aSigExtra, status);
4640 /*----------------------------------------------------------------------------
4641 | Returns the result of converting the extended double-precision floating-
4642 | point value `a' to the 64-bit two's complement integer format. The
4643 | conversion is performed according to the IEC/IEEE Standard for Binary
4644 | Floating-Point Arithmetic, except that the conversion is always rounded
4645 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4646 | Otherwise, if the conversion overflows, the largest integer with the same
4647 | sign as `a' is returned.
4648 *----------------------------------------------------------------------------*/
4650 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
4652 flag aSign;
4653 int32_t aExp, shiftCount;
4654 uint64_t aSig;
4655 int64_t z;
4657 if (floatx80_invalid_encoding(a)) {
4658 float_raise(float_flag_invalid, status);
4659 return 1ULL << 63;
4661 aSig = extractFloatx80Frac( a );
4662 aExp = extractFloatx80Exp( a );
4663 aSign = extractFloatx80Sign( a );
4664 shiftCount = aExp - 0x403E;
4665 if ( 0 <= shiftCount ) {
4666 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4667 if ( ( a.high != 0xC03E ) || aSig ) {
4668 float_raise(float_flag_invalid, status);
4669 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4670 return LIT64( 0x7FFFFFFFFFFFFFFF );
4673 return (int64_t) LIT64( 0x8000000000000000 );
4675 else if ( aExp < 0x3FFF ) {
4676 if (aExp | aSig) {
4677 status->float_exception_flags |= float_flag_inexact;
4679 return 0;
4681 z = aSig>>( - shiftCount );
4682 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4683 status->float_exception_flags |= float_flag_inexact;
4685 if ( aSign ) z = - z;
4686 return z;
4690 /*----------------------------------------------------------------------------
4691 | Returns the result of converting the extended double-precision floating-
4692 | point value `a' to the single-precision floating-point format. The
4693 | conversion is performed according to the IEC/IEEE Standard for Binary
4694 | Floating-Point Arithmetic.
4695 *----------------------------------------------------------------------------*/
4697 float32 floatx80_to_float32(floatx80 a, float_status *status)
4699 flag aSign;
4700 int32_t aExp;
4701 uint64_t aSig;
4703 if (floatx80_invalid_encoding(a)) {
4704 float_raise(float_flag_invalid, status);
4705 return float32_default_nan(status);
4707 aSig = extractFloatx80Frac( a );
4708 aExp = extractFloatx80Exp( a );
4709 aSign = extractFloatx80Sign( a );
4710 if ( aExp == 0x7FFF ) {
4711 if ( (uint64_t) ( aSig<<1 ) ) {
4712 return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
4714 return packFloat32( aSign, 0xFF, 0 );
4716 shift64RightJamming( aSig, 33, &aSig );
4717 if ( aExp || aSig ) aExp -= 0x3F81;
4718 return roundAndPackFloat32(aSign, aExp, aSig, status);
4722 /*----------------------------------------------------------------------------
4723 | Returns the result of converting the extended double-precision floating-
4724 | point value `a' to the double-precision floating-point format. The
4725 | conversion is performed according to the IEC/IEEE Standard for Binary
4726 | Floating-Point Arithmetic.
4727 *----------------------------------------------------------------------------*/
4729 float64 floatx80_to_float64(floatx80 a, float_status *status)
4731 flag aSign;
4732 int32_t aExp;
4733 uint64_t aSig, zSig;
4735 if (floatx80_invalid_encoding(a)) {
4736 float_raise(float_flag_invalid, status);
4737 return float64_default_nan(status);
4739 aSig = extractFloatx80Frac( a );
4740 aExp = extractFloatx80Exp( a );
4741 aSign = extractFloatx80Sign( a );
4742 if ( aExp == 0x7FFF ) {
4743 if ( (uint64_t) ( aSig<<1 ) ) {
4744 return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
4746 return packFloat64( aSign, 0x7FF, 0 );
4748 shift64RightJamming( aSig, 1, &zSig );
4749 if ( aExp || aSig ) aExp -= 0x3C01;
4750 return roundAndPackFloat64(aSign, aExp, zSig, status);
4754 /*----------------------------------------------------------------------------
4755 | Returns the result of converting the extended double-precision floating-
4756 | point value `a' to the quadruple-precision floating-point format. The
4757 | conversion is performed according to the IEC/IEEE Standard for Binary
4758 | Floating-Point Arithmetic.
4759 *----------------------------------------------------------------------------*/
4761 float128 floatx80_to_float128(floatx80 a, float_status *status)
4763 flag aSign;
4764 int aExp;
4765 uint64_t aSig, zSig0, zSig1;
4767 if (floatx80_invalid_encoding(a)) {
4768 float_raise(float_flag_invalid, status);
4769 return float128_default_nan(status);
4771 aSig = extractFloatx80Frac( a );
4772 aExp = extractFloatx80Exp( a );
4773 aSign = extractFloatx80Sign( a );
4774 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4775 return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
4777 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4778 return packFloat128( aSign, aExp, zSig0, zSig1 );
4782 /*----------------------------------------------------------------------------
4783 | Rounds the extended double-precision floating-point value `a'
4784 | to the precision provided by floatx80_rounding_precision and returns the
4785 | result as an extended double-precision floating-point value.
4786 | The operation is performed according to the IEC/IEEE Standard for Binary
4787 | Floating-Point Arithmetic.
4788 *----------------------------------------------------------------------------*/
4790 floatx80 floatx80_round(floatx80 a, float_status *status)
4792 return roundAndPackFloatx80(status->floatx80_rounding_precision,
4793 extractFloatx80Sign(a),
4794 extractFloatx80Exp(a),
4795 extractFloatx80Frac(a), 0, status);
4798 /*----------------------------------------------------------------------------
4799 | Rounds the extended double-precision floating-point value `a' to an integer,
4800 | and returns the result as an extended quadruple-precision floating-point
4801 | value. The operation is performed according to the IEC/IEEE Standard for
4802 | Binary Floating-Point Arithmetic.
4803 *----------------------------------------------------------------------------*/
4805 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
4807 flag aSign;
4808 int32_t aExp;
4809 uint64_t lastBitMask, roundBitsMask;
4810 floatx80 z;
4812 if (floatx80_invalid_encoding(a)) {
4813 float_raise(float_flag_invalid, status);
4814 return floatx80_default_nan(status);
4816 aExp = extractFloatx80Exp( a );
4817 if ( 0x403E <= aExp ) {
4818 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4819 return propagateFloatx80NaN(a, a, status);
4821 return a;
4823 if ( aExp < 0x3FFF ) {
4824 if ( ( aExp == 0 )
4825 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4826 return a;
4828 status->float_exception_flags |= float_flag_inexact;
4829 aSign = extractFloatx80Sign( a );
4830 switch (status->float_rounding_mode) {
4831 case float_round_nearest_even:
4832 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4834 return
4835 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4837 break;
4838 case float_round_ties_away:
4839 if (aExp == 0x3FFE) {
4840 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
4842 break;
4843 case float_round_down:
4844 return
4845 aSign ?
4846 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4847 : packFloatx80( 0, 0, 0 );
4848 case float_round_up:
4849 return
4850 aSign ? packFloatx80( 1, 0, 0 )
4851 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4853 return packFloatx80( aSign, 0, 0 );
4855 lastBitMask = 1;
4856 lastBitMask <<= 0x403E - aExp;
4857 roundBitsMask = lastBitMask - 1;
4858 z = a;
4859 switch (status->float_rounding_mode) {
4860 case float_round_nearest_even:
4861 z.low += lastBitMask>>1;
4862 if ((z.low & roundBitsMask) == 0) {
4863 z.low &= ~lastBitMask;
4865 break;
4866 case float_round_ties_away:
4867 z.low += lastBitMask >> 1;
4868 break;
4869 case float_round_to_zero:
4870 break;
4871 case float_round_up:
4872 if (!extractFloatx80Sign(z)) {
4873 z.low += roundBitsMask;
4875 break;
4876 case float_round_down:
4877 if (extractFloatx80Sign(z)) {
4878 z.low += roundBitsMask;
4880 break;
4881 default:
4882 abort();
4884 z.low &= ~ roundBitsMask;
4885 if ( z.low == 0 ) {
4886 ++z.high;
4887 z.low = LIT64( 0x8000000000000000 );
4889 if (z.low != a.low) {
4890 status->float_exception_flags |= float_flag_inexact;
4892 return z;
4896 /*----------------------------------------------------------------------------
4897 | Returns the result of adding the absolute values of the extended double-
4898 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
4899 | negated before being returned. `zSign' is ignored if the result is a NaN.
4900 | The addition is performed according to the IEC/IEEE Standard for Binary
4901 | Floating-Point Arithmetic.
4902 *----------------------------------------------------------------------------*/
4904 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4905 float_status *status)
4907 int32_t aExp, bExp, zExp;
4908 uint64_t aSig, bSig, zSig0, zSig1;
4909 int32_t expDiff;
4911 aSig = extractFloatx80Frac( a );
4912 aExp = extractFloatx80Exp( a );
4913 bSig = extractFloatx80Frac( b );
4914 bExp = extractFloatx80Exp( b );
4915 expDiff = aExp - bExp;
4916 if ( 0 < expDiff ) {
4917 if ( aExp == 0x7FFF ) {
4918 if ((uint64_t)(aSig << 1)) {
4919 return propagateFloatx80NaN(a, b, status);
4921 return a;
4923 if ( bExp == 0 ) --expDiff;
4924 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4925 zExp = aExp;
4927 else if ( expDiff < 0 ) {
4928 if ( bExp == 0x7FFF ) {
4929 if ((uint64_t)(bSig << 1)) {
4930 return propagateFloatx80NaN(a, b, status);
4932 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4934 if ( aExp == 0 ) ++expDiff;
4935 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4936 zExp = bExp;
4938 else {
4939 if ( aExp == 0x7FFF ) {
4940 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4941 return propagateFloatx80NaN(a, b, status);
4943 return a;
4945 zSig1 = 0;
4946 zSig0 = aSig + bSig;
4947 if ( aExp == 0 ) {
4948 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4949 goto roundAndPack;
4951 zExp = aExp;
4952 goto shiftRight1;
4954 zSig0 = aSig + bSig;
4955 if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
4956 shiftRight1:
4957 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4958 zSig0 |= LIT64( 0x8000000000000000 );
4959 ++zExp;
4960 roundAndPack:
4961 return roundAndPackFloatx80(status->floatx80_rounding_precision,
4962 zSign, zExp, zSig0, zSig1, status);
4965 /*----------------------------------------------------------------------------
4966 | Returns the result of subtracting the absolute values of the extended
4967 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
4968 | difference is negated before being returned. `zSign' is ignored if the
4969 | result is a NaN. The subtraction is performed according to the IEC/IEEE
4970 | Standard for Binary Floating-Point Arithmetic.
4971 *----------------------------------------------------------------------------*/
4973 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4974 float_status *status)
4976 int32_t aExp, bExp, zExp;
4977 uint64_t aSig, bSig, zSig0, zSig1;
4978 int32_t expDiff;
4980 aSig = extractFloatx80Frac( a );
4981 aExp = extractFloatx80Exp( a );
4982 bSig = extractFloatx80Frac( b );
4983 bExp = extractFloatx80Exp( b );
4984 expDiff = aExp - bExp;
4985 if ( 0 < expDiff ) goto aExpBigger;
4986 if ( expDiff < 0 ) goto bExpBigger;
4987 if ( aExp == 0x7FFF ) {
4988 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4989 return propagateFloatx80NaN(a, b, status);
4991 float_raise(float_flag_invalid, status);
4992 return floatx80_default_nan(status);
4994 if ( aExp == 0 ) {
4995 aExp = 1;
4996 bExp = 1;
4998 zSig1 = 0;
4999 if ( bSig < aSig ) goto aBigger;
5000 if ( aSig < bSig ) goto bBigger;
5001 return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
5002 bExpBigger:
5003 if ( bExp == 0x7FFF ) {
5004 if ((uint64_t)(bSig << 1)) {
5005 return propagateFloatx80NaN(a, b, status);
5007 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
5009 if ( aExp == 0 ) ++expDiff;
5010 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5011 bBigger:
5012 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
5013 zExp = bExp;
5014 zSign ^= 1;
5015 goto normalizeRoundAndPack;
5016 aExpBigger:
5017 if ( aExp == 0x7FFF ) {
5018 if ((uint64_t)(aSig << 1)) {
5019 return propagateFloatx80NaN(a, b, status);
5021 return a;
5023 if ( bExp == 0 ) --expDiff;
5024 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5025 aBigger:
5026 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
5027 zExp = aExp;
5028 normalizeRoundAndPack:
5029 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
5030 zSign, zExp, zSig0, zSig1, status);
5033 /*----------------------------------------------------------------------------
5034 | Returns the result of adding the extended double-precision floating-point
5035 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5036 | Standard for Binary Floating-Point Arithmetic.
5037 *----------------------------------------------------------------------------*/
5039 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
5041 flag aSign, bSign;
5043 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5044 float_raise(float_flag_invalid, status);
5045 return floatx80_default_nan(status);
5047 aSign = extractFloatx80Sign( a );
5048 bSign = extractFloatx80Sign( b );
5049 if ( aSign == bSign ) {
5050 return addFloatx80Sigs(a, b, aSign, status);
5052 else {
5053 return subFloatx80Sigs(a, b, aSign, status);
5058 /*----------------------------------------------------------------------------
5059 | Returns the result of subtracting the extended double-precision floating-
5060 | point values `a' and `b'. The operation is performed according to the
5061 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5062 *----------------------------------------------------------------------------*/
5064 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5066 flag aSign, bSign;
5068 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5069 float_raise(float_flag_invalid, status);
5070 return floatx80_default_nan(status);
5072 aSign = extractFloatx80Sign( a );
5073 bSign = extractFloatx80Sign( b );
5074 if ( aSign == bSign ) {
5075 return subFloatx80Sigs(a, b, aSign, status);
5077 else {
5078 return addFloatx80Sigs(a, b, aSign, status);
5083 /*----------------------------------------------------------------------------
5084 | Returns the result of multiplying the extended double-precision floating-
5085 | point values `a' and `b'. The operation is performed according to the
5086 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5087 *----------------------------------------------------------------------------*/
5089 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
5091 flag aSign, bSign, zSign;
5092 int32_t aExp, bExp, zExp;
5093 uint64_t aSig, bSig, zSig0, zSig1;
5095 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5096 float_raise(float_flag_invalid, status);
5097 return floatx80_default_nan(status);
5099 aSig = extractFloatx80Frac( a );
5100 aExp = extractFloatx80Exp( a );
5101 aSign = extractFloatx80Sign( a );
5102 bSig = extractFloatx80Frac( b );
5103 bExp = extractFloatx80Exp( b );
5104 bSign = extractFloatx80Sign( b );
5105 zSign = aSign ^ bSign;
5106 if ( aExp == 0x7FFF ) {
5107 if ( (uint64_t) ( aSig<<1 )
5108 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5109 return propagateFloatx80NaN(a, b, status);
5111 if ( ( bExp | bSig ) == 0 ) goto invalid;
5112 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5114 if ( bExp == 0x7FFF ) {
5115 if ((uint64_t)(bSig << 1)) {
5116 return propagateFloatx80NaN(a, b, status);
5118 if ( ( aExp | aSig ) == 0 ) {
5119 invalid:
5120 float_raise(float_flag_invalid, status);
5121 return floatx80_default_nan(status);
5123 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5125 if ( aExp == 0 ) {
5126 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5127 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5129 if ( bExp == 0 ) {
5130 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5131 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5133 zExp = aExp + bExp - 0x3FFE;
5134 mul64To128( aSig, bSig, &zSig0, &zSig1 );
5135 if ( 0 < (int64_t) zSig0 ) {
5136 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5137 --zExp;
5139 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5140 zSign, zExp, zSig0, zSig1, status);
5143 /*----------------------------------------------------------------------------
5144 | Returns the result of dividing the extended double-precision floating-point
5145 | value `a' by the corresponding value `b'. The operation is performed
5146 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5147 *----------------------------------------------------------------------------*/
5149 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
5151 flag aSign, bSign, zSign;
5152 int32_t aExp, bExp, zExp;
5153 uint64_t aSig, bSig, zSig0, zSig1;
5154 uint64_t rem0, rem1, rem2, term0, term1, term2;
5156 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5157 float_raise(float_flag_invalid, status);
5158 return floatx80_default_nan(status);
5160 aSig = extractFloatx80Frac( a );
5161 aExp = extractFloatx80Exp( a );
5162 aSign = extractFloatx80Sign( a );
5163 bSig = extractFloatx80Frac( b );
5164 bExp = extractFloatx80Exp( b );
5165 bSign = extractFloatx80Sign( b );
5166 zSign = aSign ^ bSign;
5167 if ( aExp == 0x7FFF ) {
5168 if ((uint64_t)(aSig << 1)) {
5169 return propagateFloatx80NaN(a, b, status);
5171 if ( bExp == 0x7FFF ) {
5172 if ((uint64_t)(bSig << 1)) {
5173 return propagateFloatx80NaN(a, b, status);
5175 goto invalid;
5177 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5179 if ( bExp == 0x7FFF ) {
5180 if ((uint64_t)(bSig << 1)) {
5181 return propagateFloatx80NaN(a, b, status);
5183 return packFloatx80( zSign, 0, 0 );
5185 if ( bExp == 0 ) {
5186 if ( bSig == 0 ) {
5187 if ( ( aExp | aSig ) == 0 ) {
5188 invalid:
5189 float_raise(float_flag_invalid, status);
5190 return floatx80_default_nan(status);
5192 float_raise(float_flag_divbyzero, status);
5193 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5195 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5197 if ( aExp == 0 ) {
5198 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5199 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5201 zExp = aExp - bExp + 0x3FFE;
5202 rem1 = 0;
5203 if ( bSig <= aSig ) {
5204 shift128Right( aSig, 0, 1, &aSig, &rem1 );
5205 ++zExp;
5207 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5208 mul64To128( bSig, zSig0, &term0, &term1 );
5209 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
5210 while ( (int64_t) rem0 < 0 ) {
5211 --zSig0;
5212 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5214 zSig1 = estimateDiv128To64( rem1, 0, bSig );
5215 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
5216 mul64To128( bSig, zSig1, &term1, &term2 );
5217 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5218 while ( (int64_t) rem1 < 0 ) {
5219 --zSig1;
5220 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5222 zSig1 |= ( ( rem1 | rem2 ) != 0 );
5224 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5225 zSign, zExp, zSig0, zSig1, status);
5228 /*----------------------------------------------------------------------------
5229 | Returns the remainder of the extended double-precision floating-point value
5230 | `a' with respect to the corresponding value `b'. The operation is performed
5231 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5232 *----------------------------------------------------------------------------*/
5234 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
5236 flag aSign, zSign;
5237 int32_t aExp, bExp, expDiff;
5238 uint64_t aSig0, aSig1, bSig;
5239 uint64_t q, term0, term1, alternateASig0, alternateASig1;
5241 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5242 float_raise(float_flag_invalid, status);
5243 return floatx80_default_nan(status);
5245 aSig0 = extractFloatx80Frac( a );
5246 aExp = extractFloatx80Exp( a );
5247 aSign = extractFloatx80Sign( a );
5248 bSig = extractFloatx80Frac( b );
5249 bExp = extractFloatx80Exp( b );
5250 if ( aExp == 0x7FFF ) {
5251 if ( (uint64_t) ( aSig0<<1 )
5252 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5253 return propagateFloatx80NaN(a, b, status);
5255 goto invalid;
5257 if ( bExp == 0x7FFF ) {
5258 if ((uint64_t)(bSig << 1)) {
5259 return propagateFloatx80NaN(a, b, status);
5261 return a;
5263 if ( bExp == 0 ) {
5264 if ( bSig == 0 ) {
5265 invalid:
5266 float_raise(float_flag_invalid, status);
5267 return floatx80_default_nan(status);
5269 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5271 if ( aExp == 0 ) {
5272 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5273 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5275 bSig |= LIT64( 0x8000000000000000 );
5276 zSign = aSign;
5277 expDiff = aExp - bExp;
5278 aSig1 = 0;
5279 if ( expDiff < 0 ) {
5280 if ( expDiff < -1 ) return a;
5281 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5282 expDiff = 0;
5284 q = ( bSig <= aSig0 );
5285 if ( q ) aSig0 -= bSig;
5286 expDiff -= 64;
5287 while ( 0 < expDiff ) {
5288 q = estimateDiv128To64( aSig0, aSig1, bSig );
5289 q = ( 2 < q ) ? q - 2 : 0;
5290 mul64To128( bSig, q, &term0, &term1 );
5291 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5292 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5293 expDiff -= 62;
5295 expDiff += 64;
5296 if ( 0 < expDiff ) {
5297 q = estimateDiv128To64( aSig0, aSig1, bSig );
5298 q = ( 2 < q ) ? q - 2 : 0;
5299 q >>= 64 - expDiff;
5300 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5301 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5302 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5303 while ( le128( term0, term1, aSig0, aSig1 ) ) {
5304 ++q;
5305 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5308 else {
5309 term1 = 0;
5310 term0 = bSig;
5312 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5313 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5314 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5315 && ( q & 1 ) )
5317 aSig0 = alternateASig0;
5318 aSig1 = alternateASig1;
5319 zSign = ! zSign;
5321 return
5322 normalizeRoundAndPackFloatx80(
5323 80, zSign, bExp + expDiff, aSig0, aSig1, status);
5327 /*----------------------------------------------------------------------------
5328 | Returns the square root of the extended double-precision floating-point
5329 | value `a'. The operation is performed according to the IEC/IEEE Standard
5330 | for Binary Floating-Point Arithmetic.
5331 *----------------------------------------------------------------------------*/
5333 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
5335 flag aSign;
5336 int32_t aExp, zExp;
5337 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5338 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5340 if (floatx80_invalid_encoding(a)) {
5341 float_raise(float_flag_invalid, status);
5342 return floatx80_default_nan(status);
5344 aSig0 = extractFloatx80Frac( a );
5345 aExp = extractFloatx80Exp( a );
5346 aSign = extractFloatx80Sign( a );
5347 if ( aExp == 0x7FFF ) {
5348 if ((uint64_t)(aSig0 << 1)) {
5349 return propagateFloatx80NaN(a, a, status);
5351 if ( ! aSign ) return a;
5352 goto invalid;
5354 if ( aSign ) {
5355 if ( ( aExp | aSig0 ) == 0 ) return a;
5356 invalid:
5357 float_raise(float_flag_invalid, status);
5358 return floatx80_default_nan(status);
5360 if ( aExp == 0 ) {
5361 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5362 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5364 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5365 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5366 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5367 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5368 doubleZSig0 = zSig0<<1;
5369 mul64To128( zSig0, zSig0, &term0, &term1 );
5370 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5371 while ( (int64_t) rem0 < 0 ) {
5372 --zSig0;
5373 doubleZSig0 -= 2;
5374 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5376 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5377 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5378 if ( zSig1 == 0 ) zSig1 = 1;
5379 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5380 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5381 mul64To128( zSig1, zSig1, &term2, &term3 );
5382 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5383 while ( (int64_t) rem1 < 0 ) {
5384 --zSig1;
5385 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5386 term3 |= 1;
5387 term2 |= doubleZSig0;
5388 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5390 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5392 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5393 zSig0 |= doubleZSig0;
5394 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5395 0, zExp, zSig0, zSig1, status);
5398 /*----------------------------------------------------------------------------
5399 | Returns 1 if the extended double-precision floating-point value `a' is equal
5400 | to the corresponding value `b', and 0 otherwise. The invalid exception is
5401 | raised if either operand is a NaN. Otherwise, the comparison is performed
5402 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5403 *----------------------------------------------------------------------------*/
5405 int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
5408 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5409 || (extractFloatx80Exp(a) == 0x7FFF
5410 && (uint64_t) (extractFloatx80Frac(a) << 1))
5411 || (extractFloatx80Exp(b) == 0x7FFF
5412 && (uint64_t) (extractFloatx80Frac(b) << 1))
5414 float_raise(float_flag_invalid, status);
5415 return 0;
5417 return
5418 ( a.low == b.low )
5419 && ( ( a.high == b.high )
5420 || ( ( a.low == 0 )
5421 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5426 /*----------------------------------------------------------------------------
5427 | Returns 1 if the extended double-precision floating-point value `a' is
5428 | less than or equal to the corresponding value `b', and 0 otherwise. The
5429 | invalid exception is raised if either operand is a NaN. The comparison is
5430 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5431 | Arithmetic.
5432 *----------------------------------------------------------------------------*/
5434 int floatx80_le(floatx80 a, floatx80 b, float_status *status)
5436 flag aSign, bSign;
5438 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5439 || (extractFloatx80Exp(a) == 0x7FFF
5440 && (uint64_t) (extractFloatx80Frac(a) << 1))
5441 || (extractFloatx80Exp(b) == 0x7FFF
5442 && (uint64_t) (extractFloatx80Frac(b) << 1))
5444 float_raise(float_flag_invalid, status);
5445 return 0;
5447 aSign = extractFloatx80Sign( a );
5448 bSign = extractFloatx80Sign( b );
5449 if ( aSign != bSign ) {
5450 return
5451 aSign
5452 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5453 == 0 );
5455 return
5456 aSign ? le128( b.high, b.low, a.high, a.low )
5457 : le128( a.high, a.low, b.high, b.low );
5461 /*----------------------------------------------------------------------------
5462 | Returns 1 if the extended double-precision floating-point value `a' is
5463 | less than the corresponding value `b', and 0 otherwise. The invalid
5464 | exception is raised if either operand is a NaN. The comparison is performed
5465 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5466 *----------------------------------------------------------------------------*/
5468 int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
5470 flag aSign, bSign;
5472 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5473 || (extractFloatx80Exp(a) == 0x7FFF
5474 && (uint64_t) (extractFloatx80Frac(a) << 1))
5475 || (extractFloatx80Exp(b) == 0x7FFF
5476 && (uint64_t) (extractFloatx80Frac(b) << 1))
5478 float_raise(float_flag_invalid, status);
5479 return 0;
5481 aSign = extractFloatx80Sign( a );
5482 bSign = extractFloatx80Sign( b );
5483 if ( aSign != bSign ) {
5484 return
5485 aSign
5486 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5487 != 0 );
5489 return
5490 aSign ? lt128( b.high, b.low, a.high, a.low )
5491 : lt128( a.high, a.low, b.high, b.low );
5495 /*----------------------------------------------------------------------------
5496 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5497 | cannot be compared, and 0 otherwise. The invalid exception is raised if
5498 | either operand is a NaN. The comparison is performed according to the
5499 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5500 *----------------------------------------------------------------------------*/
5501 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
5503 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5504 || (extractFloatx80Exp(a) == 0x7FFF
5505 && (uint64_t) (extractFloatx80Frac(a) << 1))
5506 || (extractFloatx80Exp(b) == 0x7FFF
5507 && (uint64_t) (extractFloatx80Frac(b) << 1))
5509 float_raise(float_flag_invalid, status);
5510 return 1;
5512 return 0;
5515 /*----------------------------------------------------------------------------
5516 | Returns 1 if the extended double-precision floating-point value `a' is
5517 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5518 | cause an exception. The comparison is performed according to the IEC/IEEE
5519 | Standard for Binary Floating-Point Arithmetic.
5520 *----------------------------------------------------------------------------*/
5522 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
5525 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5526 float_raise(float_flag_invalid, status);
5527 return 0;
5529 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5530 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5531 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5532 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5534 if (floatx80_is_signaling_nan(a, status)
5535 || floatx80_is_signaling_nan(b, status)) {
5536 float_raise(float_flag_invalid, status);
5538 return 0;
5540 return
5541 ( a.low == b.low )
5542 && ( ( a.high == b.high )
5543 || ( ( a.low == 0 )
5544 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5549 /*----------------------------------------------------------------------------
5550 | Returns 1 if the extended double-precision floating-point value `a' is less
5551 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
5552 | do not cause an exception. Otherwise, the comparison is performed according
5553 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5554 *----------------------------------------------------------------------------*/
5556 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
5558 flag aSign, bSign;
5560 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5561 float_raise(float_flag_invalid, status);
5562 return 0;
5564 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5565 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5566 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5567 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5569 if (floatx80_is_signaling_nan(a, status)
5570 || floatx80_is_signaling_nan(b, status)) {
5571 float_raise(float_flag_invalid, status);
5573 return 0;
5575 aSign = extractFloatx80Sign( a );
5576 bSign = extractFloatx80Sign( b );
5577 if ( aSign != bSign ) {
5578 return
5579 aSign
5580 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5581 == 0 );
5583 return
5584 aSign ? le128( b.high, b.low, a.high, a.low )
5585 : le128( a.high, a.low, b.high, b.low );
5589 /*----------------------------------------------------------------------------
5590 | Returns 1 if the extended double-precision floating-point value `a' is less
5591 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
5592 | an exception. Otherwise, the comparison is performed according to the
5593 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5594 *----------------------------------------------------------------------------*/
5596 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
5598 flag aSign, bSign;
5600 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5601 float_raise(float_flag_invalid, status);
5602 return 0;
5604 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5605 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5606 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5607 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5609 if (floatx80_is_signaling_nan(a, status)
5610 || floatx80_is_signaling_nan(b, status)) {
5611 float_raise(float_flag_invalid, status);
5613 return 0;
5615 aSign = extractFloatx80Sign( a );
5616 bSign = extractFloatx80Sign( b );
5617 if ( aSign != bSign ) {
5618 return
5619 aSign
5620 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5621 != 0 );
5623 return
5624 aSign ? lt128( b.high, b.low, a.high, a.low )
5625 : lt128( a.high, a.low, b.high, b.low );
5629 /*----------------------------------------------------------------------------
5630 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5631 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
5632 | The comparison is performed according to the IEC/IEEE Standard for Binary
5633 | Floating-Point Arithmetic.
5634 *----------------------------------------------------------------------------*/
5635 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
5637 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5638 float_raise(float_flag_invalid, status);
5639 return 1;
5641 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5642 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5643 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5644 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5646 if (floatx80_is_signaling_nan(a, status)
5647 || floatx80_is_signaling_nan(b, status)) {
5648 float_raise(float_flag_invalid, status);
5650 return 1;
5652 return 0;
5655 /*----------------------------------------------------------------------------
5656 | Returns the result of converting the quadruple-precision floating-point
5657 | value `a' to the 32-bit two's complement integer format. The conversion
5658 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5659 | Arithmetic---which means in particular that the conversion is rounded
5660 | according to the current rounding mode. If `a' is a NaN, the largest
5661 | positive integer is returned. Otherwise, if the conversion overflows, the
5662 | largest integer with the same sign as `a' is returned.
5663 *----------------------------------------------------------------------------*/
5665 int32_t float128_to_int32(float128 a, float_status *status)
5667 flag aSign;
5668 int32_t aExp, shiftCount;
5669 uint64_t aSig0, aSig1;
5671 aSig1 = extractFloat128Frac1( a );
5672 aSig0 = extractFloat128Frac0( a );
5673 aExp = extractFloat128Exp( a );
5674 aSign = extractFloat128Sign( a );
5675 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5676 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5677 aSig0 |= ( aSig1 != 0 );
5678 shiftCount = 0x4028 - aExp;
5679 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5680 return roundAndPackInt32(aSign, aSig0, status);
5684 /*----------------------------------------------------------------------------
5685 | Returns the result of converting the quadruple-precision floating-point
5686 | value `a' to the 32-bit two's complement integer format. The conversion
5687 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5688 | Arithmetic, except that the conversion is always rounded toward zero. If
5689 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
5690 | conversion overflows, the largest integer with the same sign as `a' is
5691 | returned.
5692 *----------------------------------------------------------------------------*/
5694 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
5696 flag aSign;
5697 int32_t aExp, shiftCount;
5698 uint64_t aSig0, aSig1, savedASig;
5699 int32_t z;
5701 aSig1 = extractFloat128Frac1( a );
5702 aSig0 = extractFloat128Frac0( a );
5703 aExp = extractFloat128Exp( a );
5704 aSign = extractFloat128Sign( a );
5705 aSig0 |= ( aSig1 != 0 );
5706 if ( 0x401E < aExp ) {
5707 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5708 goto invalid;
5710 else if ( aExp < 0x3FFF ) {
5711 if (aExp || aSig0) {
5712 status->float_exception_flags |= float_flag_inexact;
5714 return 0;
5716 aSig0 |= LIT64( 0x0001000000000000 );
5717 shiftCount = 0x402F - aExp;
5718 savedASig = aSig0;
5719 aSig0 >>= shiftCount;
5720 z = aSig0;
5721 if ( aSign ) z = - z;
5722 if ( ( z < 0 ) ^ aSign ) {
5723 invalid:
5724 float_raise(float_flag_invalid, status);
5725 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5727 if ( ( aSig0<<shiftCount ) != savedASig ) {
5728 status->float_exception_flags |= float_flag_inexact;
5730 return z;
5734 /*----------------------------------------------------------------------------
5735 | Returns the result of converting the quadruple-precision floating-point
5736 | value `a' to the 64-bit two's complement integer format. The conversion
5737 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5738 | Arithmetic---which means in particular that the conversion is rounded
5739 | according to the current rounding mode. If `a' is a NaN, the largest
5740 | positive integer is returned. Otherwise, if the conversion overflows, the
5741 | largest integer with the same sign as `a' is returned.
5742 *----------------------------------------------------------------------------*/
5744 int64_t float128_to_int64(float128 a, float_status *status)
5746 flag aSign;
5747 int32_t aExp, shiftCount;
5748 uint64_t aSig0, aSig1;
5750 aSig1 = extractFloat128Frac1( a );
5751 aSig0 = extractFloat128Frac0( a );
5752 aExp = extractFloat128Exp( a );
5753 aSign = extractFloat128Sign( a );
5754 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5755 shiftCount = 0x402F - aExp;
5756 if ( shiftCount <= 0 ) {
5757 if ( 0x403E < aExp ) {
5758 float_raise(float_flag_invalid, status);
5759 if ( ! aSign
5760 || ( ( aExp == 0x7FFF )
5761 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5764 return LIT64( 0x7FFFFFFFFFFFFFFF );
5766 return (int64_t) LIT64( 0x8000000000000000 );
5768 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5770 else {
5771 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5773 return roundAndPackInt64(aSign, aSig0, aSig1, status);
5777 /*----------------------------------------------------------------------------
5778 | Returns the result of converting the quadruple-precision floating-point
5779 | value `a' to the 64-bit two's complement integer format. The conversion
5780 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5781 | Arithmetic, except that the conversion is always rounded toward zero.
5782 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
5783 | the conversion overflows, the largest integer with the same sign as `a' is
5784 | returned.
5785 *----------------------------------------------------------------------------*/
5787 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
5789 flag aSign;
5790 int32_t aExp, shiftCount;
5791 uint64_t aSig0, aSig1;
5792 int64_t z;
5794 aSig1 = extractFloat128Frac1( a );
5795 aSig0 = extractFloat128Frac0( a );
5796 aExp = extractFloat128Exp( a );
5797 aSign = extractFloat128Sign( a );
5798 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5799 shiftCount = aExp - 0x402F;
5800 if ( 0 < shiftCount ) {
5801 if ( 0x403E <= aExp ) {
5802 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5803 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
5804 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5805 if (aSig1) {
5806 status->float_exception_flags |= float_flag_inexact;
5809 else {
5810 float_raise(float_flag_invalid, status);
5811 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5812 return LIT64( 0x7FFFFFFFFFFFFFFF );
5815 return (int64_t) LIT64( 0x8000000000000000 );
5817 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
5818 if ( (uint64_t) ( aSig1<<shiftCount ) ) {
5819 status->float_exception_flags |= float_flag_inexact;
5822 else {
5823 if ( aExp < 0x3FFF ) {
5824 if ( aExp | aSig0 | aSig1 ) {
5825 status->float_exception_flags |= float_flag_inexact;
5827 return 0;
5829 z = aSig0>>( - shiftCount );
5830 if ( aSig1
5831 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
5832 status->float_exception_flags |= float_flag_inexact;
5835 if ( aSign ) z = - z;
5836 return z;
5840 /*----------------------------------------------------------------------------
5841 | Returns the result of converting the quadruple-precision floating-point value
5842 | `a' to the 64-bit unsigned integer format. The conversion is
5843 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5844 | Arithmetic---which means in particular that the conversion is rounded
5845 | according to the current rounding mode. If `a' is a NaN, the largest
5846 | positive integer is returned. If the conversion overflows, the
5847 | largest unsigned integer is returned. If 'a' is negative, the value is
5848 | rounded and zero is returned; negative values that do not round to zero
5849 | will raise the inexact exception.
5850 *----------------------------------------------------------------------------*/
5852 uint64_t float128_to_uint64(float128 a, float_status *status)
5854 flag aSign;
5855 int aExp;
5856 int shiftCount;
5857 uint64_t aSig0, aSig1;
5859 aSig0 = extractFloat128Frac0(a);
5860 aSig1 = extractFloat128Frac1(a);
5861 aExp = extractFloat128Exp(a);
5862 aSign = extractFloat128Sign(a);
5863 if (aSign && (aExp > 0x3FFE)) {
5864 float_raise(float_flag_invalid, status);
5865 if (float128_is_any_nan(a)) {
5866 return LIT64(0xFFFFFFFFFFFFFFFF);
5867 } else {
5868 return 0;
5871 if (aExp) {
5872 aSig0 |= LIT64(0x0001000000000000);
5874 shiftCount = 0x402F - aExp;
5875 if (shiftCount <= 0) {
5876 if (0x403E < aExp) {
5877 float_raise(float_flag_invalid, status);
5878 return LIT64(0xFFFFFFFFFFFFFFFF);
5880 shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
5881 } else {
5882 shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
5884 return roundAndPackUint64(aSign, aSig0, aSig1, status);
5887 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
5889 uint64_t v;
5890 signed char current_rounding_mode = status->float_rounding_mode;
5892 set_float_rounding_mode(float_round_to_zero, status);
5893 v = float128_to_uint64(a, status);
5894 set_float_rounding_mode(current_rounding_mode, status);
5896 return v;
5899 /*----------------------------------------------------------------------------
5900 | Returns the result of converting the quadruple-precision floating-point
5901 | value `a' to the 32-bit unsigned integer format. The conversion
5902 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5903 | Arithmetic except that the conversion is always rounded toward zero.
5904 | If `a' is a NaN, the largest positive integer is returned. Otherwise,
5905 | if the conversion overflows, the largest unsigned integer is returned.
5906 | If 'a' is negative, the value is rounded and zero is returned; negative
5907 | values that do not round to zero will raise the inexact exception.
5908 *----------------------------------------------------------------------------*/
5910 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
5912 uint64_t v;
5913 uint32_t res;
5914 int old_exc_flags = get_float_exception_flags(status);
5916 v = float128_to_uint64_round_to_zero(a, status);
5917 if (v > 0xffffffff) {
5918 res = 0xffffffff;
5919 } else {
5920 return v;
5922 set_float_exception_flags(old_exc_flags, status);
5923 float_raise(float_flag_invalid, status);
5924 return res;
5927 /*----------------------------------------------------------------------------
5928 | Returns the result of converting the quadruple-precision floating-point
5929 | value `a' to the single-precision floating-point format. The conversion
5930 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5931 | Arithmetic.
5932 *----------------------------------------------------------------------------*/
5934 float32 float128_to_float32(float128 a, float_status *status)
5936 flag aSign;
5937 int32_t aExp;
5938 uint64_t aSig0, aSig1;
5939 uint32_t zSig;
5941 aSig1 = extractFloat128Frac1( a );
5942 aSig0 = extractFloat128Frac0( a );
5943 aExp = extractFloat128Exp( a );
5944 aSign = extractFloat128Sign( a );
5945 if ( aExp == 0x7FFF ) {
5946 if ( aSig0 | aSig1 ) {
5947 return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
5949 return packFloat32( aSign, 0xFF, 0 );
5951 aSig0 |= ( aSig1 != 0 );
5952 shift64RightJamming( aSig0, 18, &aSig0 );
5953 zSig = aSig0;
5954 if ( aExp || zSig ) {
5955 zSig |= 0x40000000;
5956 aExp -= 0x3F81;
5958 return roundAndPackFloat32(aSign, aExp, zSig, status);
5962 /*----------------------------------------------------------------------------
5963 | Returns the result of converting the quadruple-precision floating-point
5964 | value `a' to the double-precision floating-point format. The conversion
5965 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5966 | Arithmetic.
5967 *----------------------------------------------------------------------------*/
5969 float64 float128_to_float64(float128 a, float_status *status)
5971 flag aSign;
5972 int32_t aExp;
5973 uint64_t aSig0, aSig1;
5975 aSig1 = extractFloat128Frac1( a );
5976 aSig0 = extractFloat128Frac0( a );
5977 aExp = extractFloat128Exp( a );
5978 aSign = extractFloat128Sign( a );
5979 if ( aExp == 0x7FFF ) {
5980 if ( aSig0 | aSig1 ) {
5981 return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
5983 return packFloat64( aSign, 0x7FF, 0 );
5985 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5986 aSig0 |= ( aSig1 != 0 );
5987 if ( aExp || aSig0 ) {
5988 aSig0 |= LIT64( 0x4000000000000000 );
5989 aExp -= 0x3C01;
5991 return roundAndPackFloat64(aSign, aExp, aSig0, status);
5995 /*----------------------------------------------------------------------------
5996 | Returns the result of converting the quadruple-precision floating-point
5997 | value `a' to the extended double-precision floating-point format. The
5998 | conversion is performed according to the IEC/IEEE Standard for Binary
5999 | Floating-Point Arithmetic.
6000 *----------------------------------------------------------------------------*/
6002 floatx80 float128_to_floatx80(float128 a, float_status *status)
6004 flag aSign;
6005 int32_t aExp;
6006 uint64_t aSig0, aSig1;
6008 aSig1 = extractFloat128Frac1( a );
6009 aSig0 = extractFloat128Frac0( a );
6010 aExp = extractFloat128Exp( a );
6011 aSign = extractFloat128Sign( a );
6012 if ( aExp == 0x7FFF ) {
6013 if ( aSig0 | aSig1 ) {
6014 return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
6016 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
6018 if ( aExp == 0 ) {
6019 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
6020 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6022 else {
6023 aSig0 |= LIT64( 0x0001000000000000 );
6025 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
6026 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
6030 /*----------------------------------------------------------------------------
6031 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6032 | returns the result as a quadruple-precision floating-point value. The
6033 | operation is performed according to the IEC/IEEE Standard for Binary
6034 | Floating-Point Arithmetic.
6035 *----------------------------------------------------------------------------*/
6037 float128 float128_round_to_int(float128 a, float_status *status)
6039 flag aSign;
6040 int32_t aExp;
6041 uint64_t lastBitMask, roundBitsMask;
6042 float128 z;
6044 aExp = extractFloat128Exp( a );
6045 if ( 0x402F <= aExp ) {
6046 if ( 0x406F <= aExp ) {
6047 if ( ( aExp == 0x7FFF )
6048 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6050 return propagateFloat128NaN(a, a, status);
6052 return a;
6054 lastBitMask = 1;
6055 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
6056 roundBitsMask = lastBitMask - 1;
6057 z = a;
6058 switch (status->float_rounding_mode) {
6059 case float_round_nearest_even:
6060 if ( lastBitMask ) {
6061 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6062 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6064 else {
6065 if ( (int64_t) z.low < 0 ) {
6066 ++z.high;
6067 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
6070 break;
6071 case float_round_ties_away:
6072 if (lastBitMask) {
6073 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6074 } else {
6075 if ((int64_t) z.low < 0) {
6076 ++z.high;
6079 break;
6080 case float_round_to_zero:
6081 break;
6082 case float_round_up:
6083 if (!extractFloat128Sign(z)) {
6084 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6086 break;
6087 case float_round_down:
6088 if (extractFloat128Sign(z)) {
6089 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6091 break;
6092 default:
6093 abort();
6095 z.low &= ~ roundBitsMask;
6097 else {
6098 if ( aExp < 0x3FFF ) {
6099 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
6100 status->float_exception_flags |= float_flag_inexact;
6101 aSign = extractFloat128Sign( a );
6102 switch (status->float_rounding_mode) {
6103 case float_round_nearest_even:
6104 if ( ( aExp == 0x3FFE )
6105 && ( extractFloat128Frac0( a )
6106 | extractFloat128Frac1( a ) )
6108 return packFloat128( aSign, 0x3FFF, 0, 0 );
6110 break;
6111 case float_round_ties_away:
6112 if (aExp == 0x3FFE) {
6113 return packFloat128(aSign, 0x3FFF, 0, 0);
6115 break;
6116 case float_round_down:
6117 return
6118 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6119 : packFloat128( 0, 0, 0, 0 );
6120 case float_round_up:
6121 return
6122 aSign ? packFloat128( 1, 0, 0, 0 )
6123 : packFloat128( 0, 0x3FFF, 0, 0 );
6125 return packFloat128( aSign, 0, 0, 0 );
6127 lastBitMask = 1;
6128 lastBitMask <<= 0x402F - aExp;
6129 roundBitsMask = lastBitMask - 1;
6130 z.low = 0;
6131 z.high = a.high;
6132 switch (status->float_rounding_mode) {
6133 case float_round_nearest_even:
6134 z.high += lastBitMask>>1;
6135 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6136 z.high &= ~ lastBitMask;
6138 break;
6139 case float_round_ties_away:
6140 z.high += lastBitMask>>1;
6141 break;
6142 case float_round_to_zero:
6143 break;
6144 case float_round_up:
6145 if (!extractFloat128Sign(z)) {
6146 z.high |= ( a.low != 0 );
6147 z.high += roundBitsMask;
6149 break;
6150 case float_round_down:
6151 if (extractFloat128Sign(z)) {
6152 z.high |= (a.low != 0);
6153 z.high += roundBitsMask;
6155 break;
6156 default:
6157 abort();
6159 z.high &= ~ roundBitsMask;
6161 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
6162 status->float_exception_flags |= float_flag_inexact;
6164 return z;
6168 /*----------------------------------------------------------------------------
6169 | Returns the result of adding the absolute values of the quadruple-precision
6170 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
6171 | before being returned. `zSign' is ignored if the result is a NaN.
6172 | The addition is performed according to the IEC/IEEE Standard for Binary
6173 | Floating-Point Arithmetic.
6174 *----------------------------------------------------------------------------*/
6176 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
6177 float_status *status)
6179 int32_t aExp, bExp, zExp;
6180 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6181 int32_t expDiff;
6183 aSig1 = extractFloat128Frac1( a );
6184 aSig0 = extractFloat128Frac0( a );
6185 aExp = extractFloat128Exp( a );
6186 bSig1 = extractFloat128Frac1( b );
6187 bSig0 = extractFloat128Frac0( b );
6188 bExp = extractFloat128Exp( b );
6189 expDiff = aExp - bExp;
6190 if ( 0 < expDiff ) {
6191 if ( aExp == 0x7FFF ) {
6192 if (aSig0 | aSig1) {
6193 return propagateFloat128NaN(a, b, status);
6195 return a;
6197 if ( bExp == 0 ) {
6198 --expDiff;
6200 else {
6201 bSig0 |= LIT64( 0x0001000000000000 );
6203 shift128ExtraRightJamming(
6204 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6205 zExp = aExp;
6207 else if ( expDiff < 0 ) {
6208 if ( bExp == 0x7FFF ) {
6209 if (bSig0 | bSig1) {
6210 return propagateFloat128NaN(a, b, status);
6212 return packFloat128( zSign, 0x7FFF, 0, 0 );
6214 if ( aExp == 0 ) {
6215 ++expDiff;
6217 else {
6218 aSig0 |= LIT64( 0x0001000000000000 );
6220 shift128ExtraRightJamming(
6221 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6222 zExp = bExp;
6224 else {
6225 if ( aExp == 0x7FFF ) {
6226 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6227 return propagateFloat128NaN(a, b, status);
6229 return a;
6231 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6232 if ( aExp == 0 ) {
6233 if (status->flush_to_zero) {
6234 if (zSig0 | zSig1) {
6235 float_raise(float_flag_output_denormal, status);
6237 return packFloat128(zSign, 0, 0, 0);
6239 return packFloat128( zSign, 0, zSig0, zSig1 );
6241 zSig2 = 0;
6242 zSig0 |= LIT64( 0x0002000000000000 );
6243 zExp = aExp;
6244 goto shiftRight1;
6246 aSig0 |= LIT64( 0x0001000000000000 );
6247 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6248 --zExp;
6249 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6250 ++zExp;
6251 shiftRight1:
6252 shift128ExtraRightJamming(
6253 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6254 roundAndPack:
6255 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6259 /*----------------------------------------------------------------------------
6260 | Returns the result of subtracting the absolute values of the quadruple-
6261 | precision floating-point values `a' and `b'. If `zSign' is 1, the
6262 | difference is negated before being returned. `zSign' is ignored if the
6263 | result is a NaN. The subtraction is performed according to the IEC/IEEE
6264 | Standard for Binary Floating-Point Arithmetic.
6265 *----------------------------------------------------------------------------*/
6267 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
6268 float_status *status)
6270 int32_t aExp, bExp, zExp;
6271 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
6272 int32_t expDiff;
6274 aSig1 = extractFloat128Frac1( a );
6275 aSig0 = extractFloat128Frac0( a );
6276 aExp = extractFloat128Exp( a );
6277 bSig1 = extractFloat128Frac1( b );
6278 bSig0 = extractFloat128Frac0( b );
6279 bExp = extractFloat128Exp( b );
6280 expDiff = aExp - bExp;
6281 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6282 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6283 if ( 0 < expDiff ) goto aExpBigger;
6284 if ( expDiff < 0 ) goto bExpBigger;
6285 if ( aExp == 0x7FFF ) {
6286 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6287 return propagateFloat128NaN(a, b, status);
6289 float_raise(float_flag_invalid, status);
6290 return float128_default_nan(status);
6292 if ( aExp == 0 ) {
6293 aExp = 1;
6294 bExp = 1;
6296 if ( bSig0 < aSig0 ) goto aBigger;
6297 if ( aSig0 < bSig0 ) goto bBigger;
6298 if ( bSig1 < aSig1 ) goto aBigger;
6299 if ( aSig1 < bSig1 ) goto bBigger;
6300 return packFloat128(status->float_rounding_mode == float_round_down,
6301 0, 0, 0);
6302 bExpBigger:
6303 if ( bExp == 0x7FFF ) {
6304 if (bSig0 | bSig1) {
6305 return propagateFloat128NaN(a, b, status);
6307 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6309 if ( aExp == 0 ) {
6310 ++expDiff;
6312 else {
6313 aSig0 |= LIT64( 0x4000000000000000 );
6315 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6316 bSig0 |= LIT64( 0x4000000000000000 );
6317 bBigger:
6318 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6319 zExp = bExp;
6320 zSign ^= 1;
6321 goto normalizeRoundAndPack;
6322 aExpBigger:
6323 if ( aExp == 0x7FFF ) {
6324 if (aSig0 | aSig1) {
6325 return propagateFloat128NaN(a, b, status);
6327 return a;
6329 if ( bExp == 0 ) {
6330 --expDiff;
6332 else {
6333 bSig0 |= LIT64( 0x4000000000000000 );
6335 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6336 aSig0 |= LIT64( 0x4000000000000000 );
6337 aBigger:
6338 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6339 zExp = aExp;
6340 normalizeRoundAndPack:
6341 --zExp;
6342 return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
6343 status);
6347 /*----------------------------------------------------------------------------
6348 | Returns the result of adding the quadruple-precision floating-point values
6349 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
6350 | for Binary Floating-Point Arithmetic.
6351 *----------------------------------------------------------------------------*/
6353 float128 float128_add(float128 a, float128 b, float_status *status)
6355 flag aSign, bSign;
6357 aSign = extractFloat128Sign( a );
6358 bSign = extractFloat128Sign( b );
6359 if ( aSign == bSign ) {
6360 return addFloat128Sigs(a, b, aSign, status);
6362 else {
6363 return subFloat128Sigs(a, b, aSign, status);
6368 /*----------------------------------------------------------------------------
6369 | Returns the result of subtracting the quadruple-precision floating-point
6370 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6371 | Standard for Binary Floating-Point Arithmetic.
6372 *----------------------------------------------------------------------------*/
6374 float128 float128_sub(float128 a, float128 b, float_status *status)
6376 flag aSign, bSign;
6378 aSign = extractFloat128Sign( a );
6379 bSign = extractFloat128Sign( b );
6380 if ( aSign == bSign ) {
6381 return subFloat128Sigs(a, b, aSign, status);
6383 else {
6384 return addFloat128Sigs(a, b, aSign, status);
6389 /*----------------------------------------------------------------------------
6390 | Returns the result of multiplying the quadruple-precision floating-point
6391 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6392 | Standard for Binary Floating-Point Arithmetic.
6393 *----------------------------------------------------------------------------*/
6395 float128 float128_mul(float128 a, float128 b, float_status *status)
6397 flag aSign, bSign, zSign;
6398 int32_t aExp, bExp, zExp;
6399 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6401 aSig1 = extractFloat128Frac1( a );
6402 aSig0 = extractFloat128Frac0( a );
6403 aExp = extractFloat128Exp( a );
6404 aSign = extractFloat128Sign( a );
6405 bSig1 = extractFloat128Frac1( b );
6406 bSig0 = extractFloat128Frac0( b );
6407 bExp = extractFloat128Exp( b );
6408 bSign = extractFloat128Sign( b );
6409 zSign = aSign ^ bSign;
6410 if ( aExp == 0x7FFF ) {
6411 if ( ( aSig0 | aSig1 )
6412 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6413 return propagateFloat128NaN(a, b, status);
6415 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6416 return packFloat128( zSign, 0x7FFF, 0, 0 );
6418 if ( bExp == 0x7FFF ) {
6419 if (bSig0 | bSig1) {
6420 return propagateFloat128NaN(a, b, status);
6422 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6423 invalid:
6424 float_raise(float_flag_invalid, status);
6425 return float128_default_nan(status);
6427 return packFloat128( zSign, 0x7FFF, 0, 0 );
6429 if ( aExp == 0 ) {
6430 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6431 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6433 if ( bExp == 0 ) {
6434 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6435 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6437 zExp = aExp + bExp - 0x4000;
6438 aSig0 |= LIT64( 0x0001000000000000 );
6439 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6440 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6441 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6442 zSig2 |= ( zSig3 != 0 );
6443 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6444 shift128ExtraRightJamming(
6445 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6446 ++zExp;
6448 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6452 /*----------------------------------------------------------------------------
6453 | Returns the result of dividing the quadruple-precision floating-point value
6454 | `a' by the corresponding value `b'. The operation is performed according to
6455 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6456 *----------------------------------------------------------------------------*/
6458 float128 float128_div(float128 a, float128 b, float_status *status)
6460 flag aSign, bSign, zSign;
6461 int32_t aExp, bExp, zExp;
6462 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6463 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6465 aSig1 = extractFloat128Frac1( a );
6466 aSig0 = extractFloat128Frac0( a );
6467 aExp = extractFloat128Exp( a );
6468 aSign = extractFloat128Sign( a );
6469 bSig1 = extractFloat128Frac1( b );
6470 bSig0 = extractFloat128Frac0( b );
6471 bExp = extractFloat128Exp( b );
6472 bSign = extractFloat128Sign( b );
6473 zSign = aSign ^ bSign;
6474 if ( aExp == 0x7FFF ) {
6475 if (aSig0 | aSig1) {
6476 return propagateFloat128NaN(a, b, status);
6478 if ( bExp == 0x7FFF ) {
6479 if (bSig0 | bSig1) {
6480 return propagateFloat128NaN(a, b, status);
6482 goto invalid;
6484 return packFloat128( zSign, 0x7FFF, 0, 0 );
6486 if ( bExp == 0x7FFF ) {
6487 if (bSig0 | bSig1) {
6488 return propagateFloat128NaN(a, b, status);
6490 return packFloat128( zSign, 0, 0, 0 );
6492 if ( bExp == 0 ) {
6493 if ( ( bSig0 | bSig1 ) == 0 ) {
6494 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6495 invalid:
6496 float_raise(float_flag_invalid, status);
6497 return float128_default_nan(status);
6499 float_raise(float_flag_divbyzero, status);
6500 return packFloat128( zSign, 0x7FFF, 0, 0 );
6502 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6504 if ( aExp == 0 ) {
6505 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6506 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6508 zExp = aExp - bExp + 0x3FFD;
6509 shortShift128Left(
6510 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6511 shortShift128Left(
6512 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6513 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6514 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6515 ++zExp;
6517 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6518 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6519 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
6520 while ( (int64_t) rem0 < 0 ) {
6521 --zSig0;
6522 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6524 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6525 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6526 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6527 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
6528 while ( (int64_t) rem1 < 0 ) {
6529 --zSig1;
6530 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6532 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6534 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6535 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6539 /*----------------------------------------------------------------------------
6540 | Returns the remainder of the quadruple-precision floating-point value `a'
6541 | with respect to the corresponding value `b'. The operation is performed
6542 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6543 *----------------------------------------------------------------------------*/
6545 float128 float128_rem(float128 a, float128 b, float_status *status)
6547 flag aSign, zSign;
6548 int32_t aExp, bExp, expDiff;
6549 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6550 uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6551 int64_t sigMean0;
6553 aSig1 = extractFloat128Frac1( a );
6554 aSig0 = extractFloat128Frac0( a );
6555 aExp = extractFloat128Exp( a );
6556 aSign = extractFloat128Sign( a );
6557 bSig1 = extractFloat128Frac1( b );
6558 bSig0 = extractFloat128Frac0( b );
6559 bExp = extractFloat128Exp( b );
6560 if ( aExp == 0x7FFF ) {
6561 if ( ( aSig0 | aSig1 )
6562 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6563 return propagateFloat128NaN(a, b, status);
6565 goto invalid;
6567 if ( bExp == 0x7FFF ) {
6568 if (bSig0 | bSig1) {
6569 return propagateFloat128NaN(a, b, status);
6571 return a;
6573 if ( bExp == 0 ) {
6574 if ( ( bSig0 | bSig1 ) == 0 ) {
6575 invalid:
6576 float_raise(float_flag_invalid, status);
6577 return float128_default_nan(status);
6579 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6581 if ( aExp == 0 ) {
6582 if ( ( aSig0 | aSig1 ) == 0 ) return a;
6583 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6585 expDiff = aExp - bExp;
6586 if ( expDiff < -1 ) return a;
6587 shortShift128Left(
6588 aSig0 | LIT64( 0x0001000000000000 ),
6589 aSig1,
6590 15 - ( expDiff < 0 ),
6591 &aSig0,
6592 &aSig1
6594 shortShift128Left(
6595 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6596 q = le128( bSig0, bSig1, aSig0, aSig1 );
6597 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6598 expDiff -= 64;
6599 while ( 0 < expDiff ) {
6600 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6601 q = ( 4 < q ) ? q - 4 : 0;
6602 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6603 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6604 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6605 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6606 expDiff -= 61;
6608 if ( -64 < expDiff ) {
6609 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6610 q = ( 4 < q ) ? q - 4 : 0;
6611 q >>= - expDiff;
6612 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6613 expDiff += 52;
6614 if ( expDiff < 0 ) {
6615 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6617 else {
6618 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6620 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6621 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6623 else {
6624 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6625 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6627 do {
6628 alternateASig0 = aSig0;
6629 alternateASig1 = aSig1;
6630 ++q;
6631 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6632 } while ( 0 <= (int64_t) aSig0 );
6633 add128(
6634 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6635 if ( ( sigMean0 < 0 )
6636 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6637 aSig0 = alternateASig0;
6638 aSig1 = alternateASig1;
6640 zSign = ( (int64_t) aSig0 < 0 );
6641 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6642 return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
6643 status);
6646 /*----------------------------------------------------------------------------
6647 | Returns the square root of the quadruple-precision floating-point value `a'.
6648 | The operation is performed according to the IEC/IEEE Standard for Binary
6649 | Floating-Point Arithmetic.
6650 *----------------------------------------------------------------------------*/
6652 float128 float128_sqrt(float128 a, float_status *status)
6654 flag aSign;
6655 int32_t aExp, zExp;
6656 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6657 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6659 aSig1 = extractFloat128Frac1( a );
6660 aSig0 = extractFloat128Frac0( a );
6661 aExp = extractFloat128Exp( a );
6662 aSign = extractFloat128Sign( a );
6663 if ( aExp == 0x7FFF ) {
6664 if (aSig0 | aSig1) {
6665 return propagateFloat128NaN(a, a, status);
6667 if ( ! aSign ) return a;
6668 goto invalid;
6670 if ( aSign ) {
6671 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6672 invalid:
6673 float_raise(float_flag_invalid, status);
6674 return float128_default_nan(status);
6676 if ( aExp == 0 ) {
6677 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6678 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6680 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6681 aSig0 |= LIT64( 0x0001000000000000 );
6682 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6683 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6684 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6685 doubleZSig0 = zSig0<<1;
6686 mul64To128( zSig0, zSig0, &term0, &term1 );
6687 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6688 while ( (int64_t) rem0 < 0 ) {
6689 --zSig0;
6690 doubleZSig0 -= 2;
6691 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6693 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6694 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6695 if ( zSig1 == 0 ) zSig1 = 1;
6696 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6697 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6698 mul64To128( zSig1, zSig1, &term2, &term3 );
6699 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6700 while ( (int64_t) rem1 < 0 ) {
6701 --zSig1;
6702 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6703 term3 |= 1;
6704 term2 |= doubleZSig0;
6705 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6707 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6709 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6710 return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
6714 /*----------------------------------------------------------------------------
6715 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6716 | the corresponding value `b', and 0 otherwise. The invalid exception is
6717 | raised if either operand is a NaN. Otherwise, the comparison is performed
6718 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6719 *----------------------------------------------------------------------------*/
6721 int float128_eq(float128 a, float128 b, float_status *status)
6724 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6725 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6726 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6727 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6729 float_raise(float_flag_invalid, status);
6730 return 0;
6732 return
6733 ( a.low == b.low )
6734 && ( ( a.high == b.high )
6735 || ( ( a.low == 0 )
6736 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6741 /*----------------------------------------------------------------------------
6742 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6743 | or equal to the corresponding value `b', and 0 otherwise. The invalid
6744 | exception is raised if either operand is a NaN. The comparison is performed
6745 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6746 *----------------------------------------------------------------------------*/
6748 int float128_le(float128 a, float128 b, float_status *status)
6750 flag aSign, bSign;
6752 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6753 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6754 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6755 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6757 float_raise(float_flag_invalid, status);
6758 return 0;
6760 aSign = extractFloat128Sign( a );
6761 bSign = extractFloat128Sign( b );
6762 if ( aSign != bSign ) {
6763 return
6764 aSign
6765 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6766 == 0 );
6768 return
6769 aSign ? le128( b.high, b.low, a.high, a.low )
6770 : le128( a.high, a.low, b.high, b.low );
6774 /*----------------------------------------------------------------------------
6775 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6776 | the corresponding value `b', and 0 otherwise. The invalid exception is
6777 | raised if either operand is a NaN. The comparison is performed according
6778 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6779 *----------------------------------------------------------------------------*/
6781 int float128_lt(float128 a, float128 b, float_status *status)
6783 flag aSign, bSign;
6785 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6786 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6787 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6788 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6790 float_raise(float_flag_invalid, status);
6791 return 0;
6793 aSign = extractFloat128Sign( a );
6794 bSign = extractFloat128Sign( b );
6795 if ( aSign != bSign ) {
6796 return
6797 aSign
6798 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6799 != 0 );
6801 return
6802 aSign ? lt128( b.high, b.low, a.high, a.low )
6803 : lt128( a.high, a.low, b.high, b.low );
6807 /*----------------------------------------------------------------------------
6808 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6809 | be compared, and 0 otherwise. The invalid exception is raised if either
6810 | operand is a NaN. The comparison is performed according to the IEC/IEEE
6811 | Standard for Binary Floating-Point Arithmetic.
6812 *----------------------------------------------------------------------------*/
6814 int float128_unordered(float128 a, float128 b, float_status *status)
6816 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6817 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6818 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6819 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6821 float_raise(float_flag_invalid, status);
6822 return 1;
6824 return 0;
6827 /*----------------------------------------------------------------------------
6828 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6829 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6830 | exception. The comparison is performed according to the IEC/IEEE Standard
6831 | for Binary Floating-Point Arithmetic.
6832 *----------------------------------------------------------------------------*/
6834 int float128_eq_quiet(float128 a, float128 b, float_status *status)
6837 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6838 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6839 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6840 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6842 if (float128_is_signaling_nan(a, status)
6843 || float128_is_signaling_nan(b, status)) {
6844 float_raise(float_flag_invalid, status);
6846 return 0;
6848 return
6849 ( a.low == b.low )
6850 && ( ( a.high == b.high )
6851 || ( ( a.low == 0 )
6852 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6857 /*----------------------------------------------------------------------------
6858 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6859 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
6860 | cause an exception. Otherwise, the comparison is performed according to the
6861 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6862 *----------------------------------------------------------------------------*/
6864 int float128_le_quiet(float128 a, float128 b, float_status *status)
6866 flag aSign, bSign;
6868 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6869 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6870 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6871 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6873 if (float128_is_signaling_nan(a, status)
6874 || float128_is_signaling_nan(b, status)) {
6875 float_raise(float_flag_invalid, status);
6877 return 0;
6879 aSign = extractFloat128Sign( a );
6880 bSign = extractFloat128Sign( b );
6881 if ( aSign != bSign ) {
6882 return
6883 aSign
6884 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6885 == 0 );
6887 return
6888 aSign ? le128( b.high, b.low, a.high, a.low )
6889 : le128( a.high, a.low, b.high, b.low );
6893 /*----------------------------------------------------------------------------
6894 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6895 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6896 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
6897 | Standard for Binary Floating-Point Arithmetic.
6898 *----------------------------------------------------------------------------*/
6900 int float128_lt_quiet(float128 a, float128 b, float_status *status)
6902 flag aSign, bSign;
6904 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6905 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6906 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6907 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6909 if (float128_is_signaling_nan(a, status)
6910 || float128_is_signaling_nan(b, status)) {
6911 float_raise(float_flag_invalid, status);
6913 return 0;
6915 aSign = extractFloat128Sign( a );
6916 bSign = extractFloat128Sign( b );
6917 if ( aSign != bSign ) {
6918 return
6919 aSign
6920 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6921 != 0 );
6923 return
6924 aSign ? lt128( b.high, b.low, a.high, a.low )
6925 : lt128( a.high, a.low, b.high, b.low );
6929 /*----------------------------------------------------------------------------
6930 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6931 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
6932 | comparison is performed according to the IEC/IEEE Standard for Binary
6933 | Floating-Point Arithmetic.
6934 *----------------------------------------------------------------------------*/
6936 int float128_unordered_quiet(float128 a, float128 b, float_status *status)
6938 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6939 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6940 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6941 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6943 if (float128_is_signaling_nan(a, status)
6944 || float128_is_signaling_nan(b, status)) {
6945 float_raise(float_flag_invalid, status);
6947 return 1;
6949 return 0;
6952 static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
6953 int is_quiet, float_status *status)
6955 flag aSign, bSign;
6957 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6958 float_raise(float_flag_invalid, status);
6959 return float_relation_unordered;
6961 if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
6962 ( extractFloatx80Frac( a )<<1 ) ) ||
6963 ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
6964 ( extractFloatx80Frac( b )<<1 ) )) {
6965 if (!is_quiet ||
6966 floatx80_is_signaling_nan(a, status) ||
6967 floatx80_is_signaling_nan(b, status)) {
6968 float_raise(float_flag_invalid, status);
6970 return float_relation_unordered;
6972 aSign = extractFloatx80Sign( a );
6973 bSign = extractFloatx80Sign( b );
6974 if ( aSign != bSign ) {
6976 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
6977 ( ( a.low | b.low ) == 0 ) ) {
6978 /* zero case */
6979 return float_relation_equal;
6980 } else {
6981 return 1 - (2 * aSign);
6983 } else {
6984 if (a.low == b.low && a.high == b.high) {
6985 return float_relation_equal;
6986 } else {
6987 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6992 int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
6994 return floatx80_compare_internal(a, b, 0, status);
6997 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
6999 return floatx80_compare_internal(a, b, 1, status);
7002 static inline int float128_compare_internal(float128 a, float128 b,
7003 int is_quiet, float_status *status)
7005 flag aSign, bSign;
7007 if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7008 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7009 ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7010 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7011 if (!is_quiet ||
7012 float128_is_signaling_nan(a, status) ||
7013 float128_is_signaling_nan(b, status)) {
7014 float_raise(float_flag_invalid, status);
7016 return float_relation_unordered;
7018 aSign = extractFloat128Sign( a );
7019 bSign = extractFloat128Sign( b );
7020 if ( aSign != bSign ) {
7021 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7022 /* zero case */
7023 return float_relation_equal;
7024 } else {
7025 return 1 - (2 * aSign);
7027 } else {
7028 if (a.low == b.low && a.high == b.high) {
7029 return float_relation_equal;
7030 } else {
7031 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7036 int float128_compare(float128 a, float128 b, float_status *status)
7038 return float128_compare_internal(a, b, 0, status);
7041 int float128_compare_quiet(float128 a, float128 b, float_status *status)
7043 return float128_compare_internal(a, b, 1, status);
7046 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7048 flag aSign;
7049 int32_t aExp;
7050 uint64_t aSig;
7052 if (floatx80_invalid_encoding(a)) {
7053 float_raise(float_flag_invalid, status);
7054 return floatx80_default_nan(status);
7056 aSig = extractFloatx80Frac( a );
7057 aExp = extractFloatx80Exp( a );
7058 aSign = extractFloatx80Sign( a );
7060 if ( aExp == 0x7FFF ) {
7061 if ( aSig<<1 ) {
7062 return propagateFloatx80NaN(a, a, status);
7064 return a;
7067 if (aExp == 0) {
7068 if (aSig == 0) {
7069 return a;
7071 aExp++;
7074 if (n > 0x10000) {
7075 n = 0x10000;
7076 } else if (n < -0x10000) {
7077 n = -0x10000;
7080 aExp += n;
7081 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7082 aSign, aExp, aSig, 0, status);
7085 float128 float128_scalbn(float128 a, int n, float_status *status)
7087 flag aSign;
7088 int32_t aExp;
7089 uint64_t aSig0, aSig1;
7091 aSig1 = extractFloat128Frac1( a );
7092 aSig0 = extractFloat128Frac0( a );
7093 aExp = extractFloat128Exp( a );
7094 aSign = extractFloat128Sign( a );
7095 if ( aExp == 0x7FFF ) {
7096 if ( aSig0 | aSig1 ) {
7097 return propagateFloat128NaN(a, a, status);
7099 return a;
7101 if (aExp != 0) {
7102 aSig0 |= LIT64( 0x0001000000000000 );
7103 } else if (aSig0 == 0 && aSig1 == 0) {
7104 return a;
7105 } else {
7106 aExp++;
7109 if (n > 0x10000) {
7110 n = 0x10000;
7111 } else if (n < -0x10000) {
7112 n = -0x10000;
7115 aExp += n - 1;
7116 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7117 , status);