replay: fixed replay_enable_events
[qemu/ar7.git] / fpu / softfloat.c
blob6e16284e66c277195e9076953326a4606c0d3209
1 /*
2 * QEMU float support
4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
10 * the BSD license
11 * GPL-v2-or-later
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
44 ===============================================================================
47 /* BSD licensing:
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
85 #include "qemu/osdep.h"
86 #include "qemu/bitops.h"
87 #include "fpu/softfloat.h"
89 /* We only need stdlib for abort() */
91 /*----------------------------------------------------------------------------
92 | Primitive arithmetic functions, including multi-word arithmetic, and
93 | division and square root approximations. (Can be specialized to target if
94 | desired.)
95 *----------------------------------------------------------------------------*/
96 #include "fpu/softfloat-macros.h"
98 /*----------------------------------------------------------------------------
99 | Functions and definitions to determine: (1) whether tininess for underflow
100 | is detected before or after rounding by default, (2) what (if anything)
101 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
102 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
103 | are propagated from function inputs to output. These details are target-
104 | specific.
105 *----------------------------------------------------------------------------*/
106 #include "softfloat-specialize.h"
108 /*----------------------------------------------------------------------------
109 | Returns the fraction bits of the half-precision floating-point value `a'.
110 *----------------------------------------------------------------------------*/
112 static inline uint32_t extractFloat16Frac(float16 a)
114 return float16_val(a) & 0x3ff;
117 /*----------------------------------------------------------------------------
118 | Returns the exponent bits of the half-precision floating-point value `a'.
119 *----------------------------------------------------------------------------*/
121 static inline int extractFloat16Exp(float16 a)
123 return (float16_val(a) >> 10) & 0x1f;
126 /*----------------------------------------------------------------------------
127 | Returns the sign bit of the single-precision floating-point value `a'.
128 *----------------------------------------------------------------------------*/
130 static inline flag extractFloat16Sign(float16 a)
132 return float16_val(a)>>15;
135 /*----------------------------------------------------------------------------
136 | Returns the fraction bits of the single-precision floating-point value `a'.
137 *----------------------------------------------------------------------------*/
139 static inline uint32_t extractFloat32Frac(float32 a)
141 return float32_val(a) & 0x007FFFFF;
144 /*----------------------------------------------------------------------------
145 | Returns the exponent bits of the single-precision floating-point value `a'.
146 *----------------------------------------------------------------------------*/
148 static inline int extractFloat32Exp(float32 a)
150 return (float32_val(a) >> 23) & 0xFF;
153 /*----------------------------------------------------------------------------
154 | Returns the sign bit of the single-precision floating-point value `a'.
155 *----------------------------------------------------------------------------*/
157 static inline flag extractFloat32Sign(float32 a)
159 return float32_val(a) >> 31;
162 /*----------------------------------------------------------------------------
163 | Returns the fraction bits of the double-precision floating-point value `a'.
164 *----------------------------------------------------------------------------*/
166 static inline uint64_t extractFloat64Frac(float64 a)
168 return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF);
171 /*----------------------------------------------------------------------------
172 | Returns the exponent bits of the double-precision floating-point value `a'.
173 *----------------------------------------------------------------------------*/
175 static inline int extractFloat64Exp(float64 a)
177 return (float64_val(a) >> 52) & 0x7FF;
180 /*----------------------------------------------------------------------------
181 | Returns the sign bit of the double-precision floating-point value `a'.
182 *----------------------------------------------------------------------------*/
184 static inline flag extractFloat64Sign(float64 a)
186 return float64_val(a) >> 63;
190 * Classify a floating point number. Everything above float_class_qnan
191 * is a NaN so cls >= float_class_qnan is any NaN.
194 typedef enum __attribute__ ((__packed__)) {
195 float_class_unclassified,
196 float_class_zero,
197 float_class_normal,
198 float_class_inf,
199 float_class_qnan, /* all NaNs from here */
200 float_class_snan,
201 float_class_dnan,
202 float_class_msnan, /* maybe silenced */
203 } FloatClass;
206 * Structure holding all of the decomposed parts of a float. The
207 * exponent is unbiased and the fraction is normalized. All
208 * calculations are done with a 64 bit fraction and then rounded as
209 * appropriate for the final format.
211 * Thanks to the packed FloatClass a decent compiler should be able to
212 * fit the whole structure into registers and avoid using the stack
213 * for parameter passing.
216 typedef struct {
217 uint64_t frac;
218 int32_t exp;
219 FloatClass cls;
220 bool sign;
221 } FloatParts;
223 #define DECOMPOSED_BINARY_POINT (64 - 2)
224 #define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT)
225 #define DECOMPOSED_OVERFLOW_BIT (DECOMPOSED_IMPLICIT_BIT << 1)
227 /* Structure holding all of the relevant parameters for a format.
228 * exp_size: the size of the exponent field
229 * exp_bias: the offset applied to the exponent field
230 * exp_max: the maximum normalised exponent
231 * frac_size: the size of the fraction field
232 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
233 * The following are computed based the size of fraction
234 * frac_lsb: least significant bit of fraction
235 * fram_lsbm1: the bit bellow the least significant bit (for rounding)
236 * round_mask/roundeven_mask: masks used for rounding
238 typedef struct {
239 int exp_size;
240 int exp_bias;
241 int exp_max;
242 int frac_size;
243 int frac_shift;
244 uint64_t frac_lsb;
245 uint64_t frac_lsbm1;
246 uint64_t round_mask;
247 uint64_t roundeven_mask;
248 } FloatFmt;
250 /* Expand fields based on the size of exponent and fraction */
251 #define FLOAT_PARAMS(E, F) \
252 .exp_size = E, \
253 .exp_bias = ((1 << E) - 1) >> 1, \
254 .exp_max = (1 << E) - 1, \
255 .frac_size = F, \
256 .frac_shift = DECOMPOSED_BINARY_POINT - F, \
257 .frac_lsb = 1ull << (DECOMPOSED_BINARY_POINT - F), \
258 .frac_lsbm1 = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1), \
259 .round_mask = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1, \
260 .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1
262 static const FloatFmt float16_params = {
263 FLOAT_PARAMS(5, 10)
266 static const FloatFmt float32_params = {
267 FLOAT_PARAMS(8, 23)
270 static const FloatFmt float64_params = {
271 FLOAT_PARAMS(11, 52)
274 /* Unpack a float to parts, but do not canonicalize. */
275 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
277 const int sign_pos = fmt.frac_size + fmt.exp_size;
279 return (FloatParts) {
280 .cls = float_class_unclassified,
281 .sign = extract64(raw, sign_pos, 1),
282 .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
283 .frac = extract64(raw, 0, fmt.frac_size),
287 static inline FloatParts float16_unpack_raw(float16 f)
289 return unpack_raw(float16_params, f);
292 static inline FloatParts float32_unpack_raw(float32 f)
294 return unpack_raw(float32_params, f);
297 static inline FloatParts float64_unpack_raw(float64 f)
299 return unpack_raw(float64_params, f);
302 /* Pack a float from parts, but do not canonicalize. */
303 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
305 const int sign_pos = fmt.frac_size + fmt.exp_size;
306 uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
307 return deposit64(ret, sign_pos, 1, p.sign);
310 static inline float16 float16_pack_raw(FloatParts p)
312 return make_float16(pack_raw(float16_params, p));
315 static inline float32 float32_pack_raw(FloatParts p)
317 return make_float32(pack_raw(float32_params, p));
320 static inline float64 float64_pack_raw(FloatParts p)
322 return make_float64(pack_raw(float64_params, p));
325 /* Canonicalize EXP and FRAC, setting CLS. */
326 static FloatParts canonicalize(FloatParts part, const FloatFmt *parm,
327 float_status *status)
329 if (part.exp == parm->exp_max) {
330 if (part.frac == 0) {
331 part.cls = float_class_inf;
332 } else {
333 #ifdef NO_SIGNALING_NANS
334 part.cls = float_class_qnan;
335 #else
336 int64_t msb = part.frac << (parm->frac_shift + 2);
337 if ((msb < 0) == status->snan_bit_is_one) {
338 part.cls = float_class_snan;
339 } else {
340 part.cls = float_class_qnan;
342 #endif
344 } else if (part.exp == 0) {
345 if (likely(part.frac == 0)) {
346 part.cls = float_class_zero;
347 } else if (status->flush_inputs_to_zero) {
348 float_raise(float_flag_input_denormal, status);
349 part.cls = float_class_zero;
350 part.frac = 0;
351 } else {
352 int shift = clz64(part.frac) - 1;
353 part.cls = float_class_normal;
354 part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
355 part.frac <<= shift;
357 } else {
358 part.cls = float_class_normal;
359 part.exp -= parm->exp_bias;
360 part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
362 return part;
365 /* Round and uncanonicalize a floating-point number by parts. There
366 * are FRAC_SHIFT bits that may require rounding at the bottom of the
367 * fraction; these bits will be removed. The exponent will be biased
368 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
371 static FloatParts round_canonical(FloatParts p, float_status *s,
372 const FloatFmt *parm)
374 const uint64_t frac_lsbm1 = parm->frac_lsbm1;
375 const uint64_t round_mask = parm->round_mask;
376 const uint64_t roundeven_mask = parm->roundeven_mask;
377 const int exp_max = parm->exp_max;
378 const int frac_shift = parm->frac_shift;
379 uint64_t frac, inc;
380 int exp, flags = 0;
381 bool overflow_norm;
383 frac = p.frac;
384 exp = p.exp;
386 switch (p.cls) {
387 case float_class_normal:
388 switch (s->float_rounding_mode) {
389 case float_round_nearest_even:
390 overflow_norm = false;
391 inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
392 break;
393 case float_round_ties_away:
394 overflow_norm = false;
395 inc = frac_lsbm1;
396 break;
397 case float_round_to_zero:
398 overflow_norm = true;
399 inc = 0;
400 break;
401 case float_round_up:
402 inc = p.sign ? 0 : round_mask;
403 overflow_norm = p.sign;
404 break;
405 case float_round_down:
406 inc = p.sign ? round_mask : 0;
407 overflow_norm = !p.sign;
408 break;
409 default:
410 g_assert_not_reached();
413 exp += parm->exp_bias;
414 if (likely(exp > 0)) {
415 if (frac & round_mask) {
416 flags |= float_flag_inexact;
417 frac += inc;
418 if (frac & DECOMPOSED_OVERFLOW_BIT) {
419 frac >>= 1;
420 exp++;
423 frac >>= frac_shift;
425 if (unlikely(exp >= exp_max)) {
426 flags |= float_flag_overflow | float_flag_inexact;
427 if (overflow_norm) {
428 exp = exp_max - 1;
429 frac = -1;
430 } else {
431 p.cls = float_class_inf;
432 goto do_inf;
435 } else if (s->flush_to_zero) {
436 flags |= float_flag_output_denormal;
437 p.cls = float_class_zero;
438 goto do_zero;
439 } else {
440 bool is_tiny = (s->float_detect_tininess
441 == float_tininess_before_rounding)
442 || (exp < 0)
443 || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);
445 shift64RightJamming(frac, 1 - exp, &frac);
446 if (frac & round_mask) {
447 /* Need to recompute round-to-even. */
448 if (s->float_rounding_mode == float_round_nearest_even) {
449 inc = ((frac & roundeven_mask) != frac_lsbm1
450 ? frac_lsbm1 : 0);
452 flags |= float_flag_inexact;
453 frac += inc;
456 exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
457 frac >>= frac_shift;
459 if (is_tiny && (flags & float_flag_inexact)) {
460 flags |= float_flag_underflow;
462 if (exp == 0 && frac == 0) {
463 p.cls = float_class_zero;
466 break;
468 case float_class_zero:
469 do_zero:
470 exp = 0;
471 frac = 0;
472 break;
474 case float_class_inf:
475 do_inf:
476 exp = exp_max;
477 frac = 0;
478 break;
480 case float_class_qnan:
481 case float_class_snan:
482 exp = exp_max;
483 break;
485 default:
486 g_assert_not_reached();
489 float_raise(flags, s);
490 p.exp = exp;
491 p.frac = frac;
492 return p;
495 static FloatParts float16_unpack_canonical(float16 f, float_status *s)
497 return canonicalize(float16_unpack_raw(f), &float16_params, s);
500 static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
502 switch (p.cls) {
503 case float_class_dnan:
504 return float16_default_nan(s);
505 case float_class_msnan:
506 return float16_maybe_silence_nan(float16_pack_raw(p), s);
507 default:
508 p = round_canonical(p, s, &float16_params);
509 return float16_pack_raw(p);
513 static FloatParts float32_unpack_canonical(float32 f, float_status *s)
515 return canonicalize(float32_unpack_raw(f), &float32_params, s);
518 static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
520 switch (p.cls) {
521 case float_class_dnan:
522 return float32_default_nan(s);
523 case float_class_msnan:
524 return float32_maybe_silence_nan(float32_pack_raw(p), s);
525 default:
526 p = round_canonical(p, s, &float32_params);
527 return float32_pack_raw(p);
531 static FloatParts float64_unpack_canonical(float64 f, float_status *s)
533 return canonicalize(float64_unpack_raw(f), &float64_params, s);
536 static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
538 switch (p.cls) {
539 case float_class_dnan:
540 return float64_default_nan(s);
541 case float_class_msnan:
542 return float64_maybe_silence_nan(float64_pack_raw(p), s);
543 default:
544 p = round_canonical(p, s, &float64_params);
545 return float64_pack_raw(p);
549 /* Simple helpers for checking if what NaN we have */
550 static bool is_nan(FloatClass c)
552 return unlikely(c >= float_class_qnan);
554 static bool is_snan(FloatClass c)
556 return c == float_class_snan;
558 static bool is_qnan(FloatClass c)
560 return c == float_class_qnan;
563 static FloatParts return_nan(FloatParts a, float_status *s)
565 switch (a.cls) {
566 case float_class_snan:
567 s->float_exception_flags |= float_flag_invalid;
568 a.cls = float_class_msnan;
569 /* fall through */
570 case float_class_qnan:
571 if (s->default_nan_mode) {
572 a.cls = float_class_dnan;
574 break;
576 default:
577 g_assert_not_reached();
579 return a;
582 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
584 if (is_snan(a.cls) || is_snan(b.cls)) {
585 s->float_exception_flags |= float_flag_invalid;
588 if (s->default_nan_mode) {
589 a.cls = float_class_dnan;
590 } else {
591 if (pickNaN(is_qnan(a.cls), is_snan(a.cls),
592 is_qnan(b.cls), is_snan(b.cls),
593 a.frac > b.frac ||
594 (a.frac == b.frac && a.sign < b.sign))) {
595 a = b;
597 a.cls = float_class_msnan;
599 return a;
602 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
603 bool inf_zero, float_status *s)
605 if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
606 s->float_exception_flags |= float_flag_invalid;
609 if (s->default_nan_mode) {
610 a.cls = float_class_dnan;
611 } else {
612 switch (pickNaNMulAdd(is_qnan(a.cls), is_snan(a.cls),
613 is_qnan(b.cls), is_snan(b.cls),
614 is_qnan(c.cls), is_snan(c.cls),
615 inf_zero, s)) {
616 case 0:
617 break;
618 case 1:
619 a = b;
620 break;
621 case 2:
622 a = c;
623 break;
624 case 3:
625 a.cls = float_class_dnan;
626 return a;
627 default:
628 g_assert_not_reached();
631 a.cls = float_class_msnan;
633 return a;
637 * Returns the result of adding or subtracting the values of the
638 * floating-point values `a' and `b'. The operation is performed
639 * according to the IEC/IEEE Standard for Binary Floating-Point
640 * Arithmetic.
643 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
644 float_status *s)
646 bool a_sign = a.sign;
647 bool b_sign = b.sign ^ subtract;
649 if (a_sign != b_sign) {
650 /* Subtraction */
652 if (a.cls == float_class_normal && b.cls == float_class_normal) {
653 if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
654 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
655 a.frac = a.frac - b.frac;
656 } else {
657 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
658 a.frac = b.frac - a.frac;
659 a.exp = b.exp;
660 a_sign ^= 1;
663 if (a.frac == 0) {
664 a.cls = float_class_zero;
665 a.sign = s->float_rounding_mode == float_round_down;
666 } else {
667 int shift = clz64(a.frac) - 1;
668 a.frac = a.frac << shift;
669 a.exp = a.exp - shift;
670 a.sign = a_sign;
672 return a;
674 if (is_nan(a.cls) || is_nan(b.cls)) {
675 return pick_nan(a, b, s);
677 if (a.cls == float_class_inf) {
678 if (b.cls == float_class_inf) {
679 float_raise(float_flag_invalid, s);
680 a.cls = float_class_dnan;
682 return a;
684 if (a.cls == float_class_zero && b.cls == float_class_zero) {
685 a.sign = s->float_rounding_mode == float_round_down;
686 return a;
688 if (a.cls == float_class_zero || b.cls == float_class_inf) {
689 b.sign = a_sign ^ 1;
690 return b;
692 if (b.cls == float_class_zero) {
693 return a;
695 } else {
696 /* Addition */
697 if (a.cls == float_class_normal && b.cls == float_class_normal) {
698 if (a.exp > b.exp) {
699 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
700 } else if (a.exp < b.exp) {
701 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
702 a.exp = b.exp;
704 a.frac += b.frac;
705 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
706 a.frac >>= 1;
707 a.exp += 1;
709 return a;
711 if (is_nan(a.cls) || is_nan(b.cls)) {
712 return pick_nan(a, b, s);
714 if (a.cls == float_class_inf || b.cls == float_class_zero) {
715 return a;
717 if (b.cls == float_class_inf || a.cls == float_class_zero) {
718 b.sign = b_sign;
719 return b;
722 g_assert_not_reached();
726 * Returns the result of adding or subtracting the floating-point
727 * values `a' and `b'. The operation is performed according to the
728 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
731 float16 __attribute__((flatten)) float16_add(float16 a, float16 b,
732 float_status *status)
734 FloatParts pa = float16_unpack_canonical(a, status);
735 FloatParts pb = float16_unpack_canonical(b, status);
736 FloatParts pr = addsub_floats(pa, pb, false, status);
738 return float16_round_pack_canonical(pr, status);
741 float32 __attribute__((flatten)) float32_add(float32 a, float32 b,
742 float_status *status)
744 FloatParts pa = float32_unpack_canonical(a, status);
745 FloatParts pb = float32_unpack_canonical(b, status);
746 FloatParts pr = addsub_floats(pa, pb, false, status);
748 return float32_round_pack_canonical(pr, status);
751 float64 __attribute__((flatten)) float64_add(float64 a, float64 b,
752 float_status *status)
754 FloatParts pa = float64_unpack_canonical(a, status);
755 FloatParts pb = float64_unpack_canonical(b, status);
756 FloatParts pr = addsub_floats(pa, pb, false, status);
758 return float64_round_pack_canonical(pr, status);
761 float16 __attribute__((flatten)) float16_sub(float16 a, float16 b,
762 float_status *status)
764 FloatParts pa = float16_unpack_canonical(a, status);
765 FloatParts pb = float16_unpack_canonical(b, status);
766 FloatParts pr = addsub_floats(pa, pb, true, status);
768 return float16_round_pack_canonical(pr, status);
771 float32 __attribute__((flatten)) float32_sub(float32 a, float32 b,
772 float_status *status)
774 FloatParts pa = float32_unpack_canonical(a, status);
775 FloatParts pb = float32_unpack_canonical(b, status);
776 FloatParts pr = addsub_floats(pa, pb, true, status);
778 return float32_round_pack_canonical(pr, status);
781 float64 __attribute__((flatten)) float64_sub(float64 a, float64 b,
782 float_status *status)
784 FloatParts pa = float64_unpack_canonical(a, status);
785 FloatParts pb = float64_unpack_canonical(b, status);
786 FloatParts pr = addsub_floats(pa, pb, true, status);
788 return float64_round_pack_canonical(pr, status);
792 * Returns the result of multiplying the floating-point values `a' and
793 * `b'. The operation is performed according to the IEC/IEEE Standard
794 * for Binary Floating-Point Arithmetic.
797 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
799 bool sign = a.sign ^ b.sign;
801 if (a.cls == float_class_normal && b.cls == float_class_normal) {
802 uint64_t hi, lo;
803 int exp = a.exp + b.exp;
805 mul64To128(a.frac, b.frac, &hi, &lo);
806 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
807 if (lo & DECOMPOSED_OVERFLOW_BIT) {
808 shift64RightJamming(lo, 1, &lo);
809 exp += 1;
812 /* Re-use a */
813 a.exp = exp;
814 a.sign = sign;
815 a.frac = lo;
816 return a;
818 /* handle all the NaN cases */
819 if (is_nan(a.cls) || is_nan(b.cls)) {
820 return pick_nan(a, b, s);
822 /* Inf * Zero == NaN */
823 if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
824 (a.cls == float_class_zero && b.cls == float_class_inf)) {
825 s->float_exception_flags |= float_flag_invalid;
826 a.cls = float_class_dnan;
827 a.sign = sign;
828 return a;
830 /* Multiply by 0 or Inf */
831 if (a.cls == float_class_inf || a.cls == float_class_zero) {
832 a.sign = sign;
833 return a;
835 if (b.cls == float_class_inf || b.cls == float_class_zero) {
836 b.sign = sign;
837 return b;
839 g_assert_not_reached();
842 float16 __attribute__((flatten)) float16_mul(float16 a, float16 b,
843 float_status *status)
845 FloatParts pa = float16_unpack_canonical(a, status);
846 FloatParts pb = float16_unpack_canonical(b, status);
847 FloatParts pr = mul_floats(pa, pb, status);
849 return float16_round_pack_canonical(pr, status);
852 float32 __attribute__((flatten)) float32_mul(float32 a, float32 b,
853 float_status *status)
855 FloatParts pa = float32_unpack_canonical(a, status);
856 FloatParts pb = float32_unpack_canonical(b, status);
857 FloatParts pr = mul_floats(pa, pb, status);
859 return float32_round_pack_canonical(pr, status);
862 float64 __attribute__((flatten)) float64_mul(float64 a, float64 b,
863 float_status *status)
865 FloatParts pa = float64_unpack_canonical(a, status);
866 FloatParts pb = float64_unpack_canonical(b, status);
867 FloatParts pr = mul_floats(pa, pb, status);
869 return float64_round_pack_canonical(pr, status);
873 * Returns the result of multiplying the floating-point values `a' and
874 * `b' then adding 'c', with no intermediate rounding step after the
875 * multiplication. The operation is performed according to the
876 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
877 * The flags argument allows the caller to select negation of the
878 * addend, the intermediate product, or the final result. (The
879 * difference between this and having the caller do a separate
880 * negation is that negating externally will flip the sign bit on
881 * NaNs.)
884 static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
885 int flags, float_status *s)
887 bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
888 ((1 << float_class_inf) | (1 << float_class_zero));
889 bool p_sign;
890 bool sign_flip = flags & float_muladd_negate_result;
891 FloatClass p_class;
892 uint64_t hi, lo;
893 int p_exp;
895 /* It is implementation-defined whether the cases of (0,inf,qnan)
896 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
897 * they return if they do), so we have to hand this information
898 * off to the target-specific pick-a-NaN routine.
900 if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
901 return pick_nan_muladd(a, b, c, inf_zero, s);
904 if (inf_zero) {
905 s->float_exception_flags |= float_flag_invalid;
906 a.cls = float_class_dnan;
907 return a;
910 if (flags & float_muladd_negate_c) {
911 c.sign ^= 1;
914 p_sign = a.sign ^ b.sign;
916 if (flags & float_muladd_negate_product) {
917 p_sign ^= 1;
920 if (a.cls == float_class_inf || b.cls == float_class_inf) {
921 p_class = float_class_inf;
922 } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
923 p_class = float_class_zero;
924 } else {
925 p_class = float_class_normal;
928 if (c.cls == float_class_inf) {
929 if (p_class == float_class_inf && p_sign != c.sign) {
930 s->float_exception_flags |= float_flag_invalid;
931 a.cls = float_class_dnan;
932 } else {
933 a.cls = float_class_inf;
934 a.sign = c.sign ^ sign_flip;
936 return a;
939 if (p_class == float_class_inf) {
940 a.cls = float_class_inf;
941 a.sign = p_sign ^ sign_flip;
942 return a;
945 if (p_class == float_class_zero) {
946 if (c.cls == float_class_zero) {
947 if (p_sign != c.sign) {
948 p_sign = s->float_rounding_mode == float_round_down;
950 c.sign = p_sign;
951 } else if (flags & float_muladd_halve_result) {
952 c.exp -= 1;
954 c.sign ^= sign_flip;
955 return c;
958 /* a & b should be normals now... */
959 assert(a.cls == float_class_normal &&
960 b.cls == float_class_normal);
962 p_exp = a.exp + b.exp;
964 /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
965 * result.
967 mul64To128(a.frac, b.frac, &hi, &lo);
968 /* binary point now at bit 124 */
970 /* check for overflow */
971 if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
972 shift128RightJamming(hi, lo, 1, &hi, &lo);
973 p_exp += 1;
976 /* + add/sub */
977 if (c.cls == float_class_zero) {
978 /* move binary point back to 62 */
979 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
980 } else {
981 int exp_diff = p_exp - c.exp;
982 if (p_sign == c.sign) {
983 /* Addition */
984 if (exp_diff <= 0) {
985 shift128RightJamming(hi, lo,
986 DECOMPOSED_BINARY_POINT - exp_diff,
987 &hi, &lo);
988 lo += c.frac;
989 p_exp = c.exp;
990 } else {
991 uint64_t c_hi, c_lo;
992 /* shift c to the same binary point as the product (124) */
993 c_hi = c.frac >> 2;
994 c_lo = 0;
995 shift128RightJamming(c_hi, c_lo,
996 exp_diff,
997 &c_hi, &c_lo);
998 add128(hi, lo, c_hi, c_lo, &hi, &lo);
999 /* move binary point back to 62 */
1000 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1003 if (lo & DECOMPOSED_OVERFLOW_BIT) {
1004 shift64RightJamming(lo, 1, &lo);
1005 p_exp += 1;
1008 } else {
1009 /* Subtraction */
1010 uint64_t c_hi, c_lo;
1011 /* make C binary point match product at bit 124 */
1012 c_hi = c.frac >> 2;
1013 c_lo = 0;
1015 if (exp_diff <= 0) {
1016 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
1017 if (exp_diff == 0
1019 (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
1020 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1021 } else {
1022 sub128(c_hi, c_lo, hi, lo, &hi, &lo);
1023 p_sign ^= 1;
1024 p_exp = c.exp;
1026 } else {
1027 shift128RightJamming(c_hi, c_lo,
1028 exp_diff,
1029 &c_hi, &c_lo);
1030 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1033 if (hi == 0 && lo == 0) {
1034 a.cls = float_class_zero;
1035 a.sign = s->float_rounding_mode == float_round_down;
1036 a.sign ^= sign_flip;
1037 return a;
1038 } else {
1039 int shift;
1040 if (hi != 0) {
1041 shift = clz64(hi);
1042 } else {
1043 shift = clz64(lo) + 64;
1045 /* Normalizing to a binary point of 124 is the
1046 correct adjust for the exponent. However since we're
1047 shifting, we might as well put the binary point back
1048 at 62 where we really want it. Therefore shift as
1049 if we're leaving 1 bit at the top of the word, but
1050 adjust the exponent as if we're leaving 3 bits. */
1051 shift -= 1;
1052 if (shift >= 64) {
1053 lo = lo << (shift - 64);
1054 } else {
1055 hi = (hi << shift) | (lo >> (64 - shift));
1056 lo = hi | ((lo << shift) != 0);
1058 p_exp -= shift - 2;
1063 if (flags & float_muladd_halve_result) {
1064 p_exp -= 1;
1067 /* finally prepare our result */
1068 a.cls = float_class_normal;
1069 a.sign = p_sign ^ sign_flip;
1070 a.exp = p_exp;
1071 a.frac = lo;
1073 return a;
1076 float16 __attribute__((flatten)) float16_muladd(float16 a, float16 b, float16 c,
1077 int flags, float_status *status)
1079 FloatParts pa = float16_unpack_canonical(a, status);
1080 FloatParts pb = float16_unpack_canonical(b, status);
1081 FloatParts pc = float16_unpack_canonical(c, status);
1082 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1084 return float16_round_pack_canonical(pr, status);
1087 float32 __attribute__((flatten)) float32_muladd(float32 a, float32 b, float32 c,
1088 int flags, float_status *status)
1090 FloatParts pa = float32_unpack_canonical(a, status);
1091 FloatParts pb = float32_unpack_canonical(b, status);
1092 FloatParts pc = float32_unpack_canonical(c, status);
1093 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1095 return float32_round_pack_canonical(pr, status);
1098 float64 __attribute__((flatten)) float64_muladd(float64 a, float64 b, float64 c,
1099 int flags, float_status *status)
1101 FloatParts pa = float64_unpack_canonical(a, status);
1102 FloatParts pb = float64_unpack_canonical(b, status);
1103 FloatParts pc = float64_unpack_canonical(c, status);
1104 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1106 return float64_round_pack_canonical(pr, status);
1110 * Returns the result of dividing the floating-point value `a' by the
1111 * corresponding value `b'. The operation is performed according to
1112 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1115 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
1117 bool sign = a.sign ^ b.sign;
1119 if (a.cls == float_class_normal && b.cls == float_class_normal) {
1120 uint64_t temp_lo, temp_hi;
1121 int exp = a.exp - b.exp;
1122 if (a.frac < b.frac) {
1123 exp -= 1;
1124 shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1,
1125 &temp_hi, &temp_lo);
1126 } else {
1127 shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT,
1128 &temp_hi, &temp_lo);
1130 /* LSB of quot is set if inexact which roundandpack will use
1131 * to set flags. Yet again we re-use a for the result */
1132 a.frac = div128To64(temp_lo, temp_hi, b.frac);
1133 a.sign = sign;
1134 a.exp = exp;
1135 return a;
1137 /* handle all the NaN cases */
1138 if (is_nan(a.cls) || is_nan(b.cls)) {
1139 return pick_nan(a, b, s);
1141 /* 0/0 or Inf/Inf */
1142 if (a.cls == b.cls
1144 (a.cls == float_class_inf || a.cls == float_class_zero)) {
1145 s->float_exception_flags |= float_flag_invalid;
1146 a.cls = float_class_dnan;
1147 return a;
1149 /* Div 0 => Inf */
1150 if (b.cls == float_class_zero) {
1151 s->float_exception_flags |= float_flag_divbyzero;
1152 a.cls = float_class_inf;
1153 a.sign = sign;
1154 return a;
1156 /* Inf / x or 0 / x */
1157 if (a.cls == float_class_inf || a.cls == float_class_zero) {
1158 a.sign = sign;
1159 return a;
1161 /* Div by Inf */
1162 if (b.cls == float_class_inf) {
1163 a.cls = float_class_zero;
1164 a.sign = sign;
1165 return a;
1167 g_assert_not_reached();
1170 float16 float16_div(float16 a, float16 b, float_status *status)
1172 FloatParts pa = float16_unpack_canonical(a, status);
1173 FloatParts pb = float16_unpack_canonical(b, status);
1174 FloatParts pr = div_floats(pa, pb, status);
1176 return float16_round_pack_canonical(pr, status);
1179 float32 float32_div(float32 a, float32 b, float_status *status)
1181 FloatParts pa = float32_unpack_canonical(a, status);
1182 FloatParts pb = float32_unpack_canonical(b, status);
1183 FloatParts pr = div_floats(pa, pb, status);
1185 return float32_round_pack_canonical(pr, status);
1188 float64 float64_div(float64 a, float64 b, float_status *status)
1190 FloatParts pa = float64_unpack_canonical(a, status);
1191 FloatParts pb = float64_unpack_canonical(b, status);
1192 FloatParts pr = div_floats(pa, pb, status);
1194 return float64_round_pack_canonical(pr, status);
1198 * Rounds the floating-point value `a' to an integer, and returns the
1199 * result as a floating-point value. The operation is performed
1200 * according to the IEC/IEEE Standard for Binary Floating-Point
1201 * Arithmetic.
1204 static FloatParts round_to_int(FloatParts a, int rounding_mode, float_status *s)
1206 if (is_nan(a.cls)) {
1207 return return_nan(a, s);
1210 switch (a.cls) {
1211 case float_class_zero:
1212 case float_class_inf:
1213 case float_class_qnan:
1214 /* already "integral" */
1215 break;
1216 case float_class_normal:
1217 if (a.exp >= DECOMPOSED_BINARY_POINT) {
1218 /* already integral */
1219 break;
1221 if (a.exp < 0) {
1222 bool one;
1223 /* all fractional */
1224 s->float_exception_flags |= float_flag_inexact;
1225 switch (rounding_mode) {
1226 case float_round_nearest_even:
1227 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
1228 break;
1229 case float_round_ties_away:
1230 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
1231 break;
1232 case float_round_to_zero:
1233 one = false;
1234 break;
1235 case float_round_up:
1236 one = !a.sign;
1237 break;
1238 case float_round_down:
1239 one = a.sign;
1240 break;
1241 default:
1242 g_assert_not_reached();
1245 if (one) {
1246 a.frac = DECOMPOSED_IMPLICIT_BIT;
1247 a.exp = 0;
1248 } else {
1249 a.cls = float_class_zero;
1251 } else {
1252 uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
1253 uint64_t frac_lsbm1 = frac_lsb >> 1;
1254 uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
1255 uint64_t rnd_mask = rnd_even_mask >> 1;
1256 uint64_t inc;
1258 switch (rounding_mode) {
1259 case float_round_nearest_even:
1260 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
1261 break;
1262 case float_round_ties_away:
1263 inc = frac_lsbm1;
1264 break;
1265 case float_round_to_zero:
1266 inc = 0;
1267 break;
1268 case float_round_up:
1269 inc = a.sign ? 0 : rnd_mask;
1270 break;
1271 case float_round_down:
1272 inc = a.sign ? rnd_mask : 0;
1273 break;
1274 default:
1275 g_assert_not_reached();
1278 if (a.frac & rnd_mask) {
1279 s->float_exception_flags |= float_flag_inexact;
1280 a.frac += inc;
1281 a.frac &= ~rnd_mask;
1282 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
1283 a.frac >>= 1;
1284 a.exp++;
1288 break;
1289 default:
1290 g_assert_not_reached();
1292 return a;
1295 float16 float16_round_to_int(float16 a, float_status *s)
1297 FloatParts pa = float16_unpack_canonical(a, s);
1298 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1299 return float16_round_pack_canonical(pr, s);
1302 float32 float32_round_to_int(float32 a, float_status *s)
1304 FloatParts pa = float32_unpack_canonical(a, s);
1305 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1306 return float32_round_pack_canonical(pr, s);
1309 float64 float64_round_to_int(float64 a, float_status *s)
1311 FloatParts pa = float64_unpack_canonical(a, s);
1312 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1313 return float64_round_pack_canonical(pr, s);
1316 float64 float64_trunc_to_int(float64 a, float_status *s)
1318 FloatParts pa = float64_unpack_canonical(a, s);
1319 FloatParts pr = round_to_int(pa, float_round_to_zero, s);
1320 return float64_round_pack_canonical(pr, s);
1324 * Returns the result of converting the floating-point value `a' to
1325 * the two's complement integer format. The conversion is performed
1326 * according to the IEC/IEEE Standard for Binary Floating-Point
1327 * Arithmetic---which means in particular that the conversion is
1328 * rounded according to the current rounding mode. If `a' is a NaN,
1329 * the largest positive integer is returned. Otherwise, if the
1330 * conversion overflows, the largest integer with the same sign as `a'
1331 * is returned.
1334 static int64_t round_to_int_and_pack(FloatParts in, int rmode,
1335 int64_t min, int64_t max,
1336 float_status *s)
1338 uint64_t r;
1339 int orig_flags = get_float_exception_flags(s);
1340 FloatParts p = round_to_int(in, rmode, s);
1342 switch (p.cls) {
1343 case float_class_snan:
1344 case float_class_qnan:
1345 case float_class_dnan:
1346 case float_class_msnan:
1347 return max;
1348 case float_class_inf:
1349 return p.sign ? min : max;
1350 case float_class_zero:
1351 return 0;
1352 case float_class_normal:
1353 if (p.exp < DECOMPOSED_BINARY_POINT) {
1354 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1355 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1356 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1357 } else {
1358 r = UINT64_MAX;
1360 if (p.sign) {
1361 if (r < -(uint64_t) min) {
1362 return -r;
1363 } else {
1364 s->float_exception_flags = orig_flags | float_flag_invalid;
1365 return min;
1367 } else {
1368 if (r < max) {
1369 return r;
1370 } else {
1371 s->float_exception_flags = orig_flags | float_flag_invalid;
1372 return max;
1375 default:
1376 g_assert_not_reached();
1380 #define FLOAT_TO_INT(fsz, isz) \
1381 int ## isz ## _t float ## fsz ## _to_int ## isz(float ## fsz a, \
1382 float_status *s) \
1384 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1385 return round_to_int_and_pack(p, s->float_rounding_mode, \
1386 INT ## isz ## _MIN, INT ## isz ## _MAX,\
1387 s); \
1390 int ## isz ## _t float ## fsz ## _to_int ## isz ## _round_to_zero \
1391 (float ## fsz a, float_status *s) \
1393 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1394 return round_to_int_and_pack(p, float_round_to_zero, \
1395 INT ## isz ## _MIN, INT ## isz ## _MAX,\
1396 s); \
1399 FLOAT_TO_INT(16, 16)
1400 FLOAT_TO_INT(16, 32)
1401 FLOAT_TO_INT(16, 64)
1403 FLOAT_TO_INT(32, 16)
1404 FLOAT_TO_INT(32, 32)
1405 FLOAT_TO_INT(32, 64)
1407 FLOAT_TO_INT(64, 16)
1408 FLOAT_TO_INT(64, 32)
1409 FLOAT_TO_INT(64, 64)
1411 #undef FLOAT_TO_INT
1414 * Returns the result of converting the floating-point value `a' to
1415 * the unsigned integer format. The conversion is performed according
1416 * to the IEC/IEEE Standard for Binary Floating-Point
1417 * Arithmetic---which means in particular that the conversion is
1418 * rounded according to the current rounding mode. If `a' is a NaN,
1419 * the largest unsigned integer is returned. Otherwise, if the
1420 * conversion overflows, the largest unsigned integer is returned. If
1421 * the 'a' is negative, the result is rounded and zero is returned;
1422 * values that do not round to zero will raise the inexact exception
1423 * flag.
1426 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, uint64_t max,
1427 float_status *s)
1429 int orig_flags = get_float_exception_flags(s);
1430 FloatParts p = round_to_int(in, rmode, s);
1432 switch (p.cls) {
1433 case float_class_snan:
1434 case float_class_qnan:
1435 case float_class_dnan:
1436 case float_class_msnan:
1437 s->float_exception_flags = orig_flags | float_flag_invalid;
1438 return max;
1439 case float_class_inf:
1440 return p.sign ? 0 : max;
1441 case float_class_zero:
1442 return 0;
1443 case float_class_normal:
1445 uint64_t r;
1446 if (p.sign) {
1447 s->float_exception_flags = orig_flags | float_flag_invalid;
1448 return 0;
1451 if (p.exp < DECOMPOSED_BINARY_POINT) {
1452 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1453 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1454 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1455 } else {
1456 s->float_exception_flags = orig_flags | float_flag_invalid;
1457 return max;
1460 /* For uint64 this will never trip, but if p.exp is too large
1461 * to shift a decomposed fraction we shall have exited via the
1462 * 3rd leg above.
1464 if (r > max) {
1465 s->float_exception_flags = orig_flags | float_flag_invalid;
1466 return max;
1467 } else {
1468 return r;
1471 default:
1472 g_assert_not_reached();
1476 #define FLOAT_TO_UINT(fsz, isz) \
1477 uint ## isz ## _t float ## fsz ## _to_uint ## isz(float ## fsz a, \
1478 float_status *s) \
1480 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1481 return round_to_uint_and_pack(p, s->float_rounding_mode, \
1482 UINT ## isz ## _MAX, s); \
1485 uint ## isz ## _t float ## fsz ## _to_uint ## isz ## _round_to_zero \
1486 (float ## fsz a, float_status *s) \
1488 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1489 return round_to_uint_and_pack(p, s->float_rounding_mode, \
1490 UINT ## isz ## _MAX, s); \
1493 FLOAT_TO_UINT(16, 16)
1494 FLOAT_TO_UINT(16, 32)
1495 FLOAT_TO_UINT(16, 64)
1497 FLOAT_TO_UINT(32, 16)
1498 FLOAT_TO_UINT(32, 32)
1499 FLOAT_TO_UINT(32, 64)
1501 FLOAT_TO_UINT(64, 16)
1502 FLOAT_TO_UINT(64, 32)
1503 FLOAT_TO_UINT(64, 64)
1505 #undef FLOAT_TO_UINT
1508 * Integer to float conversions
1510 * Returns the result of converting the two's complement integer `a'
1511 * to the floating-point format. The conversion is performed according
1512 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1515 static FloatParts int_to_float(int64_t a, float_status *status)
1517 FloatParts r;
1518 if (a == 0) {
1519 r.cls = float_class_zero;
1520 r.sign = false;
1521 } else if (a == (1ULL << 63)) {
1522 r.cls = float_class_normal;
1523 r.sign = true;
1524 r.frac = DECOMPOSED_IMPLICIT_BIT;
1525 r.exp = 63;
1526 } else {
1527 uint64_t f;
1528 if (a < 0) {
1529 f = -a;
1530 r.sign = true;
1531 } else {
1532 f = a;
1533 r.sign = false;
1535 int shift = clz64(f) - 1;
1536 r.cls = float_class_normal;
1537 r.exp = (DECOMPOSED_BINARY_POINT - shift);
1538 r.frac = f << shift;
1541 return r;
1544 float16 int64_to_float16(int64_t a, float_status *status)
1546 FloatParts pa = int_to_float(a, status);
1547 return float16_round_pack_canonical(pa, status);
1550 float16 int32_to_float16(int32_t a, float_status *status)
1552 return int64_to_float16(a, status);
1555 float16 int16_to_float16(int16_t a, float_status *status)
1557 return int64_to_float16(a, status);
1560 float32 int64_to_float32(int64_t a, float_status *status)
1562 FloatParts pa = int_to_float(a, status);
1563 return float32_round_pack_canonical(pa, status);
1566 float32 int32_to_float32(int32_t a, float_status *status)
1568 return int64_to_float32(a, status);
1571 float32 int16_to_float32(int16_t a, float_status *status)
1573 return int64_to_float32(a, status);
1576 float64 int64_to_float64(int64_t a, float_status *status)
1578 FloatParts pa = int_to_float(a, status);
1579 return float64_round_pack_canonical(pa, status);
1582 float64 int32_to_float64(int32_t a, float_status *status)
1584 return int64_to_float64(a, status);
1587 float64 int16_to_float64(int16_t a, float_status *status)
1589 return int64_to_float64(a, status);
1594 * Unsigned Integer to float conversions
1596 * Returns the result of converting the unsigned integer `a' to the
1597 * floating-point format. The conversion is performed according to the
1598 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1601 static FloatParts uint_to_float(uint64_t a, float_status *status)
1603 FloatParts r = { .sign = false};
1605 if (a == 0) {
1606 r.cls = float_class_zero;
1607 } else {
1608 int spare_bits = clz64(a) - 1;
1609 r.cls = float_class_normal;
1610 r.exp = DECOMPOSED_BINARY_POINT - spare_bits;
1611 if (spare_bits < 0) {
1612 shift64RightJamming(a, -spare_bits, &a);
1613 r.frac = a;
1614 } else {
1615 r.frac = a << spare_bits;
1619 return r;
1622 float16 uint64_to_float16(uint64_t a, float_status *status)
1624 FloatParts pa = uint_to_float(a, status);
1625 return float16_round_pack_canonical(pa, status);
1628 float16 uint32_to_float16(uint32_t a, float_status *status)
1630 return uint64_to_float16(a, status);
1633 float16 uint16_to_float16(uint16_t a, float_status *status)
1635 return uint64_to_float16(a, status);
1638 float32 uint64_to_float32(uint64_t a, float_status *status)
1640 FloatParts pa = uint_to_float(a, status);
1641 return float32_round_pack_canonical(pa, status);
1644 float32 uint32_to_float32(uint32_t a, float_status *status)
1646 return uint64_to_float32(a, status);
1649 float32 uint16_to_float32(uint16_t a, float_status *status)
1651 return uint64_to_float32(a, status);
1654 float64 uint64_to_float64(uint64_t a, float_status *status)
1656 FloatParts pa = uint_to_float(a, status);
1657 return float64_round_pack_canonical(pa, status);
1660 float64 uint32_to_float64(uint32_t a, float_status *status)
1662 return uint64_to_float64(a, status);
1665 float64 uint16_to_float64(uint16_t a, float_status *status)
1667 return uint64_to_float64(a, status);
1670 /* Float Min/Max */
1671 /* min() and max() functions. These can't be implemented as
1672 * 'compare and pick one input' because that would mishandle
1673 * NaNs and +0 vs -0.
1675 * minnum() and maxnum() functions. These are similar to the min()
1676 * and max() functions but if one of the arguments is a QNaN and
1677 * the other is numerical then the numerical argument is returned.
1678 * SNaNs will get quietened before being returned.
1679 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
1680 * and maxNum() operations. min() and max() are the typical min/max
1681 * semantics provided by many CPUs which predate that specification.
1683 * minnummag() and maxnummag() functions correspond to minNumMag()
1684 * and minNumMag() from the IEEE-754 2008.
1686 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin,
1687 bool ieee, bool ismag, float_status *s)
1689 if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
1690 if (ieee) {
1691 /* Takes two floating-point values `a' and `b', one of
1692 * which is a NaN, and returns the appropriate NaN
1693 * result. If either `a' or `b' is a signaling NaN,
1694 * the invalid exception is raised.
1696 if (is_snan(a.cls) || is_snan(b.cls)) {
1697 return pick_nan(a, b, s);
1698 } else if (is_nan(a.cls) && !is_nan(b.cls)) {
1699 return b;
1700 } else if (is_nan(b.cls) && !is_nan(a.cls)) {
1701 return a;
1704 return pick_nan(a, b, s);
1705 } else {
1706 int a_exp, b_exp;
1707 bool a_sign, b_sign;
1709 switch (a.cls) {
1710 case float_class_normal:
1711 a_exp = a.exp;
1712 break;
1713 case float_class_inf:
1714 a_exp = INT_MAX;
1715 break;
1716 case float_class_zero:
1717 a_exp = INT_MIN;
1718 break;
1719 default:
1720 g_assert_not_reached();
1721 break;
1723 switch (b.cls) {
1724 case float_class_normal:
1725 b_exp = b.exp;
1726 break;
1727 case float_class_inf:
1728 b_exp = INT_MAX;
1729 break;
1730 case float_class_zero:
1731 b_exp = INT_MIN;
1732 break;
1733 default:
1734 g_assert_not_reached();
1735 break;
1738 a_sign = a.sign;
1739 b_sign = b.sign;
1740 if (ismag) {
1741 a_sign = b_sign = 0;
1744 if (a_sign == b_sign) {
1745 bool a_less = a_exp < b_exp;
1746 if (a_exp == b_exp) {
1747 a_less = a.frac < b.frac;
1749 return a_sign ^ a_less ^ ismin ? b : a;
1750 } else {
1751 return a_sign ^ ismin ? b : a;
1756 #define MINMAX(sz, name, ismin, isiee, ismag) \
1757 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \
1758 float_status *s) \
1760 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
1761 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
1762 FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \
1764 return float ## sz ## _round_pack_canonical(pr, s); \
1767 MINMAX(16, min, true, false, false)
1768 MINMAX(16, minnum, true, true, false)
1769 MINMAX(16, minnummag, true, true, true)
1770 MINMAX(16, max, false, false, false)
1771 MINMAX(16, maxnum, false, true, false)
1772 MINMAX(16, maxnummag, false, true, true)
1774 MINMAX(32, min, true, false, false)
1775 MINMAX(32, minnum, true, true, false)
1776 MINMAX(32, minnummag, true, true, true)
1777 MINMAX(32, max, false, false, false)
1778 MINMAX(32, maxnum, false, true, false)
1779 MINMAX(32, maxnummag, false, true, true)
1781 MINMAX(64, min, true, false, false)
1782 MINMAX(64, minnum, true, true, false)
1783 MINMAX(64, minnummag, true, true, true)
1784 MINMAX(64, max, false, false, false)
1785 MINMAX(64, maxnum, false, true, false)
1786 MINMAX(64, maxnummag, false, true, true)
1788 #undef MINMAX
1790 /* Floating point compare */
1791 static int compare_floats(FloatParts a, FloatParts b, bool is_quiet,
1792 float_status *s)
1794 if (is_nan(a.cls) || is_nan(b.cls)) {
1795 if (!is_quiet ||
1796 a.cls == float_class_snan ||
1797 b.cls == float_class_snan) {
1798 s->float_exception_flags |= float_flag_invalid;
1800 return float_relation_unordered;
1803 if (a.cls == float_class_zero) {
1804 if (b.cls == float_class_zero) {
1805 return float_relation_equal;
1807 return b.sign ? float_relation_greater : float_relation_less;
1808 } else if (b.cls == float_class_zero) {
1809 return a.sign ? float_relation_less : float_relation_greater;
1812 /* The only really important thing about infinity is its sign. If
1813 * both are infinities the sign marks the smallest of the two.
1815 if (a.cls == float_class_inf) {
1816 if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
1817 return float_relation_equal;
1819 return a.sign ? float_relation_less : float_relation_greater;
1820 } else if (b.cls == float_class_inf) {
1821 return b.sign ? float_relation_greater : float_relation_less;
1824 if (a.sign != b.sign) {
1825 return a.sign ? float_relation_less : float_relation_greater;
1828 if (a.exp == b.exp) {
1829 if (a.frac == b.frac) {
1830 return float_relation_equal;
1832 if (a.sign) {
1833 return a.frac > b.frac ?
1834 float_relation_less : float_relation_greater;
1835 } else {
1836 return a.frac > b.frac ?
1837 float_relation_greater : float_relation_less;
1839 } else {
1840 if (a.sign) {
1841 return a.exp > b.exp ? float_relation_less : float_relation_greater;
1842 } else {
1843 return a.exp > b.exp ? float_relation_greater : float_relation_less;
1848 #define COMPARE(sz) \
1849 int float ## sz ## _compare(float ## sz a, float ## sz b, \
1850 float_status *s) \
1852 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
1853 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
1854 return compare_floats(pa, pb, false, s); \
1856 int float ## sz ## _compare_quiet(float ## sz a, float ## sz b, \
1857 float_status *s) \
1859 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
1860 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
1861 return compare_floats(pa, pb, true, s); \
1864 COMPARE(16)
1865 COMPARE(32)
1866 COMPARE(64)
1868 #undef COMPARE
1870 /* Multiply A by 2 raised to the power N. */
1871 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s)
1873 if (unlikely(is_nan(a.cls))) {
1874 return return_nan(a, s);
1876 if (a.cls == float_class_normal) {
1877 a.exp += n;
1879 return a;
1882 float16 float16_scalbn(float16 a, int n, float_status *status)
1884 FloatParts pa = float16_unpack_canonical(a, status);
1885 FloatParts pr = scalbn_decomposed(pa, n, status);
1886 return float16_round_pack_canonical(pr, status);
1889 float32 float32_scalbn(float32 a, int n, float_status *status)
1891 FloatParts pa = float32_unpack_canonical(a, status);
1892 FloatParts pr = scalbn_decomposed(pa, n, status);
1893 return float32_round_pack_canonical(pr, status);
1896 float64 float64_scalbn(float64 a, int n, float_status *status)
1898 FloatParts pa = float64_unpack_canonical(a, status);
1899 FloatParts pr = scalbn_decomposed(pa, n, status);
1900 return float64_round_pack_canonical(pr, status);
1904 * Square Root
1906 * The old softfloat code did an approximation step before zeroing in
1907 * on the final result. However for simpleness we just compute the
1908 * square root by iterating down from the implicit bit to enough extra
1909 * bits to ensure we get a correctly rounded result.
1911 * This does mean however the calculation is slower than before,
1912 * especially for 64 bit floats.
1915 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p)
1917 uint64_t a_frac, r_frac, s_frac;
1918 int bit, last_bit;
1920 if (is_nan(a.cls)) {
1921 return return_nan(a, s);
1923 if (a.cls == float_class_zero) {
1924 return a; /* sqrt(+-0) = +-0 */
1926 if (a.sign) {
1927 s->float_exception_flags |= float_flag_invalid;
1928 a.cls = float_class_dnan;
1929 return a;
1931 if (a.cls == float_class_inf) {
1932 return a; /* sqrt(+inf) = +inf */
1935 assert(a.cls == float_class_normal);
1937 /* We need two overflow bits at the top. Adding room for that is a
1938 * right shift. If the exponent is odd, we can discard the low bit
1939 * by multiplying the fraction by 2; that's a left shift. Combine
1940 * those and we shift right if the exponent is even.
1942 a_frac = a.frac;
1943 if (!(a.exp & 1)) {
1944 a_frac >>= 1;
1946 a.exp >>= 1;
1948 /* Bit-by-bit computation of sqrt. */
1949 r_frac = 0;
1950 s_frac = 0;
1952 /* Iterate from implicit bit down to the 3 extra bits to compute a
1953 * properly rounded result. Remember we've inserted one more bit
1954 * at the top, so these positions are one less.
1956 bit = DECOMPOSED_BINARY_POINT - 1;
1957 last_bit = MAX(p->frac_shift - 4, 0);
1958 do {
1959 uint64_t q = 1ULL << bit;
1960 uint64_t t_frac = s_frac + q;
1961 if (t_frac <= a_frac) {
1962 s_frac = t_frac + q;
1963 a_frac -= t_frac;
1964 r_frac += q;
1966 a_frac <<= 1;
1967 } while (--bit >= last_bit);
1969 /* Undo the right shift done above. If there is any remaining
1970 * fraction, the result is inexact. Set the sticky bit.
1972 a.frac = (r_frac << 1) + (a_frac != 0);
1974 return a;
1977 float16 __attribute__((flatten)) float16_sqrt(float16 a, float_status *status)
1979 FloatParts pa = float16_unpack_canonical(a, status);
1980 FloatParts pr = sqrt_float(pa, status, &float16_params);
1981 return float16_round_pack_canonical(pr, status);
1984 float32 __attribute__((flatten)) float32_sqrt(float32 a, float_status *status)
1986 FloatParts pa = float32_unpack_canonical(a, status);
1987 FloatParts pr = sqrt_float(pa, status, &float32_params);
1988 return float32_round_pack_canonical(pr, status);
1991 float64 __attribute__((flatten)) float64_sqrt(float64 a, float_status *status)
1993 FloatParts pa = float64_unpack_canonical(a, status);
1994 FloatParts pr = sqrt_float(pa, status, &float64_params);
1995 return float64_round_pack_canonical(pr, status);
1999 /*----------------------------------------------------------------------------
2000 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
2001 | and 7, and returns the properly rounded 32-bit integer corresponding to the
2002 | input. If `zSign' is 1, the input is negated before being converted to an
2003 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
2004 | is simply rounded to an integer, with the inexact exception raised if the
2005 | input cannot be represented exactly as an integer. However, if the fixed-
2006 | point input is too large, the invalid exception is raised and the largest
2007 | positive or negative integer is returned.
2008 *----------------------------------------------------------------------------*/
2010 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
2012 int8_t roundingMode;
2013 flag roundNearestEven;
2014 int8_t roundIncrement, roundBits;
2015 int32_t z;
2017 roundingMode = status->float_rounding_mode;
2018 roundNearestEven = ( roundingMode == float_round_nearest_even );
2019 switch (roundingMode) {
2020 case float_round_nearest_even:
2021 case float_round_ties_away:
2022 roundIncrement = 0x40;
2023 break;
2024 case float_round_to_zero:
2025 roundIncrement = 0;
2026 break;
2027 case float_round_up:
2028 roundIncrement = zSign ? 0 : 0x7f;
2029 break;
2030 case float_round_down:
2031 roundIncrement = zSign ? 0x7f : 0;
2032 break;
2033 default:
2034 abort();
2036 roundBits = absZ & 0x7F;
2037 absZ = ( absZ + roundIncrement )>>7;
2038 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2039 z = absZ;
2040 if ( zSign ) z = - z;
2041 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
2042 float_raise(float_flag_invalid, status);
2043 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2045 if (roundBits) {
2046 status->float_exception_flags |= float_flag_inexact;
2048 return z;
2052 /*----------------------------------------------------------------------------
2053 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2054 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2055 | and returns the properly rounded 64-bit integer corresponding to the input.
2056 | If `zSign' is 1, the input is negated before being converted to an integer.
2057 | Ordinarily, the fixed-point input is simply rounded to an integer, with
2058 | the inexact exception raised if the input cannot be represented exactly as
2059 | an integer. However, if the fixed-point input is too large, the invalid
2060 | exception is raised and the largest positive or negative integer is
2061 | returned.
2062 *----------------------------------------------------------------------------*/
2064 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
2065 float_status *status)
2067 int8_t roundingMode;
2068 flag roundNearestEven, increment;
2069 int64_t z;
2071 roundingMode = status->float_rounding_mode;
2072 roundNearestEven = ( roundingMode == float_round_nearest_even );
2073 switch (roundingMode) {
2074 case float_round_nearest_even:
2075 case float_round_ties_away:
2076 increment = ((int64_t) absZ1 < 0);
2077 break;
2078 case float_round_to_zero:
2079 increment = 0;
2080 break;
2081 case float_round_up:
2082 increment = !zSign && absZ1;
2083 break;
2084 case float_round_down:
2085 increment = zSign && absZ1;
2086 break;
2087 default:
2088 abort();
2090 if ( increment ) {
2091 ++absZ0;
2092 if ( absZ0 == 0 ) goto overflow;
2093 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
2095 z = absZ0;
2096 if ( zSign ) z = - z;
2097 if ( z && ( ( z < 0 ) ^ zSign ) ) {
2098 overflow:
2099 float_raise(float_flag_invalid, status);
2100 return
2101 zSign ? (int64_t) LIT64( 0x8000000000000000 )
2102 : LIT64( 0x7FFFFFFFFFFFFFFF );
2104 if (absZ1) {
2105 status->float_exception_flags |= float_flag_inexact;
2107 return z;
2111 /*----------------------------------------------------------------------------
2112 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2113 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2114 | and returns the properly rounded 64-bit unsigned integer corresponding to the
2115 | input. Ordinarily, the fixed-point input is simply rounded to an integer,
2116 | with the inexact exception raised if the input cannot be represented exactly
2117 | as an integer. However, if the fixed-point input is too large, the invalid
2118 | exception is raised and the largest unsigned integer is returned.
2119 *----------------------------------------------------------------------------*/
2121 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
2122 uint64_t absZ1, float_status *status)
2124 int8_t roundingMode;
2125 flag roundNearestEven, increment;
2127 roundingMode = status->float_rounding_mode;
2128 roundNearestEven = (roundingMode == float_round_nearest_even);
2129 switch (roundingMode) {
2130 case float_round_nearest_even:
2131 case float_round_ties_away:
2132 increment = ((int64_t)absZ1 < 0);
2133 break;
2134 case float_round_to_zero:
2135 increment = 0;
2136 break;
2137 case float_round_up:
2138 increment = !zSign && absZ1;
2139 break;
2140 case float_round_down:
2141 increment = zSign && absZ1;
2142 break;
2143 default:
2144 abort();
2146 if (increment) {
2147 ++absZ0;
2148 if (absZ0 == 0) {
2149 float_raise(float_flag_invalid, status);
2150 return LIT64(0xFFFFFFFFFFFFFFFF);
2152 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
2155 if (zSign && absZ0) {
2156 float_raise(float_flag_invalid, status);
2157 return 0;
2160 if (absZ1) {
2161 status->float_exception_flags |= float_flag_inexact;
2163 return absZ0;
2166 /*----------------------------------------------------------------------------
2167 | If `a' is denormal and we are in flush-to-zero mode then set the
2168 | input-denormal exception and return zero. Otherwise just return the value.
2169 *----------------------------------------------------------------------------*/
2170 float32 float32_squash_input_denormal(float32 a, float_status *status)
2172 if (status->flush_inputs_to_zero) {
2173 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
2174 float_raise(float_flag_input_denormal, status);
2175 return make_float32(float32_val(a) & 0x80000000);
2178 return a;
2181 /*----------------------------------------------------------------------------
2182 | Normalizes the subnormal single-precision floating-point value represented
2183 | by the denormalized significand `aSig'. The normalized exponent and
2184 | significand are stored at the locations pointed to by `zExpPtr' and
2185 | `zSigPtr', respectively.
2186 *----------------------------------------------------------------------------*/
2188 static void
2189 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
2191 int8_t shiftCount;
2193 shiftCount = countLeadingZeros32( aSig ) - 8;
2194 *zSigPtr = aSig<<shiftCount;
2195 *zExpPtr = 1 - shiftCount;
2199 /*----------------------------------------------------------------------------
2200 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2201 | and significand `zSig', and returns the proper single-precision floating-
2202 | point value corresponding to the abstract input. Ordinarily, the abstract
2203 | value is simply rounded and packed into the single-precision format, with
2204 | the inexact exception raised if the abstract input cannot be represented
2205 | exactly. However, if the abstract value is too large, the overflow and
2206 | inexact exceptions are raised and an infinity or maximal finite value is
2207 | returned. If the abstract value is too small, the input value is rounded to
2208 | a subnormal number, and the underflow and inexact exceptions are raised if
2209 | the abstract input cannot be represented exactly as a subnormal single-
2210 | precision floating-point number.
2211 | The input significand `zSig' has its binary point between bits 30
2212 | and 29, which is 7 bits to the left of the usual location. This shifted
2213 | significand must be normalized or smaller. If `zSig' is not normalized,
2214 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2215 | and it must not require rounding. In the usual case that `zSig' is
2216 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2217 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2218 | Binary Floating-Point Arithmetic.
2219 *----------------------------------------------------------------------------*/
2221 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
2222 float_status *status)
2224 int8_t roundingMode;
2225 flag roundNearestEven;
2226 int8_t roundIncrement, roundBits;
2227 flag isTiny;
2229 roundingMode = status->float_rounding_mode;
2230 roundNearestEven = ( roundingMode == float_round_nearest_even );
2231 switch (roundingMode) {
2232 case float_round_nearest_even:
2233 case float_round_ties_away:
2234 roundIncrement = 0x40;
2235 break;
2236 case float_round_to_zero:
2237 roundIncrement = 0;
2238 break;
2239 case float_round_up:
2240 roundIncrement = zSign ? 0 : 0x7f;
2241 break;
2242 case float_round_down:
2243 roundIncrement = zSign ? 0x7f : 0;
2244 break;
2245 default:
2246 abort();
2247 break;
2249 roundBits = zSig & 0x7F;
2250 if ( 0xFD <= (uint16_t) zExp ) {
2251 if ( ( 0xFD < zExp )
2252 || ( ( zExp == 0xFD )
2253 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
2255 float_raise(float_flag_overflow | float_flag_inexact, status);
2256 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
2258 if ( zExp < 0 ) {
2259 if (status->flush_to_zero) {
2260 float_raise(float_flag_output_denormal, status);
2261 return packFloat32(zSign, 0, 0);
2263 isTiny =
2264 (status->float_detect_tininess
2265 == float_tininess_before_rounding)
2266 || ( zExp < -1 )
2267 || ( zSig + roundIncrement < 0x80000000 );
2268 shift32RightJamming( zSig, - zExp, &zSig );
2269 zExp = 0;
2270 roundBits = zSig & 0x7F;
2271 if (isTiny && roundBits) {
2272 float_raise(float_flag_underflow, status);
2276 if (roundBits) {
2277 status->float_exception_flags |= float_flag_inexact;
2279 zSig = ( zSig + roundIncrement )>>7;
2280 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2281 if ( zSig == 0 ) zExp = 0;
2282 return packFloat32( zSign, zExp, zSig );
2286 /*----------------------------------------------------------------------------
2287 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2288 | and significand `zSig', and returns the proper single-precision floating-
2289 | point value corresponding to the abstract input. This routine is just like
2290 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
2291 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2292 | floating-point exponent.
2293 *----------------------------------------------------------------------------*/
2295 static float32
2296 normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
2297 float_status *status)
2299 int8_t shiftCount;
2301 shiftCount = countLeadingZeros32( zSig ) - 1;
2302 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
2303 status);
2307 /*----------------------------------------------------------------------------
2308 | If `a' is denormal and we are in flush-to-zero mode then set the
2309 | input-denormal exception and return zero. Otherwise just return the value.
2310 *----------------------------------------------------------------------------*/
2311 float64 float64_squash_input_denormal(float64 a, float_status *status)
2313 if (status->flush_inputs_to_zero) {
2314 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
2315 float_raise(float_flag_input_denormal, status);
2316 return make_float64(float64_val(a) & (1ULL << 63));
2319 return a;
2322 /*----------------------------------------------------------------------------
2323 | Normalizes the subnormal double-precision floating-point value represented
2324 | by the denormalized significand `aSig'. The normalized exponent and
2325 | significand are stored at the locations pointed to by `zExpPtr' and
2326 | `zSigPtr', respectively.
2327 *----------------------------------------------------------------------------*/
2329 static void
2330 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
2332 int8_t shiftCount;
2334 shiftCount = countLeadingZeros64( aSig ) - 11;
2335 *zSigPtr = aSig<<shiftCount;
2336 *zExpPtr = 1 - shiftCount;
2340 /*----------------------------------------------------------------------------
2341 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2342 | double-precision floating-point value, returning the result. After being
2343 | shifted into the proper positions, the three fields are simply added
2344 | together to form the result. This means that any integer portion of `zSig'
2345 | will be added into the exponent. Since a properly normalized significand
2346 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2347 | than the desired result exponent whenever `zSig' is a complete, normalized
2348 | significand.
2349 *----------------------------------------------------------------------------*/
2351 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
2354 return make_float64(
2355 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
2359 /*----------------------------------------------------------------------------
2360 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2361 | and significand `zSig', and returns the proper double-precision floating-
2362 | point value corresponding to the abstract input. Ordinarily, the abstract
2363 | value is simply rounded and packed into the double-precision format, with
2364 | the inexact exception raised if the abstract input cannot be represented
2365 | exactly. However, if the abstract value is too large, the overflow and
2366 | inexact exceptions are raised and an infinity or maximal finite value is
2367 | returned. If the abstract value is too small, the input value is rounded to
2368 | a subnormal number, and the underflow and inexact exceptions are raised if
2369 | the abstract input cannot be represented exactly as a subnormal double-
2370 | precision floating-point number.
2371 | The input significand `zSig' has its binary point between bits 62
2372 | and 61, which is 10 bits to the left of the usual location. This shifted
2373 | significand must be normalized or smaller. If `zSig' is not normalized,
2374 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2375 | and it must not require rounding. In the usual case that `zSig' is
2376 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2377 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2378 | Binary Floating-Point Arithmetic.
2379 *----------------------------------------------------------------------------*/
2381 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2382 float_status *status)
2384 int8_t roundingMode;
2385 flag roundNearestEven;
2386 int roundIncrement, roundBits;
2387 flag isTiny;
2389 roundingMode = status->float_rounding_mode;
2390 roundNearestEven = ( roundingMode == float_round_nearest_even );
2391 switch (roundingMode) {
2392 case float_round_nearest_even:
2393 case float_round_ties_away:
2394 roundIncrement = 0x200;
2395 break;
2396 case float_round_to_zero:
2397 roundIncrement = 0;
2398 break;
2399 case float_round_up:
2400 roundIncrement = zSign ? 0 : 0x3ff;
2401 break;
2402 case float_round_down:
2403 roundIncrement = zSign ? 0x3ff : 0;
2404 break;
2405 case float_round_to_odd:
2406 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2407 break;
2408 default:
2409 abort();
2411 roundBits = zSig & 0x3FF;
2412 if ( 0x7FD <= (uint16_t) zExp ) {
2413 if ( ( 0x7FD < zExp )
2414 || ( ( zExp == 0x7FD )
2415 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
2417 bool overflow_to_inf = roundingMode != float_round_to_odd &&
2418 roundIncrement != 0;
2419 float_raise(float_flag_overflow | float_flag_inexact, status);
2420 return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
2422 if ( zExp < 0 ) {
2423 if (status->flush_to_zero) {
2424 float_raise(float_flag_output_denormal, status);
2425 return packFloat64(zSign, 0, 0);
2427 isTiny =
2428 (status->float_detect_tininess
2429 == float_tininess_before_rounding)
2430 || ( zExp < -1 )
2431 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
2432 shift64RightJamming( zSig, - zExp, &zSig );
2433 zExp = 0;
2434 roundBits = zSig & 0x3FF;
2435 if (isTiny && roundBits) {
2436 float_raise(float_flag_underflow, status);
2438 if (roundingMode == float_round_to_odd) {
2440 * For round-to-odd case, the roundIncrement depends on
2441 * zSig which just changed.
2443 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2447 if (roundBits) {
2448 status->float_exception_flags |= float_flag_inexact;
2450 zSig = ( zSig + roundIncrement )>>10;
2451 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
2452 if ( zSig == 0 ) zExp = 0;
2453 return packFloat64( zSign, zExp, zSig );
2457 /*----------------------------------------------------------------------------
2458 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2459 | and significand `zSig', and returns the proper double-precision floating-
2460 | point value corresponding to the abstract input. This routine is just like
2461 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
2462 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2463 | floating-point exponent.
2464 *----------------------------------------------------------------------------*/
2466 static float64
2467 normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2468 float_status *status)
2470 int8_t shiftCount;
2472 shiftCount = countLeadingZeros64( zSig ) - 1;
2473 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
2474 status);
2478 /*----------------------------------------------------------------------------
2479 | Normalizes the subnormal extended double-precision floating-point value
2480 | represented by the denormalized significand `aSig'. The normalized exponent
2481 | and significand are stored at the locations pointed to by `zExpPtr' and
2482 | `zSigPtr', respectively.
2483 *----------------------------------------------------------------------------*/
2485 void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
2486 uint64_t *zSigPtr)
2488 int8_t shiftCount;
2490 shiftCount = countLeadingZeros64( aSig );
2491 *zSigPtr = aSig<<shiftCount;
2492 *zExpPtr = 1 - shiftCount;
2495 /*----------------------------------------------------------------------------
2496 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2497 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
2498 | and returns the proper extended double-precision floating-point value
2499 | corresponding to the abstract input. Ordinarily, the abstract value is
2500 | rounded and packed into the extended double-precision format, with the
2501 | inexact exception raised if the abstract input cannot be represented
2502 | exactly. However, if the abstract value is too large, the overflow and
2503 | inexact exceptions are raised and an infinity or maximal finite value is
2504 | returned. If the abstract value is too small, the input value is rounded to
2505 | a subnormal number, and the underflow and inexact exceptions are raised if
2506 | the abstract input cannot be represented exactly as a subnormal extended
2507 | double-precision floating-point number.
2508 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
2509 | number of bits as single or double precision, respectively. Otherwise, the
2510 | result is rounded to the full precision of the extended double-precision
2511 | format.
2512 | The input significand must be normalized or smaller. If the input
2513 | significand is not normalized, `zExp' must be 0; in that case, the result
2514 | returned is a subnormal number, and it must not require rounding. The
2515 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
2516 | Floating-Point Arithmetic.
2517 *----------------------------------------------------------------------------*/
2519 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
2520 int32_t zExp, uint64_t zSig0, uint64_t zSig1,
2521 float_status *status)
2523 int8_t roundingMode;
2524 flag roundNearestEven, increment, isTiny;
2525 int64_t roundIncrement, roundMask, roundBits;
2527 roundingMode = status->float_rounding_mode;
2528 roundNearestEven = ( roundingMode == float_round_nearest_even );
2529 if ( roundingPrecision == 80 ) goto precision80;
2530 if ( roundingPrecision == 64 ) {
2531 roundIncrement = LIT64( 0x0000000000000400 );
2532 roundMask = LIT64( 0x00000000000007FF );
2534 else if ( roundingPrecision == 32 ) {
2535 roundIncrement = LIT64( 0x0000008000000000 );
2536 roundMask = LIT64( 0x000000FFFFFFFFFF );
2538 else {
2539 goto precision80;
2541 zSig0 |= ( zSig1 != 0 );
2542 switch (roundingMode) {
2543 case float_round_nearest_even:
2544 case float_round_ties_away:
2545 break;
2546 case float_round_to_zero:
2547 roundIncrement = 0;
2548 break;
2549 case float_round_up:
2550 roundIncrement = zSign ? 0 : roundMask;
2551 break;
2552 case float_round_down:
2553 roundIncrement = zSign ? roundMask : 0;
2554 break;
2555 default:
2556 abort();
2558 roundBits = zSig0 & roundMask;
2559 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
2560 if ( ( 0x7FFE < zExp )
2561 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
2563 goto overflow;
2565 if ( zExp <= 0 ) {
2566 if (status->flush_to_zero) {
2567 float_raise(float_flag_output_denormal, status);
2568 return packFloatx80(zSign, 0, 0);
2570 isTiny =
2571 (status->float_detect_tininess
2572 == float_tininess_before_rounding)
2573 || ( zExp < 0 )
2574 || ( zSig0 <= zSig0 + roundIncrement );
2575 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
2576 zExp = 0;
2577 roundBits = zSig0 & roundMask;
2578 if (isTiny && roundBits) {
2579 float_raise(float_flag_underflow, status);
2581 if (roundBits) {
2582 status->float_exception_flags |= float_flag_inexact;
2584 zSig0 += roundIncrement;
2585 if ( (int64_t) zSig0 < 0 ) zExp = 1;
2586 roundIncrement = roundMask + 1;
2587 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
2588 roundMask |= roundIncrement;
2590 zSig0 &= ~ roundMask;
2591 return packFloatx80( zSign, zExp, zSig0 );
2594 if (roundBits) {
2595 status->float_exception_flags |= float_flag_inexact;
2597 zSig0 += roundIncrement;
2598 if ( zSig0 < roundIncrement ) {
2599 ++zExp;
2600 zSig0 = LIT64( 0x8000000000000000 );
2602 roundIncrement = roundMask + 1;
2603 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
2604 roundMask |= roundIncrement;
2606 zSig0 &= ~ roundMask;
2607 if ( zSig0 == 0 ) zExp = 0;
2608 return packFloatx80( zSign, zExp, zSig0 );
2609 precision80:
2610 switch (roundingMode) {
2611 case float_round_nearest_even:
2612 case float_round_ties_away:
2613 increment = ((int64_t)zSig1 < 0);
2614 break;
2615 case float_round_to_zero:
2616 increment = 0;
2617 break;
2618 case float_round_up:
2619 increment = !zSign && zSig1;
2620 break;
2621 case float_round_down:
2622 increment = zSign && zSig1;
2623 break;
2624 default:
2625 abort();
2627 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
2628 if ( ( 0x7FFE < zExp )
2629 || ( ( zExp == 0x7FFE )
2630 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
2631 && increment
2634 roundMask = 0;
2635 overflow:
2636 float_raise(float_flag_overflow | float_flag_inexact, status);
2637 if ( ( roundingMode == float_round_to_zero )
2638 || ( zSign && ( roundingMode == float_round_up ) )
2639 || ( ! zSign && ( roundingMode == float_round_down ) )
2641 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
2643 return packFloatx80(zSign,
2644 floatx80_infinity_high,
2645 floatx80_infinity_low);
2647 if ( zExp <= 0 ) {
2648 isTiny =
2649 (status->float_detect_tininess
2650 == float_tininess_before_rounding)
2651 || ( zExp < 0 )
2652 || ! increment
2653 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
2654 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
2655 zExp = 0;
2656 if (isTiny && zSig1) {
2657 float_raise(float_flag_underflow, status);
2659 if (zSig1) {
2660 status->float_exception_flags |= float_flag_inexact;
2662 switch (roundingMode) {
2663 case float_round_nearest_even:
2664 case float_round_ties_away:
2665 increment = ((int64_t)zSig1 < 0);
2666 break;
2667 case float_round_to_zero:
2668 increment = 0;
2669 break;
2670 case float_round_up:
2671 increment = !zSign && zSig1;
2672 break;
2673 case float_round_down:
2674 increment = zSign && zSig1;
2675 break;
2676 default:
2677 abort();
2679 if ( increment ) {
2680 ++zSig0;
2681 zSig0 &=
2682 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
2683 if ( (int64_t) zSig0 < 0 ) zExp = 1;
2685 return packFloatx80( zSign, zExp, zSig0 );
2688 if (zSig1) {
2689 status->float_exception_flags |= float_flag_inexact;
2691 if ( increment ) {
2692 ++zSig0;
2693 if ( zSig0 == 0 ) {
2694 ++zExp;
2695 zSig0 = LIT64( 0x8000000000000000 );
2697 else {
2698 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
2701 else {
2702 if ( zSig0 == 0 ) zExp = 0;
2704 return packFloatx80( zSign, zExp, zSig0 );
2708 /*----------------------------------------------------------------------------
2709 | Takes an abstract floating-point value having sign `zSign', exponent
2710 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
2711 | and returns the proper extended double-precision floating-point value
2712 | corresponding to the abstract input. This routine is just like
2713 | `roundAndPackFloatx80' except that the input significand does not have to be
2714 | normalized.
2715 *----------------------------------------------------------------------------*/
2717 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
2718 flag zSign, int32_t zExp,
2719 uint64_t zSig0, uint64_t zSig1,
2720 float_status *status)
2722 int8_t shiftCount;
2724 if ( zSig0 == 0 ) {
2725 zSig0 = zSig1;
2726 zSig1 = 0;
2727 zExp -= 64;
2729 shiftCount = countLeadingZeros64( zSig0 );
2730 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
2731 zExp -= shiftCount;
2732 return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
2733 zSig0, zSig1, status);
2737 /*----------------------------------------------------------------------------
2738 | Returns the least-significant 64 fraction bits of the quadruple-precision
2739 | floating-point value `a'.
2740 *----------------------------------------------------------------------------*/
2742 static inline uint64_t extractFloat128Frac1( float128 a )
2745 return a.low;
2749 /*----------------------------------------------------------------------------
2750 | Returns the most-significant 48 fraction bits of the quadruple-precision
2751 | floating-point value `a'.
2752 *----------------------------------------------------------------------------*/
2754 static inline uint64_t extractFloat128Frac0( float128 a )
2757 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
2761 /*----------------------------------------------------------------------------
2762 | Returns the exponent bits of the quadruple-precision floating-point value
2763 | `a'.
2764 *----------------------------------------------------------------------------*/
2766 static inline int32_t extractFloat128Exp( float128 a )
2769 return ( a.high>>48 ) & 0x7FFF;
2773 /*----------------------------------------------------------------------------
2774 | Returns the sign bit of the quadruple-precision floating-point value `a'.
2775 *----------------------------------------------------------------------------*/
2777 static inline flag extractFloat128Sign( float128 a )
2780 return a.high>>63;
2784 /*----------------------------------------------------------------------------
2785 | Normalizes the subnormal quadruple-precision floating-point value
2786 | represented by the denormalized significand formed by the concatenation of
2787 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
2788 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
2789 | significand are stored at the location pointed to by `zSig0Ptr', and the
2790 | least significant 64 bits of the normalized significand are stored at the
2791 | location pointed to by `zSig1Ptr'.
2792 *----------------------------------------------------------------------------*/
2794 static void
2795 normalizeFloat128Subnormal(
2796 uint64_t aSig0,
2797 uint64_t aSig1,
2798 int32_t *zExpPtr,
2799 uint64_t *zSig0Ptr,
2800 uint64_t *zSig1Ptr
2803 int8_t shiftCount;
2805 if ( aSig0 == 0 ) {
2806 shiftCount = countLeadingZeros64( aSig1 ) - 15;
2807 if ( shiftCount < 0 ) {
2808 *zSig0Ptr = aSig1>>( - shiftCount );
2809 *zSig1Ptr = aSig1<<( shiftCount & 63 );
2811 else {
2812 *zSig0Ptr = aSig1<<shiftCount;
2813 *zSig1Ptr = 0;
2815 *zExpPtr = - shiftCount - 63;
2817 else {
2818 shiftCount = countLeadingZeros64( aSig0 ) - 15;
2819 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
2820 *zExpPtr = 1 - shiftCount;
2825 /*----------------------------------------------------------------------------
2826 | Packs the sign `zSign', the exponent `zExp', and the significand formed
2827 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
2828 | floating-point value, returning the result. After being shifted into the
2829 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
2830 | added together to form the most significant 32 bits of the result. This
2831 | means that any integer portion of `zSig0' will be added into the exponent.
2832 | Since a properly normalized significand will have an integer portion equal
2833 | to 1, the `zExp' input should be 1 less than the desired result exponent
2834 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
2835 | significand.
2836 *----------------------------------------------------------------------------*/
2838 static inline float128
2839 packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
2841 float128 z;
2843 z.low = zSig1;
2844 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
2845 return z;
2849 /*----------------------------------------------------------------------------
2850 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2851 | and extended significand formed by the concatenation of `zSig0', `zSig1',
2852 | and `zSig2', and returns the proper quadruple-precision floating-point value
2853 | corresponding to the abstract input. Ordinarily, the abstract value is
2854 | simply rounded and packed into the quadruple-precision format, with the
2855 | inexact exception raised if the abstract input cannot be represented
2856 | exactly. However, if the abstract value is too large, the overflow and
2857 | inexact exceptions are raised and an infinity or maximal finite value is
2858 | returned. If the abstract value is too small, the input value is rounded to
2859 | a subnormal number, and the underflow and inexact exceptions are raised if
2860 | the abstract input cannot be represented exactly as a subnormal quadruple-
2861 | precision floating-point number.
2862 | The input significand must be normalized or smaller. If the input
2863 | significand is not normalized, `zExp' must be 0; in that case, the result
2864 | returned is a subnormal number, and it must not require rounding. In the
2865 | usual case that the input significand is normalized, `zExp' must be 1 less
2866 | than the ``true'' floating-point exponent. The handling of underflow and
2867 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2868 *----------------------------------------------------------------------------*/
2870 static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
2871 uint64_t zSig0, uint64_t zSig1,
2872 uint64_t zSig2, float_status *status)
2874 int8_t roundingMode;
2875 flag roundNearestEven, increment, isTiny;
2877 roundingMode = status->float_rounding_mode;
2878 roundNearestEven = ( roundingMode == float_round_nearest_even );
2879 switch (roundingMode) {
2880 case float_round_nearest_even:
2881 case float_round_ties_away:
2882 increment = ((int64_t)zSig2 < 0);
2883 break;
2884 case float_round_to_zero:
2885 increment = 0;
2886 break;
2887 case float_round_up:
2888 increment = !zSign && zSig2;
2889 break;
2890 case float_round_down:
2891 increment = zSign && zSig2;
2892 break;
2893 case float_round_to_odd:
2894 increment = !(zSig1 & 0x1) && zSig2;
2895 break;
2896 default:
2897 abort();
2899 if ( 0x7FFD <= (uint32_t) zExp ) {
2900 if ( ( 0x7FFD < zExp )
2901 || ( ( zExp == 0x7FFD )
2902 && eq128(
2903 LIT64( 0x0001FFFFFFFFFFFF ),
2904 LIT64( 0xFFFFFFFFFFFFFFFF ),
2905 zSig0,
2906 zSig1
2908 && increment
2911 float_raise(float_flag_overflow | float_flag_inexact, status);
2912 if ( ( roundingMode == float_round_to_zero )
2913 || ( zSign && ( roundingMode == float_round_up ) )
2914 || ( ! zSign && ( roundingMode == float_round_down ) )
2915 || (roundingMode == float_round_to_odd)
2917 return
2918 packFloat128(
2919 zSign,
2920 0x7FFE,
2921 LIT64( 0x0000FFFFFFFFFFFF ),
2922 LIT64( 0xFFFFFFFFFFFFFFFF )
2925 return packFloat128( zSign, 0x7FFF, 0, 0 );
2927 if ( zExp < 0 ) {
2928 if (status->flush_to_zero) {
2929 float_raise(float_flag_output_denormal, status);
2930 return packFloat128(zSign, 0, 0, 0);
2932 isTiny =
2933 (status->float_detect_tininess
2934 == float_tininess_before_rounding)
2935 || ( zExp < -1 )
2936 || ! increment
2937 || lt128(
2938 zSig0,
2939 zSig1,
2940 LIT64( 0x0001FFFFFFFFFFFF ),
2941 LIT64( 0xFFFFFFFFFFFFFFFF )
2943 shift128ExtraRightJamming(
2944 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
2945 zExp = 0;
2946 if (isTiny && zSig2) {
2947 float_raise(float_flag_underflow, status);
2949 switch (roundingMode) {
2950 case float_round_nearest_even:
2951 case float_round_ties_away:
2952 increment = ((int64_t)zSig2 < 0);
2953 break;
2954 case float_round_to_zero:
2955 increment = 0;
2956 break;
2957 case float_round_up:
2958 increment = !zSign && zSig2;
2959 break;
2960 case float_round_down:
2961 increment = zSign && zSig2;
2962 break;
2963 case float_round_to_odd:
2964 increment = !(zSig1 & 0x1) && zSig2;
2965 break;
2966 default:
2967 abort();
2971 if (zSig2) {
2972 status->float_exception_flags |= float_flag_inexact;
2974 if ( increment ) {
2975 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
2976 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
2978 else {
2979 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
2981 return packFloat128( zSign, zExp, zSig0, zSig1 );
2985 /*----------------------------------------------------------------------------
2986 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2987 | and significand formed by the concatenation of `zSig0' and `zSig1', and
2988 | returns the proper quadruple-precision floating-point value corresponding
2989 | to the abstract input. This routine is just like `roundAndPackFloat128'
2990 | except that the input significand has fewer bits and does not have to be
2991 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
2992 | point exponent.
2993 *----------------------------------------------------------------------------*/
2995 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
2996 uint64_t zSig0, uint64_t zSig1,
2997 float_status *status)
2999 int8_t shiftCount;
3000 uint64_t zSig2;
3002 if ( zSig0 == 0 ) {
3003 zSig0 = zSig1;
3004 zSig1 = 0;
3005 zExp -= 64;
3007 shiftCount = countLeadingZeros64( zSig0 ) - 15;
3008 if ( 0 <= shiftCount ) {
3009 zSig2 = 0;
3010 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3012 else {
3013 shift128ExtraRightJamming(
3014 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
3016 zExp -= shiftCount;
3017 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
3022 /*----------------------------------------------------------------------------
3023 | Returns the result of converting the 32-bit two's complement integer `a'
3024 | to the extended double-precision floating-point format. The conversion
3025 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3026 | Arithmetic.
3027 *----------------------------------------------------------------------------*/
3029 floatx80 int32_to_floatx80(int32_t a, float_status *status)
3031 flag zSign;
3032 uint32_t absA;
3033 int8_t shiftCount;
3034 uint64_t zSig;
3036 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3037 zSign = ( a < 0 );
3038 absA = zSign ? - a : a;
3039 shiftCount = countLeadingZeros32( absA ) + 32;
3040 zSig = absA;
3041 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
3045 /*----------------------------------------------------------------------------
3046 | Returns the result of converting the 32-bit two's complement integer `a' to
3047 | the quadruple-precision floating-point format. The conversion is performed
3048 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3049 *----------------------------------------------------------------------------*/
3051 float128 int32_to_float128(int32_t a, float_status *status)
3053 flag zSign;
3054 uint32_t absA;
3055 int8_t shiftCount;
3056 uint64_t zSig0;
3058 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3059 zSign = ( a < 0 );
3060 absA = zSign ? - a : a;
3061 shiftCount = countLeadingZeros32( absA ) + 17;
3062 zSig0 = absA;
3063 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
3067 /*----------------------------------------------------------------------------
3068 | Returns the result of converting the 64-bit two's complement integer `a'
3069 | to the extended double-precision floating-point format. The conversion
3070 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3071 | Arithmetic.
3072 *----------------------------------------------------------------------------*/
3074 floatx80 int64_to_floatx80(int64_t a, float_status *status)
3076 flag zSign;
3077 uint64_t absA;
3078 int8_t shiftCount;
3080 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3081 zSign = ( a < 0 );
3082 absA = zSign ? - a : a;
3083 shiftCount = countLeadingZeros64( absA );
3084 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
3088 /*----------------------------------------------------------------------------
3089 | Returns the result of converting the 64-bit two's complement integer `a' to
3090 | the quadruple-precision floating-point format. The conversion is performed
3091 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3092 *----------------------------------------------------------------------------*/
3094 float128 int64_to_float128(int64_t a, float_status *status)
3096 flag zSign;
3097 uint64_t absA;
3098 int8_t shiftCount;
3099 int32_t zExp;
3100 uint64_t zSig0, zSig1;
3102 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3103 zSign = ( a < 0 );
3104 absA = zSign ? - a : a;
3105 shiftCount = countLeadingZeros64( absA ) + 49;
3106 zExp = 0x406E - shiftCount;
3107 if ( 64 <= shiftCount ) {
3108 zSig1 = 0;
3109 zSig0 = absA;
3110 shiftCount -= 64;
3112 else {
3113 zSig1 = absA;
3114 zSig0 = 0;
3116 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3117 return packFloat128( zSign, zExp, zSig0, zSig1 );
3121 /*----------------------------------------------------------------------------
3122 | Returns the result of converting the 64-bit unsigned integer `a'
3123 | to the quadruple-precision floating-point format. The conversion is performed
3124 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3125 *----------------------------------------------------------------------------*/
3127 float128 uint64_to_float128(uint64_t a, float_status *status)
3129 if (a == 0) {
3130 return float128_zero;
3132 return normalizeRoundAndPackFloat128(0, 0x406E, a, 0, status);
3138 /*----------------------------------------------------------------------------
3139 | Returns the result of converting the single-precision floating-point value
3140 | `a' to the double-precision floating-point format. The conversion is
3141 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3142 | Arithmetic.
3143 *----------------------------------------------------------------------------*/
3145 float64 float32_to_float64(float32 a, float_status *status)
3147 flag aSign;
3148 int aExp;
3149 uint32_t aSig;
3150 a = float32_squash_input_denormal(a, status);
3152 aSig = extractFloat32Frac( a );
3153 aExp = extractFloat32Exp( a );
3154 aSign = extractFloat32Sign( a );
3155 if ( aExp == 0xFF ) {
3156 if (aSig) {
3157 return commonNaNToFloat64(float32ToCommonNaN(a, status), status);
3159 return packFloat64( aSign, 0x7FF, 0 );
3161 if ( aExp == 0 ) {
3162 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
3163 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3164 --aExp;
3166 return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
3170 /*----------------------------------------------------------------------------
3171 | Returns the result of converting the single-precision floating-point value
3172 | `a' to the extended double-precision floating-point format. The conversion
3173 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3174 | Arithmetic.
3175 *----------------------------------------------------------------------------*/
3177 floatx80 float32_to_floatx80(float32 a, float_status *status)
3179 flag aSign;
3180 int aExp;
3181 uint32_t aSig;
3183 a = float32_squash_input_denormal(a, status);
3184 aSig = extractFloat32Frac( a );
3185 aExp = extractFloat32Exp( a );
3186 aSign = extractFloat32Sign( a );
3187 if ( aExp == 0xFF ) {
3188 if (aSig) {
3189 return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
3191 return packFloatx80(aSign,
3192 floatx80_infinity_high,
3193 floatx80_infinity_low);
3195 if ( aExp == 0 ) {
3196 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3197 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3199 aSig |= 0x00800000;
3200 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
3204 /*----------------------------------------------------------------------------
3205 | Returns the result of converting the single-precision floating-point value
3206 | `a' to the double-precision floating-point format. The conversion is
3207 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3208 | Arithmetic.
3209 *----------------------------------------------------------------------------*/
3211 float128 float32_to_float128(float32 a, float_status *status)
3213 flag aSign;
3214 int aExp;
3215 uint32_t aSig;
3217 a = float32_squash_input_denormal(a, status);
3218 aSig = extractFloat32Frac( a );
3219 aExp = extractFloat32Exp( a );
3220 aSign = extractFloat32Sign( a );
3221 if ( aExp == 0xFF ) {
3222 if (aSig) {
3223 return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
3225 return packFloat128( aSign, 0x7FFF, 0, 0 );
3227 if ( aExp == 0 ) {
3228 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3229 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3230 --aExp;
3232 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
3236 /*----------------------------------------------------------------------------
3237 | Returns the remainder of the single-precision floating-point value `a'
3238 | with respect to the corresponding value `b'. The operation is performed
3239 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3240 *----------------------------------------------------------------------------*/
3242 float32 float32_rem(float32 a, float32 b, float_status *status)
3244 flag aSign, zSign;
3245 int aExp, bExp, expDiff;
3246 uint32_t aSig, bSig;
3247 uint32_t q;
3248 uint64_t aSig64, bSig64, q64;
3249 uint32_t alternateASig;
3250 int32_t sigMean;
3251 a = float32_squash_input_denormal(a, status);
3252 b = float32_squash_input_denormal(b, status);
3254 aSig = extractFloat32Frac( a );
3255 aExp = extractFloat32Exp( a );
3256 aSign = extractFloat32Sign( a );
3257 bSig = extractFloat32Frac( b );
3258 bExp = extractFloat32Exp( b );
3259 if ( aExp == 0xFF ) {
3260 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
3261 return propagateFloat32NaN(a, b, status);
3263 float_raise(float_flag_invalid, status);
3264 return float32_default_nan(status);
3266 if ( bExp == 0xFF ) {
3267 if (bSig) {
3268 return propagateFloat32NaN(a, b, status);
3270 return a;
3272 if ( bExp == 0 ) {
3273 if ( bSig == 0 ) {
3274 float_raise(float_flag_invalid, status);
3275 return float32_default_nan(status);
3277 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
3279 if ( aExp == 0 ) {
3280 if ( aSig == 0 ) return a;
3281 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3283 expDiff = aExp - bExp;
3284 aSig |= 0x00800000;
3285 bSig |= 0x00800000;
3286 if ( expDiff < 32 ) {
3287 aSig <<= 8;
3288 bSig <<= 8;
3289 if ( expDiff < 0 ) {
3290 if ( expDiff < -1 ) return a;
3291 aSig >>= 1;
3293 q = ( bSig <= aSig );
3294 if ( q ) aSig -= bSig;
3295 if ( 0 < expDiff ) {
3296 q = ( ( (uint64_t) aSig )<<32 ) / bSig;
3297 q >>= 32 - expDiff;
3298 bSig >>= 2;
3299 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3301 else {
3302 aSig >>= 2;
3303 bSig >>= 2;
3306 else {
3307 if ( bSig <= aSig ) aSig -= bSig;
3308 aSig64 = ( (uint64_t) aSig )<<40;
3309 bSig64 = ( (uint64_t) bSig )<<40;
3310 expDiff -= 64;
3311 while ( 0 < expDiff ) {
3312 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3313 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3314 aSig64 = - ( ( bSig * q64 )<<38 );
3315 expDiff -= 62;
3317 expDiff += 64;
3318 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3319 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3320 q = q64>>( 64 - expDiff );
3321 bSig <<= 6;
3322 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
3324 do {
3325 alternateASig = aSig;
3326 ++q;
3327 aSig -= bSig;
3328 } while ( 0 <= (int32_t) aSig );
3329 sigMean = aSig + alternateASig;
3330 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3331 aSig = alternateASig;
3333 zSign = ( (int32_t) aSig < 0 );
3334 if ( zSign ) aSig = - aSig;
3335 return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
3340 /*----------------------------------------------------------------------------
3341 | Returns the binary exponential of the single-precision floating-point value
3342 | `a'. The operation is performed according to the IEC/IEEE Standard for
3343 | Binary Floating-Point Arithmetic.
3345 | Uses the following identities:
3347 | 1. -------------------------------------------------------------------------
3348 | x x*ln(2)
3349 | 2 = e
3351 | 2. -------------------------------------------------------------------------
3352 | 2 3 4 5 n
3353 | x x x x x x x
3354 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
3355 | 1! 2! 3! 4! 5! n!
3356 *----------------------------------------------------------------------------*/
3358 static const float64 float32_exp2_coefficients[15] =
3360 const_float64( 0x3ff0000000000000ll ), /* 1 */
3361 const_float64( 0x3fe0000000000000ll ), /* 2 */
3362 const_float64( 0x3fc5555555555555ll ), /* 3 */
3363 const_float64( 0x3fa5555555555555ll ), /* 4 */
3364 const_float64( 0x3f81111111111111ll ), /* 5 */
3365 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */
3366 const_float64( 0x3f2a01a01a01a01all ), /* 7 */
3367 const_float64( 0x3efa01a01a01a01all ), /* 8 */
3368 const_float64( 0x3ec71de3a556c734ll ), /* 9 */
3369 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
3370 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
3371 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
3372 const_float64( 0x3de6124613a86d09ll ), /* 13 */
3373 const_float64( 0x3da93974a8c07c9dll ), /* 14 */
3374 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
3377 float32 float32_exp2(float32 a, float_status *status)
3379 flag aSign;
3380 int aExp;
3381 uint32_t aSig;
3382 float64 r, x, xn;
3383 int i;
3384 a = float32_squash_input_denormal(a, status);
3386 aSig = extractFloat32Frac( a );
3387 aExp = extractFloat32Exp( a );
3388 aSign = extractFloat32Sign( a );
3390 if ( aExp == 0xFF) {
3391 if (aSig) {
3392 return propagateFloat32NaN(a, float32_zero, status);
3394 return (aSign) ? float32_zero : a;
3396 if (aExp == 0) {
3397 if (aSig == 0) return float32_one;
3400 float_raise(float_flag_inexact, status);
3402 /* ******************************* */
3403 /* using float64 for approximation */
3404 /* ******************************* */
3405 x = float32_to_float64(a, status);
3406 x = float64_mul(x, float64_ln2, status);
3408 xn = x;
3409 r = float64_one;
3410 for (i = 0 ; i < 15 ; i++) {
3411 float64 f;
3413 f = float64_mul(xn, float32_exp2_coefficients[i], status);
3414 r = float64_add(r, f, status);
3416 xn = float64_mul(xn, x, status);
3419 return float64_to_float32(r, status);
3422 /*----------------------------------------------------------------------------
3423 | Returns the binary log of the single-precision floating-point value `a'.
3424 | The operation is performed according to the IEC/IEEE Standard for Binary
3425 | Floating-Point Arithmetic.
3426 *----------------------------------------------------------------------------*/
3427 float32 float32_log2(float32 a, float_status *status)
3429 flag aSign, zSign;
3430 int aExp;
3431 uint32_t aSig, zSig, i;
3433 a = float32_squash_input_denormal(a, status);
3434 aSig = extractFloat32Frac( a );
3435 aExp = extractFloat32Exp( a );
3436 aSign = extractFloat32Sign( a );
3438 if ( aExp == 0 ) {
3439 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
3440 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3442 if ( aSign ) {
3443 float_raise(float_flag_invalid, status);
3444 return float32_default_nan(status);
3446 if ( aExp == 0xFF ) {
3447 if (aSig) {
3448 return propagateFloat32NaN(a, float32_zero, status);
3450 return a;
3453 aExp -= 0x7F;
3454 aSig |= 0x00800000;
3455 zSign = aExp < 0;
3456 zSig = aExp << 23;
3458 for (i = 1 << 22; i > 0; i >>= 1) {
3459 aSig = ( (uint64_t)aSig * aSig ) >> 23;
3460 if ( aSig & 0x01000000 ) {
3461 aSig >>= 1;
3462 zSig |= i;
3466 if ( zSign )
3467 zSig = -zSig;
3469 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
3472 /*----------------------------------------------------------------------------
3473 | Returns 1 if the single-precision floating-point value `a' is equal to
3474 | the corresponding value `b', and 0 otherwise. The invalid exception is
3475 | raised if either operand is a NaN. Otherwise, the comparison is performed
3476 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3477 *----------------------------------------------------------------------------*/
3479 int float32_eq(float32 a, float32 b, float_status *status)
3481 uint32_t av, bv;
3482 a = float32_squash_input_denormal(a, status);
3483 b = float32_squash_input_denormal(b, status);
3485 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3486 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3488 float_raise(float_flag_invalid, status);
3489 return 0;
3491 av = float32_val(a);
3492 bv = float32_val(b);
3493 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3496 /*----------------------------------------------------------------------------
3497 | Returns 1 if the single-precision floating-point value `a' is less than
3498 | or equal to the corresponding value `b', and 0 otherwise. The invalid
3499 | exception is raised if either operand is a NaN. The comparison is performed
3500 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3501 *----------------------------------------------------------------------------*/
3503 int float32_le(float32 a, float32 b, float_status *status)
3505 flag aSign, bSign;
3506 uint32_t av, bv;
3507 a = float32_squash_input_denormal(a, status);
3508 b = float32_squash_input_denormal(b, status);
3510 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3511 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3513 float_raise(float_flag_invalid, status);
3514 return 0;
3516 aSign = extractFloat32Sign( a );
3517 bSign = extractFloat32Sign( b );
3518 av = float32_val(a);
3519 bv = float32_val(b);
3520 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3521 return ( av == bv ) || ( aSign ^ ( av < bv ) );
3525 /*----------------------------------------------------------------------------
3526 | Returns 1 if the single-precision floating-point value `a' is less than
3527 | the corresponding value `b', and 0 otherwise. The invalid exception is
3528 | raised if either operand is a NaN. The comparison is performed according
3529 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3530 *----------------------------------------------------------------------------*/
3532 int float32_lt(float32 a, float32 b, float_status *status)
3534 flag aSign, bSign;
3535 uint32_t av, bv;
3536 a = float32_squash_input_denormal(a, status);
3537 b = float32_squash_input_denormal(b, status);
3539 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3540 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3542 float_raise(float_flag_invalid, status);
3543 return 0;
3545 aSign = extractFloat32Sign( a );
3546 bSign = extractFloat32Sign( b );
3547 av = float32_val(a);
3548 bv = float32_val(b);
3549 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3550 return ( av != bv ) && ( aSign ^ ( av < bv ) );
3554 /*----------------------------------------------------------------------------
3555 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3556 | be compared, and 0 otherwise. The invalid exception is raised if either
3557 | operand is a NaN. The comparison is performed according to the IEC/IEEE
3558 | Standard for Binary Floating-Point Arithmetic.
3559 *----------------------------------------------------------------------------*/
3561 int float32_unordered(float32 a, float32 b, float_status *status)
3563 a = float32_squash_input_denormal(a, status);
3564 b = float32_squash_input_denormal(b, status);
3566 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3567 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3569 float_raise(float_flag_invalid, status);
3570 return 1;
3572 return 0;
3575 /*----------------------------------------------------------------------------
3576 | Returns 1 if the single-precision floating-point value `a' is equal to
3577 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3578 | exception. The comparison is performed according to the IEC/IEEE Standard
3579 | for Binary Floating-Point Arithmetic.
3580 *----------------------------------------------------------------------------*/
3582 int float32_eq_quiet(float32 a, float32 b, float_status *status)
3584 a = float32_squash_input_denormal(a, status);
3585 b = float32_squash_input_denormal(b, status);
3587 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3588 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3590 if (float32_is_signaling_nan(a, status)
3591 || float32_is_signaling_nan(b, status)) {
3592 float_raise(float_flag_invalid, status);
3594 return 0;
3596 return ( float32_val(a) == float32_val(b) ) ||
3597 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
3600 /*----------------------------------------------------------------------------
3601 | Returns 1 if the single-precision floating-point value `a' is less than or
3602 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3603 | cause an exception. Otherwise, the comparison is performed according to the
3604 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3605 *----------------------------------------------------------------------------*/
3607 int float32_le_quiet(float32 a, float32 b, float_status *status)
3609 flag aSign, bSign;
3610 uint32_t av, bv;
3611 a = float32_squash_input_denormal(a, status);
3612 b = float32_squash_input_denormal(b, status);
3614 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3615 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3617 if (float32_is_signaling_nan(a, status)
3618 || float32_is_signaling_nan(b, status)) {
3619 float_raise(float_flag_invalid, status);
3621 return 0;
3623 aSign = extractFloat32Sign( a );
3624 bSign = extractFloat32Sign( b );
3625 av = float32_val(a);
3626 bv = float32_val(b);
3627 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3628 return ( av == bv ) || ( aSign ^ ( av < bv ) );
3632 /*----------------------------------------------------------------------------
3633 | Returns 1 if the single-precision floating-point value `a' is less than
3634 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3635 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
3636 | Standard for Binary Floating-Point Arithmetic.
3637 *----------------------------------------------------------------------------*/
3639 int float32_lt_quiet(float32 a, float32 b, float_status *status)
3641 flag aSign, bSign;
3642 uint32_t av, bv;
3643 a = float32_squash_input_denormal(a, status);
3644 b = float32_squash_input_denormal(b, status);
3646 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3647 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3649 if (float32_is_signaling_nan(a, status)
3650 || float32_is_signaling_nan(b, status)) {
3651 float_raise(float_flag_invalid, status);
3653 return 0;
3655 aSign = extractFloat32Sign( a );
3656 bSign = extractFloat32Sign( b );
3657 av = float32_val(a);
3658 bv = float32_val(b);
3659 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3660 return ( av != bv ) && ( aSign ^ ( av < bv ) );
3664 /*----------------------------------------------------------------------------
3665 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3666 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
3667 | comparison is performed according to the IEC/IEEE Standard for Binary
3668 | Floating-Point Arithmetic.
3669 *----------------------------------------------------------------------------*/
3671 int float32_unordered_quiet(float32 a, float32 b, float_status *status)
3673 a = float32_squash_input_denormal(a, status);
3674 b = float32_squash_input_denormal(b, status);
3676 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3677 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3679 if (float32_is_signaling_nan(a, status)
3680 || float32_is_signaling_nan(b, status)) {
3681 float_raise(float_flag_invalid, status);
3683 return 1;
3685 return 0;
3689 /*----------------------------------------------------------------------------
3690 | Returns the result of converting the double-precision floating-point value
3691 | `a' to the single-precision floating-point format. The conversion is
3692 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3693 | Arithmetic.
3694 *----------------------------------------------------------------------------*/
3696 float32 float64_to_float32(float64 a, float_status *status)
3698 flag aSign;
3699 int aExp;
3700 uint64_t aSig;
3701 uint32_t zSig;
3702 a = float64_squash_input_denormal(a, status);
3704 aSig = extractFloat64Frac( a );
3705 aExp = extractFloat64Exp( a );
3706 aSign = extractFloat64Sign( a );
3707 if ( aExp == 0x7FF ) {
3708 if (aSig) {
3709 return commonNaNToFloat32(float64ToCommonNaN(a, status), status);
3711 return packFloat32( aSign, 0xFF, 0 );
3713 shift64RightJamming( aSig, 22, &aSig );
3714 zSig = aSig;
3715 if ( aExp || zSig ) {
3716 zSig |= 0x40000000;
3717 aExp -= 0x381;
3719 return roundAndPackFloat32(aSign, aExp, zSig, status);
3724 /*----------------------------------------------------------------------------
3725 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3726 | half-precision floating-point value, returning the result. After being
3727 | shifted into the proper positions, the three fields are simply added
3728 | together to form the result. This means that any integer portion of `zSig'
3729 | will be added into the exponent. Since a properly normalized significand
3730 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3731 | than the desired result exponent whenever `zSig' is a complete, normalized
3732 | significand.
3733 *----------------------------------------------------------------------------*/
3734 static float16 packFloat16(flag zSign, int zExp, uint16_t zSig)
3736 return make_float16(
3737 (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
3740 /*----------------------------------------------------------------------------
3741 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3742 | and significand `zSig', and returns the proper half-precision floating-
3743 | point value corresponding to the abstract input. Ordinarily, the abstract
3744 | value is simply rounded and packed into the half-precision format, with
3745 | the inexact exception raised if the abstract input cannot be represented
3746 | exactly. However, if the abstract value is too large, the overflow and
3747 | inexact exceptions are raised and an infinity or maximal finite value is
3748 | returned. If the abstract value is too small, the input value is rounded to
3749 | a subnormal number, and the underflow and inexact exceptions are raised if
3750 | the abstract input cannot be represented exactly as a subnormal half-
3751 | precision floating-point number.
3752 | The `ieee' flag indicates whether to use IEEE standard half precision, or
3753 | ARM-style "alternative representation", which omits the NaN and Inf
3754 | encodings in order to raise the maximum representable exponent by one.
3755 | The input significand `zSig' has its binary point between bits 22
3756 | and 23, which is 13 bits to the left of the usual location. This shifted
3757 | significand must be normalized or smaller. If `zSig' is not normalized,
3758 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3759 | and it must not require rounding. In the usual case that `zSig' is
3760 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3761 | Note the slightly odd position of the binary point in zSig compared with the
3762 | other roundAndPackFloat functions. This should probably be fixed if we
3763 | need to implement more float16 routines than just conversion.
3764 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3765 | Binary Floating-Point Arithmetic.
3766 *----------------------------------------------------------------------------*/
3768 static float16 roundAndPackFloat16(flag zSign, int zExp,
3769 uint32_t zSig, flag ieee,
3770 float_status *status)
3772 int maxexp = ieee ? 29 : 30;
3773 uint32_t mask;
3774 uint32_t increment;
3775 bool rounding_bumps_exp;
3776 bool is_tiny = false;
3778 /* Calculate the mask of bits of the mantissa which are not
3779 * representable in half-precision and will be lost.
3781 if (zExp < 1) {
3782 /* Will be denormal in halfprec */
3783 mask = 0x00ffffff;
3784 if (zExp >= -11) {
3785 mask >>= 11 + zExp;
3787 } else {
3788 /* Normal number in halfprec */
3789 mask = 0x00001fff;
3792 switch (status->float_rounding_mode) {
3793 case float_round_nearest_even:
3794 increment = (mask + 1) >> 1;
3795 if ((zSig & mask) == increment) {
3796 increment = zSig & (increment << 1);
3798 break;
3799 case float_round_ties_away:
3800 increment = (mask + 1) >> 1;
3801 break;
3802 case float_round_up:
3803 increment = zSign ? 0 : mask;
3804 break;
3805 case float_round_down:
3806 increment = zSign ? mask : 0;
3807 break;
3808 default: /* round_to_zero */
3809 increment = 0;
3810 break;
3813 rounding_bumps_exp = (zSig + increment >= 0x01000000);
3815 if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) {
3816 if (ieee) {
3817 float_raise(float_flag_overflow | float_flag_inexact, status);
3818 return packFloat16(zSign, 0x1f, 0);
3819 } else {
3820 float_raise(float_flag_invalid, status);
3821 return packFloat16(zSign, 0x1f, 0x3ff);
3825 if (zExp < 0) {
3826 /* Note that flush-to-zero does not affect half-precision results */
3827 is_tiny =
3828 (status->float_detect_tininess == float_tininess_before_rounding)
3829 || (zExp < -1)
3830 || (!rounding_bumps_exp);
3832 if (zSig & mask) {
3833 float_raise(float_flag_inexact, status);
3834 if (is_tiny) {
3835 float_raise(float_flag_underflow, status);
3839 zSig += increment;
3840 if (rounding_bumps_exp) {
3841 zSig >>= 1;
3842 zExp++;
3845 if (zExp < -10) {
3846 return packFloat16(zSign, 0, 0);
3848 if (zExp < 0) {
3849 zSig >>= -zExp;
3850 zExp = 0;
3852 return packFloat16(zSign, zExp, zSig >> 13);
3855 /*----------------------------------------------------------------------------
3856 | If `a' is denormal and we are in flush-to-zero mode then set the
3857 | input-denormal exception and return zero. Otherwise just return the value.
3858 *----------------------------------------------------------------------------*/
3859 float16 float16_squash_input_denormal(float16 a, float_status *status)
3861 if (status->flush_inputs_to_zero) {
3862 if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) {
3863 float_raise(float_flag_input_denormal, status);
3864 return make_float16(float16_val(a) & 0x8000);
3867 return a;
3870 static void normalizeFloat16Subnormal(uint32_t aSig, int *zExpPtr,
3871 uint32_t *zSigPtr)
3873 int8_t shiftCount = countLeadingZeros32(aSig) - 21;
3874 *zSigPtr = aSig << shiftCount;
3875 *zExpPtr = 1 - shiftCount;
3878 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
3879 The latter gains extra exponent range by omitting the NaN/Inf encodings. */
3881 float32 float16_to_float32(float16 a, flag ieee, float_status *status)
3883 flag aSign;
3884 int aExp;
3885 uint32_t aSig;
3887 aSign = extractFloat16Sign(a);
3888 aExp = extractFloat16Exp(a);
3889 aSig = extractFloat16Frac(a);
3891 if (aExp == 0x1f && ieee) {
3892 if (aSig) {
3893 return commonNaNToFloat32(float16ToCommonNaN(a, status), status);
3895 return packFloat32(aSign, 0xff, 0);
3897 if (aExp == 0) {
3898 if (aSig == 0) {
3899 return packFloat32(aSign, 0, 0);
3902 normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3903 aExp--;
3905 return packFloat32( aSign, aExp + 0x70, aSig << 13);
3908 float16 float32_to_float16(float32 a, flag ieee, float_status *status)
3910 flag aSign;
3911 int aExp;
3912 uint32_t aSig;
3914 a = float32_squash_input_denormal(a, status);
3916 aSig = extractFloat32Frac( a );
3917 aExp = extractFloat32Exp( a );
3918 aSign = extractFloat32Sign( a );
3919 if ( aExp == 0xFF ) {
3920 if (aSig) {
3921 /* Input is a NaN */
3922 if (!ieee) {
3923 float_raise(float_flag_invalid, status);
3924 return packFloat16(aSign, 0, 0);
3926 return commonNaNToFloat16(
3927 float32ToCommonNaN(a, status), status);
3929 /* Infinity */
3930 if (!ieee) {
3931 float_raise(float_flag_invalid, status);
3932 return packFloat16(aSign, 0x1f, 0x3ff);
3934 return packFloat16(aSign, 0x1f, 0);
3936 if (aExp == 0 && aSig == 0) {
3937 return packFloat16(aSign, 0, 0);
3939 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3940 * even if the input is denormal; however this is harmless because
3941 * the largest possible single-precision denormal is still smaller
3942 * than the smallest representable half-precision denormal, and so we
3943 * will end up ignoring aSig and returning via the "always return zero"
3944 * codepath.
3946 aSig |= 0x00800000;
3947 aExp -= 0x71;
3949 return roundAndPackFloat16(aSign, aExp, aSig, ieee, status);
3952 float64 float16_to_float64(float16 a, flag ieee, float_status *status)
3954 flag aSign;
3955 int aExp;
3956 uint32_t aSig;
3958 aSign = extractFloat16Sign(a);
3959 aExp = extractFloat16Exp(a);
3960 aSig = extractFloat16Frac(a);
3962 if (aExp == 0x1f && ieee) {
3963 if (aSig) {
3964 return commonNaNToFloat64(
3965 float16ToCommonNaN(a, status), status);
3967 return packFloat64(aSign, 0x7ff, 0);
3969 if (aExp == 0) {
3970 if (aSig == 0) {
3971 return packFloat64(aSign, 0, 0);
3974 normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3975 aExp--;
3977 return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42);
3980 float16 float64_to_float16(float64 a, flag ieee, float_status *status)
3982 flag aSign;
3983 int aExp;
3984 uint64_t aSig;
3985 uint32_t zSig;
3987 a = float64_squash_input_denormal(a, status);
3989 aSig = extractFloat64Frac(a);
3990 aExp = extractFloat64Exp(a);
3991 aSign = extractFloat64Sign(a);
3992 if (aExp == 0x7FF) {
3993 if (aSig) {
3994 /* Input is a NaN */
3995 if (!ieee) {
3996 float_raise(float_flag_invalid, status);
3997 return packFloat16(aSign, 0, 0);
3999 return commonNaNToFloat16(
4000 float64ToCommonNaN(a, status), status);
4002 /* Infinity */
4003 if (!ieee) {
4004 float_raise(float_flag_invalid, status);
4005 return packFloat16(aSign, 0x1f, 0x3ff);
4007 return packFloat16(aSign, 0x1f, 0);
4009 shift64RightJamming(aSig, 29, &aSig);
4010 zSig = aSig;
4011 if (aExp == 0 && zSig == 0) {
4012 return packFloat16(aSign, 0, 0);
4014 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
4015 * even if the input is denormal; however this is harmless because
4016 * the largest possible single-precision denormal is still smaller
4017 * than the smallest representable half-precision denormal, and so we
4018 * will end up ignoring aSig and returning via the "always return zero"
4019 * codepath.
4021 zSig |= 0x00800000;
4022 aExp -= 0x3F1;
4024 return roundAndPackFloat16(aSign, aExp, zSig, ieee, status);
4027 /*----------------------------------------------------------------------------
4028 | Returns the result of converting the double-precision floating-point value
4029 | `a' to the extended double-precision floating-point format. The conversion
4030 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4031 | Arithmetic.
4032 *----------------------------------------------------------------------------*/
4034 floatx80 float64_to_floatx80(float64 a, float_status *status)
4036 flag aSign;
4037 int aExp;
4038 uint64_t aSig;
4040 a = float64_squash_input_denormal(a, status);
4041 aSig = extractFloat64Frac( a );
4042 aExp = extractFloat64Exp( a );
4043 aSign = extractFloat64Sign( a );
4044 if ( aExp == 0x7FF ) {
4045 if (aSig) {
4046 return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
4048 return packFloatx80(aSign,
4049 floatx80_infinity_high,
4050 floatx80_infinity_low);
4052 if ( aExp == 0 ) {
4053 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4054 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4056 return
4057 packFloatx80(
4058 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
4062 /*----------------------------------------------------------------------------
4063 | Returns the result of converting the double-precision floating-point value
4064 | `a' to the quadruple-precision floating-point format. The conversion is
4065 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4066 | Arithmetic.
4067 *----------------------------------------------------------------------------*/
4069 float128 float64_to_float128(float64 a, float_status *status)
4071 flag aSign;
4072 int aExp;
4073 uint64_t aSig, zSig0, zSig1;
4075 a = float64_squash_input_denormal(a, status);
4076 aSig = extractFloat64Frac( a );
4077 aExp = extractFloat64Exp( a );
4078 aSign = extractFloat64Sign( a );
4079 if ( aExp == 0x7FF ) {
4080 if (aSig) {
4081 return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
4083 return packFloat128( aSign, 0x7FFF, 0, 0 );
4085 if ( aExp == 0 ) {
4086 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4087 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4088 --aExp;
4090 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
4091 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
4096 /*----------------------------------------------------------------------------
4097 | Returns the remainder of the double-precision floating-point value `a'
4098 | with respect to the corresponding value `b'. The operation is performed
4099 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4100 *----------------------------------------------------------------------------*/
4102 float64 float64_rem(float64 a, float64 b, float_status *status)
4104 flag aSign, zSign;
4105 int aExp, bExp, expDiff;
4106 uint64_t aSig, bSig;
4107 uint64_t q, alternateASig;
4108 int64_t sigMean;
4110 a = float64_squash_input_denormal(a, status);
4111 b = float64_squash_input_denormal(b, status);
4112 aSig = extractFloat64Frac( a );
4113 aExp = extractFloat64Exp( a );
4114 aSign = extractFloat64Sign( a );
4115 bSig = extractFloat64Frac( b );
4116 bExp = extractFloat64Exp( b );
4117 if ( aExp == 0x7FF ) {
4118 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4119 return propagateFloat64NaN(a, b, status);
4121 float_raise(float_flag_invalid, status);
4122 return float64_default_nan(status);
4124 if ( bExp == 0x7FF ) {
4125 if (bSig) {
4126 return propagateFloat64NaN(a, b, status);
4128 return a;
4130 if ( bExp == 0 ) {
4131 if ( bSig == 0 ) {
4132 float_raise(float_flag_invalid, status);
4133 return float64_default_nan(status);
4135 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4137 if ( aExp == 0 ) {
4138 if ( aSig == 0 ) return a;
4139 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4141 expDiff = aExp - bExp;
4142 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
4143 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4144 if ( expDiff < 0 ) {
4145 if ( expDiff < -1 ) return a;
4146 aSig >>= 1;
4148 q = ( bSig <= aSig );
4149 if ( q ) aSig -= bSig;
4150 expDiff -= 64;
4151 while ( 0 < expDiff ) {
4152 q = estimateDiv128To64( aSig, 0, bSig );
4153 q = ( 2 < q ) ? q - 2 : 0;
4154 aSig = - ( ( bSig>>2 ) * q );
4155 expDiff -= 62;
4157 expDiff += 64;
4158 if ( 0 < expDiff ) {
4159 q = estimateDiv128To64( aSig, 0, bSig );
4160 q = ( 2 < q ) ? q - 2 : 0;
4161 q >>= 64 - expDiff;
4162 bSig >>= 2;
4163 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4165 else {
4166 aSig >>= 2;
4167 bSig >>= 2;
4169 do {
4170 alternateASig = aSig;
4171 ++q;
4172 aSig -= bSig;
4173 } while ( 0 <= (int64_t) aSig );
4174 sigMean = aSig + alternateASig;
4175 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4176 aSig = alternateASig;
4178 zSign = ( (int64_t) aSig < 0 );
4179 if ( zSign ) aSig = - aSig;
4180 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
4184 /*----------------------------------------------------------------------------
4185 | Returns the binary log of the double-precision floating-point value `a'.
4186 | The operation is performed according to the IEC/IEEE Standard for Binary
4187 | Floating-Point Arithmetic.
4188 *----------------------------------------------------------------------------*/
4189 float64 float64_log2(float64 a, float_status *status)
4191 flag aSign, zSign;
4192 int aExp;
4193 uint64_t aSig, aSig0, aSig1, zSig, i;
4194 a = float64_squash_input_denormal(a, status);
4196 aSig = extractFloat64Frac( a );
4197 aExp = extractFloat64Exp( a );
4198 aSign = extractFloat64Sign( a );
4200 if ( aExp == 0 ) {
4201 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4202 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4204 if ( aSign ) {
4205 float_raise(float_flag_invalid, status);
4206 return float64_default_nan(status);
4208 if ( aExp == 0x7FF ) {
4209 if (aSig) {
4210 return propagateFloat64NaN(a, float64_zero, status);
4212 return a;
4215 aExp -= 0x3FF;
4216 aSig |= LIT64( 0x0010000000000000 );
4217 zSign = aExp < 0;
4218 zSig = (uint64_t)aExp << 52;
4219 for (i = 1LL << 51; i > 0; i >>= 1) {
4220 mul64To128( aSig, aSig, &aSig0, &aSig1 );
4221 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4222 if ( aSig & LIT64( 0x0020000000000000 ) ) {
4223 aSig >>= 1;
4224 zSig |= i;
4228 if ( zSign )
4229 zSig = -zSig;
4230 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
4233 /*----------------------------------------------------------------------------
4234 | Returns 1 if the double-precision floating-point value `a' is equal to the
4235 | corresponding value `b', and 0 otherwise. The invalid exception is raised
4236 | if either operand is a NaN. Otherwise, the comparison is performed
4237 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4238 *----------------------------------------------------------------------------*/
4240 int float64_eq(float64 a, float64 b, float_status *status)
4242 uint64_t av, bv;
4243 a = float64_squash_input_denormal(a, status);
4244 b = float64_squash_input_denormal(b, status);
4246 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4247 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4249 float_raise(float_flag_invalid, status);
4250 return 0;
4252 av = float64_val(a);
4253 bv = float64_val(b);
4254 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4258 /*----------------------------------------------------------------------------
4259 | Returns 1 if the double-precision floating-point value `a' is less than or
4260 | equal to the corresponding value `b', and 0 otherwise. The invalid
4261 | exception is raised if either operand is a NaN. The comparison is performed
4262 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4263 *----------------------------------------------------------------------------*/
4265 int float64_le(float64 a, float64 b, float_status *status)
4267 flag aSign, bSign;
4268 uint64_t av, bv;
4269 a = float64_squash_input_denormal(a, status);
4270 b = float64_squash_input_denormal(b, status);
4272 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4273 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4275 float_raise(float_flag_invalid, status);
4276 return 0;
4278 aSign = extractFloat64Sign( a );
4279 bSign = extractFloat64Sign( b );
4280 av = float64_val(a);
4281 bv = float64_val(b);
4282 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4283 return ( av == bv ) || ( aSign ^ ( av < bv ) );
4287 /*----------------------------------------------------------------------------
4288 | Returns 1 if the double-precision floating-point value `a' is less than
4289 | the corresponding value `b', and 0 otherwise. The invalid exception is
4290 | raised if either operand is a NaN. The comparison is performed according
4291 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4292 *----------------------------------------------------------------------------*/
4294 int float64_lt(float64 a, float64 b, float_status *status)
4296 flag aSign, bSign;
4297 uint64_t av, bv;
4299 a = float64_squash_input_denormal(a, status);
4300 b = float64_squash_input_denormal(b, status);
4301 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4302 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4304 float_raise(float_flag_invalid, status);
4305 return 0;
4307 aSign = extractFloat64Sign( a );
4308 bSign = extractFloat64Sign( b );
4309 av = float64_val(a);
4310 bv = float64_val(b);
4311 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4312 return ( av != bv ) && ( aSign ^ ( av < bv ) );
4316 /*----------------------------------------------------------------------------
4317 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4318 | be compared, and 0 otherwise. The invalid exception is raised if either
4319 | operand is a NaN. The comparison is performed according to the IEC/IEEE
4320 | Standard for Binary Floating-Point Arithmetic.
4321 *----------------------------------------------------------------------------*/
4323 int float64_unordered(float64 a, float64 b, float_status *status)
4325 a = float64_squash_input_denormal(a, status);
4326 b = float64_squash_input_denormal(b, status);
4328 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4329 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4331 float_raise(float_flag_invalid, status);
4332 return 1;
4334 return 0;
4337 /*----------------------------------------------------------------------------
4338 | Returns 1 if the double-precision floating-point value `a' is equal to the
4339 | corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4340 | exception.The comparison is performed according to the IEC/IEEE Standard
4341 | for Binary Floating-Point Arithmetic.
4342 *----------------------------------------------------------------------------*/
4344 int float64_eq_quiet(float64 a, float64 b, float_status *status)
4346 uint64_t av, bv;
4347 a = float64_squash_input_denormal(a, status);
4348 b = float64_squash_input_denormal(b, status);
4350 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4351 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4353 if (float64_is_signaling_nan(a, status)
4354 || float64_is_signaling_nan(b, status)) {
4355 float_raise(float_flag_invalid, status);
4357 return 0;
4359 av = float64_val(a);
4360 bv = float64_val(b);
4361 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4365 /*----------------------------------------------------------------------------
4366 | Returns 1 if the double-precision floating-point value `a' is less than or
4367 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4368 | cause an exception. Otherwise, the comparison is performed according to the
4369 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4370 *----------------------------------------------------------------------------*/
4372 int float64_le_quiet(float64 a, float64 b, float_status *status)
4374 flag aSign, bSign;
4375 uint64_t av, bv;
4376 a = float64_squash_input_denormal(a, status);
4377 b = float64_squash_input_denormal(b, status);
4379 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4380 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4382 if (float64_is_signaling_nan(a, status)
4383 || float64_is_signaling_nan(b, status)) {
4384 float_raise(float_flag_invalid, status);
4386 return 0;
4388 aSign = extractFloat64Sign( a );
4389 bSign = extractFloat64Sign( b );
4390 av = float64_val(a);
4391 bv = float64_val(b);
4392 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4393 return ( av == bv ) || ( aSign ^ ( av < bv ) );
4397 /*----------------------------------------------------------------------------
4398 | Returns 1 if the double-precision floating-point value `a' is less than
4399 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4400 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
4401 | Standard for Binary Floating-Point Arithmetic.
4402 *----------------------------------------------------------------------------*/
4404 int float64_lt_quiet(float64 a, float64 b, float_status *status)
4406 flag aSign, bSign;
4407 uint64_t av, bv;
4408 a = float64_squash_input_denormal(a, status);
4409 b = float64_squash_input_denormal(b, status);
4411 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4412 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4414 if (float64_is_signaling_nan(a, status)
4415 || float64_is_signaling_nan(b, status)) {
4416 float_raise(float_flag_invalid, status);
4418 return 0;
4420 aSign = extractFloat64Sign( a );
4421 bSign = extractFloat64Sign( b );
4422 av = float64_val(a);
4423 bv = float64_val(b);
4424 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4425 return ( av != bv ) && ( aSign ^ ( av < bv ) );
4429 /*----------------------------------------------------------------------------
4430 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4431 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
4432 | comparison is performed according to the IEC/IEEE Standard for Binary
4433 | Floating-Point Arithmetic.
4434 *----------------------------------------------------------------------------*/
4436 int float64_unordered_quiet(float64 a, float64 b, float_status *status)
4438 a = float64_squash_input_denormal(a, status);
4439 b = float64_squash_input_denormal(b, status);
4441 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4442 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4444 if (float64_is_signaling_nan(a, status)
4445 || float64_is_signaling_nan(b, status)) {
4446 float_raise(float_flag_invalid, status);
4448 return 1;
4450 return 0;
4453 /*----------------------------------------------------------------------------
4454 | Returns the result of converting the extended double-precision floating-
4455 | point value `a' to the 32-bit two's complement integer format. The
4456 | conversion is performed according to the IEC/IEEE Standard for Binary
4457 | Floating-Point Arithmetic---which means in particular that the conversion
4458 | is rounded according to the current rounding mode. If `a' is a NaN, the
4459 | largest positive integer is returned. Otherwise, if the conversion
4460 | overflows, the largest integer with the same sign as `a' is returned.
4461 *----------------------------------------------------------------------------*/
4463 int32_t floatx80_to_int32(floatx80 a, float_status *status)
4465 flag aSign;
4466 int32_t aExp, shiftCount;
4467 uint64_t aSig;
4469 if (floatx80_invalid_encoding(a)) {
4470 float_raise(float_flag_invalid, status);
4471 return 1 << 31;
4473 aSig = extractFloatx80Frac( a );
4474 aExp = extractFloatx80Exp( a );
4475 aSign = extractFloatx80Sign( a );
4476 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4477 shiftCount = 0x4037 - aExp;
4478 if ( shiftCount <= 0 ) shiftCount = 1;
4479 shift64RightJamming( aSig, shiftCount, &aSig );
4480 return roundAndPackInt32(aSign, aSig, status);
4484 /*----------------------------------------------------------------------------
4485 | Returns the result of converting the extended double-precision floating-
4486 | point value `a' to the 32-bit two's complement integer format. The
4487 | conversion is performed according to the IEC/IEEE Standard for Binary
4488 | Floating-Point Arithmetic, except that the conversion is always rounded
4489 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4490 | Otherwise, if the conversion overflows, the largest integer with the same
4491 | sign as `a' is returned.
4492 *----------------------------------------------------------------------------*/
4494 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
4496 flag aSign;
4497 int32_t aExp, shiftCount;
4498 uint64_t aSig, savedASig;
4499 int32_t z;
4501 if (floatx80_invalid_encoding(a)) {
4502 float_raise(float_flag_invalid, status);
4503 return 1 << 31;
4505 aSig = extractFloatx80Frac( a );
4506 aExp = extractFloatx80Exp( a );
4507 aSign = extractFloatx80Sign( a );
4508 if ( 0x401E < aExp ) {
4509 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4510 goto invalid;
4512 else if ( aExp < 0x3FFF ) {
4513 if (aExp || aSig) {
4514 status->float_exception_flags |= float_flag_inexact;
4516 return 0;
4518 shiftCount = 0x403E - aExp;
4519 savedASig = aSig;
4520 aSig >>= shiftCount;
4521 z = aSig;
4522 if ( aSign ) z = - z;
4523 if ( ( z < 0 ) ^ aSign ) {
4524 invalid:
4525 float_raise(float_flag_invalid, status);
4526 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4528 if ( ( aSig<<shiftCount ) != savedASig ) {
4529 status->float_exception_flags |= float_flag_inexact;
4531 return z;
4535 /*----------------------------------------------------------------------------
4536 | Returns the result of converting the extended double-precision floating-
4537 | point value `a' to the 64-bit two's complement integer format. The
4538 | conversion is performed according to the IEC/IEEE Standard for Binary
4539 | Floating-Point Arithmetic---which means in particular that the conversion
4540 | is rounded according to the current rounding mode. If `a' is a NaN,
4541 | the largest positive integer is returned. Otherwise, if the conversion
4542 | overflows, the largest integer with the same sign as `a' is returned.
4543 *----------------------------------------------------------------------------*/
4545 int64_t floatx80_to_int64(floatx80 a, float_status *status)
4547 flag aSign;
4548 int32_t aExp, shiftCount;
4549 uint64_t aSig, aSigExtra;
4551 if (floatx80_invalid_encoding(a)) {
4552 float_raise(float_flag_invalid, status);
4553 return 1ULL << 63;
4555 aSig = extractFloatx80Frac( a );
4556 aExp = extractFloatx80Exp( a );
4557 aSign = extractFloatx80Sign( a );
4558 shiftCount = 0x403E - aExp;
4559 if ( shiftCount <= 0 ) {
4560 if ( shiftCount ) {
4561 float_raise(float_flag_invalid, status);
4562 if (!aSign || floatx80_is_any_nan(a)) {
4563 return LIT64( 0x7FFFFFFFFFFFFFFF );
4565 return (int64_t) LIT64( 0x8000000000000000 );
4567 aSigExtra = 0;
4569 else {
4570 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4572 return roundAndPackInt64(aSign, aSig, aSigExtra, status);
4576 /*----------------------------------------------------------------------------
4577 | Returns the result of converting the extended double-precision floating-
4578 | point value `a' to the 64-bit two's complement integer format. The
4579 | conversion is performed according to the IEC/IEEE Standard for Binary
4580 | Floating-Point Arithmetic, except that the conversion is always rounded
4581 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4582 | Otherwise, if the conversion overflows, the largest integer with the same
4583 | sign as `a' is returned.
4584 *----------------------------------------------------------------------------*/
4586 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
4588 flag aSign;
4589 int32_t aExp, shiftCount;
4590 uint64_t aSig;
4591 int64_t z;
4593 if (floatx80_invalid_encoding(a)) {
4594 float_raise(float_flag_invalid, status);
4595 return 1ULL << 63;
4597 aSig = extractFloatx80Frac( a );
4598 aExp = extractFloatx80Exp( a );
4599 aSign = extractFloatx80Sign( a );
4600 shiftCount = aExp - 0x403E;
4601 if ( 0 <= shiftCount ) {
4602 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4603 if ( ( a.high != 0xC03E ) || aSig ) {
4604 float_raise(float_flag_invalid, status);
4605 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4606 return LIT64( 0x7FFFFFFFFFFFFFFF );
4609 return (int64_t) LIT64( 0x8000000000000000 );
4611 else if ( aExp < 0x3FFF ) {
4612 if (aExp | aSig) {
4613 status->float_exception_flags |= float_flag_inexact;
4615 return 0;
4617 z = aSig>>( - shiftCount );
4618 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4619 status->float_exception_flags |= float_flag_inexact;
4621 if ( aSign ) z = - z;
4622 return z;
4626 /*----------------------------------------------------------------------------
4627 | Returns the result of converting the extended double-precision floating-
4628 | point value `a' to the single-precision floating-point format. The
4629 | conversion is performed according to the IEC/IEEE Standard for Binary
4630 | Floating-Point Arithmetic.
4631 *----------------------------------------------------------------------------*/
4633 float32 floatx80_to_float32(floatx80 a, float_status *status)
4635 flag aSign;
4636 int32_t aExp;
4637 uint64_t aSig;
4639 if (floatx80_invalid_encoding(a)) {
4640 float_raise(float_flag_invalid, status);
4641 return float32_default_nan(status);
4643 aSig = extractFloatx80Frac( a );
4644 aExp = extractFloatx80Exp( a );
4645 aSign = extractFloatx80Sign( a );
4646 if ( aExp == 0x7FFF ) {
4647 if ( (uint64_t) ( aSig<<1 ) ) {
4648 return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
4650 return packFloat32( aSign, 0xFF, 0 );
4652 shift64RightJamming( aSig, 33, &aSig );
4653 if ( aExp || aSig ) aExp -= 0x3F81;
4654 return roundAndPackFloat32(aSign, aExp, aSig, status);
4658 /*----------------------------------------------------------------------------
4659 | Returns the result of converting the extended double-precision floating-
4660 | point value `a' to the double-precision floating-point format. The
4661 | conversion is performed according to the IEC/IEEE Standard for Binary
4662 | Floating-Point Arithmetic.
4663 *----------------------------------------------------------------------------*/
4665 float64 floatx80_to_float64(floatx80 a, float_status *status)
4667 flag aSign;
4668 int32_t aExp;
4669 uint64_t aSig, zSig;
4671 if (floatx80_invalid_encoding(a)) {
4672 float_raise(float_flag_invalid, status);
4673 return float64_default_nan(status);
4675 aSig = extractFloatx80Frac( a );
4676 aExp = extractFloatx80Exp( a );
4677 aSign = extractFloatx80Sign( a );
4678 if ( aExp == 0x7FFF ) {
4679 if ( (uint64_t) ( aSig<<1 ) ) {
4680 return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
4682 return packFloat64( aSign, 0x7FF, 0 );
4684 shift64RightJamming( aSig, 1, &zSig );
4685 if ( aExp || aSig ) aExp -= 0x3C01;
4686 return roundAndPackFloat64(aSign, aExp, zSig, status);
4690 /*----------------------------------------------------------------------------
4691 | Returns the result of converting the extended double-precision floating-
4692 | point value `a' to the quadruple-precision floating-point format. The
4693 | conversion is performed according to the IEC/IEEE Standard for Binary
4694 | Floating-Point Arithmetic.
4695 *----------------------------------------------------------------------------*/
4697 float128 floatx80_to_float128(floatx80 a, float_status *status)
4699 flag aSign;
4700 int aExp;
4701 uint64_t aSig, zSig0, zSig1;
4703 if (floatx80_invalid_encoding(a)) {
4704 float_raise(float_flag_invalid, status);
4705 return float128_default_nan(status);
4707 aSig = extractFloatx80Frac( a );
4708 aExp = extractFloatx80Exp( a );
4709 aSign = extractFloatx80Sign( a );
4710 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4711 return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
4713 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4714 return packFloat128( aSign, aExp, zSig0, zSig1 );
4718 /*----------------------------------------------------------------------------
4719 | Rounds the extended double-precision floating-point value `a'
4720 | to the precision provided by floatx80_rounding_precision and returns the
4721 | result as an extended double-precision floating-point value.
4722 | The operation is performed according to the IEC/IEEE Standard for Binary
4723 | Floating-Point Arithmetic.
4724 *----------------------------------------------------------------------------*/
4726 floatx80 floatx80_round(floatx80 a, float_status *status)
4728 return roundAndPackFloatx80(status->floatx80_rounding_precision,
4729 extractFloatx80Sign(a),
4730 extractFloatx80Exp(a),
4731 extractFloatx80Frac(a), 0, status);
4734 /*----------------------------------------------------------------------------
4735 | Rounds the extended double-precision floating-point value `a' to an integer,
4736 | and returns the result as an extended quadruple-precision floating-point
4737 | value. The operation is performed according to the IEC/IEEE Standard for
4738 | Binary Floating-Point Arithmetic.
4739 *----------------------------------------------------------------------------*/
4741 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
4743 flag aSign;
4744 int32_t aExp;
4745 uint64_t lastBitMask, roundBitsMask;
4746 floatx80 z;
4748 if (floatx80_invalid_encoding(a)) {
4749 float_raise(float_flag_invalid, status);
4750 return floatx80_default_nan(status);
4752 aExp = extractFloatx80Exp( a );
4753 if ( 0x403E <= aExp ) {
4754 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4755 return propagateFloatx80NaN(a, a, status);
4757 return a;
4759 if ( aExp < 0x3FFF ) {
4760 if ( ( aExp == 0 )
4761 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4762 return a;
4764 status->float_exception_flags |= float_flag_inexact;
4765 aSign = extractFloatx80Sign( a );
4766 switch (status->float_rounding_mode) {
4767 case float_round_nearest_even:
4768 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4770 return
4771 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4773 break;
4774 case float_round_ties_away:
4775 if (aExp == 0x3FFE) {
4776 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
4778 break;
4779 case float_round_down:
4780 return
4781 aSign ?
4782 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4783 : packFloatx80( 0, 0, 0 );
4784 case float_round_up:
4785 return
4786 aSign ? packFloatx80( 1, 0, 0 )
4787 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4789 return packFloatx80( aSign, 0, 0 );
4791 lastBitMask = 1;
4792 lastBitMask <<= 0x403E - aExp;
4793 roundBitsMask = lastBitMask - 1;
4794 z = a;
4795 switch (status->float_rounding_mode) {
4796 case float_round_nearest_even:
4797 z.low += lastBitMask>>1;
4798 if ((z.low & roundBitsMask) == 0) {
4799 z.low &= ~lastBitMask;
4801 break;
4802 case float_round_ties_away:
4803 z.low += lastBitMask >> 1;
4804 break;
4805 case float_round_to_zero:
4806 break;
4807 case float_round_up:
4808 if (!extractFloatx80Sign(z)) {
4809 z.low += roundBitsMask;
4811 break;
4812 case float_round_down:
4813 if (extractFloatx80Sign(z)) {
4814 z.low += roundBitsMask;
4816 break;
4817 default:
4818 abort();
4820 z.low &= ~ roundBitsMask;
4821 if ( z.low == 0 ) {
4822 ++z.high;
4823 z.low = LIT64( 0x8000000000000000 );
4825 if (z.low != a.low) {
4826 status->float_exception_flags |= float_flag_inexact;
4828 return z;
4832 /*----------------------------------------------------------------------------
4833 | Returns the result of adding the absolute values of the extended double-
4834 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
4835 | negated before being returned. `zSign' is ignored if the result is a NaN.
4836 | The addition is performed according to the IEC/IEEE Standard for Binary
4837 | Floating-Point Arithmetic.
4838 *----------------------------------------------------------------------------*/
4840 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4841 float_status *status)
4843 int32_t aExp, bExp, zExp;
4844 uint64_t aSig, bSig, zSig0, zSig1;
4845 int32_t expDiff;
4847 aSig = extractFloatx80Frac( a );
4848 aExp = extractFloatx80Exp( a );
4849 bSig = extractFloatx80Frac( b );
4850 bExp = extractFloatx80Exp( b );
4851 expDiff = aExp - bExp;
4852 if ( 0 < expDiff ) {
4853 if ( aExp == 0x7FFF ) {
4854 if ((uint64_t)(aSig << 1)) {
4855 return propagateFloatx80NaN(a, b, status);
4857 return a;
4859 if ( bExp == 0 ) --expDiff;
4860 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4861 zExp = aExp;
4863 else if ( expDiff < 0 ) {
4864 if ( bExp == 0x7FFF ) {
4865 if ((uint64_t)(bSig << 1)) {
4866 return propagateFloatx80NaN(a, b, status);
4868 return packFloatx80(zSign,
4869 floatx80_infinity_high,
4870 floatx80_infinity_low);
4872 if ( aExp == 0 ) ++expDiff;
4873 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4874 zExp = bExp;
4876 else {
4877 if ( aExp == 0x7FFF ) {
4878 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4879 return propagateFloatx80NaN(a, b, status);
4881 return a;
4883 zSig1 = 0;
4884 zSig0 = aSig + bSig;
4885 if ( aExp == 0 ) {
4886 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4887 goto roundAndPack;
4889 zExp = aExp;
4890 goto shiftRight1;
4892 zSig0 = aSig + bSig;
4893 if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
4894 shiftRight1:
4895 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4896 zSig0 |= LIT64( 0x8000000000000000 );
4897 ++zExp;
4898 roundAndPack:
4899 return roundAndPackFloatx80(status->floatx80_rounding_precision,
4900 zSign, zExp, zSig0, zSig1, status);
4903 /*----------------------------------------------------------------------------
4904 | Returns the result of subtracting the absolute values of the extended
4905 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
4906 | difference is negated before being returned. `zSign' is ignored if the
4907 | result is a NaN. The subtraction is performed according to the IEC/IEEE
4908 | Standard for Binary Floating-Point Arithmetic.
4909 *----------------------------------------------------------------------------*/
4911 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4912 float_status *status)
4914 int32_t aExp, bExp, zExp;
4915 uint64_t aSig, bSig, zSig0, zSig1;
4916 int32_t expDiff;
4918 aSig = extractFloatx80Frac( a );
4919 aExp = extractFloatx80Exp( a );
4920 bSig = extractFloatx80Frac( b );
4921 bExp = extractFloatx80Exp( b );
4922 expDiff = aExp - bExp;
4923 if ( 0 < expDiff ) goto aExpBigger;
4924 if ( expDiff < 0 ) goto bExpBigger;
4925 if ( aExp == 0x7FFF ) {
4926 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4927 return propagateFloatx80NaN(a, b, status);
4929 float_raise(float_flag_invalid, status);
4930 return floatx80_default_nan(status);
4932 if ( aExp == 0 ) {
4933 aExp = 1;
4934 bExp = 1;
4936 zSig1 = 0;
4937 if ( bSig < aSig ) goto aBigger;
4938 if ( aSig < bSig ) goto bBigger;
4939 return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
4940 bExpBigger:
4941 if ( bExp == 0x7FFF ) {
4942 if ((uint64_t)(bSig << 1)) {
4943 return propagateFloatx80NaN(a, b, status);
4945 return packFloatx80(zSign ^ 1, floatx80_infinity_high,
4946 floatx80_infinity_low);
4948 if ( aExp == 0 ) ++expDiff;
4949 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4950 bBigger:
4951 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4952 zExp = bExp;
4953 zSign ^= 1;
4954 goto normalizeRoundAndPack;
4955 aExpBigger:
4956 if ( aExp == 0x7FFF ) {
4957 if ((uint64_t)(aSig << 1)) {
4958 return propagateFloatx80NaN(a, b, status);
4960 return a;
4962 if ( bExp == 0 ) --expDiff;
4963 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4964 aBigger:
4965 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4966 zExp = aExp;
4967 normalizeRoundAndPack:
4968 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
4969 zSign, zExp, zSig0, zSig1, status);
4972 /*----------------------------------------------------------------------------
4973 | Returns the result of adding the extended double-precision floating-point
4974 | values `a' and `b'. The operation is performed according to the IEC/IEEE
4975 | Standard for Binary Floating-Point Arithmetic.
4976 *----------------------------------------------------------------------------*/
4978 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
4980 flag aSign, bSign;
4982 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
4983 float_raise(float_flag_invalid, status);
4984 return floatx80_default_nan(status);
4986 aSign = extractFloatx80Sign( a );
4987 bSign = extractFloatx80Sign( b );
4988 if ( aSign == bSign ) {
4989 return addFloatx80Sigs(a, b, aSign, status);
4991 else {
4992 return subFloatx80Sigs(a, b, aSign, status);
4997 /*----------------------------------------------------------------------------
4998 | Returns the result of subtracting the extended double-precision floating-
4999 | point values `a' and `b'. The operation is performed according to the
5000 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5001 *----------------------------------------------------------------------------*/
5003 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5005 flag aSign, bSign;
5007 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5008 float_raise(float_flag_invalid, status);
5009 return floatx80_default_nan(status);
5011 aSign = extractFloatx80Sign( a );
5012 bSign = extractFloatx80Sign( b );
5013 if ( aSign == bSign ) {
5014 return subFloatx80Sigs(a, b, aSign, status);
5016 else {
5017 return addFloatx80Sigs(a, b, aSign, status);
5022 /*----------------------------------------------------------------------------
5023 | Returns the result of multiplying the extended double-precision floating-
5024 | point values `a' and `b'. The operation is performed according to the
5025 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5026 *----------------------------------------------------------------------------*/
5028 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
5030 flag aSign, bSign, zSign;
5031 int32_t aExp, bExp, zExp;
5032 uint64_t aSig, bSig, zSig0, zSig1;
5034 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5035 float_raise(float_flag_invalid, status);
5036 return floatx80_default_nan(status);
5038 aSig = extractFloatx80Frac( a );
5039 aExp = extractFloatx80Exp( a );
5040 aSign = extractFloatx80Sign( a );
5041 bSig = extractFloatx80Frac( b );
5042 bExp = extractFloatx80Exp( b );
5043 bSign = extractFloatx80Sign( b );
5044 zSign = aSign ^ bSign;
5045 if ( aExp == 0x7FFF ) {
5046 if ( (uint64_t) ( aSig<<1 )
5047 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5048 return propagateFloatx80NaN(a, b, status);
5050 if ( ( bExp | bSig ) == 0 ) goto invalid;
5051 return packFloatx80(zSign, floatx80_infinity_high,
5052 floatx80_infinity_low);
5054 if ( bExp == 0x7FFF ) {
5055 if ((uint64_t)(bSig << 1)) {
5056 return propagateFloatx80NaN(a, b, status);
5058 if ( ( aExp | aSig ) == 0 ) {
5059 invalid:
5060 float_raise(float_flag_invalid, status);
5061 return floatx80_default_nan(status);
5063 return packFloatx80(zSign, floatx80_infinity_high,
5064 floatx80_infinity_low);
5066 if ( aExp == 0 ) {
5067 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5068 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5070 if ( bExp == 0 ) {
5071 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5072 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5074 zExp = aExp + bExp - 0x3FFE;
5075 mul64To128( aSig, bSig, &zSig0, &zSig1 );
5076 if ( 0 < (int64_t) zSig0 ) {
5077 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5078 --zExp;
5080 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5081 zSign, zExp, zSig0, zSig1, status);
5084 /*----------------------------------------------------------------------------
5085 | Returns the result of dividing the extended double-precision floating-point
5086 | value `a' by the corresponding value `b'. The operation is performed
5087 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5088 *----------------------------------------------------------------------------*/
5090 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
5092 flag aSign, bSign, zSign;
5093 int32_t aExp, bExp, zExp;
5094 uint64_t aSig, bSig, zSig0, zSig1;
5095 uint64_t rem0, rem1, rem2, term0, term1, term2;
5097 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5098 float_raise(float_flag_invalid, status);
5099 return floatx80_default_nan(status);
5101 aSig = extractFloatx80Frac( a );
5102 aExp = extractFloatx80Exp( a );
5103 aSign = extractFloatx80Sign( a );
5104 bSig = extractFloatx80Frac( b );
5105 bExp = extractFloatx80Exp( b );
5106 bSign = extractFloatx80Sign( b );
5107 zSign = aSign ^ bSign;
5108 if ( aExp == 0x7FFF ) {
5109 if ((uint64_t)(aSig << 1)) {
5110 return propagateFloatx80NaN(a, b, status);
5112 if ( bExp == 0x7FFF ) {
5113 if ((uint64_t)(bSig << 1)) {
5114 return propagateFloatx80NaN(a, b, status);
5116 goto invalid;
5118 return packFloatx80(zSign, floatx80_infinity_high,
5119 floatx80_infinity_low);
5121 if ( bExp == 0x7FFF ) {
5122 if ((uint64_t)(bSig << 1)) {
5123 return propagateFloatx80NaN(a, b, status);
5125 return packFloatx80( zSign, 0, 0 );
5127 if ( bExp == 0 ) {
5128 if ( bSig == 0 ) {
5129 if ( ( aExp | aSig ) == 0 ) {
5130 invalid:
5131 float_raise(float_flag_invalid, status);
5132 return floatx80_default_nan(status);
5134 float_raise(float_flag_divbyzero, status);
5135 return packFloatx80(zSign, floatx80_infinity_high,
5136 floatx80_infinity_low);
5138 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5140 if ( aExp == 0 ) {
5141 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5142 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5144 zExp = aExp - bExp + 0x3FFE;
5145 rem1 = 0;
5146 if ( bSig <= aSig ) {
5147 shift128Right( aSig, 0, 1, &aSig, &rem1 );
5148 ++zExp;
5150 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5151 mul64To128( bSig, zSig0, &term0, &term1 );
5152 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
5153 while ( (int64_t) rem0 < 0 ) {
5154 --zSig0;
5155 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5157 zSig1 = estimateDiv128To64( rem1, 0, bSig );
5158 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
5159 mul64To128( bSig, zSig1, &term1, &term2 );
5160 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5161 while ( (int64_t) rem1 < 0 ) {
5162 --zSig1;
5163 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5165 zSig1 |= ( ( rem1 | rem2 ) != 0 );
5167 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5168 zSign, zExp, zSig0, zSig1, status);
5171 /*----------------------------------------------------------------------------
5172 | Returns the remainder of the extended double-precision floating-point value
5173 | `a' with respect to the corresponding value `b'. The operation is performed
5174 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5175 *----------------------------------------------------------------------------*/
5177 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
5179 flag aSign, zSign;
5180 int32_t aExp, bExp, expDiff;
5181 uint64_t aSig0, aSig1, bSig;
5182 uint64_t q, term0, term1, alternateASig0, alternateASig1;
5184 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5185 float_raise(float_flag_invalid, status);
5186 return floatx80_default_nan(status);
5188 aSig0 = extractFloatx80Frac( a );
5189 aExp = extractFloatx80Exp( a );
5190 aSign = extractFloatx80Sign( a );
5191 bSig = extractFloatx80Frac( b );
5192 bExp = extractFloatx80Exp( b );
5193 if ( aExp == 0x7FFF ) {
5194 if ( (uint64_t) ( aSig0<<1 )
5195 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5196 return propagateFloatx80NaN(a, b, status);
5198 goto invalid;
5200 if ( bExp == 0x7FFF ) {
5201 if ((uint64_t)(bSig << 1)) {
5202 return propagateFloatx80NaN(a, b, status);
5204 return a;
5206 if ( bExp == 0 ) {
5207 if ( bSig == 0 ) {
5208 invalid:
5209 float_raise(float_flag_invalid, status);
5210 return floatx80_default_nan(status);
5212 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5214 if ( aExp == 0 ) {
5215 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5216 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5218 bSig |= LIT64( 0x8000000000000000 );
5219 zSign = aSign;
5220 expDiff = aExp - bExp;
5221 aSig1 = 0;
5222 if ( expDiff < 0 ) {
5223 if ( expDiff < -1 ) return a;
5224 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5225 expDiff = 0;
5227 q = ( bSig <= aSig0 );
5228 if ( q ) aSig0 -= bSig;
5229 expDiff -= 64;
5230 while ( 0 < expDiff ) {
5231 q = estimateDiv128To64( aSig0, aSig1, bSig );
5232 q = ( 2 < q ) ? q - 2 : 0;
5233 mul64To128( bSig, q, &term0, &term1 );
5234 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5235 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5236 expDiff -= 62;
5238 expDiff += 64;
5239 if ( 0 < expDiff ) {
5240 q = estimateDiv128To64( aSig0, aSig1, bSig );
5241 q = ( 2 < q ) ? q - 2 : 0;
5242 q >>= 64 - expDiff;
5243 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5244 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5245 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5246 while ( le128( term0, term1, aSig0, aSig1 ) ) {
5247 ++q;
5248 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5251 else {
5252 term1 = 0;
5253 term0 = bSig;
5255 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5256 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5257 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5258 && ( q & 1 ) )
5260 aSig0 = alternateASig0;
5261 aSig1 = alternateASig1;
5262 zSign = ! zSign;
5264 return
5265 normalizeRoundAndPackFloatx80(
5266 80, zSign, bExp + expDiff, aSig0, aSig1, status);
5270 /*----------------------------------------------------------------------------
5271 | Returns the square root of the extended double-precision floating-point
5272 | value `a'. The operation is performed according to the IEC/IEEE Standard
5273 | for Binary Floating-Point Arithmetic.
5274 *----------------------------------------------------------------------------*/
5276 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
5278 flag aSign;
5279 int32_t aExp, zExp;
5280 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5281 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5283 if (floatx80_invalid_encoding(a)) {
5284 float_raise(float_flag_invalid, status);
5285 return floatx80_default_nan(status);
5287 aSig0 = extractFloatx80Frac( a );
5288 aExp = extractFloatx80Exp( a );
5289 aSign = extractFloatx80Sign( a );
5290 if ( aExp == 0x7FFF ) {
5291 if ((uint64_t)(aSig0 << 1)) {
5292 return propagateFloatx80NaN(a, a, status);
5294 if ( ! aSign ) return a;
5295 goto invalid;
5297 if ( aSign ) {
5298 if ( ( aExp | aSig0 ) == 0 ) return a;
5299 invalid:
5300 float_raise(float_flag_invalid, status);
5301 return floatx80_default_nan(status);
5303 if ( aExp == 0 ) {
5304 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5305 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5307 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5308 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5309 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5310 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5311 doubleZSig0 = zSig0<<1;
5312 mul64To128( zSig0, zSig0, &term0, &term1 );
5313 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5314 while ( (int64_t) rem0 < 0 ) {
5315 --zSig0;
5316 doubleZSig0 -= 2;
5317 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5319 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5320 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5321 if ( zSig1 == 0 ) zSig1 = 1;
5322 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5323 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5324 mul64To128( zSig1, zSig1, &term2, &term3 );
5325 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5326 while ( (int64_t) rem1 < 0 ) {
5327 --zSig1;
5328 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5329 term3 |= 1;
5330 term2 |= doubleZSig0;
5331 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5333 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5335 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5336 zSig0 |= doubleZSig0;
5337 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5338 0, zExp, zSig0, zSig1, status);
5341 /*----------------------------------------------------------------------------
5342 | Returns 1 if the extended double-precision floating-point value `a' is equal
5343 | to the corresponding value `b', and 0 otherwise. The invalid exception is
5344 | raised if either operand is a NaN. Otherwise, the comparison is performed
5345 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5346 *----------------------------------------------------------------------------*/
5348 int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
5351 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5352 || (extractFloatx80Exp(a) == 0x7FFF
5353 && (uint64_t) (extractFloatx80Frac(a) << 1))
5354 || (extractFloatx80Exp(b) == 0x7FFF
5355 && (uint64_t) (extractFloatx80Frac(b) << 1))
5357 float_raise(float_flag_invalid, status);
5358 return 0;
5360 return
5361 ( a.low == b.low )
5362 && ( ( a.high == b.high )
5363 || ( ( a.low == 0 )
5364 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5369 /*----------------------------------------------------------------------------
5370 | Returns 1 if the extended double-precision floating-point value `a' is
5371 | less than or equal to the corresponding value `b', and 0 otherwise. The
5372 | invalid exception is raised if either operand is a NaN. The comparison is
5373 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5374 | Arithmetic.
5375 *----------------------------------------------------------------------------*/
5377 int floatx80_le(floatx80 a, floatx80 b, float_status *status)
5379 flag aSign, bSign;
5381 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5382 || (extractFloatx80Exp(a) == 0x7FFF
5383 && (uint64_t) (extractFloatx80Frac(a) << 1))
5384 || (extractFloatx80Exp(b) == 0x7FFF
5385 && (uint64_t) (extractFloatx80Frac(b) << 1))
5387 float_raise(float_flag_invalid, status);
5388 return 0;
5390 aSign = extractFloatx80Sign( a );
5391 bSign = extractFloatx80Sign( b );
5392 if ( aSign != bSign ) {
5393 return
5394 aSign
5395 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5396 == 0 );
5398 return
5399 aSign ? le128( b.high, b.low, a.high, a.low )
5400 : le128( a.high, a.low, b.high, b.low );
5404 /*----------------------------------------------------------------------------
5405 | Returns 1 if the extended double-precision floating-point value `a' is
5406 | less than the corresponding value `b', and 0 otherwise. The invalid
5407 | exception is raised if either operand is a NaN. The comparison is performed
5408 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5409 *----------------------------------------------------------------------------*/
5411 int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
5413 flag aSign, bSign;
5415 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5416 || (extractFloatx80Exp(a) == 0x7FFF
5417 && (uint64_t) (extractFloatx80Frac(a) << 1))
5418 || (extractFloatx80Exp(b) == 0x7FFF
5419 && (uint64_t) (extractFloatx80Frac(b) << 1))
5421 float_raise(float_flag_invalid, status);
5422 return 0;
5424 aSign = extractFloatx80Sign( a );
5425 bSign = extractFloatx80Sign( b );
5426 if ( aSign != bSign ) {
5427 return
5428 aSign
5429 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5430 != 0 );
5432 return
5433 aSign ? lt128( b.high, b.low, a.high, a.low )
5434 : lt128( a.high, a.low, b.high, b.low );
5438 /*----------------------------------------------------------------------------
5439 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5440 | cannot be compared, and 0 otherwise. The invalid exception is raised if
5441 | either operand is a NaN. The comparison is performed according to the
5442 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5443 *----------------------------------------------------------------------------*/
5444 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
5446 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5447 || (extractFloatx80Exp(a) == 0x7FFF
5448 && (uint64_t) (extractFloatx80Frac(a) << 1))
5449 || (extractFloatx80Exp(b) == 0x7FFF
5450 && (uint64_t) (extractFloatx80Frac(b) << 1))
5452 float_raise(float_flag_invalid, status);
5453 return 1;
5455 return 0;
5458 /*----------------------------------------------------------------------------
5459 | Returns 1 if the extended double-precision floating-point value `a' is
5460 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5461 | cause an exception. The comparison is performed according to the IEC/IEEE
5462 | Standard for Binary Floating-Point Arithmetic.
5463 *----------------------------------------------------------------------------*/
5465 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
5468 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5469 float_raise(float_flag_invalid, status);
5470 return 0;
5472 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5473 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5474 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5475 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5477 if (floatx80_is_signaling_nan(a, status)
5478 || floatx80_is_signaling_nan(b, status)) {
5479 float_raise(float_flag_invalid, status);
5481 return 0;
5483 return
5484 ( a.low == b.low )
5485 && ( ( a.high == b.high )
5486 || ( ( a.low == 0 )
5487 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5492 /*----------------------------------------------------------------------------
5493 | Returns 1 if the extended double-precision floating-point value `a' is less
5494 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
5495 | do not cause an exception. Otherwise, the comparison is performed according
5496 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5497 *----------------------------------------------------------------------------*/
5499 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
5501 flag aSign, bSign;
5503 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5504 float_raise(float_flag_invalid, status);
5505 return 0;
5507 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5508 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5509 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5510 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5512 if (floatx80_is_signaling_nan(a, status)
5513 || floatx80_is_signaling_nan(b, status)) {
5514 float_raise(float_flag_invalid, status);
5516 return 0;
5518 aSign = extractFloatx80Sign( a );
5519 bSign = extractFloatx80Sign( b );
5520 if ( aSign != bSign ) {
5521 return
5522 aSign
5523 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5524 == 0 );
5526 return
5527 aSign ? le128( b.high, b.low, a.high, a.low )
5528 : le128( a.high, a.low, b.high, b.low );
5532 /*----------------------------------------------------------------------------
5533 | Returns 1 if the extended double-precision floating-point value `a' is less
5534 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
5535 | an exception. Otherwise, the comparison is performed according to the
5536 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5537 *----------------------------------------------------------------------------*/
5539 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
5541 flag aSign, bSign;
5543 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5544 float_raise(float_flag_invalid, status);
5545 return 0;
5547 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5548 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5549 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5550 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5552 if (floatx80_is_signaling_nan(a, status)
5553 || floatx80_is_signaling_nan(b, status)) {
5554 float_raise(float_flag_invalid, status);
5556 return 0;
5558 aSign = extractFloatx80Sign( a );
5559 bSign = extractFloatx80Sign( b );
5560 if ( aSign != bSign ) {
5561 return
5562 aSign
5563 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5564 != 0 );
5566 return
5567 aSign ? lt128( b.high, b.low, a.high, a.low )
5568 : lt128( a.high, a.low, b.high, b.low );
5572 /*----------------------------------------------------------------------------
5573 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5574 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
5575 | The comparison is performed according to the IEC/IEEE Standard for Binary
5576 | Floating-Point Arithmetic.
5577 *----------------------------------------------------------------------------*/
5578 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
5580 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5581 float_raise(float_flag_invalid, status);
5582 return 1;
5584 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5585 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5586 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5587 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5589 if (floatx80_is_signaling_nan(a, status)
5590 || floatx80_is_signaling_nan(b, status)) {
5591 float_raise(float_flag_invalid, status);
5593 return 1;
5595 return 0;
5598 /*----------------------------------------------------------------------------
5599 | Returns the result of converting the quadruple-precision floating-point
5600 | value `a' to the 32-bit two's complement integer format. The conversion
5601 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5602 | Arithmetic---which means in particular that the conversion is rounded
5603 | according to the current rounding mode. If `a' is a NaN, the largest
5604 | positive integer is returned. Otherwise, if the conversion overflows, the
5605 | largest integer with the same sign as `a' is returned.
5606 *----------------------------------------------------------------------------*/
5608 int32_t float128_to_int32(float128 a, float_status *status)
5610 flag aSign;
5611 int32_t aExp, shiftCount;
5612 uint64_t aSig0, aSig1;
5614 aSig1 = extractFloat128Frac1( a );
5615 aSig0 = extractFloat128Frac0( a );
5616 aExp = extractFloat128Exp( a );
5617 aSign = extractFloat128Sign( a );
5618 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5619 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5620 aSig0 |= ( aSig1 != 0 );
5621 shiftCount = 0x4028 - aExp;
5622 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5623 return roundAndPackInt32(aSign, aSig0, status);
5627 /*----------------------------------------------------------------------------
5628 | Returns the result of converting the quadruple-precision floating-point
5629 | value `a' to the 32-bit two's complement integer format. The conversion
5630 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5631 | Arithmetic, except that the conversion is always rounded toward zero. If
5632 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
5633 | conversion overflows, the largest integer with the same sign as `a' is
5634 | returned.
5635 *----------------------------------------------------------------------------*/
5637 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
5639 flag aSign;
5640 int32_t aExp, shiftCount;
5641 uint64_t aSig0, aSig1, savedASig;
5642 int32_t z;
5644 aSig1 = extractFloat128Frac1( a );
5645 aSig0 = extractFloat128Frac0( a );
5646 aExp = extractFloat128Exp( a );
5647 aSign = extractFloat128Sign( a );
5648 aSig0 |= ( aSig1 != 0 );
5649 if ( 0x401E < aExp ) {
5650 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5651 goto invalid;
5653 else if ( aExp < 0x3FFF ) {
5654 if (aExp || aSig0) {
5655 status->float_exception_flags |= float_flag_inexact;
5657 return 0;
5659 aSig0 |= LIT64( 0x0001000000000000 );
5660 shiftCount = 0x402F - aExp;
5661 savedASig = aSig0;
5662 aSig0 >>= shiftCount;
5663 z = aSig0;
5664 if ( aSign ) z = - z;
5665 if ( ( z < 0 ) ^ aSign ) {
5666 invalid:
5667 float_raise(float_flag_invalid, status);
5668 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5670 if ( ( aSig0<<shiftCount ) != savedASig ) {
5671 status->float_exception_flags |= float_flag_inexact;
5673 return z;
5677 /*----------------------------------------------------------------------------
5678 | Returns the result of converting the quadruple-precision floating-point
5679 | value `a' to the 64-bit two's complement integer format. The conversion
5680 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5681 | Arithmetic---which means in particular that the conversion is rounded
5682 | according to the current rounding mode. If `a' is a NaN, the largest
5683 | positive integer is returned. Otherwise, if the conversion overflows, the
5684 | largest integer with the same sign as `a' is returned.
5685 *----------------------------------------------------------------------------*/
5687 int64_t float128_to_int64(float128 a, float_status *status)
5689 flag aSign;
5690 int32_t aExp, shiftCount;
5691 uint64_t aSig0, aSig1;
5693 aSig1 = extractFloat128Frac1( a );
5694 aSig0 = extractFloat128Frac0( a );
5695 aExp = extractFloat128Exp( a );
5696 aSign = extractFloat128Sign( a );
5697 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5698 shiftCount = 0x402F - aExp;
5699 if ( shiftCount <= 0 ) {
5700 if ( 0x403E < aExp ) {
5701 float_raise(float_flag_invalid, status);
5702 if ( ! aSign
5703 || ( ( aExp == 0x7FFF )
5704 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5707 return LIT64( 0x7FFFFFFFFFFFFFFF );
5709 return (int64_t) LIT64( 0x8000000000000000 );
5711 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5713 else {
5714 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5716 return roundAndPackInt64(aSign, aSig0, aSig1, status);
5720 /*----------------------------------------------------------------------------
5721 | Returns the result of converting the quadruple-precision floating-point
5722 | value `a' to the 64-bit two's complement integer format. The conversion
5723 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5724 | Arithmetic, except that the conversion is always rounded toward zero.
5725 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
5726 | the conversion overflows, the largest integer with the same sign as `a' is
5727 | returned.
5728 *----------------------------------------------------------------------------*/
5730 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
5732 flag aSign;
5733 int32_t aExp, shiftCount;
5734 uint64_t aSig0, aSig1;
5735 int64_t z;
5737 aSig1 = extractFloat128Frac1( a );
5738 aSig0 = extractFloat128Frac0( a );
5739 aExp = extractFloat128Exp( a );
5740 aSign = extractFloat128Sign( a );
5741 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5742 shiftCount = aExp - 0x402F;
5743 if ( 0 < shiftCount ) {
5744 if ( 0x403E <= aExp ) {
5745 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5746 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
5747 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5748 if (aSig1) {
5749 status->float_exception_flags |= float_flag_inexact;
5752 else {
5753 float_raise(float_flag_invalid, status);
5754 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5755 return LIT64( 0x7FFFFFFFFFFFFFFF );
5758 return (int64_t) LIT64( 0x8000000000000000 );
5760 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
5761 if ( (uint64_t) ( aSig1<<shiftCount ) ) {
5762 status->float_exception_flags |= float_flag_inexact;
5765 else {
5766 if ( aExp < 0x3FFF ) {
5767 if ( aExp | aSig0 | aSig1 ) {
5768 status->float_exception_flags |= float_flag_inexact;
5770 return 0;
5772 z = aSig0>>( - shiftCount );
5773 if ( aSig1
5774 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
5775 status->float_exception_flags |= float_flag_inexact;
5778 if ( aSign ) z = - z;
5779 return z;
5783 /*----------------------------------------------------------------------------
5784 | Returns the result of converting the quadruple-precision floating-point value
5785 | `a' to the 64-bit unsigned integer format. The conversion is
5786 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5787 | Arithmetic---which means in particular that the conversion is rounded
5788 | according to the current rounding mode. If `a' is a NaN, the largest
5789 | positive integer is returned. If the conversion overflows, the
5790 | largest unsigned integer is returned. If 'a' is negative, the value is
5791 | rounded and zero is returned; negative values that do not round to zero
5792 | will raise the inexact exception.
5793 *----------------------------------------------------------------------------*/
5795 uint64_t float128_to_uint64(float128 a, float_status *status)
5797 flag aSign;
5798 int aExp;
5799 int shiftCount;
5800 uint64_t aSig0, aSig1;
5802 aSig0 = extractFloat128Frac0(a);
5803 aSig1 = extractFloat128Frac1(a);
5804 aExp = extractFloat128Exp(a);
5805 aSign = extractFloat128Sign(a);
5806 if (aSign && (aExp > 0x3FFE)) {
5807 float_raise(float_flag_invalid, status);
5808 if (float128_is_any_nan(a)) {
5809 return LIT64(0xFFFFFFFFFFFFFFFF);
5810 } else {
5811 return 0;
5814 if (aExp) {
5815 aSig0 |= LIT64(0x0001000000000000);
5817 shiftCount = 0x402F - aExp;
5818 if (shiftCount <= 0) {
5819 if (0x403E < aExp) {
5820 float_raise(float_flag_invalid, status);
5821 return LIT64(0xFFFFFFFFFFFFFFFF);
5823 shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
5824 } else {
5825 shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
5827 return roundAndPackUint64(aSign, aSig0, aSig1, status);
5830 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
5832 uint64_t v;
5833 signed char current_rounding_mode = status->float_rounding_mode;
5835 set_float_rounding_mode(float_round_to_zero, status);
5836 v = float128_to_uint64(a, status);
5837 set_float_rounding_mode(current_rounding_mode, status);
5839 return v;
5842 /*----------------------------------------------------------------------------
5843 | Returns the result of converting the quadruple-precision floating-point
5844 | value `a' to the 32-bit unsigned integer format. The conversion
5845 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5846 | Arithmetic except that the conversion is always rounded toward zero.
5847 | If `a' is a NaN, the largest positive integer is returned. Otherwise,
5848 | if the conversion overflows, the largest unsigned integer is returned.
5849 | If 'a' is negative, the value is rounded and zero is returned; negative
5850 | values that do not round to zero will raise the inexact exception.
5851 *----------------------------------------------------------------------------*/
5853 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
5855 uint64_t v;
5856 uint32_t res;
5857 int old_exc_flags = get_float_exception_flags(status);
5859 v = float128_to_uint64_round_to_zero(a, status);
5860 if (v > 0xffffffff) {
5861 res = 0xffffffff;
5862 } else {
5863 return v;
5865 set_float_exception_flags(old_exc_flags, status);
5866 float_raise(float_flag_invalid, status);
5867 return res;
5870 /*----------------------------------------------------------------------------
5871 | Returns the result of converting the quadruple-precision floating-point
5872 | value `a' to the single-precision floating-point format. The conversion
5873 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5874 | Arithmetic.
5875 *----------------------------------------------------------------------------*/
5877 float32 float128_to_float32(float128 a, float_status *status)
5879 flag aSign;
5880 int32_t aExp;
5881 uint64_t aSig0, aSig1;
5882 uint32_t zSig;
5884 aSig1 = extractFloat128Frac1( a );
5885 aSig0 = extractFloat128Frac0( a );
5886 aExp = extractFloat128Exp( a );
5887 aSign = extractFloat128Sign( a );
5888 if ( aExp == 0x7FFF ) {
5889 if ( aSig0 | aSig1 ) {
5890 return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
5892 return packFloat32( aSign, 0xFF, 0 );
5894 aSig0 |= ( aSig1 != 0 );
5895 shift64RightJamming( aSig0, 18, &aSig0 );
5896 zSig = aSig0;
5897 if ( aExp || zSig ) {
5898 zSig |= 0x40000000;
5899 aExp -= 0x3F81;
5901 return roundAndPackFloat32(aSign, aExp, zSig, status);
5905 /*----------------------------------------------------------------------------
5906 | Returns the result of converting the quadruple-precision floating-point
5907 | value `a' to the double-precision floating-point format. The conversion
5908 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5909 | Arithmetic.
5910 *----------------------------------------------------------------------------*/
5912 float64 float128_to_float64(float128 a, float_status *status)
5914 flag aSign;
5915 int32_t aExp;
5916 uint64_t aSig0, aSig1;
5918 aSig1 = extractFloat128Frac1( a );
5919 aSig0 = extractFloat128Frac0( a );
5920 aExp = extractFloat128Exp( a );
5921 aSign = extractFloat128Sign( a );
5922 if ( aExp == 0x7FFF ) {
5923 if ( aSig0 | aSig1 ) {
5924 return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
5926 return packFloat64( aSign, 0x7FF, 0 );
5928 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5929 aSig0 |= ( aSig1 != 0 );
5930 if ( aExp || aSig0 ) {
5931 aSig0 |= LIT64( 0x4000000000000000 );
5932 aExp -= 0x3C01;
5934 return roundAndPackFloat64(aSign, aExp, aSig0, status);
5938 /*----------------------------------------------------------------------------
5939 | Returns the result of converting the quadruple-precision floating-point
5940 | value `a' to the extended double-precision floating-point format. The
5941 | conversion is performed according to the IEC/IEEE Standard for Binary
5942 | Floating-Point Arithmetic.
5943 *----------------------------------------------------------------------------*/
5945 floatx80 float128_to_floatx80(float128 a, float_status *status)
5947 flag aSign;
5948 int32_t aExp;
5949 uint64_t aSig0, aSig1;
5951 aSig1 = extractFloat128Frac1( a );
5952 aSig0 = extractFloat128Frac0( a );
5953 aExp = extractFloat128Exp( a );
5954 aSign = extractFloat128Sign( a );
5955 if ( aExp == 0x7FFF ) {
5956 if ( aSig0 | aSig1 ) {
5957 return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
5959 return packFloatx80(aSign, floatx80_infinity_high,
5960 floatx80_infinity_low);
5962 if ( aExp == 0 ) {
5963 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5964 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5966 else {
5967 aSig0 |= LIT64( 0x0001000000000000 );
5969 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
5970 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
5974 /*----------------------------------------------------------------------------
5975 | Rounds the quadruple-precision floating-point value `a' to an integer, and
5976 | returns the result as a quadruple-precision floating-point value. The
5977 | operation is performed according to the IEC/IEEE Standard for Binary
5978 | Floating-Point Arithmetic.
5979 *----------------------------------------------------------------------------*/
5981 float128 float128_round_to_int(float128 a, float_status *status)
5983 flag aSign;
5984 int32_t aExp;
5985 uint64_t lastBitMask, roundBitsMask;
5986 float128 z;
5988 aExp = extractFloat128Exp( a );
5989 if ( 0x402F <= aExp ) {
5990 if ( 0x406F <= aExp ) {
5991 if ( ( aExp == 0x7FFF )
5992 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
5994 return propagateFloat128NaN(a, a, status);
5996 return a;
5998 lastBitMask = 1;
5999 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
6000 roundBitsMask = lastBitMask - 1;
6001 z = a;
6002 switch (status->float_rounding_mode) {
6003 case float_round_nearest_even:
6004 if ( lastBitMask ) {
6005 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6006 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6008 else {
6009 if ( (int64_t) z.low < 0 ) {
6010 ++z.high;
6011 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
6014 break;
6015 case float_round_ties_away:
6016 if (lastBitMask) {
6017 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6018 } else {
6019 if ((int64_t) z.low < 0) {
6020 ++z.high;
6023 break;
6024 case float_round_to_zero:
6025 break;
6026 case float_round_up:
6027 if (!extractFloat128Sign(z)) {
6028 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6030 break;
6031 case float_round_down:
6032 if (extractFloat128Sign(z)) {
6033 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6035 break;
6036 default:
6037 abort();
6039 z.low &= ~ roundBitsMask;
6041 else {
6042 if ( aExp < 0x3FFF ) {
6043 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
6044 status->float_exception_flags |= float_flag_inexact;
6045 aSign = extractFloat128Sign( a );
6046 switch (status->float_rounding_mode) {
6047 case float_round_nearest_even:
6048 if ( ( aExp == 0x3FFE )
6049 && ( extractFloat128Frac0( a )
6050 | extractFloat128Frac1( a ) )
6052 return packFloat128( aSign, 0x3FFF, 0, 0 );
6054 break;
6055 case float_round_ties_away:
6056 if (aExp == 0x3FFE) {
6057 return packFloat128(aSign, 0x3FFF, 0, 0);
6059 break;
6060 case float_round_down:
6061 return
6062 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6063 : packFloat128( 0, 0, 0, 0 );
6064 case float_round_up:
6065 return
6066 aSign ? packFloat128( 1, 0, 0, 0 )
6067 : packFloat128( 0, 0x3FFF, 0, 0 );
6069 return packFloat128( aSign, 0, 0, 0 );
6071 lastBitMask = 1;
6072 lastBitMask <<= 0x402F - aExp;
6073 roundBitsMask = lastBitMask - 1;
6074 z.low = 0;
6075 z.high = a.high;
6076 switch (status->float_rounding_mode) {
6077 case float_round_nearest_even:
6078 z.high += lastBitMask>>1;
6079 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6080 z.high &= ~ lastBitMask;
6082 break;
6083 case float_round_ties_away:
6084 z.high += lastBitMask>>1;
6085 break;
6086 case float_round_to_zero:
6087 break;
6088 case float_round_up:
6089 if (!extractFloat128Sign(z)) {
6090 z.high |= ( a.low != 0 );
6091 z.high += roundBitsMask;
6093 break;
6094 case float_round_down:
6095 if (extractFloat128Sign(z)) {
6096 z.high |= (a.low != 0);
6097 z.high += roundBitsMask;
6099 break;
6100 default:
6101 abort();
6103 z.high &= ~ roundBitsMask;
6105 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
6106 status->float_exception_flags |= float_flag_inexact;
6108 return z;
6112 /*----------------------------------------------------------------------------
6113 | Returns the result of adding the absolute values of the quadruple-precision
6114 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
6115 | before being returned. `zSign' is ignored if the result is a NaN.
6116 | The addition is performed according to the IEC/IEEE Standard for Binary
6117 | Floating-Point Arithmetic.
6118 *----------------------------------------------------------------------------*/
6120 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
6121 float_status *status)
6123 int32_t aExp, bExp, zExp;
6124 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6125 int32_t expDiff;
6127 aSig1 = extractFloat128Frac1( a );
6128 aSig0 = extractFloat128Frac0( a );
6129 aExp = extractFloat128Exp( a );
6130 bSig1 = extractFloat128Frac1( b );
6131 bSig0 = extractFloat128Frac0( b );
6132 bExp = extractFloat128Exp( b );
6133 expDiff = aExp - bExp;
6134 if ( 0 < expDiff ) {
6135 if ( aExp == 0x7FFF ) {
6136 if (aSig0 | aSig1) {
6137 return propagateFloat128NaN(a, b, status);
6139 return a;
6141 if ( bExp == 0 ) {
6142 --expDiff;
6144 else {
6145 bSig0 |= LIT64( 0x0001000000000000 );
6147 shift128ExtraRightJamming(
6148 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6149 zExp = aExp;
6151 else if ( expDiff < 0 ) {
6152 if ( bExp == 0x7FFF ) {
6153 if (bSig0 | bSig1) {
6154 return propagateFloat128NaN(a, b, status);
6156 return packFloat128( zSign, 0x7FFF, 0, 0 );
6158 if ( aExp == 0 ) {
6159 ++expDiff;
6161 else {
6162 aSig0 |= LIT64( 0x0001000000000000 );
6164 shift128ExtraRightJamming(
6165 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6166 zExp = bExp;
6168 else {
6169 if ( aExp == 0x7FFF ) {
6170 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6171 return propagateFloat128NaN(a, b, status);
6173 return a;
6175 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6176 if ( aExp == 0 ) {
6177 if (status->flush_to_zero) {
6178 if (zSig0 | zSig1) {
6179 float_raise(float_flag_output_denormal, status);
6181 return packFloat128(zSign, 0, 0, 0);
6183 return packFloat128( zSign, 0, zSig0, zSig1 );
6185 zSig2 = 0;
6186 zSig0 |= LIT64( 0x0002000000000000 );
6187 zExp = aExp;
6188 goto shiftRight1;
6190 aSig0 |= LIT64( 0x0001000000000000 );
6191 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6192 --zExp;
6193 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6194 ++zExp;
6195 shiftRight1:
6196 shift128ExtraRightJamming(
6197 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6198 roundAndPack:
6199 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6203 /*----------------------------------------------------------------------------
6204 | Returns the result of subtracting the absolute values of the quadruple-
6205 | precision floating-point values `a' and `b'. If `zSign' is 1, the
6206 | difference is negated before being returned. `zSign' is ignored if the
6207 | result is a NaN. The subtraction is performed according to the IEC/IEEE
6208 | Standard for Binary Floating-Point Arithmetic.
6209 *----------------------------------------------------------------------------*/
6211 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
6212 float_status *status)
6214 int32_t aExp, bExp, zExp;
6215 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
6216 int32_t expDiff;
6218 aSig1 = extractFloat128Frac1( a );
6219 aSig0 = extractFloat128Frac0( a );
6220 aExp = extractFloat128Exp( a );
6221 bSig1 = extractFloat128Frac1( b );
6222 bSig0 = extractFloat128Frac0( b );
6223 bExp = extractFloat128Exp( b );
6224 expDiff = aExp - bExp;
6225 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6226 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6227 if ( 0 < expDiff ) goto aExpBigger;
6228 if ( expDiff < 0 ) goto bExpBigger;
6229 if ( aExp == 0x7FFF ) {
6230 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6231 return propagateFloat128NaN(a, b, status);
6233 float_raise(float_flag_invalid, status);
6234 return float128_default_nan(status);
6236 if ( aExp == 0 ) {
6237 aExp = 1;
6238 bExp = 1;
6240 if ( bSig0 < aSig0 ) goto aBigger;
6241 if ( aSig0 < bSig0 ) goto bBigger;
6242 if ( bSig1 < aSig1 ) goto aBigger;
6243 if ( aSig1 < bSig1 ) goto bBigger;
6244 return packFloat128(status->float_rounding_mode == float_round_down,
6245 0, 0, 0);
6246 bExpBigger:
6247 if ( bExp == 0x7FFF ) {
6248 if (bSig0 | bSig1) {
6249 return propagateFloat128NaN(a, b, status);
6251 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6253 if ( aExp == 0 ) {
6254 ++expDiff;
6256 else {
6257 aSig0 |= LIT64( 0x4000000000000000 );
6259 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6260 bSig0 |= LIT64( 0x4000000000000000 );
6261 bBigger:
6262 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6263 zExp = bExp;
6264 zSign ^= 1;
6265 goto normalizeRoundAndPack;
6266 aExpBigger:
6267 if ( aExp == 0x7FFF ) {
6268 if (aSig0 | aSig1) {
6269 return propagateFloat128NaN(a, b, status);
6271 return a;
6273 if ( bExp == 0 ) {
6274 --expDiff;
6276 else {
6277 bSig0 |= LIT64( 0x4000000000000000 );
6279 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6280 aSig0 |= LIT64( 0x4000000000000000 );
6281 aBigger:
6282 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6283 zExp = aExp;
6284 normalizeRoundAndPack:
6285 --zExp;
6286 return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
6287 status);
6291 /*----------------------------------------------------------------------------
6292 | Returns the result of adding the quadruple-precision floating-point values
6293 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
6294 | for Binary Floating-Point Arithmetic.
6295 *----------------------------------------------------------------------------*/
6297 float128 float128_add(float128 a, float128 b, float_status *status)
6299 flag aSign, bSign;
6301 aSign = extractFloat128Sign( a );
6302 bSign = extractFloat128Sign( b );
6303 if ( aSign == bSign ) {
6304 return addFloat128Sigs(a, b, aSign, status);
6306 else {
6307 return subFloat128Sigs(a, b, aSign, status);
6312 /*----------------------------------------------------------------------------
6313 | Returns the result of subtracting the quadruple-precision floating-point
6314 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6315 | Standard for Binary Floating-Point Arithmetic.
6316 *----------------------------------------------------------------------------*/
6318 float128 float128_sub(float128 a, float128 b, float_status *status)
6320 flag aSign, bSign;
6322 aSign = extractFloat128Sign( a );
6323 bSign = extractFloat128Sign( b );
6324 if ( aSign == bSign ) {
6325 return subFloat128Sigs(a, b, aSign, status);
6327 else {
6328 return addFloat128Sigs(a, b, aSign, status);
6333 /*----------------------------------------------------------------------------
6334 | Returns the result of multiplying the quadruple-precision floating-point
6335 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6336 | Standard for Binary Floating-Point Arithmetic.
6337 *----------------------------------------------------------------------------*/
6339 float128 float128_mul(float128 a, float128 b, float_status *status)
6341 flag aSign, bSign, zSign;
6342 int32_t aExp, bExp, zExp;
6343 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6345 aSig1 = extractFloat128Frac1( a );
6346 aSig0 = extractFloat128Frac0( a );
6347 aExp = extractFloat128Exp( a );
6348 aSign = extractFloat128Sign( a );
6349 bSig1 = extractFloat128Frac1( b );
6350 bSig0 = extractFloat128Frac0( b );
6351 bExp = extractFloat128Exp( b );
6352 bSign = extractFloat128Sign( b );
6353 zSign = aSign ^ bSign;
6354 if ( aExp == 0x7FFF ) {
6355 if ( ( aSig0 | aSig1 )
6356 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6357 return propagateFloat128NaN(a, b, status);
6359 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6360 return packFloat128( zSign, 0x7FFF, 0, 0 );
6362 if ( bExp == 0x7FFF ) {
6363 if (bSig0 | bSig1) {
6364 return propagateFloat128NaN(a, b, status);
6366 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6367 invalid:
6368 float_raise(float_flag_invalid, status);
6369 return float128_default_nan(status);
6371 return packFloat128( zSign, 0x7FFF, 0, 0 );
6373 if ( aExp == 0 ) {
6374 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6375 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6377 if ( bExp == 0 ) {
6378 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6379 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6381 zExp = aExp + bExp - 0x4000;
6382 aSig0 |= LIT64( 0x0001000000000000 );
6383 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6384 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6385 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6386 zSig2 |= ( zSig3 != 0 );
6387 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6388 shift128ExtraRightJamming(
6389 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6390 ++zExp;
6392 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6396 /*----------------------------------------------------------------------------
6397 | Returns the result of dividing the quadruple-precision floating-point value
6398 | `a' by the corresponding value `b'. The operation is performed according to
6399 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6400 *----------------------------------------------------------------------------*/
6402 float128 float128_div(float128 a, float128 b, float_status *status)
6404 flag aSign, bSign, zSign;
6405 int32_t aExp, bExp, zExp;
6406 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6407 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6409 aSig1 = extractFloat128Frac1( a );
6410 aSig0 = extractFloat128Frac0( a );
6411 aExp = extractFloat128Exp( a );
6412 aSign = extractFloat128Sign( a );
6413 bSig1 = extractFloat128Frac1( b );
6414 bSig0 = extractFloat128Frac0( b );
6415 bExp = extractFloat128Exp( b );
6416 bSign = extractFloat128Sign( b );
6417 zSign = aSign ^ bSign;
6418 if ( aExp == 0x7FFF ) {
6419 if (aSig0 | aSig1) {
6420 return propagateFloat128NaN(a, b, status);
6422 if ( bExp == 0x7FFF ) {
6423 if (bSig0 | bSig1) {
6424 return propagateFloat128NaN(a, b, status);
6426 goto invalid;
6428 return packFloat128( zSign, 0x7FFF, 0, 0 );
6430 if ( bExp == 0x7FFF ) {
6431 if (bSig0 | bSig1) {
6432 return propagateFloat128NaN(a, b, status);
6434 return packFloat128( zSign, 0, 0, 0 );
6436 if ( bExp == 0 ) {
6437 if ( ( bSig0 | bSig1 ) == 0 ) {
6438 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6439 invalid:
6440 float_raise(float_flag_invalid, status);
6441 return float128_default_nan(status);
6443 float_raise(float_flag_divbyzero, status);
6444 return packFloat128( zSign, 0x7FFF, 0, 0 );
6446 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6448 if ( aExp == 0 ) {
6449 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6450 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6452 zExp = aExp - bExp + 0x3FFD;
6453 shortShift128Left(
6454 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6455 shortShift128Left(
6456 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6457 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6458 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6459 ++zExp;
6461 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6462 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6463 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
6464 while ( (int64_t) rem0 < 0 ) {
6465 --zSig0;
6466 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6468 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6469 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6470 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6471 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
6472 while ( (int64_t) rem1 < 0 ) {
6473 --zSig1;
6474 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6476 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6478 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6479 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6483 /*----------------------------------------------------------------------------
6484 | Returns the remainder of the quadruple-precision floating-point value `a'
6485 | with respect to the corresponding value `b'. The operation is performed
6486 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6487 *----------------------------------------------------------------------------*/
6489 float128 float128_rem(float128 a, float128 b, float_status *status)
6491 flag aSign, zSign;
6492 int32_t aExp, bExp, expDiff;
6493 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6494 uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6495 int64_t sigMean0;
6497 aSig1 = extractFloat128Frac1( a );
6498 aSig0 = extractFloat128Frac0( a );
6499 aExp = extractFloat128Exp( a );
6500 aSign = extractFloat128Sign( a );
6501 bSig1 = extractFloat128Frac1( b );
6502 bSig0 = extractFloat128Frac0( b );
6503 bExp = extractFloat128Exp( b );
6504 if ( aExp == 0x7FFF ) {
6505 if ( ( aSig0 | aSig1 )
6506 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6507 return propagateFloat128NaN(a, b, status);
6509 goto invalid;
6511 if ( bExp == 0x7FFF ) {
6512 if (bSig0 | bSig1) {
6513 return propagateFloat128NaN(a, b, status);
6515 return a;
6517 if ( bExp == 0 ) {
6518 if ( ( bSig0 | bSig1 ) == 0 ) {
6519 invalid:
6520 float_raise(float_flag_invalid, status);
6521 return float128_default_nan(status);
6523 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6525 if ( aExp == 0 ) {
6526 if ( ( aSig0 | aSig1 ) == 0 ) return a;
6527 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6529 expDiff = aExp - bExp;
6530 if ( expDiff < -1 ) return a;
6531 shortShift128Left(
6532 aSig0 | LIT64( 0x0001000000000000 ),
6533 aSig1,
6534 15 - ( expDiff < 0 ),
6535 &aSig0,
6536 &aSig1
6538 shortShift128Left(
6539 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6540 q = le128( bSig0, bSig1, aSig0, aSig1 );
6541 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6542 expDiff -= 64;
6543 while ( 0 < expDiff ) {
6544 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6545 q = ( 4 < q ) ? q - 4 : 0;
6546 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6547 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6548 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6549 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6550 expDiff -= 61;
6552 if ( -64 < expDiff ) {
6553 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6554 q = ( 4 < q ) ? q - 4 : 0;
6555 q >>= - expDiff;
6556 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6557 expDiff += 52;
6558 if ( expDiff < 0 ) {
6559 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6561 else {
6562 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6564 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6565 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6567 else {
6568 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6569 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6571 do {
6572 alternateASig0 = aSig0;
6573 alternateASig1 = aSig1;
6574 ++q;
6575 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6576 } while ( 0 <= (int64_t) aSig0 );
6577 add128(
6578 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6579 if ( ( sigMean0 < 0 )
6580 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6581 aSig0 = alternateASig0;
6582 aSig1 = alternateASig1;
6584 zSign = ( (int64_t) aSig0 < 0 );
6585 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6586 return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
6587 status);
6590 /*----------------------------------------------------------------------------
6591 | Returns the square root of the quadruple-precision floating-point value `a'.
6592 | The operation is performed according to the IEC/IEEE Standard for Binary
6593 | Floating-Point Arithmetic.
6594 *----------------------------------------------------------------------------*/
6596 float128 float128_sqrt(float128 a, float_status *status)
6598 flag aSign;
6599 int32_t aExp, zExp;
6600 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6601 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6603 aSig1 = extractFloat128Frac1( a );
6604 aSig0 = extractFloat128Frac0( a );
6605 aExp = extractFloat128Exp( a );
6606 aSign = extractFloat128Sign( a );
6607 if ( aExp == 0x7FFF ) {
6608 if (aSig0 | aSig1) {
6609 return propagateFloat128NaN(a, a, status);
6611 if ( ! aSign ) return a;
6612 goto invalid;
6614 if ( aSign ) {
6615 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6616 invalid:
6617 float_raise(float_flag_invalid, status);
6618 return float128_default_nan(status);
6620 if ( aExp == 0 ) {
6621 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6622 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6624 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6625 aSig0 |= LIT64( 0x0001000000000000 );
6626 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6627 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6628 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6629 doubleZSig0 = zSig0<<1;
6630 mul64To128( zSig0, zSig0, &term0, &term1 );
6631 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6632 while ( (int64_t) rem0 < 0 ) {
6633 --zSig0;
6634 doubleZSig0 -= 2;
6635 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6637 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6638 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6639 if ( zSig1 == 0 ) zSig1 = 1;
6640 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6641 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6642 mul64To128( zSig1, zSig1, &term2, &term3 );
6643 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6644 while ( (int64_t) rem1 < 0 ) {
6645 --zSig1;
6646 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6647 term3 |= 1;
6648 term2 |= doubleZSig0;
6649 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6651 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6653 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6654 return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
6658 /*----------------------------------------------------------------------------
6659 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6660 | the corresponding value `b', and 0 otherwise. The invalid exception is
6661 | raised if either operand is a NaN. Otherwise, the comparison is performed
6662 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6663 *----------------------------------------------------------------------------*/
6665 int float128_eq(float128 a, float128 b, float_status *status)
6668 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6669 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6670 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6671 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6673 float_raise(float_flag_invalid, status);
6674 return 0;
6676 return
6677 ( a.low == b.low )
6678 && ( ( a.high == b.high )
6679 || ( ( a.low == 0 )
6680 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6685 /*----------------------------------------------------------------------------
6686 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6687 | or equal to the corresponding value `b', and 0 otherwise. The invalid
6688 | exception is raised if either operand is a NaN. The comparison is performed
6689 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6690 *----------------------------------------------------------------------------*/
6692 int float128_le(float128 a, float128 b, float_status *status)
6694 flag aSign, bSign;
6696 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6697 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6698 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6699 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6701 float_raise(float_flag_invalid, status);
6702 return 0;
6704 aSign = extractFloat128Sign( a );
6705 bSign = extractFloat128Sign( b );
6706 if ( aSign != bSign ) {
6707 return
6708 aSign
6709 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6710 == 0 );
6712 return
6713 aSign ? le128( b.high, b.low, a.high, a.low )
6714 : le128( a.high, a.low, b.high, b.low );
6718 /*----------------------------------------------------------------------------
6719 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6720 | the corresponding value `b', and 0 otherwise. The invalid exception is
6721 | raised if either operand is a NaN. The comparison is performed according
6722 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6723 *----------------------------------------------------------------------------*/
6725 int float128_lt(float128 a, float128 b, float_status *status)
6727 flag aSign, bSign;
6729 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6730 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6731 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6732 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6734 float_raise(float_flag_invalid, status);
6735 return 0;
6737 aSign = extractFloat128Sign( a );
6738 bSign = extractFloat128Sign( b );
6739 if ( aSign != bSign ) {
6740 return
6741 aSign
6742 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6743 != 0 );
6745 return
6746 aSign ? lt128( b.high, b.low, a.high, a.low )
6747 : lt128( a.high, a.low, b.high, b.low );
6751 /*----------------------------------------------------------------------------
6752 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6753 | be compared, and 0 otherwise. The invalid exception is raised if either
6754 | operand is a NaN. The comparison is performed according to the IEC/IEEE
6755 | Standard for Binary Floating-Point Arithmetic.
6756 *----------------------------------------------------------------------------*/
6758 int float128_unordered(float128 a, float128 b, float_status *status)
6760 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6761 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6762 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6763 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6765 float_raise(float_flag_invalid, status);
6766 return 1;
6768 return 0;
6771 /*----------------------------------------------------------------------------
6772 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6773 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6774 | exception. The comparison is performed according to the IEC/IEEE Standard
6775 | for Binary Floating-Point Arithmetic.
6776 *----------------------------------------------------------------------------*/
6778 int float128_eq_quiet(float128 a, float128 b, float_status *status)
6781 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6782 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6783 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6784 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6786 if (float128_is_signaling_nan(a, status)
6787 || float128_is_signaling_nan(b, status)) {
6788 float_raise(float_flag_invalid, status);
6790 return 0;
6792 return
6793 ( a.low == b.low )
6794 && ( ( a.high == b.high )
6795 || ( ( a.low == 0 )
6796 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6801 /*----------------------------------------------------------------------------
6802 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6803 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
6804 | cause an exception. Otherwise, the comparison is performed according to the
6805 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6806 *----------------------------------------------------------------------------*/
6808 int float128_le_quiet(float128 a, float128 b, float_status *status)
6810 flag aSign, bSign;
6812 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6813 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6814 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6815 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6817 if (float128_is_signaling_nan(a, status)
6818 || float128_is_signaling_nan(b, status)) {
6819 float_raise(float_flag_invalid, status);
6821 return 0;
6823 aSign = extractFloat128Sign( a );
6824 bSign = extractFloat128Sign( b );
6825 if ( aSign != bSign ) {
6826 return
6827 aSign
6828 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6829 == 0 );
6831 return
6832 aSign ? le128( b.high, b.low, a.high, a.low )
6833 : le128( a.high, a.low, b.high, b.low );
6837 /*----------------------------------------------------------------------------
6838 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6839 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6840 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
6841 | Standard for Binary Floating-Point Arithmetic.
6842 *----------------------------------------------------------------------------*/
6844 int float128_lt_quiet(float128 a, float128 b, float_status *status)
6846 flag aSign, bSign;
6848 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6849 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6850 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6851 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6853 if (float128_is_signaling_nan(a, status)
6854 || float128_is_signaling_nan(b, status)) {
6855 float_raise(float_flag_invalid, status);
6857 return 0;
6859 aSign = extractFloat128Sign( a );
6860 bSign = extractFloat128Sign( b );
6861 if ( aSign != bSign ) {
6862 return
6863 aSign
6864 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6865 != 0 );
6867 return
6868 aSign ? lt128( b.high, b.low, a.high, a.low )
6869 : lt128( a.high, a.low, b.high, b.low );
6873 /*----------------------------------------------------------------------------
6874 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6875 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
6876 | comparison is performed according to the IEC/IEEE Standard for Binary
6877 | Floating-Point Arithmetic.
6878 *----------------------------------------------------------------------------*/
6880 int float128_unordered_quiet(float128 a, float128 b, float_status *status)
6882 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6883 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6884 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6885 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6887 if (float128_is_signaling_nan(a, status)
6888 || float128_is_signaling_nan(b, status)) {
6889 float_raise(float_flag_invalid, status);
6891 return 1;
6893 return 0;
6896 static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
6897 int is_quiet, float_status *status)
6899 flag aSign, bSign;
6901 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6902 float_raise(float_flag_invalid, status);
6903 return float_relation_unordered;
6905 if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
6906 ( extractFloatx80Frac( a )<<1 ) ) ||
6907 ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
6908 ( extractFloatx80Frac( b )<<1 ) )) {
6909 if (!is_quiet ||
6910 floatx80_is_signaling_nan(a, status) ||
6911 floatx80_is_signaling_nan(b, status)) {
6912 float_raise(float_flag_invalid, status);
6914 return float_relation_unordered;
6916 aSign = extractFloatx80Sign( a );
6917 bSign = extractFloatx80Sign( b );
6918 if ( aSign != bSign ) {
6920 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
6921 ( ( a.low | b.low ) == 0 ) ) {
6922 /* zero case */
6923 return float_relation_equal;
6924 } else {
6925 return 1 - (2 * aSign);
6927 } else {
6928 if (a.low == b.low && a.high == b.high) {
6929 return float_relation_equal;
6930 } else {
6931 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6936 int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
6938 return floatx80_compare_internal(a, b, 0, status);
6941 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
6943 return floatx80_compare_internal(a, b, 1, status);
6946 static inline int float128_compare_internal(float128 a, float128 b,
6947 int is_quiet, float_status *status)
6949 flag aSign, bSign;
6951 if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
6952 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
6953 ( ( extractFloat128Exp( b ) == 0x7fff ) &&
6954 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
6955 if (!is_quiet ||
6956 float128_is_signaling_nan(a, status) ||
6957 float128_is_signaling_nan(b, status)) {
6958 float_raise(float_flag_invalid, status);
6960 return float_relation_unordered;
6962 aSign = extractFloat128Sign( a );
6963 bSign = extractFloat128Sign( b );
6964 if ( aSign != bSign ) {
6965 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
6966 /* zero case */
6967 return float_relation_equal;
6968 } else {
6969 return 1 - (2 * aSign);
6971 } else {
6972 if (a.low == b.low && a.high == b.high) {
6973 return float_relation_equal;
6974 } else {
6975 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6980 int float128_compare(float128 a, float128 b, float_status *status)
6982 return float128_compare_internal(a, b, 0, status);
6985 int float128_compare_quiet(float128 a, float128 b, float_status *status)
6987 return float128_compare_internal(a, b, 1, status);
6990 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
6992 flag aSign;
6993 int32_t aExp;
6994 uint64_t aSig;
6996 if (floatx80_invalid_encoding(a)) {
6997 float_raise(float_flag_invalid, status);
6998 return floatx80_default_nan(status);
7000 aSig = extractFloatx80Frac( a );
7001 aExp = extractFloatx80Exp( a );
7002 aSign = extractFloatx80Sign( a );
7004 if ( aExp == 0x7FFF ) {
7005 if ( aSig<<1 ) {
7006 return propagateFloatx80NaN(a, a, status);
7008 return a;
7011 if (aExp == 0) {
7012 if (aSig == 0) {
7013 return a;
7015 aExp++;
7018 if (n > 0x10000) {
7019 n = 0x10000;
7020 } else if (n < -0x10000) {
7021 n = -0x10000;
7024 aExp += n;
7025 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7026 aSign, aExp, aSig, 0, status);
7029 float128 float128_scalbn(float128 a, int n, float_status *status)
7031 flag aSign;
7032 int32_t aExp;
7033 uint64_t aSig0, aSig1;
7035 aSig1 = extractFloat128Frac1( a );
7036 aSig0 = extractFloat128Frac0( a );
7037 aExp = extractFloat128Exp( a );
7038 aSign = extractFloat128Sign( a );
7039 if ( aExp == 0x7FFF ) {
7040 if ( aSig0 | aSig1 ) {
7041 return propagateFloat128NaN(a, a, status);
7043 return a;
7045 if (aExp != 0) {
7046 aSig0 |= LIT64( 0x0001000000000000 );
7047 } else if (aSig0 == 0 && aSig1 == 0) {
7048 return a;
7049 } else {
7050 aExp++;
7053 if (n > 0x10000) {
7054 n = 0x10000;
7055 } else if (n < -0x10000) {
7056 n = -0x10000;
7059 aExp += n - 1;
7060 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7061 , status);