migration: new message MIG_RP_MSG_RECV_BITMAP
[qemu/ar7.git] / fpu / softfloat.c
blobbc0f52fa54706644785a9229c8efe93b297fbc0b
1 /*
2 * QEMU float support
4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
10 * the BSD license
11 * GPL-v2-or-later
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
44 ===============================================================================
47 /* BSD licensing:
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
85 #include "qemu/osdep.h"
86 #include "qemu/bitops.h"
87 #include "fpu/softfloat.h"
89 /* We only need stdlib for abort() */
91 /*----------------------------------------------------------------------------
92 | Primitive arithmetic functions, including multi-word arithmetic, and
93 | division and square root approximations. (Can be specialized to target if
94 | desired.)
95 *----------------------------------------------------------------------------*/
96 #include "fpu/softfloat-macros.h"
98 /*----------------------------------------------------------------------------
99 | Functions and definitions to determine: (1) whether tininess for underflow
100 | is detected before or after rounding by default, (2) what (if anything)
101 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
102 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
103 | are propagated from function inputs to output. These details are target-
104 | specific.
105 *----------------------------------------------------------------------------*/
106 #include "softfloat-specialize.h"
108 /*----------------------------------------------------------------------------
109 | Returns the fraction bits of the half-precision floating-point value `a'.
110 *----------------------------------------------------------------------------*/
112 static inline uint32_t extractFloat16Frac(float16 a)
114 return float16_val(a) & 0x3ff;
117 /*----------------------------------------------------------------------------
118 | Returns the exponent bits of the half-precision floating-point value `a'.
119 *----------------------------------------------------------------------------*/
121 static inline int extractFloat16Exp(float16 a)
123 return (float16_val(a) >> 10) & 0x1f;
126 /*----------------------------------------------------------------------------
127 | Returns the sign bit of the single-precision floating-point value `a'.
128 *----------------------------------------------------------------------------*/
130 static inline flag extractFloat16Sign(float16 a)
132 return float16_val(a)>>15;
135 /*----------------------------------------------------------------------------
136 | Returns the fraction bits of the single-precision floating-point value `a'.
137 *----------------------------------------------------------------------------*/
139 static inline uint32_t extractFloat32Frac(float32 a)
141 return float32_val(a) & 0x007FFFFF;
144 /*----------------------------------------------------------------------------
145 | Returns the exponent bits of the single-precision floating-point value `a'.
146 *----------------------------------------------------------------------------*/
148 static inline int extractFloat32Exp(float32 a)
150 return (float32_val(a) >> 23) & 0xFF;
153 /*----------------------------------------------------------------------------
154 | Returns the sign bit of the single-precision floating-point value `a'.
155 *----------------------------------------------------------------------------*/
157 static inline flag extractFloat32Sign(float32 a)
159 return float32_val(a) >> 31;
162 /*----------------------------------------------------------------------------
163 | Returns the fraction bits of the double-precision floating-point value `a'.
164 *----------------------------------------------------------------------------*/
166 static inline uint64_t extractFloat64Frac(float64 a)
168 return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF);
171 /*----------------------------------------------------------------------------
172 | Returns the exponent bits of the double-precision floating-point value `a'.
173 *----------------------------------------------------------------------------*/
175 static inline int extractFloat64Exp(float64 a)
177 return (float64_val(a) >> 52) & 0x7FF;
180 /*----------------------------------------------------------------------------
181 | Returns the sign bit of the double-precision floating-point value `a'.
182 *----------------------------------------------------------------------------*/
184 static inline flag extractFloat64Sign(float64 a)
186 return float64_val(a) >> 63;
190 * Classify a floating point number. Everything above float_class_qnan
191 * is a NaN so cls >= float_class_qnan is any NaN.
194 typedef enum __attribute__ ((__packed__)) {
195 float_class_unclassified,
196 float_class_zero,
197 float_class_normal,
198 float_class_inf,
199 float_class_qnan, /* all NaNs from here */
200 float_class_snan,
201 float_class_dnan,
202 float_class_msnan, /* maybe silenced */
203 } FloatClass;
206 * Structure holding all of the decomposed parts of a float. The
207 * exponent is unbiased and the fraction is normalized. All
208 * calculations are done with a 64 bit fraction and then rounded as
209 * appropriate for the final format.
211 * Thanks to the packed FloatClass a decent compiler should be able to
212 * fit the whole structure into registers and avoid using the stack
213 * for parameter passing.
216 typedef struct {
217 uint64_t frac;
218 int32_t exp;
219 FloatClass cls;
220 bool sign;
221 } FloatParts;
223 #define DECOMPOSED_BINARY_POINT (64 - 2)
224 #define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT)
225 #define DECOMPOSED_OVERFLOW_BIT (DECOMPOSED_IMPLICIT_BIT << 1)
227 /* Structure holding all of the relevant parameters for a format.
228 * exp_size: the size of the exponent field
229 * exp_bias: the offset applied to the exponent field
230 * exp_max: the maximum normalised exponent
231 * frac_size: the size of the fraction field
232 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
233 * The following are computed based the size of fraction
234 * frac_lsb: least significant bit of fraction
235 * fram_lsbm1: the bit bellow the least significant bit (for rounding)
236 * round_mask/roundeven_mask: masks used for rounding
238 typedef struct {
239 int exp_size;
240 int exp_bias;
241 int exp_max;
242 int frac_size;
243 int frac_shift;
244 uint64_t frac_lsb;
245 uint64_t frac_lsbm1;
246 uint64_t round_mask;
247 uint64_t roundeven_mask;
248 } FloatFmt;
250 /* Expand fields based on the size of exponent and fraction */
251 #define FLOAT_PARAMS(E, F) \
252 .exp_size = E, \
253 .exp_bias = ((1 << E) - 1) >> 1, \
254 .exp_max = (1 << E) - 1, \
255 .frac_size = F, \
256 .frac_shift = DECOMPOSED_BINARY_POINT - F, \
257 .frac_lsb = 1ull << (DECOMPOSED_BINARY_POINT - F), \
258 .frac_lsbm1 = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1), \
259 .round_mask = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1, \
260 .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1
262 static const FloatFmt float16_params = {
263 FLOAT_PARAMS(5, 10)
266 static const FloatFmt float32_params = {
267 FLOAT_PARAMS(8, 23)
270 static const FloatFmt float64_params = {
271 FLOAT_PARAMS(11, 52)
274 /* Unpack a float to parts, but do not canonicalize. */
275 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
277 const int sign_pos = fmt.frac_size + fmt.exp_size;
279 return (FloatParts) {
280 .cls = float_class_unclassified,
281 .sign = extract64(raw, sign_pos, 1),
282 .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
283 .frac = extract64(raw, 0, fmt.frac_size),
287 static inline FloatParts float16_unpack_raw(float16 f)
289 return unpack_raw(float16_params, f);
292 static inline FloatParts float32_unpack_raw(float32 f)
294 return unpack_raw(float32_params, f);
297 static inline FloatParts float64_unpack_raw(float64 f)
299 return unpack_raw(float64_params, f);
302 /* Pack a float from parts, but do not canonicalize. */
303 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
305 const int sign_pos = fmt.frac_size + fmt.exp_size;
306 uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
307 return deposit64(ret, sign_pos, 1, p.sign);
310 static inline float16 float16_pack_raw(FloatParts p)
312 return make_float16(pack_raw(float16_params, p));
315 static inline float32 float32_pack_raw(FloatParts p)
317 return make_float32(pack_raw(float32_params, p));
320 static inline float64 float64_pack_raw(FloatParts p)
322 return make_float64(pack_raw(float64_params, p));
325 /* Canonicalize EXP and FRAC, setting CLS. */
326 static FloatParts canonicalize(FloatParts part, const FloatFmt *parm,
327 float_status *status)
329 if (part.exp == parm->exp_max) {
330 if (part.frac == 0) {
331 part.cls = float_class_inf;
332 } else {
333 #ifdef NO_SIGNALING_NANS
334 part.cls = float_class_qnan;
335 #else
336 int64_t msb = part.frac << (parm->frac_shift + 2);
337 if ((msb < 0) == status->snan_bit_is_one) {
338 part.cls = float_class_snan;
339 } else {
340 part.cls = float_class_qnan;
342 #endif
344 } else if (part.exp == 0) {
345 if (likely(part.frac == 0)) {
346 part.cls = float_class_zero;
347 } else if (status->flush_inputs_to_zero) {
348 float_raise(float_flag_input_denormal, status);
349 part.cls = float_class_zero;
350 part.frac = 0;
351 } else {
352 int shift = clz64(part.frac) - 1;
353 part.cls = float_class_normal;
354 part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
355 part.frac <<= shift;
357 } else {
358 part.cls = float_class_normal;
359 part.exp -= parm->exp_bias;
360 part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
362 return part;
365 /* Round and uncanonicalize a floating-point number by parts. There
366 * are FRAC_SHIFT bits that may require rounding at the bottom of the
367 * fraction; these bits will be removed. The exponent will be biased
368 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
371 static FloatParts round_canonical(FloatParts p, float_status *s,
372 const FloatFmt *parm)
374 const uint64_t frac_lsbm1 = parm->frac_lsbm1;
375 const uint64_t round_mask = parm->round_mask;
376 const uint64_t roundeven_mask = parm->roundeven_mask;
377 const int exp_max = parm->exp_max;
378 const int frac_shift = parm->frac_shift;
379 uint64_t frac, inc;
380 int exp, flags = 0;
381 bool overflow_norm;
383 frac = p.frac;
384 exp = p.exp;
386 switch (p.cls) {
387 case float_class_normal:
388 switch (s->float_rounding_mode) {
389 case float_round_nearest_even:
390 overflow_norm = false;
391 inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
392 break;
393 case float_round_ties_away:
394 overflow_norm = false;
395 inc = frac_lsbm1;
396 break;
397 case float_round_to_zero:
398 overflow_norm = true;
399 inc = 0;
400 break;
401 case float_round_up:
402 inc = p.sign ? 0 : round_mask;
403 overflow_norm = p.sign;
404 break;
405 case float_round_down:
406 inc = p.sign ? round_mask : 0;
407 overflow_norm = !p.sign;
408 break;
409 default:
410 g_assert_not_reached();
413 exp += parm->exp_bias;
414 if (likely(exp > 0)) {
415 if (frac & round_mask) {
416 flags |= float_flag_inexact;
417 frac += inc;
418 if (frac & DECOMPOSED_OVERFLOW_BIT) {
419 frac >>= 1;
420 exp++;
423 frac >>= frac_shift;
425 if (unlikely(exp >= exp_max)) {
426 flags |= float_flag_overflow | float_flag_inexact;
427 if (overflow_norm) {
428 exp = exp_max - 1;
429 frac = -1;
430 } else {
431 p.cls = float_class_inf;
432 goto do_inf;
435 } else if (s->flush_to_zero) {
436 flags |= float_flag_output_denormal;
437 p.cls = float_class_zero;
438 goto do_zero;
439 } else {
440 bool is_tiny = (s->float_detect_tininess
441 == float_tininess_before_rounding)
442 || (exp < 0)
443 || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);
445 shift64RightJamming(frac, 1 - exp, &frac);
446 if (frac & round_mask) {
447 /* Need to recompute round-to-even. */
448 if (s->float_rounding_mode == float_round_nearest_even) {
449 inc = ((frac & roundeven_mask) != frac_lsbm1
450 ? frac_lsbm1 : 0);
452 flags |= float_flag_inexact;
453 frac += inc;
456 exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
457 frac >>= frac_shift;
459 if (is_tiny && (flags & float_flag_inexact)) {
460 flags |= float_flag_underflow;
462 if (exp == 0 && frac == 0) {
463 p.cls = float_class_zero;
466 break;
468 case float_class_zero:
469 do_zero:
470 exp = 0;
471 frac = 0;
472 break;
474 case float_class_inf:
475 do_inf:
476 exp = exp_max;
477 frac = 0;
478 break;
480 case float_class_qnan:
481 case float_class_snan:
482 exp = exp_max;
483 break;
485 default:
486 g_assert_not_reached();
489 float_raise(flags, s);
490 p.exp = exp;
491 p.frac = frac;
492 return p;
495 static FloatParts float16_unpack_canonical(float16 f, float_status *s)
497 return canonicalize(float16_unpack_raw(f), &float16_params, s);
500 static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
502 switch (p.cls) {
503 case float_class_dnan:
504 return float16_default_nan(s);
505 case float_class_msnan:
506 return float16_maybe_silence_nan(float16_pack_raw(p), s);
507 default:
508 p = round_canonical(p, s, &float16_params);
509 return float16_pack_raw(p);
513 static FloatParts float32_unpack_canonical(float32 f, float_status *s)
515 return canonicalize(float32_unpack_raw(f), &float32_params, s);
518 static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
520 switch (p.cls) {
521 case float_class_dnan:
522 return float32_default_nan(s);
523 case float_class_msnan:
524 return float32_maybe_silence_nan(float32_pack_raw(p), s);
525 default:
526 p = round_canonical(p, s, &float32_params);
527 return float32_pack_raw(p);
531 static FloatParts float64_unpack_canonical(float64 f, float_status *s)
533 return canonicalize(float64_unpack_raw(f), &float64_params, s);
536 static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
538 switch (p.cls) {
539 case float_class_dnan:
540 return float64_default_nan(s);
541 case float_class_msnan:
542 return float64_maybe_silence_nan(float64_pack_raw(p), s);
543 default:
544 p = round_canonical(p, s, &float64_params);
545 return float64_pack_raw(p);
549 /* Simple helpers for checking if what NaN we have */
550 static bool is_nan(FloatClass c)
552 return unlikely(c >= float_class_qnan);
554 static bool is_snan(FloatClass c)
556 return c == float_class_snan;
558 static bool is_qnan(FloatClass c)
560 return c == float_class_qnan;
563 static FloatParts return_nan(FloatParts a, float_status *s)
565 switch (a.cls) {
566 case float_class_snan:
567 s->float_exception_flags |= float_flag_invalid;
568 a.cls = float_class_msnan;
569 /* fall through */
570 case float_class_qnan:
571 if (s->default_nan_mode) {
572 a.cls = float_class_dnan;
574 break;
576 default:
577 g_assert_not_reached();
579 return a;
582 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
584 if (is_snan(a.cls) || is_snan(b.cls)) {
585 s->float_exception_flags |= float_flag_invalid;
588 if (s->default_nan_mode) {
589 a.cls = float_class_dnan;
590 } else {
591 if (pickNaN(is_qnan(a.cls), is_snan(a.cls),
592 is_qnan(b.cls), is_snan(b.cls),
593 a.frac > b.frac ||
594 (a.frac == b.frac && a.sign < b.sign))) {
595 a = b;
597 a.cls = float_class_msnan;
599 return a;
602 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
603 bool inf_zero, float_status *s)
605 int which;
607 if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
608 s->float_exception_flags |= float_flag_invalid;
611 which = pickNaNMulAdd(is_qnan(a.cls), is_snan(a.cls),
612 is_qnan(b.cls), is_snan(b.cls),
613 is_qnan(c.cls), is_snan(c.cls),
614 inf_zero, s);
616 if (s->default_nan_mode) {
617 /* Note that this check is after pickNaNMulAdd so that function
618 * has an opportunity to set the Invalid flag.
620 a.cls = float_class_dnan;
621 return a;
624 switch (which) {
625 case 0:
626 break;
627 case 1:
628 a = b;
629 break;
630 case 2:
631 a = c;
632 break;
633 case 3:
634 a.cls = float_class_dnan;
635 return a;
636 default:
637 g_assert_not_reached();
639 a.cls = float_class_msnan;
641 return a;
645 * Returns the result of adding or subtracting the values of the
646 * floating-point values `a' and `b'. The operation is performed
647 * according to the IEC/IEEE Standard for Binary Floating-Point
648 * Arithmetic.
651 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
652 float_status *s)
654 bool a_sign = a.sign;
655 bool b_sign = b.sign ^ subtract;
657 if (a_sign != b_sign) {
658 /* Subtraction */
660 if (a.cls == float_class_normal && b.cls == float_class_normal) {
661 if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
662 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
663 a.frac = a.frac - b.frac;
664 } else {
665 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
666 a.frac = b.frac - a.frac;
667 a.exp = b.exp;
668 a_sign ^= 1;
671 if (a.frac == 0) {
672 a.cls = float_class_zero;
673 a.sign = s->float_rounding_mode == float_round_down;
674 } else {
675 int shift = clz64(a.frac) - 1;
676 a.frac = a.frac << shift;
677 a.exp = a.exp - shift;
678 a.sign = a_sign;
680 return a;
682 if (is_nan(a.cls) || is_nan(b.cls)) {
683 return pick_nan(a, b, s);
685 if (a.cls == float_class_inf) {
686 if (b.cls == float_class_inf) {
687 float_raise(float_flag_invalid, s);
688 a.cls = float_class_dnan;
690 return a;
692 if (a.cls == float_class_zero && b.cls == float_class_zero) {
693 a.sign = s->float_rounding_mode == float_round_down;
694 return a;
696 if (a.cls == float_class_zero || b.cls == float_class_inf) {
697 b.sign = a_sign ^ 1;
698 return b;
700 if (b.cls == float_class_zero) {
701 return a;
703 } else {
704 /* Addition */
705 if (a.cls == float_class_normal && b.cls == float_class_normal) {
706 if (a.exp > b.exp) {
707 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
708 } else if (a.exp < b.exp) {
709 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
710 a.exp = b.exp;
712 a.frac += b.frac;
713 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
714 a.frac >>= 1;
715 a.exp += 1;
717 return a;
719 if (is_nan(a.cls) || is_nan(b.cls)) {
720 return pick_nan(a, b, s);
722 if (a.cls == float_class_inf || b.cls == float_class_zero) {
723 return a;
725 if (b.cls == float_class_inf || a.cls == float_class_zero) {
726 b.sign = b_sign;
727 return b;
730 g_assert_not_reached();
734 * Returns the result of adding or subtracting the floating-point
735 * values `a' and `b'. The operation is performed according to the
736 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
739 float16 __attribute__((flatten)) float16_add(float16 a, float16 b,
740 float_status *status)
742 FloatParts pa = float16_unpack_canonical(a, status);
743 FloatParts pb = float16_unpack_canonical(b, status);
744 FloatParts pr = addsub_floats(pa, pb, false, status);
746 return float16_round_pack_canonical(pr, status);
749 float32 __attribute__((flatten)) float32_add(float32 a, float32 b,
750 float_status *status)
752 FloatParts pa = float32_unpack_canonical(a, status);
753 FloatParts pb = float32_unpack_canonical(b, status);
754 FloatParts pr = addsub_floats(pa, pb, false, status);
756 return float32_round_pack_canonical(pr, status);
759 float64 __attribute__((flatten)) float64_add(float64 a, float64 b,
760 float_status *status)
762 FloatParts pa = float64_unpack_canonical(a, status);
763 FloatParts pb = float64_unpack_canonical(b, status);
764 FloatParts pr = addsub_floats(pa, pb, false, status);
766 return float64_round_pack_canonical(pr, status);
769 float16 __attribute__((flatten)) float16_sub(float16 a, float16 b,
770 float_status *status)
772 FloatParts pa = float16_unpack_canonical(a, status);
773 FloatParts pb = float16_unpack_canonical(b, status);
774 FloatParts pr = addsub_floats(pa, pb, true, status);
776 return float16_round_pack_canonical(pr, status);
779 float32 __attribute__((flatten)) float32_sub(float32 a, float32 b,
780 float_status *status)
782 FloatParts pa = float32_unpack_canonical(a, status);
783 FloatParts pb = float32_unpack_canonical(b, status);
784 FloatParts pr = addsub_floats(pa, pb, true, status);
786 return float32_round_pack_canonical(pr, status);
789 float64 __attribute__((flatten)) float64_sub(float64 a, float64 b,
790 float_status *status)
792 FloatParts pa = float64_unpack_canonical(a, status);
793 FloatParts pb = float64_unpack_canonical(b, status);
794 FloatParts pr = addsub_floats(pa, pb, true, status);
796 return float64_round_pack_canonical(pr, status);
800 * Returns the result of multiplying the floating-point values `a' and
801 * `b'. The operation is performed according to the IEC/IEEE Standard
802 * for Binary Floating-Point Arithmetic.
805 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
807 bool sign = a.sign ^ b.sign;
809 if (a.cls == float_class_normal && b.cls == float_class_normal) {
810 uint64_t hi, lo;
811 int exp = a.exp + b.exp;
813 mul64To128(a.frac, b.frac, &hi, &lo);
814 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
815 if (lo & DECOMPOSED_OVERFLOW_BIT) {
816 shift64RightJamming(lo, 1, &lo);
817 exp += 1;
820 /* Re-use a */
821 a.exp = exp;
822 a.sign = sign;
823 a.frac = lo;
824 return a;
826 /* handle all the NaN cases */
827 if (is_nan(a.cls) || is_nan(b.cls)) {
828 return pick_nan(a, b, s);
830 /* Inf * Zero == NaN */
831 if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
832 (a.cls == float_class_zero && b.cls == float_class_inf)) {
833 s->float_exception_flags |= float_flag_invalid;
834 a.cls = float_class_dnan;
835 a.sign = sign;
836 return a;
838 /* Multiply by 0 or Inf */
839 if (a.cls == float_class_inf || a.cls == float_class_zero) {
840 a.sign = sign;
841 return a;
843 if (b.cls == float_class_inf || b.cls == float_class_zero) {
844 b.sign = sign;
845 return b;
847 g_assert_not_reached();
850 float16 __attribute__((flatten)) float16_mul(float16 a, float16 b,
851 float_status *status)
853 FloatParts pa = float16_unpack_canonical(a, status);
854 FloatParts pb = float16_unpack_canonical(b, status);
855 FloatParts pr = mul_floats(pa, pb, status);
857 return float16_round_pack_canonical(pr, status);
860 float32 __attribute__((flatten)) float32_mul(float32 a, float32 b,
861 float_status *status)
863 FloatParts pa = float32_unpack_canonical(a, status);
864 FloatParts pb = float32_unpack_canonical(b, status);
865 FloatParts pr = mul_floats(pa, pb, status);
867 return float32_round_pack_canonical(pr, status);
870 float64 __attribute__((flatten)) float64_mul(float64 a, float64 b,
871 float_status *status)
873 FloatParts pa = float64_unpack_canonical(a, status);
874 FloatParts pb = float64_unpack_canonical(b, status);
875 FloatParts pr = mul_floats(pa, pb, status);
877 return float64_round_pack_canonical(pr, status);
881 * Returns the result of multiplying the floating-point values `a' and
882 * `b' then adding 'c', with no intermediate rounding step after the
883 * multiplication. The operation is performed according to the
884 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
885 * The flags argument allows the caller to select negation of the
886 * addend, the intermediate product, or the final result. (The
887 * difference between this and having the caller do a separate
888 * negation is that negating externally will flip the sign bit on
889 * NaNs.)
892 static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
893 int flags, float_status *s)
895 bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
896 ((1 << float_class_inf) | (1 << float_class_zero));
897 bool p_sign;
898 bool sign_flip = flags & float_muladd_negate_result;
899 FloatClass p_class;
900 uint64_t hi, lo;
901 int p_exp;
903 /* It is implementation-defined whether the cases of (0,inf,qnan)
904 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
905 * they return if they do), so we have to hand this information
906 * off to the target-specific pick-a-NaN routine.
908 if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
909 return pick_nan_muladd(a, b, c, inf_zero, s);
912 if (inf_zero) {
913 s->float_exception_flags |= float_flag_invalid;
914 a.cls = float_class_dnan;
915 return a;
918 if (flags & float_muladd_negate_c) {
919 c.sign ^= 1;
922 p_sign = a.sign ^ b.sign;
924 if (flags & float_muladd_negate_product) {
925 p_sign ^= 1;
928 if (a.cls == float_class_inf || b.cls == float_class_inf) {
929 p_class = float_class_inf;
930 } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
931 p_class = float_class_zero;
932 } else {
933 p_class = float_class_normal;
936 if (c.cls == float_class_inf) {
937 if (p_class == float_class_inf && p_sign != c.sign) {
938 s->float_exception_flags |= float_flag_invalid;
939 a.cls = float_class_dnan;
940 } else {
941 a.cls = float_class_inf;
942 a.sign = c.sign ^ sign_flip;
944 return a;
947 if (p_class == float_class_inf) {
948 a.cls = float_class_inf;
949 a.sign = p_sign ^ sign_flip;
950 return a;
953 if (p_class == float_class_zero) {
954 if (c.cls == float_class_zero) {
955 if (p_sign != c.sign) {
956 p_sign = s->float_rounding_mode == float_round_down;
958 c.sign = p_sign;
959 } else if (flags & float_muladd_halve_result) {
960 c.exp -= 1;
962 c.sign ^= sign_flip;
963 return c;
966 /* a & b should be normals now... */
967 assert(a.cls == float_class_normal &&
968 b.cls == float_class_normal);
970 p_exp = a.exp + b.exp;
972 /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
973 * result.
975 mul64To128(a.frac, b.frac, &hi, &lo);
976 /* binary point now at bit 124 */
978 /* check for overflow */
979 if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
980 shift128RightJamming(hi, lo, 1, &hi, &lo);
981 p_exp += 1;
984 /* + add/sub */
985 if (c.cls == float_class_zero) {
986 /* move binary point back to 62 */
987 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
988 } else {
989 int exp_diff = p_exp - c.exp;
990 if (p_sign == c.sign) {
991 /* Addition */
992 if (exp_diff <= 0) {
993 shift128RightJamming(hi, lo,
994 DECOMPOSED_BINARY_POINT - exp_diff,
995 &hi, &lo);
996 lo += c.frac;
997 p_exp = c.exp;
998 } else {
999 uint64_t c_hi, c_lo;
1000 /* shift c to the same binary point as the product (124) */
1001 c_hi = c.frac >> 2;
1002 c_lo = 0;
1003 shift128RightJamming(c_hi, c_lo,
1004 exp_diff,
1005 &c_hi, &c_lo);
1006 add128(hi, lo, c_hi, c_lo, &hi, &lo);
1007 /* move binary point back to 62 */
1008 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1011 if (lo & DECOMPOSED_OVERFLOW_BIT) {
1012 shift64RightJamming(lo, 1, &lo);
1013 p_exp += 1;
1016 } else {
1017 /* Subtraction */
1018 uint64_t c_hi, c_lo;
1019 /* make C binary point match product at bit 124 */
1020 c_hi = c.frac >> 2;
1021 c_lo = 0;
1023 if (exp_diff <= 0) {
1024 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
1025 if (exp_diff == 0
1027 (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
1028 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1029 } else {
1030 sub128(c_hi, c_lo, hi, lo, &hi, &lo);
1031 p_sign ^= 1;
1032 p_exp = c.exp;
1034 } else {
1035 shift128RightJamming(c_hi, c_lo,
1036 exp_diff,
1037 &c_hi, &c_lo);
1038 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1041 if (hi == 0 && lo == 0) {
1042 a.cls = float_class_zero;
1043 a.sign = s->float_rounding_mode == float_round_down;
1044 a.sign ^= sign_flip;
1045 return a;
1046 } else {
1047 int shift;
1048 if (hi != 0) {
1049 shift = clz64(hi);
1050 } else {
1051 shift = clz64(lo) + 64;
1053 /* Normalizing to a binary point of 124 is the
1054 correct adjust for the exponent. However since we're
1055 shifting, we might as well put the binary point back
1056 at 62 where we really want it. Therefore shift as
1057 if we're leaving 1 bit at the top of the word, but
1058 adjust the exponent as if we're leaving 3 bits. */
1059 shift -= 1;
1060 if (shift >= 64) {
1061 lo = lo << (shift - 64);
1062 } else {
1063 hi = (hi << shift) | (lo >> (64 - shift));
1064 lo = hi | ((lo << shift) != 0);
1066 p_exp -= shift - 2;
1071 if (flags & float_muladd_halve_result) {
1072 p_exp -= 1;
1075 /* finally prepare our result */
1076 a.cls = float_class_normal;
1077 a.sign = p_sign ^ sign_flip;
1078 a.exp = p_exp;
1079 a.frac = lo;
1081 return a;
1084 float16 __attribute__((flatten)) float16_muladd(float16 a, float16 b, float16 c,
1085 int flags, float_status *status)
1087 FloatParts pa = float16_unpack_canonical(a, status);
1088 FloatParts pb = float16_unpack_canonical(b, status);
1089 FloatParts pc = float16_unpack_canonical(c, status);
1090 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1092 return float16_round_pack_canonical(pr, status);
1095 float32 __attribute__((flatten)) float32_muladd(float32 a, float32 b, float32 c,
1096 int flags, float_status *status)
1098 FloatParts pa = float32_unpack_canonical(a, status);
1099 FloatParts pb = float32_unpack_canonical(b, status);
1100 FloatParts pc = float32_unpack_canonical(c, status);
1101 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1103 return float32_round_pack_canonical(pr, status);
1106 float64 __attribute__((flatten)) float64_muladd(float64 a, float64 b, float64 c,
1107 int flags, float_status *status)
1109 FloatParts pa = float64_unpack_canonical(a, status);
1110 FloatParts pb = float64_unpack_canonical(b, status);
1111 FloatParts pc = float64_unpack_canonical(c, status);
1112 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1114 return float64_round_pack_canonical(pr, status);
1118 * Returns the result of dividing the floating-point value `a' by the
1119 * corresponding value `b'. The operation is performed according to
1120 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1123 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
1125 bool sign = a.sign ^ b.sign;
1127 if (a.cls == float_class_normal && b.cls == float_class_normal) {
1128 uint64_t temp_lo, temp_hi;
1129 int exp = a.exp - b.exp;
1130 if (a.frac < b.frac) {
1131 exp -= 1;
1132 shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1,
1133 &temp_hi, &temp_lo);
1134 } else {
1135 shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT,
1136 &temp_hi, &temp_lo);
1138 /* LSB of quot is set if inexact which roundandpack will use
1139 * to set flags. Yet again we re-use a for the result */
1140 a.frac = div128To64(temp_lo, temp_hi, b.frac);
1141 a.sign = sign;
1142 a.exp = exp;
1143 return a;
1145 /* handle all the NaN cases */
1146 if (is_nan(a.cls) || is_nan(b.cls)) {
1147 return pick_nan(a, b, s);
1149 /* 0/0 or Inf/Inf */
1150 if (a.cls == b.cls
1152 (a.cls == float_class_inf || a.cls == float_class_zero)) {
1153 s->float_exception_flags |= float_flag_invalid;
1154 a.cls = float_class_dnan;
1155 return a;
1157 /* Inf / x or 0 / x */
1158 if (a.cls == float_class_inf || a.cls == float_class_zero) {
1159 a.sign = sign;
1160 return a;
1162 /* Div 0 => Inf */
1163 if (b.cls == float_class_zero) {
1164 s->float_exception_flags |= float_flag_divbyzero;
1165 a.cls = float_class_inf;
1166 a.sign = sign;
1167 return a;
1169 /* Div by Inf */
1170 if (b.cls == float_class_inf) {
1171 a.cls = float_class_zero;
1172 a.sign = sign;
1173 return a;
1175 g_assert_not_reached();
1178 float16 float16_div(float16 a, float16 b, float_status *status)
1180 FloatParts pa = float16_unpack_canonical(a, status);
1181 FloatParts pb = float16_unpack_canonical(b, status);
1182 FloatParts pr = div_floats(pa, pb, status);
1184 return float16_round_pack_canonical(pr, status);
1187 float32 float32_div(float32 a, float32 b, float_status *status)
1189 FloatParts pa = float32_unpack_canonical(a, status);
1190 FloatParts pb = float32_unpack_canonical(b, status);
1191 FloatParts pr = div_floats(pa, pb, status);
1193 return float32_round_pack_canonical(pr, status);
1196 float64 float64_div(float64 a, float64 b, float_status *status)
1198 FloatParts pa = float64_unpack_canonical(a, status);
1199 FloatParts pb = float64_unpack_canonical(b, status);
1200 FloatParts pr = div_floats(pa, pb, status);
1202 return float64_round_pack_canonical(pr, status);
1206 * Rounds the floating-point value `a' to an integer, and returns the
1207 * result as a floating-point value. The operation is performed
1208 * according to the IEC/IEEE Standard for Binary Floating-Point
1209 * Arithmetic.
1212 static FloatParts round_to_int(FloatParts a, int rounding_mode, float_status *s)
1214 if (is_nan(a.cls)) {
1215 return return_nan(a, s);
1218 switch (a.cls) {
1219 case float_class_zero:
1220 case float_class_inf:
1221 case float_class_qnan:
1222 /* already "integral" */
1223 break;
1224 case float_class_normal:
1225 if (a.exp >= DECOMPOSED_BINARY_POINT) {
1226 /* already integral */
1227 break;
1229 if (a.exp < 0) {
1230 bool one;
1231 /* all fractional */
1232 s->float_exception_flags |= float_flag_inexact;
1233 switch (rounding_mode) {
1234 case float_round_nearest_even:
1235 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
1236 break;
1237 case float_round_ties_away:
1238 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
1239 break;
1240 case float_round_to_zero:
1241 one = false;
1242 break;
1243 case float_round_up:
1244 one = !a.sign;
1245 break;
1246 case float_round_down:
1247 one = a.sign;
1248 break;
1249 default:
1250 g_assert_not_reached();
1253 if (one) {
1254 a.frac = DECOMPOSED_IMPLICIT_BIT;
1255 a.exp = 0;
1256 } else {
1257 a.cls = float_class_zero;
1259 } else {
1260 uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
1261 uint64_t frac_lsbm1 = frac_lsb >> 1;
1262 uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
1263 uint64_t rnd_mask = rnd_even_mask >> 1;
1264 uint64_t inc;
1266 switch (rounding_mode) {
1267 case float_round_nearest_even:
1268 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
1269 break;
1270 case float_round_ties_away:
1271 inc = frac_lsbm1;
1272 break;
1273 case float_round_to_zero:
1274 inc = 0;
1275 break;
1276 case float_round_up:
1277 inc = a.sign ? 0 : rnd_mask;
1278 break;
1279 case float_round_down:
1280 inc = a.sign ? rnd_mask : 0;
1281 break;
1282 default:
1283 g_assert_not_reached();
1286 if (a.frac & rnd_mask) {
1287 s->float_exception_flags |= float_flag_inexact;
1288 a.frac += inc;
1289 a.frac &= ~rnd_mask;
1290 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
1291 a.frac >>= 1;
1292 a.exp++;
1296 break;
1297 default:
1298 g_assert_not_reached();
1300 return a;
1303 float16 float16_round_to_int(float16 a, float_status *s)
1305 FloatParts pa = float16_unpack_canonical(a, s);
1306 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1307 return float16_round_pack_canonical(pr, s);
1310 float32 float32_round_to_int(float32 a, float_status *s)
1312 FloatParts pa = float32_unpack_canonical(a, s);
1313 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1314 return float32_round_pack_canonical(pr, s);
1317 float64 float64_round_to_int(float64 a, float_status *s)
1319 FloatParts pa = float64_unpack_canonical(a, s);
1320 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1321 return float64_round_pack_canonical(pr, s);
1324 float64 float64_trunc_to_int(float64 a, float_status *s)
1326 FloatParts pa = float64_unpack_canonical(a, s);
1327 FloatParts pr = round_to_int(pa, float_round_to_zero, s);
1328 return float64_round_pack_canonical(pr, s);
1332 * Returns the result of converting the floating-point value `a' to
1333 * the two's complement integer format. The conversion is performed
1334 * according to the IEC/IEEE Standard for Binary Floating-Point
1335 * Arithmetic---which means in particular that the conversion is
1336 * rounded according to the current rounding mode. If `a' is a NaN,
1337 * the largest positive integer is returned. Otherwise, if the
1338 * conversion overflows, the largest integer with the same sign as `a'
1339 * is returned.
1342 static int64_t round_to_int_and_pack(FloatParts in, int rmode,
1343 int64_t min, int64_t max,
1344 float_status *s)
1346 uint64_t r;
1347 int orig_flags = get_float_exception_flags(s);
1348 FloatParts p = round_to_int(in, rmode, s);
1350 switch (p.cls) {
1351 case float_class_snan:
1352 case float_class_qnan:
1353 case float_class_dnan:
1354 case float_class_msnan:
1355 s->float_exception_flags = orig_flags | float_flag_invalid;
1356 return max;
1357 case float_class_inf:
1358 s->float_exception_flags = orig_flags | float_flag_invalid;
1359 return p.sign ? min : max;
1360 case float_class_zero:
1361 return 0;
1362 case float_class_normal:
1363 if (p.exp < DECOMPOSED_BINARY_POINT) {
1364 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1365 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1366 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1367 } else {
1368 r = UINT64_MAX;
1370 if (p.sign) {
1371 if (r <= -(uint64_t) min) {
1372 return -r;
1373 } else {
1374 s->float_exception_flags = orig_flags | float_flag_invalid;
1375 return min;
1377 } else {
1378 if (r <= max) {
1379 return r;
1380 } else {
1381 s->float_exception_flags = orig_flags | float_flag_invalid;
1382 return max;
1385 default:
1386 g_assert_not_reached();
1390 #define FLOAT_TO_INT(fsz, isz) \
1391 int ## isz ## _t float ## fsz ## _to_int ## isz(float ## fsz a, \
1392 float_status *s) \
1394 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1395 return round_to_int_and_pack(p, s->float_rounding_mode, \
1396 INT ## isz ## _MIN, INT ## isz ## _MAX,\
1397 s); \
1400 int ## isz ## _t float ## fsz ## _to_int ## isz ## _round_to_zero \
1401 (float ## fsz a, float_status *s) \
1403 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1404 return round_to_int_and_pack(p, float_round_to_zero, \
1405 INT ## isz ## _MIN, INT ## isz ## _MAX,\
1406 s); \
1409 FLOAT_TO_INT(16, 16)
1410 FLOAT_TO_INT(16, 32)
1411 FLOAT_TO_INT(16, 64)
1413 FLOAT_TO_INT(32, 16)
1414 FLOAT_TO_INT(32, 32)
1415 FLOAT_TO_INT(32, 64)
1417 FLOAT_TO_INT(64, 16)
1418 FLOAT_TO_INT(64, 32)
1419 FLOAT_TO_INT(64, 64)
1421 #undef FLOAT_TO_INT
1424 * Returns the result of converting the floating-point value `a' to
1425 * the unsigned integer format. The conversion is performed according
1426 * to the IEC/IEEE Standard for Binary Floating-Point
1427 * Arithmetic---which means in particular that the conversion is
1428 * rounded according to the current rounding mode. If `a' is a NaN,
1429 * the largest unsigned integer is returned. Otherwise, if the
1430 * conversion overflows, the largest unsigned integer is returned. If
1431 * the 'a' is negative, the result is rounded and zero is returned;
1432 * values that do not round to zero will raise the inexact exception
1433 * flag.
1436 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, uint64_t max,
1437 float_status *s)
1439 int orig_flags = get_float_exception_flags(s);
1440 FloatParts p = round_to_int(in, rmode, s);
1442 switch (p.cls) {
1443 case float_class_snan:
1444 case float_class_qnan:
1445 case float_class_dnan:
1446 case float_class_msnan:
1447 s->float_exception_flags = orig_flags | float_flag_invalid;
1448 return max;
1449 case float_class_inf:
1450 s->float_exception_flags = orig_flags | float_flag_invalid;
1451 return p.sign ? 0 : max;
1452 case float_class_zero:
1453 return 0;
1454 case float_class_normal:
1456 uint64_t r;
1457 if (p.sign) {
1458 s->float_exception_flags = orig_flags | float_flag_invalid;
1459 return 0;
1462 if (p.exp < DECOMPOSED_BINARY_POINT) {
1463 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1464 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1465 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1466 } else {
1467 s->float_exception_flags = orig_flags | float_flag_invalid;
1468 return max;
1471 /* For uint64 this will never trip, but if p.exp is too large
1472 * to shift a decomposed fraction we shall have exited via the
1473 * 3rd leg above.
1475 if (r > max) {
1476 s->float_exception_flags = orig_flags | float_flag_invalid;
1477 return max;
1478 } else {
1479 return r;
1482 default:
1483 g_assert_not_reached();
1487 #define FLOAT_TO_UINT(fsz, isz) \
1488 uint ## isz ## _t float ## fsz ## _to_uint ## isz(float ## fsz a, \
1489 float_status *s) \
1491 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1492 return round_to_uint_and_pack(p, s->float_rounding_mode, \
1493 UINT ## isz ## _MAX, s); \
1496 uint ## isz ## _t float ## fsz ## _to_uint ## isz ## _round_to_zero \
1497 (float ## fsz a, float_status *s) \
1499 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1500 return round_to_uint_and_pack(p, float_round_to_zero, \
1501 UINT ## isz ## _MAX, s); \
1504 FLOAT_TO_UINT(16, 16)
1505 FLOAT_TO_UINT(16, 32)
1506 FLOAT_TO_UINT(16, 64)
1508 FLOAT_TO_UINT(32, 16)
1509 FLOAT_TO_UINT(32, 32)
1510 FLOAT_TO_UINT(32, 64)
1512 FLOAT_TO_UINT(64, 16)
1513 FLOAT_TO_UINT(64, 32)
1514 FLOAT_TO_UINT(64, 64)
1516 #undef FLOAT_TO_UINT
1519 * Integer to float conversions
1521 * Returns the result of converting the two's complement integer `a'
1522 * to the floating-point format. The conversion is performed according
1523 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1526 static FloatParts int_to_float(int64_t a, float_status *status)
1528 FloatParts r = {};
1529 if (a == 0) {
1530 r.cls = float_class_zero;
1531 r.sign = false;
1532 } else if (a == (1ULL << 63)) {
1533 r.cls = float_class_normal;
1534 r.sign = true;
1535 r.frac = DECOMPOSED_IMPLICIT_BIT;
1536 r.exp = 63;
1537 } else {
1538 uint64_t f;
1539 if (a < 0) {
1540 f = -a;
1541 r.sign = true;
1542 } else {
1543 f = a;
1544 r.sign = false;
1546 int shift = clz64(f) - 1;
1547 r.cls = float_class_normal;
1548 r.exp = (DECOMPOSED_BINARY_POINT - shift);
1549 r.frac = f << shift;
1552 return r;
1555 float16 int64_to_float16(int64_t a, float_status *status)
1557 FloatParts pa = int_to_float(a, status);
1558 return float16_round_pack_canonical(pa, status);
1561 float16 int32_to_float16(int32_t a, float_status *status)
1563 return int64_to_float16(a, status);
1566 float16 int16_to_float16(int16_t a, float_status *status)
1568 return int64_to_float16(a, status);
1571 float32 int64_to_float32(int64_t a, float_status *status)
1573 FloatParts pa = int_to_float(a, status);
1574 return float32_round_pack_canonical(pa, status);
1577 float32 int32_to_float32(int32_t a, float_status *status)
1579 return int64_to_float32(a, status);
1582 float32 int16_to_float32(int16_t a, float_status *status)
1584 return int64_to_float32(a, status);
1587 float64 int64_to_float64(int64_t a, float_status *status)
1589 FloatParts pa = int_to_float(a, status);
1590 return float64_round_pack_canonical(pa, status);
1593 float64 int32_to_float64(int32_t a, float_status *status)
1595 return int64_to_float64(a, status);
1598 float64 int16_to_float64(int16_t a, float_status *status)
1600 return int64_to_float64(a, status);
1605 * Unsigned Integer to float conversions
1607 * Returns the result of converting the unsigned integer `a' to the
1608 * floating-point format. The conversion is performed according to the
1609 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1612 static FloatParts uint_to_float(uint64_t a, float_status *status)
1614 FloatParts r = { .sign = false};
1616 if (a == 0) {
1617 r.cls = float_class_zero;
1618 } else {
1619 int spare_bits = clz64(a) - 1;
1620 r.cls = float_class_normal;
1621 r.exp = DECOMPOSED_BINARY_POINT - spare_bits;
1622 if (spare_bits < 0) {
1623 shift64RightJamming(a, -spare_bits, &a);
1624 r.frac = a;
1625 } else {
1626 r.frac = a << spare_bits;
1630 return r;
1633 float16 uint64_to_float16(uint64_t a, float_status *status)
1635 FloatParts pa = uint_to_float(a, status);
1636 return float16_round_pack_canonical(pa, status);
1639 float16 uint32_to_float16(uint32_t a, float_status *status)
1641 return uint64_to_float16(a, status);
1644 float16 uint16_to_float16(uint16_t a, float_status *status)
1646 return uint64_to_float16(a, status);
1649 float32 uint64_to_float32(uint64_t a, float_status *status)
1651 FloatParts pa = uint_to_float(a, status);
1652 return float32_round_pack_canonical(pa, status);
1655 float32 uint32_to_float32(uint32_t a, float_status *status)
1657 return uint64_to_float32(a, status);
1660 float32 uint16_to_float32(uint16_t a, float_status *status)
1662 return uint64_to_float32(a, status);
1665 float64 uint64_to_float64(uint64_t a, float_status *status)
1667 FloatParts pa = uint_to_float(a, status);
1668 return float64_round_pack_canonical(pa, status);
1671 float64 uint32_to_float64(uint32_t a, float_status *status)
1673 return uint64_to_float64(a, status);
1676 float64 uint16_to_float64(uint16_t a, float_status *status)
1678 return uint64_to_float64(a, status);
1681 /* Float Min/Max */
1682 /* min() and max() functions. These can't be implemented as
1683 * 'compare and pick one input' because that would mishandle
1684 * NaNs and +0 vs -0.
1686 * minnum() and maxnum() functions. These are similar to the min()
1687 * and max() functions but if one of the arguments is a QNaN and
1688 * the other is numerical then the numerical argument is returned.
1689 * SNaNs will get quietened before being returned.
1690 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
1691 * and maxNum() operations. min() and max() are the typical min/max
1692 * semantics provided by many CPUs which predate that specification.
1694 * minnummag() and maxnummag() functions correspond to minNumMag()
1695 * and minNumMag() from the IEEE-754 2008.
1697 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin,
1698 bool ieee, bool ismag, float_status *s)
1700 if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
1701 if (ieee) {
1702 /* Takes two floating-point values `a' and `b', one of
1703 * which is a NaN, and returns the appropriate NaN
1704 * result. If either `a' or `b' is a signaling NaN,
1705 * the invalid exception is raised.
1707 if (is_snan(a.cls) || is_snan(b.cls)) {
1708 return pick_nan(a, b, s);
1709 } else if (is_nan(a.cls) && !is_nan(b.cls)) {
1710 return b;
1711 } else if (is_nan(b.cls) && !is_nan(a.cls)) {
1712 return a;
1715 return pick_nan(a, b, s);
1716 } else {
1717 int a_exp, b_exp;
1719 switch (a.cls) {
1720 case float_class_normal:
1721 a_exp = a.exp;
1722 break;
1723 case float_class_inf:
1724 a_exp = INT_MAX;
1725 break;
1726 case float_class_zero:
1727 a_exp = INT_MIN;
1728 break;
1729 default:
1730 g_assert_not_reached();
1731 break;
1733 switch (b.cls) {
1734 case float_class_normal:
1735 b_exp = b.exp;
1736 break;
1737 case float_class_inf:
1738 b_exp = INT_MAX;
1739 break;
1740 case float_class_zero:
1741 b_exp = INT_MIN;
1742 break;
1743 default:
1744 g_assert_not_reached();
1745 break;
1748 if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
1749 bool a_less = a_exp < b_exp;
1750 if (a_exp == b_exp) {
1751 a_less = a.frac < b.frac;
1753 return a_less ^ ismin ? b : a;
1756 if (a.sign == b.sign) {
1757 bool a_less = a_exp < b_exp;
1758 if (a_exp == b_exp) {
1759 a_less = a.frac < b.frac;
1761 return a.sign ^ a_less ^ ismin ? b : a;
1762 } else {
1763 return a.sign ^ ismin ? b : a;
1768 #define MINMAX(sz, name, ismin, isiee, ismag) \
1769 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \
1770 float_status *s) \
1772 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
1773 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
1774 FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \
1776 return float ## sz ## _round_pack_canonical(pr, s); \
1779 MINMAX(16, min, true, false, false)
1780 MINMAX(16, minnum, true, true, false)
1781 MINMAX(16, minnummag, true, true, true)
1782 MINMAX(16, max, false, false, false)
1783 MINMAX(16, maxnum, false, true, false)
1784 MINMAX(16, maxnummag, false, true, true)
1786 MINMAX(32, min, true, false, false)
1787 MINMAX(32, minnum, true, true, false)
1788 MINMAX(32, minnummag, true, true, true)
1789 MINMAX(32, max, false, false, false)
1790 MINMAX(32, maxnum, false, true, false)
1791 MINMAX(32, maxnummag, false, true, true)
1793 MINMAX(64, min, true, false, false)
1794 MINMAX(64, minnum, true, true, false)
1795 MINMAX(64, minnummag, true, true, true)
1796 MINMAX(64, max, false, false, false)
1797 MINMAX(64, maxnum, false, true, false)
1798 MINMAX(64, maxnummag, false, true, true)
1800 #undef MINMAX
1802 /* Floating point compare */
1803 static int compare_floats(FloatParts a, FloatParts b, bool is_quiet,
1804 float_status *s)
1806 if (is_nan(a.cls) || is_nan(b.cls)) {
1807 if (!is_quiet ||
1808 a.cls == float_class_snan ||
1809 b.cls == float_class_snan) {
1810 s->float_exception_flags |= float_flag_invalid;
1812 return float_relation_unordered;
1815 if (a.cls == float_class_zero) {
1816 if (b.cls == float_class_zero) {
1817 return float_relation_equal;
1819 return b.sign ? float_relation_greater : float_relation_less;
1820 } else if (b.cls == float_class_zero) {
1821 return a.sign ? float_relation_less : float_relation_greater;
1824 /* The only really important thing about infinity is its sign. If
1825 * both are infinities the sign marks the smallest of the two.
1827 if (a.cls == float_class_inf) {
1828 if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
1829 return float_relation_equal;
1831 return a.sign ? float_relation_less : float_relation_greater;
1832 } else if (b.cls == float_class_inf) {
1833 return b.sign ? float_relation_greater : float_relation_less;
1836 if (a.sign != b.sign) {
1837 return a.sign ? float_relation_less : float_relation_greater;
1840 if (a.exp == b.exp) {
1841 if (a.frac == b.frac) {
1842 return float_relation_equal;
1844 if (a.sign) {
1845 return a.frac > b.frac ?
1846 float_relation_less : float_relation_greater;
1847 } else {
1848 return a.frac > b.frac ?
1849 float_relation_greater : float_relation_less;
1851 } else {
1852 if (a.sign) {
1853 return a.exp > b.exp ? float_relation_less : float_relation_greater;
1854 } else {
1855 return a.exp > b.exp ? float_relation_greater : float_relation_less;
1860 #define COMPARE(sz) \
1861 int float ## sz ## _compare(float ## sz a, float ## sz b, \
1862 float_status *s) \
1864 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
1865 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
1866 return compare_floats(pa, pb, false, s); \
1868 int float ## sz ## _compare_quiet(float ## sz a, float ## sz b, \
1869 float_status *s) \
1871 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
1872 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
1873 return compare_floats(pa, pb, true, s); \
1876 COMPARE(16)
1877 COMPARE(32)
1878 COMPARE(64)
1880 #undef COMPARE
1882 /* Multiply A by 2 raised to the power N. */
1883 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s)
1885 if (unlikely(is_nan(a.cls))) {
1886 return return_nan(a, s);
1888 if (a.cls == float_class_normal) {
1889 /* The largest float type (even though not supported by FloatParts)
1890 * is float128, which has a 15 bit exponent. Bounding N to 16 bits
1891 * still allows rounding to infinity, without allowing overflow
1892 * within the int32_t that backs FloatParts.exp.
1894 n = MIN(MAX(n, -0x10000), 0x10000);
1895 a.exp += n;
1897 return a;
1900 float16 float16_scalbn(float16 a, int n, float_status *status)
1902 FloatParts pa = float16_unpack_canonical(a, status);
1903 FloatParts pr = scalbn_decomposed(pa, n, status);
1904 return float16_round_pack_canonical(pr, status);
1907 float32 float32_scalbn(float32 a, int n, float_status *status)
1909 FloatParts pa = float32_unpack_canonical(a, status);
1910 FloatParts pr = scalbn_decomposed(pa, n, status);
1911 return float32_round_pack_canonical(pr, status);
1914 float64 float64_scalbn(float64 a, int n, float_status *status)
1916 FloatParts pa = float64_unpack_canonical(a, status);
1917 FloatParts pr = scalbn_decomposed(pa, n, status);
1918 return float64_round_pack_canonical(pr, status);
1922 * Square Root
1924 * The old softfloat code did an approximation step before zeroing in
1925 * on the final result. However for simpleness we just compute the
1926 * square root by iterating down from the implicit bit to enough extra
1927 * bits to ensure we get a correctly rounded result.
1929 * This does mean however the calculation is slower than before,
1930 * especially for 64 bit floats.
1933 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p)
1935 uint64_t a_frac, r_frac, s_frac;
1936 int bit, last_bit;
1938 if (is_nan(a.cls)) {
1939 return return_nan(a, s);
1941 if (a.cls == float_class_zero) {
1942 return a; /* sqrt(+-0) = +-0 */
1944 if (a.sign) {
1945 s->float_exception_flags |= float_flag_invalid;
1946 a.cls = float_class_dnan;
1947 return a;
1949 if (a.cls == float_class_inf) {
1950 return a; /* sqrt(+inf) = +inf */
1953 assert(a.cls == float_class_normal);
1955 /* We need two overflow bits at the top. Adding room for that is a
1956 * right shift. If the exponent is odd, we can discard the low bit
1957 * by multiplying the fraction by 2; that's a left shift. Combine
1958 * those and we shift right if the exponent is even.
1960 a_frac = a.frac;
1961 if (!(a.exp & 1)) {
1962 a_frac >>= 1;
1964 a.exp >>= 1;
1966 /* Bit-by-bit computation of sqrt. */
1967 r_frac = 0;
1968 s_frac = 0;
1970 /* Iterate from implicit bit down to the 3 extra bits to compute a
1971 * properly rounded result. Remember we've inserted one more bit
1972 * at the top, so these positions are one less.
1974 bit = DECOMPOSED_BINARY_POINT - 1;
1975 last_bit = MAX(p->frac_shift - 4, 0);
1976 do {
1977 uint64_t q = 1ULL << bit;
1978 uint64_t t_frac = s_frac + q;
1979 if (t_frac <= a_frac) {
1980 s_frac = t_frac + q;
1981 a_frac -= t_frac;
1982 r_frac += q;
1984 a_frac <<= 1;
1985 } while (--bit >= last_bit);
1987 /* Undo the right shift done above. If there is any remaining
1988 * fraction, the result is inexact. Set the sticky bit.
1990 a.frac = (r_frac << 1) + (a_frac != 0);
1992 return a;
1995 float16 __attribute__((flatten)) float16_sqrt(float16 a, float_status *status)
1997 FloatParts pa = float16_unpack_canonical(a, status);
1998 FloatParts pr = sqrt_float(pa, status, &float16_params);
1999 return float16_round_pack_canonical(pr, status);
2002 float32 __attribute__((flatten)) float32_sqrt(float32 a, float_status *status)
2004 FloatParts pa = float32_unpack_canonical(a, status);
2005 FloatParts pr = sqrt_float(pa, status, &float32_params);
2006 return float32_round_pack_canonical(pr, status);
2009 float64 __attribute__((flatten)) float64_sqrt(float64 a, float_status *status)
2011 FloatParts pa = float64_unpack_canonical(a, status);
2012 FloatParts pr = sqrt_float(pa, status, &float64_params);
2013 return float64_round_pack_canonical(pr, status);
2017 /*----------------------------------------------------------------------------
2018 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
2019 | and 7, and returns the properly rounded 32-bit integer corresponding to the
2020 | input. If `zSign' is 1, the input is negated before being converted to an
2021 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
2022 | is simply rounded to an integer, with the inexact exception raised if the
2023 | input cannot be represented exactly as an integer. However, if the fixed-
2024 | point input is too large, the invalid exception is raised and the largest
2025 | positive or negative integer is returned.
2026 *----------------------------------------------------------------------------*/
2028 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
2030 int8_t roundingMode;
2031 flag roundNearestEven;
2032 int8_t roundIncrement, roundBits;
2033 int32_t z;
2035 roundingMode = status->float_rounding_mode;
2036 roundNearestEven = ( roundingMode == float_round_nearest_even );
2037 switch (roundingMode) {
2038 case float_round_nearest_even:
2039 case float_round_ties_away:
2040 roundIncrement = 0x40;
2041 break;
2042 case float_round_to_zero:
2043 roundIncrement = 0;
2044 break;
2045 case float_round_up:
2046 roundIncrement = zSign ? 0 : 0x7f;
2047 break;
2048 case float_round_down:
2049 roundIncrement = zSign ? 0x7f : 0;
2050 break;
2051 default:
2052 abort();
2054 roundBits = absZ & 0x7F;
2055 absZ = ( absZ + roundIncrement )>>7;
2056 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2057 z = absZ;
2058 if ( zSign ) z = - z;
2059 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
2060 float_raise(float_flag_invalid, status);
2061 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2063 if (roundBits) {
2064 status->float_exception_flags |= float_flag_inexact;
2066 return z;
2070 /*----------------------------------------------------------------------------
2071 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2072 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2073 | and returns the properly rounded 64-bit integer corresponding to the input.
2074 | If `zSign' is 1, the input is negated before being converted to an integer.
2075 | Ordinarily, the fixed-point input is simply rounded to an integer, with
2076 | the inexact exception raised if the input cannot be represented exactly as
2077 | an integer. However, if the fixed-point input is too large, the invalid
2078 | exception is raised and the largest positive or negative integer is
2079 | returned.
2080 *----------------------------------------------------------------------------*/
2082 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
2083 float_status *status)
2085 int8_t roundingMode;
2086 flag roundNearestEven, increment;
2087 int64_t z;
2089 roundingMode = status->float_rounding_mode;
2090 roundNearestEven = ( roundingMode == float_round_nearest_even );
2091 switch (roundingMode) {
2092 case float_round_nearest_even:
2093 case float_round_ties_away:
2094 increment = ((int64_t) absZ1 < 0);
2095 break;
2096 case float_round_to_zero:
2097 increment = 0;
2098 break;
2099 case float_round_up:
2100 increment = !zSign && absZ1;
2101 break;
2102 case float_round_down:
2103 increment = zSign && absZ1;
2104 break;
2105 default:
2106 abort();
2108 if ( increment ) {
2109 ++absZ0;
2110 if ( absZ0 == 0 ) goto overflow;
2111 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
2113 z = absZ0;
2114 if ( zSign ) z = - z;
2115 if ( z && ( ( z < 0 ) ^ zSign ) ) {
2116 overflow:
2117 float_raise(float_flag_invalid, status);
2118 return
2119 zSign ? (int64_t) LIT64( 0x8000000000000000 )
2120 : LIT64( 0x7FFFFFFFFFFFFFFF );
2122 if (absZ1) {
2123 status->float_exception_flags |= float_flag_inexact;
2125 return z;
2129 /*----------------------------------------------------------------------------
2130 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2131 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2132 | and returns the properly rounded 64-bit unsigned integer corresponding to the
2133 | input. Ordinarily, the fixed-point input is simply rounded to an integer,
2134 | with the inexact exception raised if the input cannot be represented exactly
2135 | as an integer. However, if the fixed-point input is too large, the invalid
2136 | exception is raised and the largest unsigned integer is returned.
2137 *----------------------------------------------------------------------------*/
2139 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
2140 uint64_t absZ1, float_status *status)
2142 int8_t roundingMode;
2143 flag roundNearestEven, increment;
2145 roundingMode = status->float_rounding_mode;
2146 roundNearestEven = (roundingMode == float_round_nearest_even);
2147 switch (roundingMode) {
2148 case float_round_nearest_even:
2149 case float_round_ties_away:
2150 increment = ((int64_t)absZ1 < 0);
2151 break;
2152 case float_round_to_zero:
2153 increment = 0;
2154 break;
2155 case float_round_up:
2156 increment = !zSign && absZ1;
2157 break;
2158 case float_round_down:
2159 increment = zSign && absZ1;
2160 break;
2161 default:
2162 abort();
2164 if (increment) {
2165 ++absZ0;
2166 if (absZ0 == 0) {
2167 float_raise(float_flag_invalid, status);
2168 return LIT64(0xFFFFFFFFFFFFFFFF);
2170 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
2173 if (zSign && absZ0) {
2174 float_raise(float_flag_invalid, status);
2175 return 0;
2178 if (absZ1) {
2179 status->float_exception_flags |= float_flag_inexact;
2181 return absZ0;
2184 /*----------------------------------------------------------------------------
2185 | If `a' is denormal and we are in flush-to-zero mode then set the
2186 | input-denormal exception and return zero. Otherwise just return the value.
2187 *----------------------------------------------------------------------------*/
2188 float32 float32_squash_input_denormal(float32 a, float_status *status)
2190 if (status->flush_inputs_to_zero) {
2191 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
2192 float_raise(float_flag_input_denormal, status);
2193 return make_float32(float32_val(a) & 0x80000000);
2196 return a;
2199 /*----------------------------------------------------------------------------
2200 | Normalizes the subnormal single-precision floating-point value represented
2201 | by the denormalized significand `aSig'. The normalized exponent and
2202 | significand are stored at the locations pointed to by `zExpPtr' and
2203 | `zSigPtr', respectively.
2204 *----------------------------------------------------------------------------*/
2206 static void
2207 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
2209 int8_t shiftCount;
2211 shiftCount = countLeadingZeros32( aSig ) - 8;
2212 *zSigPtr = aSig<<shiftCount;
2213 *zExpPtr = 1 - shiftCount;
2217 /*----------------------------------------------------------------------------
2218 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2219 | and significand `zSig', and returns the proper single-precision floating-
2220 | point value corresponding to the abstract input. Ordinarily, the abstract
2221 | value is simply rounded and packed into the single-precision format, with
2222 | the inexact exception raised if the abstract input cannot be represented
2223 | exactly. However, if the abstract value is too large, the overflow and
2224 | inexact exceptions are raised and an infinity or maximal finite value is
2225 | returned. If the abstract value is too small, the input value is rounded to
2226 | a subnormal number, and the underflow and inexact exceptions are raised if
2227 | the abstract input cannot be represented exactly as a subnormal single-
2228 | precision floating-point number.
2229 | The input significand `zSig' has its binary point between bits 30
2230 | and 29, which is 7 bits to the left of the usual location. This shifted
2231 | significand must be normalized or smaller. If `zSig' is not normalized,
2232 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2233 | and it must not require rounding. In the usual case that `zSig' is
2234 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2235 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2236 | Binary Floating-Point Arithmetic.
2237 *----------------------------------------------------------------------------*/
2239 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
2240 float_status *status)
2242 int8_t roundingMode;
2243 flag roundNearestEven;
2244 int8_t roundIncrement, roundBits;
2245 flag isTiny;
2247 roundingMode = status->float_rounding_mode;
2248 roundNearestEven = ( roundingMode == float_round_nearest_even );
2249 switch (roundingMode) {
2250 case float_round_nearest_even:
2251 case float_round_ties_away:
2252 roundIncrement = 0x40;
2253 break;
2254 case float_round_to_zero:
2255 roundIncrement = 0;
2256 break;
2257 case float_round_up:
2258 roundIncrement = zSign ? 0 : 0x7f;
2259 break;
2260 case float_round_down:
2261 roundIncrement = zSign ? 0x7f : 0;
2262 break;
2263 default:
2264 abort();
2265 break;
2267 roundBits = zSig & 0x7F;
2268 if ( 0xFD <= (uint16_t) zExp ) {
2269 if ( ( 0xFD < zExp )
2270 || ( ( zExp == 0xFD )
2271 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
2273 float_raise(float_flag_overflow | float_flag_inexact, status);
2274 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
2276 if ( zExp < 0 ) {
2277 if (status->flush_to_zero) {
2278 float_raise(float_flag_output_denormal, status);
2279 return packFloat32(zSign, 0, 0);
2281 isTiny =
2282 (status->float_detect_tininess
2283 == float_tininess_before_rounding)
2284 || ( zExp < -1 )
2285 || ( zSig + roundIncrement < 0x80000000 );
2286 shift32RightJamming( zSig, - zExp, &zSig );
2287 zExp = 0;
2288 roundBits = zSig & 0x7F;
2289 if (isTiny && roundBits) {
2290 float_raise(float_flag_underflow, status);
2294 if (roundBits) {
2295 status->float_exception_flags |= float_flag_inexact;
2297 zSig = ( zSig + roundIncrement )>>7;
2298 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2299 if ( zSig == 0 ) zExp = 0;
2300 return packFloat32( zSign, zExp, zSig );
2304 /*----------------------------------------------------------------------------
2305 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2306 | and significand `zSig', and returns the proper single-precision floating-
2307 | point value corresponding to the abstract input. This routine is just like
2308 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
2309 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2310 | floating-point exponent.
2311 *----------------------------------------------------------------------------*/
2313 static float32
2314 normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
2315 float_status *status)
2317 int8_t shiftCount;
2319 shiftCount = countLeadingZeros32( zSig ) - 1;
2320 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
2321 status);
2325 /*----------------------------------------------------------------------------
2326 | If `a' is denormal and we are in flush-to-zero mode then set the
2327 | input-denormal exception and return zero. Otherwise just return the value.
2328 *----------------------------------------------------------------------------*/
2329 float64 float64_squash_input_denormal(float64 a, float_status *status)
2331 if (status->flush_inputs_to_zero) {
2332 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
2333 float_raise(float_flag_input_denormal, status);
2334 return make_float64(float64_val(a) & (1ULL << 63));
2337 return a;
2340 /*----------------------------------------------------------------------------
2341 | Normalizes the subnormal double-precision floating-point value represented
2342 | by the denormalized significand `aSig'. The normalized exponent and
2343 | significand are stored at the locations pointed to by `zExpPtr' and
2344 | `zSigPtr', respectively.
2345 *----------------------------------------------------------------------------*/
2347 static void
2348 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
2350 int8_t shiftCount;
2352 shiftCount = countLeadingZeros64( aSig ) - 11;
2353 *zSigPtr = aSig<<shiftCount;
2354 *zExpPtr = 1 - shiftCount;
2358 /*----------------------------------------------------------------------------
2359 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2360 | double-precision floating-point value, returning the result. After being
2361 | shifted into the proper positions, the three fields are simply added
2362 | together to form the result. This means that any integer portion of `zSig'
2363 | will be added into the exponent. Since a properly normalized significand
2364 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2365 | than the desired result exponent whenever `zSig' is a complete, normalized
2366 | significand.
2367 *----------------------------------------------------------------------------*/
2369 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
2372 return make_float64(
2373 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
2377 /*----------------------------------------------------------------------------
2378 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2379 | and significand `zSig', and returns the proper double-precision floating-
2380 | point value corresponding to the abstract input. Ordinarily, the abstract
2381 | value is simply rounded and packed into the double-precision format, with
2382 | the inexact exception raised if the abstract input cannot be represented
2383 | exactly. However, if the abstract value is too large, the overflow and
2384 | inexact exceptions are raised and an infinity or maximal finite value is
2385 | returned. If the abstract value is too small, the input value is rounded to
2386 | a subnormal number, and the underflow and inexact exceptions are raised if
2387 | the abstract input cannot be represented exactly as a subnormal double-
2388 | precision floating-point number.
2389 | The input significand `zSig' has its binary point between bits 62
2390 | and 61, which is 10 bits to the left of the usual location. This shifted
2391 | significand must be normalized or smaller. If `zSig' is not normalized,
2392 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2393 | and it must not require rounding. In the usual case that `zSig' is
2394 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2395 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2396 | Binary Floating-Point Arithmetic.
2397 *----------------------------------------------------------------------------*/
2399 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2400 float_status *status)
2402 int8_t roundingMode;
2403 flag roundNearestEven;
2404 int roundIncrement, roundBits;
2405 flag isTiny;
2407 roundingMode = status->float_rounding_mode;
2408 roundNearestEven = ( roundingMode == float_round_nearest_even );
2409 switch (roundingMode) {
2410 case float_round_nearest_even:
2411 case float_round_ties_away:
2412 roundIncrement = 0x200;
2413 break;
2414 case float_round_to_zero:
2415 roundIncrement = 0;
2416 break;
2417 case float_round_up:
2418 roundIncrement = zSign ? 0 : 0x3ff;
2419 break;
2420 case float_round_down:
2421 roundIncrement = zSign ? 0x3ff : 0;
2422 break;
2423 case float_round_to_odd:
2424 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2425 break;
2426 default:
2427 abort();
2429 roundBits = zSig & 0x3FF;
2430 if ( 0x7FD <= (uint16_t) zExp ) {
2431 if ( ( 0x7FD < zExp )
2432 || ( ( zExp == 0x7FD )
2433 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
2435 bool overflow_to_inf = roundingMode != float_round_to_odd &&
2436 roundIncrement != 0;
2437 float_raise(float_flag_overflow | float_flag_inexact, status);
2438 return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
2440 if ( zExp < 0 ) {
2441 if (status->flush_to_zero) {
2442 float_raise(float_flag_output_denormal, status);
2443 return packFloat64(zSign, 0, 0);
2445 isTiny =
2446 (status->float_detect_tininess
2447 == float_tininess_before_rounding)
2448 || ( zExp < -1 )
2449 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
2450 shift64RightJamming( zSig, - zExp, &zSig );
2451 zExp = 0;
2452 roundBits = zSig & 0x3FF;
2453 if (isTiny && roundBits) {
2454 float_raise(float_flag_underflow, status);
2456 if (roundingMode == float_round_to_odd) {
2458 * For round-to-odd case, the roundIncrement depends on
2459 * zSig which just changed.
2461 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2465 if (roundBits) {
2466 status->float_exception_flags |= float_flag_inexact;
2468 zSig = ( zSig + roundIncrement )>>10;
2469 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
2470 if ( zSig == 0 ) zExp = 0;
2471 return packFloat64( zSign, zExp, zSig );
2475 /*----------------------------------------------------------------------------
2476 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2477 | and significand `zSig', and returns the proper double-precision floating-
2478 | point value corresponding to the abstract input. This routine is just like
2479 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
2480 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2481 | floating-point exponent.
2482 *----------------------------------------------------------------------------*/
2484 static float64
2485 normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2486 float_status *status)
2488 int8_t shiftCount;
2490 shiftCount = countLeadingZeros64( zSig ) - 1;
2491 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
2492 status);
2496 /*----------------------------------------------------------------------------
2497 | Normalizes the subnormal extended double-precision floating-point value
2498 | represented by the denormalized significand `aSig'. The normalized exponent
2499 | and significand are stored at the locations pointed to by `zExpPtr' and
2500 | `zSigPtr', respectively.
2501 *----------------------------------------------------------------------------*/
2503 void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
2504 uint64_t *zSigPtr)
2506 int8_t shiftCount;
2508 shiftCount = countLeadingZeros64( aSig );
2509 *zSigPtr = aSig<<shiftCount;
2510 *zExpPtr = 1 - shiftCount;
2513 /*----------------------------------------------------------------------------
2514 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2515 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
2516 | and returns the proper extended double-precision floating-point value
2517 | corresponding to the abstract input. Ordinarily, the abstract value is
2518 | rounded and packed into the extended double-precision format, with the
2519 | inexact exception raised if the abstract input cannot be represented
2520 | exactly. However, if the abstract value is too large, the overflow and
2521 | inexact exceptions are raised and an infinity or maximal finite value is
2522 | returned. If the abstract value is too small, the input value is rounded to
2523 | a subnormal number, and the underflow and inexact exceptions are raised if
2524 | the abstract input cannot be represented exactly as a subnormal extended
2525 | double-precision floating-point number.
2526 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
2527 | number of bits as single or double precision, respectively. Otherwise, the
2528 | result is rounded to the full precision of the extended double-precision
2529 | format.
2530 | The input significand must be normalized or smaller. If the input
2531 | significand is not normalized, `zExp' must be 0; in that case, the result
2532 | returned is a subnormal number, and it must not require rounding. The
2533 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
2534 | Floating-Point Arithmetic.
2535 *----------------------------------------------------------------------------*/
2537 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
2538 int32_t zExp, uint64_t zSig0, uint64_t zSig1,
2539 float_status *status)
2541 int8_t roundingMode;
2542 flag roundNearestEven, increment, isTiny;
2543 int64_t roundIncrement, roundMask, roundBits;
2545 roundingMode = status->float_rounding_mode;
2546 roundNearestEven = ( roundingMode == float_round_nearest_even );
2547 if ( roundingPrecision == 80 ) goto precision80;
2548 if ( roundingPrecision == 64 ) {
2549 roundIncrement = LIT64( 0x0000000000000400 );
2550 roundMask = LIT64( 0x00000000000007FF );
2552 else if ( roundingPrecision == 32 ) {
2553 roundIncrement = LIT64( 0x0000008000000000 );
2554 roundMask = LIT64( 0x000000FFFFFFFFFF );
2556 else {
2557 goto precision80;
2559 zSig0 |= ( zSig1 != 0 );
2560 switch (roundingMode) {
2561 case float_round_nearest_even:
2562 case float_round_ties_away:
2563 break;
2564 case float_round_to_zero:
2565 roundIncrement = 0;
2566 break;
2567 case float_round_up:
2568 roundIncrement = zSign ? 0 : roundMask;
2569 break;
2570 case float_round_down:
2571 roundIncrement = zSign ? roundMask : 0;
2572 break;
2573 default:
2574 abort();
2576 roundBits = zSig0 & roundMask;
2577 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
2578 if ( ( 0x7FFE < zExp )
2579 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
2581 goto overflow;
2583 if ( zExp <= 0 ) {
2584 if (status->flush_to_zero) {
2585 float_raise(float_flag_output_denormal, status);
2586 return packFloatx80(zSign, 0, 0);
2588 isTiny =
2589 (status->float_detect_tininess
2590 == float_tininess_before_rounding)
2591 || ( zExp < 0 )
2592 || ( zSig0 <= zSig0 + roundIncrement );
2593 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
2594 zExp = 0;
2595 roundBits = zSig0 & roundMask;
2596 if (isTiny && roundBits) {
2597 float_raise(float_flag_underflow, status);
2599 if (roundBits) {
2600 status->float_exception_flags |= float_flag_inexact;
2602 zSig0 += roundIncrement;
2603 if ( (int64_t) zSig0 < 0 ) zExp = 1;
2604 roundIncrement = roundMask + 1;
2605 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
2606 roundMask |= roundIncrement;
2608 zSig0 &= ~ roundMask;
2609 return packFloatx80( zSign, zExp, zSig0 );
2612 if (roundBits) {
2613 status->float_exception_flags |= float_flag_inexact;
2615 zSig0 += roundIncrement;
2616 if ( zSig0 < roundIncrement ) {
2617 ++zExp;
2618 zSig0 = LIT64( 0x8000000000000000 );
2620 roundIncrement = roundMask + 1;
2621 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
2622 roundMask |= roundIncrement;
2624 zSig0 &= ~ roundMask;
2625 if ( zSig0 == 0 ) zExp = 0;
2626 return packFloatx80( zSign, zExp, zSig0 );
2627 precision80:
2628 switch (roundingMode) {
2629 case float_round_nearest_even:
2630 case float_round_ties_away:
2631 increment = ((int64_t)zSig1 < 0);
2632 break;
2633 case float_round_to_zero:
2634 increment = 0;
2635 break;
2636 case float_round_up:
2637 increment = !zSign && zSig1;
2638 break;
2639 case float_round_down:
2640 increment = zSign && zSig1;
2641 break;
2642 default:
2643 abort();
2645 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
2646 if ( ( 0x7FFE < zExp )
2647 || ( ( zExp == 0x7FFE )
2648 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
2649 && increment
2652 roundMask = 0;
2653 overflow:
2654 float_raise(float_flag_overflow | float_flag_inexact, status);
2655 if ( ( roundingMode == float_round_to_zero )
2656 || ( zSign && ( roundingMode == float_round_up ) )
2657 || ( ! zSign && ( roundingMode == float_round_down ) )
2659 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
2661 return packFloatx80(zSign,
2662 floatx80_infinity_high,
2663 floatx80_infinity_low);
2665 if ( zExp <= 0 ) {
2666 isTiny =
2667 (status->float_detect_tininess
2668 == float_tininess_before_rounding)
2669 || ( zExp < 0 )
2670 || ! increment
2671 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
2672 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
2673 zExp = 0;
2674 if (isTiny && zSig1) {
2675 float_raise(float_flag_underflow, status);
2677 if (zSig1) {
2678 status->float_exception_flags |= float_flag_inexact;
2680 switch (roundingMode) {
2681 case float_round_nearest_even:
2682 case float_round_ties_away:
2683 increment = ((int64_t)zSig1 < 0);
2684 break;
2685 case float_round_to_zero:
2686 increment = 0;
2687 break;
2688 case float_round_up:
2689 increment = !zSign && zSig1;
2690 break;
2691 case float_round_down:
2692 increment = zSign && zSig1;
2693 break;
2694 default:
2695 abort();
2697 if ( increment ) {
2698 ++zSig0;
2699 zSig0 &=
2700 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
2701 if ( (int64_t) zSig0 < 0 ) zExp = 1;
2703 return packFloatx80( zSign, zExp, zSig0 );
2706 if (zSig1) {
2707 status->float_exception_flags |= float_flag_inexact;
2709 if ( increment ) {
2710 ++zSig0;
2711 if ( zSig0 == 0 ) {
2712 ++zExp;
2713 zSig0 = LIT64( 0x8000000000000000 );
2715 else {
2716 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
2719 else {
2720 if ( zSig0 == 0 ) zExp = 0;
2722 return packFloatx80( zSign, zExp, zSig0 );
2726 /*----------------------------------------------------------------------------
2727 | Takes an abstract floating-point value having sign `zSign', exponent
2728 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
2729 | and returns the proper extended double-precision floating-point value
2730 | corresponding to the abstract input. This routine is just like
2731 | `roundAndPackFloatx80' except that the input significand does not have to be
2732 | normalized.
2733 *----------------------------------------------------------------------------*/
2735 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
2736 flag zSign, int32_t zExp,
2737 uint64_t zSig0, uint64_t zSig1,
2738 float_status *status)
2740 int8_t shiftCount;
2742 if ( zSig0 == 0 ) {
2743 zSig0 = zSig1;
2744 zSig1 = 0;
2745 zExp -= 64;
2747 shiftCount = countLeadingZeros64( zSig0 );
2748 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
2749 zExp -= shiftCount;
2750 return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
2751 zSig0, zSig1, status);
2755 /*----------------------------------------------------------------------------
2756 | Returns the least-significant 64 fraction bits of the quadruple-precision
2757 | floating-point value `a'.
2758 *----------------------------------------------------------------------------*/
2760 static inline uint64_t extractFloat128Frac1( float128 a )
2763 return a.low;
2767 /*----------------------------------------------------------------------------
2768 | Returns the most-significant 48 fraction bits of the quadruple-precision
2769 | floating-point value `a'.
2770 *----------------------------------------------------------------------------*/
2772 static inline uint64_t extractFloat128Frac0( float128 a )
2775 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
2779 /*----------------------------------------------------------------------------
2780 | Returns the exponent bits of the quadruple-precision floating-point value
2781 | `a'.
2782 *----------------------------------------------------------------------------*/
2784 static inline int32_t extractFloat128Exp( float128 a )
2787 return ( a.high>>48 ) & 0x7FFF;
2791 /*----------------------------------------------------------------------------
2792 | Returns the sign bit of the quadruple-precision floating-point value `a'.
2793 *----------------------------------------------------------------------------*/
2795 static inline flag extractFloat128Sign( float128 a )
2798 return a.high>>63;
2802 /*----------------------------------------------------------------------------
2803 | Normalizes the subnormal quadruple-precision floating-point value
2804 | represented by the denormalized significand formed by the concatenation of
2805 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
2806 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
2807 | significand are stored at the location pointed to by `zSig0Ptr', and the
2808 | least significant 64 bits of the normalized significand are stored at the
2809 | location pointed to by `zSig1Ptr'.
2810 *----------------------------------------------------------------------------*/
2812 static void
2813 normalizeFloat128Subnormal(
2814 uint64_t aSig0,
2815 uint64_t aSig1,
2816 int32_t *zExpPtr,
2817 uint64_t *zSig0Ptr,
2818 uint64_t *zSig1Ptr
2821 int8_t shiftCount;
2823 if ( aSig0 == 0 ) {
2824 shiftCount = countLeadingZeros64( aSig1 ) - 15;
2825 if ( shiftCount < 0 ) {
2826 *zSig0Ptr = aSig1>>( - shiftCount );
2827 *zSig1Ptr = aSig1<<( shiftCount & 63 );
2829 else {
2830 *zSig0Ptr = aSig1<<shiftCount;
2831 *zSig1Ptr = 0;
2833 *zExpPtr = - shiftCount - 63;
2835 else {
2836 shiftCount = countLeadingZeros64( aSig0 ) - 15;
2837 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
2838 *zExpPtr = 1 - shiftCount;
2843 /*----------------------------------------------------------------------------
2844 | Packs the sign `zSign', the exponent `zExp', and the significand formed
2845 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
2846 | floating-point value, returning the result. After being shifted into the
2847 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
2848 | added together to form the most significant 32 bits of the result. This
2849 | means that any integer portion of `zSig0' will be added into the exponent.
2850 | Since a properly normalized significand will have an integer portion equal
2851 | to 1, the `zExp' input should be 1 less than the desired result exponent
2852 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
2853 | significand.
2854 *----------------------------------------------------------------------------*/
2856 static inline float128
2857 packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
2859 float128 z;
2861 z.low = zSig1;
2862 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
2863 return z;
2867 /*----------------------------------------------------------------------------
2868 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2869 | and extended significand formed by the concatenation of `zSig0', `zSig1',
2870 | and `zSig2', and returns the proper quadruple-precision floating-point value
2871 | corresponding to the abstract input. Ordinarily, the abstract value is
2872 | simply rounded and packed into the quadruple-precision format, with the
2873 | inexact exception raised if the abstract input cannot be represented
2874 | exactly. However, if the abstract value is too large, the overflow and
2875 | inexact exceptions are raised and an infinity or maximal finite value is
2876 | returned. If the abstract value is too small, the input value is rounded to
2877 | a subnormal number, and the underflow and inexact exceptions are raised if
2878 | the abstract input cannot be represented exactly as a subnormal quadruple-
2879 | precision floating-point number.
2880 | The input significand must be normalized or smaller. If the input
2881 | significand is not normalized, `zExp' must be 0; in that case, the result
2882 | returned is a subnormal number, and it must not require rounding. In the
2883 | usual case that the input significand is normalized, `zExp' must be 1 less
2884 | than the ``true'' floating-point exponent. The handling of underflow and
2885 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2886 *----------------------------------------------------------------------------*/
2888 static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
2889 uint64_t zSig0, uint64_t zSig1,
2890 uint64_t zSig2, float_status *status)
2892 int8_t roundingMode;
2893 flag roundNearestEven, increment, isTiny;
2895 roundingMode = status->float_rounding_mode;
2896 roundNearestEven = ( roundingMode == float_round_nearest_even );
2897 switch (roundingMode) {
2898 case float_round_nearest_even:
2899 case float_round_ties_away:
2900 increment = ((int64_t)zSig2 < 0);
2901 break;
2902 case float_round_to_zero:
2903 increment = 0;
2904 break;
2905 case float_round_up:
2906 increment = !zSign && zSig2;
2907 break;
2908 case float_round_down:
2909 increment = zSign && zSig2;
2910 break;
2911 case float_round_to_odd:
2912 increment = !(zSig1 & 0x1) && zSig2;
2913 break;
2914 default:
2915 abort();
2917 if ( 0x7FFD <= (uint32_t) zExp ) {
2918 if ( ( 0x7FFD < zExp )
2919 || ( ( zExp == 0x7FFD )
2920 && eq128(
2921 LIT64( 0x0001FFFFFFFFFFFF ),
2922 LIT64( 0xFFFFFFFFFFFFFFFF ),
2923 zSig0,
2924 zSig1
2926 && increment
2929 float_raise(float_flag_overflow | float_flag_inexact, status);
2930 if ( ( roundingMode == float_round_to_zero )
2931 || ( zSign && ( roundingMode == float_round_up ) )
2932 || ( ! zSign && ( roundingMode == float_round_down ) )
2933 || (roundingMode == float_round_to_odd)
2935 return
2936 packFloat128(
2937 zSign,
2938 0x7FFE,
2939 LIT64( 0x0000FFFFFFFFFFFF ),
2940 LIT64( 0xFFFFFFFFFFFFFFFF )
2943 return packFloat128( zSign, 0x7FFF, 0, 0 );
2945 if ( zExp < 0 ) {
2946 if (status->flush_to_zero) {
2947 float_raise(float_flag_output_denormal, status);
2948 return packFloat128(zSign, 0, 0, 0);
2950 isTiny =
2951 (status->float_detect_tininess
2952 == float_tininess_before_rounding)
2953 || ( zExp < -1 )
2954 || ! increment
2955 || lt128(
2956 zSig0,
2957 zSig1,
2958 LIT64( 0x0001FFFFFFFFFFFF ),
2959 LIT64( 0xFFFFFFFFFFFFFFFF )
2961 shift128ExtraRightJamming(
2962 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
2963 zExp = 0;
2964 if (isTiny && zSig2) {
2965 float_raise(float_flag_underflow, status);
2967 switch (roundingMode) {
2968 case float_round_nearest_even:
2969 case float_round_ties_away:
2970 increment = ((int64_t)zSig2 < 0);
2971 break;
2972 case float_round_to_zero:
2973 increment = 0;
2974 break;
2975 case float_round_up:
2976 increment = !zSign && zSig2;
2977 break;
2978 case float_round_down:
2979 increment = zSign && zSig2;
2980 break;
2981 case float_round_to_odd:
2982 increment = !(zSig1 & 0x1) && zSig2;
2983 break;
2984 default:
2985 abort();
2989 if (zSig2) {
2990 status->float_exception_flags |= float_flag_inexact;
2992 if ( increment ) {
2993 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
2994 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
2996 else {
2997 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
2999 return packFloat128( zSign, zExp, zSig0, zSig1 );
3003 /*----------------------------------------------------------------------------
3004 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3005 | and significand formed by the concatenation of `zSig0' and `zSig1', and
3006 | returns the proper quadruple-precision floating-point value corresponding
3007 | to the abstract input. This routine is just like `roundAndPackFloat128'
3008 | except that the input significand has fewer bits and does not have to be
3009 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
3010 | point exponent.
3011 *----------------------------------------------------------------------------*/
3013 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
3014 uint64_t zSig0, uint64_t zSig1,
3015 float_status *status)
3017 int8_t shiftCount;
3018 uint64_t zSig2;
3020 if ( zSig0 == 0 ) {
3021 zSig0 = zSig1;
3022 zSig1 = 0;
3023 zExp -= 64;
3025 shiftCount = countLeadingZeros64( zSig0 ) - 15;
3026 if ( 0 <= shiftCount ) {
3027 zSig2 = 0;
3028 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3030 else {
3031 shift128ExtraRightJamming(
3032 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
3034 zExp -= shiftCount;
3035 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
3040 /*----------------------------------------------------------------------------
3041 | Returns the result of converting the 32-bit two's complement integer `a'
3042 | to the extended double-precision floating-point format. The conversion
3043 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3044 | Arithmetic.
3045 *----------------------------------------------------------------------------*/
3047 floatx80 int32_to_floatx80(int32_t a, float_status *status)
3049 flag zSign;
3050 uint32_t absA;
3051 int8_t shiftCount;
3052 uint64_t zSig;
3054 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3055 zSign = ( a < 0 );
3056 absA = zSign ? - a : a;
3057 shiftCount = countLeadingZeros32( absA ) + 32;
3058 zSig = absA;
3059 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
3063 /*----------------------------------------------------------------------------
3064 | Returns the result of converting the 32-bit two's complement integer `a' to
3065 | the quadruple-precision floating-point format. The conversion is performed
3066 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3067 *----------------------------------------------------------------------------*/
3069 float128 int32_to_float128(int32_t a, float_status *status)
3071 flag zSign;
3072 uint32_t absA;
3073 int8_t shiftCount;
3074 uint64_t zSig0;
3076 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3077 zSign = ( a < 0 );
3078 absA = zSign ? - a : a;
3079 shiftCount = countLeadingZeros32( absA ) + 17;
3080 zSig0 = absA;
3081 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
3085 /*----------------------------------------------------------------------------
3086 | Returns the result of converting the 64-bit two's complement integer `a'
3087 | to the extended double-precision floating-point format. The conversion
3088 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3089 | Arithmetic.
3090 *----------------------------------------------------------------------------*/
3092 floatx80 int64_to_floatx80(int64_t a, float_status *status)
3094 flag zSign;
3095 uint64_t absA;
3096 int8_t shiftCount;
3098 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3099 zSign = ( a < 0 );
3100 absA = zSign ? - a : a;
3101 shiftCount = countLeadingZeros64( absA );
3102 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
3106 /*----------------------------------------------------------------------------
3107 | Returns the result of converting the 64-bit two's complement integer `a' to
3108 | the quadruple-precision floating-point format. The conversion is performed
3109 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3110 *----------------------------------------------------------------------------*/
3112 float128 int64_to_float128(int64_t a, float_status *status)
3114 flag zSign;
3115 uint64_t absA;
3116 int8_t shiftCount;
3117 int32_t zExp;
3118 uint64_t zSig0, zSig1;
3120 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3121 zSign = ( a < 0 );
3122 absA = zSign ? - a : a;
3123 shiftCount = countLeadingZeros64( absA ) + 49;
3124 zExp = 0x406E - shiftCount;
3125 if ( 64 <= shiftCount ) {
3126 zSig1 = 0;
3127 zSig0 = absA;
3128 shiftCount -= 64;
3130 else {
3131 zSig1 = absA;
3132 zSig0 = 0;
3134 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3135 return packFloat128( zSign, zExp, zSig0, zSig1 );
3139 /*----------------------------------------------------------------------------
3140 | Returns the result of converting the 64-bit unsigned integer `a'
3141 | to the quadruple-precision floating-point format. The conversion is performed
3142 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3143 *----------------------------------------------------------------------------*/
3145 float128 uint64_to_float128(uint64_t a, float_status *status)
3147 if (a == 0) {
3148 return float128_zero;
3150 return normalizeRoundAndPackFloat128(0, 0x406E, a, 0, status);
3156 /*----------------------------------------------------------------------------
3157 | Returns the result of converting the single-precision floating-point value
3158 | `a' to the double-precision floating-point format. The conversion is
3159 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3160 | Arithmetic.
3161 *----------------------------------------------------------------------------*/
3163 float64 float32_to_float64(float32 a, float_status *status)
3165 flag aSign;
3166 int aExp;
3167 uint32_t aSig;
3168 a = float32_squash_input_denormal(a, status);
3170 aSig = extractFloat32Frac( a );
3171 aExp = extractFloat32Exp( a );
3172 aSign = extractFloat32Sign( a );
3173 if ( aExp == 0xFF ) {
3174 if (aSig) {
3175 return commonNaNToFloat64(float32ToCommonNaN(a, status), status);
3177 return packFloat64( aSign, 0x7FF, 0 );
3179 if ( aExp == 0 ) {
3180 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
3181 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3182 --aExp;
3184 return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
3188 /*----------------------------------------------------------------------------
3189 | Returns the result of converting the single-precision floating-point value
3190 | `a' to the extended double-precision floating-point format. The conversion
3191 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3192 | Arithmetic.
3193 *----------------------------------------------------------------------------*/
3195 floatx80 float32_to_floatx80(float32 a, float_status *status)
3197 flag aSign;
3198 int aExp;
3199 uint32_t aSig;
3201 a = float32_squash_input_denormal(a, status);
3202 aSig = extractFloat32Frac( a );
3203 aExp = extractFloat32Exp( a );
3204 aSign = extractFloat32Sign( a );
3205 if ( aExp == 0xFF ) {
3206 if (aSig) {
3207 return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
3209 return packFloatx80(aSign,
3210 floatx80_infinity_high,
3211 floatx80_infinity_low);
3213 if ( aExp == 0 ) {
3214 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3215 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3217 aSig |= 0x00800000;
3218 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
3222 /*----------------------------------------------------------------------------
3223 | Returns the result of converting the single-precision floating-point value
3224 | `a' to the double-precision floating-point format. The conversion is
3225 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3226 | Arithmetic.
3227 *----------------------------------------------------------------------------*/
3229 float128 float32_to_float128(float32 a, float_status *status)
3231 flag aSign;
3232 int aExp;
3233 uint32_t aSig;
3235 a = float32_squash_input_denormal(a, status);
3236 aSig = extractFloat32Frac( a );
3237 aExp = extractFloat32Exp( a );
3238 aSign = extractFloat32Sign( a );
3239 if ( aExp == 0xFF ) {
3240 if (aSig) {
3241 return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
3243 return packFloat128( aSign, 0x7FFF, 0, 0 );
3245 if ( aExp == 0 ) {
3246 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3247 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3248 --aExp;
3250 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
3254 /*----------------------------------------------------------------------------
3255 | Returns the remainder of the single-precision floating-point value `a'
3256 | with respect to the corresponding value `b'. The operation is performed
3257 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3258 *----------------------------------------------------------------------------*/
3260 float32 float32_rem(float32 a, float32 b, float_status *status)
3262 flag aSign, zSign;
3263 int aExp, bExp, expDiff;
3264 uint32_t aSig, bSig;
3265 uint32_t q;
3266 uint64_t aSig64, bSig64, q64;
3267 uint32_t alternateASig;
3268 int32_t sigMean;
3269 a = float32_squash_input_denormal(a, status);
3270 b = float32_squash_input_denormal(b, status);
3272 aSig = extractFloat32Frac( a );
3273 aExp = extractFloat32Exp( a );
3274 aSign = extractFloat32Sign( a );
3275 bSig = extractFloat32Frac( b );
3276 bExp = extractFloat32Exp( b );
3277 if ( aExp == 0xFF ) {
3278 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
3279 return propagateFloat32NaN(a, b, status);
3281 float_raise(float_flag_invalid, status);
3282 return float32_default_nan(status);
3284 if ( bExp == 0xFF ) {
3285 if (bSig) {
3286 return propagateFloat32NaN(a, b, status);
3288 return a;
3290 if ( bExp == 0 ) {
3291 if ( bSig == 0 ) {
3292 float_raise(float_flag_invalid, status);
3293 return float32_default_nan(status);
3295 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
3297 if ( aExp == 0 ) {
3298 if ( aSig == 0 ) return a;
3299 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3301 expDiff = aExp - bExp;
3302 aSig |= 0x00800000;
3303 bSig |= 0x00800000;
3304 if ( expDiff < 32 ) {
3305 aSig <<= 8;
3306 bSig <<= 8;
3307 if ( expDiff < 0 ) {
3308 if ( expDiff < -1 ) return a;
3309 aSig >>= 1;
3311 q = ( bSig <= aSig );
3312 if ( q ) aSig -= bSig;
3313 if ( 0 < expDiff ) {
3314 q = ( ( (uint64_t) aSig )<<32 ) / bSig;
3315 q >>= 32 - expDiff;
3316 bSig >>= 2;
3317 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3319 else {
3320 aSig >>= 2;
3321 bSig >>= 2;
3324 else {
3325 if ( bSig <= aSig ) aSig -= bSig;
3326 aSig64 = ( (uint64_t) aSig )<<40;
3327 bSig64 = ( (uint64_t) bSig )<<40;
3328 expDiff -= 64;
3329 while ( 0 < expDiff ) {
3330 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3331 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3332 aSig64 = - ( ( bSig * q64 )<<38 );
3333 expDiff -= 62;
3335 expDiff += 64;
3336 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3337 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3338 q = q64>>( 64 - expDiff );
3339 bSig <<= 6;
3340 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
3342 do {
3343 alternateASig = aSig;
3344 ++q;
3345 aSig -= bSig;
3346 } while ( 0 <= (int32_t) aSig );
3347 sigMean = aSig + alternateASig;
3348 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3349 aSig = alternateASig;
3351 zSign = ( (int32_t) aSig < 0 );
3352 if ( zSign ) aSig = - aSig;
3353 return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
3358 /*----------------------------------------------------------------------------
3359 | Returns the binary exponential of the single-precision floating-point value
3360 | `a'. The operation is performed according to the IEC/IEEE Standard for
3361 | Binary Floating-Point Arithmetic.
3363 | Uses the following identities:
3365 | 1. -------------------------------------------------------------------------
3366 | x x*ln(2)
3367 | 2 = e
3369 | 2. -------------------------------------------------------------------------
3370 | 2 3 4 5 n
3371 | x x x x x x x
3372 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
3373 | 1! 2! 3! 4! 5! n!
3374 *----------------------------------------------------------------------------*/
3376 static const float64 float32_exp2_coefficients[15] =
3378 const_float64( 0x3ff0000000000000ll ), /* 1 */
3379 const_float64( 0x3fe0000000000000ll ), /* 2 */
3380 const_float64( 0x3fc5555555555555ll ), /* 3 */
3381 const_float64( 0x3fa5555555555555ll ), /* 4 */
3382 const_float64( 0x3f81111111111111ll ), /* 5 */
3383 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */
3384 const_float64( 0x3f2a01a01a01a01all ), /* 7 */
3385 const_float64( 0x3efa01a01a01a01all ), /* 8 */
3386 const_float64( 0x3ec71de3a556c734ll ), /* 9 */
3387 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
3388 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
3389 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
3390 const_float64( 0x3de6124613a86d09ll ), /* 13 */
3391 const_float64( 0x3da93974a8c07c9dll ), /* 14 */
3392 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
3395 float32 float32_exp2(float32 a, float_status *status)
3397 flag aSign;
3398 int aExp;
3399 uint32_t aSig;
3400 float64 r, x, xn;
3401 int i;
3402 a = float32_squash_input_denormal(a, status);
3404 aSig = extractFloat32Frac( a );
3405 aExp = extractFloat32Exp( a );
3406 aSign = extractFloat32Sign( a );
3408 if ( aExp == 0xFF) {
3409 if (aSig) {
3410 return propagateFloat32NaN(a, float32_zero, status);
3412 return (aSign) ? float32_zero : a;
3414 if (aExp == 0) {
3415 if (aSig == 0) return float32_one;
3418 float_raise(float_flag_inexact, status);
3420 /* ******************************* */
3421 /* using float64 for approximation */
3422 /* ******************************* */
3423 x = float32_to_float64(a, status);
3424 x = float64_mul(x, float64_ln2, status);
3426 xn = x;
3427 r = float64_one;
3428 for (i = 0 ; i < 15 ; i++) {
3429 float64 f;
3431 f = float64_mul(xn, float32_exp2_coefficients[i], status);
3432 r = float64_add(r, f, status);
3434 xn = float64_mul(xn, x, status);
3437 return float64_to_float32(r, status);
3440 /*----------------------------------------------------------------------------
3441 | Returns the binary log of the single-precision floating-point value `a'.
3442 | The operation is performed according to the IEC/IEEE Standard for Binary
3443 | Floating-Point Arithmetic.
3444 *----------------------------------------------------------------------------*/
3445 float32 float32_log2(float32 a, float_status *status)
3447 flag aSign, zSign;
3448 int aExp;
3449 uint32_t aSig, zSig, i;
3451 a = float32_squash_input_denormal(a, status);
3452 aSig = extractFloat32Frac( a );
3453 aExp = extractFloat32Exp( a );
3454 aSign = extractFloat32Sign( a );
3456 if ( aExp == 0 ) {
3457 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
3458 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3460 if ( aSign ) {
3461 float_raise(float_flag_invalid, status);
3462 return float32_default_nan(status);
3464 if ( aExp == 0xFF ) {
3465 if (aSig) {
3466 return propagateFloat32NaN(a, float32_zero, status);
3468 return a;
3471 aExp -= 0x7F;
3472 aSig |= 0x00800000;
3473 zSign = aExp < 0;
3474 zSig = aExp << 23;
3476 for (i = 1 << 22; i > 0; i >>= 1) {
3477 aSig = ( (uint64_t)aSig * aSig ) >> 23;
3478 if ( aSig & 0x01000000 ) {
3479 aSig >>= 1;
3480 zSig |= i;
3484 if ( zSign )
3485 zSig = -zSig;
3487 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
3490 /*----------------------------------------------------------------------------
3491 | Returns 1 if the single-precision floating-point value `a' is equal to
3492 | the corresponding value `b', and 0 otherwise. The invalid exception is
3493 | raised if either operand is a NaN. Otherwise, the comparison is performed
3494 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3495 *----------------------------------------------------------------------------*/
3497 int float32_eq(float32 a, float32 b, float_status *status)
3499 uint32_t av, bv;
3500 a = float32_squash_input_denormal(a, status);
3501 b = float32_squash_input_denormal(b, status);
3503 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3504 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3506 float_raise(float_flag_invalid, status);
3507 return 0;
3509 av = float32_val(a);
3510 bv = float32_val(b);
3511 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3514 /*----------------------------------------------------------------------------
3515 | Returns 1 if the single-precision floating-point value `a' is less than
3516 | or equal to the corresponding value `b', and 0 otherwise. The invalid
3517 | exception is raised if either operand is a NaN. The comparison is performed
3518 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3519 *----------------------------------------------------------------------------*/
3521 int float32_le(float32 a, float32 b, float_status *status)
3523 flag aSign, bSign;
3524 uint32_t av, bv;
3525 a = float32_squash_input_denormal(a, status);
3526 b = float32_squash_input_denormal(b, status);
3528 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3529 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3531 float_raise(float_flag_invalid, status);
3532 return 0;
3534 aSign = extractFloat32Sign( a );
3535 bSign = extractFloat32Sign( b );
3536 av = float32_val(a);
3537 bv = float32_val(b);
3538 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3539 return ( av == bv ) || ( aSign ^ ( av < bv ) );
3543 /*----------------------------------------------------------------------------
3544 | Returns 1 if the single-precision floating-point value `a' is less than
3545 | the corresponding value `b', and 0 otherwise. The invalid exception is
3546 | raised if either operand is a NaN. The comparison is performed according
3547 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3548 *----------------------------------------------------------------------------*/
3550 int float32_lt(float32 a, float32 b, float_status *status)
3552 flag aSign, bSign;
3553 uint32_t av, bv;
3554 a = float32_squash_input_denormal(a, status);
3555 b = float32_squash_input_denormal(b, status);
3557 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3558 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3560 float_raise(float_flag_invalid, status);
3561 return 0;
3563 aSign = extractFloat32Sign( a );
3564 bSign = extractFloat32Sign( b );
3565 av = float32_val(a);
3566 bv = float32_val(b);
3567 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3568 return ( av != bv ) && ( aSign ^ ( av < bv ) );
3572 /*----------------------------------------------------------------------------
3573 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3574 | be compared, and 0 otherwise. The invalid exception is raised if either
3575 | operand is a NaN. The comparison is performed according to the IEC/IEEE
3576 | Standard for Binary Floating-Point Arithmetic.
3577 *----------------------------------------------------------------------------*/
3579 int float32_unordered(float32 a, float32 b, float_status *status)
3581 a = float32_squash_input_denormal(a, status);
3582 b = float32_squash_input_denormal(b, status);
3584 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3585 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3587 float_raise(float_flag_invalid, status);
3588 return 1;
3590 return 0;
3593 /*----------------------------------------------------------------------------
3594 | Returns 1 if the single-precision floating-point value `a' is equal to
3595 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3596 | exception. The comparison is performed according to the IEC/IEEE Standard
3597 | for Binary Floating-Point Arithmetic.
3598 *----------------------------------------------------------------------------*/
3600 int float32_eq_quiet(float32 a, float32 b, float_status *status)
3602 a = float32_squash_input_denormal(a, status);
3603 b = float32_squash_input_denormal(b, status);
3605 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3606 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3608 if (float32_is_signaling_nan(a, status)
3609 || float32_is_signaling_nan(b, status)) {
3610 float_raise(float_flag_invalid, status);
3612 return 0;
3614 return ( float32_val(a) == float32_val(b) ) ||
3615 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
3618 /*----------------------------------------------------------------------------
3619 | Returns 1 if the single-precision floating-point value `a' is less than or
3620 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3621 | cause an exception. Otherwise, the comparison is performed according to the
3622 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3623 *----------------------------------------------------------------------------*/
3625 int float32_le_quiet(float32 a, float32 b, float_status *status)
3627 flag aSign, bSign;
3628 uint32_t av, bv;
3629 a = float32_squash_input_denormal(a, status);
3630 b = float32_squash_input_denormal(b, status);
3632 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3633 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3635 if (float32_is_signaling_nan(a, status)
3636 || float32_is_signaling_nan(b, status)) {
3637 float_raise(float_flag_invalid, status);
3639 return 0;
3641 aSign = extractFloat32Sign( a );
3642 bSign = extractFloat32Sign( b );
3643 av = float32_val(a);
3644 bv = float32_val(b);
3645 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3646 return ( av == bv ) || ( aSign ^ ( av < bv ) );
3650 /*----------------------------------------------------------------------------
3651 | Returns 1 if the single-precision floating-point value `a' is less than
3652 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3653 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
3654 | Standard for Binary Floating-Point Arithmetic.
3655 *----------------------------------------------------------------------------*/
3657 int float32_lt_quiet(float32 a, float32 b, float_status *status)
3659 flag aSign, bSign;
3660 uint32_t av, bv;
3661 a = float32_squash_input_denormal(a, status);
3662 b = float32_squash_input_denormal(b, status);
3664 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3665 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3667 if (float32_is_signaling_nan(a, status)
3668 || float32_is_signaling_nan(b, status)) {
3669 float_raise(float_flag_invalid, status);
3671 return 0;
3673 aSign = extractFloat32Sign( a );
3674 bSign = extractFloat32Sign( b );
3675 av = float32_val(a);
3676 bv = float32_val(b);
3677 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3678 return ( av != bv ) && ( aSign ^ ( av < bv ) );
3682 /*----------------------------------------------------------------------------
3683 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3684 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
3685 | comparison is performed according to the IEC/IEEE Standard for Binary
3686 | Floating-Point Arithmetic.
3687 *----------------------------------------------------------------------------*/
3689 int float32_unordered_quiet(float32 a, float32 b, float_status *status)
3691 a = float32_squash_input_denormal(a, status);
3692 b = float32_squash_input_denormal(b, status);
3694 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3695 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3697 if (float32_is_signaling_nan(a, status)
3698 || float32_is_signaling_nan(b, status)) {
3699 float_raise(float_flag_invalid, status);
3701 return 1;
3703 return 0;
3707 /*----------------------------------------------------------------------------
3708 | Returns the result of converting the double-precision floating-point value
3709 | `a' to the single-precision floating-point format. The conversion is
3710 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3711 | Arithmetic.
3712 *----------------------------------------------------------------------------*/
3714 float32 float64_to_float32(float64 a, float_status *status)
3716 flag aSign;
3717 int aExp;
3718 uint64_t aSig;
3719 uint32_t zSig;
3720 a = float64_squash_input_denormal(a, status);
3722 aSig = extractFloat64Frac( a );
3723 aExp = extractFloat64Exp( a );
3724 aSign = extractFloat64Sign( a );
3725 if ( aExp == 0x7FF ) {
3726 if (aSig) {
3727 return commonNaNToFloat32(float64ToCommonNaN(a, status), status);
3729 return packFloat32( aSign, 0xFF, 0 );
3731 shift64RightJamming( aSig, 22, &aSig );
3732 zSig = aSig;
3733 if ( aExp || zSig ) {
3734 zSig |= 0x40000000;
3735 aExp -= 0x381;
3737 return roundAndPackFloat32(aSign, aExp, zSig, status);
3742 /*----------------------------------------------------------------------------
3743 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3744 | half-precision floating-point value, returning the result. After being
3745 | shifted into the proper positions, the three fields are simply added
3746 | together to form the result. This means that any integer portion of `zSig'
3747 | will be added into the exponent. Since a properly normalized significand
3748 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3749 | than the desired result exponent whenever `zSig' is a complete, normalized
3750 | significand.
3751 *----------------------------------------------------------------------------*/
3752 static float16 packFloat16(flag zSign, int zExp, uint16_t zSig)
3754 return make_float16(
3755 (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
3758 /*----------------------------------------------------------------------------
3759 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3760 | and significand `zSig', and returns the proper half-precision floating-
3761 | point value corresponding to the abstract input. Ordinarily, the abstract
3762 | value is simply rounded and packed into the half-precision format, with
3763 | the inexact exception raised if the abstract input cannot be represented
3764 | exactly. However, if the abstract value is too large, the overflow and
3765 | inexact exceptions are raised and an infinity or maximal finite value is
3766 | returned. If the abstract value is too small, the input value is rounded to
3767 | a subnormal number, and the underflow and inexact exceptions are raised if
3768 | the abstract input cannot be represented exactly as a subnormal half-
3769 | precision floating-point number.
3770 | The `ieee' flag indicates whether to use IEEE standard half precision, or
3771 | ARM-style "alternative representation", which omits the NaN and Inf
3772 | encodings in order to raise the maximum representable exponent by one.
3773 | The input significand `zSig' has its binary point between bits 22
3774 | and 23, which is 13 bits to the left of the usual location. This shifted
3775 | significand must be normalized or smaller. If `zSig' is not normalized,
3776 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3777 | and it must not require rounding. In the usual case that `zSig' is
3778 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3779 | Note the slightly odd position of the binary point in zSig compared with the
3780 | other roundAndPackFloat functions. This should probably be fixed if we
3781 | need to implement more float16 routines than just conversion.
3782 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3783 | Binary Floating-Point Arithmetic.
3784 *----------------------------------------------------------------------------*/
3786 static float16 roundAndPackFloat16(flag zSign, int zExp,
3787 uint32_t zSig, flag ieee,
3788 float_status *status)
3790 int maxexp = ieee ? 29 : 30;
3791 uint32_t mask;
3792 uint32_t increment;
3793 bool rounding_bumps_exp;
3794 bool is_tiny = false;
3796 /* Calculate the mask of bits of the mantissa which are not
3797 * representable in half-precision and will be lost.
3799 if (zExp < 1) {
3800 /* Will be denormal in halfprec */
3801 mask = 0x00ffffff;
3802 if (zExp >= -11) {
3803 mask >>= 11 + zExp;
3805 } else {
3806 /* Normal number in halfprec */
3807 mask = 0x00001fff;
3810 switch (status->float_rounding_mode) {
3811 case float_round_nearest_even:
3812 increment = (mask + 1) >> 1;
3813 if ((zSig & mask) == increment) {
3814 increment = zSig & (increment << 1);
3816 break;
3817 case float_round_ties_away:
3818 increment = (mask + 1) >> 1;
3819 break;
3820 case float_round_up:
3821 increment = zSign ? 0 : mask;
3822 break;
3823 case float_round_down:
3824 increment = zSign ? mask : 0;
3825 break;
3826 default: /* round_to_zero */
3827 increment = 0;
3828 break;
3831 rounding_bumps_exp = (zSig + increment >= 0x01000000);
3833 if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) {
3834 if (ieee) {
3835 float_raise(float_flag_overflow | float_flag_inexact, status);
3836 return packFloat16(zSign, 0x1f, 0);
3837 } else {
3838 float_raise(float_flag_invalid, status);
3839 return packFloat16(zSign, 0x1f, 0x3ff);
3843 if (zExp < 0) {
3844 /* Note that flush-to-zero does not affect half-precision results */
3845 is_tiny =
3846 (status->float_detect_tininess == float_tininess_before_rounding)
3847 || (zExp < -1)
3848 || (!rounding_bumps_exp);
3850 if (zSig & mask) {
3851 float_raise(float_flag_inexact, status);
3852 if (is_tiny) {
3853 float_raise(float_flag_underflow, status);
3857 zSig += increment;
3858 if (rounding_bumps_exp) {
3859 zSig >>= 1;
3860 zExp++;
3863 if (zExp < -10) {
3864 return packFloat16(zSign, 0, 0);
3866 if (zExp < 0) {
3867 zSig >>= -zExp;
3868 zExp = 0;
3870 return packFloat16(zSign, zExp, zSig >> 13);
3873 /*----------------------------------------------------------------------------
3874 | If `a' is denormal and we are in flush-to-zero mode then set the
3875 | input-denormal exception and return zero. Otherwise just return the value.
3876 *----------------------------------------------------------------------------*/
3877 float16 float16_squash_input_denormal(float16 a, float_status *status)
3879 if (status->flush_inputs_to_zero) {
3880 if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) {
3881 float_raise(float_flag_input_denormal, status);
3882 return make_float16(float16_val(a) & 0x8000);
3885 return a;
3888 static void normalizeFloat16Subnormal(uint32_t aSig, int *zExpPtr,
3889 uint32_t *zSigPtr)
3891 int8_t shiftCount = countLeadingZeros32(aSig) - 21;
3892 *zSigPtr = aSig << shiftCount;
3893 *zExpPtr = 1 - shiftCount;
3896 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
3897 The latter gains extra exponent range by omitting the NaN/Inf encodings. */
3899 float32 float16_to_float32(float16 a, flag ieee, float_status *status)
3901 flag aSign;
3902 int aExp;
3903 uint32_t aSig;
3905 aSign = extractFloat16Sign(a);
3906 aExp = extractFloat16Exp(a);
3907 aSig = extractFloat16Frac(a);
3909 if (aExp == 0x1f && ieee) {
3910 if (aSig) {
3911 return commonNaNToFloat32(float16ToCommonNaN(a, status), status);
3913 return packFloat32(aSign, 0xff, 0);
3915 if (aExp == 0) {
3916 if (aSig == 0) {
3917 return packFloat32(aSign, 0, 0);
3920 normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3921 aExp--;
3923 return packFloat32( aSign, aExp + 0x70, aSig << 13);
3926 float16 float32_to_float16(float32 a, flag ieee, float_status *status)
3928 flag aSign;
3929 int aExp;
3930 uint32_t aSig;
3932 a = float32_squash_input_denormal(a, status);
3934 aSig = extractFloat32Frac( a );
3935 aExp = extractFloat32Exp( a );
3936 aSign = extractFloat32Sign( a );
3937 if ( aExp == 0xFF ) {
3938 if (aSig) {
3939 /* Input is a NaN */
3940 if (!ieee) {
3941 float_raise(float_flag_invalid, status);
3942 return packFloat16(aSign, 0, 0);
3944 return commonNaNToFloat16(
3945 float32ToCommonNaN(a, status), status);
3947 /* Infinity */
3948 if (!ieee) {
3949 float_raise(float_flag_invalid, status);
3950 return packFloat16(aSign, 0x1f, 0x3ff);
3952 return packFloat16(aSign, 0x1f, 0);
3954 if (aExp == 0 && aSig == 0) {
3955 return packFloat16(aSign, 0, 0);
3957 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3958 * even if the input is denormal; however this is harmless because
3959 * the largest possible single-precision denormal is still smaller
3960 * than the smallest representable half-precision denormal, and so we
3961 * will end up ignoring aSig and returning via the "always return zero"
3962 * codepath.
3964 aSig |= 0x00800000;
3965 aExp -= 0x71;
3967 return roundAndPackFloat16(aSign, aExp, aSig, ieee, status);
3970 float64 float16_to_float64(float16 a, flag ieee, float_status *status)
3972 flag aSign;
3973 int aExp;
3974 uint32_t aSig;
3976 aSign = extractFloat16Sign(a);
3977 aExp = extractFloat16Exp(a);
3978 aSig = extractFloat16Frac(a);
3980 if (aExp == 0x1f && ieee) {
3981 if (aSig) {
3982 return commonNaNToFloat64(
3983 float16ToCommonNaN(a, status), status);
3985 return packFloat64(aSign, 0x7ff, 0);
3987 if (aExp == 0) {
3988 if (aSig == 0) {
3989 return packFloat64(aSign, 0, 0);
3992 normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3993 aExp--;
3995 return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42);
3998 float16 float64_to_float16(float64 a, flag ieee, float_status *status)
4000 flag aSign;
4001 int aExp;
4002 uint64_t aSig;
4003 uint32_t zSig;
4005 a = float64_squash_input_denormal(a, status);
4007 aSig = extractFloat64Frac(a);
4008 aExp = extractFloat64Exp(a);
4009 aSign = extractFloat64Sign(a);
4010 if (aExp == 0x7FF) {
4011 if (aSig) {
4012 /* Input is a NaN */
4013 if (!ieee) {
4014 float_raise(float_flag_invalid, status);
4015 return packFloat16(aSign, 0, 0);
4017 return commonNaNToFloat16(
4018 float64ToCommonNaN(a, status), status);
4020 /* Infinity */
4021 if (!ieee) {
4022 float_raise(float_flag_invalid, status);
4023 return packFloat16(aSign, 0x1f, 0x3ff);
4025 return packFloat16(aSign, 0x1f, 0);
4027 shift64RightJamming(aSig, 29, &aSig);
4028 zSig = aSig;
4029 if (aExp == 0 && zSig == 0) {
4030 return packFloat16(aSign, 0, 0);
4032 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
4033 * even if the input is denormal; however this is harmless because
4034 * the largest possible single-precision denormal is still smaller
4035 * than the smallest representable half-precision denormal, and so we
4036 * will end up ignoring aSig and returning via the "always return zero"
4037 * codepath.
4039 zSig |= 0x00800000;
4040 aExp -= 0x3F1;
4042 return roundAndPackFloat16(aSign, aExp, zSig, ieee, status);
4045 /*----------------------------------------------------------------------------
4046 | Returns the result of converting the double-precision floating-point value
4047 | `a' to the extended double-precision floating-point format. The conversion
4048 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4049 | Arithmetic.
4050 *----------------------------------------------------------------------------*/
4052 floatx80 float64_to_floatx80(float64 a, float_status *status)
4054 flag aSign;
4055 int aExp;
4056 uint64_t aSig;
4058 a = float64_squash_input_denormal(a, status);
4059 aSig = extractFloat64Frac( a );
4060 aExp = extractFloat64Exp( a );
4061 aSign = extractFloat64Sign( a );
4062 if ( aExp == 0x7FF ) {
4063 if (aSig) {
4064 return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
4066 return packFloatx80(aSign,
4067 floatx80_infinity_high,
4068 floatx80_infinity_low);
4070 if ( aExp == 0 ) {
4071 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4072 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4074 return
4075 packFloatx80(
4076 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
4080 /*----------------------------------------------------------------------------
4081 | Returns the result of converting the double-precision floating-point value
4082 | `a' to the quadruple-precision floating-point format. The conversion is
4083 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4084 | Arithmetic.
4085 *----------------------------------------------------------------------------*/
4087 float128 float64_to_float128(float64 a, float_status *status)
4089 flag aSign;
4090 int aExp;
4091 uint64_t aSig, zSig0, zSig1;
4093 a = float64_squash_input_denormal(a, status);
4094 aSig = extractFloat64Frac( a );
4095 aExp = extractFloat64Exp( a );
4096 aSign = extractFloat64Sign( a );
4097 if ( aExp == 0x7FF ) {
4098 if (aSig) {
4099 return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
4101 return packFloat128( aSign, 0x7FFF, 0, 0 );
4103 if ( aExp == 0 ) {
4104 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4105 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4106 --aExp;
4108 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
4109 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
4114 /*----------------------------------------------------------------------------
4115 | Returns the remainder of the double-precision floating-point value `a'
4116 | with respect to the corresponding value `b'. The operation is performed
4117 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4118 *----------------------------------------------------------------------------*/
4120 float64 float64_rem(float64 a, float64 b, float_status *status)
4122 flag aSign, zSign;
4123 int aExp, bExp, expDiff;
4124 uint64_t aSig, bSig;
4125 uint64_t q, alternateASig;
4126 int64_t sigMean;
4128 a = float64_squash_input_denormal(a, status);
4129 b = float64_squash_input_denormal(b, status);
4130 aSig = extractFloat64Frac( a );
4131 aExp = extractFloat64Exp( a );
4132 aSign = extractFloat64Sign( a );
4133 bSig = extractFloat64Frac( b );
4134 bExp = extractFloat64Exp( b );
4135 if ( aExp == 0x7FF ) {
4136 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4137 return propagateFloat64NaN(a, b, status);
4139 float_raise(float_flag_invalid, status);
4140 return float64_default_nan(status);
4142 if ( bExp == 0x7FF ) {
4143 if (bSig) {
4144 return propagateFloat64NaN(a, b, status);
4146 return a;
4148 if ( bExp == 0 ) {
4149 if ( bSig == 0 ) {
4150 float_raise(float_flag_invalid, status);
4151 return float64_default_nan(status);
4153 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4155 if ( aExp == 0 ) {
4156 if ( aSig == 0 ) return a;
4157 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4159 expDiff = aExp - bExp;
4160 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
4161 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4162 if ( expDiff < 0 ) {
4163 if ( expDiff < -1 ) return a;
4164 aSig >>= 1;
4166 q = ( bSig <= aSig );
4167 if ( q ) aSig -= bSig;
4168 expDiff -= 64;
4169 while ( 0 < expDiff ) {
4170 q = estimateDiv128To64( aSig, 0, bSig );
4171 q = ( 2 < q ) ? q - 2 : 0;
4172 aSig = - ( ( bSig>>2 ) * q );
4173 expDiff -= 62;
4175 expDiff += 64;
4176 if ( 0 < expDiff ) {
4177 q = estimateDiv128To64( aSig, 0, bSig );
4178 q = ( 2 < q ) ? q - 2 : 0;
4179 q >>= 64 - expDiff;
4180 bSig >>= 2;
4181 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4183 else {
4184 aSig >>= 2;
4185 bSig >>= 2;
4187 do {
4188 alternateASig = aSig;
4189 ++q;
4190 aSig -= bSig;
4191 } while ( 0 <= (int64_t) aSig );
4192 sigMean = aSig + alternateASig;
4193 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4194 aSig = alternateASig;
4196 zSign = ( (int64_t) aSig < 0 );
4197 if ( zSign ) aSig = - aSig;
4198 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
4202 /*----------------------------------------------------------------------------
4203 | Returns the binary log of the double-precision floating-point value `a'.
4204 | The operation is performed according to the IEC/IEEE Standard for Binary
4205 | Floating-Point Arithmetic.
4206 *----------------------------------------------------------------------------*/
4207 float64 float64_log2(float64 a, float_status *status)
4209 flag aSign, zSign;
4210 int aExp;
4211 uint64_t aSig, aSig0, aSig1, zSig, i;
4212 a = float64_squash_input_denormal(a, status);
4214 aSig = extractFloat64Frac( a );
4215 aExp = extractFloat64Exp( a );
4216 aSign = extractFloat64Sign( a );
4218 if ( aExp == 0 ) {
4219 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4220 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4222 if ( aSign ) {
4223 float_raise(float_flag_invalid, status);
4224 return float64_default_nan(status);
4226 if ( aExp == 0x7FF ) {
4227 if (aSig) {
4228 return propagateFloat64NaN(a, float64_zero, status);
4230 return a;
4233 aExp -= 0x3FF;
4234 aSig |= LIT64( 0x0010000000000000 );
4235 zSign = aExp < 0;
4236 zSig = (uint64_t)aExp << 52;
4237 for (i = 1LL << 51; i > 0; i >>= 1) {
4238 mul64To128( aSig, aSig, &aSig0, &aSig1 );
4239 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4240 if ( aSig & LIT64( 0x0020000000000000 ) ) {
4241 aSig >>= 1;
4242 zSig |= i;
4246 if ( zSign )
4247 zSig = -zSig;
4248 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
4251 /*----------------------------------------------------------------------------
4252 | Returns 1 if the double-precision floating-point value `a' is equal to the
4253 | corresponding value `b', and 0 otherwise. The invalid exception is raised
4254 | if either operand is a NaN. Otherwise, the comparison is performed
4255 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4256 *----------------------------------------------------------------------------*/
4258 int float64_eq(float64 a, float64 b, float_status *status)
4260 uint64_t av, bv;
4261 a = float64_squash_input_denormal(a, status);
4262 b = float64_squash_input_denormal(b, status);
4264 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4265 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4267 float_raise(float_flag_invalid, status);
4268 return 0;
4270 av = float64_val(a);
4271 bv = float64_val(b);
4272 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4276 /*----------------------------------------------------------------------------
4277 | Returns 1 if the double-precision floating-point value `a' is less than or
4278 | equal to the corresponding value `b', and 0 otherwise. The invalid
4279 | exception is raised if either operand is a NaN. The comparison is performed
4280 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4281 *----------------------------------------------------------------------------*/
4283 int float64_le(float64 a, float64 b, float_status *status)
4285 flag aSign, bSign;
4286 uint64_t av, bv;
4287 a = float64_squash_input_denormal(a, status);
4288 b = float64_squash_input_denormal(b, status);
4290 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4291 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4293 float_raise(float_flag_invalid, status);
4294 return 0;
4296 aSign = extractFloat64Sign( a );
4297 bSign = extractFloat64Sign( b );
4298 av = float64_val(a);
4299 bv = float64_val(b);
4300 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4301 return ( av == bv ) || ( aSign ^ ( av < bv ) );
4305 /*----------------------------------------------------------------------------
4306 | Returns 1 if the double-precision floating-point value `a' is less than
4307 | the corresponding value `b', and 0 otherwise. The invalid exception is
4308 | raised if either operand is a NaN. The comparison is performed according
4309 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4310 *----------------------------------------------------------------------------*/
4312 int float64_lt(float64 a, float64 b, float_status *status)
4314 flag aSign, bSign;
4315 uint64_t av, bv;
4317 a = float64_squash_input_denormal(a, status);
4318 b = float64_squash_input_denormal(b, status);
4319 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4320 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4322 float_raise(float_flag_invalid, status);
4323 return 0;
4325 aSign = extractFloat64Sign( a );
4326 bSign = extractFloat64Sign( b );
4327 av = float64_val(a);
4328 bv = float64_val(b);
4329 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4330 return ( av != bv ) && ( aSign ^ ( av < bv ) );
4334 /*----------------------------------------------------------------------------
4335 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4336 | be compared, and 0 otherwise. The invalid exception is raised if either
4337 | operand is a NaN. The comparison is performed according to the IEC/IEEE
4338 | Standard for Binary Floating-Point Arithmetic.
4339 *----------------------------------------------------------------------------*/
4341 int float64_unordered(float64 a, float64 b, float_status *status)
4343 a = float64_squash_input_denormal(a, status);
4344 b = float64_squash_input_denormal(b, status);
4346 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4347 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4349 float_raise(float_flag_invalid, status);
4350 return 1;
4352 return 0;
4355 /*----------------------------------------------------------------------------
4356 | Returns 1 if the double-precision floating-point value `a' is equal to the
4357 | corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4358 | exception.The comparison is performed according to the IEC/IEEE Standard
4359 | for Binary Floating-Point Arithmetic.
4360 *----------------------------------------------------------------------------*/
4362 int float64_eq_quiet(float64 a, float64 b, float_status *status)
4364 uint64_t av, bv;
4365 a = float64_squash_input_denormal(a, status);
4366 b = float64_squash_input_denormal(b, status);
4368 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4369 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4371 if (float64_is_signaling_nan(a, status)
4372 || float64_is_signaling_nan(b, status)) {
4373 float_raise(float_flag_invalid, status);
4375 return 0;
4377 av = float64_val(a);
4378 bv = float64_val(b);
4379 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4383 /*----------------------------------------------------------------------------
4384 | Returns 1 if the double-precision floating-point value `a' is less than or
4385 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4386 | cause an exception. Otherwise, the comparison is performed according to the
4387 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4388 *----------------------------------------------------------------------------*/
4390 int float64_le_quiet(float64 a, float64 b, float_status *status)
4392 flag aSign, bSign;
4393 uint64_t av, bv;
4394 a = float64_squash_input_denormal(a, status);
4395 b = float64_squash_input_denormal(b, status);
4397 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4398 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4400 if (float64_is_signaling_nan(a, status)
4401 || float64_is_signaling_nan(b, status)) {
4402 float_raise(float_flag_invalid, status);
4404 return 0;
4406 aSign = extractFloat64Sign( a );
4407 bSign = extractFloat64Sign( b );
4408 av = float64_val(a);
4409 bv = float64_val(b);
4410 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4411 return ( av == bv ) || ( aSign ^ ( av < bv ) );
4415 /*----------------------------------------------------------------------------
4416 | Returns 1 if the double-precision floating-point value `a' is less than
4417 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4418 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
4419 | Standard for Binary Floating-Point Arithmetic.
4420 *----------------------------------------------------------------------------*/
4422 int float64_lt_quiet(float64 a, float64 b, float_status *status)
4424 flag aSign, bSign;
4425 uint64_t av, bv;
4426 a = float64_squash_input_denormal(a, status);
4427 b = float64_squash_input_denormal(b, status);
4429 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4430 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4432 if (float64_is_signaling_nan(a, status)
4433 || float64_is_signaling_nan(b, status)) {
4434 float_raise(float_flag_invalid, status);
4436 return 0;
4438 aSign = extractFloat64Sign( a );
4439 bSign = extractFloat64Sign( b );
4440 av = float64_val(a);
4441 bv = float64_val(b);
4442 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4443 return ( av != bv ) && ( aSign ^ ( av < bv ) );
4447 /*----------------------------------------------------------------------------
4448 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4449 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
4450 | comparison is performed according to the IEC/IEEE Standard for Binary
4451 | Floating-Point Arithmetic.
4452 *----------------------------------------------------------------------------*/
4454 int float64_unordered_quiet(float64 a, float64 b, float_status *status)
4456 a = float64_squash_input_denormal(a, status);
4457 b = float64_squash_input_denormal(b, status);
4459 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4460 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4462 if (float64_is_signaling_nan(a, status)
4463 || float64_is_signaling_nan(b, status)) {
4464 float_raise(float_flag_invalid, status);
4466 return 1;
4468 return 0;
4471 /*----------------------------------------------------------------------------
4472 | Returns the result of converting the extended double-precision floating-
4473 | point value `a' to the 32-bit two's complement integer format. The
4474 | conversion is performed according to the IEC/IEEE Standard for Binary
4475 | Floating-Point Arithmetic---which means in particular that the conversion
4476 | is rounded according to the current rounding mode. If `a' is a NaN, the
4477 | largest positive integer is returned. Otherwise, if the conversion
4478 | overflows, the largest integer with the same sign as `a' is returned.
4479 *----------------------------------------------------------------------------*/
4481 int32_t floatx80_to_int32(floatx80 a, float_status *status)
4483 flag aSign;
4484 int32_t aExp, shiftCount;
4485 uint64_t aSig;
4487 if (floatx80_invalid_encoding(a)) {
4488 float_raise(float_flag_invalid, status);
4489 return 1 << 31;
4491 aSig = extractFloatx80Frac( a );
4492 aExp = extractFloatx80Exp( a );
4493 aSign = extractFloatx80Sign( a );
4494 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4495 shiftCount = 0x4037 - aExp;
4496 if ( shiftCount <= 0 ) shiftCount = 1;
4497 shift64RightJamming( aSig, shiftCount, &aSig );
4498 return roundAndPackInt32(aSign, aSig, status);
4502 /*----------------------------------------------------------------------------
4503 | Returns the result of converting the extended double-precision floating-
4504 | point value `a' to the 32-bit two's complement integer format. The
4505 | conversion is performed according to the IEC/IEEE Standard for Binary
4506 | Floating-Point Arithmetic, except that the conversion is always rounded
4507 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4508 | Otherwise, if the conversion overflows, the largest integer with the same
4509 | sign as `a' is returned.
4510 *----------------------------------------------------------------------------*/
4512 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
4514 flag aSign;
4515 int32_t aExp, shiftCount;
4516 uint64_t aSig, savedASig;
4517 int32_t z;
4519 if (floatx80_invalid_encoding(a)) {
4520 float_raise(float_flag_invalid, status);
4521 return 1 << 31;
4523 aSig = extractFloatx80Frac( a );
4524 aExp = extractFloatx80Exp( a );
4525 aSign = extractFloatx80Sign( a );
4526 if ( 0x401E < aExp ) {
4527 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4528 goto invalid;
4530 else if ( aExp < 0x3FFF ) {
4531 if (aExp || aSig) {
4532 status->float_exception_flags |= float_flag_inexact;
4534 return 0;
4536 shiftCount = 0x403E - aExp;
4537 savedASig = aSig;
4538 aSig >>= shiftCount;
4539 z = aSig;
4540 if ( aSign ) z = - z;
4541 if ( ( z < 0 ) ^ aSign ) {
4542 invalid:
4543 float_raise(float_flag_invalid, status);
4544 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4546 if ( ( aSig<<shiftCount ) != savedASig ) {
4547 status->float_exception_flags |= float_flag_inexact;
4549 return z;
4553 /*----------------------------------------------------------------------------
4554 | Returns the result of converting the extended double-precision floating-
4555 | point value `a' to the 64-bit two's complement integer format. The
4556 | conversion is performed according to the IEC/IEEE Standard for Binary
4557 | Floating-Point Arithmetic---which means in particular that the conversion
4558 | is rounded according to the current rounding mode. If `a' is a NaN,
4559 | the largest positive integer is returned. Otherwise, if the conversion
4560 | overflows, the largest integer with the same sign as `a' is returned.
4561 *----------------------------------------------------------------------------*/
4563 int64_t floatx80_to_int64(floatx80 a, float_status *status)
4565 flag aSign;
4566 int32_t aExp, shiftCount;
4567 uint64_t aSig, aSigExtra;
4569 if (floatx80_invalid_encoding(a)) {
4570 float_raise(float_flag_invalid, status);
4571 return 1ULL << 63;
4573 aSig = extractFloatx80Frac( a );
4574 aExp = extractFloatx80Exp( a );
4575 aSign = extractFloatx80Sign( a );
4576 shiftCount = 0x403E - aExp;
4577 if ( shiftCount <= 0 ) {
4578 if ( shiftCount ) {
4579 float_raise(float_flag_invalid, status);
4580 if (!aSign || floatx80_is_any_nan(a)) {
4581 return LIT64( 0x7FFFFFFFFFFFFFFF );
4583 return (int64_t) LIT64( 0x8000000000000000 );
4585 aSigExtra = 0;
4587 else {
4588 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4590 return roundAndPackInt64(aSign, aSig, aSigExtra, status);
4594 /*----------------------------------------------------------------------------
4595 | Returns the result of converting the extended double-precision floating-
4596 | point value `a' to the 64-bit two's complement integer format. The
4597 | conversion is performed according to the IEC/IEEE Standard for Binary
4598 | Floating-Point Arithmetic, except that the conversion is always rounded
4599 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4600 | Otherwise, if the conversion overflows, the largest integer with the same
4601 | sign as `a' is returned.
4602 *----------------------------------------------------------------------------*/
4604 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
4606 flag aSign;
4607 int32_t aExp, shiftCount;
4608 uint64_t aSig;
4609 int64_t z;
4611 if (floatx80_invalid_encoding(a)) {
4612 float_raise(float_flag_invalid, status);
4613 return 1ULL << 63;
4615 aSig = extractFloatx80Frac( a );
4616 aExp = extractFloatx80Exp( a );
4617 aSign = extractFloatx80Sign( a );
4618 shiftCount = aExp - 0x403E;
4619 if ( 0 <= shiftCount ) {
4620 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4621 if ( ( a.high != 0xC03E ) || aSig ) {
4622 float_raise(float_flag_invalid, status);
4623 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4624 return LIT64( 0x7FFFFFFFFFFFFFFF );
4627 return (int64_t) LIT64( 0x8000000000000000 );
4629 else if ( aExp < 0x3FFF ) {
4630 if (aExp | aSig) {
4631 status->float_exception_flags |= float_flag_inexact;
4633 return 0;
4635 z = aSig>>( - shiftCount );
4636 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4637 status->float_exception_flags |= float_flag_inexact;
4639 if ( aSign ) z = - z;
4640 return z;
4644 /*----------------------------------------------------------------------------
4645 | Returns the result of converting the extended double-precision floating-
4646 | point value `a' to the single-precision floating-point format. The
4647 | conversion is performed according to the IEC/IEEE Standard for Binary
4648 | Floating-Point Arithmetic.
4649 *----------------------------------------------------------------------------*/
4651 float32 floatx80_to_float32(floatx80 a, float_status *status)
4653 flag aSign;
4654 int32_t aExp;
4655 uint64_t aSig;
4657 if (floatx80_invalid_encoding(a)) {
4658 float_raise(float_flag_invalid, status);
4659 return float32_default_nan(status);
4661 aSig = extractFloatx80Frac( a );
4662 aExp = extractFloatx80Exp( a );
4663 aSign = extractFloatx80Sign( a );
4664 if ( aExp == 0x7FFF ) {
4665 if ( (uint64_t) ( aSig<<1 ) ) {
4666 return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
4668 return packFloat32( aSign, 0xFF, 0 );
4670 shift64RightJamming( aSig, 33, &aSig );
4671 if ( aExp || aSig ) aExp -= 0x3F81;
4672 return roundAndPackFloat32(aSign, aExp, aSig, status);
4676 /*----------------------------------------------------------------------------
4677 | Returns the result of converting the extended double-precision floating-
4678 | point value `a' to the double-precision floating-point format. The
4679 | conversion is performed according to the IEC/IEEE Standard for Binary
4680 | Floating-Point Arithmetic.
4681 *----------------------------------------------------------------------------*/
4683 float64 floatx80_to_float64(floatx80 a, float_status *status)
4685 flag aSign;
4686 int32_t aExp;
4687 uint64_t aSig, zSig;
4689 if (floatx80_invalid_encoding(a)) {
4690 float_raise(float_flag_invalid, status);
4691 return float64_default_nan(status);
4693 aSig = extractFloatx80Frac( a );
4694 aExp = extractFloatx80Exp( a );
4695 aSign = extractFloatx80Sign( a );
4696 if ( aExp == 0x7FFF ) {
4697 if ( (uint64_t) ( aSig<<1 ) ) {
4698 return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
4700 return packFloat64( aSign, 0x7FF, 0 );
4702 shift64RightJamming( aSig, 1, &zSig );
4703 if ( aExp || aSig ) aExp -= 0x3C01;
4704 return roundAndPackFloat64(aSign, aExp, zSig, status);
4708 /*----------------------------------------------------------------------------
4709 | Returns the result of converting the extended double-precision floating-
4710 | point value `a' to the quadruple-precision floating-point format. The
4711 | conversion is performed according to the IEC/IEEE Standard for Binary
4712 | Floating-Point Arithmetic.
4713 *----------------------------------------------------------------------------*/
4715 float128 floatx80_to_float128(floatx80 a, float_status *status)
4717 flag aSign;
4718 int aExp;
4719 uint64_t aSig, zSig0, zSig1;
4721 if (floatx80_invalid_encoding(a)) {
4722 float_raise(float_flag_invalid, status);
4723 return float128_default_nan(status);
4725 aSig = extractFloatx80Frac( a );
4726 aExp = extractFloatx80Exp( a );
4727 aSign = extractFloatx80Sign( a );
4728 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4729 return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
4731 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4732 return packFloat128( aSign, aExp, zSig0, zSig1 );
4736 /*----------------------------------------------------------------------------
4737 | Rounds the extended double-precision floating-point value `a'
4738 | to the precision provided by floatx80_rounding_precision and returns the
4739 | result as an extended double-precision floating-point value.
4740 | The operation is performed according to the IEC/IEEE Standard for Binary
4741 | Floating-Point Arithmetic.
4742 *----------------------------------------------------------------------------*/
4744 floatx80 floatx80_round(floatx80 a, float_status *status)
4746 return roundAndPackFloatx80(status->floatx80_rounding_precision,
4747 extractFloatx80Sign(a),
4748 extractFloatx80Exp(a),
4749 extractFloatx80Frac(a), 0, status);
4752 /*----------------------------------------------------------------------------
4753 | Rounds the extended double-precision floating-point value `a' to an integer,
4754 | and returns the result as an extended quadruple-precision floating-point
4755 | value. The operation is performed according to the IEC/IEEE Standard for
4756 | Binary Floating-Point Arithmetic.
4757 *----------------------------------------------------------------------------*/
4759 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
4761 flag aSign;
4762 int32_t aExp;
4763 uint64_t lastBitMask, roundBitsMask;
4764 floatx80 z;
4766 if (floatx80_invalid_encoding(a)) {
4767 float_raise(float_flag_invalid, status);
4768 return floatx80_default_nan(status);
4770 aExp = extractFloatx80Exp( a );
4771 if ( 0x403E <= aExp ) {
4772 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4773 return propagateFloatx80NaN(a, a, status);
4775 return a;
4777 if ( aExp < 0x3FFF ) {
4778 if ( ( aExp == 0 )
4779 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4780 return a;
4782 status->float_exception_flags |= float_flag_inexact;
4783 aSign = extractFloatx80Sign( a );
4784 switch (status->float_rounding_mode) {
4785 case float_round_nearest_even:
4786 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4788 return
4789 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4791 break;
4792 case float_round_ties_away:
4793 if (aExp == 0x3FFE) {
4794 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
4796 break;
4797 case float_round_down:
4798 return
4799 aSign ?
4800 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4801 : packFloatx80( 0, 0, 0 );
4802 case float_round_up:
4803 return
4804 aSign ? packFloatx80( 1, 0, 0 )
4805 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4807 return packFloatx80( aSign, 0, 0 );
4809 lastBitMask = 1;
4810 lastBitMask <<= 0x403E - aExp;
4811 roundBitsMask = lastBitMask - 1;
4812 z = a;
4813 switch (status->float_rounding_mode) {
4814 case float_round_nearest_even:
4815 z.low += lastBitMask>>1;
4816 if ((z.low & roundBitsMask) == 0) {
4817 z.low &= ~lastBitMask;
4819 break;
4820 case float_round_ties_away:
4821 z.low += lastBitMask >> 1;
4822 break;
4823 case float_round_to_zero:
4824 break;
4825 case float_round_up:
4826 if (!extractFloatx80Sign(z)) {
4827 z.low += roundBitsMask;
4829 break;
4830 case float_round_down:
4831 if (extractFloatx80Sign(z)) {
4832 z.low += roundBitsMask;
4834 break;
4835 default:
4836 abort();
4838 z.low &= ~ roundBitsMask;
4839 if ( z.low == 0 ) {
4840 ++z.high;
4841 z.low = LIT64( 0x8000000000000000 );
4843 if (z.low != a.low) {
4844 status->float_exception_flags |= float_flag_inexact;
4846 return z;
4850 /*----------------------------------------------------------------------------
4851 | Returns the result of adding the absolute values of the extended double-
4852 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
4853 | negated before being returned. `zSign' is ignored if the result is a NaN.
4854 | The addition is performed according to the IEC/IEEE Standard for Binary
4855 | Floating-Point Arithmetic.
4856 *----------------------------------------------------------------------------*/
4858 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4859 float_status *status)
4861 int32_t aExp, bExp, zExp;
4862 uint64_t aSig, bSig, zSig0, zSig1;
4863 int32_t expDiff;
4865 aSig = extractFloatx80Frac( a );
4866 aExp = extractFloatx80Exp( a );
4867 bSig = extractFloatx80Frac( b );
4868 bExp = extractFloatx80Exp( b );
4869 expDiff = aExp - bExp;
4870 if ( 0 < expDiff ) {
4871 if ( aExp == 0x7FFF ) {
4872 if ((uint64_t)(aSig << 1)) {
4873 return propagateFloatx80NaN(a, b, status);
4875 return a;
4877 if ( bExp == 0 ) --expDiff;
4878 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4879 zExp = aExp;
4881 else if ( expDiff < 0 ) {
4882 if ( bExp == 0x7FFF ) {
4883 if ((uint64_t)(bSig << 1)) {
4884 return propagateFloatx80NaN(a, b, status);
4886 return packFloatx80(zSign,
4887 floatx80_infinity_high,
4888 floatx80_infinity_low);
4890 if ( aExp == 0 ) ++expDiff;
4891 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4892 zExp = bExp;
4894 else {
4895 if ( aExp == 0x7FFF ) {
4896 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4897 return propagateFloatx80NaN(a, b, status);
4899 return a;
4901 zSig1 = 0;
4902 zSig0 = aSig + bSig;
4903 if ( aExp == 0 ) {
4904 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4905 goto roundAndPack;
4907 zExp = aExp;
4908 goto shiftRight1;
4910 zSig0 = aSig + bSig;
4911 if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
4912 shiftRight1:
4913 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4914 zSig0 |= LIT64( 0x8000000000000000 );
4915 ++zExp;
4916 roundAndPack:
4917 return roundAndPackFloatx80(status->floatx80_rounding_precision,
4918 zSign, zExp, zSig0, zSig1, status);
4921 /*----------------------------------------------------------------------------
4922 | Returns the result of subtracting the absolute values of the extended
4923 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
4924 | difference is negated before being returned. `zSign' is ignored if the
4925 | result is a NaN. The subtraction is performed according to the IEC/IEEE
4926 | Standard for Binary Floating-Point Arithmetic.
4927 *----------------------------------------------------------------------------*/
4929 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4930 float_status *status)
4932 int32_t aExp, bExp, zExp;
4933 uint64_t aSig, bSig, zSig0, zSig1;
4934 int32_t expDiff;
4936 aSig = extractFloatx80Frac( a );
4937 aExp = extractFloatx80Exp( a );
4938 bSig = extractFloatx80Frac( b );
4939 bExp = extractFloatx80Exp( b );
4940 expDiff = aExp - bExp;
4941 if ( 0 < expDiff ) goto aExpBigger;
4942 if ( expDiff < 0 ) goto bExpBigger;
4943 if ( aExp == 0x7FFF ) {
4944 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4945 return propagateFloatx80NaN(a, b, status);
4947 float_raise(float_flag_invalid, status);
4948 return floatx80_default_nan(status);
4950 if ( aExp == 0 ) {
4951 aExp = 1;
4952 bExp = 1;
4954 zSig1 = 0;
4955 if ( bSig < aSig ) goto aBigger;
4956 if ( aSig < bSig ) goto bBigger;
4957 return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
4958 bExpBigger:
4959 if ( bExp == 0x7FFF ) {
4960 if ((uint64_t)(bSig << 1)) {
4961 return propagateFloatx80NaN(a, b, status);
4963 return packFloatx80(zSign ^ 1, floatx80_infinity_high,
4964 floatx80_infinity_low);
4966 if ( aExp == 0 ) ++expDiff;
4967 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4968 bBigger:
4969 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4970 zExp = bExp;
4971 zSign ^= 1;
4972 goto normalizeRoundAndPack;
4973 aExpBigger:
4974 if ( aExp == 0x7FFF ) {
4975 if ((uint64_t)(aSig << 1)) {
4976 return propagateFloatx80NaN(a, b, status);
4978 return a;
4980 if ( bExp == 0 ) --expDiff;
4981 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4982 aBigger:
4983 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4984 zExp = aExp;
4985 normalizeRoundAndPack:
4986 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
4987 zSign, zExp, zSig0, zSig1, status);
4990 /*----------------------------------------------------------------------------
4991 | Returns the result of adding the extended double-precision floating-point
4992 | values `a' and `b'. The operation is performed according to the IEC/IEEE
4993 | Standard for Binary Floating-Point Arithmetic.
4994 *----------------------------------------------------------------------------*/
4996 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
4998 flag aSign, bSign;
5000 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5001 float_raise(float_flag_invalid, status);
5002 return floatx80_default_nan(status);
5004 aSign = extractFloatx80Sign( a );
5005 bSign = extractFloatx80Sign( b );
5006 if ( aSign == bSign ) {
5007 return addFloatx80Sigs(a, b, aSign, status);
5009 else {
5010 return subFloatx80Sigs(a, b, aSign, status);
5015 /*----------------------------------------------------------------------------
5016 | Returns the result of subtracting the extended double-precision floating-
5017 | point values `a' and `b'. The operation is performed according to the
5018 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5019 *----------------------------------------------------------------------------*/
5021 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5023 flag aSign, bSign;
5025 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5026 float_raise(float_flag_invalid, status);
5027 return floatx80_default_nan(status);
5029 aSign = extractFloatx80Sign( a );
5030 bSign = extractFloatx80Sign( b );
5031 if ( aSign == bSign ) {
5032 return subFloatx80Sigs(a, b, aSign, status);
5034 else {
5035 return addFloatx80Sigs(a, b, aSign, status);
5040 /*----------------------------------------------------------------------------
5041 | Returns the result of multiplying the extended double-precision floating-
5042 | point values `a' and `b'. The operation is performed according to the
5043 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5044 *----------------------------------------------------------------------------*/
5046 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
5048 flag aSign, bSign, zSign;
5049 int32_t aExp, bExp, zExp;
5050 uint64_t aSig, bSig, zSig0, zSig1;
5052 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5053 float_raise(float_flag_invalid, status);
5054 return floatx80_default_nan(status);
5056 aSig = extractFloatx80Frac( a );
5057 aExp = extractFloatx80Exp( a );
5058 aSign = extractFloatx80Sign( a );
5059 bSig = extractFloatx80Frac( b );
5060 bExp = extractFloatx80Exp( b );
5061 bSign = extractFloatx80Sign( b );
5062 zSign = aSign ^ bSign;
5063 if ( aExp == 0x7FFF ) {
5064 if ( (uint64_t) ( aSig<<1 )
5065 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5066 return propagateFloatx80NaN(a, b, status);
5068 if ( ( bExp | bSig ) == 0 ) goto invalid;
5069 return packFloatx80(zSign, floatx80_infinity_high,
5070 floatx80_infinity_low);
5072 if ( bExp == 0x7FFF ) {
5073 if ((uint64_t)(bSig << 1)) {
5074 return propagateFloatx80NaN(a, b, status);
5076 if ( ( aExp | aSig ) == 0 ) {
5077 invalid:
5078 float_raise(float_flag_invalid, status);
5079 return floatx80_default_nan(status);
5081 return packFloatx80(zSign, floatx80_infinity_high,
5082 floatx80_infinity_low);
5084 if ( aExp == 0 ) {
5085 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5086 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5088 if ( bExp == 0 ) {
5089 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5090 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5092 zExp = aExp + bExp - 0x3FFE;
5093 mul64To128( aSig, bSig, &zSig0, &zSig1 );
5094 if ( 0 < (int64_t) zSig0 ) {
5095 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5096 --zExp;
5098 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5099 zSign, zExp, zSig0, zSig1, status);
5102 /*----------------------------------------------------------------------------
5103 | Returns the result of dividing the extended double-precision floating-point
5104 | value `a' by the corresponding value `b'. The operation is performed
5105 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5106 *----------------------------------------------------------------------------*/
5108 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
5110 flag aSign, bSign, zSign;
5111 int32_t aExp, bExp, zExp;
5112 uint64_t aSig, bSig, zSig0, zSig1;
5113 uint64_t rem0, rem1, rem2, term0, term1, term2;
5115 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5116 float_raise(float_flag_invalid, status);
5117 return floatx80_default_nan(status);
5119 aSig = extractFloatx80Frac( a );
5120 aExp = extractFloatx80Exp( a );
5121 aSign = extractFloatx80Sign( a );
5122 bSig = extractFloatx80Frac( b );
5123 bExp = extractFloatx80Exp( b );
5124 bSign = extractFloatx80Sign( b );
5125 zSign = aSign ^ bSign;
5126 if ( aExp == 0x7FFF ) {
5127 if ((uint64_t)(aSig << 1)) {
5128 return propagateFloatx80NaN(a, b, status);
5130 if ( bExp == 0x7FFF ) {
5131 if ((uint64_t)(bSig << 1)) {
5132 return propagateFloatx80NaN(a, b, status);
5134 goto invalid;
5136 return packFloatx80(zSign, floatx80_infinity_high,
5137 floatx80_infinity_low);
5139 if ( bExp == 0x7FFF ) {
5140 if ((uint64_t)(bSig << 1)) {
5141 return propagateFloatx80NaN(a, b, status);
5143 return packFloatx80( zSign, 0, 0 );
5145 if ( bExp == 0 ) {
5146 if ( bSig == 0 ) {
5147 if ( ( aExp | aSig ) == 0 ) {
5148 invalid:
5149 float_raise(float_flag_invalid, status);
5150 return floatx80_default_nan(status);
5152 float_raise(float_flag_divbyzero, status);
5153 return packFloatx80(zSign, floatx80_infinity_high,
5154 floatx80_infinity_low);
5156 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5158 if ( aExp == 0 ) {
5159 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5160 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5162 zExp = aExp - bExp + 0x3FFE;
5163 rem1 = 0;
5164 if ( bSig <= aSig ) {
5165 shift128Right( aSig, 0, 1, &aSig, &rem1 );
5166 ++zExp;
5168 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5169 mul64To128( bSig, zSig0, &term0, &term1 );
5170 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
5171 while ( (int64_t) rem0 < 0 ) {
5172 --zSig0;
5173 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5175 zSig1 = estimateDiv128To64( rem1, 0, bSig );
5176 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
5177 mul64To128( bSig, zSig1, &term1, &term2 );
5178 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5179 while ( (int64_t) rem1 < 0 ) {
5180 --zSig1;
5181 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5183 zSig1 |= ( ( rem1 | rem2 ) != 0 );
5185 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5186 zSign, zExp, zSig0, zSig1, status);
5189 /*----------------------------------------------------------------------------
5190 | Returns the remainder of the extended double-precision floating-point value
5191 | `a' with respect to the corresponding value `b'. The operation is performed
5192 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5193 *----------------------------------------------------------------------------*/
5195 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
5197 flag aSign, zSign;
5198 int32_t aExp, bExp, expDiff;
5199 uint64_t aSig0, aSig1, bSig;
5200 uint64_t q, term0, term1, alternateASig0, alternateASig1;
5202 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5203 float_raise(float_flag_invalid, status);
5204 return floatx80_default_nan(status);
5206 aSig0 = extractFloatx80Frac( a );
5207 aExp = extractFloatx80Exp( a );
5208 aSign = extractFloatx80Sign( a );
5209 bSig = extractFloatx80Frac( b );
5210 bExp = extractFloatx80Exp( b );
5211 if ( aExp == 0x7FFF ) {
5212 if ( (uint64_t) ( aSig0<<1 )
5213 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5214 return propagateFloatx80NaN(a, b, status);
5216 goto invalid;
5218 if ( bExp == 0x7FFF ) {
5219 if ((uint64_t)(bSig << 1)) {
5220 return propagateFloatx80NaN(a, b, status);
5222 return a;
5224 if ( bExp == 0 ) {
5225 if ( bSig == 0 ) {
5226 invalid:
5227 float_raise(float_flag_invalid, status);
5228 return floatx80_default_nan(status);
5230 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5232 if ( aExp == 0 ) {
5233 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5234 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5236 bSig |= LIT64( 0x8000000000000000 );
5237 zSign = aSign;
5238 expDiff = aExp - bExp;
5239 aSig1 = 0;
5240 if ( expDiff < 0 ) {
5241 if ( expDiff < -1 ) return a;
5242 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5243 expDiff = 0;
5245 q = ( bSig <= aSig0 );
5246 if ( q ) aSig0 -= bSig;
5247 expDiff -= 64;
5248 while ( 0 < expDiff ) {
5249 q = estimateDiv128To64( aSig0, aSig1, bSig );
5250 q = ( 2 < q ) ? q - 2 : 0;
5251 mul64To128( bSig, q, &term0, &term1 );
5252 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5253 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5254 expDiff -= 62;
5256 expDiff += 64;
5257 if ( 0 < expDiff ) {
5258 q = estimateDiv128To64( aSig0, aSig1, bSig );
5259 q = ( 2 < q ) ? q - 2 : 0;
5260 q >>= 64 - expDiff;
5261 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5262 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5263 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5264 while ( le128( term0, term1, aSig0, aSig1 ) ) {
5265 ++q;
5266 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5269 else {
5270 term1 = 0;
5271 term0 = bSig;
5273 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5274 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5275 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5276 && ( q & 1 ) )
5278 aSig0 = alternateASig0;
5279 aSig1 = alternateASig1;
5280 zSign = ! zSign;
5282 return
5283 normalizeRoundAndPackFloatx80(
5284 80, zSign, bExp + expDiff, aSig0, aSig1, status);
5288 /*----------------------------------------------------------------------------
5289 | Returns the square root of the extended double-precision floating-point
5290 | value `a'. The operation is performed according to the IEC/IEEE Standard
5291 | for Binary Floating-Point Arithmetic.
5292 *----------------------------------------------------------------------------*/
5294 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
5296 flag aSign;
5297 int32_t aExp, zExp;
5298 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5299 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5301 if (floatx80_invalid_encoding(a)) {
5302 float_raise(float_flag_invalid, status);
5303 return floatx80_default_nan(status);
5305 aSig0 = extractFloatx80Frac( a );
5306 aExp = extractFloatx80Exp( a );
5307 aSign = extractFloatx80Sign( a );
5308 if ( aExp == 0x7FFF ) {
5309 if ((uint64_t)(aSig0 << 1)) {
5310 return propagateFloatx80NaN(a, a, status);
5312 if ( ! aSign ) return a;
5313 goto invalid;
5315 if ( aSign ) {
5316 if ( ( aExp | aSig0 ) == 0 ) return a;
5317 invalid:
5318 float_raise(float_flag_invalid, status);
5319 return floatx80_default_nan(status);
5321 if ( aExp == 0 ) {
5322 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5323 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5325 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5326 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5327 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5328 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5329 doubleZSig0 = zSig0<<1;
5330 mul64To128( zSig0, zSig0, &term0, &term1 );
5331 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5332 while ( (int64_t) rem0 < 0 ) {
5333 --zSig0;
5334 doubleZSig0 -= 2;
5335 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5337 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5338 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5339 if ( zSig1 == 0 ) zSig1 = 1;
5340 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5341 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5342 mul64To128( zSig1, zSig1, &term2, &term3 );
5343 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5344 while ( (int64_t) rem1 < 0 ) {
5345 --zSig1;
5346 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5347 term3 |= 1;
5348 term2 |= doubleZSig0;
5349 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5351 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5353 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5354 zSig0 |= doubleZSig0;
5355 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5356 0, zExp, zSig0, zSig1, status);
5359 /*----------------------------------------------------------------------------
5360 | Returns 1 if the extended double-precision floating-point value `a' is equal
5361 | to the corresponding value `b', and 0 otherwise. The invalid exception is
5362 | raised if either operand is a NaN. Otherwise, the comparison is performed
5363 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5364 *----------------------------------------------------------------------------*/
5366 int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
5369 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5370 || (extractFloatx80Exp(a) == 0x7FFF
5371 && (uint64_t) (extractFloatx80Frac(a) << 1))
5372 || (extractFloatx80Exp(b) == 0x7FFF
5373 && (uint64_t) (extractFloatx80Frac(b) << 1))
5375 float_raise(float_flag_invalid, status);
5376 return 0;
5378 return
5379 ( a.low == b.low )
5380 && ( ( a.high == b.high )
5381 || ( ( a.low == 0 )
5382 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5387 /*----------------------------------------------------------------------------
5388 | Returns 1 if the extended double-precision floating-point value `a' is
5389 | less than or equal to the corresponding value `b', and 0 otherwise. The
5390 | invalid exception is raised if either operand is a NaN. The comparison is
5391 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5392 | Arithmetic.
5393 *----------------------------------------------------------------------------*/
5395 int floatx80_le(floatx80 a, floatx80 b, float_status *status)
5397 flag aSign, bSign;
5399 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5400 || (extractFloatx80Exp(a) == 0x7FFF
5401 && (uint64_t) (extractFloatx80Frac(a) << 1))
5402 || (extractFloatx80Exp(b) == 0x7FFF
5403 && (uint64_t) (extractFloatx80Frac(b) << 1))
5405 float_raise(float_flag_invalid, status);
5406 return 0;
5408 aSign = extractFloatx80Sign( a );
5409 bSign = extractFloatx80Sign( b );
5410 if ( aSign != bSign ) {
5411 return
5412 aSign
5413 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5414 == 0 );
5416 return
5417 aSign ? le128( b.high, b.low, a.high, a.low )
5418 : le128( a.high, a.low, b.high, b.low );
5422 /*----------------------------------------------------------------------------
5423 | Returns 1 if the extended double-precision floating-point value `a' is
5424 | less than the corresponding value `b', and 0 otherwise. The invalid
5425 | exception is raised if either operand is a NaN. The comparison is performed
5426 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5427 *----------------------------------------------------------------------------*/
5429 int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
5431 flag aSign, bSign;
5433 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5434 || (extractFloatx80Exp(a) == 0x7FFF
5435 && (uint64_t) (extractFloatx80Frac(a) << 1))
5436 || (extractFloatx80Exp(b) == 0x7FFF
5437 && (uint64_t) (extractFloatx80Frac(b) << 1))
5439 float_raise(float_flag_invalid, status);
5440 return 0;
5442 aSign = extractFloatx80Sign( a );
5443 bSign = extractFloatx80Sign( b );
5444 if ( aSign != bSign ) {
5445 return
5446 aSign
5447 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5448 != 0 );
5450 return
5451 aSign ? lt128( b.high, b.low, a.high, a.low )
5452 : lt128( a.high, a.low, b.high, b.low );
5456 /*----------------------------------------------------------------------------
5457 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5458 | cannot be compared, and 0 otherwise. The invalid exception is raised if
5459 | either operand is a NaN. The comparison is performed according to the
5460 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5461 *----------------------------------------------------------------------------*/
5462 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
5464 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5465 || (extractFloatx80Exp(a) == 0x7FFF
5466 && (uint64_t) (extractFloatx80Frac(a) << 1))
5467 || (extractFloatx80Exp(b) == 0x7FFF
5468 && (uint64_t) (extractFloatx80Frac(b) << 1))
5470 float_raise(float_flag_invalid, status);
5471 return 1;
5473 return 0;
5476 /*----------------------------------------------------------------------------
5477 | Returns 1 if the extended double-precision floating-point value `a' is
5478 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5479 | cause an exception. The comparison is performed according to the IEC/IEEE
5480 | Standard for Binary Floating-Point Arithmetic.
5481 *----------------------------------------------------------------------------*/
5483 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
5486 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5487 float_raise(float_flag_invalid, status);
5488 return 0;
5490 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5491 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5492 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5493 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5495 if (floatx80_is_signaling_nan(a, status)
5496 || floatx80_is_signaling_nan(b, status)) {
5497 float_raise(float_flag_invalid, status);
5499 return 0;
5501 return
5502 ( a.low == b.low )
5503 && ( ( a.high == b.high )
5504 || ( ( a.low == 0 )
5505 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5510 /*----------------------------------------------------------------------------
5511 | Returns 1 if the extended double-precision floating-point value `a' is less
5512 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
5513 | do not cause an exception. Otherwise, the comparison is performed according
5514 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5515 *----------------------------------------------------------------------------*/
5517 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
5519 flag aSign, bSign;
5521 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5522 float_raise(float_flag_invalid, status);
5523 return 0;
5525 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5526 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5527 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5528 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5530 if (floatx80_is_signaling_nan(a, status)
5531 || floatx80_is_signaling_nan(b, status)) {
5532 float_raise(float_flag_invalid, status);
5534 return 0;
5536 aSign = extractFloatx80Sign( a );
5537 bSign = extractFloatx80Sign( b );
5538 if ( aSign != bSign ) {
5539 return
5540 aSign
5541 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5542 == 0 );
5544 return
5545 aSign ? le128( b.high, b.low, a.high, a.low )
5546 : le128( a.high, a.low, b.high, b.low );
5550 /*----------------------------------------------------------------------------
5551 | Returns 1 if the extended double-precision floating-point value `a' is less
5552 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
5553 | an exception. Otherwise, the comparison is performed according to the
5554 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5555 *----------------------------------------------------------------------------*/
5557 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
5559 flag aSign, bSign;
5561 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5562 float_raise(float_flag_invalid, status);
5563 return 0;
5565 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5566 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5567 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5568 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5570 if (floatx80_is_signaling_nan(a, status)
5571 || floatx80_is_signaling_nan(b, status)) {
5572 float_raise(float_flag_invalid, status);
5574 return 0;
5576 aSign = extractFloatx80Sign( a );
5577 bSign = extractFloatx80Sign( b );
5578 if ( aSign != bSign ) {
5579 return
5580 aSign
5581 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5582 != 0 );
5584 return
5585 aSign ? lt128( b.high, b.low, a.high, a.low )
5586 : lt128( a.high, a.low, b.high, b.low );
5590 /*----------------------------------------------------------------------------
5591 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5592 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
5593 | The comparison is performed according to the IEC/IEEE Standard for Binary
5594 | Floating-Point Arithmetic.
5595 *----------------------------------------------------------------------------*/
5596 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
5598 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5599 float_raise(float_flag_invalid, status);
5600 return 1;
5602 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5603 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5604 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5605 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5607 if (floatx80_is_signaling_nan(a, status)
5608 || floatx80_is_signaling_nan(b, status)) {
5609 float_raise(float_flag_invalid, status);
5611 return 1;
5613 return 0;
5616 /*----------------------------------------------------------------------------
5617 | Returns the result of converting the quadruple-precision floating-point
5618 | value `a' to the 32-bit two's complement integer format. The conversion
5619 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5620 | Arithmetic---which means in particular that the conversion is rounded
5621 | according to the current rounding mode. If `a' is a NaN, the largest
5622 | positive integer is returned. Otherwise, if the conversion overflows, the
5623 | largest integer with the same sign as `a' is returned.
5624 *----------------------------------------------------------------------------*/
5626 int32_t float128_to_int32(float128 a, float_status *status)
5628 flag aSign;
5629 int32_t aExp, shiftCount;
5630 uint64_t aSig0, aSig1;
5632 aSig1 = extractFloat128Frac1( a );
5633 aSig0 = extractFloat128Frac0( a );
5634 aExp = extractFloat128Exp( a );
5635 aSign = extractFloat128Sign( a );
5636 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5637 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5638 aSig0 |= ( aSig1 != 0 );
5639 shiftCount = 0x4028 - aExp;
5640 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5641 return roundAndPackInt32(aSign, aSig0, status);
5645 /*----------------------------------------------------------------------------
5646 | Returns the result of converting the quadruple-precision floating-point
5647 | value `a' to the 32-bit two's complement integer format. The conversion
5648 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5649 | Arithmetic, except that the conversion is always rounded toward zero. If
5650 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
5651 | conversion overflows, the largest integer with the same sign as `a' is
5652 | returned.
5653 *----------------------------------------------------------------------------*/
5655 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
5657 flag aSign;
5658 int32_t aExp, shiftCount;
5659 uint64_t aSig0, aSig1, savedASig;
5660 int32_t z;
5662 aSig1 = extractFloat128Frac1( a );
5663 aSig0 = extractFloat128Frac0( a );
5664 aExp = extractFloat128Exp( a );
5665 aSign = extractFloat128Sign( a );
5666 aSig0 |= ( aSig1 != 0 );
5667 if ( 0x401E < aExp ) {
5668 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5669 goto invalid;
5671 else if ( aExp < 0x3FFF ) {
5672 if (aExp || aSig0) {
5673 status->float_exception_flags |= float_flag_inexact;
5675 return 0;
5677 aSig0 |= LIT64( 0x0001000000000000 );
5678 shiftCount = 0x402F - aExp;
5679 savedASig = aSig0;
5680 aSig0 >>= shiftCount;
5681 z = aSig0;
5682 if ( aSign ) z = - z;
5683 if ( ( z < 0 ) ^ aSign ) {
5684 invalid:
5685 float_raise(float_flag_invalid, status);
5686 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5688 if ( ( aSig0<<shiftCount ) != savedASig ) {
5689 status->float_exception_flags |= float_flag_inexact;
5691 return z;
5695 /*----------------------------------------------------------------------------
5696 | Returns the result of converting the quadruple-precision floating-point
5697 | value `a' to the 64-bit two's complement integer format. The conversion
5698 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5699 | Arithmetic---which means in particular that the conversion is rounded
5700 | according to the current rounding mode. If `a' is a NaN, the largest
5701 | positive integer is returned. Otherwise, if the conversion overflows, the
5702 | largest integer with the same sign as `a' is returned.
5703 *----------------------------------------------------------------------------*/
5705 int64_t float128_to_int64(float128 a, float_status *status)
5707 flag aSign;
5708 int32_t aExp, shiftCount;
5709 uint64_t aSig0, aSig1;
5711 aSig1 = extractFloat128Frac1( a );
5712 aSig0 = extractFloat128Frac0( a );
5713 aExp = extractFloat128Exp( a );
5714 aSign = extractFloat128Sign( a );
5715 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5716 shiftCount = 0x402F - aExp;
5717 if ( shiftCount <= 0 ) {
5718 if ( 0x403E < aExp ) {
5719 float_raise(float_flag_invalid, status);
5720 if ( ! aSign
5721 || ( ( aExp == 0x7FFF )
5722 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5725 return LIT64( 0x7FFFFFFFFFFFFFFF );
5727 return (int64_t) LIT64( 0x8000000000000000 );
5729 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5731 else {
5732 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5734 return roundAndPackInt64(aSign, aSig0, aSig1, status);
5738 /*----------------------------------------------------------------------------
5739 | Returns the result of converting the quadruple-precision floating-point
5740 | value `a' to the 64-bit two's complement integer format. The conversion
5741 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5742 | Arithmetic, except that the conversion is always rounded toward zero.
5743 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
5744 | the conversion overflows, the largest integer with the same sign as `a' is
5745 | returned.
5746 *----------------------------------------------------------------------------*/
5748 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
5750 flag aSign;
5751 int32_t aExp, shiftCount;
5752 uint64_t aSig0, aSig1;
5753 int64_t z;
5755 aSig1 = extractFloat128Frac1( a );
5756 aSig0 = extractFloat128Frac0( a );
5757 aExp = extractFloat128Exp( a );
5758 aSign = extractFloat128Sign( a );
5759 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5760 shiftCount = aExp - 0x402F;
5761 if ( 0 < shiftCount ) {
5762 if ( 0x403E <= aExp ) {
5763 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5764 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
5765 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5766 if (aSig1) {
5767 status->float_exception_flags |= float_flag_inexact;
5770 else {
5771 float_raise(float_flag_invalid, status);
5772 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5773 return LIT64( 0x7FFFFFFFFFFFFFFF );
5776 return (int64_t) LIT64( 0x8000000000000000 );
5778 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
5779 if ( (uint64_t) ( aSig1<<shiftCount ) ) {
5780 status->float_exception_flags |= float_flag_inexact;
5783 else {
5784 if ( aExp < 0x3FFF ) {
5785 if ( aExp | aSig0 | aSig1 ) {
5786 status->float_exception_flags |= float_flag_inexact;
5788 return 0;
5790 z = aSig0>>( - shiftCount );
5791 if ( aSig1
5792 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
5793 status->float_exception_flags |= float_flag_inexact;
5796 if ( aSign ) z = - z;
5797 return z;
5801 /*----------------------------------------------------------------------------
5802 | Returns the result of converting the quadruple-precision floating-point value
5803 | `a' to the 64-bit unsigned integer format. The conversion is
5804 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5805 | Arithmetic---which means in particular that the conversion is rounded
5806 | according to the current rounding mode. If `a' is a NaN, the largest
5807 | positive integer is returned. If the conversion overflows, the
5808 | largest unsigned integer is returned. If 'a' is negative, the value is
5809 | rounded and zero is returned; negative values that do not round to zero
5810 | will raise the inexact exception.
5811 *----------------------------------------------------------------------------*/
5813 uint64_t float128_to_uint64(float128 a, float_status *status)
5815 flag aSign;
5816 int aExp;
5817 int shiftCount;
5818 uint64_t aSig0, aSig1;
5820 aSig0 = extractFloat128Frac0(a);
5821 aSig1 = extractFloat128Frac1(a);
5822 aExp = extractFloat128Exp(a);
5823 aSign = extractFloat128Sign(a);
5824 if (aSign && (aExp > 0x3FFE)) {
5825 float_raise(float_flag_invalid, status);
5826 if (float128_is_any_nan(a)) {
5827 return LIT64(0xFFFFFFFFFFFFFFFF);
5828 } else {
5829 return 0;
5832 if (aExp) {
5833 aSig0 |= LIT64(0x0001000000000000);
5835 shiftCount = 0x402F - aExp;
5836 if (shiftCount <= 0) {
5837 if (0x403E < aExp) {
5838 float_raise(float_flag_invalid, status);
5839 return LIT64(0xFFFFFFFFFFFFFFFF);
5841 shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
5842 } else {
5843 shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
5845 return roundAndPackUint64(aSign, aSig0, aSig1, status);
5848 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
5850 uint64_t v;
5851 signed char current_rounding_mode = status->float_rounding_mode;
5853 set_float_rounding_mode(float_round_to_zero, status);
5854 v = float128_to_uint64(a, status);
5855 set_float_rounding_mode(current_rounding_mode, status);
5857 return v;
5860 /*----------------------------------------------------------------------------
5861 | Returns the result of converting the quadruple-precision floating-point
5862 | value `a' to the 32-bit unsigned integer format. The conversion
5863 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5864 | Arithmetic except that the conversion is always rounded toward zero.
5865 | If `a' is a NaN, the largest positive integer is returned. Otherwise,
5866 | if the conversion overflows, the largest unsigned integer is returned.
5867 | If 'a' is negative, the value is rounded and zero is returned; negative
5868 | values that do not round to zero will raise the inexact exception.
5869 *----------------------------------------------------------------------------*/
5871 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
5873 uint64_t v;
5874 uint32_t res;
5875 int old_exc_flags = get_float_exception_flags(status);
5877 v = float128_to_uint64_round_to_zero(a, status);
5878 if (v > 0xffffffff) {
5879 res = 0xffffffff;
5880 } else {
5881 return v;
5883 set_float_exception_flags(old_exc_flags, status);
5884 float_raise(float_flag_invalid, status);
5885 return res;
5888 /*----------------------------------------------------------------------------
5889 | Returns the result of converting the quadruple-precision floating-point
5890 | value `a' to the single-precision floating-point format. The conversion
5891 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5892 | Arithmetic.
5893 *----------------------------------------------------------------------------*/
5895 float32 float128_to_float32(float128 a, float_status *status)
5897 flag aSign;
5898 int32_t aExp;
5899 uint64_t aSig0, aSig1;
5900 uint32_t zSig;
5902 aSig1 = extractFloat128Frac1( a );
5903 aSig0 = extractFloat128Frac0( a );
5904 aExp = extractFloat128Exp( a );
5905 aSign = extractFloat128Sign( a );
5906 if ( aExp == 0x7FFF ) {
5907 if ( aSig0 | aSig1 ) {
5908 return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
5910 return packFloat32( aSign, 0xFF, 0 );
5912 aSig0 |= ( aSig1 != 0 );
5913 shift64RightJamming( aSig0, 18, &aSig0 );
5914 zSig = aSig0;
5915 if ( aExp || zSig ) {
5916 zSig |= 0x40000000;
5917 aExp -= 0x3F81;
5919 return roundAndPackFloat32(aSign, aExp, zSig, status);
5923 /*----------------------------------------------------------------------------
5924 | Returns the result of converting the quadruple-precision floating-point
5925 | value `a' to the double-precision floating-point format. The conversion
5926 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5927 | Arithmetic.
5928 *----------------------------------------------------------------------------*/
5930 float64 float128_to_float64(float128 a, float_status *status)
5932 flag aSign;
5933 int32_t aExp;
5934 uint64_t aSig0, aSig1;
5936 aSig1 = extractFloat128Frac1( a );
5937 aSig0 = extractFloat128Frac0( a );
5938 aExp = extractFloat128Exp( a );
5939 aSign = extractFloat128Sign( a );
5940 if ( aExp == 0x7FFF ) {
5941 if ( aSig0 | aSig1 ) {
5942 return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
5944 return packFloat64( aSign, 0x7FF, 0 );
5946 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5947 aSig0 |= ( aSig1 != 0 );
5948 if ( aExp || aSig0 ) {
5949 aSig0 |= LIT64( 0x4000000000000000 );
5950 aExp -= 0x3C01;
5952 return roundAndPackFloat64(aSign, aExp, aSig0, status);
5956 /*----------------------------------------------------------------------------
5957 | Returns the result of converting the quadruple-precision floating-point
5958 | value `a' to the extended double-precision floating-point format. The
5959 | conversion is performed according to the IEC/IEEE Standard for Binary
5960 | Floating-Point Arithmetic.
5961 *----------------------------------------------------------------------------*/
5963 floatx80 float128_to_floatx80(float128 a, float_status *status)
5965 flag aSign;
5966 int32_t aExp;
5967 uint64_t aSig0, aSig1;
5969 aSig1 = extractFloat128Frac1( a );
5970 aSig0 = extractFloat128Frac0( a );
5971 aExp = extractFloat128Exp( a );
5972 aSign = extractFloat128Sign( a );
5973 if ( aExp == 0x7FFF ) {
5974 if ( aSig0 | aSig1 ) {
5975 return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
5977 return packFloatx80(aSign, floatx80_infinity_high,
5978 floatx80_infinity_low);
5980 if ( aExp == 0 ) {
5981 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5982 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5984 else {
5985 aSig0 |= LIT64( 0x0001000000000000 );
5987 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
5988 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
5992 /*----------------------------------------------------------------------------
5993 | Rounds the quadruple-precision floating-point value `a' to an integer, and
5994 | returns the result as a quadruple-precision floating-point value. The
5995 | operation is performed according to the IEC/IEEE Standard for Binary
5996 | Floating-Point Arithmetic.
5997 *----------------------------------------------------------------------------*/
5999 float128 float128_round_to_int(float128 a, float_status *status)
6001 flag aSign;
6002 int32_t aExp;
6003 uint64_t lastBitMask, roundBitsMask;
6004 float128 z;
6006 aExp = extractFloat128Exp( a );
6007 if ( 0x402F <= aExp ) {
6008 if ( 0x406F <= aExp ) {
6009 if ( ( aExp == 0x7FFF )
6010 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6012 return propagateFloat128NaN(a, a, status);
6014 return a;
6016 lastBitMask = 1;
6017 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
6018 roundBitsMask = lastBitMask - 1;
6019 z = a;
6020 switch (status->float_rounding_mode) {
6021 case float_round_nearest_even:
6022 if ( lastBitMask ) {
6023 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6024 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6026 else {
6027 if ( (int64_t) z.low < 0 ) {
6028 ++z.high;
6029 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
6032 break;
6033 case float_round_ties_away:
6034 if (lastBitMask) {
6035 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6036 } else {
6037 if ((int64_t) z.low < 0) {
6038 ++z.high;
6041 break;
6042 case float_round_to_zero:
6043 break;
6044 case float_round_up:
6045 if (!extractFloat128Sign(z)) {
6046 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6048 break;
6049 case float_round_down:
6050 if (extractFloat128Sign(z)) {
6051 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6053 break;
6054 default:
6055 abort();
6057 z.low &= ~ roundBitsMask;
6059 else {
6060 if ( aExp < 0x3FFF ) {
6061 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
6062 status->float_exception_flags |= float_flag_inexact;
6063 aSign = extractFloat128Sign( a );
6064 switch (status->float_rounding_mode) {
6065 case float_round_nearest_even:
6066 if ( ( aExp == 0x3FFE )
6067 && ( extractFloat128Frac0( a )
6068 | extractFloat128Frac1( a ) )
6070 return packFloat128( aSign, 0x3FFF, 0, 0 );
6072 break;
6073 case float_round_ties_away:
6074 if (aExp == 0x3FFE) {
6075 return packFloat128(aSign, 0x3FFF, 0, 0);
6077 break;
6078 case float_round_down:
6079 return
6080 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6081 : packFloat128( 0, 0, 0, 0 );
6082 case float_round_up:
6083 return
6084 aSign ? packFloat128( 1, 0, 0, 0 )
6085 : packFloat128( 0, 0x3FFF, 0, 0 );
6087 return packFloat128( aSign, 0, 0, 0 );
6089 lastBitMask = 1;
6090 lastBitMask <<= 0x402F - aExp;
6091 roundBitsMask = lastBitMask - 1;
6092 z.low = 0;
6093 z.high = a.high;
6094 switch (status->float_rounding_mode) {
6095 case float_round_nearest_even:
6096 z.high += lastBitMask>>1;
6097 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6098 z.high &= ~ lastBitMask;
6100 break;
6101 case float_round_ties_away:
6102 z.high += lastBitMask>>1;
6103 break;
6104 case float_round_to_zero:
6105 break;
6106 case float_round_up:
6107 if (!extractFloat128Sign(z)) {
6108 z.high |= ( a.low != 0 );
6109 z.high += roundBitsMask;
6111 break;
6112 case float_round_down:
6113 if (extractFloat128Sign(z)) {
6114 z.high |= (a.low != 0);
6115 z.high += roundBitsMask;
6117 break;
6118 default:
6119 abort();
6121 z.high &= ~ roundBitsMask;
6123 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
6124 status->float_exception_flags |= float_flag_inexact;
6126 return z;
6130 /*----------------------------------------------------------------------------
6131 | Returns the result of adding the absolute values of the quadruple-precision
6132 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
6133 | before being returned. `zSign' is ignored if the result is a NaN.
6134 | The addition is performed according to the IEC/IEEE Standard for Binary
6135 | Floating-Point Arithmetic.
6136 *----------------------------------------------------------------------------*/
6138 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
6139 float_status *status)
6141 int32_t aExp, bExp, zExp;
6142 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6143 int32_t expDiff;
6145 aSig1 = extractFloat128Frac1( a );
6146 aSig0 = extractFloat128Frac0( a );
6147 aExp = extractFloat128Exp( a );
6148 bSig1 = extractFloat128Frac1( b );
6149 bSig0 = extractFloat128Frac0( b );
6150 bExp = extractFloat128Exp( b );
6151 expDiff = aExp - bExp;
6152 if ( 0 < expDiff ) {
6153 if ( aExp == 0x7FFF ) {
6154 if (aSig0 | aSig1) {
6155 return propagateFloat128NaN(a, b, status);
6157 return a;
6159 if ( bExp == 0 ) {
6160 --expDiff;
6162 else {
6163 bSig0 |= LIT64( 0x0001000000000000 );
6165 shift128ExtraRightJamming(
6166 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6167 zExp = aExp;
6169 else if ( expDiff < 0 ) {
6170 if ( bExp == 0x7FFF ) {
6171 if (bSig0 | bSig1) {
6172 return propagateFloat128NaN(a, b, status);
6174 return packFloat128( zSign, 0x7FFF, 0, 0 );
6176 if ( aExp == 0 ) {
6177 ++expDiff;
6179 else {
6180 aSig0 |= LIT64( 0x0001000000000000 );
6182 shift128ExtraRightJamming(
6183 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6184 zExp = bExp;
6186 else {
6187 if ( aExp == 0x7FFF ) {
6188 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6189 return propagateFloat128NaN(a, b, status);
6191 return a;
6193 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6194 if ( aExp == 0 ) {
6195 if (status->flush_to_zero) {
6196 if (zSig0 | zSig1) {
6197 float_raise(float_flag_output_denormal, status);
6199 return packFloat128(zSign, 0, 0, 0);
6201 return packFloat128( zSign, 0, zSig0, zSig1 );
6203 zSig2 = 0;
6204 zSig0 |= LIT64( 0x0002000000000000 );
6205 zExp = aExp;
6206 goto shiftRight1;
6208 aSig0 |= LIT64( 0x0001000000000000 );
6209 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6210 --zExp;
6211 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6212 ++zExp;
6213 shiftRight1:
6214 shift128ExtraRightJamming(
6215 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6216 roundAndPack:
6217 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6221 /*----------------------------------------------------------------------------
6222 | Returns the result of subtracting the absolute values of the quadruple-
6223 | precision floating-point values `a' and `b'. If `zSign' is 1, the
6224 | difference is negated before being returned. `zSign' is ignored if the
6225 | result is a NaN. The subtraction is performed according to the IEC/IEEE
6226 | Standard for Binary Floating-Point Arithmetic.
6227 *----------------------------------------------------------------------------*/
6229 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
6230 float_status *status)
6232 int32_t aExp, bExp, zExp;
6233 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
6234 int32_t expDiff;
6236 aSig1 = extractFloat128Frac1( a );
6237 aSig0 = extractFloat128Frac0( a );
6238 aExp = extractFloat128Exp( a );
6239 bSig1 = extractFloat128Frac1( b );
6240 bSig0 = extractFloat128Frac0( b );
6241 bExp = extractFloat128Exp( b );
6242 expDiff = aExp - bExp;
6243 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6244 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6245 if ( 0 < expDiff ) goto aExpBigger;
6246 if ( expDiff < 0 ) goto bExpBigger;
6247 if ( aExp == 0x7FFF ) {
6248 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6249 return propagateFloat128NaN(a, b, status);
6251 float_raise(float_flag_invalid, status);
6252 return float128_default_nan(status);
6254 if ( aExp == 0 ) {
6255 aExp = 1;
6256 bExp = 1;
6258 if ( bSig0 < aSig0 ) goto aBigger;
6259 if ( aSig0 < bSig0 ) goto bBigger;
6260 if ( bSig1 < aSig1 ) goto aBigger;
6261 if ( aSig1 < bSig1 ) goto bBigger;
6262 return packFloat128(status->float_rounding_mode == float_round_down,
6263 0, 0, 0);
6264 bExpBigger:
6265 if ( bExp == 0x7FFF ) {
6266 if (bSig0 | bSig1) {
6267 return propagateFloat128NaN(a, b, status);
6269 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6271 if ( aExp == 0 ) {
6272 ++expDiff;
6274 else {
6275 aSig0 |= LIT64( 0x4000000000000000 );
6277 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6278 bSig0 |= LIT64( 0x4000000000000000 );
6279 bBigger:
6280 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6281 zExp = bExp;
6282 zSign ^= 1;
6283 goto normalizeRoundAndPack;
6284 aExpBigger:
6285 if ( aExp == 0x7FFF ) {
6286 if (aSig0 | aSig1) {
6287 return propagateFloat128NaN(a, b, status);
6289 return a;
6291 if ( bExp == 0 ) {
6292 --expDiff;
6294 else {
6295 bSig0 |= LIT64( 0x4000000000000000 );
6297 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6298 aSig0 |= LIT64( 0x4000000000000000 );
6299 aBigger:
6300 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6301 zExp = aExp;
6302 normalizeRoundAndPack:
6303 --zExp;
6304 return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
6305 status);
6309 /*----------------------------------------------------------------------------
6310 | Returns the result of adding the quadruple-precision floating-point values
6311 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
6312 | for Binary Floating-Point Arithmetic.
6313 *----------------------------------------------------------------------------*/
6315 float128 float128_add(float128 a, float128 b, float_status *status)
6317 flag aSign, bSign;
6319 aSign = extractFloat128Sign( a );
6320 bSign = extractFloat128Sign( b );
6321 if ( aSign == bSign ) {
6322 return addFloat128Sigs(a, b, aSign, status);
6324 else {
6325 return subFloat128Sigs(a, b, aSign, status);
6330 /*----------------------------------------------------------------------------
6331 | Returns the result of subtracting the quadruple-precision floating-point
6332 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6333 | Standard for Binary Floating-Point Arithmetic.
6334 *----------------------------------------------------------------------------*/
6336 float128 float128_sub(float128 a, float128 b, float_status *status)
6338 flag aSign, bSign;
6340 aSign = extractFloat128Sign( a );
6341 bSign = extractFloat128Sign( b );
6342 if ( aSign == bSign ) {
6343 return subFloat128Sigs(a, b, aSign, status);
6345 else {
6346 return addFloat128Sigs(a, b, aSign, status);
6351 /*----------------------------------------------------------------------------
6352 | Returns the result of multiplying the quadruple-precision floating-point
6353 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6354 | Standard for Binary Floating-Point Arithmetic.
6355 *----------------------------------------------------------------------------*/
6357 float128 float128_mul(float128 a, float128 b, float_status *status)
6359 flag aSign, bSign, zSign;
6360 int32_t aExp, bExp, zExp;
6361 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6363 aSig1 = extractFloat128Frac1( a );
6364 aSig0 = extractFloat128Frac0( a );
6365 aExp = extractFloat128Exp( a );
6366 aSign = extractFloat128Sign( a );
6367 bSig1 = extractFloat128Frac1( b );
6368 bSig0 = extractFloat128Frac0( b );
6369 bExp = extractFloat128Exp( b );
6370 bSign = extractFloat128Sign( b );
6371 zSign = aSign ^ bSign;
6372 if ( aExp == 0x7FFF ) {
6373 if ( ( aSig0 | aSig1 )
6374 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6375 return propagateFloat128NaN(a, b, status);
6377 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6378 return packFloat128( zSign, 0x7FFF, 0, 0 );
6380 if ( bExp == 0x7FFF ) {
6381 if (bSig0 | bSig1) {
6382 return propagateFloat128NaN(a, b, status);
6384 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6385 invalid:
6386 float_raise(float_flag_invalid, status);
6387 return float128_default_nan(status);
6389 return packFloat128( zSign, 0x7FFF, 0, 0 );
6391 if ( aExp == 0 ) {
6392 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6393 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6395 if ( bExp == 0 ) {
6396 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6397 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6399 zExp = aExp + bExp - 0x4000;
6400 aSig0 |= LIT64( 0x0001000000000000 );
6401 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6402 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6403 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6404 zSig2 |= ( zSig3 != 0 );
6405 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6406 shift128ExtraRightJamming(
6407 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6408 ++zExp;
6410 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6414 /*----------------------------------------------------------------------------
6415 | Returns the result of dividing the quadruple-precision floating-point value
6416 | `a' by the corresponding value `b'. The operation is performed according to
6417 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6418 *----------------------------------------------------------------------------*/
6420 float128 float128_div(float128 a, float128 b, float_status *status)
6422 flag aSign, bSign, zSign;
6423 int32_t aExp, bExp, zExp;
6424 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6425 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6427 aSig1 = extractFloat128Frac1( a );
6428 aSig0 = extractFloat128Frac0( a );
6429 aExp = extractFloat128Exp( a );
6430 aSign = extractFloat128Sign( a );
6431 bSig1 = extractFloat128Frac1( b );
6432 bSig0 = extractFloat128Frac0( b );
6433 bExp = extractFloat128Exp( b );
6434 bSign = extractFloat128Sign( b );
6435 zSign = aSign ^ bSign;
6436 if ( aExp == 0x7FFF ) {
6437 if (aSig0 | aSig1) {
6438 return propagateFloat128NaN(a, b, status);
6440 if ( bExp == 0x7FFF ) {
6441 if (bSig0 | bSig1) {
6442 return propagateFloat128NaN(a, b, status);
6444 goto invalid;
6446 return packFloat128( zSign, 0x7FFF, 0, 0 );
6448 if ( bExp == 0x7FFF ) {
6449 if (bSig0 | bSig1) {
6450 return propagateFloat128NaN(a, b, status);
6452 return packFloat128( zSign, 0, 0, 0 );
6454 if ( bExp == 0 ) {
6455 if ( ( bSig0 | bSig1 ) == 0 ) {
6456 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6457 invalid:
6458 float_raise(float_flag_invalid, status);
6459 return float128_default_nan(status);
6461 float_raise(float_flag_divbyzero, status);
6462 return packFloat128( zSign, 0x7FFF, 0, 0 );
6464 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6466 if ( aExp == 0 ) {
6467 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6468 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6470 zExp = aExp - bExp + 0x3FFD;
6471 shortShift128Left(
6472 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6473 shortShift128Left(
6474 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6475 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6476 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6477 ++zExp;
6479 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6480 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6481 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
6482 while ( (int64_t) rem0 < 0 ) {
6483 --zSig0;
6484 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6486 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6487 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6488 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6489 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
6490 while ( (int64_t) rem1 < 0 ) {
6491 --zSig1;
6492 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6494 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6496 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6497 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6501 /*----------------------------------------------------------------------------
6502 | Returns the remainder of the quadruple-precision floating-point value `a'
6503 | with respect to the corresponding value `b'. The operation is performed
6504 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6505 *----------------------------------------------------------------------------*/
6507 float128 float128_rem(float128 a, float128 b, float_status *status)
6509 flag aSign, zSign;
6510 int32_t aExp, bExp, expDiff;
6511 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6512 uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6513 int64_t sigMean0;
6515 aSig1 = extractFloat128Frac1( a );
6516 aSig0 = extractFloat128Frac0( a );
6517 aExp = extractFloat128Exp( a );
6518 aSign = extractFloat128Sign( a );
6519 bSig1 = extractFloat128Frac1( b );
6520 bSig0 = extractFloat128Frac0( b );
6521 bExp = extractFloat128Exp( b );
6522 if ( aExp == 0x7FFF ) {
6523 if ( ( aSig0 | aSig1 )
6524 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6525 return propagateFloat128NaN(a, b, status);
6527 goto invalid;
6529 if ( bExp == 0x7FFF ) {
6530 if (bSig0 | bSig1) {
6531 return propagateFloat128NaN(a, b, status);
6533 return a;
6535 if ( bExp == 0 ) {
6536 if ( ( bSig0 | bSig1 ) == 0 ) {
6537 invalid:
6538 float_raise(float_flag_invalid, status);
6539 return float128_default_nan(status);
6541 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6543 if ( aExp == 0 ) {
6544 if ( ( aSig0 | aSig1 ) == 0 ) return a;
6545 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6547 expDiff = aExp - bExp;
6548 if ( expDiff < -1 ) return a;
6549 shortShift128Left(
6550 aSig0 | LIT64( 0x0001000000000000 ),
6551 aSig1,
6552 15 - ( expDiff < 0 ),
6553 &aSig0,
6554 &aSig1
6556 shortShift128Left(
6557 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6558 q = le128( bSig0, bSig1, aSig0, aSig1 );
6559 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6560 expDiff -= 64;
6561 while ( 0 < expDiff ) {
6562 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6563 q = ( 4 < q ) ? q - 4 : 0;
6564 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6565 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6566 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6567 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6568 expDiff -= 61;
6570 if ( -64 < expDiff ) {
6571 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6572 q = ( 4 < q ) ? q - 4 : 0;
6573 q >>= - expDiff;
6574 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6575 expDiff += 52;
6576 if ( expDiff < 0 ) {
6577 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6579 else {
6580 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6582 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6583 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6585 else {
6586 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6587 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6589 do {
6590 alternateASig0 = aSig0;
6591 alternateASig1 = aSig1;
6592 ++q;
6593 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6594 } while ( 0 <= (int64_t) aSig0 );
6595 add128(
6596 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6597 if ( ( sigMean0 < 0 )
6598 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6599 aSig0 = alternateASig0;
6600 aSig1 = alternateASig1;
6602 zSign = ( (int64_t) aSig0 < 0 );
6603 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6604 return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
6605 status);
6608 /*----------------------------------------------------------------------------
6609 | Returns the square root of the quadruple-precision floating-point value `a'.
6610 | The operation is performed according to the IEC/IEEE Standard for Binary
6611 | Floating-Point Arithmetic.
6612 *----------------------------------------------------------------------------*/
6614 float128 float128_sqrt(float128 a, float_status *status)
6616 flag aSign;
6617 int32_t aExp, zExp;
6618 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6619 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6621 aSig1 = extractFloat128Frac1( a );
6622 aSig0 = extractFloat128Frac0( a );
6623 aExp = extractFloat128Exp( a );
6624 aSign = extractFloat128Sign( a );
6625 if ( aExp == 0x7FFF ) {
6626 if (aSig0 | aSig1) {
6627 return propagateFloat128NaN(a, a, status);
6629 if ( ! aSign ) return a;
6630 goto invalid;
6632 if ( aSign ) {
6633 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6634 invalid:
6635 float_raise(float_flag_invalid, status);
6636 return float128_default_nan(status);
6638 if ( aExp == 0 ) {
6639 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6640 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6642 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6643 aSig0 |= LIT64( 0x0001000000000000 );
6644 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6645 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6646 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6647 doubleZSig0 = zSig0<<1;
6648 mul64To128( zSig0, zSig0, &term0, &term1 );
6649 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6650 while ( (int64_t) rem0 < 0 ) {
6651 --zSig0;
6652 doubleZSig0 -= 2;
6653 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6655 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6656 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6657 if ( zSig1 == 0 ) zSig1 = 1;
6658 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6659 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6660 mul64To128( zSig1, zSig1, &term2, &term3 );
6661 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6662 while ( (int64_t) rem1 < 0 ) {
6663 --zSig1;
6664 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6665 term3 |= 1;
6666 term2 |= doubleZSig0;
6667 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6669 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6671 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6672 return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
6676 /*----------------------------------------------------------------------------
6677 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6678 | the corresponding value `b', and 0 otherwise. The invalid exception is
6679 | raised if either operand is a NaN. Otherwise, the comparison is performed
6680 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6681 *----------------------------------------------------------------------------*/
6683 int float128_eq(float128 a, float128 b, float_status *status)
6686 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6687 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6688 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6689 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6691 float_raise(float_flag_invalid, status);
6692 return 0;
6694 return
6695 ( a.low == b.low )
6696 && ( ( a.high == b.high )
6697 || ( ( a.low == 0 )
6698 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6703 /*----------------------------------------------------------------------------
6704 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6705 | or equal to the corresponding value `b', and 0 otherwise. The invalid
6706 | exception is raised if either operand is a NaN. The comparison is performed
6707 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6708 *----------------------------------------------------------------------------*/
6710 int float128_le(float128 a, float128 b, float_status *status)
6712 flag aSign, bSign;
6714 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6715 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6716 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6717 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6719 float_raise(float_flag_invalid, status);
6720 return 0;
6722 aSign = extractFloat128Sign( a );
6723 bSign = extractFloat128Sign( b );
6724 if ( aSign != bSign ) {
6725 return
6726 aSign
6727 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6728 == 0 );
6730 return
6731 aSign ? le128( b.high, b.low, a.high, a.low )
6732 : le128( a.high, a.low, b.high, b.low );
6736 /*----------------------------------------------------------------------------
6737 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6738 | the corresponding value `b', and 0 otherwise. The invalid exception is
6739 | raised if either operand is a NaN. The comparison is performed according
6740 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6741 *----------------------------------------------------------------------------*/
6743 int float128_lt(float128 a, float128 b, float_status *status)
6745 flag aSign, bSign;
6747 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6748 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6749 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6750 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6752 float_raise(float_flag_invalid, status);
6753 return 0;
6755 aSign = extractFloat128Sign( a );
6756 bSign = extractFloat128Sign( b );
6757 if ( aSign != bSign ) {
6758 return
6759 aSign
6760 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6761 != 0 );
6763 return
6764 aSign ? lt128( b.high, b.low, a.high, a.low )
6765 : lt128( a.high, a.low, b.high, b.low );
6769 /*----------------------------------------------------------------------------
6770 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6771 | be compared, and 0 otherwise. The invalid exception is raised if either
6772 | operand is a NaN. The comparison is performed according to the IEC/IEEE
6773 | Standard for Binary Floating-Point Arithmetic.
6774 *----------------------------------------------------------------------------*/
6776 int float128_unordered(float128 a, float128 b, float_status *status)
6778 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6779 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6780 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6781 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6783 float_raise(float_flag_invalid, status);
6784 return 1;
6786 return 0;
6789 /*----------------------------------------------------------------------------
6790 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6791 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6792 | exception. The comparison is performed according to the IEC/IEEE Standard
6793 | for Binary Floating-Point Arithmetic.
6794 *----------------------------------------------------------------------------*/
6796 int float128_eq_quiet(float128 a, float128 b, float_status *status)
6799 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6800 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6801 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6802 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6804 if (float128_is_signaling_nan(a, status)
6805 || float128_is_signaling_nan(b, status)) {
6806 float_raise(float_flag_invalid, status);
6808 return 0;
6810 return
6811 ( a.low == b.low )
6812 && ( ( a.high == b.high )
6813 || ( ( a.low == 0 )
6814 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6819 /*----------------------------------------------------------------------------
6820 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6821 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
6822 | cause an exception. Otherwise, the comparison is performed according to the
6823 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6824 *----------------------------------------------------------------------------*/
6826 int float128_le_quiet(float128 a, float128 b, float_status *status)
6828 flag aSign, bSign;
6830 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6831 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6832 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6833 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6835 if (float128_is_signaling_nan(a, status)
6836 || float128_is_signaling_nan(b, status)) {
6837 float_raise(float_flag_invalid, status);
6839 return 0;
6841 aSign = extractFloat128Sign( a );
6842 bSign = extractFloat128Sign( b );
6843 if ( aSign != bSign ) {
6844 return
6845 aSign
6846 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6847 == 0 );
6849 return
6850 aSign ? le128( b.high, b.low, a.high, a.low )
6851 : le128( a.high, a.low, b.high, b.low );
6855 /*----------------------------------------------------------------------------
6856 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6857 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6858 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
6859 | Standard for Binary Floating-Point Arithmetic.
6860 *----------------------------------------------------------------------------*/
6862 int float128_lt_quiet(float128 a, float128 b, float_status *status)
6864 flag aSign, bSign;
6866 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6867 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6868 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6869 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6871 if (float128_is_signaling_nan(a, status)
6872 || float128_is_signaling_nan(b, status)) {
6873 float_raise(float_flag_invalid, status);
6875 return 0;
6877 aSign = extractFloat128Sign( a );
6878 bSign = extractFloat128Sign( b );
6879 if ( aSign != bSign ) {
6880 return
6881 aSign
6882 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6883 != 0 );
6885 return
6886 aSign ? lt128( b.high, b.low, a.high, a.low )
6887 : lt128( a.high, a.low, b.high, b.low );
6891 /*----------------------------------------------------------------------------
6892 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6893 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
6894 | comparison is performed according to the IEC/IEEE Standard for Binary
6895 | Floating-Point Arithmetic.
6896 *----------------------------------------------------------------------------*/
6898 int float128_unordered_quiet(float128 a, float128 b, float_status *status)
6900 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6901 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6902 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6903 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6905 if (float128_is_signaling_nan(a, status)
6906 || float128_is_signaling_nan(b, status)) {
6907 float_raise(float_flag_invalid, status);
6909 return 1;
6911 return 0;
6914 static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
6915 int is_quiet, float_status *status)
6917 flag aSign, bSign;
6919 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6920 float_raise(float_flag_invalid, status);
6921 return float_relation_unordered;
6923 if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
6924 ( extractFloatx80Frac( a )<<1 ) ) ||
6925 ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
6926 ( extractFloatx80Frac( b )<<1 ) )) {
6927 if (!is_quiet ||
6928 floatx80_is_signaling_nan(a, status) ||
6929 floatx80_is_signaling_nan(b, status)) {
6930 float_raise(float_flag_invalid, status);
6932 return float_relation_unordered;
6934 aSign = extractFloatx80Sign( a );
6935 bSign = extractFloatx80Sign( b );
6936 if ( aSign != bSign ) {
6938 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
6939 ( ( a.low | b.low ) == 0 ) ) {
6940 /* zero case */
6941 return float_relation_equal;
6942 } else {
6943 return 1 - (2 * aSign);
6945 } else {
6946 if (a.low == b.low && a.high == b.high) {
6947 return float_relation_equal;
6948 } else {
6949 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6954 int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
6956 return floatx80_compare_internal(a, b, 0, status);
6959 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
6961 return floatx80_compare_internal(a, b, 1, status);
6964 static inline int float128_compare_internal(float128 a, float128 b,
6965 int is_quiet, float_status *status)
6967 flag aSign, bSign;
6969 if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
6970 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
6971 ( ( extractFloat128Exp( b ) == 0x7fff ) &&
6972 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
6973 if (!is_quiet ||
6974 float128_is_signaling_nan(a, status) ||
6975 float128_is_signaling_nan(b, status)) {
6976 float_raise(float_flag_invalid, status);
6978 return float_relation_unordered;
6980 aSign = extractFloat128Sign( a );
6981 bSign = extractFloat128Sign( b );
6982 if ( aSign != bSign ) {
6983 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
6984 /* zero case */
6985 return float_relation_equal;
6986 } else {
6987 return 1 - (2 * aSign);
6989 } else {
6990 if (a.low == b.low && a.high == b.high) {
6991 return float_relation_equal;
6992 } else {
6993 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6998 int float128_compare(float128 a, float128 b, float_status *status)
7000 return float128_compare_internal(a, b, 0, status);
7003 int float128_compare_quiet(float128 a, float128 b, float_status *status)
7005 return float128_compare_internal(a, b, 1, status);
7008 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7010 flag aSign;
7011 int32_t aExp;
7012 uint64_t aSig;
7014 if (floatx80_invalid_encoding(a)) {
7015 float_raise(float_flag_invalid, status);
7016 return floatx80_default_nan(status);
7018 aSig = extractFloatx80Frac( a );
7019 aExp = extractFloatx80Exp( a );
7020 aSign = extractFloatx80Sign( a );
7022 if ( aExp == 0x7FFF ) {
7023 if ( aSig<<1 ) {
7024 return propagateFloatx80NaN(a, a, status);
7026 return a;
7029 if (aExp == 0) {
7030 if (aSig == 0) {
7031 return a;
7033 aExp++;
7036 if (n > 0x10000) {
7037 n = 0x10000;
7038 } else if (n < -0x10000) {
7039 n = -0x10000;
7042 aExp += n;
7043 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7044 aSign, aExp, aSig, 0, status);
7047 float128 float128_scalbn(float128 a, int n, float_status *status)
7049 flag aSign;
7050 int32_t aExp;
7051 uint64_t aSig0, aSig1;
7053 aSig1 = extractFloat128Frac1( a );
7054 aSig0 = extractFloat128Frac0( a );
7055 aExp = extractFloat128Exp( a );
7056 aSign = extractFloat128Sign( a );
7057 if ( aExp == 0x7FFF ) {
7058 if ( aSig0 | aSig1 ) {
7059 return propagateFloat128NaN(a, a, status);
7061 return a;
7063 if (aExp != 0) {
7064 aSig0 |= LIT64( 0x0001000000000000 );
7065 } else if (aSig0 == 0 && aSig1 == 0) {
7066 return a;
7067 } else {
7068 aExp++;
7071 if (n > 0x10000) {
7072 n = 0x10000;
7073 } else if (n < -0x10000) {
7074 n = -0x10000;
7077 aExp += n - 1;
7078 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7079 , status);