fpu/softfloat: Use float*_silence_nan in propagateFloat*NaN
[qemu.git] / fpu / softfloat.c
blob55e6701f262d7440f77bac24cb06d056c2fee682
1 /*
2 * QEMU float support
4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
10 * the BSD license
11 * GPL-v2-or-later
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
44 ===============================================================================
47 /* BSD licensing:
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
85 #include "qemu/osdep.h"
86 #include "qemu/bitops.h"
87 #include "fpu/softfloat.h"
89 /* We only need stdlib for abort() */
91 /*----------------------------------------------------------------------------
92 | Primitive arithmetic functions, including multi-word arithmetic, and
93 | division and square root approximations. (Can be specialized to target if
94 | desired.)
95 *----------------------------------------------------------------------------*/
96 #include "fpu/softfloat-macros.h"
98 /*----------------------------------------------------------------------------
99 | Returns the fraction bits of the half-precision floating-point value `a'.
100 *----------------------------------------------------------------------------*/
102 static inline uint32_t extractFloat16Frac(float16 a)
104 return float16_val(a) & 0x3ff;
107 /*----------------------------------------------------------------------------
108 | Returns the exponent bits of the half-precision floating-point value `a'.
109 *----------------------------------------------------------------------------*/
111 static inline int extractFloat16Exp(float16 a)
113 return (float16_val(a) >> 10) & 0x1f;
116 /*----------------------------------------------------------------------------
117 | Returns the fraction bits of the single-precision floating-point value `a'.
118 *----------------------------------------------------------------------------*/
120 static inline uint32_t extractFloat32Frac(float32 a)
122 return float32_val(a) & 0x007FFFFF;
125 /*----------------------------------------------------------------------------
126 | Returns the exponent bits of the single-precision floating-point value `a'.
127 *----------------------------------------------------------------------------*/
129 static inline int extractFloat32Exp(float32 a)
131 return (float32_val(a) >> 23) & 0xFF;
134 /*----------------------------------------------------------------------------
135 | Returns the sign bit of the single-precision floating-point value `a'.
136 *----------------------------------------------------------------------------*/
138 static inline flag extractFloat32Sign(float32 a)
140 return float32_val(a) >> 31;
143 /*----------------------------------------------------------------------------
144 | Returns the fraction bits of the double-precision floating-point value `a'.
145 *----------------------------------------------------------------------------*/
147 static inline uint64_t extractFloat64Frac(float64 a)
149 return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF);
152 /*----------------------------------------------------------------------------
153 | Returns the exponent bits of the double-precision floating-point value `a'.
154 *----------------------------------------------------------------------------*/
156 static inline int extractFloat64Exp(float64 a)
158 return (float64_val(a) >> 52) & 0x7FF;
161 /*----------------------------------------------------------------------------
162 | Returns the sign bit of the double-precision floating-point value `a'.
163 *----------------------------------------------------------------------------*/
165 static inline flag extractFloat64Sign(float64 a)
167 return float64_val(a) >> 63;
171 * Classify a floating point number. Everything above float_class_qnan
172 * is a NaN so cls >= float_class_qnan is any NaN.
175 typedef enum __attribute__ ((__packed__)) {
176 float_class_unclassified,
177 float_class_zero,
178 float_class_normal,
179 float_class_inf,
180 float_class_qnan, /* all NaNs from here */
181 float_class_snan,
182 } FloatClass;
185 * Structure holding all of the decomposed parts of a float. The
186 * exponent is unbiased and the fraction is normalized. All
187 * calculations are done with a 64 bit fraction and then rounded as
188 * appropriate for the final format.
190 * Thanks to the packed FloatClass a decent compiler should be able to
191 * fit the whole structure into registers and avoid using the stack
192 * for parameter passing.
195 typedef struct {
196 uint64_t frac;
197 int32_t exp;
198 FloatClass cls;
199 bool sign;
200 } FloatParts;
202 #define DECOMPOSED_BINARY_POINT (64 - 2)
203 #define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT)
204 #define DECOMPOSED_OVERFLOW_BIT (DECOMPOSED_IMPLICIT_BIT << 1)
206 /* Structure holding all of the relevant parameters for a format.
207 * exp_size: the size of the exponent field
208 * exp_bias: the offset applied to the exponent field
209 * exp_max: the maximum normalised exponent
210 * frac_size: the size of the fraction field
211 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
212 * The following are computed based the size of fraction
213 * frac_lsb: least significant bit of fraction
214 * frac_lsbm1: the bit below the least significant bit (for rounding)
215 * round_mask/roundeven_mask: masks used for rounding
216 * The following optional modifiers are available:
217 * arm_althp: handle ARM Alternative Half Precision
219 typedef struct {
220 int exp_size;
221 int exp_bias;
222 int exp_max;
223 int frac_size;
224 int frac_shift;
225 uint64_t frac_lsb;
226 uint64_t frac_lsbm1;
227 uint64_t round_mask;
228 uint64_t roundeven_mask;
229 bool arm_althp;
230 } FloatFmt;
232 /* Expand fields based on the size of exponent and fraction */
233 #define FLOAT_PARAMS(E, F) \
234 .exp_size = E, \
235 .exp_bias = ((1 << E) - 1) >> 1, \
236 .exp_max = (1 << E) - 1, \
237 .frac_size = F, \
238 .frac_shift = DECOMPOSED_BINARY_POINT - F, \
239 .frac_lsb = 1ull << (DECOMPOSED_BINARY_POINT - F), \
240 .frac_lsbm1 = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1), \
241 .round_mask = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1, \
242 .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1
244 static const FloatFmt float16_params = {
245 FLOAT_PARAMS(5, 10)
248 static const FloatFmt float16_params_ahp = {
249 FLOAT_PARAMS(5, 10),
250 .arm_althp = true
253 static const FloatFmt float32_params = {
254 FLOAT_PARAMS(8, 23)
257 static const FloatFmt float64_params = {
258 FLOAT_PARAMS(11, 52)
261 /* Unpack a float to parts, but do not canonicalize. */
262 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
264 const int sign_pos = fmt.frac_size + fmt.exp_size;
266 return (FloatParts) {
267 .cls = float_class_unclassified,
268 .sign = extract64(raw, sign_pos, 1),
269 .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
270 .frac = extract64(raw, 0, fmt.frac_size),
274 static inline FloatParts float16_unpack_raw(float16 f)
276 return unpack_raw(float16_params, f);
279 static inline FloatParts float32_unpack_raw(float32 f)
281 return unpack_raw(float32_params, f);
284 static inline FloatParts float64_unpack_raw(float64 f)
286 return unpack_raw(float64_params, f);
289 /* Pack a float from parts, but do not canonicalize. */
290 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
292 const int sign_pos = fmt.frac_size + fmt.exp_size;
293 uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
294 return deposit64(ret, sign_pos, 1, p.sign);
297 static inline float16 float16_pack_raw(FloatParts p)
299 return make_float16(pack_raw(float16_params, p));
302 static inline float32 float32_pack_raw(FloatParts p)
304 return make_float32(pack_raw(float32_params, p));
307 static inline float64 float64_pack_raw(FloatParts p)
309 return make_float64(pack_raw(float64_params, p));
312 /*----------------------------------------------------------------------------
313 | Functions and definitions to determine: (1) whether tininess for underflow
314 | is detected before or after rounding by default, (2) what (if anything)
315 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
316 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
317 | are propagated from function inputs to output. These details are target-
318 | specific.
319 *----------------------------------------------------------------------------*/
320 #include "softfloat-specialize.h"
322 /* Canonicalize EXP and FRAC, setting CLS. */
323 static FloatParts canonicalize(FloatParts part, const FloatFmt *parm,
324 float_status *status)
326 if (part.exp == parm->exp_max && !parm->arm_althp) {
327 if (part.frac == 0) {
328 part.cls = float_class_inf;
329 } else {
330 part.frac <<= parm->frac_shift;
331 part.cls = (parts_is_snan_frac(part.frac, status)
332 ? float_class_snan : float_class_qnan);
334 } else if (part.exp == 0) {
335 if (likely(part.frac == 0)) {
336 part.cls = float_class_zero;
337 } else if (status->flush_inputs_to_zero) {
338 float_raise(float_flag_input_denormal, status);
339 part.cls = float_class_zero;
340 part.frac = 0;
341 } else {
342 int shift = clz64(part.frac) - 1;
343 part.cls = float_class_normal;
344 part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
345 part.frac <<= shift;
347 } else {
348 part.cls = float_class_normal;
349 part.exp -= parm->exp_bias;
350 part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
352 return part;
355 /* Round and uncanonicalize a floating-point number by parts. There
356 * are FRAC_SHIFT bits that may require rounding at the bottom of the
357 * fraction; these bits will be removed. The exponent will be biased
358 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
361 static FloatParts round_canonical(FloatParts p, float_status *s,
362 const FloatFmt *parm)
364 const uint64_t frac_lsbm1 = parm->frac_lsbm1;
365 const uint64_t round_mask = parm->round_mask;
366 const uint64_t roundeven_mask = parm->roundeven_mask;
367 const int exp_max = parm->exp_max;
368 const int frac_shift = parm->frac_shift;
369 uint64_t frac, inc;
370 int exp, flags = 0;
371 bool overflow_norm;
373 frac = p.frac;
374 exp = p.exp;
376 switch (p.cls) {
377 case float_class_normal:
378 switch (s->float_rounding_mode) {
379 case float_round_nearest_even:
380 overflow_norm = false;
381 inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
382 break;
383 case float_round_ties_away:
384 overflow_norm = false;
385 inc = frac_lsbm1;
386 break;
387 case float_round_to_zero:
388 overflow_norm = true;
389 inc = 0;
390 break;
391 case float_round_up:
392 inc = p.sign ? 0 : round_mask;
393 overflow_norm = p.sign;
394 break;
395 case float_round_down:
396 inc = p.sign ? round_mask : 0;
397 overflow_norm = !p.sign;
398 break;
399 default:
400 g_assert_not_reached();
403 exp += parm->exp_bias;
404 if (likely(exp > 0)) {
405 if (frac & round_mask) {
406 flags |= float_flag_inexact;
407 frac += inc;
408 if (frac & DECOMPOSED_OVERFLOW_BIT) {
409 frac >>= 1;
410 exp++;
413 frac >>= frac_shift;
415 if (parm->arm_althp) {
416 /* ARM Alt HP eschews Inf and NaN for a wider exponent. */
417 if (unlikely(exp > exp_max)) {
418 /* Overflow. Return the maximum normal. */
419 flags = float_flag_invalid;
420 exp = exp_max;
421 frac = -1;
423 } else if (unlikely(exp >= exp_max)) {
424 flags |= float_flag_overflow | float_flag_inexact;
425 if (overflow_norm) {
426 exp = exp_max - 1;
427 frac = -1;
428 } else {
429 p.cls = float_class_inf;
430 goto do_inf;
433 } else if (s->flush_to_zero) {
434 flags |= float_flag_output_denormal;
435 p.cls = float_class_zero;
436 goto do_zero;
437 } else {
438 bool is_tiny = (s->float_detect_tininess
439 == float_tininess_before_rounding)
440 || (exp < 0)
441 || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);
443 shift64RightJamming(frac, 1 - exp, &frac);
444 if (frac & round_mask) {
445 /* Need to recompute round-to-even. */
446 if (s->float_rounding_mode == float_round_nearest_even) {
447 inc = ((frac & roundeven_mask) != frac_lsbm1
448 ? frac_lsbm1 : 0);
450 flags |= float_flag_inexact;
451 frac += inc;
454 exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
455 frac >>= frac_shift;
457 if (is_tiny && (flags & float_flag_inexact)) {
458 flags |= float_flag_underflow;
460 if (exp == 0 && frac == 0) {
461 p.cls = float_class_zero;
464 break;
466 case float_class_zero:
467 do_zero:
468 exp = 0;
469 frac = 0;
470 break;
472 case float_class_inf:
473 do_inf:
474 assert(!parm->arm_althp);
475 exp = exp_max;
476 frac = 0;
477 break;
479 case float_class_qnan:
480 case float_class_snan:
481 assert(!parm->arm_althp);
482 exp = exp_max;
483 frac >>= parm->frac_shift;
484 break;
486 default:
487 g_assert_not_reached();
490 float_raise(flags, s);
491 p.exp = exp;
492 p.frac = frac;
493 return p;
496 /* Explicit FloatFmt version */
497 static FloatParts float16a_unpack_canonical(float16 f, float_status *s,
498 const FloatFmt *params)
500 return canonicalize(float16_unpack_raw(f), params, s);
503 static FloatParts float16_unpack_canonical(float16 f, float_status *s)
505 return float16a_unpack_canonical(f, s, &float16_params);
508 static float16 float16a_round_pack_canonical(FloatParts p, float_status *s,
509 const FloatFmt *params)
511 return float16_pack_raw(round_canonical(p, s, params));
514 static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
516 return float16a_round_pack_canonical(p, s, &float16_params);
519 static FloatParts float32_unpack_canonical(float32 f, float_status *s)
521 return canonicalize(float32_unpack_raw(f), &float32_params, s);
524 static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
526 return float32_pack_raw(round_canonical(p, s, &float32_params));
529 static FloatParts float64_unpack_canonical(float64 f, float_status *s)
531 return canonicalize(float64_unpack_raw(f), &float64_params, s);
534 static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
536 return float64_pack_raw(round_canonical(p, s, &float64_params));
539 /* Simple helpers for checking if what NaN we have */
540 static bool is_nan(FloatClass c)
542 return unlikely(c >= float_class_qnan);
544 static bool is_snan(FloatClass c)
546 return c == float_class_snan;
548 static bool is_qnan(FloatClass c)
550 return c == float_class_qnan;
553 static FloatParts return_nan(FloatParts a, float_status *s)
555 switch (a.cls) {
556 case float_class_snan:
557 s->float_exception_flags |= float_flag_invalid;
558 a = parts_silence_nan(a, s);
559 /* fall through */
560 case float_class_qnan:
561 if (s->default_nan_mode) {
562 return parts_default_nan(s);
564 break;
566 default:
567 g_assert_not_reached();
569 return a;
572 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
574 if (is_snan(a.cls) || is_snan(b.cls)) {
575 s->float_exception_flags |= float_flag_invalid;
578 if (s->default_nan_mode) {
579 return parts_default_nan(s);
580 } else {
581 if (pickNaN(is_qnan(a.cls), is_snan(a.cls),
582 is_qnan(b.cls), is_snan(b.cls),
583 a.frac > b.frac ||
584 (a.frac == b.frac && a.sign < b.sign))) {
585 a = b;
587 if (is_snan(a.cls)) {
588 return parts_silence_nan(a, s);
591 return a;
594 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
595 bool inf_zero, float_status *s)
597 int which;
599 if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
600 s->float_exception_flags |= float_flag_invalid;
603 which = pickNaNMulAdd(is_qnan(a.cls), is_snan(a.cls),
604 is_qnan(b.cls), is_snan(b.cls),
605 is_qnan(c.cls), is_snan(c.cls),
606 inf_zero, s);
608 if (s->default_nan_mode) {
609 /* Note that this check is after pickNaNMulAdd so that function
610 * has an opportunity to set the Invalid flag.
612 which = 3;
615 switch (which) {
616 case 0:
617 break;
618 case 1:
619 a = b;
620 break;
621 case 2:
622 a = c;
623 break;
624 case 3:
625 return parts_default_nan(s);
626 default:
627 g_assert_not_reached();
630 if (is_snan(a.cls)) {
631 return parts_silence_nan(a, s);
633 return a;
637 * Returns the result of adding or subtracting the values of the
638 * floating-point values `a' and `b'. The operation is performed
639 * according to the IEC/IEEE Standard for Binary Floating-Point
640 * Arithmetic.
643 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
644 float_status *s)
646 bool a_sign = a.sign;
647 bool b_sign = b.sign ^ subtract;
649 if (a_sign != b_sign) {
650 /* Subtraction */
652 if (a.cls == float_class_normal && b.cls == float_class_normal) {
653 if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
654 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
655 a.frac = a.frac - b.frac;
656 } else {
657 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
658 a.frac = b.frac - a.frac;
659 a.exp = b.exp;
660 a_sign ^= 1;
663 if (a.frac == 0) {
664 a.cls = float_class_zero;
665 a.sign = s->float_rounding_mode == float_round_down;
666 } else {
667 int shift = clz64(a.frac) - 1;
668 a.frac = a.frac << shift;
669 a.exp = a.exp - shift;
670 a.sign = a_sign;
672 return a;
674 if (is_nan(a.cls) || is_nan(b.cls)) {
675 return pick_nan(a, b, s);
677 if (a.cls == float_class_inf) {
678 if (b.cls == float_class_inf) {
679 float_raise(float_flag_invalid, s);
680 return parts_default_nan(s);
682 return a;
684 if (a.cls == float_class_zero && b.cls == float_class_zero) {
685 a.sign = s->float_rounding_mode == float_round_down;
686 return a;
688 if (a.cls == float_class_zero || b.cls == float_class_inf) {
689 b.sign = a_sign ^ 1;
690 return b;
692 if (b.cls == float_class_zero) {
693 return a;
695 } else {
696 /* Addition */
697 if (a.cls == float_class_normal && b.cls == float_class_normal) {
698 if (a.exp > b.exp) {
699 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
700 } else if (a.exp < b.exp) {
701 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
702 a.exp = b.exp;
704 a.frac += b.frac;
705 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
706 a.frac >>= 1;
707 a.exp += 1;
709 return a;
711 if (is_nan(a.cls) || is_nan(b.cls)) {
712 return pick_nan(a, b, s);
714 if (a.cls == float_class_inf || b.cls == float_class_zero) {
715 return a;
717 if (b.cls == float_class_inf || a.cls == float_class_zero) {
718 b.sign = b_sign;
719 return b;
722 g_assert_not_reached();
726 * Returns the result of adding or subtracting the floating-point
727 * values `a' and `b'. The operation is performed according to the
728 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
731 float16 __attribute__((flatten)) float16_add(float16 a, float16 b,
732 float_status *status)
734 FloatParts pa = float16_unpack_canonical(a, status);
735 FloatParts pb = float16_unpack_canonical(b, status);
736 FloatParts pr = addsub_floats(pa, pb, false, status);
738 return float16_round_pack_canonical(pr, status);
741 float32 __attribute__((flatten)) float32_add(float32 a, float32 b,
742 float_status *status)
744 FloatParts pa = float32_unpack_canonical(a, status);
745 FloatParts pb = float32_unpack_canonical(b, status);
746 FloatParts pr = addsub_floats(pa, pb, false, status);
748 return float32_round_pack_canonical(pr, status);
751 float64 __attribute__((flatten)) float64_add(float64 a, float64 b,
752 float_status *status)
754 FloatParts pa = float64_unpack_canonical(a, status);
755 FloatParts pb = float64_unpack_canonical(b, status);
756 FloatParts pr = addsub_floats(pa, pb, false, status);
758 return float64_round_pack_canonical(pr, status);
761 float16 __attribute__((flatten)) float16_sub(float16 a, float16 b,
762 float_status *status)
764 FloatParts pa = float16_unpack_canonical(a, status);
765 FloatParts pb = float16_unpack_canonical(b, status);
766 FloatParts pr = addsub_floats(pa, pb, true, status);
768 return float16_round_pack_canonical(pr, status);
771 float32 __attribute__((flatten)) float32_sub(float32 a, float32 b,
772 float_status *status)
774 FloatParts pa = float32_unpack_canonical(a, status);
775 FloatParts pb = float32_unpack_canonical(b, status);
776 FloatParts pr = addsub_floats(pa, pb, true, status);
778 return float32_round_pack_canonical(pr, status);
781 float64 __attribute__((flatten)) float64_sub(float64 a, float64 b,
782 float_status *status)
784 FloatParts pa = float64_unpack_canonical(a, status);
785 FloatParts pb = float64_unpack_canonical(b, status);
786 FloatParts pr = addsub_floats(pa, pb, true, status);
788 return float64_round_pack_canonical(pr, status);
792 * Returns the result of multiplying the floating-point values `a' and
793 * `b'. The operation is performed according to the IEC/IEEE Standard
794 * for Binary Floating-Point Arithmetic.
797 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
799 bool sign = a.sign ^ b.sign;
801 if (a.cls == float_class_normal && b.cls == float_class_normal) {
802 uint64_t hi, lo;
803 int exp = a.exp + b.exp;
805 mul64To128(a.frac, b.frac, &hi, &lo);
806 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
807 if (lo & DECOMPOSED_OVERFLOW_BIT) {
808 shift64RightJamming(lo, 1, &lo);
809 exp += 1;
812 /* Re-use a */
813 a.exp = exp;
814 a.sign = sign;
815 a.frac = lo;
816 return a;
818 /* handle all the NaN cases */
819 if (is_nan(a.cls) || is_nan(b.cls)) {
820 return pick_nan(a, b, s);
822 /* Inf * Zero == NaN */
823 if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
824 (a.cls == float_class_zero && b.cls == float_class_inf)) {
825 s->float_exception_flags |= float_flag_invalid;
826 return parts_default_nan(s);
828 /* Multiply by 0 or Inf */
829 if (a.cls == float_class_inf || a.cls == float_class_zero) {
830 a.sign = sign;
831 return a;
833 if (b.cls == float_class_inf || b.cls == float_class_zero) {
834 b.sign = sign;
835 return b;
837 g_assert_not_reached();
840 float16 __attribute__((flatten)) float16_mul(float16 a, float16 b,
841 float_status *status)
843 FloatParts pa = float16_unpack_canonical(a, status);
844 FloatParts pb = float16_unpack_canonical(b, status);
845 FloatParts pr = mul_floats(pa, pb, status);
847 return float16_round_pack_canonical(pr, status);
850 float32 __attribute__((flatten)) float32_mul(float32 a, float32 b,
851 float_status *status)
853 FloatParts pa = float32_unpack_canonical(a, status);
854 FloatParts pb = float32_unpack_canonical(b, status);
855 FloatParts pr = mul_floats(pa, pb, status);
857 return float32_round_pack_canonical(pr, status);
860 float64 __attribute__((flatten)) float64_mul(float64 a, float64 b,
861 float_status *status)
863 FloatParts pa = float64_unpack_canonical(a, status);
864 FloatParts pb = float64_unpack_canonical(b, status);
865 FloatParts pr = mul_floats(pa, pb, status);
867 return float64_round_pack_canonical(pr, status);
871 * Returns the result of multiplying the floating-point values `a' and
872 * `b' then adding 'c', with no intermediate rounding step after the
873 * multiplication. The operation is performed according to the
874 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
875 * The flags argument allows the caller to select negation of the
876 * addend, the intermediate product, or the final result. (The
877 * difference between this and having the caller do a separate
878 * negation is that negating externally will flip the sign bit on
879 * NaNs.)
882 static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
883 int flags, float_status *s)
885 bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
886 ((1 << float_class_inf) | (1 << float_class_zero));
887 bool p_sign;
888 bool sign_flip = flags & float_muladd_negate_result;
889 FloatClass p_class;
890 uint64_t hi, lo;
891 int p_exp;
893 /* It is implementation-defined whether the cases of (0,inf,qnan)
894 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
895 * they return if they do), so we have to hand this information
896 * off to the target-specific pick-a-NaN routine.
898 if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
899 return pick_nan_muladd(a, b, c, inf_zero, s);
902 if (inf_zero) {
903 s->float_exception_flags |= float_flag_invalid;
904 return parts_default_nan(s);
907 if (flags & float_muladd_negate_c) {
908 c.sign ^= 1;
911 p_sign = a.sign ^ b.sign;
913 if (flags & float_muladd_negate_product) {
914 p_sign ^= 1;
917 if (a.cls == float_class_inf || b.cls == float_class_inf) {
918 p_class = float_class_inf;
919 } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
920 p_class = float_class_zero;
921 } else {
922 p_class = float_class_normal;
925 if (c.cls == float_class_inf) {
926 if (p_class == float_class_inf && p_sign != c.sign) {
927 s->float_exception_flags |= float_flag_invalid;
928 return parts_default_nan(s);
929 } else {
930 a.cls = float_class_inf;
931 a.sign = c.sign ^ sign_flip;
932 return a;
936 if (p_class == float_class_inf) {
937 a.cls = float_class_inf;
938 a.sign = p_sign ^ sign_flip;
939 return a;
942 if (p_class == float_class_zero) {
943 if (c.cls == float_class_zero) {
944 if (p_sign != c.sign) {
945 p_sign = s->float_rounding_mode == float_round_down;
947 c.sign = p_sign;
948 } else if (flags & float_muladd_halve_result) {
949 c.exp -= 1;
951 c.sign ^= sign_flip;
952 return c;
955 /* a & b should be normals now... */
956 assert(a.cls == float_class_normal &&
957 b.cls == float_class_normal);
959 p_exp = a.exp + b.exp;
961 /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
962 * result.
964 mul64To128(a.frac, b.frac, &hi, &lo);
965 /* binary point now at bit 124 */
967 /* check for overflow */
968 if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
969 shift128RightJamming(hi, lo, 1, &hi, &lo);
970 p_exp += 1;
973 /* + add/sub */
974 if (c.cls == float_class_zero) {
975 /* move binary point back to 62 */
976 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
977 } else {
978 int exp_diff = p_exp - c.exp;
979 if (p_sign == c.sign) {
980 /* Addition */
981 if (exp_diff <= 0) {
982 shift128RightJamming(hi, lo,
983 DECOMPOSED_BINARY_POINT - exp_diff,
984 &hi, &lo);
985 lo += c.frac;
986 p_exp = c.exp;
987 } else {
988 uint64_t c_hi, c_lo;
989 /* shift c to the same binary point as the product (124) */
990 c_hi = c.frac >> 2;
991 c_lo = 0;
992 shift128RightJamming(c_hi, c_lo,
993 exp_diff,
994 &c_hi, &c_lo);
995 add128(hi, lo, c_hi, c_lo, &hi, &lo);
996 /* move binary point back to 62 */
997 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1000 if (lo & DECOMPOSED_OVERFLOW_BIT) {
1001 shift64RightJamming(lo, 1, &lo);
1002 p_exp += 1;
1005 } else {
1006 /* Subtraction */
1007 uint64_t c_hi, c_lo;
1008 /* make C binary point match product at bit 124 */
1009 c_hi = c.frac >> 2;
1010 c_lo = 0;
1012 if (exp_diff <= 0) {
1013 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
1014 if (exp_diff == 0
1016 (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
1017 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1018 } else {
1019 sub128(c_hi, c_lo, hi, lo, &hi, &lo);
1020 p_sign ^= 1;
1021 p_exp = c.exp;
1023 } else {
1024 shift128RightJamming(c_hi, c_lo,
1025 exp_diff,
1026 &c_hi, &c_lo);
1027 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1030 if (hi == 0 && lo == 0) {
1031 a.cls = float_class_zero;
1032 a.sign = s->float_rounding_mode == float_round_down;
1033 a.sign ^= sign_flip;
1034 return a;
1035 } else {
1036 int shift;
1037 if (hi != 0) {
1038 shift = clz64(hi);
1039 } else {
1040 shift = clz64(lo) + 64;
1042 /* Normalizing to a binary point of 124 is the
1043 correct adjust for the exponent. However since we're
1044 shifting, we might as well put the binary point back
1045 at 62 where we really want it. Therefore shift as
1046 if we're leaving 1 bit at the top of the word, but
1047 adjust the exponent as if we're leaving 3 bits. */
1048 shift -= 1;
1049 if (shift >= 64) {
1050 lo = lo << (shift - 64);
1051 } else {
1052 hi = (hi << shift) | (lo >> (64 - shift));
1053 lo = hi | ((lo << shift) != 0);
1055 p_exp -= shift - 2;
1060 if (flags & float_muladd_halve_result) {
1061 p_exp -= 1;
1064 /* finally prepare our result */
1065 a.cls = float_class_normal;
1066 a.sign = p_sign ^ sign_flip;
1067 a.exp = p_exp;
1068 a.frac = lo;
1070 return a;
1073 float16 __attribute__((flatten)) float16_muladd(float16 a, float16 b, float16 c,
1074 int flags, float_status *status)
1076 FloatParts pa = float16_unpack_canonical(a, status);
1077 FloatParts pb = float16_unpack_canonical(b, status);
1078 FloatParts pc = float16_unpack_canonical(c, status);
1079 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1081 return float16_round_pack_canonical(pr, status);
1084 float32 __attribute__((flatten)) float32_muladd(float32 a, float32 b, float32 c,
1085 int flags, float_status *status)
1087 FloatParts pa = float32_unpack_canonical(a, status);
1088 FloatParts pb = float32_unpack_canonical(b, status);
1089 FloatParts pc = float32_unpack_canonical(c, status);
1090 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1092 return float32_round_pack_canonical(pr, status);
1095 float64 __attribute__((flatten)) float64_muladd(float64 a, float64 b, float64 c,
1096 int flags, float_status *status)
1098 FloatParts pa = float64_unpack_canonical(a, status);
1099 FloatParts pb = float64_unpack_canonical(b, status);
1100 FloatParts pc = float64_unpack_canonical(c, status);
1101 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1103 return float64_round_pack_canonical(pr, status);
1107 * Returns the result of dividing the floating-point value `a' by the
1108 * corresponding value `b'. The operation is performed according to
1109 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1112 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
1114 bool sign = a.sign ^ b.sign;
1116 if (a.cls == float_class_normal && b.cls == float_class_normal) {
1117 uint64_t temp_lo, temp_hi;
1118 int exp = a.exp - b.exp;
1119 if (a.frac < b.frac) {
1120 exp -= 1;
1121 shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1,
1122 &temp_hi, &temp_lo);
1123 } else {
1124 shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT,
1125 &temp_hi, &temp_lo);
1127 /* LSB of quot is set if inexact which roundandpack will use
1128 * to set flags. Yet again we re-use a for the result */
1129 a.frac = div128To64(temp_lo, temp_hi, b.frac);
1130 a.sign = sign;
1131 a.exp = exp;
1132 return a;
1134 /* handle all the NaN cases */
1135 if (is_nan(a.cls) || is_nan(b.cls)) {
1136 return pick_nan(a, b, s);
1138 /* 0/0 or Inf/Inf */
1139 if (a.cls == b.cls
1141 (a.cls == float_class_inf || a.cls == float_class_zero)) {
1142 s->float_exception_flags |= float_flag_invalid;
1143 return parts_default_nan(s);
1145 /* Inf / x or 0 / x */
1146 if (a.cls == float_class_inf || a.cls == float_class_zero) {
1147 a.sign = sign;
1148 return a;
1150 /* Div 0 => Inf */
1151 if (b.cls == float_class_zero) {
1152 s->float_exception_flags |= float_flag_divbyzero;
1153 a.cls = float_class_inf;
1154 a.sign = sign;
1155 return a;
1157 /* Div by Inf */
1158 if (b.cls == float_class_inf) {
1159 a.cls = float_class_zero;
1160 a.sign = sign;
1161 return a;
1163 g_assert_not_reached();
1166 float16 float16_div(float16 a, float16 b, float_status *status)
1168 FloatParts pa = float16_unpack_canonical(a, status);
1169 FloatParts pb = float16_unpack_canonical(b, status);
1170 FloatParts pr = div_floats(pa, pb, status);
1172 return float16_round_pack_canonical(pr, status);
1175 float32 float32_div(float32 a, float32 b, float_status *status)
1177 FloatParts pa = float32_unpack_canonical(a, status);
1178 FloatParts pb = float32_unpack_canonical(b, status);
1179 FloatParts pr = div_floats(pa, pb, status);
1181 return float32_round_pack_canonical(pr, status);
1184 float64 float64_div(float64 a, float64 b, float_status *status)
1186 FloatParts pa = float64_unpack_canonical(a, status);
1187 FloatParts pb = float64_unpack_canonical(b, status);
1188 FloatParts pr = div_floats(pa, pb, status);
1190 return float64_round_pack_canonical(pr, status);
1194 * Float to Float conversions
1196 * Returns the result of converting one float format to another. The
1197 * conversion is performed according to the IEC/IEEE Standard for
1198 * Binary Floating-Point Arithmetic.
1200 * The float_to_float helper only needs to take care of raising
1201 * invalid exceptions and handling the conversion on NaNs.
1204 static FloatParts float_to_float(FloatParts a, const FloatFmt *dstf,
1205 float_status *s)
1207 if (dstf->arm_althp) {
1208 switch (a.cls) {
1209 case float_class_qnan:
1210 case float_class_snan:
1211 /* There is no NaN in the destination format. Raise Invalid
1212 * and return a zero with the sign of the input NaN.
1214 s->float_exception_flags |= float_flag_invalid;
1215 a.cls = float_class_zero;
1216 a.frac = 0;
1217 a.exp = 0;
1218 break;
1220 case float_class_inf:
1221 /* There is no Inf in the destination format. Raise Invalid
1222 * and return the maximum normal with the correct sign.
1224 s->float_exception_flags |= float_flag_invalid;
1225 a.cls = float_class_normal;
1226 a.exp = dstf->exp_max;
1227 a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift;
1228 break;
1230 default:
1231 break;
1233 } else if (is_nan(a.cls)) {
1234 if (is_snan(a.cls)) {
1235 s->float_exception_flags |= float_flag_invalid;
1236 a = parts_silence_nan(a, s);
1238 if (s->default_nan_mode) {
1239 return parts_default_nan(s);
1242 return a;
1245 float32 float16_to_float32(float16 a, bool ieee, float_status *s)
1247 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1248 FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1249 FloatParts pr = float_to_float(p, &float32_params, s);
1250 return float32_round_pack_canonical(pr, s);
1253 float64 float16_to_float64(float16 a, bool ieee, float_status *s)
1255 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1256 FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1257 FloatParts pr = float_to_float(p, &float64_params, s);
1258 return float64_round_pack_canonical(pr, s);
1261 float16 float32_to_float16(float32 a, bool ieee, float_status *s)
1263 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1264 FloatParts p = float32_unpack_canonical(a, s);
1265 FloatParts pr = float_to_float(p, fmt16, s);
1266 return float16a_round_pack_canonical(pr, s, fmt16);
1269 float64 float32_to_float64(float32 a, float_status *s)
1271 FloatParts p = float32_unpack_canonical(a, s);
1272 FloatParts pr = float_to_float(p, &float64_params, s);
1273 return float64_round_pack_canonical(pr, s);
1276 float16 float64_to_float16(float64 a, bool ieee, float_status *s)
1278 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1279 FloatParts p = float64_unpack_canonical(a, s);
1280 FloatParts pr = float_to_float(p, fmt16, s);
1281 return float16a_round_pack_canonical(pr, s, fmt16);
1284 float32 float64_to_float32(float64 a, float_status *s)
1286 FloatParts p = float64_unpack_canonical(a, s);
1287 FloatParts pr = float_to_float(p, &float32_params, s);
1288 return float32_round_pack_canonical(pr, s);
1292 * Rounds the floating-point value `a' to an integer, and returns the
1293 * result as a floating-point value. The operation is performed
1294 * according to the IEC/IEEE Standard for Binary Floating-Point
1295 * Arithmetic.
1298 static FloatParts round_to_int(FloatParts a, int rounding_mode, float_status *s)
1300 if (is_nan(a.cls)) {
1301 return return_nan(a, s);
1304 switch (a.cls) {
1305 case float_class_zero:
1306 case float_class_inf:
1307 case float_class_qnan:
1308 /* already "integral" */
1309 break;
1310 case float_class_normal:
1311 if (a.exp >= DECOMPOSED_BINARY_POINT) {
1312 /* already integral */
1313 break;
1315 if (a.exp < 0) {
1316 bool one;
1317 /* all fractional */
1318 s->float_exception_flags |= float_flag_inexact;
1319 switch (rounding_mode) {
1320 case float_round_nearest_even:
1321 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
1322 break;
1323 case float_round_ties_away:
1324 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
1325 break;
1326 case float_round_to_zero:
1327 one = false;
1328 break;
1329 case float_round_up:
1330 one = !a.sign;
1331 break;
1332 case float_round_down:
1333 one = a.sign;
1334 break;
1335 default:
1336 g_assert_not_reached();
1339 if (one) {
1340 a.frac = DECOMPOSED_IMPLICIT_BIT;
1341 a.exp = 0;
1342 } else {
1343 a.cls = float_class_zero;
1345 } else {
1346 uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
1347 uint64_t frac_lsbm1 = frac_lsb >> 1;
1348 uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
1349 uint64_t rnd_mask = rnd_even_mask >> 1;
1350 uint64_t inc;
1352 switch (rounding_mode) {
1353 case float_round_nearest_even:
1354 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
1355 break;
1356 case float_round_ties_away:
1357 inc = frac_lsbm1;
1358 break;
1359 case float_round_to_zero:
1360 inc = 0;
1361 break;
1362 case float_round_up:
1363 inc = a.sign ? 0 : rnd_mask;
1364 break;
1365 case float_round_down:
1366 inc = a.sign ? rnd_mask : 0;
1367 break;
1368 default:
1369 g_assert_not_reached();
1372 if (a.frac & rnd_mask) {
1373 s->float_exception_flags |= float_flag_inexact;
1374 a.frac += inc;
1375 a.frac &= ~rnd_mask;
1376 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
1377 a.frac >>= 1;
1378 a.exp++;
1382 break;
1383 default:
1384 g_assert_not_reached();
1386 return a;
1389 float16 float16_round_to_int(float16 a, float_status *s)
1391 FloatParts pa = float16_unpack_canonical(a, s);
1392 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1393 return float16_round_pack_canonical(pr, s);
1396 float32 float32_round_to_int(float32 a, float_status *s)
1398 FloatParts pa = float32_unpack_canonical(a, s);
1399 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1400 return float32_round_pack_canonical(pr, s);
1403 float64 float64_round_to_int(float64 a, float_status *s)
1405 FloatParts pa = float64_unpack_canonical(a, s);
1406 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1407 return float64_round_pack_canonical(pr, s);
1410 float64 float64_trunc_to_int(float64 a, float_status *s)
1412 FloatParts pa = float64_unpack_canonical(a, s);
1413 FloatParts pr = round_to_int(pa, float_round_to_zero, s);
1414 return float64_round_pack_canonical(pr, s);
1418 * Returns the result of converting the floating-point value `a' to
1419 * the two's complement integer format. The conversion is performed
1420 * according to the IEC/IEEE Standard for Binary Floating-Point
1421 * Arithmetic---which means in particular that the conversion is
1422 * rounded according to the current rounding mode. If `a' is a NaN,
1423 * the largest positive integer is returned. Otherwise, if the
1424 * conversion overflows, the largest integer with the same sign as `a'
1425 * is returned.
1428 static int64_t round_to_int_and_pack(FloatParts in, int rmode,
1429 int64_t min, int64_t max,
1430 float_status *s)
1432 uint64_t r;
1433 int orig_flags = get_float_exception_flags(s);
1434 FloatParts p = round_to_int(in, rmode, s);
1436 switch (p.cls) {
1437 case float_class_snan:
1438 case float_class_qnan:
1439 s->float_exception_flags = orig_flags | float_flag_invalid;
1440 return max;
1441 case float_class_inf:
1442 s->float_exception_flags = orig_flags | float_flag_invalid;
1443 return p.sign ? min : max;
1444 case float_class_zero:
1445 return 0;
1446 case float_class_normal:
1447 if (p.exp < DECOMPOSED_BINARY_POINT) {
1448 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1449 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1450 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1451 } else {
1452 r = UINT64_MAX;
1454 if (p.sign) {
1455 if (r <= -(uint64_t) min) {
1456 return -r;
1457 } else {
1458 s->float_exception_flags = orig_flags | float_flag_invalid;
1459 return min;
1461 } else {
1462 if (r <= max) {
1463 return r;
1464 } else {
1465 s->float_exception_flags = orig_flags | float_flag_invalid;
1466 return max;
1469 default:
1470 g_assert_not_reached();
1474 #define FLOAT_TO_INT(fsz, isz) \
1475 int ## isz ## _t float ## fsz ## _to_int ## isz(float ## fsz a, \
1476 float_status *s) \
1478 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1479 return round_to_int_and_pack(p, s->float_rounding_mode, \
1480 INT ## isz ## _MIN, INT ## isz ## _MAX,\
1481 s); \
1484 int ## isz ## _t float ## fsz ## _to_int ## isz ## _round_to_zero \
1485 (float ## fsz a, float_status *s) \
1487 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1488 return round_to_int_and_pack(p, float_round_to_zero, \
1489 INT ## isz ## _MIN, INT ## isz ## _MAX,\
1490 s); \
1493 FLOAT_TO_INT(16, 16)
1494 FLOAT_TO_INT(16, 32)
1495 FLOAT_TO_INT(16, 64)
1497 FLOAT_TO_INT(32, 16)
1498 FLOAT_TO_INT(32, 32)
1499 FLOAT_TO_INT(32, 64)
1501 FLOAT_TO_INT(64, 16)
1502 FLOAT_TO_INT(64, 32)
1503 FLOAT_TO_INT(64, 64)
1505 #undef FLOAT_TO_INT
1508 * Returns the result of converting the floating-point value `a' to
1509 * the unsigned integer format. The conversion is performed according
1510 * to the IEC/IEEE Standard for Binary Floating-Point
1511 * Arithmetic---which means in particular that the conversion is
1512 * rounded according to the current rounding mode. If `a' is a NaN,
1513 * the largest unsigned integer is returned. Otherwise, if the
1514 * conversion overflows, the largest unsigned integer is returned. If
1515 * the 'a' is negative, the result is rounded and zero is returned;
1516 * values that do not round to zero will raise the inexact exception
1517 * flag.
1520 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, uint64_t max,
1521 float_status *s)
1523 int orig_flags = get_float_exception_flags(s);
1524 FloatParts p = round_to_int(in, rmode, s);
1526 switch (p.cls) {
1527 case float_class_snan:
1528 case float_class_qnan:
1529 s->float_exception_flags = orig_flags | float_flag_invalid;
1530 return max;
1531 case float_class_inf:
1532 s->float_exception_flags = orig_flags | float_flag_invalid;
1533 return p.sign ? 0 : max;
1534 case float_class_zero:
1535 return 0;
1536 case float_class_normal:
1538 uint64_t r;
1539 if (p.sign) {
1540 s->float_exception_flags = orig_flags | float_flag_invalid;
1541 return 0;
1544 if (p.exp < DECOMPOSED_BINARY_POINT) {
1545 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1546 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1547 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1548 } else {
1549 s->float_exception_flags = orig_flags | float_flag_invalid;
1550 return max;
1553 /* For uint64 this will never trip, but if p.exp is too large
1554 * to shift a decomposed fraction we shall have exited via the
1555 * 3rd leg above.
1557 if (r > max) {
1558 s->float_exception_flags = orig_flags | float_flag_invalid;
1559 return max;
1560 } else {
1561 return r;
1564 default:
1565 g_assert_not_reached();
1569 #define FLOAT_TO_UINT(fsz, isz) \
1570 uint ## isz ## _t float ## fsz ## _to_uint ## isz(float ## fsz a, \
1571 float_status *s) \
1573 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1574 return round_to_uint_and_pack(p, s->float_rounding_mode, \
1575 UINT ## isz ## _MAX, s); \
1578 uint ## isz ## _t float ## fsz ## _to_uint ## isz ## _round_to_zero \
1579 (float ## fsz a, float_status *s) \
1581 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1582 return round_to_uint_and_pack(p, float_round_to_zero, \
1583 UINT ## isz ## _MAX, s); \
1586 FLOAT_TO_UINT(16, 16)
1587 FLOAT_TO_UINT(16, 32)
1588 FLOAT_TO_UINT(16, 64)
1590 FLOAT_TO_UINT(32, 16)
1591 FLOAT_TO_UINT(32, 32)
1592 FLOAT_TO_UINT(32, 64)
1594 FLOAT_TO_UINT(64, 16)
1595 FLOAT_TO_UINT(64, 32)
1596 FLOAT_TO_UINT(64, 64)
1598 #undef FLOAT_TO_UINT
1601 * Integer to float conversions
1603 * Returns the result of converting the two's complement integer `a'
1604 * to the floating-point format. The conversion is performed according
1605 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1608 static FloatParts int_to_float(int64_t a, float_status *status)
1610 FloatParts r = {};
1611 if (a == 0) {
1612 r.cls = float_class_zero;
1613 r.sign = false;
1614 } else if (a == (1ULL << 63)) {
1615 r.cls = float_class_normal;
1616 r.sign = true;
1617 r.frac = DECOMPOSED_IMPLICIT_BIT;
1618 r.exp = 63;
1619 } else {
1620 uint64_t f;
1621 if (a < 0) {
1622 f = -a;
1623 r.sign = true;
1624 } else {
1625 f = a;
1626 r.sign = false;
1628 int shift = clz64(f) - 1;
1629 r.cls = float_class_normal;
1630 r.exp = (DECOMPOSED_BINARY_POINT - shift);
1631 r.frac = f << shift;
1634 return r;
1637 float16 int64_to_float16(int64_t a, float_status *status)
1639 FloatParts pa = int_to_float(a, status);
1640 return float16_round_pack_canonical(pa, status);
1643 float16 int32_to_float16(int32_t a, float_status *status)
1645 return int64_to_float16(a, status);
1648 float16 int16_to_float16(int16_t a, float_status *status)
1650 return int64_to_float16(a, status);
1653 float32 int64_to_float32(int64_t a, float_status *status)
1655 FloatParts pa = int_to_float(a, status);
1656 return float32_round_pack_canonical(pa, status);
1659 float32 int32_to_float32(int32_t a, float_status *status)
1661 return int64_to_float32(a, status);
1664 float32 int16_to_float32(int16_t a, float_status *status)
1666 return int64_to_float32(a, status);
1669 float64 int64_to_float64(int64_t a, float_status *status)
1671 FloatParts pa = int_to_float(a, status);
1672 return float64_round_pack_canonical(pa, status);
1675 float64 int32_to_float64(int32_t a, float_status *status)
1677 return int64_to_float64(a, status);
1680 float64 int16_to_float64(int16_t a, float_status *status)
1682 return int64_to_float64(a, status);
1687 * Unsigned Integer to float conversions
1689 * Returns the result of converting the unsigned integer `a' to the
1690 * floating-point format. The conversion is performed according to the
1691 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1694 static FloatParts uint_to_float(uint64_t a, float_status *status)
1696 FloatParts r = { .sign = false};
1698 if (a == 0) {
1699 r.cls = float_class_zero;
1700 } else {
1701 int spare_bits = clz64(a) - 1;
1702 r.cls = float_class_normal;
1703 r.exp = DECOMPOSED_BINARY_POINT - spare_bits;
1704 if (spare_bits < 0) {
1705 shift64RightJamming(a, -spare_bits, &a);
1706 r.frac = a;
1707 } else {
1708 r.frac = a << spare_bits;
1712 return r;
1715 float16 uint64_to_float16(uint64_t a, float_status *status)
1717 FloatParts pa = uint_to_float(a, status);
1718 return float16_round_pack_canonical(pa, status);
1721 float16 uint32_to_float16(uint32_t a, float_status *status)
1723 return uint64_to_float16(a, status);
1726 float16 uint16_to_float16(uint16_t a, float_status *status)
1728 return uint64_to_float16(a, status);
1731 float32 uint64_to_float32(uint64_t a, float_status *status)
1733 FloatParts pa = uint_to_float(a, status);
1734 return float32_round_pack_canonical(pa, status);
1737 float32 uint32_to_float32(uint32_t a, float_status *status)
1739 return uint64_to_float32(a, status);
1742 float32 uint16_to_float32(uint16_t a, float_status *status)
1744 return uint64_to_float32(a, status);
1747 float64 uint64_to_float64(uint64_t a, float_status *status)
1749 FloatParts pa = uint_to_float(a, status);
1750 return float64_round_pack_canonical(pa, status);
1753 float64 uint32_to_float64(uint32_t a, float_status *status)
1755 return uint64_to_float64(a, status);
1758 float64 uint16_to_float64(uint16_t a, float_status *status)
1760 return uint64_to_float64(a, status);
1763 /* Float Min/Max */
1764 /* min() and max() functions. These can't be implemented as
1765 * 'compare and pick one input' because that would mishandle
1766 * NaNs and +0 vs -0.
1768 * minnum() and maxnum() functions. These are similar to the min()
1769 * and max() functions but if one of the arguments is a QNaN and
1770 * the other is numerical then the numerical argument is returned.
1771 * SNaNs will get quietened before being returned.
1772 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
1773 * and maxNum() operations. min() and max() are the typical min/max
1774 * semantics provided by many CPUs which predate that specification.
1776 * minnummag() and maxnummag() functions correspond to minNumMag()
1777 * and minNumMag() from the IEEE-754 2008.
1779 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin,
1780 bool ieee, bool ismag, float_status *s)
1782 if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
1783 if (ieee) {
1784 /* Takes two floating-point values `a' and `b', one of
1785 * which is a NaN, and returns the appropriate NaN
1786 * result. If either `a' or `b' is a signaling NaN,
1787 * the invalid exception is raised.
1789 if (is_snan(a.cls) || is_snan(b.cls)) {
1790 return pick_nan(a, b, s);
1791 } else if (is_nan(a.cls) && !is_nan(b.cls)) {
1792 return b;
1793 } else if (is_nan(b.cls) && !is_nan(a.cls)) {
1794 return a;
1797 return pick_nan(a, b, s);
1798 } else {
1799 int a_exp, b_exp;
1801 switch (a.cls) {
1802 case float_class_normal:
1803 a_exp = a.exp;
1804 break;
1805 case float_class_inf:
1806 a_exp = INT_MAX;
1807 break;
1808 case float_class_zero:
1809 a_exp = INT_MIN;
1810 break;
1811 default:
1812 g_assert_not_reached();
1813 break;
1815 switch (b.cls) {
1816 case float_class_normal:
1817 b_exp = b.exp;
1818 break;
1819 case float_class_inf:
1820 b_exp = INT_MAX;
1821 break;
1822 case float_class_zero:
1823 b_exp = INT_MIN;
1824 break;
1825 default:
1826 g_assert_not_reached();
1827 break;
1830 if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
1831 bool a_less = a_exp < b_exp;
1832 if (a_exp == b_exp) {
1833 a_less = a.frac < b.frac;
1835 return a_less ^ ismin ? b : a;
1838 if (a.sign == b.sign) {
1839 bool a_less = a_exp < b_exp;
1840 if (a_exp == b_exp) {
1841 a_less = a.frac < b.frac;
1843 return a.sign ^ a_less ^ ismin ? b : a;
1844 } else {
1845 return a.sign ^ ismin ? b : a;
1850 #define MINMAX(sz, name, ismin, isiee, ismag) \
1851 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \
1852 float_status *s) \
1854 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
1855 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
1856 FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \
1858 return float ## sz ## _round_pack_canonical(pr, s); \
1861 MINMAX(16, min, true, false, false)
1862 MINMAX(16, minnum, true, true, false)
1863 MINMAX(16, minnummag, true, true, true)
1864 MINMAX(16, max, false, false, false)
1865 MINMAX(16, maxnum, false, true, false)
1866 MINMAX(16, maxnummag, false, true, true)
1868 MINMAX(32, min, true, false, false)
1869 MINMAX(32, minnum, true, true, false)
1870 MINMAX(32, minnummag, true, true, true)
1871 MINMAX(32, max, false, false, false)
1872 MINMAX(32, maxnum, false, true, false)
1873 MINMAX(32, maxnummag, false, true, true)
1875 MINMAX(64, min, true, false, false)
1876 MINMAX(64, minnum, true, true, false)
1877 MINMAX(64, minnummag, true, true, true)
1878 MINMAX(64, max, false, false, false)
1879 MINMAX(64, maxnum, false, true, false)
1880 MINMAX(64, maxnummag, false, true, true)
1882 #undef MINMAX
1884 /* Floating point compare */
1885 static int compare_floats(FloatParts a, FloatParts b, bool is_quiet,
1886 float_status *s)
1888 if (is_nan(a.cls) || is_nan(b.cls)) {
1889 if (!is_quiet ||
1890 a.cls == float_class_snan ||
1891 b.cls == float_class_snan) {
1892 s->float_exception_flags |= float_flag_invalid;
1894 return float_relation_unordered;
1897 if (a.cls == float_class_zero) {
1898 if (b.cls == float_class_zero) {
1899 return float_relation_equal;
1901 return b.sign ? float_relation_greater : float_relation_less;
1902 } else if (b.cls == float_class_zero) {
1903 return a.sign ? float_relation_less : float_relation_greater;
1906 /* The only really important thing about infinity is its sign. If
1907 * both are infinities the sign marks the smallest of the two.
1909 if (a.cls == float_class_inf) {
1910 if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
1911 return float_relation_equal;
1913 return a.sign ? float_relation_less : float_relation_greater;
1914 } else if (b.cls == float_class_inf) {
1915 return b.sign ? float_relation_greater : float_relation_less;
1918 if (a.sign != b.sign) {
1919 return a.sign ? float_relation_less : float_relation_greater;
1922 if (a.exp == b.exp) {
1923 if (a.frac == b.frac) {
1924 return float_relation_equal;
1926 if (a.sign) {
1927 return a.frac > b.frac ?
1928 float_relation_less : float_relation_greater;
1929 } else {
1930 return a.frac > b.frac ?
1931 float_relation_greater : float_relation_less;
1933 } else {
1934 if (a.sign) {
1935 return a.exp > b.exp ? float_relation_less : float_relation_greater;
1936 } else {
1937 return a.exp > b.exp ? float_relation_greater : float_relation_less;
1942 #define COMPARE(sz) \
1943 int float ## sz ## _compare(float ## sz a, float ## sz b, \
1944 float_status *s) \
1946 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
1947 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
1948 return compare_floats(pa, pb, false, s); \
1950 int float ## sz ## _compare_quiet(float ## sz a, float ## sz b, \
1951 float_status *s) \
1953 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
1954 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
1955 return compare_floats(pa, pb, true, s); \
1958 COMPARE(16)
1959 COMPARE(32)
1960 COMPARE(64)
1962 #undef COMPARE
1964 /* Multiply A by 2 raised to the power N. */
1965 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s)
1967 if (unlikely(is_nan(a.cls))) {
1968 return return_nan(a, s);
1970 if (a.cls == float_class_normal) {
1971 /* The largest float type (even though not supported by FloatParts)
1972 * is float128, which has a 15 bit exponent. Bounding N to 16 bits
1973 * still allows rounding to infinity, without allowing overflow
1974 * within the int32_t that backs FloatParts.exp.
1976 n = MIN(MAX(n, -0x10000), 0x10000);
1977 a.exp += n;
1979 return a;
1982 float16 float16_scalbn(float16 a, int n, float_status *status)
1984 FloatParts pa = float16_unpack_canonical(a, status);
1985 FloatParts pr = scalbn_decomposed(pa, n, status);
1986 return float16_round_pack_canonical(pr, status);
1989 float32 float32_scalbn(float32 a, int n, float_status *status)
1991 FloatParts pa = float32_unpack_canonical(a, status);
1992 FloatParts pr = scalbn_decomposed(pa, n, status);
1993 return float32_round_pack_canonical(pr, status);
1996 float64 float64_scalbn(float64 a, int n, float_status *status)
1998 FloatParts pa = float64_unpack_canonical(a, status);
1999 FloatParts pr = scalbn_decomposed(pa, n, status);
2000 return float64_round_pack_canonical(pr, status);
2004 * Square Root
2006 * The old softfloat code did an approximation step before zeroing in
2007 * on the final result. However for simpleness we just compute the
2008 * square root by iterating down from the implicit bit to enough extra
2009 * bits to ensure we get a correctly rounded result.
2011 * This does mean however the calculation is slower than before,
2012 * especially for 64 bit floats.
2015 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p)
2017 uint64_t a_frac, r_frac, s_frac;
2018 int bit, last_bit;
2020 if (is_nan(a.cls)) {
2021 return return_nan(a, s);
2023 if (a.cls == float_class_zero) {
2024 return a; /* sqrt(+-0) = +-0 */
2026 if (a.sign) {
2027 s->float_exception_flags |= float_flag_invalid;
2028 return parts_default_nan(s);
2030 if (a.cls == float_class_inf) {
2031 return a; /* sqrt(+inf) = +inf */
2034 assert(a.cls == float_class_normal);
2036 /* We need two overflow bits at the top. Adding room for that is a
2037 * right shift. If the exponent is odd, we can discard the low bit
2038 * by multiplying the fraction by 2; that's a left shift. Combine
2039 * those and we shift right if the exponent is even.
2041 a_frac = a.frac;
2042 if (!(a.exp & 1)) {
2043 a_frac >>= 1;
2045 a.exp >>= 1;
2047 /* Bit-by-bit computation of sqrt. */
2048 r_frac = 0;
2049 s_frac = 0;
2051 /* Iterate from implicit bit down to the 3 extra bits to compute a
2052 * properly rounded result. Remember we've inserted one more bit
2053 * at the top, so these positions are one less.
2055 bit = DECOMPOSED_BINARY_POINT - 1;
2056 last_bit = MAX(p->frac_shift - 4, 0);
2057 do {
2058 uint64_t q = 1ULL << bit;
2059 uint64_t t_frac = s_frac + q;
2060 if (t_frac <= a_frac) {
2061 s_frac = t_frac + q;
2062 a_frac -= t_frac;
2063 r_frac += q;
2065 a_frac <<= 1;
2066 } while (--bit >= last_bit);
2068 /* Undo the right shift done above. If there is any remaining
2069 * fraction, the result is inexact. Set the sticky bit.
2071 a.frac = (r_frac << 1) + (a_frac != 0);
2073 return a;
2076 float16 __attribute__((flatten)) float16_sqrt(float16 a, float_status *status)
2078 FloatParts pa = float16_unpack_canonical(a, status);
2079 FloatParts pr = sqrt_float(pa, status, &float16_params);
2080 return float16_round_pack_canonical(pr, status);
2083 float32 __attribute__((flatten)) float32_sqrt(float32 a, float_status *status)
2085 FloatParts pa = float32_unpack_canonical(a, status);
2086 FloatParts pr = sqrt_float(pa, status, &float32_params);
2087 return float32_round_pack_canonical(pr, status);
2090 float64 __attribute__((flatten)) float64_sqrt(float64 a, float_status *status)
2092 FloatParts pa = float64_unpack_canonical(a, status);
2093 FloatParts pr = sqrt_float(pa, status, &float64_params);
2094 return float64_round_pack_canonical(pr, status);
2098 /*----------------------------------------------------------------------------
2099 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
2100 | and 7, and returns the properly rounded 32-bit integer corresponding to the
2101 | input. If `zSign' is 1, the input is negated before being converted to an
2102 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
2103 | is simply rounded to an integer, with the inexact exception raised if the
2104 | input cannot be represented exactly as an integer. However, if the fixed-
2105 | point input is too large, the invalid exception is raised and the largest
2106 | positive or negative integer is returned.
2107 *----------------------------------------------------------------------------*/
2109 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
2111 int8_t roundingMode;
2112 flag roundNearestEven;
2113 int8_t roundIncrement, roundBits;
2114 int32_t z;
2116 roundingMode = status->float_rounding_mode;
2117 roundNearestEven = ( roundingMode == float_round_nearest_even );
2118 switch (roundingMode) {
2119 case float_round_nearest_even:
2120 case float_round_ties_away:
2121 roundIncrement = 0x40;
2122 break;
2123 case float_round_to_zero:
2124 roundIncrement = 0;
2125 break;
2126 case float_round_up:
2127 roundIncrement = zSign ? 0 : 0x7f;
2128 break;
2129 case float_round_down:
2130 roundIncrement = zSign ? 0x7f : 0;
2131 break;
2132 default:
2133 abort();
2135 roundBits = absZ & 0x7F;
2136 absZ = ( absZ + roundIncrement )>>7;
2137 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2138 z = absZ;
2139 if ( zSign ) z = - z;
2140 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
2141 float_raise(float_flag_invalid, status);
2142 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2144 if (roundBits) {
2145 status->float_exception_flags |= float_flag_inexact;
2147 return z;
2151 /*----------------------------------------------------------------------------
2152 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2153 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2154 | and returns the properly rounded 64-bit integer corresponding to the input.
2155 | If `zSign' is 1, the input is negated before being converted to an integer.
2156 | Ordinarily, the fixed-point input is simply rounded to an integer, with
2157 | the inexact exception raised if the input cannot be represented exactly as
2158 | an integer. However, if the fixed-point input is too large, the invalid
2159 | exception is raised and the largest positive or negative integer is
2160 | returned.
2161 *----------------------------------------------------------------------------*/
2163 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
2164 float_status *status)
2166 int8_t roundingMode;
2167 flag roundNearestEven, increment;
2168 int64_t z;
2170 roundingMode = status->float_rounding_mode;
2171 roundNearestEven = ( roundingMode == float_round_nearest_even );
2172 switch (roundingMode) {
2173 case float_round_nearest_even:
2174 case float_round_ties_away:
2175 increment = ((int64_t) absZ1 < 0);
2176 break;
2177 case float_round_to_zero:
2178 increment = 0;
2179 break;
2180 case float_round_up:
2181 increment = !zSign && absZ1;
2182 break;
2183 case float_round_down:
2184 increment = zSign && absZ1;
2185 break;
2186 default:
2187 abort();
2189 if ( increment ) {
2190 ++absZ0;
2191 if ( absZ0 == 0 ) goto overflow;
2192 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
2194 z = absZ0;
2195 if ( zSign ) z = - z;
2196 if ( z && ( ( z < 0 ) ^ zSign ) ) {
2197 overflow:
2198 float_raise(float_flag_invalid, status);
2199 return
2200 zSign ? (int64_t) LIT64( 0x8000000000000000 )
2201 : LIT64( 0x7FFFFFFFFFFFFFFF );
2203 if (absZ1) {
2204 status->float_exception_flags |= float_flag_inexact;
2206 return z;
2210 /*----------------------------------------------------------------------------
2211 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2212 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2213 | and returns the properly rounded 64-bit unsigned integer corresponding to the
2214 | input. Ordinarily, the fixed-point input is simply rounded to an integer,
2215 | with the inexact exception raised if the input cannot be represented exactly
2216 | as an integer. However, if the fixed-point input is too large, the invalid
2217 | exception is raised and the largest unsigned integer is returned.
2218 *----------------------------------------------------------------------------*/
2220 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
2221 uint64_t absZ1, float_status *status)
2223 int8_t roundingMode;
2224 flag roundNearestEven, increment;
2226 roundingMode = status->float_rounding_mode;
2227 roundNearestEven = (roundingMode == float_round_nearest_even);
2228 switch (roundingMode) {
2229 case float_round_nearest_even:
2230 case float_round_ties_away:
2231 increment = ((int64_t)absZ1 < 0);
2232 break;
2233 case float_round_to_zero:
2234 increment = 0;
2235 break;
2236 case float_round_up:
2237 increment = !zSign && absZ1;
2238 break;
2239 case float_round_down:
2240 increment = zSign && absZ1;
2241 break;
2242 default:
2243 abort();
2245 if (increment) {
2246 ++absZ0;
2247 if (absZ0 == 0) {
2248 float_raise(float_flag_invalid, status);
2249 return LIT64(0xFFFFFFFFFFFFFFFF);
2251 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
2254 if (zSign && absZ0) {
2255 float_raise(float_flag_invalid, status);
2256 return 0;
2259 if (absZ1) {
2260 status->float_exception_flags |= float_flag_inexact;
2262 return absZ0;
2265 /*----------------------------------------------------------------------------
2266 | If `a' is denormal and we are in flush-to-zero mode then set the
2267 | input-denormal exception and return zero. Otherwise just return the value.
2268 *----------------------------------------------------------------------------*/
2269 float32 float32_squash_input_denormal(float32 a, float_status *status)
2271 if (status->flush_inputs_to_zero) {
2272 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
2273 float_raise(float_flag_input_denormal, status);
2274 return make_float32(float32_val(a) & 0x80000000);
2277 return a;
2280 /*----------------------------------------------------------------------------
2281 | Normalizes the subnormal single-precision floating-point value represented
2282 | by the denormalized significand `aSig'. The normalized exponent and
2283 | significand are stored at the locations pointed to by `zExpPtr' and
2284 | `zSigPtr', respectively.
2285 *----------------------------------------------------------------------------*/
2287 static void
2288 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
2290 int8_t shiftCount;
2292 shiftCount = countLeadingZeros32( aSig ) - 8;
2293 *zSigPtr = aSig<<shiftCount;
2294 *zExpPtr = 1 - shiftCount;
2298 /*----------------------------------------------------------------------------
2299 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2300 | and significand `zSig', and returns the proper single-precision floating-
2301 | point value corresponding to the abstract input. Ordinarily, the abstract
2302 | value is simply rounded and packed into the single-precision format, with
2303 | the inexact exception raised if the abstract input cannot be represented
2304 | exactly. However, if the abstract value is too large, the overflow and
2305 | inexact exceptions are raised and an infinity or maximal finite value is
2306 | returned. If the abstract value is too small, the input value is rounded to
2307 | a subnormal number, and the underflow and inexact exceptions are raised if
2308 | the abstract input cannot be represented exactly as a subnormal single-
2309 | precision floating-point number.
2310 | The input significand `zSig' has its binary point between bits 30
2311 | and 29, which is 7 bits to the left of the usual location. This shifted
2312 | significand must be normalized or smaller. If `zSig' is not normalized,
2313 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2314 | and it must not require rounding. In the usual case that `zSig' is
2315 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2316 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2317 | Binary Floating-Point Arithmetic.
2318 *----------------------------------------------------------------------------*/
2320 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
2321 float_status *status)
2323 int8_t roundingMode;
2324 flag roundNearestEven;
2325 int8_t roundIncrement, roundBits;
2326 flag isTiny;
2328 roundingMode = status->float_rounding_mode;
2329 roundNearestEven = ( roundingMode == float_round_nearest_even );
2330 switch (roundingMode) {
2331 case float_round_nearest_even:
2332 case float_round_ties_away:
2333 roundIncrement = 0x40;
2334 break;
2335 case float_round_to_zero:
2336 roundIncrement = 0;
2337 break;
2338 case float_round_up:
2339 roundIncrement = zSign ? 0 : 0x7f;
2340 break;
2341 case float_round_down:
2342 roundIncrement = zSign ? 0x7f : 0;
2343 break;
2344 default:
2345 abort();
2346 break;
2348 roundBits = zSig & 0x7F;
2349 if ( 0xFD <= (uint16_t) zExp ) {
2350 if ( ( 0xFD < zExp )
2351 || ( ( zExp == 0xFD )
2352 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
2354 float_raise(float_flag_overflow | float_flag_inexact, status);
2355 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
2357 if ( zExp < 0 ) {
2358 if (status->flush_to_zero) {
2359 float_raise(float_flag_output_denormal, status);
2360 return packFloat32(zSign, 0, 0);
2362 isTiny =
2363 (status->float_detect_tininess
2364 == float_tininess_before_rounding)
2365 || ( zExp < -1 )
2366 || ( zSig + roundIncrement < 0x80000000 );
2367 shift32RightJamming( zSig, - zExp, &zSig );
2368 zExp = 0;
2369 roundBits = zSig & 0x7F;
2370 if (isTiny && roundBits) {
2371 float_raise(float_flag_underflow, status);
2375 if (roundBits) {
2376 status->float_exception_flags |= float_flag_inexact;
2378 zSig = ( zSig + roundIncrement )>>7;
2379 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2380 if ( zSig == 0 ) zExp = 0;
2381 return packFloat32( zSign, zExp, zSig );
2385 /*----------------------------------------------------------------------------
2386 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2387 | and significand `zSig', and returns the proper single-precision floating-
2388 | point value corresponding to the abstract input. This routine is just like
2389 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
2390 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2391 | floating-point exponent.
2392 *----------------------------------------------------------------------------*/
2394 static float32
2395 normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
2396 float_status *status)
2398 int8_t shiftCount;
2400 shiftCount = countLeadingZeros32( zSig ) - 1;
2401 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
2402 status);
2406 /*----------------------------------------------------------------------------
2407 | If `a' is denormal and we are in flush-to-zero mode then set the
2408 | input-denormal exception and return zero. Otherwise just return the value.
2409 *----------------------------------------------------------------------------*/
2410 float64 float64_squash_input_denormal(float64 a, float_status *status)
2412 if (status->flush_inputs_to_zero) {
2413 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
2414 float_raise(float_flag_input_denormal, status);
2415 return make_float64(float64_val(a) & (1ULL << 63));
2418 return a;
2421 /*----------------------------------------------------------------------------
2422 | Normalizes the subnormal double-precision floating-point value represented
2423 | by the denormalized significand `aSig'. The normalized exponent and
2424 | significand are stored at the locations pointed to by `zExpPtr' and
2425 | `zSigPtr', respectively.
2426 *----------------------------------------------------------------------------*/
2428 static void
2429 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
2431 int8_t shiftCount;
2433 shiftCount = countLeadingZeros64( aSig ) - 11;
2434 *zSigPtr = aSig<<shiftCount;
2435 *zExpPtr = 1 - shiftCount;
2439 /*----------------------------------------------------------------------------
2440 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2441 | double-precision floating-point value, returning the result. After being
2442 | shifted into the proper positions, the three fields are simply added
2443 | together to form the result. This means that any integer portion of `zSig'
2444 | will be added into the exponent. Since a properly normalized significand
2445 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2446 | than the desired result exponent whenever `zSig' is a complete, normalized
2447 | significand.
2448 *----------------------------------------------------------------------------*/
2450 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
2453 return make_float64(
2454 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
2458 /*----------------------------------------------------------------------------
2459 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2460 | and significand `zSig', and returns the proper double-precision floating-
2461 | point value corresponding to the abstract input. Ordinarily, the abstract
2462 | value is simply rounded and packed into the double-precision format, with
2463 | the inexact exception raised if the abstract input cannot be represented
2464 | exactly. However, if the abstract value is too large, the overflow and
2465 | inexact exceptions are raised and an infinity or maximal finite value is
2466 | returned. If the abstract value is too small, the input value is rounded to
2467 | a subnormal number, and the underflow and inexact exceptions are raised if
2468 | the abstract input cannot be represented exactly as a subnormal double-
2469 | precision floating-point number.
2470 | The input significand `zSig' has its binary point between bits 62
2471 | and 61, which is 10 bits to the left of the usual location. This shifted
2472 | significand must be normalized or smaller. If `zSig' is not normalized,
2473 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2474 | and it must not require rounding. In the usual case that `zSig' is
2475 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2476 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2477 | Binary Floating-Point Arithmetic.
2478 *----------------------------------------------------------------------------*/
2480 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2481 float_status *status)
2483 int8_t roundingMode;
2484 flag roundNearestEven;
2485 int roundIncrement, roundBits;
2486 flag isTiny;
2488 roundingMode = status->float_rounding_mode;
2489 roundNearestEven = ( roundingMode == float_round_nearest_even );
2490 switch (roundingMode) {
2491 case float_round_nearest_even:
2492 case float_round_ties_away:
2493 roundIncrement = 0x200;
2494 break;
2495 case float_round_to_zero:
2496 roundIncrement = 0;
2497 break;
2498 case float_round_up:
2499 roundIncrement = zSign ? 0 : 0x3ff;
2500 break;
2501 case float_round_down:
2502 roundIncrement = zSign ? 0x3ff : 0;
2503 break;
2504 case float_round_to_odd:
2505 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2506 break;
2507 default:
2508 abort();
2510 roundBits = zSig & 0x3FF;
2511 if ( 0x7FD <= (uint16_t) zExp ) {
2512 if ( ( 0x7FD < zExp )
2513 || ( ( zExp == 0x7FD )
2514 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
2516 bool overflow_to_inf = roundingMode != float_round_to_odd &&
2517 roundIncrement != 0;
2518 float_raise(float_flag_overflow | float_flag_inexact, status);
2519 return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
2521 if ( zExp < 0 ) {
2522 if (status->flush_to_zero) {
2523 float_raise(float_flag_output_denormal, status);
2524 return packFloat64(zSign, 0, 0);
2526 isTiny =
2527 (status->float_detect_tininess
2528 == float_tininess_before_rounding)
2529 || ( zExp < -1 )
2530 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
2531 shift64RightJamming( zSig, - zExp, &zSig );
2532 zExp = 0;
2533 roundBits = zSig & 0x3FF;
2534 if (isTiny && roundBits) {
2535 float_raise(float_flag_underflow, status);
2537 if (roundingMode == float_round_to_odd) {
2539 * For round-to-odd case, the roundIncrement depends on
2540 * zSig which just changed.
2542 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2546 if (roundBits) {
2547 status->float_exception_flags |= float_flag_inexact;
2549 zSig = ( zSig + roundIncrement )>>10;
2550 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
2551 if ( zSig == 0 ) zExp = 0;
2552 return packFloat64( zSign, zExp, zSig );
2556 /*----------------------------------------------------------------------------
2557 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2558 | and significand `zSig', and returns the proper double-precision floating-
2559 | point value corresponding to the abstract input. This routine is just like
2560 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
2561 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2562 | floating-point exponent.
2563 *----------------------------------------------------------------------------*/
2565 static float64
2566 normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2567 float_status *status)
2569 int8_t shiftCount;
2571 shiftCount = countLeadingZeros64( zSig ) - 1;
2572 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
2573 status);
2577 /*----------------------------------------------------------------------------
2578 | Normalizes the subnormal extended double-precision floating-point value
2579 | represented by the denormalized significand `aSig'. The normalized exponent
2580 | and significand are stored at the locations pointed to by `zExpPtr' and
2581 | `zSigPtr', respectively.
2582 *----------------------------------------------------------------------------*/
2584 void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
2585 uint64_t *zSigPtr)
2587 int8_t shiftCount;
2589 shiftCount = countLeadingZeros64( aSig );
2590 *zSigPtr = aSig<<shiftCount;
2591 *zExpPtr = 1 - shiftCount;
2594 /*----------------------------------------------------------------------------
2595 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2596 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
2597 | and returns the proper extended double-precision floating-point value
2598 | corresponding to the abstract input. Ordinarily, the abstract value is
2599 | rounded and packed into the extended double-precision format, with the
2600 | inexact exception raised if the abstract input cannot be represented
2601 | exactly. However, if the abstract value is too large, the overflow and
2602 | inexact exceptions are raised and an infinity or maximal finite value is
2603 | returned. If the abstract value is too small, the input value is rounded to
2604 | a subnormal number, and the underflow and inexact exceptions are raised if
2605 | the abstract input cannot be represented exactly as a subnormal extended
2606 | double-precision floating-point number.
2607 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
2608 | number of bits as single or double precision, respectively. Otherwise, the
2609 | result is rounded to the full precision of the extended double-precision
2610 | format.
2611 | The input significand must be normalized or smaller. If the input
2612 | significand is not normalized, `zExp' must be 0; in that case, the result
2613 | returned is a subnormal number, and it must not require rounding. The
2614 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
2615 | Floating-Point Arithmetic.
2616 *----------------------------------------------------------------------------*/
2618 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
2619 int32_t zExp, uint64_t zSig0, uint64_t zSig1,
2620 float_status *status)
2622 int8_t roundingMode;
2623 flag roundNearestEven, increment, isTiny;
2624 int64_t roundIncrement, roundMask, roundBits;
2626 roundingMode = status->float_rounding_mode;
2627 roundNearestEven = ( roundingMode == float_round_nearest_even );
2628 if ( roundingPrecision == 80 ) goto precision80;
2629 if ( roundingPrecision == 64 ) {
2630 roundIncrement = LIT64( 0x0000000000000400 );
2631 roundMask = LIT64( 0x00000000000007FF );
2633 else if ( roundingPrecision == 32 ) {
2634 roundIncrement = LIT64( 0x0000008000000000 );
2635 roundMask = LIT64( 0x000000FFFFFFFFFF );
2637 else {
2638 goto precision80;
2640 zSig0 |= ( zSig1 != 0 );
2641 switch (roundingMode) {
2642 case float_round_nearest_even:
2643 case float_round_ties_away:
2644 break;
2645 case float_round_to_zero:
2646 roundIncrement = 0;
2647 break;
2648 case float_round_up:
2649 roundIncrement = zSign ? 0 : roundMask;
2650 break;
2651 case float_round_down:
2652 roundIncrement = zSign ? roundMask : 0;
2653 break;
2654 default:
2655 abort();
2657 roundBits = zSig0 & roundMask;
2658 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
2659 if ( ( 0x7FFE < zExp )
2660 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
2662 goto overflow;
2664 if ( zExp <= 0 ) {
2665 if (status->flush_to_zero) {
2666 float_raise(float_flag_output_denormal, status);
2667 return packFloatx80(zSign, 0, 0);
2669 isTiny =
2670 (status->float_detect_tininess
2671 == float_tininess_before_rounding)
2672 || ( zExp < 0 )
2673 || ( zSig0 <= zSig0 + roundIncrement );
2674 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
2675 zExp = 0;
2676 roundBits = zSig0 & roundMask;
2677 if (isTiny && roundBits) {
2678 float_raise(float_flag_underflow, status);
2680 if (roundBits) {
2681 status->float_exception_flags |= float_flag_inexact;
2683 zSig0 += roundIncrement;
2684 if ( (int64_t) zSig0 < 0 ) zExp = 1;
2685 roundIncrement = roundMask + 1;
2686 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
2687 roundMask |= roundIncrement;
2689 zSig0 &= ~ roundMask;
2690 return packFloatx80( zSign, zExp, zSig0 );
2693 if (roundBits) {
2694 status->float_exception_flags |= float_flag_inexact;
2696 zSig0 += roundIncrement;
2697 if ( zSig0 < roundIncrement ) {
2698 ++zExp;
2699 zSig0 = LIT64( 0x8000000000000000 );
2701 roundIncrement = roundMask + 1;
2702 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
2703 roundMask |= roundIncrement;
2705 zSig0 &= ~ roundMask;
2706 if ( zSig0 == 0 ) zExp = 0;
2707 return packFloatx80( zSign, zExp, zSig0 );
2708 precision80:
2709 switch (roundingMode) {
2710 case float_round_nearest_even:
2711 case float_round_ties_away:
2712 increment = ((int64_t)zSig1 < 0);
2713 break;
2714 case float_round_to_zero:
2715 increment = 0;
2716 break;
2717 case float_round_up:
2718 increment = !zSign && zSig1;
2719 break;
2720 case float_round_down:
2721 increment = zSign && zSig1;
2722 break;
2723 default:
2724 abort();
2726 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
2727 if ( ( 0x7FFE < zExp )
2728 || ( ( zExp == 0x7FFE )
2729 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
2730 && increment
2733 roundMask = 0;
2734 overflow:
2735 float_raise(float_flag_overflow | float_flag_inexact, status);
2736 if ( ( roundingMode == float_round_to_zero )
2737 || ( zSign && ( roundingMode == float_round_up ) )
2738 || ( ! zSign && ( roundingMode == float_round_down ) )
2740 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
2742 return packFloatx80(zSign,
2743 floatx80_infinity_high,
2744 floatx80_infinity_low);
2746 if ( zExp <= 0 ) {
2747 isTiny =
2748 (status->float_detect_tininess
2749 == float_tininess_before_rounding)
2750 || ( zExp < 0 )
2751 || ! increment
2752 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
2753 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
2754 zExp = 0;
2755 if (isTiny && zSig1) {
2756 float_raise(float_flag_underflow, status);
2758 if (zSig1) {
2759 status->float_exception_flags |= float_flag_inexact;
2761 switch (roundingMode) {
2762 case float_round_nearest_even:
2763 case float_round_ties_away:
2764 increment = ((int64_t)zSig1 < 0);
2765 break;
2766 case float_round_to_zero:
2767 increment = 0;
2768 break;
2769 case float_round_up:
2770 increment = !zSign && zSig1;
2771 break;
2772 case float_round_down:
2773 increment = zSign && zSig1;
2774 break;
2775 default:
2776 abort();
2778 if ( increment ) {
2779 ++zSig0;
2780 zSig0 &=
2781 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
2782 if ( (int64_t) zSig0 < 0 ) zExp = 1;
2784 return packFloatx80( zSign, zExp, zSig0 );
2787 if (zSig1) {
2788 status->float_exception_flags |= float_flag_inexact;
2790 if ( increment ) {
2791 ++zSig0;
2792 if ( zSig0 == 0 ) {
2793 ++zExp;
2794 zSig0 = LIT64( 0x8000000000000000 );
2796 else {
2797 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
2800 else {
2801 if ( zSig0 == 0 ) zExp = 0;
2803 return packFloatx80( zSign, zExp, zSig0 );
2807 /*----------------------------------------------------------------------------
2808 | Takes an abstract floating-point value having sign `zSign', exponent
2809 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
2810 | and returns the proper extended double-precision floating-point value
2811 | corresponding to the abstract input. This routine is just like
2812 | `roundAndPackFloatx80' except that the input significand does not have to be
2813 | normalized.
2814 *----------------------------------------------------------------------------*/
2816 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
2817 flag zSign, int32_t zExp,
2818 uint64_t zSig0, uint64_t zSig1,
2819 float_status *status)
2821 int8_t shiftCount;
2823 if ( zSig0 == 0 ) {
2824 zSig0 = zSig1;
2825 zSig1 = 0;
2826 zExp -= 64;
2828 shiftCount = countLeadingZeros64( zSig0 );
2829 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
2830 zExp -= shiftCount;
2831 return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
2832 zSig0, zSig1, status);
2836 /*----------------------------------------------------------------------------
2837 | Returns the least-significant 64 fraction bits of the quadruple-precision
2838 | floating-point value `a'.
2839 *----------------------------------------------------------------------------*/
2841 static inline uint64_t extractFloat128Frac1( float128 a )
2844 return a.low;
2848 /*----------------------------------------------------------------------------
2849 | Returns the most-significant 48 fraction bits of the quadruple-precision
2850 | floating-point value `a'.
2851 *----------------------------------------------------------------------------*/
2853 static inline uint64_t extractFloat128Frac0( float128 a )
2856 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
2860 /*----------------------------------------------------------------------------
2861 | Returns the exponent bits of the quadruple-precision floating-point value
2862 | `a'.
2863 *----------------------------------------------------------------------------*/
2865 static inline int32_t extractFloat128Exp( float128 a )
2868 return ( a.high>>48 ) & 0x7FFF;
2872 /*----------------------------------------------------------------------------
2873 | Returns the sign bit of the quadruple-precision floating-point value `a'.
2874 *----------------------------------------------------------------------------*/
2876 static inline flag extractFloat128Sign( float128 a )
2879 return a.high>>63;
2883 /*----------------------------------------------------------------------------
2884 | Normalizes the subnormal quadruple-precision floating-point value
2885 | represented by the denormalized significand formed by the concatenation of
2886 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
2887 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
2888 | significand are stored at the location pointed to by `zSig0Ptr', and the
2889 | least significant 64 bits of the normalized significand are stored at the
2890 | location pointed to by `zSig1Ptr'.
2891 *----------------------------------------------------------------------------*/
2893 static void
2894 normalizeFloat128Subnormal(
2895 uint64_t aSig0,
2896 uint64_t aSig1,
2897 int32_t *zExpPtr,
2898 uint64_t *zSig0Ptr,
2899 uint64_t *zSig1Ptr
2902 int8_t shiftCount;
2904 if ( aSig0 == 0 ) {
2905 shiftCount = countLeadingZeros64( aSig1 ) - 15;
2906 if ( shiftCount < 0 ) {
2907 *zSig0Ptr = aSig1>>( - shiftCount );
2908 *zSig1Ptr = aSig1<<( shiftCount & 63 );
2910 else {
2911 *zSig0Ptr = aSig1<<shiftCount;
2912 *zSig1Ptr = 0;
2914 *zExpPtr = - shiftCount - 63;
2916 else {
2917 shiftCount = countLeadingZeros64( aSig0 ) - 15;
2918 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
2919 *zExpPtr = 1 - shiftCount;
2924 /*----------------------------------------------------------------------------
2925 | Packs the sign `zSign', the exponent `zExp', and the significand formed
2926 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
2927 | floating-point value, returning the result. After being shifted into the
2928 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
2929 | added together to form the most significant 32 bits of the result. This
2930 | means that any integer portion of `zSig0' will be added into the exponent.
2931 | Since a properly normalized significand will have an integer portion equal
2932 | to 1, the `zExp' input should be 1 less than the desired result exponent
2933 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
2934 | significand.
2935 *----------------------------------------------------------------------------*/
2937 static inline float128
2938 packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
2940 float128 z;
2942 z.low = zSig1;
2943 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
2944 return z;
2948 /*----------------------------------------------------------------------------
2949 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2950 | and extended significand formed by the concatenation of `zSig0', `zSig1',
2951 | and `zSig2', and returns the proper quadruple-precision floating-point value
2952 | corresponding to the abstract input. Ordinarily, the abstract value is
2953 | simply rounded and packed into the quadruple-precision format, with the
2954 | inexact exception raised if the abstract input cannot be represented
2955 | exactly. However, if the abstract value is too large, the overflow and
2956 | inexact exceptions are raised and an infinity or maximal finite value is
2957 | returned. If the abstract value is too small, the input value is rounded to
2958 | a subnormal number, and the underflow and inexact exceptions are raised if
2959 | the abstract input cannot be represented exactly as a subnormal quadruple-
2960 | precision floating-point number.
2961 | The input significand must be normalized or smaller. If the input
2962 | significand is not normalized, `zExp' must be 0; in that case, the result
2963 | returned is a subnormal number, and it must not require rounding. In the
2964 | usual case that the input significand is normalized, `zExp' must be 1 less
2965 | than the ``true'' floating-point exponent. The handling of underflow and
2966 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2967 *----------------------------------------------------------------------------*/
2969 static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
2970 uint64_t zSig0, uint64_t zSig1,
2971 uint64_t zSig2, float_status *status)
2973 int8_t roundingMode;
2974 flag roundNearestEven, increment, isTiny;
2976 roundingMode = status->float_rounding_mode;
2977 roundNearestEven = ( roundingMode == float_round_nearest_even );
2978 switch (roundingMode) {
2979 case float_round_nearest_even:
2980 case float_round_ties_away:
2981 increment = ((int64_t)zSig2 < 0);
2982 break;
2983 case float_round_to_zero:
2984 increment = 0;
2985 break;
2986 case float_round_up:
2987 increment = !zSign && zSig2;
2988 break;
2989 case float_round_down:
2990 increment = zSign && zSig2;
2991 break;
2992 case float_round_to_odd:
2993 increment = !(zSig1 & 0x1) && zSig2;
2994 break;
2995 default:
2996 abort();
2998 if ( 0x7FFD <= (uint32_t) zExp ) {
2999 if ( ( 0x7FFD < zExp )
3000 || ( ( zExp == 0x7FFD )
3001 && eq128(
3002 LIT64( 0x0001FFFFFFFFFFFF ),
3003 LIT64( 0xFFFFFFFFFFFFFFFF ),
3004 zSig0,
3005 zSig1
3007 && increment
3010 float_raise(float_flag_overflow | float_flag_inexact, status);
3011 if ( ( roundingMode == float_round_to_zero )
3012 || ( zSign && ( roundingMode == float_round_up ) )
3013 || ( ! zSign && ( roundingMode == float_round_down ) )
3014 || (roundingMode == float_round_to_odd)
3016 return
3017 packFloat128(
3018 zSign,
3019 0x7FFE,
3020 LIT64( 0x0000FFFFFFFFFFFF ),
3021 LIT64( 0xFFFFFFFFFFFFFFFF )
3024 return packFloat128( zSign, 0x7FFF, 0, 0 );
3026 if ( zExp < 0 ) {
3027 if (status->flush_to_zero) {
3028 float_raise(float_flag_output_denormal, status);
3029 return packFloat128(zSign, 0, 0, 0);
3031 isTiny =
3032 (status->float_detect_tininess
3033 == float_tininess_before_rounding)
3034 || ( zExp < -1 )
3035 || ! increment
3036 || lt128(
3037 zSig0,
3038 zSig1,
3039 LIT64( 0x0001FFFFFFFFFFFF ),
3040 LIT64( 0xFFFFFFFFFFFFFFFF )
3042 shift128ExtraRightJamming(
3043 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
3044 zExp = 0;
3045 if (isTiny && zSig2) {
3046 float_raise(float_flag_underflow, status);
3048 switch (roundingMode) {
3049 case float_round_nearest_even:
3050 case float_round_ties_away:
3051 increment = ((int64_t)zSig2 < 0);
3052 break;
3053 case float_round_to_zero:
3054 increment = 0;
3055 break;
3056 case float_round_up:
3057 increment = !zSign && zSig2;
3058 break;
3059 case float_round_down:
3060 increment = zSign && zSig2;
3061 break;
3062 case float_round_to_odd:
3063 increment = !(zSig1 & 0x1) && zSig2;
3064 break;
3065 default:
3066 abort();
3070 if (zSig2) {
3071 status->float_exception_flags |= float_flag_inexact;
3073 if ( increment ) {
3074 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
3075 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
3077 else {
3078 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
3080 return packFloat128( zSign, zExp, zSig0, zSig1 );
3084 /*----------------------------------------------------------------------------
3085 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3086 | and significand formed by the concatenation of `zSig0' and `zSig1', and
3087 | returns the proper quadruple-precision floating-point value corresponding
3088 | to the abstract input. This routine is just like `roundAndPackFloat128'
3089 | except that the input significand has fewer bits and does not have to be
3090 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
3091 | point exponent.
3092 *----------------------------------------------------------------------------*/
3094 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
3095 uint64_t zSig0, uint64_t zSig1,
3096 float_status *status)
3098 int8_t shiftCount;
3099 uint64_t zSig2;
3101 if ( zSig0 == 0 ) {
3102 zSig0 = zSig1;
3103 zSig1 = 0;
3104 zExp -= 64;
3106 shiftCount = countLeadingZeros64( zSig0 ) - 15;
3107 if ( 0 <= shiftCount ) {
3108 zSig2 = 0;
3109 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3111 else {
3112 shift128ExtraRightJamming(
3113 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
3115 zExp -= shiftCount;
3116 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
3121 /*----------------------------------------------------------------------------
3122 | Returns the result of converting the 32-bit two's complement integer `a'
3123 | to the extended double-precision floating-point format. The conversion
3124 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3125 | Arithmetic.
3126 *----------------------------------------------------------------------------*/
3128 floatx80 int32_to_floatx80(int32_t a, float_status *status)
3130 flag zSign;
3131 uint32_t absA;
3132 int8_t shiftCount;
3133 uint64_t zSig;
3135 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3136 zSign = ( a < 0 );
3137 absA = zSign ? - a : a;
3138 shiftCount = countLeadingZeros32( absA ) + 32;
3139 zSig = absA;
3140 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
3144 /*----------------------------------------------------------------------------
3145 | Returns the result of converting the 32-bit two's complement integer `a' to
3146 | the quadruple-precision floating-point format. The conversion is performed
3147 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3148 *----------------------------------------------------------------------------*/
3150 float128 int32_to_float128(int32_t a, float_status *status)
3152 flag zSign;
3153 uint32_t absA;
3154 int8_t shiftCount;
3155 uint64_t zSig0;
3157 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3158 zSign = ( a < 0 );
3159 absA = zSign ? - a : a;
3160 shiftCount = countLeadingZeros32( absA ) + 17;
3161 zSig0 = absA;
3162 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
3166 /*----------------------------------------------------------------------------
3167 | Returns the result of converting the 64-bit two's complement integer `a'
3168 | to the extended double-precision floating-point format. The conversion
3169 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3170 | Arithmetic.
3171 *----------------------------------------------------------------------------*/
3173 floatx80 int64_to_floatx80(int64_t a, float_status *status)
3175 flag zSign;
3176 uint64_t absA;
3177 int8_t shiftCount;
3179 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3180 zSign = ( a < 0 );
3181 absA = zSign ? - a : a;
3182 shiftCount = countLeadingZeros64( absA );
3183 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
3187 /*----------------------------------------------------------------------------
3188 | Returns the result of converting the 64-bit two's complement integer `a' to
3189 | the quadruple-precision floating-point format. The conversion is performed
3190 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3191 *----------------------------------------------------------------------------*/
3193 float128 int64_to_float128(int64_t a, float_status *status)
3195 flag zSign;
3196 uint64_t absA;
3197 int8_t shiftCount;
3198 int32_t zExp;
3199 uint64_t zSig0, zSig1;
3201 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3202 zSign = ( a < 0 );
3203 absA = zSign ? - a : a;
3204 shiftCount = countLeadingZeros64( absA ) + 49;
3205 zExp = 0x406E - shiftCount;
3206 if ( 64 <= shiftCount ) {
3207 zSig1 = 0;
3208 zSig0 = absA;
3209 shiftCount -= 64;
3211 else {
3212 zSig1 = absA;
3213 zSig0 = 0;
3215 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3216 return packFloat128( zSign, zExp, zSig0, zSig1 );
3220 /*----------------------------------------------------------------------------
3221 | Returns the result of converting the 64-bit unsigned integer `a'
3222 | to the quadruple-precision floating-point format. The conversion is performed
3223 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3224 *----------------------------------------------------------------------------*/
3226 float128 uint64_to_float128(uint64_t a, float_status *status)
3228 if (a == 0) {
3229 return float128_zero;
3231 return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status);
3234 /*----------------------------------------------------------------------------
3235 | Returns the result of converting the single-precision floating-point value
3236 | `a' to the extended double-precision floating-point format. The conversion
3237 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3238 | Arithmetic.
3239 *----------------------------------------------------------------------------*/
3241 floatx80 float32_to_floatx80(float32 a, float_status *status)
3243 flag aSign;
3244 int aExp;
3245 uint32_t aSig;
3247 a = float32_squash_input_denormal(a, status);
3248 aSig = extractFloat32Frac( a );
3249 aExp = extractFloat32Exp( a );
3250 aSign = extractFloat32Sign( a );
3251 if ( aExp == 0xFF ) {
3252 if (aSig) {
3253 return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
3255 return packFloatx80(aSign,
3256 floatx80_infinity_high,
3257 floatx80_infinity_low);
3259 if ( aExp == 0 ) {
3260 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3261 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3263 aSig |= 0x00800000;
3264 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
3268 /*----------------------------------------------------------------------------
3269 | Returns the result of converting the single-precision floating-point value
3270 | `a' to the double-precision floating-point format. The conversion is
3271 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3272 | Arithmetic.
3273 *----------------------------------------------------------------------------*/
3275 float128 float32_to_float128(float32 a, float_status *status)
3277 flag aSign;
3278 int aExp;
3279 uint32_t aSig;
3281 a = float32_squash_input_denormal(a, status);
3282 aSig = extractFloat32Frac( a );
3283 aExp = extractFloat32Exp( a );
3284 aSign = extractFloat32Sign( a );
3285 if ( aExp == 0xFF ) {
3286 if (aSig) {
3287 return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
3289 return packFloat128( aSign, 0x7FFF, 0, 0 );
3291 if ( aExp == 0 ) {
3292 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3293 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3294 --aExp;
3296 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
3300 /*----------------------------------------------------------------------------
3301 | Returns the remainder of the single-precision floating-point value `a'
3302 | with respect to the corresponding value `b'. The operation is performed
3303 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3304 *----------------------------------------------------------------------------*/
3306 float32 float32_rem(float32 a, float32 b, float_status *status)
3308 flag aSign, zSign;
3309 int aExp, bExp, expDiff;
3310 uint32_t aSig, bSig;
3311 uint32_t q;
3312 uint64_t aSig64, bSig64, q64;
3313 uint32_t alternateASig;
3314 int32_t sigMean;
3315 a = float32_squash_input_denormal(a, status);
3316 b = float32_squash_input_denormal(b, status);
3318 aSig = extractFloat32Frac( a );
3319 aExp = extractFloat32Exp( a );
3320 aSign = extractFloat32Sign( a );
3321 bSig = extractFloat32Frac( b );
3322 bExp = extractFloat32Exp( b );
3323 if ( aExp == 0xFF ) {
3324 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
3325 return propagateFloat32NaN(a, b, status);
3327 float_raise(float_flag_invalid, status);
3328 return float32_default_nan(status);
3330 if ( bExp == 0xFF ) {
3331 if (bSig) {
3332 return propagateFloat32NaN(a, b, status);
3334 return a;
3336 if ( bExp == 0 ) {
3337 if ( bSig == 0 ) {
3338 float_raise(float_flag_invalid, status);
3339 return float32_default_nan(status);
3341 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
3343 if ( aExp == 0 ) {
3344 if ( aSig == 0 ) return a;
3345 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3347 expDiff = aExp - bExp;
3348 aSig |= 0x00800000;
3349 bSig |= 0x00800000;
3350 if ( expDiff < 32 ) {
3351 aSig <<= 8;
3352 bSig <<= 8;
3353 if ( expDiff < 0 ) {
3354 if ( expDiff < -1 ) return a;
3355 aSig >>= 1;
3357 q = ( bSig <= aSig );
3358 if ( q ) aSig -= bSig;
3359 if ( 0 < expDiff ) {
3360 q = ( ( (uint64_t) aSig )<<32 ) / bSig;
3361 q >>= 32 - expDiff;
3362 bSig >>= 2;
3363 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3365 else {
3366 aSig >>= 2;
3367 bSig >>= 2;
3370 else {
3371 if ( bSig <= aSig ) aSig -= bSig;
3372 aSig64 = ( (uint64_t) aSig )<<40;
3373 bSig64 = ( (uint64_t) bSig )<<40;
3374 expDiff -= 64;
3375 while ( 0 < expDiff ) {
3376 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3377 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3378 aSig64 = - ( ( bSig * q64 )<<38 );
3379 expDiff -= 62;
3381 expDiff += 64;
3382 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3383 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3384 q = q64>>( 64 - expDiff );
3385 bSig <<= 6;
3386 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
3388 do {
3389 alternateASig = aSig;
3390 ++q;
3391 aSig -= bSig;
3392 } while ( 0 <= (int32_t) aSig );
3393 sigMean = aSig + alternateASig;
3394 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3395 aSig = alternateASig;
3397 zSign = ( (int32_t) aSig < 0 );
3398 if ( zSign ) aSig = - aSig;
3399 return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
3404 /*----------------------------------------------------------------------------
3405 | Returns the binary exponential of the single-precision floating-point value
3406 | `a'. The operation is performed according to the IEC/IEEE Standard for
3407 | Binary Floating-Point Arithmetic.
3409 | Uses the following identities:
3411 | 1. -------------------------------------------------------------------------
3412 | x x*ln(2)
3413 | 2 = e
3415 | 2. -------------------------------------------------------------------------
3416 | 2 3 4 5 n
3417 | x x x x x x x
3418 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
3419 | 1! 2! 3! 4! 5! n!
3420 *----------------------------------------------------------------------------*/
3422 static const float64 float32_exp2_coefficients[15] =
3424 const_float64( 0x3ff0000000000000ll ), /* 1 */
3425 const_float64( 0x3fe0000000000000ll ), /* 2 */
3426 const_float64( 0x3fc5555555555555ll ), /* 3 */
3427 const_float64( 0x3fa5555555555555ll ), /* 4 */
3428 const_float64( 0x3f81111111111111ll ), /* 5 */
3429 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */
3430 const_float64( 0x3f2a01a01a01a01all ), /* 7 */
3431 const_float64( 0x3efa01a01a01a01all ), /* 8 */
3432 const_float64( 0x3ec71de3a556c734ll ), /* 9 */
3433 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
3434 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
3435 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
3436 const_float64( 0x3de6124613a86d09ll ), /* 13 */
3437 const_float64( 0x3da93974a8c07c9dll ), /* 14 */
3438 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
3441 float32 float32_exp2(float32 a, float_status *status)
3443 flag aSign;
3444 int aExp;
3445 uint32_t aSig;
3446 float64 r, x, xn;
3447 int i;
3448 a = float32_squash_input_denormal(a, status);
3450 aSig = extractFloat32Frac( a );
3451 aExp = extractFloat32Exp( a );
3452 aSign = extractFloat32Sign( a );
3454 if ( aExp == 0xFF) {
3455 if (aSig) {
3456 return propagateFloat32NaN(a, float32_zero, status);
3458 return (aSign) ? float32_zero : a;
3460 if (aExp == 0) {
3461 if (aSig == 0) return float32_one;
3464 float_raise(float_flag_inexact, status);
3466 /* ******************************* */
3467 /* using float64 for approximation */
3468 /* ******************************* */
3469 x = float32_to_float64(a, status);
3470 x = float64_mul(x, float64_ln2, status);
3472 xn = x;
3473 r = float64_one;
3474 for (i = 0 ; i < 15 ; i++) {
3475 float64 f;
3477 f = float64_mul(xn, float32_exp2_coefficients[i], status);
3478 r = float64_add(r, f, status);
3480 xn = float64_mul(xn, x, status);
3483 return float64_to_float32(r, status);
3486 /*----------------------------------------------------------------------------
3487 | Returns the binary log of the single-precision floating-point value `a'.
3488 | The operation is performed according to the IEC/IEEE Standard for Binary
3489 | Floating-Point Arithmetic.
3490 *----------------------------------------------------------------------------*/
3491 float32 float32_log2(float32 a, float_status *status)
3493 flag aSign, zSign;
3494 int aExp;
3495 uint32_t aSig, zSig, i;
3497 a = float32_squash_input_denormal(a, status);
3498 aSig = extractFloat32Frac( a );
3499 aExp = extractFloat32Exp( a );
3500 aSign = extractFloat32Sign( a );
3502 if ( aExp == 0 ) {
3503 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
3504 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3506 if ( aSign ) {
3507 float_raise(float_flag_invalid, status);
3508 return float32_default_nan(status);
3510 if ( aExp == 0xFF ) {
3511 if (aSig) {
3512 return propagateFloat32NaN(a, float32_zero, status);
3514 return a;
3517 aExp -= 0x7F;
3518 aSig |= 0x00800000;
3519 zSign = aExp < 0;
3520 zSig = aExp << 23;
3522 for (i = 1 << 22; i > 0; i >>= 1) {
3523 aSig = ( (uint64_t)aSig * aSig ) >> 23;
3524 if ( aSig & 0x01000000 ) {
3525 aSig >>= 1;
3526 zSig |= i;
3530 if ( zSign )
3531 zSig = -zSig;
3533 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
3536 /*----------------------------------------------------------------------------
3537 | Returns 1 if the single-precision floating-point value `a' is equal to
3538 | the corresponding value `b', and 0 otherwise. The invalid exception is
3539 | raised if either operand is a NaN. Otherwise, the comparison is performed
3540 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3541 *----------------------------------------------------------------------------*/
3543 int float32_eq(float32 a, float32 b, float_status *status)
3545 uint32_t av, bv;
3546 a = float32_squash_input_denormal(a, status);
3547 b = float32_squash_input_denormal(b, status);
3549 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3550 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3552 float_raise(float_flag_invalid, status);
3553 return 0;
3555 av = float32_val(a);
3556 bv = float32_val(b);
3557 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3560 /*----------------------------------------------------------------------------
3561 | Returns 1 if the single-precision floating-point value `a' is less than
3562 | or equal to the corresponding value `b', and 0 otherwise. The invalid
3563 | exception is raised if either operand is a NaN. The comparison is performed
3564 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3565 *----------------------------------------------------------------------------*/
3567 int float32_le(float32 a, float32 b, float_status *status)
3569 flag aSign, bSign;
3570 uint32_t av, bv;
3571 a = float32_squash_input_denormal(a, status);
3572 b = float32_squash_input_denormal(b, status);
3574 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3575 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3577 float_raise(float_flag_invalid, status);
3578 return 0;
3580 aSign = extractFloat32Sign( a );
3581 bSign = extractFloat32Sign( b );
3582 av = float32_val(a);
3583 bv = float32_val(b);
3584 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3585 return ( av == bv ) || ( aSign ^ ( av < bv ) );
3589 /*----------------------------------------------------------------------------
3590 | Returns 1 if the single-precision floating-point value `a' is less than
3591 | the corresponding value `b', and 0 otherwise. The invalid exception is
3592 | raised if either operand is a NaN. The comparison is performed according
3593 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3594 *----------------------------------------------------------------------------*/
3596 int float32_lt(float32 a, float32 b, float_status *status)
3598 flag aSign, bSign;
3599 uint32_t av, bv;
3600 a = float32_squash_input_denormal(a, status);
3601 b = float32_squash_input_denormal(b, status);
3603 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3604 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3606 float_raise(float_flag_invalid, status);
3607 return 0;
3609 aSign = extractFloat32Sign( a );
3610 bSign = extractFloat32Sign( b );
3611 av = float32_val(a);
3612 bv = float32_val(b);
3613 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3614 return ( av != bv ) && ( aSign ^ ( av < bv ) );
3618 /*----------------------------------------------------------------------------
3619 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3620 | be compared, and 0 otherwise. The invalid exception is raised if either
3621 | operand is a NaN. The comparison is performed according to the IEC/IEEE
3622 | Standard for Binary Floating-Point Arithmetic.
3623 *----------------------------------------------------------------------------*/
3625 int float32_unordered(float32 a, float32 b, float_status *status)
3627 a = float32_squash_input_denormal(a, status);
3628 b = float32_squash_input_denormal(b, status);
3630 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3631 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3633 float_raise(float_flag_invalid, status);
3634 return 1;
3636 return 0;
3639 /*----------------------------------------------------------------------------
3640 | Returns 1 if the single-precision floating-point value `a' is equal to
3641 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3642 | exception. The comparison is performed according to the IEC/IEEE Standard
3643 | for Binary Floating-Point Arithmetic.
3644 *----------------------------------------------------------------------------*/
3646 int float32_eq_quiet(float32 a, float32 b, float_status *status)
3648 a = float32_squash_input_denormal(a, status);
3649 b = float32_squash_input_denormal(b, status);
3651 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3652 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3654 if (float32_is_signaling_nan(a, status)
3655 || float32_is_signaling_nan(b, status)) {
3656 float_raise(float_flag_invalid, status);
3658 return 0;
3660 return ( float32_val(a) == float32_val(b) ) ||
3661 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
3664 /*----------------------------------------------------------------------------
3665 | Returns 1 if the single-precision floating-point value `a' is less than or
3666 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3667 | cause an exception. Otherwise, the comparison is performed according to the
3668 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3669 *----------------------------------------------------------------------------*/
3671 int float32_le_quiet(float32 a, float32 b, float_status *status)
3673 flag aSign, bSign;
3674 uint32_t av, bv;
3675 a = float32_squash_input_denormal(a, status);
3676 b = float32_squash_input_denormal(b, status);
3678 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3679 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3681 if (float32_is_signaling_nan(a, status)
3682 || float32_is_signaling_nan(b, status)) {
3683 float_raise(float_flag_invalid, status);
3685 return 0;
3687 aSign = extractFloat32Sign( a );
3688 bSign = extractFloat32Sign( b );
3689 av = float32_val(a);
3690 bv = float32_val(b);
3691 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3692 return ( av == bv ) || ( aSign ^ ( av < bv ) );
3696 /*----------------------------------------------------------------------------
3697 | Returns 1 if the single-precision floating-point value `a' is less than
3698 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3699 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
3700 | Standard for Binary Floating-Point Arithmetic.
3701 *----------------------------------------------------------------------------*/
3703 int float32_lt_quiet(float32 a, float32 b, float_status *status)
3705 flag aSign, bSign;
3706 uint32_t av, bv;
3707 a = float32_squash_input_denormal(a, status);
3708 b = float32_squash_input_denormal(b, status);
3710 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3711 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3713 if (float32_is_signaling_nan(a, status)
3714 || float32_is_signaling_nan(b, status)) {
3715 float_raise(float_flag_invalid, status);
3717 return 0;
3719 aSign = extractFloat32Sign( a );
3720 bSign = extractFloat32Sign( b );
3721 av = float32_val(a);
3722 bv = float32_val(b);
3723 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3724 return ( av != bv ) && ( aSign ^ ( av < bv ) );
3728 /*----------------------------------------------------------------------------
3729 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3730 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
3731 | comparison is performed according to the IEC/IEEE Standard for Binary
3732 | Floating-Point Arithmetic.
3733 *----------------------------------------------------------------------------*/
3735 int float32_unordered_quiet(float32 a, float32 b, float_status *status)
3737 a = float32_squash_input_denormal(a, status);
3738 b = float32_squash_input_denormal(b, status);
3740 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3741 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3743 if (float32_is_signaling_nan(a, status)
3744 || float32_is_signaling_nan(b, status)) {
3745 float_raise(float_flag_invalid, status);
3747 return 1;
3749 return 0;
3752 /*----------------------------------------------------------------------------
3753 | If `a' is denormal and we are in flush-to-zero mode then set the
3754 | input-denormal exception and return zero. Otherwise just return the value.
3755 *----------------------------------------------------------------------------*/
3756 float16 float16_squash_input_denormal(float16 a, float_status *status)
3758 if (status->flush_inputs_to_zero) {
3759 if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) {
3760 float_raise(float_flag_input_denormal, status);
3761 return make_float16(float16_val(a) & 0x8000);
3764 return a;
3767 /*----------------------------------------------------------------------------
3768 | Returns the result of converting the double-precision floating-point value
3769 | `a' to the extended double-precision floating-point format. The conversion
3770 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3771 | Arithmetic.
3772 *----------------------------------------------------------------------------*/
3774 floatx80 float64_to_floatx80(float64 a, float_status *status)
3776 flag aSign;
3777 int aExp;
3778 uint64_t aSig;
3780 a = float64_squash_input_denormal(a, status);
3781 aSig = extractFloat64Frac( a );
3782 aExp = extractFloat64Exp( a );
3783 aSign = extractFloat64Sign( a );
3784 if ( aExp == 0x7FF ) {
3785 if (aSig) {
3786 return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
3788 return packFloatx80(aSign,
3789 floatx80_infinity_high,
3790 floatx80_infinity_low);
3792 if ( aExp == 0 ) {
3793 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3794 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3796 return
3797 packFloatx80(
3798 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
3802 /*----------------------------------------------------------------------------
3803 | Returns the result of converting the double-precision floating-point value
3804 | `a' to the quadruple-precision floating-point format. The conversion is
3805 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3806 | Arithmetic.
3807 *----------------------------------------------------------------------------*/
3809 float128 float64_to_float128(float64 a, float_status *status)
3811 flag aSign;
3812 int aExp;
3813 uint64_t aSig, zSig0, zSig1;
3815 a = float64_squash_input_denormal(a, status);
3816 aSig = extractFloat64Frac( a );
3817 aExp = extractFloat64Exp( a );
3818 aSign = extractFloat64Sign( a );
3819 if ( aExp == 0x7FF ) {
3820 if (aSig) {
3821 return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
3823 return packFloat128( aSign, 0x7FFF, 0, 0 );
3825 if ( aExp == 0 ) {
3826 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3827 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3828 --aExp;
3830 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
3831 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
3836 /*----------------------------------------------------------------------------
3837 | Returns the remainder of the double-precision floating-point value `a'
3838 | with respect to the corresponding value `b'. The operation is performed
3839 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3840 *----------------------------------------------------------------------------*/
3842 float64 float64_rem(float64 a, float64 b, float_status *status)
3844 flag aSign, zSign;
3845 int aExp, bExp, expDiff;
3846 uint64_t aSig, bSig;
3847 uint64_t q, alternateASig;
3848 int64_t sigMean;
3850 a = float64_squash_input_denormal(a, status);
3851 b = float64_squash_input_denormal(b, status);
3852 aSig = extractFloat64Frac( a );
3853 aExp = extractFloat64Exp( a );
3854 aSign = extractFloat64Sign( a );
3855 bSig = extractFloat64Frac( b );
3856 bExp = extractFloat64Exp( b );
3857 if ( aExp == 0x7FF ) {
3858 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3859 return propagateFloat64NaN(a, b, status);
3861 float_raise(float_flag_invalid, status);
3862 return float64_default_nan(status);
3864 if ( bExp == 0x7FF ) {
3865 if (bSig) {
3866 return propagateFloat64NaN(a, b, status);
3868 return a;
3870 if ( bExp == 0 ) {
3871 if ( bSig == 0 ) {
3872 float_raise(float_flag_invalid, status);
3873 return float64_default_nan(status);
3875 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3877 if ( aExp == 0 ) {
3878 if ( aSig == 0 ) return a;
3879 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3881 expDiff = aExp - bExp;
3882 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3883 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3884 if ( expDiff < 0 ) {
3885 if ( expDiff < -1 ) return a;
3886 aSig >>= 1;
3888 q = ( bSig <= aSig );
3889 if ( q ) aSig -= bSig;
3890 expDiff -= 64;
3891 while ( 0 < expDiff ) {
3892 q = estimateDiv128To64( aSig, 0, bSig );
3893 q = ( 2 < q ) ? q - 2 : 0;
3894 aSig = - ( ( bSig>>2 ) * q );
3895 expDiff -= 62;
3897 expDiff += 64;
3898 if ( 0 < expDiff ) {
3899 q = estimateDiv128To64( aSig, 0, bSig );
3900 q = ( 2 < q ) ? q - 2 : 0;
3901 q >>= 64 - expDiff;
3902 bSig >>= 2;
3903 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3905 else {
3906 aSig >>= 2;
3907 bSig >>= 2;
3909 do {
3910 alternateASig = aSig;
3911 ++q;
3912 aSig -= bSig;
3913 } while ( 0 <= (int64_t) aSig );
3914 sigMean = aSig + alternateASig;
3915 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3916 aSig = alternateASig;
3918 zSign = ( (int64_t) aSig < 0 );
3919 if ( zSign ) aSig = - aSig;
3920 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
3924 /*----------------------------------------------------------------------------
3925 | Returns the binary log of the double-precision floating-point value `a'.
3926 | The operation is performed according to the IEC/IEEE Standard for Binary
3927 | Floating-Point Arithmetic.
3928 *----------------------------------------------------------------------------*/
3929 float64 float64_log2(float64 a, float_status *status)
3931 flag aSign, zSign;
3932 int aExp;
3933 uint64_t aSig, aSig0, aSig1, zSig, i;
3934 a = float64_squash_input_denormal(a, status);
3936 aSig = extractFloat64Frac( a );
3937 aExp = extractFloat64Exp( a );
3938 aSign = extractFloat64Sign( a );
3940 if ( aExp == 0 ) {
3941 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
3942 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3944 if ( aSign ) {
3945 float_raise(float_flag_invalid, status);
3946 return float64_default_nan(status);
3948 if ( aExp == 0x7FF ) {
3949 if (aSig) {
3950 return propagateFloat64NaN(a, float64_zero, status);
3952 return a;
3955 aExp -= 0x3FF;
3956 aSig |= LIT64( 0x0010000000000000 );
3957 zSign = aExp < 0;
3958 zSig = (uint64_t)aExp << 52;
3959 for (i = 1LL << 51; i > 0; i >>= 1) {
3960 mul64To128( aSig, aSig, &aSig0, &aSig1 );
3961 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
3962 if ( aSig & LIT64( 0x0020000000000000 ) ) {
3963 aSig >>= 1;
3964 zSig |= i;
3968 if ( zSign )
3969 zSig = -zSig;
3970 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
3973 /*----------------------------------------------------------------------------
3974 | Returns 1 if the double-precision floating-point value `a' is equal to the
3975 | corresponding value `b', and 0 otherwise. The invalid exception is raised
3976 | if either operand is a NaN. Otherwise, the comparison is performed
3977 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3978 *----------------------------------------------------------------------------*/
3980 int float64_eq(float64 a, float64 b, float_status *status)
3982 uint64_t av, bv;
3983 a = float64_squash_input_denormal(a, status);
3984 b = float64_squash_input_denormal(b, status);
3986 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3987 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3989 float_raise(float_flag_invalid, status);
3990 return 0;
3992 av = float64_val(a);
3993 bv = float64_val(b);
3994 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
3998 /*----------------------------------------------------------------------------
3999 | Returns 1 if the double-precision floating-point value `a' is less than or
4000 | equal to the corresponding value `b', and 0 otherwise. The invalid
4001 | exception is raised if either operand is a NaN. The comparison is performed
4002 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4003 *----------------------------------------------------------------------------*/
4005 int float64_le(float64 a, float64 b, float_status *status)
4007 flag aSign, bSign;
4008 uint64_t av, bv;
4009 a = float64_squash_input_denormal(a, status);
4010 b = float64_squash_input_denormal(b, status);
4012 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4013 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4015 float_raise(float_flag_invalid, status);
4016 return 0;
4018 aSign = extractFloat64Sign( a );
4019 bSign = extractFloat64Sign( b );
4020 av = float64_val(a);
4021 bv = float64_val(b);
4022 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4023 return ( av == bv ) || ( aSign ^ ( av < bv ) );
4027 /*----------------------------------------------------------------------------
4028 | Returns 1 if the double-precision floating-point value `a' is less than
4029 | the corresponding value `b', and 0 otherwise. The invalid exception is
4030 | raised if either operand is a NaN. The comparison is performed according
4031 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4032 *----------------------------------------------------------------------------*/
4034 int float64_lt(float64 a, float64 b, float_status *status)
4036 flag aSign, bSign;
4037 uint64_t av, bv;
4039 a = float64_squash_input_denormal(a, status);
4040 b = float64_squash_input_denormal(b, status);
4041 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4042 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4044 float_raise(float_flag_invalid, status);
4045 return 0;
4047 aSign = extractFloat64Sign( a );
4048 bSign = extractFloat64Sign( b );
4049 av = float64_val(a);
4050 bv = float64_val(b);
4051 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4052 return ( av != bv ) && ( aSign ^ ( av < bv ) );
4056 /*----------------------------------------------------------------------------
4057 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4058 | be compared, and 0 otherwise. The invalid exception is raised if either
4059 | operand is a NaN. The comparison is performed according to the IEC/IEEE
4060 | Standard for Binary Floating-Point Arithmetic.
4061 *----------------------------------------------------------------------------*/
4063 int float64_unordered(float64 a, float64 b, float_status *status)
4065 a = float64_squash_input_denormal(a, status);
4066 b = float64_squash_input_denormal(b, status);
4068 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4069 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4071 float_raise(float_flag_invalid, status);
4072 return 1;
4074 return 0;
4077 /*----------------------------------------------------------------------------
4078 | Returns 1 if the double-precision floating-point value `a' is equal to the
4079 | corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4080 | exception.The comparison is performed according to the IEC/IEEE Standard
4081 | for Binary Floating-Point Arithmetic.
4082 *----------------------------------------------------------------------------*/
4084 int float64_eq_quiet(float64 a, float64 b, float_status *status)
4086 uint64_t av, bv;
4087 a = float64_squash_input_denormal(a, status);
4088 b = float64_squash_input_denormal(b, status);
4090 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4091 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4093 if (float64_is_signaling_nan(a, status)
4094 || float64_is_signaling_nan(b, status)) {
4095 float_raise(float_flag_invalid, status);
4097 return 0;
4099 av = float64_val(a);
4100 bv = float64_val(b);
4101 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4105 /*----------------------------------------------------------------------------
4106 | Returns 1 if the double-precision floating-point value `a' is less than or
4107 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4108 | cause an exception. Otherwise, the comparison is performed according to the
4109 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4110 *----------------------------------------------------------------------------*/
4112 int float64_le_quiet(float64 a, float64 b, float_status *status)
4114 flag aSign, bSign;
4115 uint64_t av, bv;
4116 a = float64_squash_input_denormal(a, status);
4117 b = float64_squash_input_denormal(b, status);
4119 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4120 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4122 if (float64_is_signaling_nan(a, status)
4123 || float64_is_signaling_nan(b, status)) {
4124 float_raise(float_flag_invalid, status);
4126 return 0;
4128 aSign = extractFloat64Sign( a );
4129 bSign = extractFloat64Sign( b );
4130 av = float64_val(a);
4131 bv = float64_val(b);
4132 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4133 return ( av == bv ) || ( aSign ^ ( av < bv ) );
4137 /*----------------------------------------------------------------------------
4138 | Returns 1 if the double-precision floating-point value `a' is less than
4139 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4140 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
4141 | Standard for Binary Floating-Point Arithmetic.
4142 *----------------------------------------------------------------------------*/
4144 int float64_lt_quiet(float64 a, float64 b, float_status *status)
4146 flag aSign, bSign;
4147 uint64_t av, bv;
4148 a = float64_squash_input_denormal(a, status);
4149 b = float64_squash_input_denormal(b, status);
4151 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4152 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4154 if (float64_is_signaling_nan(a, status)
4155 || float64_is_signaling_nan(b, status)) {
4156 float_raise(float_flag_invalid, status);
4158 return 0;
4160 aSign = extractFloat64Sign( a );
4161 bSign = extractFloat64Sign( b );
4162 av = float64_val(a);
4163 bv = float64_val(b);
4164 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4165 return ( av != bv ) && ( aSign ^ ( av < bv ) );
4169 /*----------------------------------------------------------------------------
4170 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4171 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
4172 | comparison is performed according to the IEC/IEEE Standard for Binary
4173 | Floating-Point Arithmetic.
4174 *----------------------------------------------------------------------------*/
4176 int float64_unordered_quiet(float64 a, float64 b, float_status *status)
4178 a = float64_squash_input_denormal(a, status);
4179 b = float64_squash_input_denormal(b, status);
4181 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4182 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4184 if (float64_is_signaling_nan(a, status)
4185 || float64_is_signaling_nan(b, status)) {
4186 float_raise(float_flag_invalid, status);
4188 return 1;
4190 return 0;
4193 /*----------------------------------------------------------------------------
4194 | Returns the result of converting the extended double-precision floating-
4195 | point value `a' to the 32-bit two's complement integer format. The
4196 | conversion is performed according to the IEC/IEEE Standard for Binary
4197 | Floating-Point Arithmetic---which means in particular that the conversion
4198 | is rounded according to the current rounding mode. If `a' is a NaN, the
4199 | largest positive integer is returned. Otherwise, if the conversion
4200 | overflows, the largest integer with the same sign as `a' is returned.
4201 *----------------------------------------------------------------------------*/
4203 int32_t floatx80_to_int32(floatx80 a, float_status *status)
4205 flag aSign;
4206 int32_t aExp, shiftCount;
4207 uint64_t aSig;
4209 if (floatx80_invalid_encoding(a)) {
4210 float_raise(float_flag_invalid, status);
4211 return 1 << 31;
4213 aSig = extractFloatx80Frac( a );
4214 aExp = extractFloatx80Exp( a );
4215 aSign = extractFloatx80Sign( a );
4216 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4217 shiftCount = 0x4037 - aExp;
4218 if ( shiftCount <= 0 ) shiftCount = 1;
4219 shift64RightJamming( aSig, shiftCount, &aSig );
4220 return roundAndPackInt32(aSign, aSig, status);
4224 /*----------------------------------------------------------------------------
4225 | Returns the result of converting the extended double-precision floating-
4226 | point value `a' to the 32-bit two's complement integer format. The
4227 | conversion is performed according to the IEC/IEEE Standard for Binary
4228 | Floating-Point Arithmetic, except that the conversion is always rounded
4229 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4230 | Otherwise, if the conversion overflows, the largest integer with the same
4231 | sign as `a' is returned.
4232 *----------------------------------------------------------------------------*/
4234 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
4236 flag aSign;
4237 int32_t aExp, shiftCount;
4238 uint64_t aSig, savedASig;
4239 int32_t z;
4241 if (floatx80_invalid_encoding(a)) {
4242 float_raise(float_flag_invalid, status);
4243 return 1 << 31;
4245 aSig = extractFloatx80Frac( a );
4246 aExp = extractFloatx80Exp( a );
4247 aSign = extractFloatx80Sign( a );
4248 if ( 0x401E < aExp ) {
4249 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4250 goto invalid;
4252 else if ( aExp < 0x3FFF ) {
4253 if (aExp || aSig) {
4254 status->float_exception_flags |= float_flag_inexact;
4256 return 0;
4258 shiftCount = 0x403E - aExp;
4259 savedASig = aSig;
4260 aSig >>= shiftCount;
4261 z = aSig;
4262 if ( aSign ) z = - z;
4263 if ( ( z < 0 ) ^ aSign ) {
4264 invalid:
4265 float_raise(float_flag_invalid, status);
4266 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4268 if ( ( aSig<<shiftCount ) != savedASig ) {
4269 status->float_exception_flags |= float_flag_inexact;
4271 return z;
4275 /*----------------------------------------------------------------------------
4276 | Returns the result of converting the extended double-precision floating-
4277 | point value `a' to the 64-bit two's complement integer format. The
4278 | conversion is performed according to the IEC/IEEE Standard for Binary
4279 | Floating-Point Arithmetic---which means in particular that the conversion
4280 | is rounded according to the current rounding mode. If `a' is a NaN,
4281 | the largest positive integer is returned. Otherwise, if the conversion
4282 | overflows, the largest integer with the same sign as `a' is returned.
4283 *----------------------------------------------------------------------------*/
4285 int64_t floatx80_to_int64(floatx80 a, float_status *status)
4287 flag aSign;
4288 int32_t aExp, shiftCount;
4289 uint64_t aSig, aSigExtra;
4291 if (floatx80_invalid_encoding(a)) {
4292 float_raise(float_flag_invalid, status);
4293 return 1ULL << 63;
4295 aSig = extractFloatx80Frac( a );
4296 aExp = extractFloatx80Exp( a );
4297 aSign = extractFloatx80Sign( a );
4298 shiftCount = 0x403E - aExp;
4299 if ( shiftCount <= 0 ) {
4300 if ( shiftCount ) {
4301 float_raise(float_flag_invalid, status);
4302 if (!aSign || floatx80_is_any_nan(a)) {
4303 return LIT64( 0x7FFFFFFFFFFFFFFF );
4305 return (int64_t) LIT64( 0x8000000000000000 );
4307 aSigExtra = 0;
4309 else {
4310 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4312 return roundAndPackInt64(aSign, aSig, aSigExtra, status);
4316 /*----------------------------------------------------------------------------
4317 | Returns the result of converting the extended double-precision floating-
4318 | point value `a' to the 64-bit two's complement integer format. The
4319 | conversion is performed according to the IEC/IEEE Standard for Binary
4320 | Floating-Point Arithmetic, except that the conversion is always rounded
4321 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4322 | Otherwise, if the conversion overflows, the largest integer with the same
4323 | sign as `a' is returned.
4324 *----------------------------------------------------------------------------*/
4326 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
4328 flag aSign;
4329 int32_t aExp, shiftCount;
4330 uint64_t aSig;
4331 int64_t z;
4333 if (floatx80_invalid_encoding(a)) {
4334 float_raise(float_flag_invalid, status);
4335 return 1ULL << 63;
4337 aSig = extractFloatx80Frac( a );
4338 aExp = extractFloatx80Exp( a );
4339 aSign = extractFloatx80Sign( a );
4340 shiftCount = aExp - 0x403E;
4341 if ( 0 <= shiftCount ) {
4342 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4343 if ( ( a.high != 0xC03E ) || aSig ) {
4344 float_raise(float_flag_invalid, status);
4345 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4346 return LIT64( 0x7FFFFFFFFFFFFFFF );
4349 return (int64_t) LIT64( 0x8000000000000000 );
4351 else if ( aExp < 0x3FFF ) {
4352 if (aExp | aSig) {
4353 status->float_exception_flags |= float_flag_inexact;
4355 return 0;
4357 z = aSig>>( - shiftCount );
4358 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4359 status->float_exception_flags |= float_flag_inexact;
4361 if ( aSign ) z = - z;
4362 return z;
4366 /*----------------------------------------------------------------------------
4367 | Returns the result of converting the extended double-precision floating-
4368 | point value `a' to the single-precision floating-point format. The
4369 | conversion is performed according to the IEC/IEEE Standard for Binary
4370 | Floating-Point Arithmetic.
4371 *----------------------------------------------------------------------------*/
4373 float32 floatx80_to_float32(floatx80 a, float_status *status)
4375 flag aSign;
4376 int32_t aExp;
4377 uint64_t aSig;
4379 if (floatx80_invalid_encoding(a)) {
4380 float_raise(float_flag_invalid, status);
4381 return float32_default_nan(status);
4383 aSig = extractFloatx80Frac( a );
4384 aExp = extractFloatx80Exp( a );
4385 aSign = extractFloatx80Sign( a );
4386 if ( aExp == 0x7FFF ) {
4387 if ( (uint64_t) ( aSig<<1 ) ) {
4388 return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
4390 return packFloat32( aSign, 0xFF, 0 );
4392 shift64RightJamming( aSig, 33, &aSig );
4393 if ( aExp || aSig ) aExp -= 0x3F81;
4394 return roundAndPackFloat32(aSign, aExp, aSig, status);
4398 /*----------------------------------------------------------------------------
4399 | Returns the result of converting the extended double-precision floating-
4400 | point value `a' to the double-precision floating-point format. The
4401 | conversion is performed according to the IEC/IEEE Standard for Binary
4402 | Floating-Point Arithmetic.
4403 *----------------------------------------------------------------------------*/
4405 float64 floatx80_to_float64(floatx80 a, float_status *status)
4407 flag aSign;
4408 int32_t aExp;
4409 uint64_t aSig, zSig;
4411 if (floatx80_invalid_encoding(a)) {
4412 float_raise(float_flag_invalid, status);
4413 return float64_default_nan(status);
4415 aSig = extractFloatx80Frac( a );
4416 aExp = extractFloatx80Exp( a );
4417 aSign = extractFloatx80Sign( a );
4418 if ( aExp == 0x7FFF ) {
4419 if ( (uint64_t) ( aSig<<1 ) ) {
4420 return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
4422 return packFloat64( aSign, 0x7FF, 0 );
4424 shift64RightJamming( aSig, 1, &zSig );
4425 if ( aExp || aSig ) aExp -= 0x3C01;
4426 return roundAndPackFloat64(aSign, aExp, zSig, status);
4430 /*----------------------------------------------------------------------------
4431 | Returns the result of converting the extended double-precision floating-
4432 | point value `a' to the quadruple-precision floating-point format. The
4433 | conversion is performed according to the IEC/IEEE Standard for Binary
4434 | Floating-Point Arithmetic.
4435 *----------------------------------------------------------------------------*/
4437 float128 floatx80_to_float128(floatx80 a, float_status *status)
4439 flag aSign;
4440 int aExp;
4441 uint64_t aSig, zSig0, zSig1;
4443 if (floatx80_invalid_encoding(a)) {
4444 float_raise(float_flag_invalid, status);
4445 return float128_default_nan(status);
4447 aSig = extractFloatx80Frac( a );
4448 aExp = extractFloatx80Exp( a );
4449 aSign = extractFloatx80Sign( a );
4450 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4451 return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
4453 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4454 return packFloat128( aSign, aExp, zSig0, zSig1 );
4458 /*----------------------------------------------------------------------------
4459 | Rounds the extended double-precision floating-point value `a'
4460 | to the precision provided by floatx80_rounding_precision and returns the
4461 | result as an extended double-precision floating-point value.
4462 | The operation is performed according to the IEC/IEEE Standard for Binary
4463 | Floating-Point Arithmetic.
4464 *----------------------------------------------------------------------------*/
4466 floatx80 floatx80_round(floatx80 a, float_status *status)
4468 return roundAndPackFloatx80(status->floatx80_rounding_precision,
4469 extractFloatx80Sign(a),
4470 extractFloatx80Exp(a),
4471 extractFloatx80Frac(a), 0, status);
4474 /*----------------------------------------------------------------------------
4475 | Rounds the extended double-precision floating-point value `a' to an integer,
4476 | and returns the result as an extended quadruple-precision floating-point
4477 | value. The operation is performed according to the IEC/IEEE Standard for
4478 | Binary Floating-Point Arithmetic.
4479 *----------------------------------------------------------------------------*/
4481 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
4483 flag aSign;
4484 int32_t aExp;
4485 uint64_t lastBitMask, roundBitsMask;
4486 floatx80 z;
4488 if (floatx80_invalid_encoding(a)) {
4489 float_raise(float_flag_invalid, status);
4490 return floatx80_default_nan(status);
4492 aExp = extractFloatx80Exp( a );
4493 if ( 0x403E <= aExp ) {
4494 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4495 return propagateFloatx80NaN(a, a, status);
4497 return a;
4499 if ( aExp < 0x3FFF ) {
4500 if ( ( aExp == 0 )
4501 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4502 return a;
4504 status->float_exception_flags |= float_flag_inexact;
4505 aSign = extractFloatx80Sign( a );
4506 switch (status->float_rounding_mode) {
4507 case float_round_nearest_even:
4508 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4510 return
4511 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4513 break;
4514 case float_round_ties_away:
4515 if (aExp == 0x3FFE) {
4516 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
4518 break;
4519 case float_round_down:
4520 return
4521 aSign ?
4522 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4523 : packFloatx80( 0, 0, 0 );
4524 case float_round_up:
4525 return
4526 aSign ? packFloatx80( 1, 0, 0 )
4527 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4529 return packFloatx80( aSign, 0, 0 );
4531 lastBitMask = 1;
4532 lastBitMask <<= 0x403E - aExp;
4533 roundBitsMask = lastBitMask - 1;
4534 z = a;
4535 switch (status->float_rounding_mode) {
4536 case float_round_nearest_even:
4537 z.low += lastBitMask>>1;
4538 if ((z.low & roundBitsMask) == 0) {
4539 z.low &= ~lastBitMask;
4541 break;
4542 case float_round_ties_away:
4543 z.low += lastBitMask >> 1;
4544 break;
4545 case float_round_to_zero:
4546 break;
4547 case float_round_up:
4548 if (!extractFloatx80Sign(z)) {
4549 z.low += roundBitsMask;
4551 break;
4552 case float_round_down:
4553 if (extractFloatx80Sign(z)) {
4554 z.low += roundBitsMask;
4556 break;
4557 default:
4558 abort();
4560 z.low &= ~ roundBitsMask;
4561 if ( z.low == 0 ) {
4562 ++z.high;
4563 z.low = LIT64( 0x8000000000000000 );
4565 if (z.low != a.low) {
4566 status->float_exception_flags |= float_flag_inexact;
4568 return z;
4572 /*----------------------------------------------------------------------------
4573 | Returns the result of adding the absolute values of the extended double-
4574 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
4575 | negated before being returned. `zSign' is ignored if the result is a NaN.
4576 | The addition is performed according to the IEC/IEEE Standard for Binary
4577 | Floating-Point Arithmetic.
4578 *----------------------------------------------------------------------------*/
4580 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4581 float_status *status)
4583 int32_t aExp, bExp, zExp;
4584 uint64_t aSig, bSig, zSig0, zSig1;
4585 int32_t expDiff;
4587 aSig = extractFloatx80Frac( a );
4588 aExp = extractFloatx80Exp( a );
4589 bSig = extractFloatx80Frac( b );
4590 bExp = extractFloatx80Exp( b );
4591 expDiff = aExp - bExp;
4592 if ( 0 < expDiff ) {
4593 if ( aExp == 0x7FFF ) {
4594 if ((uint64_t)(aSig << 1)) {
4595 return propagateFloatx80NaN(a, b, status);
4597 return a;
4599 if ( bExp == 0 ) --expDiff;
4600 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4601 zExp = aExp;
4603 else if ( expDiff < 0 ) {
4604 if ( bExp == 0x7FFF ) {
4605 if ((uint64_t)(bSig << 1)) {
4606 return propagateFloatx80NaN(a, b, status);
4608 return packFloatx80(zSign,
4609 floatx80_infinity_high,
4610 floatx80_infinity_low);
4612 if ( aExp == 0 ) ++expDiff;
4613 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4614 zExp = bExp;
4616 else {
4617 if ( aExp == 0x7FFF ) {
4618 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4619 return propagateFloatx80NaN(a, b, status);
4621 return a;
4623 zSig1 = 0;
4624 zSig0 = aSig + bSig;
4625 if ( aExp == 0 ) {
4626 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4627 goto roundAndPack;
4629 zExp = aExp;
4630 goto shiftRight1;
4632 zSig0 = aSig + bSig;
4633 if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
4634 shiftRight1:
4635 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4636 zSig0 |= LIT64( 0x8000000000000000 );
4637 ++zExp;
4638 roundAndPack:
4639 return roundAndPackFloatx80(status->floatx80_rounding_precision,
4640 zSign, zExp, zSig0, zSig1, status);
4643 /*----------------------------------------------------------------------------
4644 | Returns the result of subtracting the absolute values of the extended
4645 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
4646 | difference is negated before being returned. `zSign' is ignored if the
4647 | result is a NaN. The subtraction is performed according to the IEC/IEEE
4648 | Standard for Binary Floating-Point Arithmetic.
4649 *----------------------------------------------------------------------------*/
4651 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4652 float_status *status)
4654 int32_t aExp, bExp, zExp;
4655 uint64_t aSig, bSig, zSig0, zSig1;
4656 int32_t expDiff;
4658 aSig = extractFloatx80Frac( a );
4659 aExp = extractFloatx80Exp( a );
4660 bSig = extractFloatx80Frac( b );
4661 bExp = extractFloatx80Exp( b );
4662 expDiff = aExp - bExp;
4663 if ( 0 < expDiff ) goto aExpBigger;
4664 if ( expDiff < 0 ) goto bExpBigger;
4665 if ( aExp == 0x7FFF ) {
4666 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4667 return propagateFloatx80NaN(a, b, status);
4669 float_raise(float_flag_invalid, status);
4670 return floatx80_default_nan(status);
4672 if ( aExp == 0 ) {
4673 aExp = 1;
4674 bExp = 1;
4676 zSig1 = 0;
4677 if ( bSig < aSig ) goto aBigger;
4678 if ( aSig < bSig ) goto bBigger;
4679 return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
4680 bExpBigger:
4681 if ( bExp == 0x7FFF ) {
4682 if ((uint64_t)(bSig << 1)) {
4683 return propagateFloatx80NaN(a, b, status);
4685 return packFloatx80(zSign ^ 1, floatx80_infinity_high,
4686 floatx80_infinity_low);
4688 if ( aExp == 0 ) ++expDiff;
4689 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4690 bBigger:
4691 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4692 zExp = bExp;
4693 zSign ^= 1;
4694 goto normalizeRoundAndPack;
4695 aExpBigger:
4696 if ( aExp == 0x7FFF ) {
4697 if ((uint64_t)(aSig << 1)) {
4698 return propagateFloatx80NaN(a, b, status);
4700 return a;
4702 if ( bExp == 0 ) --expDiff;
4703 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4704 aBigger:
4705 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4706 zExp = aExp;
4707 normalizeRoundAndPack:
4708 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
4709 zSign, zExp, zSig0, zSig1, status);
4712 /*----------------------------------------------------------------------------
4713 | Returns the result of adding the extended double-precision floating-point
4714 | values `a' and `b'. The operation is performed according to the IEC/IEEE
4715 | Standard for Binary Floating-Point Arithmetic.
4716 *----------------------------------------------------------------------------*/
4718 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
4720 flag aSign, bSign;
4722 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
4723 float_raise(float_flag_invalid, status);
4724 return floatx80_default_nan(status);
4726 aSign = extractFloatx80Sign( a );
4727 bSign = extractFloatx80Sign( b );
4728 if ( aSign == bSign ) {
4729 return addFloatx80Sigs(a, b, aSign, status);
4731 else {
4732 return subFloatx80Sigs(a, b, aSign, status);
4737 /*----------------------------------------------------------------------------
4738 | Returns the result of subtracting the extended double-precision floating-
4739 | point values `a' and `b'. The operation is performed according to the
4740 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4741 *----------------------------------------------------------------------------*/
4743 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
4745 flag aSign, bSign;
4747 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
4748 float_raise(float_flag_invalid, status);
4749 return floatx80_default_nan(status);
4751 aSign = extractFloatx80Sign( a );
4752 bSign = extractFloatx80Sign( b );
4753 if ( aSign == bSign ) {
4754 return subFloatx80Sigs(a, b, aSign, status);
4756 else {
4757 return addFloatx80Sigs(a, b, aSign, status);
4762 /*----------------------------------------------------------------------------
4763 | Returns the result of multiplying the extended double-precision floating-
4764 | point values `a' and `b'. The operation is performed according to the
4765 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4766 *----------------------------------------------------------------------------*/
4768 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
4770 flag aSign, bSign, zSign;
4771 int32_t aExp, bExp, zExp;
4772 uint64_t aSig, bSig, zSig0, zSig1;
4774 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
4775 float_raise(float_flag_invalid, status);
4776 return floatx80_default_nan(status);
4778 aSig = extractFloatx80Frac( a );
4779 aExp = extractFloatx80Exp( a );
4780 aSign = extractFloatx80Sign( a );
4781 bSig = extractFloatx80Frac( b );
4782 bExp = extractFloatx80Exp( b );
4783 bSign = extractFloatx80Sign( b );
4784 zSign = aSign ^ bSign;
4785 if ( aExp == 0x7FFF ) {
4786 if ( (uint64_t) ( aSig<<1 )
4787 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
4788 return propagateFloatx80NaN(a, b, status);
4790 if ( ( bExp | bSig ) == 0 ) goto invalid;
4791 return packFloatx80(zSign, floatx80_infinity_high,
4792 floatx80_infinity_low);
4794 if ( bExp == 0x7FFF ) {
4795 if ((uint64_t)(bSig << 1)) {
4796 return propagateFloatx80NaN(a, b, status);
4798 if ( ( aExp | aSig ) == 0 ) {
4799 invalid:
4800 float_raise(float_flag_invalid, status);
4801 return floatx80_default_nan(status);
4803 return packFloatx80(zSign, floatx80_infinity_high,
4804 floatx80_infinity_low);
4806 if ( aExp == 0 ) {
4807 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4808 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4810 if ( bExp == 0 ) {
4811 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
4812 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4814 zExp = aExp + bExp - 0x3FFE;
4815 mul64To128( aSig, bSig, &zSig0, &zSig1 );
4816 if ( 0 < (int64_t) zSig0 ) {
4817 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
4818 --zExp;
4820 return roundAndPackFloatx80(status->floatx80_rounding_precision,
4821 zSign, zExp, zSig0, zSig1, status);
4824 /*----------------------------------------------------------------------------
4825 | Returns the result of dividing the extended double-precision floating-point
4826 | value `a' by the corresponding value `b'. The operation is performed
4827 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4828 *----------------------------------------------------------------------------*/
4830 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
4832 flag aSign, bSign, zSign;
4833 int32_t aExp, bExp, zExp;
4834 uint64_t aSig, bSig, zSig0, zSig1;
4835 uint64_t rem0, rem1, rem2, term0, term1, term2;
4837 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
4838 float_raise(float_flag_invalid, status);
4839 return floatx80_default_nan(status);
4841 aSig = extractFloatx80Frac( a );
4842 aExp = extractFloatx80Exp( a );
4843 aSign = extractFloatx80Sign( a );
4844 bSig = extractFloatx80Frac( b );
4845 bExp = extractFloatx80Exp( b );
4846 bSign = extractFloatx80Sign( b );
4847 zSign = aSign ^ bSign;
4848 if ( aExp == 0x7FFF ) {
4849 if ((uint64_t)(aSig << 1)) {
4850 return propagateFloatx80NaN(a, b, status);
4852 if ( bExp == 0x7FFF ) {
4853 if ((uint64_t)(bSig << 1)) {
4854 return propagateFloatx80NaN(a, b, status);
4856 goto invalid;
4858 return packFloatx80(zSign, floatx80_infinity_high,
4859 floatx80_infinity_low);
4861 if ( bExp == 0x7FFF ) {
4862 if ((uint64_t)(bSig << 1)) {
4863 return propagateFloatx80NaN(a, b, status);
4865 return packFloatx80( zSign, 0, 0 );
4867 if ( bExp == 0 ) {
4868 if ( bSig == 0 ) {
4869 if ( ( aExp | aSig ) == 0 ) {
4870 invalid:
4871 float_raise(float_flag_invalid, status);
4872 return floatx80_default_nan(status);
4874 float_raise(float_flag_divbyzero, status);
4875 return packFloatx80(zSign, floatx80_infinity_high,
4876 floatx80_infinity_low);
4878 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4880 if ( aExp == 0 ) {
4881 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4882 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4884 zExp = aExp - bExp + 0x3FFE;
4885 rem1 = 0;
4886 if ( bSig <= aSig ) {
4887 shift128Right( aSig, 0, 1, &aSig, &rem1 );
4888 ++zExp;
4890 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
4891 mul64To128( bSig, zSig0, &term0, &term1 );
4892 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
4893 while ( (int64_t) rem0 < 0 ) {
4894 --zSig0;
4895 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4897 zSig1 = estimateDiv128To64( rem1, 0, bSig );
4898 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
4899 mul64To128( bSig, zSig1, &term1, &term2 );
4900 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4901 while ( (int64_t) rem1 < 0 ) {
4902 --zSig1;
4903 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
4905 zSig1 |= ( ( rem1 | rem2 ) != 0 );
4907 return roundAndPackFloatx80(status->floatx80_rounding_precision,
4908 zSign, zExp, zSig0, zSig1, status);
4911 /*----------------------------------------------------------------------------
4912 | Returns the remainder of the extended double-precision floating-point value
4913 | `a' with respect to the corresponding value `b'. The operation is performed
4914 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4915 *----------------------------------------------------------------------------*/
4917 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
4919 flag aSign, zSign;
4920 int32_t aExp, bExp, expDiff;
4921 uint64_t aSig0, aSig1, bSig;
4922 uint64_t q, term0, term1, alternateASig0, alternateASig1;
4924 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
4925 float_raise(float_flag_invalid, status);
4926 return floatx80_default_nan(status);
4928 aSig0 = extractFloatx80Frac( a );
4929 aExp = extractFloatx80Exp( a );
4930 aSign = extractFloatx80Sign( a );
4931 bSig = extractFloatx80Frac( b );
4932 bExp = extractFloatx80Exp( b );
4933 if ( aExp == 0x7FFF ) {
4934 if ( (uint64_t) ( aSig0<<1 )
4935 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
4936 return propagateFloatx80NaN(a, b, status);
4938 goto invalid;
4940 if ( bExp == 0x7FFF ) {
4941 if ((uint64_t)(bSig << 1)) {
4942 return propagateFloatx80NaN(a, b, status);
4944 return a;
4946 if ( bExp == 0 ) {
4947 if ( bSig == 0 ) {
4948 invalid:
4949 float_raise(float_flag_invalid, status);
4950 return floatx80_default_nan(status);
4952 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4954 if ( aExp == 0 ) {
4955 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
4956 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4958 bSig |= LIT64( 0x8000000000000000 );
4959 zSign = aSign;
4960 expDiff = aExp - bExp;
4961 aSig1 = 0;
4962 if ( expDiff < 0 ) {
4963 if ( expDiff < -1 ) return a;
4964 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
4965 expDiff = 0;
4967 q = ( bSig <= aSig0 );
4968 if ( q ) aSig0 -= bSig;
4969 expDiff -= 64;
4970 while ( 0 < expDiff ) {
4971 q = estimateDiv128To64( aSig0, aSig1, bSig );
4972 q = ( 2 < q ) ? q - 2 : 0;
4973 mul64To128( bSig, q, &term0, &term1 );
4974 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4975 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
4976 expDiff -= 62;
4978 expDiff += 64;
4979 if ( 0 < expDiff ) {
4980 q = estimateDiv128To64( aSig0, aSig1, bSig );
4981 q = ( 2 < q ) ? q - 2 : 0;
4982 q >>= 64 - expDiff;
4983 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
4984 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4985 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
4986 while ( le128( term0, term1, aSig0, aSig1 ) ) {
4987 ++q;
4988 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4991 else {
4992 term1 = 0;
4993 term0 = bSig;
4995 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4996 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4997 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4998 && ( q & 1 ) )
5000 aSig0 = alternateASig0;
5001 aSig1 = alternateASig1;
5002 zSign = ! zSign;
5004 return
5005 normalizeRoundAndPackFloatx80(
5006 80, zSign, bExp + expDiff, aSig0, aSig1, status);
5010 /*----------------------------------------------------------------------------
5011 | Returns the square root of the extended double-precision floating-point
5012 | value `a'. The operation is performed according to the IEC/IEEE Standard
5013 | for Binary Floating-Point Arithmetic.
5014 *----------------------------------------------------------------------------*/
5016 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
5018 flag aSign;
5019 int32_t aExp, zExp;
5020 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5021 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5023 if (floatx80_invalid_encoding(a)) {
5024 float_raise(float_flag_invalid, status);
5025 return floatx80_default_nan(status);
5027 aSig0 = extractFloatx80Frac( a );
5028 aExp = extractFloatx80Exp( a );
5029 aSign = extractFloatx80Sign( a );
5030 if ( aExp == 0x7FFF ) {
5031 if ((uint64_t)(aSig0 << 1)) {
5032 return propagateFloatx80NaN(a, a, status);
5034 if ( ! aSign ) return a;
5035 goto invalid;
5037 if ( aSign ) {
5038 if ( ( aExp | aSig0 ) == 0 ) return a;
5039 invalid:
5040 float_raise(float_flag_invalid, status);
5041 return floatx80_default_nan(status);
5043 if ( aExp == 0 ) {
5044 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5045 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5047 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5048 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5049 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5050 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5051 doubleZSig0 = zSig0<<1;
5052 mul64To128( zSig0, zSig0, &term0, &term1 );
5053 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5054 while ( (int64_t) rem0 < 0 ) {
5055 --zSig0;
5056 doubleZSig0 -= 2;
5057 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5059 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5060 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5061 if ( zSig1 == 0 ) zSig1 = 1;
5062 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5063 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5064 mul64To128( zSig1, zSig1, &term2, &term3 );
5065 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5066 while ( (int64_t) rem1 < 0 ) {
5067 --zSig1;
5068 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5069 term3 |= 1;
5070 term2 |= doubleZSig0;
5071 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5073 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5075 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5076 zSig0 |= doubleZSig0;
5077 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5078 0, zExp, zSig0, zSig1, status);
5081 /*----------------------------------------------------------------------------
5082 | Returns 1 if the extended double-precision floating-point value `a' is equal
5083 | to the corresponding value `b', and 0 otherwise. The invalid exception is
5084 | raised if either operand is a NaN. Otherwise, the comparison is performed
5085 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5086 *----------------------------------------------------------------------------*/
5088 int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
5091 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5092 || (extractFloatx80Exp(a) == 0x7FFF
5093 && (uint64_t) (extractFloatx80Frac(a) << 1))
5094 || (extractFloatx80Exp(b) == 0x7FFF
5095 && (uint64_t) (extractFloatx80Frac(b) << 1))
5097 float_raise(float_flag_invalid, status);
5098 return 0;
5100 return
5101 ( a.low == b.low )
5102 && ( ( a.high == b.high )
5103 || ( ( a.low == 0 )
5104 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5109 /*----------------------------------------------------------------------------
5110 | Returns 1 if the extended double-precision floating-point value `a' is
5111 | less than or equal to the corresponding value `b', and 0 otherwise. The
5112 | invalid exception is raised if either operand is a NaN. The comparison is
5113 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5114 | Arithmetic.
5115 *----------------------------------------------------------------------------*/
5117 int floatx80_le(floatx80 a, floatx80 b, float_status *status)
5119 flag aSign, bSign;
5121 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5122 || (extractFloatx80Exp(a) == 0x7FFF
5123 && (uint64_t) (extractFloatx80Frac(a) << 1))
5124 || (extractFloatx80Exp(b) == 0x7FFF
5125 && (uint64_t) (extractFloatx80Frac(b) << 1))
5127 float_raise(float_flag_invalid, status);
5128 return 0;
5130 aSign = extractFloatx80Sign( a );
5131 bSign = extractFloatx80Sign( b );
5132 if ( aSign != bSign ) {
5133 return
5134 aSign
5135 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5136 == 0 );
5138 return
5139 aSign ? le128( b.high, b.low, a.high, a.low )
5140 : le128( a.high, a.low, b.high, b.low );
5144 /*----------------------------------------------------------------------------
5145 | Returns 1 if the extended double-precision floating-point value `a' is
5146 | less than the corresponding value `b', and 0 otherwise. The invalid
5147 | exception is raised if either operand is a NaN. The comparison is performed
5148 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5149 *----------------------------------------------------------------------------*/
5151 int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
5153 flag aSign, bSign;
5155 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5156 || (extractFloatx80Exp(a) == 0x7FFF
5157 && (uint64_t) (extractFloatx80Frac(a) << 1))
5158 || (extractFloatx80Exp(b) == 0x7FFF
5159 && (uint64_t) (extractFloatx80Frac(b) << 1))
5161 float_raise(float_flag_invalid, status);
5162 return 0;
5164 aSign = extractFloatx80Sign( a );
5165 bSign = extractFloatx80Sign( b );
5166 if ( aSign != bSign ) {
5167 return
5168 aSign
5169 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5170 != 0 );
5172 return
5173 aSign ? lt128( b.high, b.low, a.high, a.low )
5174 : lt128( a.high, a.low, b.high, b.low );
5178 /*----------------------------------------------------------------------------
5179 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5180 | cannot be compared, and 0 otherwise. The invalid exception is raised if
5181 | either operand is a NaN. The comparison is performed according to the
5182 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5183 *----------------------------------------------------------------------------*/
5184 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
5186 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5187 || (extractFloatx80Exp(a) == 0x7FFF
5188 && (uint64_t) (extractFloatx80Frac(a) << 1))
5189 || (extractFloatx80Exp(b) == 0x7FFF
5190 && (uint64_t) (extractFloatx80Frac(b) << 1))
5192 float_raise(float_flag_invalid, status);
5193 return 1;
5195 return 0;
5198 /*----------------------------------------------------------------------------
5199 | Returns 1 if the extended double-precision floating-point value `a' is
5200 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5201 | cause an exception. The comparison is performed according to the IEC/IEEE
5202 | Standard for Binary Floating-Point Arithmetic.
5203 *----------------------------------------------------------------------------*/
5205 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
5208 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5209 float_raise(float_flag_invalid, status);
5210 return 0;
5212 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5213 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5214 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5215 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5217 if (floatx80_is_signaling_nan(a, status)
5218 || floatx80_is_signaling_nan(b, status)) {
5219 float_raise(float_flag_invalid, status);
5221 return 0;
5223 return
5224 ( a.low == b.low )
5225 && ( ( a.high == b.high )
5226 || ( ( a.low == 0 )
5227 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5232 /*----------------------------------------------------------------------------
5233 | Returns 1 if the extended double-precision floating-point value `a' is less
5234 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
5235 | do not cause an exception. Otherwise, the comparison is performed according
5236 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5237 *----------------------------------------------------------------------------*/
5239 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
5241 flag aSign, bSign;
5243 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5244 float_raise(float_flag_invalid, status);
5245 return 0;
5247 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5248 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5249 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5250 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5252 if (floatx80_is_signaling_nan(a, status)
5253 || floatx80_is_signaling_nan(b, status)) {
5254 float_raise(float_flag_invalid, status);
5256 return 0;
5258 aSign = extractFloatx80Sign( a );
5259 bSign = extractFloatx80Sign( b );
5260 if ( aSign != bSign ) {
5261 return
5262 aSign
5263 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5264 == 0 );
5266 return
5267 aSign ? le128( b.high, b.low, a.high, a.low )
5268 : le128( a.high, a.low, b.high, b.low );
5272 /*----------------------------------------------------------------------------
5273 | Returns 1 if the extended double-precision floating-point value `a' is less
5274 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
5275 | an exception. Otherwise, the comparison is performed according to the
5276 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5277 *----------------------------------------------------------------------------*/
5279 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
5281 flag aSign, bSign;
5283 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5284 float_raise(float_flag_invalid, status);
5285 return 0;
5287 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5288 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5289 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5290 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5292 if (floatx80_is_signaling_nan(a, status)
5293 || floatx80_is_signaling_nan(b, status)) {
5294 float_raise(float_flag_invalid, status);
5296 return 0;
5298 aSign = extractFloatx80Sign( a );
5299 bSign = extractFloatx80Sign( b );
5300 if ( aSign != bSign ) {
5301 return
5302 aSign
5303 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5304 != 0 );
5306 return
5307 aSign ? lt128( b.high, b.low, a.high, a.low )
5308 : lt128( a.high, a.low, b.high, b.low );
5312 /*----------------------------------------------------------------------------
5313 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5314 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
5315 | The comparison is performed according to the IEC/IEEE Standard for Binary
5316 | Floating-Point Arithmetic.
5317 *----------------------------------------------------------------------------*/
5318 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
5320 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5321 float_raise(float_flag_invalid, status);
5322 return 1;
5324 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5325 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5326 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5327 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5329 if (floatx80_is_signaling_nan(a, status)
5330 || floatx80_is_signaling_nan(b, status)) {
5331 float_raise(float_flag_invalid, status);
5333 return 1;
5335 return 0;
5338 /*----------------------------------------------------------------------------
5339 | Returns the result of converting the quadruple-precision floating-point
5340 | value `a' to the 32-bit two's complement integer format. The conversion
5341 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5342 | Arithmetic---which means in particular that the conversion is rounded
5343 | according to the current rounding mode. If `a' is a NaN, the largest
5344 | positive integer is returned. Otherwise, if the conversion overflows, the
5345 | largest integer with the same sign as `a' is returned.
5346 *----------------------------------------------------------------------------*/
5348 int32_t float128_to_int32(float128 a, float_status *status)
5350 flag aSign;
5351 int32_t aExp, shiftCount;
5352 uint64_t aSig0, aSig1;
5354 aSig1 = extractFloat128Frac1( a );
5355 aSig0 = extractFloat128Frac0( a );
5356 aExp = extractFloat128Exp( a );
5357 aSign = extractFloat128Sign( a );
5358 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5359 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5360 aSig0 |= ( aSig1 != 0 );
5361 shiftCount = 0x4028 - aExp;
5362 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5363 return roundAndPackInt32(aSign, aSig0, status);
5367 /*----------------------------------------------------------------------------
5368 | Returns the result of converting the quadruple-precision floating-point
5369 | value `a' to the 32-bit two's complement integer format. The conversion
5370 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5371 | Arithmetic, except that the conversion is always rounded toward zero. If
5372 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
5373 | conversion overflows, the largest integer with the same sign as `a' is
5374 | returned.
5375 *----------------------------------------------------------------------------*/
5377 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
5379 flag aSign;
5380 int32_t aExp, shiftCount;
5381 uint64_t aSig0, aSig1, savedASig;
5382 int32_t z;
5384 aSig1 = extractFloat128Frac1( a );
5385 aSig0 = extractFloat128Frac0( a );
5386 aExp = extractFloat128Exp( a );
5387 aSign = extractFloat128Sign( a );
5388 aSig0 |= ( aSig1 != 0 );
5389 if ( 0x401E < aExp ) {
5390 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5391 goto invalid;
5393 else if ( aExp < 0x3FFF ) {
5394 if (aExp || aSig0) {
5395 status->float_exception_flags |= float_flag_inexact;
5397 return 0;
5399 aSig0 |= LIT64( 0x0001000000000000 );
5400 shiftCount = 0x402F - aExp;
5401 savedASig = aSig0;
5402 aSig0 >>= shiftCount;
5403 z = aSig0;
5404 if ( aSign ) z = - z;
5405 if ( ( z < 0 ) ^ aSign ) {
5406 invalid:
5407 float_raise(float_flag_invalid, status);
5408 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5410 if ( ( aSig0<<shiftCount ) != savedASig ) {
5411 status->float_exception_flags |= float_flag_inexact;
5413 return z;
5417 /*----------------------------------------------------------------------------
5418 | Returns the result of converting the quadruple-precision floating-point
5419 | value `a' to the 64-bit two's complement integer format. The conversion
5420 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5421 | Arithmetic---which means in particular that the conversion is rounded
5422 | according to the current rounding mode. If `a' is a NaN, the largest
5423 | positive integer is returned. Otherwise, if the conversion overflows, the
5424 | largest integer with the same sign as `a' is returned.
5425 *----------------------------------------------------------------------------*/
5427 int64_t float128_to_int64(float128 a, float_status *status)
5429 flag aSign;
5430 int32_t aExp, shiftCount;
5431 uint64_t aSig0, aSig1;
5433 aSig1 = extractFloat128Frac1( a );
5434 aSig0 = extractFloat128Frac0( a );
5435 aExp = extractFloat128Exp( a );
5436 aSign = extractFloat128Sign( a );
5437 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5438 shiftCount = 0x402F - aExp;
5439 if ( shiftCount <= 0 ) {
5440 if ( 0x403E < aExp ) {
5441 float_raise(float_flag_invalid, status);
5442 if ( ! aSign
5443 || ( ( aExp == 0x7FFF )
5444 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5447 return LIT64( 0x7FFFFFFFFFFFFFFF );
5449 return (int64_t) LIT64( 0x8000000000000000 );
5451 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5453 else {
5454 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5456 return roundAndPackInt64(aSign, aSig0, aSig1, status);
5460 /*----------------------------------------------------------------------------
5461 | Returns the result of converting the quadruple-precision floating-point
5462 | value `a' to the 64-bit two's complement integer format. The conversion
5463 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5464 | Arithmetic, except that the conversion is always rounded toward zero.
5465 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
5466 | the conversion overflows, the largest integer with the same sign as `a' is
5467 | returned.
5468 *----------------------------------------------------------------------------*/
5470 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
5472 flag aSign;
5473 int32_t aExp, shiftCount;
5474 uint64_t aSig0, aSig1;
5475 int64_t z;
5477 aSig1 = extractFloat128Frac1( a );
5478 aSig0 = extractFloat128Frac0( a );
5479 aExp = extractFloat128Exp( a );
5480 aSign = extractFloat128Sign( a );
5481 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5482 shiftCount = aExp - 0x402F;
5483 if ( 0 < shiftCount ) {
5484 if ( 0x403E <= aExp ) {
5485 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5486 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
5487 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5488 if (aSig1) {
5489 status->float_exception_flags |= float_flag_inexact;
5492 else {
5493 float_raise(float_flag_invalid, status);
5494 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5495 return LIT64( 0x7FFFFFFFFFFFFFFF );
5498 return (int64_t) LIT64( 0x8000000000000000 );
5500 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
5501 if ( (uint64_t) ( aSig1<<shiftCount ) ) {
5502 status->float_exception_flags |= float_flag_inexact;
5505 else {
5506 if ( aExp < 0x3FFF ) {
5507 if ( aExp | aSig0 | aSig1 ) {
5508 status->float_exception_flags |= float_flag_inexact;
5510 return 0;
5512 z = aSig0>>( - shiftCount );
5513 if ( aSig1
5514 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
5515 status->float_exception_flags |= float_flag_inexact;
5518 if ( aSign ) z = - z;
5519 return z;
5523 /*----------------------------------------------------------------------------
5524 | Returns the result of converting the quadruple-precision floating-point value
5525 | `a' to the 64-bit unsigned integer format. The conversion is
5526 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5527 | Arithmetic---which means in particular that the conversion is rounded
5528 | according to the current rounding mode. If `a' is a NaN, the largest
5529 | positive integer is returned. If the conversion overflows, the
5530 | largest unsigned integer is returned. If 'a' is negative, the value is
5531 | rounded and zero is returned; negative values that do not round to zero
5532 | will raise the inexact exception.
5533 *----------------------------------------------------------------------------*/
5535 uint64_t float128_to_uint64(float128 a, float_status *status)
5537 flag aSign;
5538 int aExp;
5539 int shiftCount;
5540 uint64_t aSig0, aSig1;
5542 aSig0 = extractFloat128Frac0(a);
5543 aSig1 = extractFloat128Frac1(a);
5544 aExp = extractFloat128Exp(a);
5545 aSign = extractFloat128Sign(a);
5546 if (aSign && (aExp > 0x3FFE)) {
5547 float_raise(float_flag_invalid, status);
5548 if (float128_is_any_nan(a)) {
5549 return LIT64(0xFFFFFFFFFFFFFFFF);
5550 } else {
5551 return 0;
5554 if (aExp) {
5555 aSig0 |= LIT64(0x0001000000000000);
5557 shiftCount = 0x402F - aExp;
5558 if (shiftCount <= 0) {
5559 if (0x403E < aExp) {
5560 float_raise(float_flag_invalid, status);
5561 return LIT64(0xFFFFFFFFFFFFFFFF);
5563 shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
5564 } else {
5565 shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
5567 return roundAndPackUint64(aSign, aSig0, aSig1, status);
5570 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
5572 uint64_t v;
5573 signed char current_rounding_mode = status->float_rounding_mode;
5575 set_float_rounding_mode(float_round_to_zero, status);
5576 v = float128_to_uint64(a, status);
5577 set_float_rounding_mode(current_rounding_mode, status);
5579 return v;
5582 /*----------------------------------------------------------------------------
5583 | Returns the result of converting the quadruple-precision floating-point
5584 | value `a' to the 32-bit unsigned integer format. The conversion
5585 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5586 | Arithmetic except that the conversion is always rounded toward zero.
5587 | If `a' is a NaN, the largest positive integer is returned. Otherwise,
5588 | if the conversion overflows, the largest unsigned integer is returned.
5589 | If 'a' is negative, the value is rounded and zero is returned; negative
5590 | values that do not round to zero will raise the inexact exception.
5591 *----------------------------------------------------------------------------*/
5593 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
5595 uint64_t v;
5596 uint32_t res;
5597 int old_exc_flags = get_float_exception_flags(status);
5599 v = float128_to_uint64_round_to_zero(a, status);
5600 if (v > 0xffffffff) {
5601 res = 0xffffffff;
5602 } else {
5603 return v;
5605 set_float_exception_flags(old_exc_flags, status);
5606 float_raise(float_flag_invalid, status);
5607 return res;
5610 /*----------------------------------------------------------------------------
5611 | Returns the result of converting the quadruple-precision floating-point
5612 | value `a' to the single-precision floating-point format. The conversion
5613 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5614 | Arithmetic.
5615 *----------------------------------------------------------------------------*/
5617 float32 float128_to_float32(float128 a, float_status *status)
5619 flag aSign;
5620 int32_t aExp;
5621 uint64_t aSig0, aSig1;
5622 uint32_t zSig;
5624 aSig1 = extractFloat128Frac1( a );
5625 aSig0 = extractFloat128Frac0( a );
5626 aExp = extractFloat128Exp( a );
5627 aSign = extractFloat128Sign( a );
5628 if ( aExp == 0x7FFF ) {
5629 if ( aSig0 | aSig1 ) {
5630 return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
5632 return packFloat32( aSign, 0xFF, 0 );
5634 aSig0 |= ( aSig1 != 0 );
5635 shift64RightJamming( aSig0, 18, &aSig0 );
5636 zSig = aSig0;
5637 if ( aExp || zSig ) {
5638 zSig |= 0x40000000;
5639 aExp -= 0x3F81;
5641 return roundAndPackFloat32(aSign, aExp, zSig, status);
5645 /*----------------------------------------------------------------------------
5646 | Returns the result of converting the quadruple-precision floating-point
5647 | value `a' to the double-precision floating-point format. The conversion
5648 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5649 | Arithmetic.
5650 *----------------------------------------------------------------------------*/
5652 float64 float128_to_float64(float128 a, float_status *status)
5654 flag aSign;
5655 int32_t aExp;
5656 uint64_t aSig0, aSig1;
5658 aSig1 = extractFloat128Frac1( a );
5659 aSig0 = extractFloat128Frac0( a );
5660 aExp = extractFloat128Exp( a );
5661 aSign = extractFloat128Sign( a );
5662 if ( aExp == 0x7FFF ) {
5663 if ( aSig0 | aSig1 ) {
5664 return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
5666 return packFloat64( aSign, 0x7FF, 0 );
5668 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5669 aSig0 |= ( aSig1 != 0 );
5670 if ( aExp || aSig0 ) {
5671 aSig0 |= LIT64( 0x4000000000000000 );
5672 aExp -= 0x3C01;
5674 return roundAndPackFloat64(aSign, aExp, aSig0, status);
5678 /*----------------------------------------------------------------------------
5679 | Returns the result of converting the quadruple-precision floating-point
5680 | value `a' to the extended double-precision floating-point format. The
5681 | conversion is performed according to the IEC/IEEE Standard for Binary
5682 | Floating-Point Arithmetic.
5683 *----------------------------------------------------------------------------*/
5685 floatx80 float128_to_floatx80(float128 a, float_status *status)
5687 flag aSign;
5688 int32_t aExp;
5689 uint64_t aSig0, aSig1;
5691 aSig1 = extractFloat128Frac1( a );
5692 aSig0 = extractFloat128Frac0( a );
5693 aExp = extractFloat128Exp( a );
5694 aSign = extractFloat128Sign( a );
5695 if ( aExp == 0x7FFF ) {
5696 if ( aSig0 | aSig1 ) {
5697 return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
5699 return packFloatx80(aSign, floatx80_infinity_high,
5700 floatx80_infinity_low);
5702 if ( aExp == 0 ) {
5703 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5704 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5706 else {
5707 aSig0 |= LIT64( 0x0001000000000000 );
5709 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
5710 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
5714 /*----------------------------------------------------------------------------
5715 | Rounds the quadruple-precision floating-point value `a' to an integer, and
5716 | returns the result as a quadruple-precision floating-point value. The
5717 | operation is performed according to the IEC/IEEE Standard for Binary
5718 | Floating-Point Arithmetic.
5719 *----------------------------------------------------------------------------*/
5721 float128 float128_round_to_int(float128 a, float_status *status)
5723 flag aSign;
5724 int32_t aExp;
5725 uint64_t lastBitMask, roundBitsMask;
5726 float128 z;
5728 aExp = extractFloat128Exp( a );
5729 if ( 0x402F <= aExp ) {
5730 if ( 0x406F <= aExp ) {
5731 if ( ( aExp == 0x7FFF )
5732 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
5734 return propagateFloat128NaN(a, a, status);
5736 return a;
5738 lastBitMask = 1;
5739 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
5740 roundBitsMask = lastBitMask - 1;
5741 z = a;
5742 switch (status->float_rounding_mode) {
5743 case float_round_nearest_even:
5744 if ( lastBitMask ) {
5745 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
5746 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
5748 else {
5749 if ( (int64_t) z.low < 0 ) {
5750 ++z.high;
5751 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
5754 break;
5755 case float_round_ties_away:
5756 if (lastBitMask) {
5757 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
5758 } else {
5759 if ((int64_t) z.low < 0) {
5760 ++z.high;
5763 break;
5764 case float_round_to_zero:
5765 break;
5766 case float_round_up:
5767 if (!extractFloat128Sign(z)) {
5768 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
5770 break;
5771 case float_round_down:
5772 if (extractFloat128Sign(z)) {
5773 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
5775 break;
5776 default:
5777 abort();
5779 z.low &= ~ roundBitsMask;
5781 else {
5782 if ( aExp < 0x3FFF ) {
5783 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
5784 status->float_exception_flags |= float_flag_inexact;
5785 aSign = extractFloat128Sign( a );
5786 switch (status->float_rounding_mode) {
5787 case float_round_nearest_even:
5788 if ( ( aExp == 0x3FFE )
5789 && ( extractFloat128Frac0( a )
5790 | extractFloat128Frac1( a ) )
5792 return packFloat128( aSign, 0x3FFF, 0, 0 );
5794 break;
5795 case float_round_ties_away:
5796 if (aExp == 0x3FFE) {
5797 return packFloat128(aSign, 0x3FFF, 0, 0);
5799 break;
5800 case float_round_down:
5801 return
5802 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
5803 : packFloat128( 0, 0, 0, 0 );
5804 case float_round_up:
5805 return
5806 aSign ? packFloat128( 1, 0, 0, 0 )
5807 : packFloat128( 0, 0x3FFF, 0, 0 );
5809 return packFloat128( aSign, 0, 0, 0 );
5811 lastBitMask = 1;
5812 lastBitMask <<= 0x402F - aExp;
5813 roundBitsMask = lastBitMask - 1;
5814 z.low = 0;
5815 z.high = a.high;
5816 switch (status->float_rounding_mode) {
5817 case float_round_nearest_even:
5818 z.high += lastBitMask>>1;
5819 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
5820 z.high &= ~ lastBitMask;
5822 break;
5823 case float_round_ties_away:
5824 z.high += lastBitMask>>1;
5825 break;
5826 case float_round_to_zero:
5827 break;
5828 case float_round_up:
5829 if (!extractFloat128Sign(z)) {
5830 z.high |= ( a.low != 0 );
5831 z.high += roundBitsMask;
5833 break;
5834 case float_round_down:
5835 if (extractFloat128Sign(z)) {
5836 z.high |= (a.low != 0);
5837 z.high += roundBitsMask;
5839 break;
5840 default:
5841 abort();
5843 z.high &= ~ roundBitsMask;
5845 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
5846 status->float_exception_flags |= float_flag_inexact;
5848 return z;
5852 /*----------------------------------------------------------------------------
5853 | Returns the result of adding the absolute values of the quadruple-precision
5854 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
5855 | before being returned. `zSign' is ignored if the result is a NaN.
5856 | The addition is performed according to the IEC/IEEE Standard for Binary
5857 | Floating-Point Arithmetic.
5858 *----------------------------------------------------------------------------*/
5860 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
5861 float_status *status)
5863 int32_t aExp, bExp, zExp;
5864 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5865 int32_t expDiff;
5867 aSig1 = extractFloat128Frac1( a );
5868 aSig0 = extractFloat128Frac0( a );
5869 aExp = extractFloat128Exp( a );
5870 bSig1 = extractFloat128Frac1( b );
5871 bSig0 = extractFloat128Frac0( b );
5872 bExp = extractFloat128Exp( b );
5873 expDiff = aExp - bExp;
5874 if ( 0 < expDiff ) {
5875 if ( aExp == 0x7FFF ) {
5876 if (aSig0 | aSig1) {
5877 return propagateFloat128NaN(a, b, status);
5879 return a;
5881 if ( bExp == 0 ) {
5882 --expDiff;
5884 else {
5885 bSig0 |= LIT64( 0x0001000000000000 );
5887 shift128ExtraRightJamming(
5888 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
5889 zExp = aExp;
5891 else if ( expDiff < 0 ) {
5892 if ( bExp == 0x7FFF ) {
5893 if (bSig0 | bSig1) {
5894 return propagateFloat128NaN(a, b, status);
5896 return packFloat128( zSign, 0x7FFF, 0, 0 );
5898 if ( aExp == 0 ) {
5899 ++expDiff;
5901 else {
5902 aSig0 |= LIT64( 0x0001000000000000 );
5904 shift128ExtraRightJamming(
5905 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
5906 zExp = bExp;
5908 else {
5909 if ( aExp == 0x7FFF ) {
5910 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5911 return propagateFloat128NaN(a, b, status);
5913 return a;
5915 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5916 if ( aExp == 0 ) {
5917 if (status->flush_to_zero) {
5918 if (zSig0 | zSig1) {
5919 float_raise(float_flag_output_denormal, status);
5921 return packFloat128(zSign, 0, 0, 0);
5923 return packFloat128( zSign, 0, zSig0, zSig1 );
5925 zSig2 = 0;
5926 zSig0 |= LIT64( 0x0002000000000000 );
5927 zExp = aExp;
5928 goto shiftRight1;
5930 aSig0 |= LIT64( 0x0001000000000000 );
5931 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5932 --zExp;
5933 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
5934 ++zExp;
5935 shiftRight1:
5936 shift128ExtraRightJamming(
5937 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5938 roundAndPack:
5939 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
5943 /*----------------------------------------------------------------------------
5944 | Returns the result of subtracting the absolute values of the quadruple-
5945 | precision floating-point values `a' and `b'. If `zSign' is 1, the
5946 | difference is negated before being returned. `zSign' is ignored if the
5947 | result is a NaN. The subtraction is performed according to the IEC/IEEE
5948 | Standard for Binary Floating-Point Arithmetic.
5949 *----------------------------------------------------------------------------*/
5951 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
5952 float_status *status)
5954 int32_t aExp, bExp, zExp;
5955 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
5956 int32_t expDiff;
5958 aSig1 = extractFloat128Frac1( a );
5959 aSig0 = extractFloat128Frac0( a );
5960 aExp = extractFloat128Exp( a );
5961 bSig1 = extractFloat128Frac1( b );
5962 bSig0 = extractFloat128Frac0( b );
5963 bExp = extractFloat128Exp( b );
5964 expDiff = aExp - bExp;
5965 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5966 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
5967 if ( 0 < expDiff ) goto aExpBigger;
5968 if ( expDiff < 0 ) goto bExpBigger;
5969 if ( aExp == 0x7FFF ) {
5970 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5971 return propagateFloat128NaN(a, b, status);
5973 float_raise(float_flag_invalid, status);
5974 return float128_default_nan(status);
5976 if ( aExp == 0 ) {
5977 aExp = 1;
5978 bExp = 1;
5980 if ( bSig0 < aSig0 ) goto aBigger;
5981 if ( aSig0 < bSig0 ) goto bBigger;
5982 if ( bSig1 < aSig1 ) goto aBigger;
5983 if ( aSig1 < bSig1 ) goto bBigger;
5984 return packFloat128(status->float_rounding_mode == float_round_down,
5985 0, 0, 0);
5986 bExpBigger:
5987 if ( bExp == 0x7FFF ) {
5988 if (bSig0 | bSig1) {
5989 return propagateFloat128NaN(a, b, status);
5991 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
5993 if ( aExp == 0 ) {
5994 ++expDiff;
5996 else {
5997 aSig0 |= LIT64( 0x4000000000000000 );
5999 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6000 bSig0 |= LIT64( 0x4000000000000000 );
6001 bBigger:
6002 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6003 zExp = bExp;
6004 zSign ^= 1;
6005 goto normalizeRoundAndPack;
6006 aExpBigger:
6007 if ( aExp == 0x7FFF ) {
6008 if (aSig0 | aSig1) {
6009 return propagateFloat128NaN(a, b, status);
6011 return a;
6013 if ( bExp == 0 ) {
6014 --expDiff;
6016 else {
6017 bSig0 |= LIT64( 0x4000000000000000 );
6019 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6020 aSig0 |= LIT64( 0x4000000000000000 );
6021 aBigger:
6022 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6023 zExp = aExp;
6024 normalizeRoundAndPack:
6025 --zExp;
6026 return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
6027 status);
6031 /*----------------------------------------------------------------------------
6032 | Returns the result of adding the quadruple-precision floating-point values
6033 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
6034 | for Binary Floating-Point Arithmetic.
6035 *----------------------------------------------------------------------------*/
6037 float128 float128_add(float128 a, float128 b, float_status *status)
6039 flag aSign, bSign;
6041 aSign = extractFloat128Sign( a );
6042 bSign = extractFloat128Sign( b );
6043 if ( aSign == bSign ) {
6044 return addFloat128Sigs(a, b, aSign, status);
6046 else {
6047 return subFloat128Sigs(a, b, aSign, status);
6052 /*----------------------------------------------------------------------------
6053 | Returns the result of subtracting the quadruple-precision floating-point
6054 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6055 | Standard for Binary Floating-Point Arithmetic.
6056 *----------------------------------------------------------------------------*/
6058 float128 float128_sub(float128 a, float128 b, float_status *status)
6060 flag aSign, bSign;
6062 aSign = extractFloat128Sign( a );
6063 bSign = extractFloat128Sign( b );
6064 if ( aSign == bSign ) {
6065 return subFloat128Sigs(a, b, aSign, status);
6067 else {
6068 return addFloat128Sigs(a, b, aSign, status);
6073 /*----------------------------------------------------------------------------
6074 | Returns the result of multiplying the quadruple-precision floating-point
6075 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6076 | Standard for Binary Floating-Point Arithmetic.
6077 *----------------------------------------------------------------------------*/
6079 float128 float128_mul(float128 a, float128 b, float_status *status)
6081 flag aSign, bSign, zSign;
6082 int32_t aExp, bExp, zExp;
6083 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6085 aSig1 = extractFloat128Frac1( a );
6086 aSig0 = extractFloat128Frac0( a );
6087 aExp = extractFloat128Exp( a );
6088 aSign = extractFloat128Sign( a );
6089 bSig1 = extractFloat128Frac1( b );
6090 bSig0 = extractFloat128Frac0( b );
6091 bExp = extractFloat128Exp( b );
6092 bSign = extractFloat128Sign( b );
6093 zSign = aSign ^ bSign;
6094 if ( aExp == 0x7FFF ) {
6095 if ( ( aSig0 | aSig1 )
6096 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6097 return propagateFloat128NaN(a, b, status);
6099 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6100 return packFloat128( zSign, 0x7FFF, 0, 0 );
6102 if ( bExp == 0x7FFF ) {
6103 if (bSig0 | bSig1) {
6104 return propagateFloat128NaN(a, b, status);
6106 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6107 invalid:
6108 float_raise(float_flag_invalid, status);
6109 return float128_default_nan(status);
6111 return packFloat128( zSign, 0x7FFF, 0, 0 );
6113 if ( aExp == 0 ) {
6114 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6115 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6117 if ( bExp == 0 ) {
6118 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6119 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6121 zExp = aExp + bExp - 0x4000;
6122 aSig0 |= LIT64( 0x0001000000000000 );
6123 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6124 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6125 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6126 zSig2 |= ( zSig3 != 0 );
6127 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6128 shift128ExtraRightJamming(
6129 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6130 ++zExp;
6132 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6136 /*----------------------------------------------------------------------------
6137 | Returns the result of dividing the quadruple-precision floating-point value
6138 | `a' by the corresponding value `b'. The operation is performed according to
6139 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6140 *----------------------------------------------------------------------------*/
6142 float128 float128_div(float128 a, float128 b, float_status *status)
6144 flag aSign, bSign, zSign;
6145 int32_t aExp, bExp, zExp;
6146 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6147 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6149 aSig1 = extractFloat128Frac1( a );
6150 aSig0 = extractFloat128Frac0( a );
6151 aExp = extractFloat128Exp( a );
6152 aSign = extractFloat128Sign( a );
6153 bSig1 = extractFloat128Frac1( b );
6154 bSig0 = extractFloat128Frac0( b );
6155 bExp = extractFloat128Exp( b );
6156 bSign = extractFloat128Sign( b );
6157 zSign = aSign ^ bSign;
6158 if ( aExp == 0x7FFF ) {
6159 if (aSig0 | aSig1) {
6160 return propagateFloat128NaN(a, b, status);
6162 if ( bExp == 0x7FFF ) {
6163 if (bSig0 | bSig1) {
6164 return propagateFloat128NaN(a, b, status);
6166 goto invalid;
6168 return packFloat128( zSign, 0x7FFF, 0, 0 );
6170 if ( bExp == 0x7FFF ) {
6171 if (bSig0 | bSig1) {
6172 return propagateFloat128NaN(a, b, status);
6174 return packFloat128( zSign, 0, 0, 0 );
6176 if ( bExp == 0 ) {
6177 if ( ( bSig0 | bSig1 ) == 0 ) {
6178 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6179 invalid:
6180 float_raise(float_flag_invalid, status);
6181 return float128_default_nan(status);
6183 float_raise(float_flag_divbyzero, status);
6184 return packFloat128( zSign, 0x7FFF, 0, 0 );
6186 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6188 if ( aExp == 0 ) {
6189 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6190 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6192 zExp = aExp - bExp + 0x3FFD;
6193 shortShift128Left(
6194 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6195 shortShift128Left(
6196 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6197 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6198 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6199 ++zExp;
6201 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6202 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6203 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
6204 while ( (int64_t) rem0 < 0 ) {
6205 --zSig0;
6206 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6208 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6209 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6210 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6211 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
6212 while ( (int64_t) rem1 < 0 ) {
6213 --zSig1;
6214 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6216 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6218 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6219 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6223 /*----------------------------------------------------------------------------
6224 | Returns the remainder of the quadruple-precision floating-point value `a'
6225 | with respect to the corresponding value `b'. The operation is performed
6226 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6227 *----------------------------------------------------------------------------*/
6229 float128 float128_rem(float128 a, float128 b, float_status *status)
6231 flag aSign, zSign;
6232 int32_t aExp, bExp, expDiff;
6233 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6234 uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6235 int64_t sigMean0;
6237 aSig1 = extractFloat128Frac1( a );
6238 aSig0 = extractFloat128Frac0( a );
6239 aExp = extractFloat128Exp( a );
6240 aSign = extractFloat128Sign( a );
6241 bSig1 = extractFloat128Frac1( b );
6242 bSig0 = extractFloat128Frac0( b );
6243 bExp = extractFloat128Exp( b );
6244 if ( aExp == 0x7FFF ) {
6245 if ( ( aSig0 | aSig1 )
6246 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6247 return propagateFloat128NaN(a, b, status);
6249 goto invalid;
6251 if ( bExp == 0x7FFF ) {
6252 if (bSig0 | bSig1) {
6253 return propagateFloat128NaN(a, b, status);
6255 return a;
6257 if ( bExp == 0 ) {
6258 if ( ( bSig0 | bSig1 ) == 0 ) {
6259 invalid:
6260 float_raise(float_flag_invalid, status);
6261 return float128_default_nan(status);
6263 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6265 if ( aExp == 0 ) {
6266 if ( ( aSig0 | aSig1 ) == 0 ) return a;
6267 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6269 expDiff = aExp - bExp;
6270 if ( expDiff < -1 ) return a;
6271 shortShift128Left(
6272 aSig0 | LIT64( 0x0001000000000000 ),
6273 aSig1,
6274 15 - ( expDiff < 0 ),
6275 &aSig0,
6276 &aSig1
6278 shortShift128Left(
6279 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6280 q = le128( bSig0, bSig1, aSig0, aSig1 );
6281 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6282 expDiff -= 64;
6283 while ( 0 < expDiff ) {
6284 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6285 q = ( 4 < q ) ? q - 4 : 0;
6286 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6287 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6288 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6289 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6290 expDiff -= 61;
6292 if ( -64 < expDiff ) {
6293 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6294 q = ( 4 < q ) ? q - 4 : 0;
6295 q >>= - expDiff;
6296 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6297 expDiff += 52;
6298 if ( expDiff < 0 ) {
6299 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6301 else {
6302 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6304 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6305 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6307 else {
6308 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6309 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6311 do {
6312 alternateASig0 = aSig0;
6313 alternateASig1 = aSig1;
6314 ++q;
6315 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6316 } while ( 0 <= (int64_t) aSig0 );
6317 add128(
6318 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6319 if ( ( sigMean0 < 0 )
6320 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6321 aSig0 = alternateASig0;
6322 aSig1 = alternateASig1;
6324 zSign = ( (int64_t) aSig0 < 0 );
6325 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6326 return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
6327 status);
6330 /*----------------------------------------------------------------------------
6331 | Returns the square root of the quadruple-precision floating-point value `a'.
6332 | The operation is performed according to the IEC/IEEE Standard for Binary
6333 | Floating-Point Arithmetic.
6334 *----------------------------------------------------------------------------*/
6336 float128 float128_sqrt(float128 a, float_status *status)
6338 flag aSign;
6339 int32_t aExp, zExp;
6340 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6341 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6343 aSig1 = extractFloat128Frac1( a );
6344 aSig0 = extractFloat128Frac0( a );
6345 aExp = extractFloat128Exp( a );
6346 aSign = extractFloat128Sign( a );
6347 if ( aExp == 0x7FFF ) {
6348 if (aSig0 | aSig1) {
6349 return propagateFloat128NaN(a, a, status);
6351 if ( ! aSign ) return a;
6352 goto invalid;
6354 if ( aSign ) {
6355 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6356 invalid:
6357 float_raise(float_flag_invalid, status);
6358 return float128_default_nan(status);
6360 if ( aExp == 0 ) {
6361 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6362 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6364 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6365 aSig0 |= LIT64( 0x0001000000000000 );
6366 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6367 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6368 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6369 doubleZSig0 = zSig0<<1;
6370 mul64To128( zSig0, zSig0, &term0, &term1 );
6371 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6372 while ( (int64_t) rem0 < 0 ) {
6373 --zSig0;
6374 doubleZSig0 -= 2;
6375 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6377 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6378 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6379 if ( zSig1 == 0 ) zSig1 = 1;
6380 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6381 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6382 mul64To128( zSig1, zSig1, &term2, &term3 );
6383 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6384 while ( (int64_t) rem1 < 0 ) {
6385 --zSig1;
6386 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6387 term3 |= 1;
6388 term2 |= doubleZSig0;
6389 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6391 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6393 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6394 return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
6398 /*----------------------------------------------------------------------------
6399 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6400 | the corresponding value `b', and 0 otherwise. The invalid exception is
6401 | raised if either operand is a NaN. Otherwise, the comparison is performed
6402 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6403 *----------------------------------------------------------------------------*/
6405 int float128_eq(float128 a, float128 b, float_status *status)
6408 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6409 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6410 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6411 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6413 float_raise(float_flag_invalid, status);
6414 return 0;
6416 return
6417 ( a.low == b.low )
6418 && ( ( a.high == b.high )
6419 || ( ( a.low == 0 )
6420 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6425 /*----------------------------------------------------------------------------
6426 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6427 | or equal to the corresponding value `b', and 0 otherwise. The invalid
6428 | exception is raised if either operand is a NaN. The comparison is performed
6429 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6430 *----------------------------------------------------------------------------*/
6432 int float128_le(float128 a, float128 b, float_status *status)
6434 flag aSign, bSign;
6436 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6437 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6438 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6439 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6441 float_raise(float_flag_invalid, status);
6442 return 0;
6444 aSign = extractFloat128Sign( a );
6445 bSign = extractFloat128Sign( b );
6446 if ( aSign != bSign ) {
6447 return
6448 aSign
6449 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6450 == 0 );
6452 return
6453 aSign ? le128( b.high, b.low, a.high, a.low )
6454 : le128( a.high, a.low, b.high, b.low );
6458 /*----------------------------------------------------------------------------
6459 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6460 | the corresponding value `b', and 0 otherwise. The invalid exception is
6461 | raised if either operand is a NaN. The comparison is performed according
6462 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6463 *----------------------------------------------------------------------------*/
6465 int float128_lt(float128 a, float128 b, float_status *status)
6467 flag aSign, bSign;
6469 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6470 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6471 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6472 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6474 float_raise(float_flag_invalid, status);
6475 return 0;
6477 aSign = extractFloat128Sign( a );
6478 bSign = extractFloat128Sign( b );
6479 if ( aSign != bSign ) {
6480 return
6481 aSign
6482 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6483 != 0 );
6485 return
6486 aSign ? lt128( b.high, b.low, a.high, a.low )
6487 : lt128( a.high, a.low, b.high, b.low );
6491 /*----------------------------------------------------------------------------
6492 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6493 | be compared, and 0 otherwise. The invalid exception is raised if either
6494 | operand is a NaN. The comparison is performed according to the IEC/IEEE
6495 | Standard for Binary Floating-Point Arithmetic.
6496 *----------------------------------------------------------------------------*/
6498 int float128_unordered(float128 a, float128 b, float_status *status)
6500 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6501 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6502 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6503 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6505 float_raise(float_flag_invalid, status);
6506 return 1;
6508 return 0;
6511 /*----------------------------------------------------------------------------
6512 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6513 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6514 | exception. The comparison is performed according to the IEC/IEEE Standard
6515 | for Binary Floating-Point Arithmetic.
6516 *----------------------------------------------------------------------------*/
6518 int float128_eq_quiet(float128 a, float128 b, float_status *status)
6521 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6522 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6523 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6524 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6526 if (float128_is_signaling_nan(a, status)
6527 || float128_is_signaling_nan(b, status)) {
6528 float_raise(float_flag_invalid, status);
6530 return 0;
6532 return
6533 ( a.low == b.low )
6534 && ( ( a.high == b.high )
6535 || ( ( a.low == 0 )
6536 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6541 /*----------------------------------------------------------------------------
6542 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6543 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
6544 | cause an exception. Otherwise, the comparison is performed according to the
6545 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6546 *----------------------------------------------------------------------------*/
6548 int float128_le_quiet(float128 a, float128 b, float_status *status)
6550 flag aSign, bSign;
6552 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6553 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6554 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6555 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6557 if (float128_is_signaling_nan(a, status)
6558 || float128_is_signaling_nan(b, status)) {
6559 float_raise(float_flag_invalid, status);
6561 return 0;
6563 aSign = extractFloat128Sign( a );
6564 bSign = extractFloat128Sign( b );
6565 if ( aSign != bSign ) {
6566 return
6567 aSign
6568 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6569 == 0 );
6571 return
6572 aSign ? le128( b.high, b.low, a.high, a.low )
6573 : le128( a.high, a.low, b.high, b.low );
6577 /*----------------------------------------------------------------------------
6578 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6579 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6580 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
6581 | Standard for Binary Floating-Point Arithmetic.
6582 *----------------------------------------------------------------------------*/
6584 int float128_lt_quiet(float128 a, float128 b, float_status *status)
6586 flag aSign, bSign;
6588 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6589 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6590 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6591 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6593 if (float128_is_signaling_nan(a, status)
6594 || float128_is_signaling_nan(b, status)) {
6595 float_raise(float_flag_invalid, status);
6597 return 0;
6599 aSign = extractFloat128Sign( a );
6600 bSign = extractFloat128Sign( b );
6601 if ( aSign != bSign ) {
6602 return
6603 aSign
6604 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6605 != 0 );
6607 return
6608 aSign ? lt128( b.high, b.low, a.high, a.low )
6609 : lt128( a.high, a.low, b.high, b.low );
6613 /*----------------------------------------------------------------------------
6614 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6615 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
6616 | comparison is performed according to the IEC/IEEE Standard for Binary
6617 | Floating-Point Arithmetic.
6618 *----------------------------------------------------------------------------*/
6620 int float128_unordered_quiet(float128 a, float128 b, float_status *status)
6622 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6623 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6624 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6625 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6627 if (float128_is_signaling_nan(a, status)
6628 || float128_is_signaling_nan(b, status)) {
6629 float_raise(float_flag_invalid, status);
6631 return 1;
6633 return 0;
6636 static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
6637 int is_quiet, float_status *status)
6639 flag aSign, bSign;
6641 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6642 float_raise(float_flag_invalid, status);
6643 return float_relation_unordered;
6645 if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
6646 ( extractFloatx80Frac( a )<<1 ) ) ||
6647 ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
6648 ( extractFloatx80Frac( b )<<1 ) )) {
6649 if (!is_quiet ||
6650 floatx80_is_signaling_nan(a, status) ||
6651 floatx80_is_signaling_nan(b, status)) {
6652 float_raise(float_flag_invalid, status);
6654 return float_relation_unordered;
6656 aSign = extractFloatx80Sign( a );
6657 bSign = extractFloatx80Sign( b );
6658 if ( aSign != bSign ) {
6660 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
6661 ( ( a.low | b.low ) == 0 ) ) {
6662 /* zero case */
6663 return float_relation_equal;
6664 } else {
6665 return 1 - (2 * aSign);
6667 } else {
6668 if (a.low == b.low && a.high == b.high) {
6669 return float_relation_equal;
6670 } else {
6671 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6676 int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
6678 return floatx80_compare_internal(a, b, 0, status);
6681 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
6683 return floatx80_compare_internal(a, b, 1, status);
6686 static inline int float128_compare_internal(float128 a, float128 b,
6687 int is_quiet, float_status *status)
6689 flag aSign, bSign;
6691 if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
6692 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
6693 ( ( extractFloat128Exp( b ) == 0x7fff ) &&
6694 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
6695 if (!is_quiet ||
6696 float128_is_signaling_nan(a, status) ||
6697 float128_is_signaling_nan(b, status)) {
6698 float_raise(float_flag_invalid, status);
6700 return float_relation_unordered;
6702 aSign = extractFloat128Sign( a );
6703 bSign = extractFloat128Sign( b );
6704 if ( aSign != bSign ) {
6705 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
6706 /* zero case */
6707 return float_relation_equal;
6708 } else {
6709 return 1 - (2 * aSign);
6711 } else {
6712 if (a.low == b.low && a.high == b.high) {
6713 return float_relation_equal;
6714 } else {
6715 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6720 int float128_compare(float128 a, float128 b, float_status *status)
6722 return float128_compare_internal(a, b, 0, status);
6725 int float128_compare_quiet(float128 a, float128 b, float_status *status)
6727 return float128_compare_internal(a, b, 1, status);
6730 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
6732 flag aSign;
6733 int32_t aExp;
6734 uint64_t aSig;
6736 if (floatx80_invalid_encoding(a)) {
6737 float_raise(float_flag_invalid, status);
6738 return floatx80_default_nan(status);
6740 aSig = extractFloatx80Frac( a );
6741 aExp = extractFloatx80Exp( a );
6742 aSign = extractFloatx80Sign( a );
6744 if ( aExp == 0x7FFF ) {
6745 if ( aSig<<1 ) {
6746 return propagateFloatx80NaN(a, a, status);
6748 return a;
6751 if (aExp == 0) {
6752 if (aSig == 0) {
6753 return a;
6755 aExp++;
6758 if (n > 0x10000) {
6759 n = 0x10000;
6760 } else if (n < -0x10000) {
6761 n = -0x10000;
6764 aExp += n;
6765 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
6766 aSign, aExp, aSig, 0, status);
6769 float128 float128_scalbn(float128 a, int n, float_status *status)
6771 flag aSign;
6772 int32_t aExp;
6773 uint64_t aSig0, aSig1;
6775 aSig1 = extractFloat128Frac1( a );
6776 aSig0 = extractFloat128Frac0( a );
6777 aExp = extractFloat128Exp( a );
6778 aSign = extractFloat128Sign( a );
6779 if ( aExp == 0x7FFF ) {
6780 if ( aSig0 | aSig1 ) {
6781 return propagateFloat128NaN(a, a, status);
6783 return a;
6785 if (aExp != 0) {
6786 aSig0 |= LIT64( 0x0001000000000000 );
6787 } else if (aSig0 == 0 && aSig1 == 0) {
6788 return a;
6789 } else {
6790 aExp++;
6793 if (n > 0x10000) {
6794 n = 0x10000;
6795 } else if (n < -0x10000) {
6796 n = -0x10000;
6799 aExp += n - 1;
6800 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
6801 , status);