target/arm: Rewrite helper_sve_st[1234]*_r
[qemu/ar7.git] / fpu / softfloat.c
blob46ae2061724406c93a05364e77f4d311e19606e7
1 /*
2 * QEMU float support
4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
10 * the BSD license
11 * GPL-v2-or-later
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
44 ===============================================================================
47 /* BSD licensing:
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
85 #include "qemu/osdep.h"
86 #include "qemu/bitops.h"
87 #include "fpu/softfloat.h"
89 /* We only need stdlib for abort() */
91 /*----------------------------------------------------------------------------
92 | Primitive arithmetic functions, including multi-word arithmetic, and
93 | division and square root approximations. (Can be specialized to target if
94 | desired.)
95 *----------------------------------------------------------------------------*/
96 #include "fpu/softfloat-macros.h"
98 /*----------------------------------------------------------------------------
99 | Returns the fraction bits of the half-precision floating-point value `a'.
100 *----------------------------------------------------------------------------*/
102 static inline uint32_t extractFloat16Frac(float16 a)
104 return float16_val(a) & 0x3ff;
107 /*----------------------------------------------------------------------------
108 | Returns the exponent bits of the half-precision floating-point value `a'.
109 *----------------------------------------------------------------------------*/
111 static inline int extractFloat16Exp(float16 a)
113 return (float16_val(a) >> 10) & 0x1f;
116 /*----------------------------------------------------------------------------
117 | Returns the fraction bits of the single-precision floating-point value `a'.
118 *----------------------------------------------------------------------------*/
120 static inline uint32_t extractFloat32Frac(float32 a)
122 return float32_val(a) & 0x007FFFFF;
125 /*----------------------------------------------------------------------------
126 | Returns the exponent bits of the single-precision floating-point value `a'.
127 *----------------------------------------------------------------------------*/
129 static inline int extractFloat32Exp(float32 a)
131 return (float32_val(a) >> 23) & 0xFF;
134 /*----------------------------------------------------------------------------
135 | Returns the sign bit of the single-precision floating-point value `a'.
136 *----------------------------------------------------------------------------*/
138 static inline flag extractFloat32Sign(float32 a)
140 return float32_val(a) >> 31;
143 /*----------------------------------------------------------------------------
144 | Returns the fraction bits of the double-precision floating-point value `a'.
145 *----------------------------------------------------------------------------*/
147 static inline uint64_t extractFloat64Frac(float64 a)
149 return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF);
152 /*----------------------------------------------------------------------------
153 | Returns the exponent bits of the double-precision floating-point value `a'.
154 *----------------------------------------------------------------------------*/
156 static inline int extractFloat64Exp(float64 a)
158 return (float64_val(a) >> 52) & 0x7FF;
161 /*----------------------------------------------------------------------------
162 | Returns the sign bit of the double-precision floating-point value `a'.
163 *----------------------------------------------------------------------------*/
165 static inline flag extractFloat64Sign(float64 a)
167 return float64_val(a) >> 63;
171 * Classify a floating point number. Everything above float_class_qnan
172 * is a NaN so cls >= float_class_qnan is any NaN.
175 typedef enum __attribute__ ((__packed__)) {
176 float_class_unclassified,
177 float_class_zero,
178 float_class_normal,
179 float_class_inf,
180 float_class_qnan, /* all NaNs from here */
181 float_class_snan,
182 } FloatClass;
184 /* Simple helpers for checking if, or what kind of, NaN we have */
185 static inline __attribute__((unused)) bool is_nan(FloatClass c)
187 return unlikely(c >= float_class_qnan);
190 static inline __attribute__((unused)) bool is_snan(FloatClass c)
192 return c == float_class_snan;
195 static inline __attribute__((unused)) bool is_qnan(FloatClass c)
197 return c == float_class_qnan;
201 * Structure holding all of the decomposed parts of a float. The
202 * exponent is unbiased and the fraction is normalized. All
203 * calculations are done with a 64 bit fraction and then rounded as
204 * appropriate for the final format.
206 * Thanks to the packed FloatClass a decent compiler should be able to
207 * fit the whole structure into registers and avoid using the stack
208 * for parameter passing.
211 typedef struct {
212 uint64_t frac;
213 int32_t exp;
214 FloatClass cls;
215 bool sign;
216 } FloatParts;
218 #define DECOMPOSED_BINARY_POINT (64 - 2)
219 #define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT)
220 #define DECOMPOSED_OVERFLOW_BIT (DECOMPOSED_IMPLICIT_BIT << 1)
222 /* Structure holding all of the relevant parameters for a format.
223 * exp_size: the size of the exponent field
224 * exp_bias: the offset applied to the exponent field
225 * exp_max: the maximum normalised exponent
226 * frac_size: the size of the fraction field
227 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
228 * The following are computed based the size of fraction
229 * frac_lsb: least significant bit of fraction
230 * frac_lsbm1: the bit below the least significant bit (for rounding)
231 * round_mask/roundeven_mask: masks used for rounding
232 * The following optional modifiers are available:
233 * arm_althp: handle ARM Alternative Half Precision
235 typedef struct {
236 int exp_size;
237 int exp_bias;
238 int exp_max;
239 int frac_size;
240 int frac_shift;
241 uint64_t frac_lsb;
242 uint64_t frac_lsbm1;
243 uint64_t round_mask;
244 uint64_t roundeven_mask;
245 bool arm_althp;
246 } FloatFmt;
248 /* Expand fields based on the size of exponent and fraction */
249 #define FLOAT_PARAMS(E, F) \
250 .exp_size = E, \
251 .exp_bias = ((1 << E) - 1) >> 1, \
252 .exp_max = (1 << E) - 1, \
253 .frac_size = F, \
254 .frac_shift = DECOMPOSED_BINARY_POINT - F, \
255 .frac_lsb = 1ull << (DECOMPOSED_BINARY_POINT - F), \
256 .frac_lsbm1 = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1), \
257 .round_mask = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1, \
258 .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1
260 static const FloatFmt float16_params = {
261 FLOAT_PARAMS(5, 10)
264 static const FloatFmt float16_params_ahp = {
265 FLOAT_PARAMS(5, 10),
266 .arm_althp = true
269 static const FloatFmt float32_params = {
270 FLOAT_PARAMS(8, 23)
273 static const FloatFmt float64_params = {
274 FLOAT_PARAMS(11, 52)
277 /* Unpack a float to parts, but do not canonicalize. */
278 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
280 const int sign_pos = fmt.frac_size + fmt.exp_size;
282 return (FloatParts) {
283 .cls = float_class_unclassified,
284 .sign = extract64(raw, sign_pos, 1),
285 .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
286 .frac = extract64(raw, 0, fmt.frac_size),
290 static inline FloatParts float16_unpack_raw(float16 f)
292 return unpack_raw(float16_params, f);
295 static inline FloatParts float32_unpack_raw(float32 f)
297 return unpack_raw(float32_params, f);
300 static inline FloatParts float64_unpack_raw(float64 f)
302 return unpack_raw(float64_params, f);
305 /* Pack a float from parts, but do not canonicalize. */
306 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
308 const int sign_pos = fmt.frac_size + fmt.exp_size;
309 uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
310 return deposit64(ret, sign_pos, 1, p.sign);
313 static inline float16 float16_pack_raw(FloatParts p)
315 return make_float16(pack_raw(float16_params, p));
318 static inline float32 float32_pack_raw(FloatParts p)
320 return make_float32(pack_raw(float32_params, p));
323 static inline float64 float64_pack_raw(FloatParts p)
325 return make_float64(pack_raw(float64_params, p));
328 /*----------------------------------------------------------------------------
329 | Functions and definitions to determine: (1) whether tininess for underflow
330 | is detected before or after rounding by default, (2) what (if anything)
331 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
332 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
333 | are propagated from function inputs to output. These details are target-
334 | specific.
335 *----------------------------------------------------------------------------*/
336 #include "softfloat-specialize.h"
338 /* Canonicalize EXP and FRAC, setting CLS. */
339 static FloatParts canonicalize(FloatParts part, const FloatFmt *parm,
340 float_status *status)
342 if (part.exp == parm->exp_max && !parm->arm_althp) {
343 if (part.frac == 0) {
344 part.cls = float_class_inf;
345 } else {
346 part.frac <<= parm->frac_shift;
347 part.cls = (parts_is_snan_frac(part.frac, status)
348 ? float_class_snan : float_class_qnan);
350 } else if (part.exp == 0) {
351 if (likely(part.frac == 0)) {
352 part.cls = float_class_zero;
353 } else if (status->flush_inputs_to_zero) {
354 float_raise(float_flag_input_denormal, status);
355 part.cls = float_class_zero;
356 part.frac = 0;
357 } else {
358 int shift = clz64(part.frac) - 1;
359 part.cls = float_class_normal;
360 part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
361 part.frac <<= shift;
363 } else {
364 part.cls = float_class_normal;
365 part.exp -= parm->exp_bias;
366 part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
368 return part;
371 /* Round and uncanonicalize a floating-point number by parts. There
372 * are FRAC_SHIFT bits that may require rounding at the bottom of the
373 * fraction; these bits will be removed. The exponent will be biased
374 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
377 static FloatParts round_canonical(FloatParts p, float_status *s,
378 const FloatFmt *parm)
380 const uint64_t frac_lsbm1 = parm->frac_lsbm1;
381 const uint64_t round_mask = parm->round_mask;
382 const uint64_t roundeven_mask = parm->roundeven_mask;
383 const int exp_max = parm->exp_max;
384 const int frac_shift = parm->frac_shift;
385 uint64_t frac, inc;
386 int exp, flags = 0;
387 bool overflow_norm;
389 frac = p.frac;
390 exp = p.exp;
392 switch (p.cls) {
393 case float_class_normal:
394 switch (s->float_rounding_mode) {
395 case float_round_nearest_even:
396 overflow_norm = false;
397 inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
398 break;
399 case float_round_ties_away:
400 overflow_norm = false;
401 inc = frac_lsbm1;
402 break;
403 case float_round_to_zero:
404 overflow_norm = true;
405 inc = 0;
406 break;
407 case float_round_up:
408 inc = p.sign ? 0 : round_mask;
409 overflow_norm = p.sign;
410 break;
411 case float_round_down:
412 inc = p.sign ? round_mask : 0;
413 overflow_norm = !p.sign;
414 break;
415 default:
416 g_assert_not_reached();
419 exp += parm->exp_bias;
420 if (likely(exp > 0)) {
421 if (frac & round_mask) {
422 flags |= float_flag_inexact;
423 frac += inc;
424 if (frac & DECOMPOSED_OVERFLOW_BIT) {
425 frac >>= 1;
426 exp++;
429 frac >>= frac_shift;
431 if (parm->arm_althp) {
432 /* ARM Alt HP eschews Inf and NaN for a wider exponent. */
433 if (unlikely(exp > exp_max)) {
434 /* Overflow. Return the maximum normal. */
435 flags = float_flag_invalid;
436 exp = exp_max;
437 frac = -1;
439 } else if (unlikely(exp >= exp_max)) {
440 flags |= float_flag_overflow | float_flag_inexact;
441 if (overflow_norm) {
442 exp = exp_max - 1;
443 frac = -1;
444 } else {
445 p.cls = float_class_inf;
446 goto do_inf;
449 } else if (s->flush_to_zero) {
450 flags |= float_flag_output_denormal;
451 p.cls = float_class_zero;
452 goto do_zero;
453 } else {
454 bool is_tiny = (s->float_detect_tininess
455 == float_tininess_before_rounding)
456 || (exp < 0)
457 || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);
459 shift64RightJamming(frac, 1 - exp, &frac);
460 if (frac & round_mask) {
461 /* Need to recompute round-to-even. */
462 if (s->float_rounding_mode == float_round_nearest_even) {
463 inc = ((frac & roundeven_mask) != frac_lsbm1
464 ? frac_lsbm1 : 0);
466 flags |= float_flag_inexact;
467 frac += inc;
470 exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
471 frac >>= frac_shift;
473 if (is_tiny && (flags & float_flag_inexact)) {
474 flags |= float_flag_underflow;
476 if (exp == 0 && frac == 0) {
477 p.cls = float_class_zero;
480 break;
482 case float_class_zero:
483 do_zero:
484 exp = 0;
485 frac = 0;
486 break;
488 case float_class_inf:
489 do_inf:
490 assert(!parm->arm_althp);
491 exp = exp_max;
492 frac = 0;
493 break;
495 case float_class_qnan:
496 case float_class_snan:
497 assert(!parm->arm_althp);
498 exp = exp_max;
499 frac >>= parm->frac_shift;
500 break;
502 default:
503 g_assert_not_reached();
506 float_raise(flags, s);
507 p.exp = exp;
508 p.frac = frac;
509 return p;
512 /* Explicit FloatFmt version */
513 static FloatParts float16a_unpack_canonical(float16 f, float_status *s,
514 const FloatFmt *params)
516 return canonicalize(float16_unpack_raw(f), params, s);
519 static FloatParts float16_unpack_canonical(float16 f, float_status *s)
521 return float16a_unpack_canonical(f, s, &float16_params);
524 static float16 float16a_round_pack_canonical(FloatParts p, float_status *s,
525 const FloatFmt *params)
527 return float16_pack_raw(round_canonical(p, s, params));
530 static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
532 return float16a_round_pack_canonical(p, s, &float16_params);
535 static FloatParts float32_unpack_canonical(float32 f, float_status *s)
537 return canonicalize(float32_unpack_raw(f), &float32_params, s);
540 static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
542 return float32_pack_raw(round_canonical(p, s, &float32_params));
545 static FloatParts float64_unpack_canonical(float64 f, float_status *s)
547 return canonicalize(float64_unpack_raw(f), &float64_params, s);
550 static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
552 return float64_pack_raw(round_canonical(p, s, &float64_params));
555 static FloatParts return_nan(FloatParts a, float_status *s)
557 switch (a.cls) {
558 case float_class_snan:
559 s->float_exception_flags |= float_flag_invalid;
560 a = parts_silence_nan(a, s);
561 /* fall through */
562 case float_class_qnan:
563 if (s->default_nan_mode) {
564 return parts_default_nan(s);
566 break;
568 default:
569 g_assert_not_reached();
571 return a;
574 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
576 if (is_snan(a.cls) || is_snan(b.cls)) {
577 s->float_exception_flags |= float_flag_invalid;
580 if (s->default_nan_mode) {
581 return parts_default_nan(s);
582 } else {
583 if (pickNaN(a.cls, b.cls,
584 a.frac > b.frac ||
585 (a.frac == b.frac && a.sign < b.sign))) {
586 a = b;
588 if (is_snan(a.cls)) {
589 return parts_silence_nan(a, s);
592 return a;
595 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
596 bool inf_zero, float_status *s)
598 int which;
600 if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
601 s->float_exception_flags |= float_flag_invalid;
604 which = pickNaNMulAdd(a.cls, b.cls, c.cls, inf_zero, s);
606 if (s->default_nan_mode) {
607 /* Note that this check is after pickNaNMulAdd so that function
608 * has an opportunity to set the Invalid flag.
610 which = 3;
613 switch (which) {
614 case 0:
615 break;
616 case 1:
617 a = b;
618 break;
619 case 2:
620 a = c;
621 break;
622 case 3:
623 return parts_default_nan(s);
624 default:
625 g_assert_not_reached();
628 if (is_snan(a.cls)) {
629 return parts_silence_nan(a, s);
631 return a;
635 * Returns the result of adding or subtracting the values of the
636 * floating-point values `a' and `b'. The operation is performed
637 * according to the IEC/IEEE Standard for Binary Floating-Point
638 * Arithmetic.
641 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
642 float_status *s)
644 bool a_sign = a.sign;
645 bool b_sign = b.sign ^ subtract;
647 if (a_sign != b_sign) {
648 /* Subtraction */
650 if (a.cls == float_class_normal && b.cls == float_class_normal) {
651 if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
652 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
653 a.frac = a.frac - b.frac;
654 } else {
655 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
656 a.frac = b.frac - a.frac;
657 a.exp = b.exp;
658 a_sign ^= 1;
661 if (a.frac == 0) {
662 a.cls = float_class_zero;
663 a.sign = s->float_rounding_mode == float_round_down;
664 } else {
665 int shift = clz64(a.frac) - 1;
666 a.frac = a.frac << shift;
667 a.exp = a.exp - shift;
668 a.sign = a_sign;
670 return a;
672 if (is_nan(a.cls) || is_nan(b.cls)) {
673 return pick_nan(a, b, s);
675 if (a.cls == float_class_inf) {
676 if (b.cls == float_class_inf) {
677 float_raise(float_flag_invalid, s);
678 return parts_default_nan(s);
680 return a;
682 if (a.cls == float_class_zero && b.cls == float_class_zero) {
683 a.sign = s->float_rounding_mode == float_round_down;
684 return a;
686 if (a.cls == float_class_zero || b.cls == float_class_inf) {
687 b.sign = a_sign ^ 1;
688 return b;
690 if (b.cls == float_class_zero) {
691 return a;
693 } else {
694 /* Addition */
695 if (a.cls == float_class_normal && b.cls == float_class_normal) {
696 if (a.exp > b.exp) {
697 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
698 } else if (a.exp < b.exp) {
699 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
700 a.exp = b.exp;
702 a.frac += b.frac;
703 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
704 shift64RightJamming(a.frac, 1, &a.frac);
705 a.exp += 1;
707 return a;
709 if (is_nan(a.cls) || is_nan(b.cls)) {
710 return pick_nan(a, b, s);
712 if (a.cls == float_class_inf || b.cls == float_class_zero) {
713 return a;
715 if (b.cls == float_class_inf || a.cls == float_class_zero) {
716 b.sign = b_sign;
717 return b;
720 g_assert_not_reached();
724 * Returns the result of adding or subtracting the floating-point
725 * values `a' and `b'. The operation is performed according to the
726 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
729 float16 __attribute__((flatten)) float16_add(float16 a, float16 b,
730 float_status *status)
732 FloatParts pa = float16_unpack_canonical(a, status);
733 FloatParts pb = float16_unpack_canonical(b, status);
734 FloatParts pr = addsub_floats(pa, pb, false, status);
736 return float16_round_pack_canonical(pr, status);
739 float32 __attribute__((flatten)) float32_add(float32 a, float32 b,
740 float_status *status)
742 FloatParts pa = float32_unpack_canonical(a, status);
743 FloatParts pb = float32_unpack_canonical(b, status);
744 FloatParts pr = addsub_floats(pa, pb, false, status);
746 return float32_round_pack_canonical(pr, status);
749 float64 __attribute__((flatten)) float64_add(float64 a, float64 b,
750 float_status *status)
752 FloatParts pa = float64_unpack_canonical(a, status);
753 FloatParts pb = float64_unpack_canonical(b, status);
754 FloatParts pr = addsub_floats(pa, pb, false, status);
756 return float64_round_pack_canonical(pr, status);
759 float16 __attribute__((flatten)) float16_sub(float16 a, float16 b,
760 float_status *status)
762 FloatParts pa = float16_unpack_canonical(a, status);
763 FloatParts pb = float16_unpack_canonical(b, status);
764 FloatParts pr = addsub_floats(pa, pb, true, status);
766 return float16_round_pack_canonical(pr, status);
769 float32 __attribute__((flatten)) float32_sub(float32 a, float32 b,
770 float_status *status)
772 FloatParts pa = float32_unpack_canonical(a, status);
773 FloatParts pb = float32_unpack_canonical(b, status);
774 FloatParts pr = addsub_floats(pa, pb, true, status);
776 return float32_round_pack_canonical(pr, status);
779 float64 __attribute__((flatten)) float64_sub(float64 a, float64 b,
780 float_status *status)
782 FloatParts pa = float64_unpack_canonical(a, status);
783 FloatParts pb = float64_unpack_canonical(b, status);
784 FloatParts pr = addsub_floats(pa, pb, true, status);
786 return float64_round_pack_canonical(pr, status);
790 * Returns the result of multiplying the floating-point values `a' and
791 * `b'. The operation is performed according to the IEC/IEEE Standard
792 * for Binary Floating-Point Arithmetic.
795 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
797 bool sign = a.sign ^ b.sign;
799 if (a.cls == float_class_normal && b.cls == float_class_normal) {
800 uint64_t hi, lo;
801 int exp = a.exp + b.exp;
803 mul64To128(a.frac, b.frac, &hi, &lo);
804 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
805 if (lo & DECOMPOSED_OVERFLOW_BIT) {
806 shift64RightJamming(lo, 1, &lo);
807 exp += 1;
810 /* Re-use a */
811 a.exp = exp;
812 a.sign = sign;
813 a.frac = lo;
814 return a;
816 /* handle all the NaN cases */
817 if (is_nan(a.cls) || is_nan(b.cls)) {
818 return pick_nan(a, b, s);
820 /* Inf * Zero == NaN */
821 if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
822 (a.cls == float_class_zero && b.cls == float_class_inf)) {
823 s->float_exception_flags |= float_flag_invalid;
824 return parts_default_nan(s);
826 /* Multiply by 0 or Inf */
827 if (a.cls == float_class_inf || a.cls == float_class_zero) {
828 a.sign = sign;
829 return a;
831 if (b.cls == float_class_inf || b.cls == float_class_zero) {
832 b.sign = sign;
833 return b;
835 g_assert_not_reached();
838 float16 __attribute__((flatten)) float16_mul(float16 a, float16 b,
839 float_status *status)
841 FloatParts pa = float16_unpack_canonical(a, status);
842 FloatParts pb = float16_unpack_canonical(b, status);
843 FloatParts pr = mul_floats(pa, pb, status);
845 return float16_round_pack_canonical(pr, status);
848 float32 __attribute__((flatten)) float32_mul(float32 a, float32 b,
849 float_status *status)
851 FloatParts pa = float32_unpack_canonical(a, status);
852 FloatParts pb = float32_unpack_canonical(b, status);
853 FloatParts pr = mul_floats(pa, pb, status);
855 return float32_round_pack_canonical(pr, status);
858 float64 __attribute__((flatten)) float64_mul(float64 a, float64 b,
859 float_status *status)
861 FloatParts pa = float64_unpack_canonical(a, status);
862 FloatParts pb = float64_unpack_canonical(b, status);
863 FloatParts pr = mul_floats(pa, pb, status);
865 return float64_round_pack_canonical(pr, status);
869 * Returns the result of multiplying the floating-point values `a' and
870 * `b' then adding 'c', with no intermediate rounding step after the
871 * multiplication. The operation is performed according to the
872 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
873 * The flags argument allows the caller to select negation of the
874 * addend, the intermediate product, or the final result. (The
875 * difference between this and having the caller do a separate
876 * negation is that negating externally will flip the sign bit on
877 * NaNs.)
880 static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
881 int flags, float_status *s)
883 bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
884 ((1 << float_class_inf) | (1 << float_class_zero));
885 bool p_sign;
886 bool sign_flip = flags & float_muladd_negate_result;
887 FloatClass p_class;
888 uint64_t hi, lo;
889 int p_exp;
891 /* It is implementation-defined whether the cases of (0,inf,qnan)
892 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
893 * they return if they do), so we have to hand this information
894 * off to the target-specific pick-a-NaN routine.
896 if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
897 return pick_nan_muladd(a, b, c, inf_zero, s);
900 if (inf_zero) {
901 s->float_exception_flags |= float_flag_invalid;
902 return parts_default_nan(s);
905 if (flags & float_muladd_negate_c) {
906 c.sign ^= 1;
909 p_sign = a.sign ^ b.sign;
911 if (flags & float_muladd_negate_product) {
912 p_sign ^= 1;
915 if (a.cls == float_class_inf || b.cls == float_class_inf) {
916 p_class = float_class_inf;
917 } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
918 p_class = float_class_zero;
919 } else {
920 p_class = float_class_normal;
923 if (c.cls == float_class_inf) {
924 if (p_class == float_class_inf && p_sign != c.sign) {
925 s->float_exception_flags |= float_flag_invalid;
926 return parts_default_nan(s);
927 } else {
928 a.cls = float_class_inf;
929 a.sign = c.sign ^ sign_flip;
930 return a;
934 if (p_class == float_class_inf) {
935 a.cls = float_class_inf;
936 a.sign = p_sign ^ sign_flip;
937 return a;
940 if (p_class == float_class_zero) {
941 if (c.cls == float_class_zero) {
942 if (p_sign != c.sign) {
943 p_sign = s->float_rounding_mode == float_round_down;
945 c.sign = p_sign;
946 } else if (flags & float_muladd_halve_result) {
947 c.exp -= 1;
949 c.sign ^= sign_flip;
950 return c;
953 /* a & b should be normals now... */
954 assert(a.cls == float_class_normal &&
955 b.cls == float_class_normal);
957 p_exp = a.exp + b.exp;
959 /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
960 * result.
962 mul64To128(a.frac, b.frac, &hi, &lo);
963 /* binary point now at bit 124 */
965 /* check for overflow */
966 if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
967 shift128RightJamming(hi, lo, 1, &hi, &lo);
968 p_exp += 1;
971 /* + add/sub */
972 if (c.cls == float_class_zero) {
973 /* move binary point back to 62 */
974 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
975 } else {
976 int exp_diff = p_exp - c.exp;
977 if (p_sign == c.sign) {
978 /* Addition */
979 if (exp_diff <= 0) {
980 shift128RightJamming(hi, lo,
981 DECOMPOSED_BINARY_POINT - exp_diff,
982 &hi, &lo);
983 lo += c.frac;
984 p_exp = c.exp;
985 } else {
986 uint64_t c_hi, c_lo;
987 /* shift c to the same binary point as the product (124) */
988 c_hi = c.frac >> 2;
989 c_lo = 0;
990 shift128RightJamming(c_hi, c_lo,
991 exp_diff,
992 &c_hi, &c_lo);
993 add128(hi, lo, c_hi, c_lo, &hi, &lo);
994 /* move binary point back to 62 */
995 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
998 if (lo & DECOMPOSED_OVERFLOW_BIT) {
999 shift64RightJamming(lo, 1, &lo);
1000 p_exp += 1;
1003 } else {
1004 /* Subtraction */
1005 uint64_t c_hi, c_lo;
1006 /* make C binary point match product at bit 124 */
1007 c_hi = c.frac >> 2;
1008 c_lo = 0;
1010 if (exp_diff <= 0) {
1011 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
1012 if (exp_diff == 0
1014 (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
1015 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1016 } else {
1017 sub128(c_hi, c_lo, hi, lo, &hi, &lo);
1018 p_sign ^= 1;
1019 p_exp = c.exp;
1021 } else {
1022 shift128RightJamming(c_hi, c_lo,
1023 exp_diff,
1024 &c_hi, &c_lo);
1025 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1028 if (hi == 0 && lo == 0) {
1029 a.cls = float_class_zero;
1030 a.sign = s->float_rounding_mode == float_round_down;
1031 a.sign ^= sign_flip;
1032 return a;
1033 } else {
1034 int shift;
1035 if (hi != 0) {
1036 shift = clz64(hi);
1037 } else {
1038 shift = clz64(lo) + 64;
1040 /* Normalizing to a binary point of 124 is the
1041 correct adjust for the exponent. However since we're
1042 shifting, we might as well put the binary point back
1043 at 62 where we really want it. Therefore shift as
1044 if we're leaving 1 bit at the top of the word, but
1045 adjust the exponent as if we're leaving 3 bits. */
1046 shift -= 1;
1047 if (shift >= 64) {
1048 lo = lo << (shift - 64);
1049 } else {
1050 hi = (hi << shift) | (lo >> (64 - shift));
1051 lo = hi | ((lo << shift) != 0);
1053 p_exp -= shift - 2;
1058 if (flags & float_muladd_halve_result) {
1059 p_exp -= 1;
1062 /* finally prepare our result */
1063 a.cls = float_class_normal;
1064 a.sign = p_sign ^ sign_flip;
1065 a.exp = p_exp;
1066 a.frac = lo;
1068 return a;
1071 float16 __attribute__((flatten)) float16_muladd(float16 a, float16 b, float16 c,
1072 int flags, float_status *status)
1074 FloatParts pa = float16_unpack_canonical(a, status);
1075 FloatParts pb = float16_unpack_canonical(b, status);
1076 FloatParts pc = float16_unpack_canonical(c, status);
1077 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1079 return float16_round_pack_canonical(pr, status);
1082 float32 __attribute__((flatten)) float32_muladd(float32 a, float32 b, float32 c,
1083 int flags, float_status *status)
1085 FloatParts pa = float32_unpack_canonical(a, status);
1086 FloatParts pb = float32_unpack_canonical(b, status);
1087 FloatParts pc = float32_unpack_canonical(c, status);
1088 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1090 return float32_round_pack_canonical(pr, status);
1093 float64 __attribute__((flatten)) float64_muladd(float64 a, float64 b, float64 c,
1094 int flags, float_status *status)
1096 FloatParts pa = float64_unpack_canonical(a, status);
1097 FloatParts pb = float64_unpack_canonical(b, status);
1098 FloatParts pc = float64_unpack_canonical(c, status);
1099 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1101 return float64_round_pack_canonical(pr, status);
1105 * Returns the result of dividing the floating-point value `a' by the
1106 * corresponding value `b'. The operation is performed according to
1107 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1110 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
1112 bool sign = a.sign ^ b.sign;
1114 if (a.cls == float_class_normal && b.cls == float_class_normal) {
1115 uint64_t n0, n1, q, r;
1116 int exp = a.exp - b.exp;
1119 * We want a 2*N / N-bit division to produce exactly an N-bit
1120 * result, so that we do not lose any precision and so that we
1121 * do not have to renormalize afterward. If A.frac < B.frac,
1122 * then division would produce an (N-1)-bit result; shift A left
1123 * by one to produce the an N-bit result, and decrement the
1124 * exponent to match.
1126 * The udiv_qrnnd algorithm that we're using requires normalization,
1127 * i.e. the msb of the denominator must be set. Since we know that
1128 * DECOMPOSED_BINARY_POINT is msb-1, the inputs must be shifted left
1129 * by one (more), and the remainder must be shifted right by one.
1131 if (a.frac < b.frac) {
1132 exp -= 1;
1133 shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 2, &n1, &n0);
1134 } else {
1135 shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, &n1, &n0);
1137 q = udiv_qrnnd(&r, n1, n0, b.frac << 1);
1140 * Set lsb if there is a remainder, to set inexact.
1141 * As mentioned above, to find the actual value of the remainder we
1142 * would need to shift right, but (1) we are only concerned about
1143 * non-zero-ness, and (2) the remainder will always be even because
1144 * both inputs to the division primitive are even.
1146 a.frac = q | (r != 0);
1147 a.sign = sign;
1148 a.exp = exp;
1149 return a;
1151 /* handle all the NaN cases */
1152 if (is_nan(a.cls) || is_nan(b.cls)) {
1153 return pick_nan(a, b, s);
1155 /* 0/0 or Inf/Inf */
1156 if (a.cls == b.cls
1158 (a.cls == float_class_inf || a.cls == float_class_zero)) {
1159 s->float_exception_flags |= float_flag_invalid;
1160 return parts_default_nan(s);
1162 /* Inf / x or 0 / x */
1163 if (a.cls == float_class_inf || a.cls == float_class_zero) {
1164 a.sign = sign;
1165 return a;
1167 /* Div 0 => Inf */
1168 if (b.cls == float_class_zero) {
1169 s->float_exception_flags |= float_flag_divbyzero;
1170 a.cls = float_class_inf;
1171 a.sign = sign;
1172 return a;
1174 /* Div by Inf */
1175 if (b.cls == float_class_inf) {
1176 a.cls = float_class_zero;
1177 a.sign = sign;
1178 return a;
1180 g_assert_not_reached();
1183 float16 float16_div(float16 a, float16 b, float_status *status)
1185 FloatParts pa = float16_unpack_canonical(a, status);
1186 FloatParts pb = float16_unpack_canonical(b, status);
1187 FloatParts pr = div_floats(pa, pb, status);
1189 return float16_round_pack_canonical(pr, status);
1192 float32 float32_div(float32 a, float32 b, float_status *status)
1194 FloatParts pa = float32_unpack_canonical(a, status);
1195 FloatParts pb = float32_unpack_canonical(b, status);
1196 FloatParts pr = div_floats(pa, pb, status);
1198 return float32_round_pack_canonical(pr, status);
1201 float64 float64_div(float64 a, float64 b, float_status *status)
1203 FloatParts pa = float64_unpack_canonical(a, status);
1204 FloatParts pb = float64_unpack_canonical(b, status);
1205 FloatParts pr = div_floats(pa, pb, status);
1207 return float64_round_pack_canonical(pr, status);
1211 * Float to Float conversions
1213 * Returns the result of converting one float format to another. The
1214 * conversion is performed according to the IEC/IEEE Standard for
1215 * Binary Floating-Point Arithmetic.
1217 * The float_to_float helper only needs to take care of raising
1218 * invalid exceptions and handling the conversion on NaNs.
1221 static FloatParts float_to_float(FloatParts a, const FloatFmt *dstf,
1222 float_status *s)
1224 if (dstf->arm_althp) {
1225 switch (a.cls) {
1226 case float_class_qnan:
1227 case float_class_snan:
1228 /* There is no NaN in the destination format. Raise Invalid
1229 * and return a zero with the sign of the input NaN.
1231 s->float_exception_flags |= float_flag_invalid;
1232 a.cls = float_class_zero;
1233 a.frac = 0;
1234 a.exp = 0;
1235 break;
1237 case float_class_inf:
1238 /* There is no Inf in the destination format. Raise Invalid
1239 * and return the maximum normal with the correct sign.
1241 s->float_exception_flags |= float_flag_invalid;
1242 a.cls = float_class_normal;
1243 a.exp = dstf->exp_max;
1244 a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift;
1245 break;
1247 default:
1248 break;
1250 } else if (is_nan(a.cls)) {
1251 if (is_snan(a.cls)) {
1252 s->float_exception_flags |= float_flag_invalid;
1253 a = parts_silence_nan(a, s);
1255 if (s->default_nan_mode) {
1256 return parts_default_nan(s);
1259 return a;
1262 float32 float16_to_float32(float16 a, bool ieee, float_status *s)
1264 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1265 FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1266 FloatParts pr = float_to_float(p, &float32_params, s);
1267 return float32_round_pack_canonical(pr, s);
1270 float64 float16_to_float64(float16 a, bool ieee, float_status *s)
1272 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1273 FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1274 FloatParts pr = float_to_float(p, &float64_params, s);
1275 return float64_round_pack_canonical(pr, s);
1278 float16 float32_to_float16(float32 a, bool ieee, float_status *s)
1280 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1281 FloatParts p = float32_unpack_canonical(a, s);
1282 FloatParts pr = float_to_float(p, fmt16, s);
1283 return float16a_round_pack_canonical(pr, s, fmt16);
1286 float64 float32_to_float64(float32 a, float_status *s)
1288 FloatParts p = float32_unpack_canonical(a, s);
1289 FloatParts pr = float_to_float(p, &float64_params, s);
1290 return float64_round_pack_canonical(pr, s);
1293 float16 float64_to_float16(float64 a, bool ieee, float_status *s)
1295 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1296 FloatParts p = float64_unpack_canonical(a, s);
1297 FloatParts pr = float_to_float(p, fmt16, s);
1298 return float16a_round_pack_canonical(pr, s, fmt16);
1301 float32 float64_to_float32(float64 a, float_status *s)
1303 FloatParts p = float64_unpack_canonical(a, s);
1304 FloatParts pr = float_to_float(p, &float32_params, s);
1305 return float32_round_pack_canonical(pr, s);
1309 * Rounds the floating-point value `a' to an integer, and returns the
1310 * result as a floating-point value. The operation is performed
1311 * according to the IEC/IEEE Standard for Binary Floating-Point
1312 * Arithmetic.
1315 static FloatParts round_to_int(FloatParts a, int rmode,
1316 int scale, float_status *s)
1318 switch (a.cls) {
1319 case float_class_qnan:
1320 case float_class_snan:
1321 return return_nan(a, s);
1323 case float_class_zero:
1324 case float_class_inf:
1325 /* already "integral" */
1326 break;
1328 case float_class_normal:
1329 scale = MIN(MAX(scale, -0x10000), 0x10000);
1330 a.exp += scale;
1332 if (a.exp >= DECOMPOSED_BINARY_POINT) {
1333 /* already integral */
1334 break;
1336 if (a.exp < 0) {
1337 bool one;
1338 /* all fractional */
1339 s->float_exception_flags |= float_flag_inexact;
1340 switch (rmode) {
1341 case float_round_nearest_even:
1342 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
1343 break;
1344 case float_round_ties_away:
1345 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
1346 break;
1347 case float_round_to_zero:
1348 one = false;
1349 break;
1350 case float_round_up:
1351 one = !a.sign;
1352 break;
1353 case float_round_down:
1354 one = a.sign;
1355 break;
1356 default:
1357 g_assert_not_reached();
1360 if (one) {
1361 a.frac = DECOMPOSED_IMPLICIT_BIT;
1362 a.exp = 0;
1363 } else {
1364 a.cls = float_class_zero;
1366 } else {
1367 uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
1368 uint64_t frac_lsbm1 = frac_lsb >> 1;
1369 uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
1370 uint64_t rnd_mask = rnd_even_mask >> 1;
1371 uint64_t inc;
1373 switch (rmode) {
1374 case float_round_nearest_even:
1375 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
1376 break;
1377 case float_round_ties_away:
1378 inc = frac_lsbm1;
1379 break;
1380 case float_round_to_zero:
1381 inc = 0;
1382 break;
1383 case float_round_up:
1384 inc = a.sign ? 0 : rnd_mask;
1385 break;
1386 case float_round_down:
1387 inc = a.sign ? rnd_mask : 0;
1388 break;
1389 default:
1390 g_assert_not_reached();
1393 if (a.frac & rnd_mask) {
1394 s->float_exception_flags |= float_flag_inexact;
1395 a.frac += inc;
1396 a.frac &= ~rnd_mask;
1397 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
1398 a.frac >>= 1;
1399 a.exp++;
1403 break;
1404 default:
1405 g_assert_not_reached();
1407 return a;
1410 float16 float16_round_to_int(float16 a, float_status *s)
1412 FloatParts pa = float16_unpack_canonical(a, s);
1413 FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
1414 return float16_round_pack_canonical(pr, s);
1417 float32 float32_round_to_int(float32 a, float_status *s)
1419 FloatParts pa = float32_unpack_canonical(a, s);
1420 FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
1421 return float32_round_pack_canonical(pr, s);
1424 float64 float64_round_to_int(float64 a, float_status *s)
1426 FloatParts pa = float64_unpack_canonical(a, s);
1427 FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
1428 return float64_round_pack_canonical(pr, s);
1432 * Returns the result of converting the floating-point value `a' to
1433 * the two's complement integer format. The conversion is performed
1434 * according to the IEC/IEEE Standard for Binary Floating-Point
1435 * Arithmetic---which means in particular that the conversion is
1436 * rounded according to the current rounding mode. If `a' is a NaN,
1437 * the largest positive integer is returned. Otherwise, if the
1438 * conversion overflows, the largest integer with the same sign as `a'
1439 * is returned.
1442 static int64_t round_to_int_and_pack(FloatParts in, int rmode, int scale,
1443 int64_t min, int64_t max,
1444 float_status *s)
1446 uint64_t r;
1447 int orig_flags = get_float_exception_flags(s);
1448 FloatParts p = round_to_int(in, rmode, scale, s);
1450 switch (p.cls) {
1451 case float_class_snan:
1452 case float_class_qnan:
1453 s->float_exception_flags = orig_flags | float_flag_invalid;
1454 return max;
1455 case float_class_inf:
1456 s->float_exception_flags = orig_flags | float_flag_invalid;
1457 return p.sign ? min : max;
1458 case float_class_zero:
1459 return 0;
1460 case float_class_normal:
1461 if (p.exp < DECOMPOSED_BINARY_POINT) {
1462 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1463 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1464 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1465 } else {
1466 r = UINT64_MAX;
1468 if (p.sign) {
1469 if (r <= -(uint64_t) min) {
1470 return -r;
1471 } else {
1472 s->float_exception_flags = orig_flags | float_flag_invalid;
1473 return min;
1475 } else {
1476 if (r <= max) {
1477 return r;
1478 } else {
1479 s->float_exception_flags = orig_flags | float_flag_invalid;
1480 return max;
1483 default:
1484 g_assert_not_reached();
1488 int16_t float16_to_int16_scalbn(float16 a, int rmode, int scale,
1489 float_status *s)
1491 return round_to_int_and_pack(float16_unpack_canonical(a, s),
1492 rmode, scale, INT16_MIN, INT16_MAX, s);
1495 int32_t float16_to_int32_scalbn(float16 a, int rmode, int scale,
1496 float_status *s)
1498 return round_to_int_and_pack(float16_unpack_canonical(a, s),
1499 rmode, scale, INT32_MIN, INT32_MAX, s);
1502 int64_t float16_to_int64_scalbn(float16 a, int rmode, int scale,
1503 float_status *s)
1505 return round_to_int_and_pack(float16_unpack_canonical(a, s),
1506 rmode, scale, INT64_MIN, INT64_MAX, s);
1509 int16_t float32_to_int16_scalbn(float32 a, int rmode, int scale,
1510 float_status *s)
1512 return round_to_int_and_pack(float32_unpack_canonical(a, s),
1513 rmode, scale, INT16_MIN, INT16_MAX, s);
1516 int32_t float32_to_int32_scalbn(float32 a, int rmode, int scale,
1517 float_status *s)
1519 return round_to_int_and_pack(float32_unpack_canonical(a, s),
1520 rmode, scale, INT32_MIN, INT32_MAX, s);
1523 int64_t float32_to_int64_scalbn(float32 a, int rmode, int scale,
1524 float_status *s)
1526 return round_to_int_and_pack(float32_unpack_canonical(a, s),
1527 rmode, scale, INT64_MIN, INT64_MAX, s);
1530 int16_t float64_to_int16_scalbn(float64 a, int rmode, int scale,
1531 float_status *s)
1533 return round_to_int_and_pack(float64_unpack_canonical(a, s),
1534 rmode, scale, INT16_MIN, INT16_MAX, s);
1537 int32_t float64_to_int32_scalbn(float64 a, int rmode, int scale,
1538 float_status *s)
1540 return round_to_int_and_pack(float64_unpack_canonical(a, s),
1541 rmode, scale, INT32_MIN, INT32_MAX, s);
1544 int64_t float64_to_int64_scalbn(float64 a, int rmode, int scale,
1545 float_status *s)
1547 return round_to_int_and_pack(float64_unpack_canonical(a, s),
1548 rmode, scale, INT64_MIN, INT64_MAX, s);
1551 int16_t float16_to_int16(float16 a, float_status *s)
1553 return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
1556 int32_t float16_to_int32(float16 a, float_status *s)
1558 return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
1561 int64_t float16_to_int64(float16 a, float_status *s)
1563 return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
1566 int16_t float32_to_int16(float32 a, float_status *s)
1568 return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
1571 int32_t float32_to_int32(float32 a, float_status *s)
1573 return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
1576 int64_t float32_to_int64(float32 a, float_status *s)
1578 return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
1581 int16_t float64_to_int16(float64 a, float_status *s)
1583 return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
1586 int32_t float64_to_int32(float64 a, float_status *s)
1588 return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
1591 int64_t float64_to_int64(float64 a, float_status *s)
1593 return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
1596 int16_t float16_to_int16_round_to_zero(float16 a, float_status *s)
1598 return float16_to_int16_scalbn(a, float_round_to_zero, 0, s);
1601 int32_t float16_to_int32_round_to_zero(float16 a, float_status *s)
1603 return float16_to_int32_scalbn(a, float_round_to_zero, 0, s);
1606 int64_t float16_to_int64_round_to_zero(float16 a, float_status *s)
1608 return float16_to_int64_scalbn(a, float_round_to_zero, 0, s);
1611 int16_t float32_to_int16_round_to_zero(float32 a, float_status *s)
1613 return float32_to_int16_scalbn(a, float_round_to_zero, 0, s);
1616 int32_t float32_to_int32_round_to_zero(float32 a, float_status *s)
1618 return float32_to_int32_scalbn(a, float_round_to_zero, 0, s);
1621 int64_t float32_to_int64_round_to_zero(float32 a, float_status *s)
1623 return float32_to_int64_scalbn(a, float_round_to_zero, 0, s);
1626 int16_t float64_to_int16_round_to_zero(float64 a, float_status *s)
1628 return float64_to_int16_scalbn(a, float_round_to_zero, 0, s);
1631 int32_t float64_to_int32_round_to_zero(float64 a, float_status *s)
1633 return float64_to_int32_scalbn(a, float_round_to_zero, 0, s);
1636 int64_t float64_to_int64_round_to_zero(float64 a, float_status *s)
1638 return float64_to_int64_scalbn(a, float_round_to_zero, 0, s);
1642 * Returns the result of converting the floating-point value `a' to
1643 * the unsigned integer format. The conversion is performed according
1644 * to the IEC/IEEE Standard for Binary Floating-Point
1645 * Arithmetic---which means in particular that the conversion is
1646 * rounded according to the current rounding mode. If `a' is a NaN,
1647 * the largest unsigned integer is returned. Otherwise, if the
1648 * conversion overflows, the largest unsigned integer is returned. If
1649 * the 'a' is negative, the result is rounded and zero is returned;
1650 * values that do not round to zero will raise the inexact exception
1651 * flag.
1654 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, int scale,
1655 uint64_t max, float_status *s)
1657 int orig_flags = get_float_exception_flags(s);
1658 FloatParts p = round_to_int(in, rmode, scale, s);
1659 uint64_t r;
1661 switch (p.cls) {
1662 case float_class_snan:
1663 case float_class_qnan:
1664 s->float_exception_flags = orig_flags | float_flag_invalid;
1665 return max;
1666 case float_class_inf:
1667 s->float_exception_flags = orig_flags | float_flag_invalid;
1668 return p.sign ? 0 : max;
1669 case float_class_zero:
1670 return 0;
1671 case float_class_normal:
1672 if (p.sign) {
1673 s->float_exception_flags = orig_flags | float_flag_invalid;
1674 return 0;
1677 if (p.exp < DECOMPOSED_BINARY_POINT) {
1678 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1679 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1680 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1681 } else {
1682 s->float_exception_flags = orig_flags | float_flag_invalid;
1683 return max;
1686 /* For uint64 this will never trip, but if p.exp is too large
1687 * to shift a decomposed fraction we shall have exited via the
1688 * 3rd leg above.
1690 if (r > max) {
1691 s->float_exception_flags = orig_flags | float_flag_invalid;
1692 return max;
1694 return r;
1695 default:
1696 g_assert_not_reached();
1700 uint16_t float16_to_uint16_scalbn(float16 a, int rmode, int scale,
1701 float_status *s)
1703 return round_to_uint_and_pack(float16_unpack_canonical(a, s),
1704 rmode, scale, UINT16_MAX, s);
1707 uint32_t float16_to_uint32_scalbn(float16 a, int rmode, int scale,
1708 float_status *s)
1710 return round_to_uint_and_pack(float16_unpack_canonical(a, s),
1711 rmode, scale, UINT32_MAX, s);
1714 uint64_t float16_to_uint64_scalbn(float16 a, int rmode, int scale,
1715 float_status *s)
1717 return round_to_uint_and_pack(float16_unpack_canonical(a, s),
1718 rmode, scale, UINT64_MAX, s);
1721 uint16_t float32_to_uint16_scalbn(float32 a, int rmode, int scale,
1722 float_status *s)
1724 return round_to_uint_and_pack(float32_unpack_canonical(a, s),
1725 rmode, scale, UINT16_MAX, s);
1728 uint32_t float32_to_uint32_scalbn(float32 a, int rmode, int scale,
1729 float_status *s)
1731 return round_to_uint_and_pack(float32_unpack_canonical(a, s),
1732 rmode, scale, UINT32_MAX, s);
1735 uint64_t float32_to_uint64_scalbn(float32 a, int rmode, int scale,
1736 float_status *s)
1738 return round_to_uint_and_pack(float32_unpack_canonical(a, s),
1739 rmode, scale, UINT64_MAX, s);
1742 uint16_t float64_to_uint16_scalbn(float64 a, int rmode, int scale,
1743 float_status *s)
1745 return round_to_uint_and_pack(float64_unpack_canonical(a, s),
1746 rmode, scale, UINT16_MAX, s);
1749 uint32_t float64_to_uint32_scalbn(float64 a, int rmode, int scale,
1750 float_status *s)
1752 return round_to_uint_and_pack(float64_unpack_canonical(a, s),
1753 rmode, scale, UINT32_MAX, s);
1756 uint64_t float64_to_uint64_scalbn(float64 a, int rmode, int scale,
1757 float_status *s)
1759 return round_to_uint_and_pack(float64_unpack_canonical(a, s),
1760 rmode, scale, UINT64_MAX, s);
1763 uint16_t float16_to_uint16(float16 a, float_status *s)
1765 return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
1768 uint32_t float16_to_uint32(float16 a, float_status *s)
1770 return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
1773 uint64_t float16_to_uint64(float16 a, float_status *s)
1775 return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
1778 uint16_t float32_to_uint16(float32 a, float_status *s)
1780 return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
1783 uint32_t float32_to_uint32(float32 a, float_status *s)
1785 return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
1788 uint64_t float32_to_uint64(float32 a, float_status *s)
1790 return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
1793 uint16_t float64_to_uint16(float64 a, float_status *s)
1795 return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
1798 uint32_t float64_to_uint32(float64 a, float_status *s)
1800 return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
1803 uint64_t float64_to_uint64(float64 a, float_status *s)
1805 return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
1808 uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s)
1810 return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
1813 uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s)
1815 return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
1818 uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s)
1820 return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
1823 uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s)
1825 return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s);
1828 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s)
1830 return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s);
1833 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s)
1835 return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s);
1838 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s)
1840 return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s);
1843 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s)
1845 return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s);
1848 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s)
1850 return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s);
1854 * Integer to float conversions
1856 * Returns the result of converting the two's complement integer `a'
1857 * to the floating-point format. The conversion is performed according
1858 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1861 static FloatParts int_to_float(int64_t a, int scale, float_status *status)
1863 FloatParts r = { .sign = false };
1865 if (a == 0) {
1866 r.cls = float_class_zero;
1867 } else {
1868 uint64_t f = a;
1869 int shift;
1871 r.cls = float_class_normal;
1872 if (a < 0) {
1873 f = -f;
1874 r.sign = true;
1876 shift = clz64(f) - 1;
1877 scale = MIN(MAX(scale, -0x10000), 0x10000);
1879 r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
1880 r.frac = (shift < 0 ? DECOMPOSED_IMPLICIT_BIT : f << shift);
1883 return r;
1886 float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status)
1888 FloatParts pa = int_to_float(a, scale, status);
1889 return float16_round_pack_canonical(pa, status);
1892 float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status)
1894 return int64_to_float16_scalbn(a, scale, status);
1897 float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status)
1899 return int64_to_float16_scalbn(a, scale, status);
1902 float16 int64_to_float16(int64_t a, float_status *status)
1904 return int64_to_float16_scalbn(a, 0, status);
1907 float16 int32_to_float16(int32_t a, float_status *status)
1909 return int64_to_float16_scalbn(a, 0, status);
1912 float16 int16_to_float16(int16_t a, float_status *status)
1914 return int64_to_float16_scalbn(a, 0, status);
1917 float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status)
1919 FloatParts pa = int_to_float(a, scale, status);
1920 return float32_round_pack_canonical(pa, status);
1923 float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status)
1925 return int64_to_float32_scalbn(a, scale, status);
1928 float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status)
1930 return int64_to_float32_scalbn(a, scale, status);
1933 float32 int64_to_float32(int64_t a, float_status *status)
1935 return int64_to_float32_scalbn(a, 0, status);
1938 float32 int32_to_float32(int32_t a, float_status *status)
1940 return int64_to_float32_scalbn(a, 0, status);
1943 float32 int16_to_float32(int16_t a, float_status *status)
1945 return int64_to_float32_scalbn(a, 0, status);
1948 float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status)
1950 FloatParts pa = int_to_float(a, scale, status);
1951 return float64_round_pack_canonical(pa, status);
1954 float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status)
1956 return int64_to_float64_scalbn(a, scale, status);
1959 float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status)
1961 return int64_to_float64_scalbn(a, scale, status);
1964 float64 int64_to_float64(int64_t a, float_status *status)
1966 return int64_to_float64_scalbn(a, 0, status);
1969 float64 int32_to_float64(int32_t a, float_status *status)
1971 return int64_to_float64_scalbn(a, 0, status);
1974 float64 int16_to_float64(int16_t a, float_status *status)
1976 return int64_to_float64_scalbn(a, 0, status);
1981 * Unsigned Integer to float conversions
1983 * Returns the result of converting the unsigned integer `a' to the
1984 * floating-point format. The conversion is performed according to the
1985 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1988 static FloatParts uint_to_float(uint64_t a, int scale, float_status *status)
1990 FloatParts r = { .sign = false };
1992 if (a == 0) {
1993 r.cls = float_class_zero;
1994 } else {
1995 scale = MIN(MAX(scale, -0x10000), 0x10000);
1996 r.cls = float_class_normal;
1997 if ((int64_t)a < 0) {
1998 r.exp = DECOMPOSED_BINARY_POINT + 1 + scale;
1999 shift64RightJamming(a, 1, &a);
2000 r.frac = a;
2001 } else {
2002 int shift = clz64(a) - 1;
2003 r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2004 r.frac = a << shift;
2008 return r;
2011 float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status)
2013 FloatParts pa = uint_to_float(a, scale, status);
2014 return float16_round_pack_canonical(pa, status);
2017 float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status)
2019 return uint64_to_float16_scalbn(a, scale, status);
2022 float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status)
2024 return uint64_to_float16_scalbn(a, scale, status);
2027 float16 uint64_to_float16(uint64_t a, float_status *status)
2029 return uint64_to_float16_scalbn(a, 0, status);
2032 float16 uint32_to_float16(uint32_t a, float_status *status)
2034 return uint64_to_float16_scalbn(a, 0, status);
2037 float16 uint16_to_float16(uint16_t a, float_status *status)
2039 return uint64_to_float16_scalbn(a, 0, status);
2042 float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status)
2044 FloatParts pa = uint_to_float(a, scale, status);
2045 return float32_round_pack_canonical(pa, status);
2048 float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status)
2050 return uint64_to_float32_scalbn(a, scale, status);
2053 float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status)
2055 return uint64_to_float32_scalbn(a, scale, status);
2058 float32 uint64_to_float32(uint64_t a, float_status *status)
2060 return uint64_to_float32_scalbn(a, 0, status);
2063 float32 uint32_to_float32(uint32_t a, float_status *status)
2065 return uint64_to_float32_scalbn(a, 0, status);
2068 float32 uint16_to_float32(uint16_t a, float_status *status)
2070 return uint64_to_float32_scalbn(a, 0, status);
2073 float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status)
2075 FloatParts pa = uint_to_float(a, scale, status);
2076 return float64_round_pack_canonical(pa, status);
2079 float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status)
2081 return uint64_to_float64_scalbn(a, scale, status);
2084 float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status)
2086 return uint64_to_float64_scalbn(a, scale, status);
2089 float64 uint64_to_float64(uint64_t a, float_status *status)
2091 return uint64_to_float64_scalbn(a, 0, status);
2094 float64 uint32_to_float64(uint32_t a, float_status *status)
2096 return uint64_to_float64_scalbn(a, 0, status);
2099 float64 uint16_to_float64(uint16_t a, float_status *status)
2101 return uint64_to_float64_scalbn(a, 0, status);
2104 /* Float Min/Max */
2105 /* min() and max() functions. These can't be implemented as
2106 * 'compare and pick one input' because that would mishandle
2107 * NaNs and +0 vs -0.
2109 * minnum() and maxnum() functions. These are similar to the min()
2110 * and max() functions but if one of the arguments is a QNaN and
2111 * the other is numerical then the numerical argument is returned.
2112 * SNaNs will get quietened before being returned.
2113 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
2114 * and maxNum() operations. min() and max() are the typical min/max
2115 * semantics provided by many CPUs which predate that specification.
2117 * minnummag() and maxnummag() functions correspond to minNumMag()
2118 * and minNumMag() from the IEEE-754 2008.
2120 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin,
2121 bool ieee, bool ismag, float_status *s)
2123 if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
2124 if (ieee) {
2125 /* Takes two floating-point values `a' and `b', one of
2126 * which is a NaN, and returns the appropriate NaN
2127 * result. If either `a' or `b' is a signaling NaN,
2128 * the invalid exception is raised.
2130 if (is_snan(a.cls) || is_snan(b.cls)) {
2131 return pick_nan(a, b, s);
2132 } else if (is_nan(a.cls) && !is_nan(b.cls)) {
2133 return b;
2134 } else if (is_nan(b.cls) && !is_nan(a.cls)) {
2135 return a;
2138 return pick_nan(a, b, s);
2139 } else {
2140 int a_exp, b_exp;
2142 switch (a.cls) {
2143 case float_class_normal:
2144 a_exp = a.exp;
2145 break;
2146 case float_class_inf:
2147 a_exp = INT_MAX;
2148 break;
2149 case float_class_zero:
2150 a_exp = INT_MIN;
2151 break;
2152 default:
2153 g_assert_not_reached();
2154 break;
2156 switch (b.cls) {
2157 case float_class_normal:
2158 b_exp = b.exp;
2159 break;
2160 case float_class_inf:
2161 b_exp = INT_MAX;
2162 break;
2163 case float_class_zero:
2164 b_exp = INT_MIN;
2165 break;
2166 default:
2167 g_assert_not_reached();
2168 break;
2171 if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
2172 bool a_less = a_exp < b_exp;
2173 if (a_exp == b_exp) {
2174 a_less = a.frac < b.frac;
2176 return a_less ^ ismin ? b : a;
2179 if (a.sign == b.sign) {
2180 bool a_less = a_exp < b_exp;
2181 if (a_exp == b_exp) {
2182 a_less = a.frac < b.frac;
2184 return a.sign ^ a_less ^ ismin ? b : a;
2185 } else {
2186 return a.sign ^ ismin ? b : a;
2191 #define MINMAX(sz, name, ismin, isiee, ismag) \
2192 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \
2193 float_status *s) \
2195 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
2196 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
2197 FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \
2199 return float ## sz ## _round_pack_canonical(pr, s); \
2202 MINMAX(16, min, true, false, false)
2203 MINMAX(16, minnum, true, true, false)
2204 MINMAX(16, minnummag, true, true, true)
2205 MINMAX(16, max, false, false, false)
2206 MINMAX(16, maxnum, false, true, false)
2207 MINMAX(16, maxnummag, false, true, true)
2209 MINMAX(32, min, true, false, false)
2210 MINMAX(32, minnum, true, true, false)
2211 MINMAX(32, minnummag, true, true, true)
2212 MINMAX(32, max, false, false, false)
2213 MINMAX(32, maxnum, false, true, false)
2214 MINMAX(32, maxnummag, false, true, true)
2216 MINMAX(64, min, true, false, false)
2217 MINMAX(64, minnum, true, true, false)
2218 MINMAX(64, minnummag, true, true, true)
2219 MINMAX(64, max, false, false, false)
2220 MINMAX(64, maxnum, false, true, false)
2221 MINMAX(64, maxnummag, false, true, true)
2223 #undef MINMAX
2225 /* Floating point compare */
2226 static int compare_floats(FloatParts a, FloatParts b, bool is_quiet,
2227 float_status *s)
2229 if (is_nan(a.cls) || is_nan(b.cls)) {
2230 if (!is_quiet ||
2231 a.cls == float_class_snan ||
2232 b.cls == float_class_snan) {
2233 s->float_exception_flags |= float_flag_invalid;
2235 return float_relation_unordered;
2238 if (a.cls == float_class_zero) {
2239 if (b.cls == float_class_zero) {
2240 return float_relation_equal;
2242 return b.sign ? float_relation_greater : float_relation_less;
2243 } else if (b.cls == float_class_zero) {
2244 return a.sign ? float_relation_less : float_relation_greater;
2247 /* The only really important thing about infinity is its sign. If
2248 * both are infinities the sign marks the smallest of the two.
2250 if (a.cls == float_class_inf) {
2251 if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
2252 return float_relation_equal;
2254 return a.sign ? float_relation_less : float_relation_greater;
2255 } else if (b.cls == float_class_inf) {
2256 return b.sign ? float_relation_greater : float_relation_less;
2259 if (a.sign != b.sign) {
2260 return a.sign ? float_relation_less : float_relation_greater;
2263 if (a.exp == b.exp) {
2264 if (a.frac == b.frac) {
2265 return float_relation_equal;
2267 if (a.sign) {
2268 return a.frac > b.frac ?
2269 float_relation_less : float_relation_greater;
2270 } else {
2271 return a.frac > b.frac ?
2272 float_relation_greater : float_relation_less;
2274 } else {
2275 if (a.sign) {
2276 return a.exp > b.exp ? float_relation_less : float_relation_greater;
2277 } else {
2278 return a.exp > b.exp ? float_relation_greater : float_relation_less;
2283 #define COMPARE(sz) \
2284 int float ## sz ## _compare(float ## sz a, float ## sz b, \
2285 float_status *s) \
2287 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
2288 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
2289 return compare_floats(pa, pb, false, s); \
2291 int float ## sz ## _compare_quiet(float ## sz a, float ## sz b, \
2292 float_status *s) \
2294 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
2295 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
2296 return compare_floats(pa, pb, true, s); \
2299 COMPARE(16)
2300 COMPARE(32)
2301 COMPARE(64)
2303 #undef COMPARE
2305 /* Multiply A by 2 raised to the power N. */
2306 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s)
2308 if (unlikely(is_nan(a.cls))) {
2309 return return_nan(a, s);
2311 if (a.cls == float_class_normal) {
2312 /* The largest float type (even though not supported by FloatParts)
2313 * is float128, which has a 15 bit exponent. Bounding N to 16 bits
2314 * still allows rounding to infinity, without allowing overflow
2315 * within the int32_t that backs FloatParts.exp.
2317 n = MIN(MAX(n, -0x10000), 0x10000);
2318 a.exp += n;
2320 return a;
2323 float16 float16_scalbn(float16 a, int n, float_status *status)
2325 FloatParts pa = float16_unpack_canonical(a, status);
2326 FloatParts pr = scalbn_decomposed(pa, n, status);
2327 return float16_round_pack_canonical(pr, status);
2330 float32 float32_scalbn(float32 a, int n, float_status *status)
2332 FloatParts pa = float32_unpack_canonical(a, status);
2333 FloatParts pr = scalbn_decomposed(pa, n, status);
2334 return float32_round_pack_canonical(pr, status);
2337 float64 float64_scalbn(float64 a, int n, float_status *status)
2339 FloatParts pa = float64_unpack_canonical(a, status);
2340 FloatParts pr = scalbn_decomposed(pa, n, status);
2341 return float64_round_pack_canonical(pr, status);
2345 * Square Root
2347 * The old softfloat code did an approximation step before zeroing in
2348 * on the final result. However for simpleness we just compute the
2349 * square root by iterating down from the implicit bit to enough extra
2350 * bits to ensure we get a correctly rounded result.
2352 * This does mean however the calculation is slower than before,
2353 * especially for 64 bit floats.
2356 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p)
2358 uint64_t a_frac, r_frac, s_frac;
2359 int bit, last_bit;
2361 if (is_nan(a.cls)) {
2362 return return_nan(a, s);
2364 if (a.cls == float_class_zero) {
2365 return a; /* sqrt(+-0) = +-0 */
2367 if (a.sign) {
2368 s->float_exception_flags |= float_flag_invalid;
2369 return parts_default_nan(s);
2371 if (a.cls == float_class_inf) {
2372 return a; /* sqrt(+inf) = +inf */
2375 assert(a.cls == float_class_normal);
2377 /* We need two overflow bits at the top. Adding room for that is a
2378 * right shift. If the exponent is odd, we can discard the low bit
2379 * by multiplying the fraction by 2; that's a left shift. Combine
2380 * those and we shift right if the exponent is even.
2382 a_frac = a.frac;
2383 if (!(a.exp & 1)) {
2384 a_frac >>= 1;
2386 a.exp >>= 1;
2388 /* Bit-by-bit computation of sqrt. */
2389 r_frac = 0;
2390 s_frac = 0;
2392 /* Iterate from implicit bit down to the 3 extra bits to compute a
2393 * properly rounded result. Remember we've inserted one more bit
2394 * at the top, so these positions are one less.
2396 bit = DECOMPOSED_BINARY_POINT - 1;
2397 last_bit = MAX(p->frac_shift - 4, 0);
2398 do {
2399 uint64_t q = 1ULL << bit;
2400 uint64_t t_frac = s_frac + q;
2401 if (t_frac <= a_frac) {
2402 s_frac = t_frac + q;
2403 a_frac -= t_frac;
2404 r_frac += q;
2406 a_frac <<= 1;
2407 } while (--bit >= last_bit);
2409 /* Undo the right shift done above. If there is any remaining
2410 * fraction, the result is inexact. Set the sticky bit.
2412 a.frac = (r_frac << 1) + (a_frac != 0);
2414 return a;
2417 float16 __attribute__((flatten)) float16_sqrt(float16 a, float_status *status)
2419 FloatParts pa = float16_unpack_canonical(a, status);
2420 FloatParts pr = sqrt_float(pa, status, &float16_params);
2421 return float16_round_pack_canonical(pr, status);
2424 float32 __attribute__((flatten)) float32_sqrt(float32 a, float_status *status)
2426 FloatParts pa = float32_unpack_canonical(a, status);
2427 FloatParts pr = sqrt_float(pa, status, &float32_params);
2428 return float32_round_pack_canonical(pr, status);
2431 float64 __attribute__((flatten)) float64_sqrt(float64 a, float_status *status)
2433 FloatParts pa = float64_unpack_canonical(a, status);
2434 FloatParts pr = sqrt_float(pa, status, &float64_params);
2435 return float64_round_pack_canonical(pr, status);
2438 /*----------------------------------------------------------------------------
2439 | The pattern for a default generated NaN.
2440 *----------------------------------------------------------------------------*/
2442 float16 float16_default_nan(float_status *status)
2444 FloatParts p = parts_default_nan(status);
2445 p.frac >>= float16_params.frac_shift;
2446 return float16_pack_raw(p);
2449 float32 float32_default_nan(float_status *status)
2451 FloatParts p = parts_default_nan(status);
2452 p.frac >>= float32_params.frac_shift;
2453 return float32_pack_raw(p);
2456 float64 float64_default_nan(float_status *status)
2458 FloatParts p = parts_default_nan(status);
2459 p.frac >>= float64_params.frac_shift;
2460 return float64_pack_raw(p);
2463 float128 float128_default_nan(float_status *status)
2465 FloatParts p = parts_default_nan(status);
2466 float128 r;
2468 /* Extrapolate from the choices made by parts_default_nan to fill
2469 * in the quad-floating format. If the low bit is set, assume we
2470 * want to set all non-snan bits.
2472 r.low = -(p.frac & 1);
2473 r.high = p.frac >> (DECOMPOSED_BINARY_POINT - 48);
2474 r.high |= LIT64(0x7FFF000000000000);
2475 r.high |= (uint64_t)p.sign << 63;
2477 return r;
2480 /*----------------------------------------------------------------------------
2481 | Returns a quiet NaN from a signalling NaN for the floating point value `a'.
2482 *----------------------------------------------------------------------------*/
2484 float16 float16_silence_nan(float16 a, float_status *status)
2486 FloatParts p = float16_unpack_raw(a);
2487 p.frac <<= float16_params.frac_shift;
2488 p = parts_silence_nan(p, status);
2489 p.frac >>= float16_params.frac_shift;
2490 return float16_pack_raw(p);
2493 float32 float32_silence_nan(float32 a, float_status *status)
2495 FloatParts p = float32_unpack_raw(a);
2496 p.frac <<= float32_params.frac_shift;
2497 p = parts_silence_nan(p, status);
2498 p.frac >>= float32_params.frac_shift;
2499 return float32_pack_raw(p);
2502 float64 float64_silence_nan(float64 a, float_status *status)
2504 FloatParts p = float64_unpack_raw(a);
2505 p.frac <<= float64_params.frac_shift;
2506 p = parts_silence_nan(p, status);
2507 p.frac >>= float64_params.frac_shift;
2508 return float64_pack_raw(p);
2511 /*----------------------------------------------------------------------------
2512 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
2513 | and 7, and returns the properly rounded 32-bit integer corresponding to the
2514 | input. If `zSign' is 1, the input is negated before being converted to an
2515 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
2516 | is simply rounded to an integer, with the inexact exception raised if the
2517 | input cannot be represented exactly as an integer. However, if the fixed-
2518 | point input is too large, the invalid exception is raised and the largest
2519 | positive or negative integer is returned.
2520 *----------------------------------------------------------------------------*/
2522 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
2524 int8_t roundingMode;
2525 flag roundNearestEven;
2526 int8_t roundIncrement, roundBits;
2527 int32_t z;
2529 roundingMode = status->float_rounding_mode;
2530 roundNearestEven = ( roundingMode == float_round_nearest_even );
2531 switch (roundingMode) {
2532 case float_round_nearest_even:
2533 case float_round_ties_away:
2534 roundIncrement = 0x40;
2535 break;
2536 case float_round_to_zero:
2537 roundIncrement = 0;
2538 break;
2539 case float_round_up:
2540 roundIncrement = zSign ? 0 : 0x7f;
2541 break;
2542 case float_round_down:
2543 roundIncrement = zSign ? 0x7f : 0;
2544 break;
2545 default:
2546 abort();
2548 roundBits = absZ & 0x7F;
2549 absZ = ( absZ + roundIncrement )>>7;
2550 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2551 z = absZ;
2552 if ( zSign ) z = - z;
2553 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
2554 float_raise(float_flag_invalid, status);
2555 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2557 if (roundBits) {
2558 status->float_exception_flags |= float_flag_inexact;
2560 return z;
2564 /*----------------------------------------------------------------------------
2565 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2566 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2567 | and returns the properly rounded 64-bit integer corresponding to the input.
2568 | If `zSign' is 1, the input is negated before being converted to an integer.
2569 | Ordinarily, the fixed-point input is simply rounded to an integer, with
2570 | the inexact exception raised if the input cannot be represented exactly as
2571 | an integer. However, if the fixed-point input is too large, the invalid
2572 | exception is raised and the largest positive or negative integer is
2573 | returned.
2574 *----------------------------------------------------------------------------*/
2576 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
2577 float_status *status)
2579 int8_t roundingMode;
2580 flag roundNearestEven, increment;
2581 int64_t z;
2583 roundingMode = status->float_rounding_mode;
2584 roundNearestEven = ( roundingMode == float_round_nearest_even );
2585 switch (roundingMode) {
2586 case float_round_nearest_even:
2587 case float_round_ties_away:
2588 increment = ((int64_t) absZ1 < 0);
2589 break;
2590 case float_round_to_zero:
2591 increment = 0;
2592 break;
2593 case float_round_up:
2594 increment = !zSign && absZ1;
2595 break;
2596 case float_round_down:
2597 increment = zSign && absZ1;
2598 break;
2599 default:
2600 abort();
2602 if ( increment ) {
2603 ++absZ0;
2604 if ( absZ0 == 0 ) goto overflow;
2605 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
2607 z = absZ0;
2608 if ( zSign ) z = - z;
2609 if ( z && ( ( z < 0 ) ^ zSign ) ) {
2610 overflow:
2611 float_raise(float_flag_invalid, status);
2612 return
2613 zSign ? (int64_t) LIT64( 0x8000000000000000 )
2614 : LIT64( 0x7FFFFFFFFFFFFFFF );
2616 if (absZ1) {
2617 status->float_exception_flags |= float_flag_inexact;
2619 return z;
2623 /*----------------------------------------------------------------------------
2624 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2625 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2626 | and returns the properly rounded 64-bit unsigned integer corresponding to the
2627 | input. Ordinarily, the fixed-point input is simply rounded to an integer,
2628 | with the inexact exception raised if the input cannot be represented exactly
2629 | as an integer. However, if the fixed-point input is too large, the invalid
2630 | exception is raised and the largest unsigned integer is returned.
2631 *----------------------------------------------------------------------------*/
2633 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
2634 uint64_t absZ1, float_status *status)
2636 int8_t roundingMode;
2637 flag roundNearestEven, increment;
2639 roundingMode = status->float_rounding_mode;
2640 roundNearestEven = (roundingMode == float_round_nearest_even);
2641 switch (roundingMode) {
2642 case float_round_nearest_even:
2643 case float_round_ties_away:
2644 increment = ((int64_t)absZ1 < 0);
2645 break;
2646 case float_round_to_zero:
2647 increment = 0;
2648 break;
2649 case float_round_up:
2650 increment = !zSign && absZ1;
2651 break;
2652 case float_round_down:
2653 increment = zSign && absZ1;
2654 break;
2655 default:
2656 abort();
2658 if (increment) {
2659 ++absZ0;
2660 if (absZ0 == 0) {
2661 float_raise(float_flag_invalid, status);
2662 return LIT64(0xFFFFFFFFFFFFFFFF);
2664 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
2667 if (zSign && absZ0) {
2668 float_raise(float_flag_invalid, status);
2669 return 0;
2672 if (absZ1) {
2673 status->float_exception_flags |= float_flag_inexact;
2675 return absZ0;
2678 /*----------------------------------------------------------------------------
2679 | If `a' is denormal and we are in flush-to-zero mode then set the
2680 | input-denormal exception and return zero. Otherwise just return the value.
2681 *----------------------------------------------------------------------------*/
2682 float32 float32_squash_input_denormal(float32 a, float_status *status)
2684 if (status->flush_inputs_to_zero) {
2685 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
2686 float_raise(float_flag_input_denormal, status);
2687 return make_float32(float32_val(a) & 0x80000000);
2690 return a;
2693 /*----------------------------------------------------------------------------
2694 | Normalizes the subnormal single-precision floating-point value represented
2695 | by the denormalized significand `aSig'. The normalized exponent and
2696 | significand are stored at the locations pointed to by `zExpPtr' and
2697 | `zSigPtr', respectively.
2698 *----------------------------------------------------------------------------*/
2700 static void
2701 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
2703 int8_t shiftCount;
2705 shiftCount = clz32(aSig) - 8;
2706 *zSigPtr = aSig<<shiftCount;
2707 *zExpPtr = 1 - shiftCount;
2711 /*----------------------------------------------------------------------------
2712 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2713 | and significand `zSig', and returns the proper single-precision floating-
2714 | point value corresponding to the abstract input. Ordinarily, the abstract
2715 | value is simply rounded and packed into the single-precision format, with
2716 | the inexact exception raised if the abstract input cannot be represented
2717 | exactly. However, if the abstract value is too large, the overflow and
2718 | inexact exceptions are raised and an infinity or maximal finite value is
2719 | returned. If the abstract value is too small, the input value is rounded to
2720 | a subnormal number, and the underflow and inexact exceptions are raised if
2721 | the abstract input cannot be represented exactly as a subnormal single-
2722 | precision floating-point number.
2723 | The input significand `zSig' has its binary point between bits 30
2724 | and 29, which is 7 bits to the left of the usual location. This shifted
2725 | significand must be normalized or smaller. If `zSig' is not normalized,
2726 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2727 | and it must not require rounding. In the usual case that `zSig' is
2728 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2729 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2730 | Binary Floating-Point Arithmetic.
2731 *----------------------------------------------------------------------------*/
2733 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
2734 float_status *status)
2736 int8_t roundingMode;
2737 flag roundNearestEven;
2738 int8_t roundIncrement, roundBits;
2739 flag isTiny;
2741 roundingMode = status->float_rounding_mode;
2742 roundNearestEven = ( roundingMode == float_round_nearest_even );
2743 switch (roundingMode) {
2744 case float_round_nearest_even:
2745 case float_round_ties_away:
2746 roundIncrement = 0x40;
2747 break;
2748 case float_round_to_zero:
2749 roundIncrement = 0;
2750 break;
2751 case float_round_up:
2752 roundIncrement = zSign ? 0 : 0x7f;
2753 break;
2754 case float_round_down:
2755 roundIncrement = zSign ? 0x7f : 0;
2756 break;
2757 default:
2758 abort();
2759 break;
2761 roundBits = zSig & 0x7F;
2762 if ( 0xFD <= (uint16_t) zExp ) {
2763 if ( ( 0xFD < zExp )
2764 || ( ( zExp == 0xFD )
2765 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
2767 float_raise(float_flag_overflow | float_flag_inexact, status);
2768 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
2770 if ( zExp < 0 ) {
2771 if (status->flush_to_zero) {
2772 float_raise(float_flag_output_denormal, status);
2773 return packFloat32(zSign, 0, 0);
2775 isTiny =
2776 (status->float_detect_tininess
2777 == float_tininess_before_rounding)
2778 || ( zExp < -1 )
2779 || ( zSig + roundIncrement < 0x80000000 );
2780 shift32RightJamming( zSig, - zExp, &zSig );
2781 zExp = 0;
2782 roundBits = zSig & 0x7F;
2783 if (isTiny && roundBits) {
2784 float_raise(float_flag_underflow, status);
2788 if (roundBits) {
2789 status->float_exception_flags |= float_flag_inexact;
2791 zSig = ( zSig + roundIncrement )>>7;
2792 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2793 if ( zSig == 0 ) zExp = 0;
2794 return packFloat32( zSign, zExp, zSig );
2798 /*----------------------------------------------------------------------------
2799 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2800 | and significand `zSig', and returns the proper single-precision floating-
2801 | point value corresponding to the abstract input. This routine is just like
2802 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
2803 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2804 | floating-point exponent.
2805 *----------------------------------------------------------------------------*/
2807 static float32
2808 normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
2809 float_status *status)
2811 int8_t shiftCount;
2813 shiftCount = clz32(zSig) - 1;
2814 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
2815 status);
2819 /*----------------------------------------------------------------------------
2820 | If `a' is denormal and we are in flush-to-zero mode then set the
2821 | input-denormal exception and return zero. Otherwise just return the value.
2822 *----------------------------------------------------------------------------*/
2823 float64 float64_squash_input_denormal(float64 a, float_status *status)
2825 if (status->flush_inputs_to_zero) {
2826 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
2827 float_raise(float_flag_input_denormal, status);
2828 return make_float64(float64_val(a) & (1ULL << 63));
2831 return a;
2834 /*----------------------------------------------------------------------------
2835 | Normalizes the subnormal double-precision floating-point value represented
2836 | by the denormalized significand `aSig'. The normalized exponent and
2837 | significand are stored at the locations pointed to by `zExpPtr' and
2838 | `zSigPtr', respectively.
2839 *----------------------------------------------------------------------------*/
2841 static void
2842 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
2844 int8_t shiftCount;
2846 shiftCount = clz64(aSig) - 11;
2847 *zSigPtr = aSig<<shiftCount;
2848 *zExpPtr = 1 - shiftCount;
2852 /*----------------------------------------------------------------------------
2853 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2854 | double-precision floating-point value, returning the result. After being
2855 | shifted into the proper positions, the three fields are simply added
2856 | together to form the result. This means that any integer portion of `zSig'
2857 | will be added into the exponent. Since a properly normalized significand
2858 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2859 | than the desired result exponent whenever `zSig' is a complete, normalized
2860 | significand.
2861 *----------------------------------------------------------------------------*/
2863 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
2866 return make_float64(
2867 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
2871 /*----------------------------------------------------------------------------
2872 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2873 | and significand `zSig', and returns the proper double-precision floating-
2874 | point value corresponding to the abstract input. Ordinarily, the abstract
2875 | value is simply rounded and packed into the double-precision format, with
2876 | the inexact exception raised if the abstract input cannot be represented
2877 | exactly. However, if the abstract value is too large, the overflow and
2878 | inexact exceptions are raised and an infinity or maximal finite value is
2879 | returned. If the abstract value is too small, the input value is rounded to
2880 | a subnormal number, and the underflow and inexact exceptions are raised if
2881 | the abstract input cannot be represented exactly as a subnormal double-
2882 | precision floating-point number.
2883 | The input significand `zSig' has its binary point between bits 62
2884 | and 61, which is 10 bits to the left of the usual location. This shifted
2885 | significand must be normalized or smaller. If `zSig' is not normalized,
2886 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2887 | and it must not require rounding. In the usual case that `zSig' is
2888 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2889 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2890 | Binary Floating-Point Arithmetic.
2891 *----------------------------------------------------------------------------*/
2893 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2894 float_status *status)
2896 int8_t roundingMode;
2897 flag roundNearestEven;
2898 int roundIncrement, roundBits;
2899 flag isTiny;
2901 roundingMode = status->float_rounding_mode;
2902 roundNearestEven = ( roundingMode == float_round_nearest_even );
2903 switch (roundingMode) {
2904 case float_round_nearest_even:
2905 case float_round_ties_away:
2906 roundIncrement = 0x200;
2907 break;
2908 case float_round_to_zero:
2909 roundIncrement = 0;
2910 break;
2911 case float_round_up:
2912 roundIncrement = zSign ? 0 : 0x3ff;
2913 break;
2914 case float_round_down:
2915 roundIncrement = zSign ? 0x3ff : 0;
2916 break;
2917 case float_round_to_odd:
2918 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2919 break;
2920 default:
2921 abort();
2923 roundBits = zSig & 0x3FF;
2924 if ( 0x7FD <= (uint16_t) zExp ) {
2925 if ( ( 0x7FD < zExp )
2926 || ( ( zExp == 0x7FD )
2927 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
2929 bool overflow_to_inf = roundingMode != float_round_to_odd &&
2930 roundIncrement != 0;
2931 float_raise(float_flag_overflow | float_flag_inexact, status);
2932 return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
2934 if ( zExp < 0 ) {
2935 if (status->flush_to_zero) {
2936 float_raise(float_flag_output_denormal, status);
2937 return packFloat64(zSign, 0, 0);
2939 isTiny =
2940 (status->float_detect_tininess
2941 == float_tininess_before_rounding)
2942 || ( zExp < -1 )
2943 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
2944 shift64RightJamming( zSig, - zExp, &zSig );
2945 zExp = 0;
2946 roundBits = zSig & 0x3FF;
2947 if (isTiny && roundBits) {
2948 float_raise(float_flag_underflow, status);
2950 if (roundingMode == float_round_to_odd) {
2952 * For round-to-odd case, the roundIncrement depends on
2953 * zSig which just changed.
2955 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2959 if (roundBits) {
2960 status->float_exception_flags |= float_flag_inexact;
2962 zSig = ( zSig + roundIncrement )>>10;
2963 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
2964 if ( zSig == 0 ) zExp = 0;
2965 return packFloat64( zSign, zExp, zSig );
2969 /*----------------------------------------------------------------------------
2970 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2971 | and significand `zSig', and returns the proper double-precision floating-
2972 | point value corresponding to the abstract input. This routine is just like
2973 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
2974 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2975 | floating-point exponent.
2976 *----------------------------------------------------------------------------*/
2978 static float64
2979 normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2980 float_status *status)
2982 int8_t shiftCount;
2984 shiftCount = clz64(zSig) - 1;
2985 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
2986 status);
2990 /*----------------------------------------------------------------------------
2991 | Normalizes the subnormal extended double-precision floating-point value
2992 | represented by the denormalized significand `aSig'. The normalized exponent
2993 | and significand are stored at the locations pointed to by `zExpPtr' and
2994 | `zSigPtr', respectively.
2995 *----------------------------------------------------------------------------*/
2997 void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
2998 uint64_t *zSigPtr)
3000 int8_t shiftCount;
3002 shiftCount = clz64(aSig);
3003 *zSigPtr = aSig<<shiftCount;
3004 *zExpPtr = 1 - shiftCount;
3007 /*----------------------------------------------------------------------------
3008 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3009 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
3010 | and returns the proper extended double-precision floating-point value
3011 | corresponding to the abstract input. Ordinarily, the abstract value is
3012 | rounded and packed into the extended double-precision format, with the
3013 | inexact exception raised if the abstract input cannot be represented
3014 | exactly. However, if the abstract value is too large, the overflow and
3015 | inexact exceptions are raised and an infinity or maximal finite value is
3016 | returned. If the abstract value is too small, the input value is rounded to
3017 | a subnormal number, and the underflow and inexact exceptions are raised if
3018 | the abstract input cannot be represented exactly as a subnormal extended
3019 | double-precision floating-point number.
3020 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
3021 | number of bits as single or double precision, respectively. Otherwise, the
3022 | result is rounded to the full precision of the extended double-precision
3023 | format.
3024 | The input significand must be normalized or smaller. If the input
3025 | significand is not normalized, `zExp' must be 0; in that case, the result
3026 | returned is a subnormal number, and it must not require rounding. The
3027 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
3028 | Floating-Point Arithmetic.
3029 *----------------------------------------------------------------------------*/
3031 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
3032 int32_t zExp, uint64_t zSig0, uint64_t zSig1,
3033 float_status *status)
3035 int8_t roundingMode;
3036 flag roundNearestEven, increment, isTiny;
3037 int64_t roundIncrement, roundMask, roundBits;
3039 roundingMode = status->float_rounding_mode;
3040 roundNearestEven = ( roundingMode == float_round_nearest_even );
3041 if ( roundingPrecision == 80 ) goto precision80;
3042 if ( roundingPrecision == 64 ) {
3043 roundIncrement = LIT64( 0x0000000000000400 );
3044 roundMask = LIT64( 0x00000000000007FF );
3046 else if ( roundingPrecision == 32 ) {
3047 roundIncrement = LIT64( 0x0000008000000000 );
3048 roundMask = LIT64( 0x000000FFFFFFFFFF );
3050 else {
3051 goto precision80;
3053 zSig0 |= ( zSig1 != 0 );
3054 switch (roundingMode) {
3055 case float_round_nearest_even:
3056 case float_round_ties_away:
3057 break;
3058 case float_round_to_zero:
3059 roundIncrement = 0;
3060 break;
3061 case float_round_up:
3062 roundIncrement = zSign ? 0 : roundMask;
3063 break;
3064 case float_round_down:
3065 roundIncrement = zSign ? roundMask : 0;
3066 break;
3067 default:
3068 abort();
3070 roundBits = zSig0 & roundMask;
3071 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
3072 if ( ( 0x7FFE < zExp )
3073 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
3075 goto overflow;
3077 if ( zExp <= 0 ) {
3078 if (status->flush_to_zero) {
3079 float_raise(float_flag_output_denormal, status);
3080 return packFloatx80(zSign, 0, 0);
3082 isTiny =
3083 (status->float_detect_tininess
3084 == float_tininess_before_rounding)
3085 || ( zExp < 0 )
3086 || ( zSig0 <= zSig0 + roundIncrement );
3087 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
3088 zExp = 0;
3089 roundBits = zSig0 & roundMask;
3090 if (isTiny && roundBits) {
3091 float_raise(float_flag_underflow, status);
3093 if (roundBits) {
3094 status->float_exception_flags |= float_flag_inexact;
3096 zSig0 += roundIncrement;
3097 if ( (int64_t) zSig0 < 0 ) zExp = 1;
3098 roundIncrement = roundMask + 1;
3099 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
3100 roundMask |= roundIncrement;
3102 zSig0 &= ~ roundMask;
3103 return packFloatx80( zSign, zExp, zSig0 );
3106 if (roundBits) {
3107 status->float_exception_flags |= float_flag_inexact;
3109 zSig0 += roundIncrement;
3110 if ( zSig0 < roundIncrement ) {
3111 ++zExp;
3112 zSig0 = LIT64( 0x8000000000000000 );
3114 roundIncrement = roundMask + 1;
3115 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
3116 roundMask |= roundIncrement;
3118 zSig0 &= ~ roundMask;
3119 if ( zSig0 == 0 ) zExp = 0;
3120 return packFloatx80( zSign, zExp, zSig0 );
3121 precision80:
3122 switch (roundingMode) {
3123 case float_round_nearest_even:
3124 case float_round_ties_away:
3125 increment = ((int64_t)zSig1 < 0);
3126 break;
3127 case float_round_to_zero:
3128 increment = 0;
3129 break;
3130 case float_round_up:
3131 increment = !zSign && zSig1;
3132 break;
3133 case float_round_down:
3134 increment = zSign && zSig1;
3135 break;
3136 default:
3137 abort();
3139 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
3140 if ( ( 0x7FFE < zExp )
3141 || ( ( zExp == 0x7FFE )
3142 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
3143 && increment
3146 roundMask = 0;
3147 overflow:
3148 float_raise(float_flag_overflow | float_flag_inexact, status);
3149 if ( ( roundingMode == float_round_to_zero )
3150 || ( zSign && ( roundingMode == float_round_up ) )
3151 || ( ! zSign && ( roundingMode == float_round_down ) )
3153 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
3155 return packFloatx80(zSign,
3156 floatx80_infinity_high,
3157 floatx80_infinity_low);
3159 if ( zExp <= 0 ) {
3160 isTiny =
3161 (status->float_detect_tininess
3162 == float_tininess_before_rounding)
3163 || ( zExp < 0 )
3164 || ! increment
3165 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
3166 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
3167 zExp = 0;
3168 if (isTiny && zSig1) {
3169 float_raise(float_flag_underflow, status);
3171 if (zSig1) {
3172 status->float_exception_flags |= float_flag_inexact;
3174 switch (roundingMode) {
3175 case float_round_nearest_even:
3176 case float_round_ties_away:
3177 increment = ((int64_t)zSig1 < 0);
3178 break;
3179 case float_round_to_zero:
3180 increment = 0;
3181 break;
3182 case float_round_up:
3183 increment = !zSign && zSig1;
3184 break;
3185 case float_round_down:
3186 increment = zSign && zSig1;
3187 break;
3188 default:
3189 abort();
3191 if ( increment ) {
3192 ++zSig0;
3193 zSig0 &=
3194 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
3195 if ( (int64_t) zSig0 < 0 ) zExp = 1;
3197 return packFloatx80( zSign, zExp, zSig0 );
3200 if (zSig1) {
3201 status->float_exception_flags |= float_flag_inexact;
3203 if ( increment ) {
3204 ++zSig0;
3205 if ( zSig0 == 0 ) {
3206 ++zExp;
3207 zSig0 = LIT64( 0x8000000000000000 );
3209 else {
3210 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
3213 else {
3214 if ( zSig0 == 0 ) zExp = 0;
3216 return packFloatx80( zSign, zExp, zSig0 );
3220 /*----------------------------------------------------------------------------
3221 | Takes an abstract floating-point value having sign `zSign', exponent
3222 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
3223 | and returns the proper extended double-precision floating-point value
3224 | corresponding to the abstract input. This routine is just like
3225 | `roundAndPackFloatx80' except that the input significand does not have to be
3226 | normalized.
3227 *----------------------------------------------------------------------------*/
3229 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
3230 flag zSign, int32_t zExp,
3231 uint64_t zSig0, uint64_t zSig1,
3232 float_status *status)
3234 int8_t shiftCount;
3236 if ( zSig0 == 0 ) {
3237 zSig0 = zSig1;
3238 zSig1 = 0;
3239 zExp -= 64;
3241 shiftCount = clz64(zSig0);
3242 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3243 zExp -= shiftCount;
3244 return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
3245 zSig0, zSig1, status);
3249 /*----------------------------------------------------------------------------
3250 | Returns the least-significant 64 fraction bits of the quadruple-precision
3251 | floating-point value `a'.
3252 *----------------------------------------------------------------------------*/
3254 static inline uint64_t extractFloat128Frac1( float128 a )
3257 return a.low;
3261 /*----------------------------------------------------------------------------
3262 | Returns the most-significant 48 fraction bits of the quadruple-precision
3263 | floating-point value `a'.
3264 *----------------------------------------------------------------------------*/
3266 static inline uint64_t extractFloat128Frac0( float128 a )
3269 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
3273 /*----------------------------------------------------------------------------
3274 | Returns the exponent bits of the quadruple-precision floating-point value
3275 | `a'.
3276 *----------------------------------------------------------------------------*/
3278 static inline int32_t extractFloat128Exp( float128 a )
3281 return ( a.high>>48 ) & 0x7FFF;
3285 /*----------------------------------------------------------------------------
3286 | Returns the sign bit of the quadruple-precision floating-point value `a'.
3287 *----------------------------------------------------------------------------*/
3289 static inline flag extractFloat128Sign( float128 a )
3292 return a.high>>63;
3296 /*----------------------------------------------------------------------------
3297 | Normalizes the subnormal quadruple-precision floating-point value
3298 | represented by the denormalized significand formed by the concatenation of
3299 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
3300 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
3301 | significand are stored at the location pointed to by `zSig0Ptr', and the
3302 | least significant 64 bits of the normalized significand are stored at the
3303 | location pointed to by `zSig1Ptr'.
3304 *----------------------------------------------------------------------------*/
3306 static void
3307 normalizeFloat128Subnormal(
3308 uint64_t aSig0,
3309 uint64_t aSig1,
3310 int32_t *zExpPtr,
3311 uint64_t *zSig0Ptr,
3312 uint64_t *zSig1Ptr
3315 int8_t shiftCount;
3317 if ( aSig0 == 0 ) {
3318 shiftCount = clz64(aSig1) - 15;
3319 if ( shiftCount < 0 ) {
3320 *zSig0Ptr = aSig1>>( - shiftCount );
3321 *zSig1Ptr = aSig1<<( shiftCount & 63 );
3323 else {
3324 *zSig0Ptr = aSig1<<shiftCount;
3325 *zSig1Ptr = 0;
3327 *zExpPtr = - shiftCount - 63;
3329 else {
3330 shiftCount = clz64(aSig0) - 15;
3331 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
3332 *zExpPtr = 1 - shiftCount;
3337 /*----------------------------------------------------------------------------
3338 | Packs the sign `zSign', the exponent `zExp', and the significand formed
3339 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
3340 | floating-point value, returning the result. After being shifted into the
3341 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
3342 | added together to form the most significant 32 bits of the result. This
3343 | means that any integer portion of `zSig0' will be added into the exponent.
3344 | Since a properly normalized significand will have an integer portion equal
3345 | to 1, the `zExp' input should be 1 less than the desired result exponent
3346 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
3347 | significand.
3348 *----------------------------------------------------------------------------*/
3350 static inline float128
3351 packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
3353 float128 z;
3355 z.low = zSig1;
3356 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
3357 return z;
3361 /*----------------------------------------------------------------------------
3362 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3363 | and extended significand formed by the concatenation of `zSig0', `zSig1',
3364 | and `zSig2', and returns the proper quadruple-precision floating-point value
3365 | corresponding to the abstract input. Ordinarily, the abstract value is
3366 | simply rounded and packed into the quadruple-precision format, with the
3367 | inexact exception raised if the abstract input cannot be represented
3368 | exactly. However, if the abstract value is too large, the overflow and
3369 | inexact exceptions are raised and an infinity or maximal finite value is
3370 | returned. If the abstract value is too small, the input value is rounded to
3371 | a subnormal number, and the underflow and inexact exceptions are raised if
3372 | the abstract input cannot be represented exactly as a subnormal quadruple-
3373 | precision floating-point number.
3374 | The input significand must be normalized or smaller. If the input
3375 | significand is not normalized, `zExp' must be 0; in that case, the result
3376 | returned is a subnormal number, and it must not require rounding. In the
3377 | usual case that the input significand is normalized, `zExp' must be 1 less
3378 | than the ``true'' floating-point exponent. The handling of underflow and
3379 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3380 *----------------------------------------------------------------------------*/
3382 static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
3383 uint64_t zSig0, uint64_t zSig1,
3384 uint64_t zSig2, float_status *status)
3386 int8_t roundingMode;
3387 flag roundNearestEven, increment, isTiny;
3389 roundingMode = status->float_rounding_mode;
3390 roundNearestEven = ( roundingMode == float_round_nearest_even );
3391 switch (roundingMode) {
3392 case float_round_nearest_even:
3393 case float_round_ties_away:
3394 increment = ((int64_t)zSig2 < 0);
3395 break;
3396 case float_round_to_zero:
3397 increment = 0;
3398 break;
3399 case float_round_up:
3400 increment = !zSign && zSig2;
3401 break;
3402 case float_round_down:
3403 increment = zSign && zSig2;
3404 break;
3405 case float_round_to_odd:
3406 increment = !(zSig1 & 0x1) && zSig2;
3407 break;
3408 default:
3409 abort();
3411 if ( 0x7FFD <= (uint32_t) zExp ) {
3412 if ( ( 0x7FFD < zExp )
3413 || ( ( zExp == 0x7FFD )
3414 && eq128(
3415 LIT64( 0x0001FFFFFFFFFFFF ),
3416 LIT64( 0xFFFFFFFFFFFFFFFF ),
3417 zSig0,
3418 zSig1
3420 && increment
3423 float_raise(float_flag_overflow | float_flag_inexact, status);
3424 if ( ( roundingMode == float_round_to_zero )
3425 || ( zSign && ( roundingMode == float_round_up ) )
3426 || ( ! zSign && ( roundingMode == float_round_down ) )
3427 || (roundingMode == float_round_to_odd)
3429 return
3430 packFloat128(
3431 zSign,
3432 0x7FFE,
3433 LIT64( 0x0000FFFFFFFFFFFF ),
3434 LIT64( 0xFFFFFFFFFFFFFFFF )
3437 return packFloat128( zSign, 0x7FFF, 0, 0 );
3439 if ( zExp < 0 ) {
3440 if (status->flush_to_zero) {
3441 float_raise(float_flag_output_denormal, status);
3442 return packFloat128(zSign, 0, 0, 0);
3444 isTiny =
3445 (status->float_detect_tininess
3446 == float_tininess_before_rounding)
3447 || ( zExp < -1 )
3448 || ! increment
3449 || lt128(
3450 zSig0,
3451 zSig1,
3452 LIT64( 0x0001FFFFFFFFFFFF ),
3453 LIT64( 0xFFFFFFFFFFFFFFFF )
3455 shift128ExtraRightJamming(
3456 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
3457 zExp = 0;
3458 if (isTiny && zSig2) {
3459 float_raise(float_flag_underflow, status);
3461 switch (roundingMode) {
3462 case float_round_nearest_even:
3463 case float_round_ties_away:
3464 increment = ((int64_t)zSig2 < 0);
3465 break;
3466 case float_round_to_zero:
3467 increment = 0;
3468 break;
3469 case float_round_up:
3470 increment = !zSign && zSig2;
3471 break;
3472 case float_round_down:
3473 increment = zSign && zSig2;
3474 break;
3475 case float_round_to_odd:
3476 increment = !(zSig1 & 0x1) && zSig2;
3477 break;
3478 default:
3479 abort();
3483 if (zSig2) {
3484 status->float_exception_flags |= float_flag_inexact;
3486 if ( increment ) {
3487 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
3488 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
3490 else {
3491 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
3493 return packFloat128( zSign, zExp, zSig0, zSig1 );
3497 /*----------------------------------------------------------------------------
3498 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3499 | and significand formed by the concatenation of `zSig0' and `zSig1', and
3500 | returns the proper quadruple-precision floating-point value corresponding
3501 | to the abstract input. This routine is just like `roundAndPackFloat128'
3502 | except that the input significand has fewer bits and does not have to be
3503 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
3504 | point exponent.
3505 *----------------------------------------------------------------------------*/
3507 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
3508 uint64_t zSig0, uint64_t zSig1,
3509 float_status *status)
3511 int8_t shiftCount;
3512 uint64_t zSig2;
3514 if ( zSig0 == 0 ) {
3515 zSig0 = zSig1;
3516 zSig1 = 0;
3517 zExp -= 64;
3519 shiftCount = clz64(zSig0) - 15;
3520 if ( 0 <= shiftCount ) {
3521 zSig2 = 0;
3522 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3524 else {
3525 shift128ExtraRightJamming(
3526 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
3528 zExp -= shiftCount;
3529 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
3534 /*----------------------------------------------------------------------------
3535 | Returns the result of converting the 32-bit two's complement integer `a'
3536 | to the extended double-precision floating-point format. The conversion
3537 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3538 | Arithmetic.
3539 *----------------------------------------------------------------------------*/
3541 floatx80 int32_to_floatx80(int32_t a, float_status *status)
3543 flag zSign;
3544 uint32_t absA;
3545 int8_t shiftCount;
3546 uint64_t zSig;
3548 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3549 zSign = ( a < 0 );
3550 absA = zSign ? - a : a;
3551 shiftCount = clz32(absA) + 32;
3552 zSig = absA;
3553 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
3557 /*----------------------------------------------------------------------------
3558 | Returns the result of converting the 32-bit two's complement integer `a' to
3559 | the quadruple-precision floating-point format. The conversion is performed
3560 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3561 *----------------------------------------------------------------------------*/
3563 float128 int32_to_float128(int32_t a, float_status *status)
3565 flag zSign;
3566 uint32_t absA;
3567 int8_t shiftCount;
3568 uint64_t zSig0;
3570 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3571 zSign = ( a < 0 );
3572 absA = zSign ? - a : a;
3573 shiftCount = clz32(absA) + 17;
3574 zSig0 = absA;
3575 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
3579 /*----------------------------------------------------------------------------
3580 | Returns the result of converting the 64-bit two's complement integer `a'
3581 | to the extended double-precision floating-point format. The conversion
3582 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3583 | Arithmetic.
3584 *----------------------------------------------------------------------------*/
3586 floatx80 int64_to_floatx80(int64_t a, float_status *status)
3588 flag zSign;
3589 uint64_t absA;
3590 int8_t shiftCount;
3592 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3593 zSign = ( a < 0 );
3594 absA = zSign ? - a : a;
3595 shiftCount = clz64(absA);
3596 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
3600 /*----------------------------------------------------------------------------
3601 | Returns the result of converting the 64-bit two's complement integer `a' to
3602 | the quadruple-precision floating-point format. The conversion is performed
3603 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3604 *----------------------------------------------------------------------------*/
3606 float128 int64_to_float128(int64_t a, float_status *status)
3608 flag zSign;
3609 uint64_t absA;
3610 int8_t shiftCount;
3611 int32_t zExp;
3612 uint64_t zSig0, zSig1;
3614 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3615 zSign = ( a < 0 );
3616 absA = zSign ? - a : a;
3617 shiftCount = clz64(absA) + 49;
3618 zExp = 0x406E - shiftCount;
3619 if ( 64 <= shiftCount ) {
3620 zSig1 = 0;
3621 zSig0 = absA;
3622 shiftCount -= 64;
3624 else {
3625 zSig1 = absA;
3626 zSig0 = 0;
3628 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3629 return packFloat128( zSign, zExp, zSig0, zSig1 );
3633 /*----------------------------------------------------------------------------
3634 | Returns the result of converting the 64-bit unsigned integer `a'
3635 | to the quadruple-precision floating-point format. The conversion is performed
3636 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3637 *----------------------------------------------------------------------------*/
3639 float128 uint64_to_float128(uint64_t a, float_status *status)
3641 if (a == 0) {
3642 return float128_zero;
3644 return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status);
3647 /*----------------------------------------------------------------------------
3648 | Returns the result of converting the single-precision floating-point value
3649 | `a' to the extended double-precision floating-point format. The conversion
3650 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3651 | Arithmetic.
3652 *----------------------------------------------------------------------------*/
3654 floatx80 float32_to_floatx80(float32 a, float_status *status)
3656 flag aSign;
3657 int aExp;
3658 uint32_t aSig;
3660 a = float32_squash_input_denormal(a, status);
3661 aSig = extractFloat32Frac( a );
3662 aExp = extractFloat32Exp( a );
3663 aSign = extractFloat32Sign( a );
3664 if ( aExp == 0xFF ) {
3665 if (aSig) {
3666 return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
3668 return packFloatx80(aSign,
3669 floatx80_infinity_high,
3670 floatx80_infinity_low);
3672 if ( aExp == 0 ) {
3673 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3674 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3676 aSig |= 0x00800000;
3677 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
3681 /*----------------------------------------------------------------------------
3682 | Returns the result of converting the single-precision floating-point value
3683 | `a' to the double-precision floating-point format. The conversion is
3684 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3685 | Arithmetic.
3686 *----------------------------------------------------------------------------*/
3688 float128 float32_to_float128(float32 a, float_status *status)
3690 flag aSign;
3691 int aExp;
3692 uint32_t aSig;
3694 a = float32_squash_input_denormal(a, status);
3695 aSig = extractFloat32Frac( a );
3696 aExp = extractFloat32Exp( a );
3697 aSign = extractFloat32Sign( a );
3698 if ( aExp == 0xFF ) {
3699 if (aSig) {
3700 return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
3702 return packFloat128( aSign, 0x7FFF, 0, 0 );
3704 if ( aExp == 0 ) {
3705 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3706 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3707 --aExp;
3709 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
3713 /*----------------------------------------------------------------------------
3714 | Returns the remainder of the single-precision floating-point value `a'
3715 | with respect to the corresponding value `b'. The operation is performed
3716 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3717 *----------------------------------------------------------------------------*/
3719 float32 float32_rem(float32 a, float32 b, float_status *status)
3721 flag aSign, zSign;
3722 int aExp, bExp, expDiff;
3723 uint32_t aSig, bSig;
3724 uint32_t q;
3725 uint64_t aSig64, bSig64, q64;
3726 uint32_t alternateASig;
3727 int32_t sigMean;
3728 a = float32_squash_input_denormal(a, status);
3729 b = float32_squash_input_denormal(b, status);
3731 aSig = extractFloat32Frac( a );
3732 aExp = extractFloat32Exp( a );
3733 aSign = extractFloat32Sign( a );
3734 bSig = extractFloat32Frac( b );
3735 bExp = extractFloat32Exp( b );
3736 if ( aExp == 0xFF ) {
3737 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
3738 return propagateFloat32NaN(a, b, status);
3740 float_raise(float_flag_invalid, status);
3741 return float32_default_nan(status);
3743 if ( bExp == 0xFF ) {
3744 if (bSig) {
3745 return propagateFloat32NaN(a, b, status);
3747 return a;
3749 if ( bExp == 0 ) {
3750 if ( bSig == 0 ) {
3751 float_raise(float_flag_invalid, status);
3752 return float32_default_nan(status);
3754 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
3756 if ( aExp == 0 ) {
3757 if ( aSig == 0 ) return a;
3758 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3760 expDiff = aExp - bExp;
3761 aSig |= 0x00800000;
3762 bSig |= 0x00800000;
3763 if ( expDiff < 32 ) {
3764 aSig <<= 8;
3765 bSig <<= 8;
3766 if ( expDiff < 0 ) {
3767 if ( expDiff < -1 ) return a;
3768 aSig >>= 1;
3770 q = ( bSig <= aSig );
3771 if ( q ) aSig -= bSig;
3772 if ( 0 < expDiff ) {
3773 q = ( ( (uint64_t) aSig )<<32 ) / bSig;
3774 q >>= 32 - expDiff;
3775 bSig >>= 2;
3776 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3778 else {
3779 aSig >>= 2;
3780 bSig >>= 2;
3783 else {
3784 if ( bSig <= aSig ) aSig -= bSig;
3785 aSig64 = ( (uint64_t) aSig )<<40;
3786 bSig64 = ( (uint64_t) bSig )<<40;
3787 expDiff -= 64;
3788 while ( 0 < expDiff ) {
3789 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3790 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3791 aSig64 = - ( ( bSig * q64 )<<38 );
3792 expDiff -= 62;
3794 expDiff += 64;
3795 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3796 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3797 q = q64>>( 64 - expDiff );
3798 bSig <<= 6;
3799 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
3801 do {
3802 alternateASig = aSig;
3803 ++q;
3804 aSig -= bSig;
3805 } while ( 0 <= (int32_t) aSig );
3806 sigMean = aSig + alternateASig;
3807 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3808 aSig = alternateASig;
3810 zSign = ( (int32_t) aSig < 0 );
3811 if ( zSign ) aSig = - aSig;
3812 return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
3817 /*----------------------------------------------------------------------------
3818 | Returns the binary exponential of the single-precision floating-point value
3819 | `a'. The operation is performed according to the IEC/IEEE Standard for
3820 | Binary Floating-Point Arithmetic.
3822 | Uses the following identities:
3824 | 1. -------------------------------------------------------------------------
3825 | x x*ln(2)
3826 | 2 = e
3828 | 2. -------------------------------------------------------------------------
3829 | 2 3 4 5 n
3830 | x x x x x x x
3831 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
3832 | 1! 2! 3! 4! 5! n!
3833 *----------------------------------------------------------------------------*/
3835 static const float64 float32_exp2_coefficients[15] =
3837 const_float64( 0x3ff0000000000000ll ), /* 1 */
3838 const_float64( 0x3fe0000000000000ll ), /* 2 */
3839 const_float64( 0x3fc5555555555555ll ), /* 3 */
3840 const_float64( 0x3fa5555555555555ll ), /* 4 */
3841 const_float64( 0x3f81111111111111ll ), /* 5 */
3842 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */
3843 const_float64( 0x3f2a01a01a01a01all ), /* 7 */
3844 const_float64( 0x3efa01a01a01a01all ), /* 8 */
3845 const_float64( 0x3ec71de3a556c734ll ), /* 9 */
3846 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
3847 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
3848 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
3849 const_float64( 0x3de6124613a86d09ll ), /* 13 */
3850 const_float64( 0x3da93974a8c07c9dll ), /* 14 */
3851 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
3854 float32 float32_exp2(float32 a, float_status *status)
3856 flag aSign;
3857 int aExp;
3858 uint32_t aSig;
3859 float64 r, x, xn;
3860 int i;
3861 a = float32_squash_input_denormal(a, status);
3863 aSig = extractFloat32Frac( a );
3864 aExp = extractFloat32Exp( a );
3865 aSign = extractFloat32Sign( a );
3867 if ( aExp == 0xFF) {
3868 if (aSig) {
3869 return propagateFloat32NaN(a, float32_zero, status);
3871 return (aSign) ? float32_zero : a;
3873 if (aExp == 0) {
3874 if (aSig == 0) return float32_one;
3877 float_raise(float_flag_inexact, status);
3879 /* ******************************* */
3880 /* using float64 for approximation */
3881 /* ******************************* */
3882 x = float32_to_float64(a, status);
3883 x = float64_mul(x, float64_ln2, status);
3885 xn = x;
3886 r = float64_one;
3887 for (i = 0 ; i < 15 ; i++) {
3888 float64 f;
3890 f = float64_mul(xn, float32_exp2_coefficients[i], status);
3891 r = float64_add(r, f, status);
3893 xn = float64_mul(xn, x, status);
3896 return float64_to_float32(r, status);
3899 /*----------------------------------------------------------------------------
3900 | Returns the binary log of the single-precision floating-point value `a'.
3901 | The operation is performed according to the IEC/IEEE Standard for Binary
3902 | Floating-Point Arithmetic.
3903 *----------------------------------------------------------------------------*/
3904 float32 float32_log2(float32 a, float_status *status)
3906 flag aSign, zSign;
3907 int aExp;
3908 uint32_t aSig, zSig, i;
3910 a = float32_squash_input_denormal(a, status);
3911 aSig = extractFloat32Frac( a );
3912 aExp = extractFloat32Exp( a );
3913 aSign = extractFloat32Sign( a );
3915 if ( aExp == 0 ) {
3916 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
3917 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3919 if ( aSign ) {
3920 float_raise(float_flag_invalid, status);
3921 return float32_default_nan(status);
3923 if ( aExp == 0xFF ) {
3924 if (aSig) {
3925 return propagateFloat32NaN(a, float32_zero, status);
3927 return a;
3930 aExp -= 0x7F;
3931 aSig |= 0x00800000;
3932 zSign = aExp < 0;
3933 zSig = aExp << 23;
3935 for (i = 1 << 22; i > 0; i >>= 1) {
3936 aSig = ( (uint64_t)aSig * aSig ) >> 23;
3937 if ( aSig & 0x01000000 ) {
3938 aSig >>= 1;
3939 zSig |= i;
3943 if ( zSign )
3944 zSig = -zSig;
3946 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
3949 /*----------------------------------------------------------------------------
3950 | Returns 1 if the single-precision floating-point value `a' is equal to
3951 | the corresponding value `b', and 0 otherwise. The invalid exception is
3952 | raised if either operand is a NaN. Otherwise, the comparison is performed
3953 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3954 *----------------------------------------------------------------------------*/
3956 int float32_eq(float32 a, float32 b, float_status *status)
3958 uint32_t av, bv;
3959 a = float32_squash_input_denormal(a, status);
3960 b = float32_squash_input_denormal(b, status);
3962 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3963 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3965 float_raise(float_flag_invalid, status);
3966 return 0;
3968 av = float32_val(a);
3969 bv = float32_val(b);
3970 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3973 /*----------------------------------------------------------------------------
3974 | Returns 1 if the single-precision floating-point value `a' is less than
3975 | or equal to the corresponding value `b', and 0 otherwise. The invalid
3976 | exception is raised if either operand is a NaN. The comparison is performed
3977 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3978 *----------------------------------------------------------------------------*/
3980 int float32_le(float32 a, float32 b, float_status *status)
3982 flag aSign, bSign;
3983 uint32_t av, bv;
3984 a = float32_squash_input_denormal(a, status);
3985 b = float32_squash_input_denormal(b, status);
3987 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3988 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3990 float_raise(float_flag_invalid, status);
3991 return 0;
3993 aSign = extractFloat32Sign( a );
3994 bSign = extractFloat32Sign( b );
3995 av = float32_val(a);
3996 bv = float32_val(b);
3997 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3998 return ( av == bv ) || ( aSign ^ ( av < bv ) );
4002 /*----------------------------------------------------------------------------
4003 | Returns 1 if the single-precision floating-point value `a' is less than
4004 | the corresponding value `b', and 0 otherwise. The invalid exception is
4005 | raised if either operand is a NaN. The comparison is performed according
4006 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4007 *----------------------------------------------------------------------------*/
4009 int float32_lt(float32 a, float32 b, float_status *status)
4011 flag aSign, bSign;
4012 uint32_t av, bv;
4013 a = float32_squash_input_denormal(a, status);
4014 b = float32_squash_input_denormal(b, status);
4016 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4017 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4019 float_raise(float_flag_invalid, status);
4020 return 0;
4022 aSign = extractFloat32Sign( a );
4023 bSign = extractFloat32Sign( b );
4024 av = float32_val(a);
4025 bv = float32_val(b);
4026 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
4027 return ( av != bv ) && ( aSign ^ ( av < bv ) );
4031 /*----------------------------------------------------------------------------
4032 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
4033 | be compared, and 0 otherwise. The invalid exception is raised if either
4034 | operand is a NaN. The comparison is performed according to the IEC/IEEE
4035 | Standard for Binary Floating-Point Arithmetic.
4036 *----------------------------------------------------------------------------*/
4038 int float32_unordered(float32 a, float32 b, float_status *status)
4040 a = float32_squash_input_denormal(a, status);
4041 b = float32_squash_input_denormal(b, status);
4043 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4044 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4046 float_raise(float_flag_invalid, status);
4047 return 1;
4049 return 0;
4052 /*----------------------------------------------------------------------------
4053 | Returns 1 if the single-precision floating-point value `a' is equal to
4054 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4055 | exception. The comparison is performed according to the IEC/IEEE Standard
4056 | for Binary Floating-Point Arithmetic.
4057 *----------------------------------------------------------------------------*/
4059 int float32_eq_quiet(float32 a, float32 b, float_status *status)
4061 a = float32_squash_input_denormal(a, status);
4062 b = float32_squash_input_denormal(b, status);
4064 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4065 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4067 if (float32_is_signaling_nan(a, status)
4068 || float32_is_signaling_nan(b, status)) {
4069 float_raise(float_flag_invalid, status);
4071 return 0;
4073 return ( float32_val(a) == float32_val(b) ) ||
4074 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
4077 /*----------------------------------------------------------------------------
4078 | Returns 1 if the single-precision floating-point value `a' is less than or
4079 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4080 | cause an exception. Otherwise, the comparison is performed according to the
4081 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4082 *----------------------------------------------------------------------------*/
4084 int float32_le_quiet(float32 a, float32 b, float_status *status)
4086 flag aSign, bSign;
4087 uint32_t av, bv;
4088 a = float32_squash_input_denormal(a, status);
4089 b = float32_squash_input_denormal(b, status);
4091 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4092 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4094 if (float32_is_signaling_nan(a, status)
4095 || float32_is_signaling_nan(b, status)) {
4096 float_raise(float_flag_invalid, status);
4098 return 0;
4100 aSign = extractFloat32Sign( a );
4101 bSign = extractFloat32Sign( b );
4102 av = float32_val(a);
4103 bv = float32_val(b);
4104 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
4105 return ( av == bv ) || ( aSign ^ ( av < bv ) );
4109 /*----------------------------------------------------------------------------
4110 | Returns 1 if the single-precision floating-point value `a' is less than
4111 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4112 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
4113 | Standard for Binary Floating-Point Arithmetic.
4114 *----------------------------------------------------------------------------*/
4116 int float32_lt_quiet(float32 a, float32 b, float_status *status)
4118 flag aSign, bSign;
4119 uint32_t av, bv;
4120 a = float32_squash_input_denormal(a, status);
4121 b = float32_squash_input_denormal(b, status);
4123 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4124 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4126 if (float32_is_signaling_nan(a, status)
4127 || float32_is_signaling_nan(b, status)) {
4128 float_raise(float_flag_invalid, status);
4130 return 0;
4132 aSign = extractFloat32Sign( a );
4133 bSign = extractFloat32Sign( b );
4134 av = float32_val(a);
4135 bv = float32_val(b);
4136 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
4137 return ( av != bv ) && ( aSign ^ ( av < bv ) );
4141 /*----------------------------------------------------------------------------
4142 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
4143 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
4144 | comparison is performed according to the IEC/IEEE Standard for Binary
4145 | Floating-Point Arithmetic.
4146 *----------------------------------------------------------------------------*/
4148 int float32_unordered_quiet(float32 a, float32 b, float_status *status)
4150 a = float32_squash_input_denormal(a, status);
4151 b = float32_squash_input_denormal(b, status);
4153 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4154 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4156 if (float32_is_signaling_nan(a, status)
4157 || float32_is_signaling_nan(b, status)) {
4158 float_raise(float_flag_invalid, status);
4160 return 1;
4162 return 0;
4165 /*----------------------------------------------------------------------------
4166 | If `a' is denormal and we are in flush-to-zero mode then set the
4167 | input-denormal exception and return zero. Otherwise just return the value.
4168 *----------------------------------------------------------------------------*/
4169 float16 float16_squash_input_denormal(float16 a, float_status *status)
4171 if (status->flush_inputs_to_zero) {
4172 if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) {
4173 float_raise(float_flag_input_denormal, status);
4174 return make_float16(float16_val(a) & 0x8000);
4177 return a;
4180 /*----------------------------------------------------------------------------
4181 | Returns the result of converting the double-precision floating-point value
4182 | `a' to the extended double-precision floating-point format. The conversion
4183 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4184 | Arithmetic.
4185 *----------------------------------------------------------------------------*/
4187 floatx80 float64_to_floatx80(float64 a, float_status *status)
4189 flag aSign;
4190 int aExp;
4191 uint64_t aSig;
4193 a = float64_squash_input_denormal(a, status);
4194 aSig = extractFloat64Frac( a );
4195 aExp = extractFloat64Exp( a );
4196 aSign = extractFloat64Sign( a );
4197 if ( aExp == 0x7FF ) {
4198 if (aSig) {
4199 return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
4201 return packFloatx80(aSign,
4202 floatx80_infinity_high,
4203 floatx80_infinity_low);
4205 if ( aExp == 0 ) {
4206 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4207 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4209 return
4210 packFloatx80(
4211 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
4215 /*----------------------------------------------------------------------------
4216 | Returns the result of converting the double-precision floating-point value
4217 | `a' to the quadruple-precision floating-point format. The conversion is
4218 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4219 | Arithmetic.
4220 *----------------------------------------------------------------------------*/
4222 float128 float64_to_float128(float64 a, float_status *status)
4224 flag aSign;
4225 int aExp;
4226 uint64_t aSig, zSig0, zSig1;
4228 a = float64_squash_input_denormal(a, status);
4229 aSig = extractFloat64Frac( a );
4230 aExp = extractFloat64Exp( a );
4231 aSign = extractFloat64Sign( a );
4232 if ( aExp == 0x7FF ) {
4233 if (aSig) {
4234 return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
4236 return packFloat128( aSign, 0x7FFF, 0, 0 );
4238 if ( aExp == 0 ) {
4239 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4240 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4241 --aExp;
4243 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
4244 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
4249 /*----------------------------------------------------------------------------
4250 | Returns the remainder of the double-precision floating-point value `a'
4251 | with respect to the corresponding value `b'. The operation is performed
4252 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4253 *----------------------------------------------------------------------------*/
4255 float64 float64_rem(float64 a, float64 b, float_status *status)
4257 flag aSign, zSign;
4258 int aExp, bExp, expDiff;
4259 uint64_t aSig, bSig;
4260 uint64_t q, alternateASig;
4261 int64_t sigMean;
4263 a = float64_squash_input_denormal(a, status);
4264 b = float64_squash_input_denormal(b, status);
4265 aSig = extractFloat64Frac( a );
4266 aExp = extractFloat64Exp( a );
4267 aSign = extractFloat64Sign( a );
4268 bSig = extractFloat64Frac( b );
4269 bExp = extractFloat64Exp( b );
4270 if ( aExp == 0x7FF ) {
4271 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4272 return propagateFloat64NaN(a, b, status);
4274 float_raise(float_flag_invalid, status);
4275 return float64_default_nan(status);
4277 if ( bExp == 0x7FF ) {
4278 if (bSig) {
4279 return propagateFloat64NaN(a, b, status);
4281 return a;
4283 if ( bExp == 0 ) {
4284 if ( bSig == 0 ) {
4285 float_raise(float_flag_invalid, status);
4286 return float64_default_nan(status);
4288 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4290 if ( aExp == 0 ) {
4291 if ( aSig == 0 ) return a;
4292 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4294 expDiff = aExp - bExp;
4295 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
4296 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4297 if ( expDiff < 0 ) {
4298 if ( expDiff < -1 ) return a;
4299 aSig >>= 1;
4301 q = ( bSig <= aSig );
4302 if ( q ) aSig -= bSig;
4303 expDiff -= 64;
4304 while ( 0 < expDiff ) {
4305 q = estimateDiv128To64( aSig, 0, bSig );
4306 q = ( 2 < q ) ? q - 2 : 0;
4307 aSig = - ( ( bSig>>2 ) * q );
4308 expDiff -= 62;
4310 expDiff += 64;
4311 if ( 0 < expDiff ) {
4312 q = estimateDiv128To64( aSig, 0, bSig );
4313 q = ( 2 < q ) ? q - 2 : 0;
4314 q >>= 64 - expDiff;
4315 bSig >>= 2;
4316 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4318 else {
4319 aSig >>= 2;
4320 bSig >>= 2;
4322 do {
4323 alternateASig = aSig;
4324 ++q;
4325 aSig -= bSig;
4326 } while ( 0 <= (int64_t) aSig );
4327 sigMean = aSig + alternateASig;
4328 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4329 aSig = alternateASig;
4331 zSign = ( (int64_t) aSig < 0 );
4332 if ( zSign ) aSig = - aSig;
4333 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
4337 /*----------------------------------------------------------------------------
4338 | Returns the binary log of the double-precision floating-point value `a'.
4339 | The operation is performed according to the IEC/IEEE Standard for Binary
4340 | Floating-Point Arithmetic.
4341 *----------------------------------------------------------------------------*/
4342 float64 float64_log2(float64 a, float_status *status)
4344 flag aSign, zSign;
4345 int aExp;
4346 uint64_t aSig, aSig0, aSig1, zSig, i;
4347 a = float64_squash_input_denormal(a, status);
4349 aSig = extractFloat64Frac( a );
4350 aExp = extractFloat64Exp( a );
4351 aSign = extractFloat64Sign( a );
4353 if ( aExp == 0 ) {
4354 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4355 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4357 if ( aSign ) {
4358 float_raise(float_flag_invalid, status);
4359 return float64_default_nan(status);
4361 if ( aExp == 0x7FF ) {
4362 if (aSig) {
4363 return propagateFloat64NaN(a, float64_zero, status);
4365 return a;
4368 aExp -= 0x3FF;
4369 aSig |= LIT64( 0x0010000000000000 );
4370 zSign = aExp < 0;
4371 zSig = (uint64_t)aExp << 52;
4372 for (i = 1LL << 51; i > 0; i >>= 1) {
4373 mul64To128( aSig, aSig, &aSig0, &aSig1 );
4374 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4375 if ( aSig & LIT64( 0x0020000000000000 ) ) {
4376 aSig >>= 1;
4377 zSig |= i;
4381 if ( zSign )
4382 zSig = -zSig;
4383 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
4386 /*----------------------------------------------------------------------------
4387 | Returns 1 if the double-precision floating-point value `a' is equal to the
4388 | corresponding value `b', and 0 otherwise. The invalid exception is raised
4389 | if either operand is a NaN. Otherwise, the comparison is performed
4390 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4391 *----------------------------------------------------------------------------*/
4393 int float64_eq(float64 a, float64 b, float_status *status)
4395 uint64_t av, bv;
4396 a = float64_squash_input_denormal(a, status);
4397 b = float64_squash_input_denormal(b, status);
4399 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4400 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4402 float_raise(float_flag_invalid, status);
4403 return 0;
4405 av = float64_val(a);
4406 bv = float64_val(b);
4407 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4411 /*----------------------------------------------------------------------------
4412 | Returns 1 if the double-precision floating-point value `a' is less than or
4413 | equal to the corresponding value `b', and 0 otherwise. The invalid
4414 | exception is raised if either operand is a NaN. The comparison is performed
4415 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4416 *----------------------------------------------------------------------------*/
4418 int float64_le(float64 a, float64 b, float_status *status)
4420 flag aSign, bSign;
4421 uint64_t av, bv;
4422 a = float64_squash_input_denormal(a, status);
4423 b = float64_squash_input_denormal(b, status);
4425 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4426 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4428 float_raise(float_flag_invalid, status);
4429 return 0;
4431 aSign = extractFloat64Sign( a );
4432 bSign = extractFloat64Sign( b );
4433 av = float64_val(a);
4434 bv = float64_val(b);
4435 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4436 return ( av == bv ) || ( aSign ^ ( av < bv ) );
4440 /*----------------------------------------------------------------------------
4441 | Returns 1 if the double-precision floating-point value `a' is less than
4442 | the corresponding value `b', and 0 otherwise. The invalid exception is
4443 | raised if either operand is a NaN. The comparison is performed according
4444 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4445 *----------------------------------------------------------------------------*/
4447 int float64_lt(float64 a, float64 b, float_status *status)
4449 flag aSign, bSign;
4450 uint64_t av, bv;
4452 a = float64_squash_input_denormal(a, status);
4453 b = float64_squash_input_denormal(b, status);
4454 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4455 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4457 float_raise(float_flag_invalid, status);
4458 return 0;
4460 aSign = extractFloat64Sign( a );
4461 bSign = extractFloat64Sign( b );
4462 av = float64_val(a);
4463 bv = float64_val(b);
4464 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4465 return ( av != bv ) && ( aSign ^ ( av < bv ) );
4469 /*----------------------------------------------------------------------------
4470 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4471 | be compared, and 0 otherwise. The invalid exception is raised if either
4472 | operand is a NaN. The comparison is performed according to the IEC/IEEE
4473 | Standard for Binary Floating-Point Arithmetic.
4474 *----------------------------------------------------------------------------*/
4476 int float64_unordered(float64 a, float64 b, float_status *status)
4478 a = float64_squash_input_denormal(a, status);
4479 b = float64_squash_input_denormal(b, status);
4481 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4482 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4484 float_raise(float_flag_invalid, status);
4485 return 1;
4487 return 0;
4490 /*----------------------------------------------------------------------------
4491 | Returns 1 if the double-precision floating-point value `a' is equal to the
4492 | corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4493 | exception.The comparison is performed according to the IEC/IEEE Standard
4494 | for Binary Floating-Point Arithmetic.
4495 *----------------------------------------------------------------------------*/
4497 int float64_eq_quiet(float64 a, float64 b, float_status *status)
4499 uint64_t av, bv;
4500 a = float64_squash_input_denormal(a, status);
4501 b = float64_squash_input_denormal(b, status);
4503 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4504 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4506 if (float64_is_signaling_nan(a, status)
4507 || float64_is_signaling_nan(b, status)) {
4508 float_raise(float_flag_invalid, status);
4510 return 0;
4512 av = float64_val(a);
4513 bv = float64_val(b);
4514 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4518 /*----------------------------------------------------------------------------
4519 | Returns 1 if the double-precision floating-point value `a' is less than or
4520 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4521 | cause an exception. Otherwise, the comparison is performed according to the
4522 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4523 *----------------------------------------------------------------------------*/
4525 int float64_le_quiet(float64 a, float64 b, float_status *status)
4527 flag aSign, bSign;
4528 uint64_t av, bv;
4529 a = float64_squash_input_denormal(a, status);
4530 b = float64_squash_input_denormal(b, status);
4532 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4533 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4535 if (float64_is_signaling_nan(a, status)
4536 || float64_is_signaling_nan(b, status)) {
4537 float_raise(float_flag_invalid, status);
4539 return 0;
4541 aSign = extractFloat64Sign( a );
4542 bSign = extractFloat64Sign( b );
4543 av = float64_val(a);
4544 bv = float64_val(b);
4545 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4546 return ( av == bv ) || ( aSign ^ ( av < bv ) );
4550 /*----------------------------------------------------------------------------
4551 | Returns 1 if the double-precision floating-point value `a' is less than
4552 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4553 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
4554 | Standard for Binary Floating-Point Arithmetic.
4555 *----------------------------------------------------------------------------*/
4557 int float64_lt_quiet(float64 a, float64 b, float_status *status)
4559 flag aSign, bSign;
4560 uint64_t av, bv;
4561 a = float64_squash_input_denormal(a, status);
4562 b = float64_squash_input_denormal(b, status);
4564 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4565 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4567 if (float64_is_signaling_nan(a, status)
4568 || float64_is_signaling_nan(b, status)) {
4569 float_raise(float_flag_invalid, status);
4571 return 0;
4573 aSign = extractFloat64Sign( a );
4574 bSign = extractFloat64Sign( b );
4575 av = float64_val(a);
4576 bv = float64_val(b);
4577 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4578 return ( av != bv ) && ( aSign ^ ( av < bv ) );
4582 /*----------------------------------------------------------------------------
4583 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4584 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
4585 | comparison is performed according to the IEC/IEEE Standard for Binary
4586 | Floating-Point Arithmetic.
4587 *----------------------------------------------------------------------------*/
4589 int float64_unordered_quiet(float64 a, float64 b, float_status *status)
4591 a = float64_squash_input_denormal(a, status);
4592 b = float64_squash_input_denormal(b, status);
4594 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4595 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4597 if (float64_is_signaling_nan(a, status)
4598 || float64_is_signaling_nan(b, status)) {
4599 float_raise(float_flag_invalid, status);
4601 return 1;
4603 return 0;
4606 /*----------------------------------------------------------------------------
4607 | Returns the result of converting the extended double-precision floating-
4608 | point value `a' to the 32-bit two's complement integer format. The
4609 | conversion is performed according to the IEC/IEEE Standard for Binary
4610 | Floating-Point Arithmetic---which means in particular that the conversion
4611 | is rounded according to the current rounding mode. If `a' is a NaN, the
4612 | largest positive integer is returned. Otherwise, if the conversion
4613 | overflows, the largest integer with the same sign as `a' is returned.
4614 *----------------------------------------------------------------------------*/
4616 int32_t floatx80_to_int32(floatx80 a, float_status *status)
4618 flag aSign;
4619 int32_t aExp, shiftCount;
4620 uint64_t aSig;
4622 if (floatx80_invalid_encoding(a)) {
4623 float_raise(float_flag_invalid, status);
4624 return 1 << 31;
4626 aSig = extractFloatx80Frac( a );
4627 aExp = extractFloatx80Exp( a );
4628 aSign = extractFloatx80Sign( a );
4629 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4630 shiftCount = 0x4037 - aExp;
4631 if ( shiftCount <= 0 ) shiftCount = 1;
4632 shift64RightJamming( aSig, shiftCount, &aSig );
4633 return roundAndPackInt32(aSign, aSig, status);
4637 /*----------------------------------------------------------------------------
4638 | Returns the result of converting the extended double-precision floating-
4639 | point value `a' to the 32-bit two's complement integer format. The
4640 | conversion is performed according to the IEC/IEEE Standard for Binary
4641 | Floating-Point Arithmetic, except that the conversion is always rounded
4642 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4643 | Otherwise, if the conversion overflows, the largest integer with the same
4644 | sign as `a' is returned.
4645 *----------------------------------------------------------------------------*/
4647 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
4649 flag aSign;
4650 int32_t aExp, shiftCount;
4651 uint64_t aSig, savedASig;
4652 int32_t z;
4654 if (floatx80_invalid_encoding(a)) {
4655 float_raise(float_flag_invalid, status);
4656 return 1 << 31;
4658 aSig = extractFloatx80Frac( a );
4659 aExp = extractFloatx80Exp( a );
4660 aSign = extractFloatx80Sign( a );
4661 if ( 0x401E < aExp ) {
4662 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4663 goto invalid;
4665 else if ( aExp < 0x3FFF ) {
4666 if (aExp || aSig) {
4667 status->float_exception_flags |= float_flag_inexact;
4669 return 0;
4671 shiftCount = 0x403E - aExp;
4672 savedASig = aSig;
4673 aSig >>= shiftCount;
4674 z = aSig;
4675 if ( aSign ) z = - z;
4676 if ( ( z < 0 ) ^ aSign ) {
4677 invalid:
4678 float_raise(float_flag_invalid, status);
4679 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4681 if ( ( aSig<<shiftCount ) != savedASig ) {
4682 status->float_exception_flags |= float_flag_inexact;
4684 return z;
4688 /*----------------------------------------------------------------------------
4689 | Returns the result of converting the extended double-precision floating-
4690 | point value `a' to the 64-bit two's complement integer format. The
4691 | conversion is performed according to the IEC/IEEE Standard for Binary
4692 | Floating-Point Arithmetic---which means in particular that the conversion
4693 | is rounded according to the current rounding mode. If `a' is a NaN,
4694 | the largest positive integer is returned. Otherwise, if the conversion
4695 | overflows, the largest integer with the same sign as `a' is returned.
4696 *----------------------------------------------------------------------------*/
4698 int64_t floatx80_to_int64(floatx80 a, float_status *status)
4700 flag aSign;
4701 int32_t aExp, shiftCount;
4702 uint64_t aSig, aSigExtra;
4704 if (floatx80_invalid_encoding(a)) {
4705 float_raise(float_flag_invalid, status);
4706 return 1ULL << 63;
4708 aSig = extractFloatx80Frac( a );
4709 aExp = extractFloatx80Exp( a );
4710 aSign = extractFloatx80Sign( a );
4711 shiftCount = 0x403E - aExp;
4712 if ( shiftCount <= 0 ) {
4713 if ( shiftCount ) {
4714 float_raise(float_flag_invalid, status);
4715 if (!aSign || floatx80_is_any_nan(a)) {
4716 return LIT64( 0x7FFFFFFFFFFFFFFF );
4718 return (int64_t) LIT64( 0x8000000000000000 );
4720 aSigExtra = 0;
4722 else {
4723 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4725 return roundAndPackInt64(aSign, aSig, aSigExtra, status);
4729 /*----------------------------------------------------------------------------
4730 | Returns the result of converting the extended double-precision floating-
4731 | point value `a' to the 64-bit two's complement integer format. The
4732 | conversion is performed according to the IEC/IEEE Standard for Binary
4733 | Floating-Point Arithmetic, except that the conversion is always rounded
4734 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4735 | Otherwise, if the conversion overflows, the largest integer with the same
4736 | sign as `a' is returned.
4737 *----------------------------------------------------------------------------*/
4739 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
4741 flag aSign;
4742 int32_t aExp, shiftCount;
4743 uint64_t aSig;
4744 int64_t z;
4746 if (floatx80_invalid_encoding(a)) {
4747 float_raise(float_flag_invalid, status);
4748 return 1ULL << 63;
4750 aSig = extractFloatx80Frac( a );
4751 aExp = extractFloatx80Exp( a );
4752 aSign = extractFloatx80Sign( a );
4753 shiftCount = aExp - 0x403E;
4754 if ( 0 <= shiftCount ) {
4755 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4756 if ( ( a.high != 0xC03E ) || aSig ) {
4757 float_raise(float_flag_invalid, status);
4758 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4759 return LIT64( 0x7FFFFFFFFFFFFFFF );
4762 return (int64_t) LIT64( 0x8000000000000000 );
4764 else if ( aExp < 0x3FFF ) {
4765 if (aExp | aSig) {
4766 status->float_exception_flags |= float_flag_inexact;
4768 return 0;
4770 z = aSig>>( - shiftCount );
4771 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4772 status->float_exception_flags |= float_flag_inexact;
4774 if ( aSign ) z = - z;
4775 return z;
4779 /*----------------------------------------------------------------------------
4780 | Returns the result of converting the extended double-precision floating-
4781 | point value `a' to the single-precision floating-point format. The
4782 | conversion is performed according to the IEC/IEEE Standard for Binary
4783 | Floating-Point Arithmetic.
4784 *----------------------------------------------------------------------------*/
4786 float32 floatx80_to_float32(floatx80 a, float_status *status)
4788 flag aSign;
4789 int32_t aExp;
4790 uint64_t aSig;
4792 if (floatx80_invalid_encoding(a)) {
4793 float_raise(float_flag_invalid, status);
4794 return float32_default_nan(status);
4796 aSig = extractFloatx80Frac( a );
4797 aExp = extractFloatx80Exp( a );
4798 aSign = extractFloatx80Sign( a );
4799 if ( aExp == 0x7FFF ) {
4800 if ( (uint64_t) ( aSig<<1 ) ) {
4801 return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
4803 return packFloat32( aSign, 0xFF, 0 );
4805 shift64RightJamming( aSig, 33, &aSig );
4806 if ( aExp || aSig ) aExp -= 0x3F81;
4807 return roundAndPackFloat32(aSign, aExp, aSig, status);
4811 /*----------------------------------------------------------------------------
4812 | Returns the result of converting the extended double-precision floating-
4813 | point value `a' to the double-precision floating-point format. The
4814 | conversion is performed according to the IEC/IEEE Standard for Binary
4815 | Floating-Point Arithmetic.
4816 *----------------------------------------------------------------------------*/
4818 float64 floatx80_to_float64(floatx80 a, float_status *status)
4820 flag aSign;
4821 int32_t aExp;
4822 uint64_t aSig, zSig;
4824 if (floatx80_invalid_encoding(a)) {
4825 float_raise(float_flag_invalid, status);
4826 return float64_default_nan(status);
4828 aSig = extractFloatx80Frac( a );
4829 aExp = extractFloatx80Exp( a );
4830 aSign = extractFloatx80Sign( a );
4831 if ( aExp == 0x7FFF ) {
4832 if ( (uint64_t) ( aSig<<1 ) ) {
4833 return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
4835 return packFloat64( aSign, 0x7FF, 0 );
4837 shift64RightJamming( aSig, 1, &zSig );
4838 if ( aExp || aSig ) aExp -= 0x3C01;
4839 return roundAndPackFloat64(aSign, aExp, zSig, status);
4843 /*----------------------------------------------------------------------------
4844 | Returns the result of converting the extended double-precision floating-
4845 | point value `a' to the quadruple-precision floating-point format. The
4846 | conversion is performed according to the IEC/IEEE Standard for Binary
4847 | Floating-Point Arithmetic.
4848 *----------------------------------------------------------------------------*/
4850 float128 floatx80_to_float128(floatx80 a, float_status *status)
4852 flag aSign;
4853 int aExp;
4854 uint64_t aSig, zSig0, zSig1;
4856 if (floatx80_invalid_encoding(a)) {
4857 float_raise(float_flag_invalid, status);
4858 return float128_default_nan(status);
4860 aSig = extractFloatx80Frac( a );
4861 aExp = extractFloatx80Exp( a );
4862 aSign = extractFloatx80Sign( a );
4863 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4864 return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
4866 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4867 return packFloat128( aSign, aExp, zSig0, zSig1 );
4871 /*----------------------------------------------------------------------------
4872 | Rounds the extended double-precision floating-point value `a'
4873 | to the precision provided by floatx80_rounding_precision and returns the
4874 | result as an extended double-precision floating-point value.
4875 | The operation is performed according to the IEC/IEEE Standard for Binary
4876 | Floating-Point Arithmetic.
4877 *----------------------------------------------------------------------------*/
4879 floatx80 floatx80_round(floatx80 a, float_status *status)
4881 return roundAndPackFloatx80(status->floatx80_rounding_precision,
4882 extractFloatx80Sign(a),
4883 extractFloatx80Exp(a),
4884 extractFloatx80Frac(a), 0, status);
4887 /*----------------------------------------------------------------------------
4888 | Rounds the extended double-precision floating-point value `a' to an integer,
4889 | and returns the result as an extended quadruple-precision floating-point
4890 | value. The operation is performed according to the IEC/IEEE Standard for
4891 | Binary Floating-Point Arithmetic.
4892 *----------------------------------------------------------------------------*/
4894 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
4896 flag aSign;
4897 int32_t aExp;
4898 uint64_t lastBitMask, roundBitsMask;
4899 floatx80 z;
4901 if (floatx80_invalid_encoding(a)) {
4902 float_raise(float_flag_invalid, status);
4903 return floatx80_default_nan(status);
4905 aExp = extractFloatx80Exp( a );
4906 if ( 0x403E <= aExp ) {
4907 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4908 return propagateFloatx80NaN(a, a, status);
4910 return a;
4912 if ( aExp < 0x3FFF ) {
4913 if ( ( aExp == 0 )
4914 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4915 return a;
4917 status->float_exception_flags |= float_flag_inexact;
4918 aSign = extractFloatx80Sign( a );
4919 switch (status->float_rounding_mode) {
4920 case float_round_nearest_even:
4921 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4923 return
4924 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4926 break;
4927 case float_round_ties_away:
4928 if (aExp == 0x3FFE) {
4929 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
4931 break;
4932 case float_round_down:
4933 return
4934 aSign ?
4935 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4936 : packFloatx80( 0, 0, 0 );
4937 case float_round_up:
4938 return
4939 aSign ? packFloatx80( 1, 0, 0 )
4940 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4942 return packFloatx80( aSign, 0, 0 );
4944 lastBitMask = 1;
4945 lastBitMask <<= 0x403E - aExp;
4946 roundBitsMask = lastBitMask - 1;
4947 z = a;
4948 switch (status->float_rounding_mode) {
4949 case float_round_nearest_even:
4950 z.low += lastBitMask>>1;
4951 if ((z.low & roundBitsMask) == 0) {
4952 z.low &= ~lastBitMask;
4954 break;
4955 case float_round_ties_away:
4956 z.low += lastBitMask >> 1;
4957 break;
4958 case float_round_to_zero:
4959 break;
4960 case float_round_up:
4961 if (!extractFloatx80Sign(z)) {
4962 z.low += roundBitsMask;
4964 break;
4965 case float_round_down:
4966 if (extractFloatx80Sign(z)) {
4967 z.low += roundBitsMask;
4969 break;
4970 default:
4971 abort();
4973 z.low &= ~ roundBitsMask;
4974 if ( z.low == 0 ) {
4975 ++z.high;
4976 z.low = LIT64( 0x8000000000000000 );
4978 if (z.low != a.low) {
4979 status->float_exception_flags |= float_flag_inexact;
4981 return z;
4985 /*----------------------------------------------------------------------------
4986 | Returns the result of adding the absolute values of the extended double-
4987 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
4988 | negated before being returned. `zSign' is ignored if the result is a NaN.
4989 | The addition is performed according to the IEC/IEEE Standard for Binary
4990 | Floating-Point Arithmetic.
4991 *----------------------------------------------------------------------------*/
4993 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4994 float_status *status)
4996 int32_t aExp, bExp, zExp;
4997 uint64_t aSig, bSig, zSig0, zSig1;
4998 int32_t expDiff;
5000 aSig = extractFloatx80Frac( a );
5001 aExp = extractFloatx80Exp( a );
5002 bSig = extractFloatx80Frac( b );
5003 bExp = extractFloatx80Exp( b );
5004 expDiff = aExp - bExp;
5005 if ( 0 < expDiff ) {
5006 if ( aExp == 0x7FFF ) {
5007 if ((uint64_t)(aSig << 1)) {
5008 return propagateFloatx80NaN(a, b, status);
5010 return a;
5012 if ( bExp == 0 ) --expDiff;
5013 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5014 zExp = aExp;
5016 else if ( expDiff < 0 ) {
5017 if ( bExp == 0x7FFF ) {
5018 if ((uint64_t)(bSig << 1)) {
5019 return propagateFloatx80NaN(a, b, status);
5021 return packFloatx80(zSign,
5022 floatx80_infinity_high,
5023 floatx80_infinity_low);
5025 if ( aExp == 0 ) ++expDiff;
5026 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5027 zExp = bExp;
5029 else {
5030 if ( aExp == 0x7FFF ) {
5031 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5032 return propagateFloatx80NaN(a, b, status);
5034 return a;
5036 zSig1 = 0;
5037 zSig0 = aSig + bSig;
5038 if ( aExp == 0 ) {
5039 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
5040 goto roundAndPack;
5042 zExp = aExp;
5043 goto shiftRight1;
5045 zSig0 = aSig + bSig;
5046 if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
5047 shiftRight1:
5048 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
5049 zSig0 |= LIT64( 0x8000000000000000 );
5050 ++zExp;
5051 roundAndPack:
5052 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5053 zSign, zExp, zSig0, zSig1, status);
5056 /*----------------------------------------------------------------------------
5057 | Returns the result of subtracting the absolute values of the extended
5058 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
5059 | difference is negated before being returned. `zSign' is ignored if the
5060 | result is a NaN. The subtraction is performed according to the IEC/IEEE
5061 | Standard for Binary Floating-Point Arithmetic.
5062 *----------------------------------------------------------------------------*/
5064 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5065 float_status *status)
5067 int32_t aExp, bExp, zExp;
5068 uint64_t aSig, bSig, zSig0, zSig1;
5069 int32_t expDiff;
5071 aSig = extractFloatx80Frac( a );
5072 aExp = extractFloatx80Exp( a );
5073 bSig = extractFloatx80Frac( b );
5074 bExp = extractFloatx80Exp( b );
5075 expDiff = aExp - bExp;
5076 if ( 0 < expDiff ) goto aExpBigger;
5077 if ( expDiff < 0 ) goto bExpBigger;
5078 if ( aExp == 0x7FFF ) {
5079 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5080 return propagateFloatx80NaN(a, b, status);
5082 float_raise(float_flag_invalid, status);
5083 return floatx80_default_nan(status);
5085 if ( aExp == 0 ) {
5086 aExp = 1;
5087 bExp = 1;
5089 zSig1 = 0;
5090 if ( bSig < aSig ) goto aBigger;
5091 if ( aSig < bSig ) goto bBigger;
5092 return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
5093 bExpBigger:
5094 if ( bExp == 0x7FFF ) {
5095 if ((uint64_t)(bSig << 1)) {
5096 return propagateFloatx80NaN(a, b, status);
5098 return packFloatx80(zSign ^ 1, floatx80_infinity_high,
5099 floatx80_infinity_low);
5101 if ( aExp == 0 ) ++expDiff;
5102 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5103 bBigger:
5104 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
5105 zExp = bExp;
5106 zSign ^= 1;
5107 goto normalizeRoundAndPack;
5108 aExpBigger:
5109 if ( aExp == 0x7FFF ) {
5110 if ((uint64_t)(aSig << 1)) {
5111 return propagateFloatx80NaN(a, b, status);
5113 return a;
5115 if ( bExp == 0 ) --expDiff;
5116 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5117 aBigger:
5118 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
5119 zExp = aExp;
5120 normalizeRoundAndPack:
5121 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
5122 zSign, zExp, zSig0, zSig1, status);
5125 /*----------------------------------------------------------------------------
5126 | Returns the result of adding the extended double-precision floating-point
5127 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5128 | Standard for Binary Floating-Point Arithmetic.
5129 *----------------------------------------------------------------------------*/
5131 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
5133 flag aSign, bSign;
5135 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5136 float_raise(float_flag_invalid, status);
5137 return floatx80_default_nan(status);
5139 aSign = extractFloatx80Sign( a );
5140 bSign = extractFloatx80Sign( b );
5141 if ( aSign == bSign ) {
5142 return addFloatx80Sigs(a, b, aSign, status);
5144 else {
5145 return subFloatx80Sigs(a, b, aSign, status);
5150 /*----------------------------------------------------------------------------
5151 | Returns the result of subtracting the extended double-precision floating-
5152 | point values `a' and `b'. The operation is performed according to the
5153 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5154 *----------------------------------------------------------------------------*/
5156 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5158 flag aSign, bSign;
5160 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5161 float_raise(float_flag_invalid, status);
5162 return floatx80_default_nan(status);
5164 aSign = extractFloatx80Sign( a );
5165 bSign = extractFloatx80Sign( b );
5166 if ( aSign == bSign ) {
5167 return subFloatx80Sigs(a, b, aSign, status);
5169 else {
5170 return addFloatx80Sigs(a, b, aSign, status);
5175 /*----------------------------------------------------------------------------
5176 | Returns the result of multiplying the extended double-precision floating-
5177 | point values `a' and `b'. The operation is performed according to the
5178 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5179 *----------------------------------------------------------------------------*/
5181 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
5183 flag aSign, bSign, zSign;
5184 int32_t aExp, bExp, zExp;
5185 uint64_t aSig, bSig, zSig0, zSig1;
5187 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5188 float_raise(float_flag_invalid, status);
5189 return floatx80_default_nan(status);
5191 aSig = extractFloatx80Frac( a );
5192 aExp = extractFloatx80Exp( a );
5193 aSign = extractFloatx80Sign( a );
5194 bSig = extractFloatx80Frac( b );
5195 bExp = extractFloatx80Exp( b );
5196 bSign = extractFloatx80Sign( b );
5197 zSign = aSign ^ bSign;
5198 if ( aExp == 0x7FFF ) {
5199 if ( (uint64_t) ( aSig<<1 )
5200 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5201 return propagateFloatx80NaN(a, b, status);
5203 if ( ( bExp | bSig ) == 0 ) goto invalid;
5204 return packFloatx80(zSign, floatx80_infinity_high,
5205 floatx80_infinity_low);
5207 if ( bExp == 0x7FFF ) {
5208 if ((uint64_t)(bSig << 1)) {
5209 return propagateFloatx80NaN(a, b, status);
5211 if ( ( aExp | aSig ) == 0 ) {
5212 invalid:
5213 float_raise(float_flag_invalid, status);
5214 return floatx80_default_nan(status);
5216 return packFloatx80(zSign, floatx80_infinity_high,
5217 floatx80_infinity_low);
5219 if ( aExp == 0 ) {
5220 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5221 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5223 if ( bExp == 0 ) {
5224 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5225 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5227 zExp = aExp + bExp - 0x3FFE;
5228 mul64To128( aSig, bSig, &zSig0, &zSig1 );
5229 if ( 0 < (int64_t) zSig0 ) {
5230 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5231 --zExp;
5233 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5234 zSign, zExp, zSig0, zSig1, status);
5237 /*----------------------------------------------------------------------------
5238 | Returns the result of dividing the extended double-precision floating-point
5239 | value `a' by the corresponding value `b'. The operation is performed
5240 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5241 *----------------------------------------------------------------------------*/
5243 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
5245 flag aSign, bSign, zSign;
5246 int32_t aExp, bExp, zExp;
5247 uint64_t aSig, bSig, zSig0, zSig1;
5248 uint64_t rem0, rem1, rem2, term0, term1, term2;
5250 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5251 float_raise(float_flag_invalid, status);
5252 return floatx80_default_nan(status);
5254 aSig = extractFloatx80Frac( a );
5255 aExp = extractFloatx80Exp( a );
5256 aSign = extractFloatx80Sign( a );
5257 bSig = extractFloatx80Frac( b );
5258 bExp = extractFloatx80Exp( b );
5259 bSign = extractFloatx80Sign( b );
5260 zSign = aSign ^ bSign;
5261 if ( aExp == 0x7FFF ) {
5262 if ((uint64_t)(aSig << 1)) {
5263 return propagateFloatx80NaN(a, b, status);
5265 if ( bExp == 0x7FFF ) {
5266 if ((uint64_t)(bSig << 1)) {
5267 return propagateFloatx80NaN(a, b, status);
5269 goto invalid;
5271 return packFloatx80(zSign, floatx80_infinity_high,
5272 floatx80_infinity_low);
5274 if ( bExp == 0x7FFF ) {
5275 if ((uint64_t)(bSig << 1)) {
5276 return propagateFloatx80NaN(a, b, status);
5278 return packFloatx80( zSign, 0, 0 );
5280 if ( bExp == 0 ) {
5281 if ( bSig == 0 ) {
5282 if ( ( aExp | aSig ) == 0 ) {
5283 invalid:
5284 float_raise(float_flag_invalid, status);
5285 return floatx80_default_nan(status);
5287 float_raise(float_flag_divbyzero, status);
5288 return packFloatx80(zSign, floatx80_infinity_high,
5289 floatx80_infinity_low);
5291 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5293 if ( aExp == 0 ) {
5294 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5295 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5297 zExp = aExp - bExp + 0x3FFE;
5298 rem1 = 0;
5299 if ( bSig <= aSig ) {
5300 shift128Right( aSig, 0, 1, &aSig, &rem1 );
5301 ++zExp;
5303 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5304 mul64To128( bSig, zSig0, &term0, &term1 );
5305 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
5306 while ( (int64_t) rem0 < 0 ) {
5307 --zSig0;
5308 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5310 zSig1 = estimateDiv128To64( rem1, 0, bSig );
5311 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
5312 mul64To128( bSig, zSig1, &term1, &term2 );
5313 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5314 while ( (int64_t) rem1 < 0 ) {
5315 --zSig1;
5316 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5318 zSig1 |= ( ( rem1 | rem2 ) != 0 );
5320 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5321 zSign, zExp, zSig0, zSig1, status);
5324 /*----------------------------------------------------------------------------
5325 | Returns the remainder of the extended double-precision floating-point value
5326 | `a' with respect to the corresponding value `b'. The operation is performed
5327 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5328 *----------------------------------------------------------------------------*/
5330 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
5332 flag aSign, zSign;
5333 int32_t aExp, bExp, expDiff;
5334 uint64_t aSig0, aSig1, bSig;
5335 uint64_t q, term0, term1, alternateASig0, alternateASig1;
5337 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5338 float_raise(float_flag_invalid, status);
5339 return floatx80_default_nan(status);
5341 aSig0 = extractFloatx80Frac( a );
5342 aExp = extractFloatx80Exp( a );
5343 aSign = extractFloatx80Sign( a );
5344 bSig = extractFloatx80Frac( b );
5345 bExp = extractFloatx80Exp( b );
5346 if ( aExp == 0x7FFF ) {
5347 if ( (uint64_t) ( aSig0<<1 )
5348 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5349 return propagateFloatx80NaN(a, b, status);
5351 goto invalid;
5353 if ( bExp == 0x7FFF ) {
5354 if ((uint64_t)(bSig << 1)) {
5355 return propagateFloatx80NaN(a, b, status);
5357 return a;
5359 if ( bExp == 0 ) {
5360 if ( bSig == 0 ) {
5361 invalid:
5362 float_raise(float_flag_invalid, status);
5363 return floatx80_default_nan(status);
5365 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5367 if ( aExp == 0 ) {
5368 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5369 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5371 bSig |= LIT64( 0x8000000000000000 );
5372 zSign = aSign;
5373 expDiff = aExp - bExp;
5374 aSig1 = 0;
5375 if ( expDiff < 0 ) {
5376 if ( expDiff < -1 ) return a;
5377 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5378 expDiff = 0;
5380 q = ( bSig <= aSig0 );
5381 if ( q ) aSig0 -= bSig;
5382 expDiff -= 64;
5383 while ( 0 < expDiff ) {
5384 q = estimateDiv128To64( aSig0, aSig1, bSig );
5385 q = ( 2 < q ) ? q - 2 : 0;
5386 mul64To128( bSig, q, &term0, &term1 );
5387 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5388 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5389 expDiff -= 62;
5391 expDiff += 64;
5392 if ( 0 < expDiff ) {
5393 q = estimateDiv128To64( aSig0, aSig1, bSig );
5394 q = ( 2 < q ) ? q - 2 : 0;
5395 q >>= 64 - expDiff;
5396 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5397 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5398 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5399 while ( le128( term0, term1, aSig0, aSig1 ) ) {
5400 ++q;
5401 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5404 else {
5405 term1 = 0;
5406 term0 = bSig;
5408 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5409 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5410 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5411 && ( q & 1 ) )
5413 aSig0 = alternateASig0;
5414 aSig1 = alternateASig1;
5415 zSign = ! zSign;
5417 return
5418 normalizeRoundAndPackFloatx80(
5419 80, zSign, bExp + expDiff, aSig0, aSig1, status);
5423 /*----------------------------------------------------------------------------
5424 | Returns the square root of the extended double-precision floating-point
5425 | value `a'. The operation is performed according to the IEC/IEEE Standard
5426 | for Binary Floating-Point Arithmetic.
5427 *----------------------------------------------------------------------------*/
5429 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
5431 flag aSign;
5432 int32_t aExp, zExp;
5433 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5434 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5436 if (floatx80_invalid_encoding(a)) {
5437 float_raise(float_flag_invalid, status);
5438 return floatx80_default_nan(status);
5440 aSig0 = extractFloatx80Frac( a );
5441 aExp = extractFloatx80Exp( a );
5442 aSign = extractFloatx80Sign( a );
5443 if ( aExp == 0x7FFF ) {
5444 if ((uint64_t)(aSig0 << 1)) {
5445 return propagateFloatx80NaN(a, a, status);
5447 if ( ! aSign ) return a;
5448 goto invalid;
5450 if ( aSign ) {
5451 if ( ( aExp | aSig0 ) == 0 ) return a;
5452 invalid:
5453 float_raise(float_flag_invalid, status);
5454 return floatx80_default_nan(status);
5456 if ( aExp == 0 ) {
5457 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5458 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5460 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5461 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5462 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5463 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5464 doubleZSig0 = zSig0<<1;
5465 mul64To128( zSig0, zSig0, &term0, &term1 );
5466 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5467 while ( (int64_t) rem0 < 0 ) {
5468 --zSig0;
5469 doubleZSig0 -= 2;
5470 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5472 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5473 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5474 if ( zSig1 == 0 ) zSig1 = 1;
5475 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5476 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5477 mul64To128( zSig1, zSig1, &term2, &term3 );
5478 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5479 while ( (int64_t) rem1 < 0 ) {
5480 --zSig1;
5481 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5482 term3 |= 1;
5483 term2 |= doubleZSig0;
5484 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5486 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5488 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5489 zSig0 |= doubleZSig0;
5490 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5491 0, zExp, zSig0, zSig1, status);
5494 /*----------------------------------------------------------------------------
5495 | Returns 1 if the extended double-precision floating-point value `a' is equal
5496 | to the corresponding value `b', and 0 otherwise. The invalid exception is
5497 | raised if either operand is a NaN. Otherwise, the comparison is performed
5498 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5499 *----------------------------------------------------------------------------*/
5501 int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
5504 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5505 || (extractFloatx80Exp(a) == 0x7FFF
5506 && (uint64_t) (extractFloatx80Frac(a) << 1))
5507 || (extractFloatx80Exp(b) == 0x7FFF
5508 && (uint64_t) (extractFloatx80Frac(b) << 1))
5510 float_raise(float_flag_invalid, status);
5511 return 0;
5513 return
5514 ( a.low == b.low )
5515 && ( ( a.high == b.high )
5516 || ( ( a.low == 0 )
5517 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5522 /*----------------------------------------------------------------------------
5523 | Returns 1 if the extended double-precision floating-point value `a' is
5524 | less than or equal to the corresponding value `b', and 0 otherwise. The
5525 | invalid exception is raised if either operand is a NaN. The comparison is
5526 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5527 | Arithmetic.
5528 *----------------------------------------------------------------------------*/
5530 int floatx80_le(floatx80 a, floatx80 b, float_status *status)
5532 flag aSign, bSign;
5534 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5535 || (extractFloatx80Exp(a) == 0x7FFF
5536 && (uint64_t) (extractFloatx80Frac(a) << 1))
5537 || (extractFloatx80Exp(b) == 0x7FFF
5538 && (uint64_t) (extractFloatx80Frac(b) << 1))
5540 float_raise(float_flag_invalid, status);
5541 return 0;
5543 aSign = extractFloatx80Sign( a );
5544 bSign = extractFloatx80Sign( b );
5545 if ( aSign != bSign ) {
5546 return
5547 aSign
5548 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5549 == 0 );
5551 return
5552 aSign ? le128( b.high, b.low, a.high, a.low )
5553 : le128( a.high, a.low, b.high, b.low );
5557 /*----------------------------------------------------------------------------
5558 | Returns 1 if the extended double-precision floating-point value `a' is
5559 | less than the corresponding value `b', and 0 otherwise. The invalid
5560 | exception is raised if either operand is a NaN. The comparison is performed
5561 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5562 *----------------------------------------------------------------------------*/
5564 int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
5566 flag aSign, bSign;
5568 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5569 || (extractFloatx80Exp(a) == 0x7FFF
5570 && (uint64_t) (extractFloatx80Frac(a) << 1))
5571 || (extractFloatx80Exp(b) == 0x7FFF
5572 && (uint64_t) (extractFloatx80Frac(b) << 1))
5574 float_raise(float_flag_invalid, status);
5575 return 0;
5577 aSign = extractFloatx80Sign( a );
5578 bSign = extractFloatx80Sign( b );
5579 if ( aSign != bSign ) {
5580 return
5581 aSign
5582 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5583 != 0 );
5585 return
5586 aSign ? lt128( b.high, b.low, a.high, a.low )
5587 : lt128( a.high, a.low, b.high, b.low );
5591 /*----------------------------------------------------------------------------
5592 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5593 | cannot be compared, and 0 otherwise. The invalid exception is raised if
5594 | either operand is a NaN. The comparison is performed according to the
5595 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5596 *----------------------------------------------------------------------------*/
5597 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
5599 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5600 || (extractFloatx80Exp(a) == 0x7FFF
5601 && (uint64_t) (extractFloatx80Frac(a) << 1))
5602 || (extractFloatx80Exp(b) == 0x7FFF
5603 && (uint64_t) (extractFloatx80Frac(b) << 1))
5605 float_raise(float_flag_invalid, status);
5606 return 1;
5608 return 0;
5611 /*----------------------------------------------------------------------------
5612 | Returns 1 if the extended double-precision floating-point value `a' is
5613 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5614 | cause an exception. The comparison is performed according to the IEC/IEEE
5615 | Standard for Binary Floating-Point Arithmetic.
5616 *----------------------------------------------------------------------------*/
5618 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
5621 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5622 float_raise(float_flag_invalid, status);
5623 return 0;
5625 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5626 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5627 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5628 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5630 if (floatx80_is_signaling_nan(a, status)
5631 || floatx80_is_signaling_nan(b, status)) {
5632 float_raise(float_flag_invalid, status);
5634 return 0;
5636 return
5637 ( a.low == b.low )
5638 && ( ( a.high == b.high )
5639 || ( ( a.low == 0 )
5640 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5645 /*----------------------------------------------------------------------------
5646 | Returns 1 if the extended double-precision floating-point value `a' is less
5647 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
5648 | do not cause an exception. Otherwise, the comparison is performed according
5649 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5650 *----------------------------------------------------------------------------*/
5652 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
5654 flag aSign, bSign;
5656 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5657 float_raise(float_flag_invalid, status);
5658 return 0;
5660 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5661 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5662 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5663 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5665 if (floatx80_is_signaling_nan(a, status)
5666 || floatx80_is_signaling_nan(b, status)) {
5667 float_raise(float_flag_invalid, status);
5669 return 0;
5671 aSign = extractFloatx80Sign( a );
5672 bSign = extractFloatx80Sign( b );
5673 if ( aSign != bSign ) {
5674 return
5675 aSign
5676 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5677 == 0 );
5679 return
5680 aSign ? le128( b.high, b.low, a.high, a.low )
5681 : le128( a.high, a.low, b.high, b.low );
5685 /*----------------------------------------------------------------------------
5686 | Returns 1 if the extended double-precision floating-point value `a' is less
5687 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
5688 | an exception. Otherwise, the comparison is performed according to the
5689 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5690 *----------------------------------------------------------------------------*/
5692 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
5694 flag aSign, bSign;
5696 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5697 float_raise(float_flag_invalid, status);
5698 return 0;
5700 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5701 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5702 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5703 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5705 if (floatx80_is_signaling_nan(a, status)
5706 || floatx80_is_signaling_nan(b, status)) {
5707 float_raise(float_flag_invalid, status);
5709 return 0;
5711 aSign = extractFloatx80Sign( a );
5712 bSign = extractFloatx80Sign( b );
5713 if ( aSign != bSign ) {
5714 return
5715 aSign
5716 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5717 != 0 );
5719 return
5720 aSign ? lt128( b.high, b.low, a.high, a.low )
5721 : lt128( a.high, a.low, b.high, b.low );
5725 /*----------------------------------------------------------------------------
5726 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5727 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
5728 | The comparison is performed according to the IEC/IEEE Standard for Binary
5729 | Floating-Point Arithmetic.
5730 *----------------------------------------------------------------------------*/
5731 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
5733 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5734 float_raise(float_flag_invalid, status);
5735 return 1;
5737 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5738 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5739 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5740 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5742 if (floatx80_is_signaling_nan(a, status)
5743 || floatx80_is_signaling_nan(b, status)) {
5744 float_raise(float_flag_invalid, status);
5746 return 1;
5748 return 0;
5751 /*----------------------------------------------------------------------------
5752 | Returns the result of converting the quadruple-precision floating-point
5753 | value `a' to the 32-bit two's complement integer format. The conversion
5754 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5755 | Arithmetic---which means in particular that the conversion is rounded
5756 | according to the current rounding mode. If `a' is a NaN, the largest
5757 | positive integer is returned. Otherwise, if the conversion overflows, the
5758 | largest integer with the same sign as `a' is returned.
5759 *----------------------------------------------------------------------------*/
5761 int32_t float128_to_int32(float128 a, float_status *status)
5763 flag aSign;
5764 int32_t aExp, shiftCount;
5765 uint64_t aSig0, aSig1;
5767 aSig1 = extractFloat128Frac1( a );
5768 aSig0 = extractFloat128Frac0( a );
5769 aExp = extractFloat128Exp( a );
5770 aSign = extractFloat128Sign( a );
5771 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5772 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5773 aSig0 |= ( aSig1 != 0 );
5774 shiftCount = 0x4028 - aExp;
5775 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5776 return roundAndPackInt32(aSign, aSig0, status);
5780 /*----------------------------------------------------------------------------
5781 | Returns the result of converting the quadruple-precision floating-point
5782 | value `a' to the 32-bit two's complement integer format. The conversion
5783 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5784 | Arithmetic, except that the conversion is always rounded toward zero. If
5785 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
5786 | conversion overflows, the largest integer with the same sign as `a' is
5787 | returned.
5788 *----------------------------------------------------------------------------*/
5790 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
5792 flag aSign;
5793 int32_t aExp, shiftCount;
5794 uint64_t aSig0, aSig1, savedASig;
5795 int32_t z;
5797 aSig1 = extractFloat128Frac1( a );
5798 aSig0 = extractFloat128Frac0( a );
5799 aExp = extractFloat128Exp( a );
5800 aSign = extractFloat128Sign( a );
5801 aSig0 |= ( aSig1 != 0 );
5802 if ( 0x401E < aExp ) {
5803 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5804 goto invalid;
5806 else if ( aExp < 0x3FFF ) {
5807 if (aExp || aSig0) {
5808 status->float_exception_flags |= float_flag_inexact;
5810 return 0;
5812 aSig0 |= LIT64( 0x0001000000000000 );
5813 shiftCount = 0x402F - aExp;
5814 savedASig = aSig0;
5815 aSig0 >>= shiftCount;
5816 z = aSig0;
5817 if ( aSign ) z = - z;
5818 if ( ( z < 0 ) ^ aSign ) {
5819 invalid:
5820 float_raise(float_flag_invalid, status);
5821 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5823 if ( ( aSig0<<shiftCount ) != savedASig ) {
5824 status->float_exception_flags |= float_flag_inexact;
5826 return z;
5830 /*----------------------------------------------------------------------------
5831 | Returns the result of converting the quadruple-precision floating-point
5832 | value `a' to the 64-bit two's complement integer format. The conversion
5833 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5834 | Arithmetic---which means in particular that the conversion is rounded
5835 | according to the current rounding mode. If `a' is a NaN, the largest
5836 | positive integer is returned. Otherwise, if the conversion overflows, the
5837 | largest integer with the same sign as `a' is returned.
5838 *----------------------------------------------------------------------------*/
5840 int64_t float128_to_int64(float128 a, float_status *status)
5842 flag aSign;
5843 int32_t aExp, shiftCount;
5844 uint64_t aSig0, aSig1;
5846 aSig1 = extractFloat128Frac1( a );
5847 aSig0 = extractFloat128Frac0( a );
5848 aExp = extractFloat128Exp( a );
5849 aSign = extractFloat128Sign( a );
5850 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5851 shiftCount = 0x402F - aExp;
5852 if ( shiftCount <= 0 ) {
5853 if ( 0x403E < aExp ) {
5854 float_raise(float_flag_invalid, status);
5855 if ( ! aSign
5856 || ( ( aExp == 0x7FFF )
5857 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5860 return LIT64( 0x7FFFFFFFFFFFFFFF );
5862 return (int64_t) LIT64( 0x8000000000000000 );
5864 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5866 else {
5867 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5869 return roundAndPackInt64(aSign, aSig0, aSig1, status);
5873 /*----------------------------------------------------------------------------
5874 | Returns the result of converting the quadruple-precision floating-point
5875 | value `a' to the 64-bit two's complement integer format. The conversion
5876 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5877 | Arithmetic, except that the conversion is always rounded toward zero.
5878 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
5879 | the conversion overflows, the largest integer with the same sign as `a' is
5880 | returned.
5881 *----------------------------------------------------------------------------*/
5883 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
5885 flag aSign;
5886 int32_t aExp, shiftCount;
5887 uint64_t aSig0, aSig1;
5888 int64_t z;
5890 aSig1 = extractFloat128Frac1( a );
5891 aSig0 = extractFloat128Frac0( a );
5892 aExp = extractFloat128Exp( a );
5893 aSign = extractFloat128Sign( a );
5894 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5895 shiftCount = aExp - 0x402F;
5896 if ( 0 < shiftCount ) {
5897 if ( 0x403E <= aExp ) {
5898 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5899 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
5900 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5901 if (aSig1) {
5902 status->float_exception_flags |= float_flag_inexact;
5905 else {
5906 float_raise(float_flag_invalid, status);
5907 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5908 return LIT64( 0x7FFFFFFFFFFFFFFF );
5911 return (int64_t) LIT64( 0x8000000000000000 );
5913 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
5914 if ( (uint64_t) ( aSig1<<shiftCount ) ) {
5915 status->float_exception_flags |= float_flag_inexact;
5918 else {
5919 if ( aExp < 0x3FFF ) {
5920 if ( aExp | aSig0 | aSig1 ) {
5921 status->float_exception_flags |= float_flag_inexact;
5923 return 0;
5925 z = aSig0>>( - shiftCount );
5926 if ( aSig1
5927 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
5928 status->float_exception_flags |= float_flag_inexact;
5931 if ( aSign ) z = - z;
5932 return z;
5936 /*----------------------------------------------------------------------------
5937 | Returns the result of converting the quadruple-precision floating-point value
5938 | `a' to the 64-bit unsigned integer format. The conversion is
5939 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5940 | Arithmetic---which means in particular that the conversion is rounded
5941 | according to the current rounding mode. If `a' is a NaN, the largest
5942 | positive integer is returned. If the conversion overflows, the
5943 | largest unsigned integer is returned. If 'a' is negative, the value is
5944 | rounded and zero is returned; negative values that do not round to zero
5945 | will raise the inexact exception.
5946 *----------------------------------------------------------------------------*/
5948 uint64_t float128_to_uint64(float128 a, float_status *status)
5950 flag aSign;
5951 int aExp;
5952 int shiftCount;
5953 uint64_t aSig0, aSig1;
5955 aSig0 = extractFloat128Frac0(a);
5956 aSig1 = extractFloat128Frac1(a);
5957 aExp = extractFloat128Exp(a);
5958 aSign = extractFloat128Sign(a);
5959 if (aSign && (aExp > 0x3FFE)) {
5960 float_raise(float_flag_invalid, status);
5961 if (float128_is_any_nan(a)) {
5962 return LIT64(0xFFFFFFFFFFFFFFFF);
5963 } else {
5964 return 0;
5967 if (aExp) {
5968 aSig0 |= LIT64(0x0001000000000000);
5970 shiftCount = 0x402F - aExp;
5971 if (shiftCount <= 0) {
5972 if (0x403E < aExp) {
5973 float_raise(float_flag_invalid, status);
5974 return LIT64(0xFFFFFFFFFFFFFFFF);
5976 shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
5977 } else {
5978 shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
5980 return roundAndPackUint64(aSign, aSig0, aSig1, status);
5983 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
5985 uint64_t v;
5986 signed char current_rounding_mode = status->float_rounding_mode;
5988 set_float_rounding_mode(float_round_to_zero, status);
5989 v = float128_to_uint64(a, status);
5990 set_float_rounding_mode(current_rounding_mode, status);
5992 return v;
5995 /*----------------------------------------------------------------------------
5996 | Returns the result of converting the quadruple-precision floating-point
5997 | value `a' to the 32-bit unsigned integer format. The conversion
5998 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5999 | Arithmetic except that the conversion is always rounded toward zero.
6000 | If `a' is a NaN, the largest positive integer is returned. Otherwise,
6001 | if the conversion overflows, the largest unsigned integer is returned.
6002 | If 'a' is negative, the value is rounded and zero is returned; negative
6003 | values that do not round to zero will raise the inexact exception.
6004 *----------------------------------------------------------------------------*/
6006 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
6008 uint64_t v;
6009 uint32_t res;
6010 int old_exc_flags = get_float_exception_flags(status);
6012 v = float128_to_uint64_round_to_zero(a, status);
6013 if (v > 0xffffffff) {
6014 res = 0xffffffff;
6015 } else {
6016 return v;
6018 set_float_exception_flags(old_exc_flags, status);
6019 float_raise(float_flag_invalid, status);
6020 return res;
6023 /*----------------------------------------------------------------------------
6024 | Returns the result of converting the quadruple-precision floating-point
6025 | value `a' to the single-precision floating-point format. The conversion
6026 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6027 | Arithmetic.
6028 *----------------------------------------------------------------------------*/
6030 float32 float128_to_float32(float128 a, float_status *status)
6032 flag aSign;
6033 int32_t aExp;
6034 uint64_t aSig0, aSig1;
6035 uint32_t zSig;
6037 aSig1 = extractFloat128Frac1( a );
6038 aSig0 = extractFloat128Frac0( a );
6039 aExp = extractFloat128Exp( a );
6040 aSign = extractFloat128Sign( a );
6041 if ( aExp == 0x7FFF ) {
6042 if ( aSig0 | aSig1 ) {
6043 return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
6045 return packFloat32( aSign, 0xFF, 0 );
6047 aSig0 |= ( aSig1 != 0 );
6048 shift64RightJamming( aSig0, 18, &aSig0 );
6049 zSig = aSig0;
6050 if ( aExp || zSig ) {
6051 zSig |= 0x40000000;
6052 aExp -= 0x3F81;
6054 return roundAndPackFloat32(aSign, aExp, zSig, status);
6058 /*----------------------------------------------------------------------------
6059 | Returns the result of converting the quadruple-precision floating-point
6060 | value `a' to the double-precision floating-point format. The conversion
6061 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6062 | Arithmetic.
6063 *----------------------------------------------------------------------------*/
6065 float64 float128_to_float64(float128 a, float_status *status)
6067 flag aSign;
6068 int32_t aExp;
6069 uint64_t aSig0, aSig1;
6071 aSig1 = extractFloat128Frac1( a );
6072 aSig0 = extractFloat128Frac0( a );
6073 aExp = extractFloat128Exp( a );
6074 aSign = extractFloat128Sign( a );
6075 if ( aExp == 0x7FFF ) {
6076 if ( aSig0 | aSig1 ) {
6077 return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
6079 return packFloat64( aSign, 0x7FF, 0 );
6081 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6082 aSig0 |= ( aSig1 != 0 );
6083 if ( aExp || aSig0 ) {
6084 aSig0 |= LIT64( 0x4000000000000000 );
6085 aExp -= 0x3C01;
6087 return roundAndPackFloat64(aSign, aExp, aSig0, status);
6091 /*----------------------------------------------------------------------------
6092 | Returns the result of converting the quadruple-precision floating-point
6093 | value `a' to the extended double-precision floating-point format. The
6094 | conversion is performed according to the IEC/IEEE Standard for Binary
6095 | Floating-Point Arithmetic.
6096 *----------------------------------------------------------------------------*/
6098 floatx80 float128_to_floatx80(float128 a, float_status *status)
6100 flag aSign;
6101 int32_t aExp;
6102 uint64_t aSig0, aSig1;
6104 aSig1 = extractFloat128Frac1( a );
6105 aSig0 = extractFloat128Frac0( a );
6106 aExp = extractFloat128Exp( a );
6107 aSign = extractFloat128Sign( a );
6108 if ( aExp == 0x7FFF ) {
6109 if ( aSig0 | aSig1 ) {
6110 return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
6112 return packFloatx80(aSign, floatx80_infinity_high,
6113 floatx80_infinity_low);
6115 if ( aExp == 0 ) {
6116 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
6117 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6119 else {
6120 aSig0 |= LIT64( 0x0001000000000000 );
6122 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
6123 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
6127 /*----------------------------------------------------------------------------
6128 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6129 | returns the result as a quadruple-precision floating-point value. The
6130 | operation is performed according to the IEC/IEEE Standard for Binary
6131 | Floating-Point Arithmetic.
6132 *----------------------------------------------------------------------------*/
6134 float128 float128_round_to_int(float128 a, float_status *status)
6136 flag aSign;
6137 int32_t aExp;
6138 uint64_t lastBitMask, roundBitsMask;
6139 float128 z;
6141 aExp = extractFloat128Exp( a );
6142 if ( 0x402F <= aExp ) {
6143 if ( 0x406F <= aExp ) {
6144 if ( ( aExp == 0x7FFF )
6145 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6147 return propagateFloat128NaN(a, a, status);
6149 return a;
6151 lastBitMask = 1;
6152 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
6153 roundBitsMask = lastBitMask - 1;
6154 z = a;
6155 switch (status->float_rounding_mode) {
6156 case float_round_nearest_even:
6157 if ( lastBitMask ) {
6158 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6159 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6161 else {
6162 if ( (int64_t) z.low < 0 ) {
6163 ++z.high;
6164 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
6167 break;
6168 case float_round_ties_away:
6169 if (lastBitMask) {
6170 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6171 } else {
6172 if ((int64_t) z.low < 0) {
6173 ++z.high;
6176 break;
6177 case float_round_to_zero:
6178 break;
6179 case float_round_up:
6180 if (!extractFloat128Sign(z)) {
6181 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6183 break;
6184 case float_round_down:
6185 if (extractFloat128Sign(z)) {
6186 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6188 break;
6189 default:
6190 abort();
6192 z.low &= ~ roundBitsMask;
6194 else {
6195 if ( aExp < 0x3FFF ) {
6196 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
6197 status->float_exception_flags |= float_flag_inexact;
6198 aSign = extractFloat128Sign( a );
6199 switch (status->float_rounding_mode) {
6200 case float_round_nearest_even:
6201 if ( ( aExp == 0x3FFE )
6202 && ( extractFloat128Frac0( a )
6203 | extractFloat128Frac1( a ) )
6205 return packFloat128( aSign, 0x3FFF, 0, 0 );
6207 break;
6208 case float_round_ties_away:
6209 if (aExp == 0x3FFE) {
6210 return packFloat128(aSign, 0x3FFF, 0, 0);
6212 break;
6213 case float_round_down:
6214 return
6215 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6216 : packFloat128( 0, 0, 0, 0 );
6217 case float_round_up:
6218 return
6219 aSign ? packFloat128( 1, 0, 0, 0 )
6220 : packFloat128( 0, 0x3FFF, 0, 0 );
6222 return packFloat128( aSign, 0, 0, 0 );
6224 lastBitMask = 1;
6225 lastBitMask <<= 0x402F - aExp;
6226 roundBitsMask = lastBitMask - 1;
6227 z.low = 0;
6228 z.high = a.high;
6229 switch (status->float_rounding_mode) {
6230 case float_round_nearest_even:
6231 z.high += lastBitMask>>1;
6232 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6233 z.high &= ~ lastBitMask;
6235 break;
6236 case float_round_ties_away:
6237 z.high += lastBitMask>>1;
6238 break;
6239 case float_round_to_zero:
6240 break;
6241 case float_round_up:
6242 if (!extractFloat128Sign(z)) {
6243 z.high |= ( a.low != 0 );
6244 z.high += roundBitsMask;
6246 break;
6247 case float_round_down:
6248 if (extractFloat128Sign(z)) {
6249 z.high |= (a.low != 0);
6250 z.high += roundBitsMask;
6252 break;
6253 default:
6254 abort();
6256 z.high &= ~ roundBitsMask;
6258 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
6259 status->float_exception_flags |= float_flag_inexact;
6261 return z;
6265 /*----------------------------------------------------------------------------
6266 | Returns the result of adding the absolute values of the quadruple-precision
6267 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
6268 | before being returned. `zSign' is ignored if the result is a NaN.
6269 | The addition is performed according to the IEC/IEEE Standard for Binary
6270 | Floating-Point Arithmetic.
6271 *----------------------------------------------------------------------------*/
6273 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
6274 float_status *status)
6276 int32_t aExp, bExp, zExp;
6277 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6278 int32_t expDiff;
6280 aSig1 = extractFloat128Frac1( a );
6281 aSig0 = extractFloat128Frac0( a );
6282 aExp = extractFloat128Exp( a );
6283 bSig1 = extractFloat128Frac1( b );
6284 bSig0 = extractFloat128Frac0( b );
6285 bExp = extractFloat128Exp( b );
6286 expDiff = aExp - bExp;
6287 if ( 0 < expDiff ) {
6288 if ( aExp == 0x7FFF ) {
6289 if (aSig0 | aSig1) {
6290 return propagateFloat128NaN(a, b, status);
6292 return a;
6294 if ( bExp == 0 ) {
6295 --expDiff;
6297 else {
6298 bSig0 |= LIT64( 0x0001000000000000 );
6300 shift128ExtraRightJamming(
6301 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6302 zExp = aExp;
6304 else if ( expDiff < 0 ) {
6305 if ( bExp == 0x7FFF ) {
6306 if (bSig0 | bSig1) {
6307 return propagateFloat128NaN(a, b, status);
6309 return packFloat128( zSign, 0x7FFF, 0, 0 );
6311 if ( aExp == 0 ) {
6312 ++expDiff;
6314 else {
6315 aSig0 |= LIT64( 0x0001000000000000 );
6317 shift128ExtraRightJamming(
6318 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6319 zExp = bExp;
6321 else {
6322 if ( aExp == 0x7FFF ) {
6323 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6324 return propagateFloat128NaN(a, b, status);
6326 return a;
6328 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6329 if ( aExp == 0 ) {
6330 if (status->flush_to_zero) {
6331 if (zSig0 | zSig1) {
6332 float_raise(float_flag_output_denormal, status);
6334 return packFloat128(zSign, 0, 0, 0);
6336 return packFloat128( zSign, 0, zSig0, zSig1 );
6338 zSig2 = 0;
6339 zSig0 |= LIT64( 0x0002000000000000 );
6340 zExp = aExp;
6341 goto shiftRight1;
6343 aSig0 |= LIT64( 0x0001000000000000 );
6344 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6345 --zExp;
6346 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6347 ++zExp;
6348 shiftRight1:
6349 shift128ExtraRightJamming(
6350 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6351 roundAndPack:
6352 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6356 /*----------------------------------------------------------------------------
6357 | Returns the result of subtracting the absolute values of the quadruple-
6358 | precision floating-point values `a' and `b'. If `zSign' is 1, the
6359 | difference is negated before being returned. `zSign' is ignored if the
6360 | result is a NaN. The subtraction is performed according to the IEC/IEEE
6361 | Standard for Binary Floating-Point Arithmetic.
6362 *----------------------------------------------------------------------------*/
6364 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
6365 float_status *status)
6367 int32_t aExp, bExp, zExp;
6368 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
6369 int32_t expDiff;
6371 aSig1 = extractFloat128Frac1( a );
6372 aSig0 = extractFloat128Frac0( a );
6373 aExp = extractFloat128Exp( a );
6374 bSig1 = extractFloat128Frac1( b );
6375 bSig0 = extractFloat128Frac0( b );
6376 bExp = extractFloat128Exp( b );
6377 expDiff = aExp - bExp;
6378 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6379 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6380 if ( 0 < expDiff ) goto aExpBigger;
6381 if ( expDiff < 0 ) goto bExpBigger;
6382 if ( aExp == 0x7FFF ) {
6383 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6384 return propagateFloat128NaN(a, b, status);
6386 float_raise(float_flag_invalid, status);
6387 return float128_default_nan(status);
6389 if ( aExp == 0 ) {
6390 aExp = 1;
6391 bExp = 1;
6393 if ( bSig0 < aSig0 ) goto aBigger;
6394 if ( aSig0 < bSig0 ) goto bBigger;
6395 if ( bSig1 < aSig1 ) goto aBigger;
6396 if ( aSig1 < bSig1 ) goto bBigger;
6397 return packFloat128(status->float_rounding_mode == float_round_down,
6398 0, 0, 0);
6399 bExpBigger:
6400 if ( bExp == 0x7FFF ) {
6401 if (bSig0 | bSig1) {
6402 return propagateFloat128NaN(a, b, status);
6404 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6406 if ( aExp == 0 ) {
6407 ++expDiff;
6409 else {
6410 aSig0 |= LIT64( 0x4000000000000000 );
6412 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6413 bSig0 |= LIT64( 0x4000000000000000 );
6414 bBigger:
6415 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6416 zExp = bExp;
6417 zSign ^= 1;
6418 goto normalizeRoundAndPack;
6419 aExpBigger:
6420 if ( aExp == 0x7FFF ) {
6421 if (aSig0 | aSig1) {
6422 return propagateFloat128NaN(a, b, status);
6424 return a;
6426 if ( bExp == 0 ) {
6427 --expDiff;
6429 else {
6430 bSig0 |= LIT64( 0x4000000000000000 );
6432 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6433 aSig0 |= LIT64( 0x4000000000000000 );
6434 aBigger:
6435 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6436 zExp = aExp;
6437 normalizeRoundAndPack:
6438 --zExp;
6439 return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
6440 status);
6444 /*----------------------------------------------------------------------------
6445 | Returns the result of adding the quadruple-precision floating-point values
6446 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
6447 | for Binary Floating-Point Arithmetic.
6448 *----------------------------------------------------------------------------*/
6450 float128 float128_add(float128 a, float128 b, float_status *status)
6452 flag aSign, bSign;
6454 aSign = extractFloat128Sign( a );
6455 bSign = extractFloat128Sign( b );
6456 if ( aSign == bSign ) {
6457 return addFloat128Sigs(a, b, aSign, status);
6459 else {
6460 return subFloat128Sigs(a, b, aSign, status);
6465 /*----------------------------------------------------------------------------
6466 | Returns the result of subtracting the quadruple-precision floating-point
6467 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6468 | Standard for Binary Floating-Point Arithmetic.
6469 *----------------------------------------------------------------------------*/
6471 float128 float128_sub(float128 a, float128 b, float_status *status)
6473 flag aSign, bSign;
6475 aSign = extractFloat128Sign( a );
6476 bSign = extractFloat128Sign( b );
6477 if ( aSign == bSign ) {
6478 return subFloat128Sigs(a, b, aSign, status);
6480 else {
6481 return addFloat128Sigs(a, b, aSign, status);
6486 /*----------------------------------------------------------------------------
6487 | Returns the result of multiplying the quadruple-precision floating-point
6488 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6489 | Standard for Binary Floating-Point Arithmetic.
6490 *----------------------------------------------------------------------------*/
6492 float128 float128_mul(float128 a, float128 b, float_status *status)
6494 flag aSign, bSign, zSign;
6495 int32_t aExp, bExp, zExp;
6496 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6498 aSig1 = extractFloat128Frac1( a );
6499 aSig0 = extractFloat128Frac0( a );
6500 aExp = extractFloat128Exp( a );
6501 aSign = extractFloat128Sign( a );
6502 bSig1 = extractFloat128Frac1( b );
6503 bSig0 = extractFloat128Frac0( b );
6504 bExp = extractFloat128Exp( b );
6505 bSign = extractFloat128Sign( b );
6506 zSign = aSign ^ bSign;
6507 if ( aExp == 0x7FFF ) {
6508 if ( ( aSig0 | aSig1 )
6509 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6510 return propagateFloat128NaN(a, b, status);
6512 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6513 return packFloat128( zSign, 0x7FFF, 0, 0 );
6515 if ( bExp == 0x7FFF ) {
6516 if (bSig0 | bSig1) {
6517 return propagateFloat128NaN(a, b, status);
6519 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6520 invalid:
6521 float_raise(float_flag_invalid, status);
6522 return float128_default_nan(status);
6524 return packFloat128( zSign, 0x7FFF, 0, 0 );
6526 if ( aExp == 0 ) {
6527 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6528 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6530 if ( bExp == 0 ) {
6531 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6532 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6534 zExp = aExp + bExp - 0x4000;
6535 aSig0 |= LIT64( 0x0001000000000000 );
6536 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6537 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6538 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6539 zSig2 |= ( zSig3 != 0 );
6540 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6541 shift128ExtraRightJamming(
6542 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6543 ++zExp;
6545 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6549 /*----------------------------------------------------------------------------
6550 | Returns the result of dividing the quadruple-precision floating-point value
6551 | `a' by the corresponding value `b'. The operation is performed according to
6552 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6553 *----------------------------------------------------------------------------*/
6555 float128 float128_div(float128 a, float128 b, float_status *status)
6557 flag aSign, bSign, zSign;
6558 int32_t aExp, bExp, zExp;
6559 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6560 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6562 aSig1 = extractFloat128Frac1( a );
6563 aSig0 = extractFloat128Frac0( a );
6564 aExp = extractFloat128Exp( a );
6565 aSign = extractFloat128Sign( a );
6566 bSig1 = extractFloat128Frac1( b );
6567 bSig0 = extractFloat128Frac0( b );
6568 bExp = extractFloat128Exp( b );
6569 bSign = extractFloat128Sign( b );
6570 zSign = aSign ^ bSign;
6571 if ( aExp == 0x7FFF ) {
6572 if (aSig0 | aSig1) {
6573 return propagateFloat128NaN(a, b, status);
6575 if ( bExp == 0x7FFF ) {
6576 if (bSig0 | bSig1) {
6577 return propagateFloat128NaN(a, b, status);
6579 goto invalid;
6581 return packFloat128( zSign, 0x7FFF, 0, 0 );
6583 if ( bExp == 0x7FFF ) {
6584 if (bSig0 | bSig1) {
6585 return propagateFloat128NaN(a, b, status);
6587 return packFloat128( zSign, 0, 0, 0 );
6589 if ( bExp == 0 ) {
6590 if ( ( bSig0 | bSig1 ) == 0 ) {
6591 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6592 invalid:
6593 float_raise(float_flag_invalid, status);
6594 return float128_default_nan(status);
6596 float_raise(float_flag_divbyzero, status);
6597 return packFloat128( zSign, 0x7FFF, 0, 0 );
6599 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6601 if ( aExp == 0 ) {
6602 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6603 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6605 zExp = aExp - bExp + 0x3FFD;
6606 shortShift128Left(
6607 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6608 shortShift128Left(
6609 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6610 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6611 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6612 ++zExp;
6614 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6615 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6616 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
6617 while ( (int64_t) rem0 < 0 ) {
6618 --zSig0;
6619 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6621 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6622 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6623 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6624 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
6625 while ( (int64_t) rem1 < 0 ) {
6626 --zSig1;
6627 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6629 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6631 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6632 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6636 /*----------------------------------------------------------------------------
6637 | Returns the remainder of the quadruple-precision floating-point value `a'
6638 | with respect to the corresponding value `b'. The operation is performed
6639 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6640 *----------------------------------------------------------------------------*/
6642 float128 float128_rem(float128 a, float128 b, float_status *status)
6644 flag aSign, zSign;
6645 int32_t aExp, bExp, expDiff;
6646 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6647 uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6648 int64_t sigMean0;
6650 aSig1 = extractFloat128Frac1( a );
6651 aSig0 = extractFloat128Frac0( a );
6652 aExp = extractFloat128Exp( a );
6653 aSign = extractFloat128Sign( a );
6654 bSig1 = extractFloat128Frac1( b );
6655 bSig0 = extractFloat128Frac0( b );
6656 bExp = extractFloat128Exp( b );
6657 if ( aExp == 0x7FFF ) {
6658 if ( ( aSig0 | aSig1 )
6659 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6660 return propagateFloat128NaN(a, b, status);
6662 goto invalid;
6664 if ( bExp == 0x7FFF ) {
6665 if (bSig0 | bSig1) {
6666 return propagateFloat128NaN(a, b, status);
6668 return a;
6670 if ( bExp == 0 ) {
6671 if ( ( bSig0 | bSig1 ) == 0 ) {
6672 invalid:
6673 float_raise(float_flag_invalid, status);
6674 return float128_default_nan(status);
6676 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6678 if ( aExp == 0 ) {
6679 if ( ( aSig0 | aSig1 ) == 0 ) return a;
6680 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6682 expDiff = aExp - bExp;
6683 if ( expDiff < -1 ) return a;
6684 shortShift128Left(
6685 aSig0 | LIT64( 0x0001000000000000 ),
6686 aSig1,
6687 15 - ( expDiff < 0 ),
6688 &aSig0,
6689 &aSig1
6691 shortShift128Left(
6692 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6693 q = le128( bSig0, bSig1, aSig0, aSig1 );
6694 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6695 expDiff -= 64;
6696 while ( 0 < expDiff ) {
6697 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6698 q = ( 4 < q ) ? q - 4 : 0;
6699 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6700 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6701 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6702 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6703 expDiff -= 61;
6705 if ( -64 < expDiff ) {
6706 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6707 q = ( 4 < q ) ? q - 4 : 0;
6708 q >>= - expDiff;
6709 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6710 expDiff += 52;
6711 if ( expDiff < 0 ) {
6712 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6714 else {
6715 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6717 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6718 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6720 else {
6721 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6722 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6724 do {
6725 alternateASig0 = aSig0;
6726 alternateASig1 = aSig1;
6727 ++q;
6728 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6729 } while ( 0 <= (int64_t) aSig0 );
6730 add128(
6731 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6732 if ( ( sigMean0 < 0 )
6733 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6734 aSig0 = alternateASig0;
6735 aSig1 = alternateASig1;
6737 zSign = ( (int64_t) aSig0 < 0 );
6738 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6739 return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
6740 status);
6743 /*----------------------------------------------------------------------------
6744 | Returns the square root of the quadruple-precision floating-point value `a'.
6745 | The operation is performed according to the IEC/IEEE Standard for Binary
6746 | Floating-Point Arithmetic.
6747 *----------------------------------------------------------------------------*/
6749 float128 float128_sqrt(float128 a, float_status *status)
6751 flag aSign;
6752 int32_t aExp, zExp;
6753 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6754 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6756 aSig1 = extractFloat128Frac1( a );
6757 aSig0 = extractFloat128Frac0( a );
6758 aExp = extractFloat128Exp( a );
6759 aSign = extractFloat128Sign( a );
6760 if ( aExp == 0x7FFF ) {
6761 if (aSig0 | aSig1) {
6762 return propagateFloat128NaN(a, a, status);
6764 if ( ! aSign ) return a;
6765 goto invalid;
6767 if ( aSign ) {
6768 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6769 invalid:
6770 float_raise(float_flag_invalid, status);
6771 return float128_default_nan(status);
6773 if ( aExp == 0 ) {
6774 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6775 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6777 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6778 aSig0 |= LIT64( 0x0001000000000000 );
6779 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6780 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6781 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6782 doubleZSig0 = zSig0<<1;
6783 mul64To128( zSig0, zSig0, &term0, &term1 );
6784 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6785 while ( (int64_t) rem0 < 0 ) {
6786 --zSig0;
6787 doubleZSig0 -= 2;
6788 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6790 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6791 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6792 if ( zSig1 == 0 ) zSig1 = 1;
6793 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6794 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6795 mul64To128( zSig1, zSig1, &term2, &term3 );
6796 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6797 while ( (int64_t) rem1 < 0 ) {
6798 --zSig1;
6799 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6800 term3 |= 1;
6801 term2 |= doubleZSig0;
6802 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6804 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6806 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6807 return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
6811 /*----------------------------------------------------------------------------
6812 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6813 | the corresponding value `b', and 0 otherwise. The invalid exception is
6814 | raised if either operand is a NaN. Otherwise, the comparison is performed
6815 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6816 *----------------------------------------------------------------------------*/
6818 int float128_eq(float128 a, float128 b, float_status *status)
6821 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6822 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6823 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6824 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6826 float_raise(float_flag_invalid, status);
6827 return 0;
6829 return
6830 ( a.low == b.low )
6831 && ( ( a.high == b.high )
6832 || ( ( a.low == 0 )
6833 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6838 /*----------------------------------------------------------------------------
6839 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6840 | or equal to the corresponding value `b', and 0 otherwise. The invalid
6841 | exception is raised if either operand is a NaN. The comparison is performed
6842 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6843 *----------------------------------------------------------------------------*/
6845 int float128_le(float128 a, float128 b, float_status *status)
6847 flag aSign, bSign;
6849 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6850 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6851 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6852 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6854 float_raise(float_flag_invalid, status);
6855 return 0;
6857 aSign = extractFloat128Sign( a );
6858 bSign = extractFloat128Sign( b );
6859 if ( aSign != bSign ) {
6860 return
6861 aSign
6862 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6863 == 0 );
6865 return
6866 aSign ? le128( b.high, b.low, a.high, a.low )
6867 : le128( a.high, a.low, b.high, b.low );
6871 /*----------------------------------------------------------------------------
6872 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6873 | the corresponding value `b', and 0 otherwise. The invalid exception is
6874 | raised if either operand is a NaN. The comparison is performed according
6875 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6876 *----------------------------------------------------------------------------*/
6878 int float128_lt(float128 a, float128 b, float_status *status)
6880 flag aSign, bSign;
6882 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6883 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6884 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6885 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6887 float_raise(float_flag_invalid, status);
6888 return 0;
6890 aSign = extractFloat128Sign( a );
6891 bSign = extractFloat128Sign( b );
6892 if ( aSign != bSign ) {
6893 return
6894 aSign
6895 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6896 != 0 );
6898 return
6899 aSign ? lt128( b.high, b.low, a.high, a.low )
6900 : lt128( a.high, a.low, b.high, b.low );
6904 /*----------------------------------------------------------------------------
6905 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6906 | be compared, and 0 otherwise. The invalid exception is raised if either
6907 | operand is a NaN. The comparison is performed according to the IEC/IEEE
6908 | Standard for Binary Floating-Point Arithmetic.
6909 *----------------------------------------------------------------------------*/
6911 int float128_unordered(float128 a, float128 b, float_status *status)
6913 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6914 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6915 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6916 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6918 float_raise(float_flag_invalid, status);
6919 return 1;
6921 return 0;
6924 /*----------------------------------------------------------------------------
6925 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6926 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6927 | exception. The comparison is performed according to the IEC/IEEE Standard
6928 | for Binary Floating-Point Arithmetic.
6929 *----------------------------------------------------------------------------*/
6931 int float128_eq_quiet(float128 a, float128 b, float_status *status)
6934 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6935 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6936 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6937 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6939 if (float128_is_signaling_nan(a, status)
6940 || float128_is_signaling_nan(b, status)) {
6941 float_raise(float_flag_invalid, status);
6943 return 0;
6945 return
6946 ( a.low == b.low )
6947 && ( ( a.high == b.high )
6948 || ( ( a.low == 0 )
6949 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6954 /*----------------------------------------------------------------------------
6955 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6956 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
6957 | cause an exception. Otherwise, the comparison is performed according to the
6958 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6959 *----------------------------------------------------------------------------*/
6961 int float128_le_quiet(float128 a, float128 b, float_status *status)
6963 flag aSign, bSign;
6965 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6966 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6967 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6968 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6970 if (float128_is_signaling_nan(a, status)
6971 || float128_is_signaling_nan(b, status)) {
6972 float_raise(float_flag_invalid, status);
6974 return 0;
6976 aSign = extractFloat128Sign( a );
6977 bSign = extractFloat128Sign( b );
6978 if ( aSign != bSign ) {
6979 return
6980 aSign
6981 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6982 == 0 );
6984 return
6985 aSign ? le128( b.high, b.low, a.high, a.low )
6986 : le128( a.high, a.low, b.high, b.low );
6990 /*----------------------------------------------------------------------------
6991 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6992 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6993 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
6994 | Standard for Binary Floating-Point Arithmetic.
6995 *----------------------------------------------------------------------------*/
6997 int float128_lt_quiet(float128 a, float128 b, float_status *status)
6999 flag aSign, bSign;
7001 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
7002 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7003 || ( ( extractFloat128Exp( b ) == 0x7FFF )
7004 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7006 if (float128_is_signaling_nan(a, status)
7007 || float128_is_signaling_nan(b, status)) {
7008 float_raise(float_flag_invalid, status);
7010 return 0;
7012 aSign = extractFloat128Sign( a );
7013 bSign = extractFloat128Sign( b );
7014 if ( aSign != bSign ) {
7015 return
7016 aSign
7017 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7018 != 0 );
7020 return
7021 aSign ? lt128( b.high, b.low, a.high, a.low )
7022 : lt128( a.high, a.low, b.high, b.low );
7026 /*----------------------------------------------------------------------------
7027 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7028 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
7029 | comparison is performed according to the IEC/IEEE Standard for Binary
7030 | Floating-Point Arithmetic.
7031 *----------------------------------------------------------------------------*/
7033 int float128_unordered_quiet(float128 a, float128 b, float_status *status)
7035 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
7036 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7037 || ( ( extractFloat128Exp( b ) == 0x7FFF )
7038 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7040 if (float128_is_signaling_nan(a, status)
7041 || float128_is_signaling_nan(b, status)) {
7042 float_raise(float_flag_invalid, status);
7044 return 1;
7046 return 0;
7049 static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
7050 int is_quiet, float_status *status)
7052 flag aSign, bSign;
7054 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
7055 float_raise(float_flag_invalid, status);
7056 return float_relation_unordered;
7058 if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7059 ( extractFloatx80Frac( a )<<1 ) ) ||
7060 ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7061 ( extractFloatx80Frac( b )<<1 ) )) {
7062 if (!is_quiet ||
7063 floatx80_is_signaling_nan(a, status) ||
7064 floatx80_is_signaling_nan(b, status)) {
7065 float_raise(float_flag_invalid, status);
7067 return float_relation_unordered;
7069 aSign = extractFloatx80Sign( a );
7070 bSign = extractFloatx80Sign( b );
7071 if ( aSign != bSign ) {
7073 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7074 ( ( a.low | b.low ) == 0 ) ) {
7075 /* zero case */
7076 return float_relation_equal;
7077 } else {
7078 return 1 - (2 * aSign);
7080 } else {
7081 if (a.low == b.low && a.high == b.high) {
7082 return float_relation_equal;
7083 } else {
7084 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7089 int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
7091 return floatx80_compare_internal(a, b, 0, status);
7094 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
7096 return floatx80_compare_internal(a, b, 1, status);
7099 static inline int float128_compare_internal(float128 a, float128 b,
7100 int is_quiet, float_status *status)
7102 flag aSign, bSign;
7104 if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7105 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7106 ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7107 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7108 if (!is_quiet ||
7109 float128_is_signaling_nan(a, status) ||
7110 float128_is_signaling_nan(b, status)) {
7111 float_raise(float_flag_invalid, status);
7113 return float_relation_unordered;
7115 aSign = extractFloat128Sign( a );
7116 bSign = extractFloat128Sign( b );
7117 if ( aSign != bSign ) {
7118 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7119 /* zero case */
7120 return float_relation_equal;
7121 } else {
7122 return 1 - (2 * aSign);
7124 } else {
7125 if (a.low == b.low && a.high == b.high) {
7126 return float_relation_equal;
7127 } else {
7128 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7133 int float128_compare(float128 a, float128 b, float_status *status)
7135 return float128_compare_internal(a, b, 0, status);
7138 int float128_compare_quiet(float128 a, float128 b, float_status *status)
7140 return float128_compare_internal(a, b, 1, status);
7143 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7145 flag aSign;
7146 int32_t aExp;
7147 uint64_t aSig;
7149 if (floatx80_invalid_encoding(a)) {
7150 float_raise(float_flag_invalid, status);
7151 return floatx80_default_nan(status);
7153 aSig = extractFloatx80Frac( a );
7154 aExp = extractFloatx80Exp( a );
7155 aSign = extractFloatx80Sign( a );
7157 if ( aExp == 0x7FFF ) {
7158 if ( aSig<<1 ) {
7159 return propagateFloatx80NaN(a, a, status);
7161 return a;
7164 if (aExp == 0) {
7165 if (aSig == 0) {
7166 return a;
7168 aExp++;
7171 if (n > 0x10000) {
7172 n = 0x10000;
7173 } else if (n < -0x10000) {
7174 n = -0x10000;
7177 aExp += n;
7178 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7179 aSign, aExp, aSig, 0, status);
7182 float128 float128_scalbn(float128 a, int n, float_status *status)
7184 flag aSign;
7185 int32_t aExp;
7186 uint64_t aSig0, aSig1;
7188 aSig1 = extractFloat128Frac1( a );
7189 aSig0 = extractFloat128Frac0( a );
7190 aExp = extractFloat128Exp( a );
7191 aSign = extractFloat128Sign( a );
7192 if ( aExp == 0x7FFF ) {
7193 if ( aSig0 | aSig1 ) {
7194 return propagateFloat128NaN(a, a, status);
7196 return a;
7198 if (aExp != 0) {
7199 aSig0 |= LIT64( 0x0001000000000000 );
7200 } else if (aSig0 == 0 && aSig1 == 0) {
7201 return a;
7202 } else {
7203 aExp++;
7206 if (n > 0x10000) {
7207 n = 0x10000;
7208 } else if (n < -0x10000) {
7209 n = -0x10000;
7212 aExp += n - 1;
7213 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7214 , status);