hw/s390x: Move virtio-ccw-gpu code to a separate file
[qemu.git] / fpu / softfloat.c
blob59ca356d0e7ad07163de5c473500335e1fc2c2c7
1 /*
2 * QEMU float support
4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
10 * the BSD license
11 * GPL-v2-or-later
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
44 ===============================================================================
47 /* BSD licensing:
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
85 #include "qemu/osdep.h"
86 #include "qemu/bitops.h"
87 #include "fpu/softfloat.h"
89 /* We only need stdlib for abort() */
91 /*----------------------------------------------------------------------------
92 | Primitive arithmetic functions, including multi-word arithmetic, and
93 | division and square root approximations. (Can be specialized to target if
94 | desired.)
95 *----------------------------------------------------------------------------*/
96 #include "fpu/softfloat-macros.h"
98 /*----------------------------------------------------------------------------
99 | Returns the fraction bits of the half-precision floating-point value `a'.
100 *----------------------------------------------------------------------------*/
102 static inline uint32_t extractFloat16Frac(float16 a)
104 return float16_val(a) & 0x3ff;
107 /*----------------------------------------------------------------------------
108 | Returns the exponent bits of the half-precision floating-point value `a'.
109 *----------------------------------------------------------------------------*/
111 static inline int extractFloat16Exp(float16 a)
113 return (float16_val(a) >> 10) & 0x1f;
116 /*----------------------------------------------------------------------------
117 | Returns the fraction bits of the single-precision floating-point value `a'.
118 *----------------------------------------------------------------------------*/
120 static inline uint32_t extractFloat32Frac(float32 a)
122 return float32_val(a) & 0x007FFFFF;
125 /*----------------------------------------------------------------------------
126 | Returns the exponent bits of the single-precision floating-point value `a'.
127 *----------------------------------------------------------------------------*/
129 static inline int extractFloat32Exp(float32 a)
131 return (float32_val(a) >> 23) & 0xFF;
134 /*----------------------------------------------------------------------------
135 | Returns the sign bit of the single-precision floating-point value `a'.
136 *----------------------------------------------------------------------------*/
138 static inline flag extractFloat32Sign(float32 a)
140 return float32_val(a) >> 31;
143 /*----------------------------------------------------------------------------
144 | Returns the fraction bits of the double-precision floating-point value `a'.
145 *----------------------------------------------------------------------------*/
147 static inline uint64_t extractFloat64Frac(float64 a)
149 return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF);
152 /*----------------------------------------------------------------------------
153 | Returns the exponent bits of the double-precision floating-point value `a'.
154 *----------------------------------------------------------------------------*/
156 static inline int extractFloat64Exp(float64 a)
158 return (float64_val(a) >> 52) & 0x7FF;
161 /*----------------------------------------------------------------------------
162 | Returns the sign bit of the double-precision floating-point value `a'.
163 *----------------------------------------------------------------------------*/
165 static inline flag extractFloat64Sign(float64 a)
167 return float64_val(a) >> 63;
171 * Classify a floating point number. Everything above float_class_qnan
172 * is a NaN so cls >= float_class_qnan is any NaN.
175 typedef enum __attribute__ ((__packed__)) {
176 float_class_unclassified,
177 float_class_zero,
178 float_class_normal,
179 float_class_inf,
180 float_class_qnan, /* all NaNs from here */
181 float_class_snan,
182 } FloatClass;
184 /* Simple helpers for checking if, or what kind of, NaN we have */
185 static inline __attribute__((unused)) bool is_nan(FloatClass c)
187 return unlikely(c >= float_class_qnan);
190 static inline __attribute__((unused)) bool is_snan(FloatClass c)
192 return c == float_class_snan;
195 static inline __attribute__((unused)) bool is_qnan(FloatClass c)
197 return c == float_class_qnan;
201 * Structure holding all of the decomposed parts of a float. The
202 * exponent is unbiased and the fraction is normalized. All
203 * calculations are done with a 64 bit fraction and then rounded as
204 * appropriate for the final format.
206 * Thanks to the packed FloatClass a decent compiler should be able to
207 * fit the whole structure into registers and avoid using the stack
208 * for parameter passing.
211 typedef struct {
212 uint64_t frac;
213 int32_t exp;
214 FloatClass cls;
215 bool sign;
216 } FloatParts;
218 #define DECOMPOSED_BINARY_POINT (64 - 2)
219 #define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT)
220 #define DECOMPOSED_OVERFLOW_BIT (DECOMPOSED_IMPLICIT_BIT << 1)
222 /* Structure holding all of the relevant parameters for a format.
223 * exp_size: the size of the exponent field
224 * exp_bias: the offset applied to the exponent field
225 * exp_max: the maximum normalised exponent
226 * frac_size: the size of the fraction field
227 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
228 * The following are computed based the size of fraction
229 * frac_lsb: least significant bit of fraction
230 * frac_lsbm1: the bit below the least significant bit (for rounding)
231 * round_mask/roundeven_mask: masks used for rounding
232 * The following optional modifiers are available:
233 * arm_althp: handle ARM Alternative Half Precision
235 typedef struct {
236 int exp_size;
237 int exp_bias;
238 int exp_max;
239 int frac_size;
240 int frac_shift;
241 uint64_t frac_lsb;
242 uint64_t frac_lsbm1;
243 uint64_t round_mask;
244 uint64_t roundeven_mask;
245 bool arm_althp;
246 } FloatFmt;
248 /* Expand fields based on the size of exponent and fraction */
249 #define FLOAT_PARAMS(E, F) \
250 .exp_size = E, \
251 .exp_bias = ((1 << E) - 1) >> 1, \
252 .exp_max = (1 << E) - 1, \
253 .frac_size = F, \
254 .frac_shift = DECOMPOSED_BINARY_POINT - F, \
255 .frac_lsb = 1ull << (DECOMPOSED_BINARY_POINT - F), \
256 .frac_lsbm1 = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1), \
257 .round_mask = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1, \
258 .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1
260 static const FloatFmt float16_params = {
261 FLOAT_PARAMS(5, 10)
264 static const FloatFmt float16_params_ahp = {
265 FLOAT_PARAMS(5, 10),
266 .arm_althp = true
269 static const FloatFmt float32_params = {
270 FLOAT_PARAMS(8, 23)
273 static const FloatFmt float64_params = {
274 FLOAT_PARAMS(11, 52)
277 /* Unpack a float to parts, but do not canonicalize. */
278 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
280 const int sign_pos = fmt.frac_size + fmt.exp_size;
282 return (FloatParts) {
283 .cls = float_class_unclassified,
284 .sign = extract64(raw, sign_pos, 1),
285 .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
286 .frac = extract64(raw, 0, fmt.frac_size),
290 static inline FloatParts float16_unpack_raw(float16 f)
292 return unpack_raw(float16_params, f);
295 static inline FloatParts float32_unpack_raw(float32 f)
297 return unpack_raw(float32_params, f);
300 static inline FloatParts float64_unpack_raw(float64 f)
302 return unpack_raw(float64_params, f);
305 /* Pack a float from parts, but do not canonicalize. */
306 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
308 const int sign_pos = fmt.frac_size + fmt.exp_size;
309 uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
310 return deposit64(ret, sign_pos, 1, p.sign);
313 static inline float16 float16_pack_raw(FloatParts p)
315 return make_float16(pack_raw(float16_params, p));
318 static inline float32 float32_pack_raw(FloatParts p)
320 return make_float32(pack_raw(float32_params, p));
323 static inline float64 float64_pack_raw(FloatParts p)
325 return make_float64(pack_raw(float64_params, p));
328 /*----------------------------------------------------------------------------
329 | Functions and definitions to determine: (1) whether tininess for underflow
330 | is detected before or after rounding by default, (2) what (if anything)
331 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
332 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
333 | are propagated from function inputs to output. These details are target-
334 | specific.
335 *----------------------------------------------------------------------------*/
336 #include "softfloat-specialize.h"
338 /* Canonicalize EXP and FRAC, setting CLS. */
339 static FloatParts canonicalize(FloatParts part, const FloatFmt *parm,
340 float_status *status)
342 if (part.exp == parm->exp_max && !parm->arm_althp) {
343 if (part.frac == 0) {
344 part.cls = float_class_inf;
345 } else {
346 part.frac <<= parm->frac_shift;
347 part.cls = (parts_is_snan_frac(part.frac, status)
348 ? float_class_snan : float_class_qnan);
350 } else if (part.exp == 0) {
351 if (likely(part.frac == 0)) {
352 part.cls = float_class_zero;
353 } else if (status->flush_inputs_to_zero) {
354 float_raise(float_flag_input_denormal, status);
355 part.cls = float_class_zero;
356 part.frac = 0;
357 } else {
358 int shift = clz64(part.frac) - 1;
359 part.cls = float_class_normal;
360 part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
361 part.frac <<= shift;
363 } else {
364 part.cls = float_class_normal;
365 part.exp -= parm->exp_bias;
366 part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
368 return part;
371 /* Round and uncanonicalize a floating-point number by parts. There
372 * are FRAC_SHIFT bits that may require rounding at the bottom of the
373 * fraction; these bits will be removed. The exponent will be biased
374 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
377 static FloatParts round_canonical(FloatParts p, float_status *s,
378 const FloatFmt *parm)
380 const uint64_t frac_lsbm1 = parm->frac_lsbm1;
381 const uint64_t round_mask = parm->round_mask;
382 const uint64_t roundeven_mask = parm->roundeven_mask;
383 const int exp_max = parm->exp_max;
384 const int frac_shift = parm->frac_shift;
385 uint64_t frac, inc;
386 int exp, flags = 0;
387 bool overflow_norm;
389 frac = p.frac;
390 exp = p.exp;
392 switch (p.cls) {
393 case float_class_normal:
394 switch (s->float_rounding_mode) {
395 case float_round_nearest_even:
396 overflow_norm = false;
397 inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
398 break;
399 case float_round_ties_away:
400 overflow_norm = false;
401 inc = frac_lsbm1;
402 break;
403 case float_round_to_zero:
404 overflow_norm = true;
405 inc = 0;
406 break;
407 case float_round_up:
408 inc = p.sign ? 0 : round_mask;
409 overflow_norm = p.sign;
410 break;
411 case float_round_down:
412 inc = p.sign ? round_mask : 0;
413 overflow_norm = !p.sign;
414 break;
415 default:
416 g_assert_not_reached();
419 exp += parm->exp_bias;
420 if (likely(exp > 0)) {
421 if (frac & round_mask) {
422 flags |= float_flag_inexact;
423 frac += inc;
424 if (frac & DECOMPOSED_OVERFLOW_BIT) {
425 frac >>= 1;
426 exp++;
429 frac >>= frac_shift;
431 if (parm->arm_althp) {
432 /* ARM Alt HP eschews Inf and NaN for a wider exponent. */
433 if (unlikely(exp > exp_max)) {
434 /* Overflow. Return the maximum normal. */
435 flags = float_flag_invalid;
436 exp = exp_max;
437 frac = -1;
439 } else if (unlikely(exp >= exp_max)) {
440 flags |= float_flag_overflow | float_flag_inexact;
441 if (overflow_norm) {
442 exp = exp_max - 1;
443 frac = -1;
444 } else {
445 p.cls = float_class_inf;
446 goto do_inf;
449 } else if (s->flush_to_zero) {
450 flags |= float_flag_output_denormal;
451 p.cls = float_class_zero;
452 goto do_zero;
453 } else {
454 bool is_tiny = (s->float_detect_tininess
455 == float_tininess_before_rounding)
456 || (exp < 0)
457 || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);
459 shift64RightJamming(frac, 1 - exp, &frac);
460 if (frac & round_mask) {
461 /* Need to recompute round-to-even. */
462 if (s->float_rounding_mode == float_round_nearest_even) {
463 inc = ((frac & roundeven_mask) != frac_lsbm1
464 ? frac_lsbm1 : 0);
466 flags |= float_flag_inexact;
467 frac += inc;
470 exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
471 frac >>= frac_shift;
473 if (is_tiny && (flags & float_flag_inexact)) {
474 flags |= float_flag_underflow;
476 if (exp == 0 && frac == 0) {
477 p.cls = float_class_zero;
480 break;
482 case float_class_zero:
483 do_zero:
484 exp = 0;
485 frac = 0;
486 break;
488 case float_class_inf:
489 do_inf:
490 assert(!parm->arm_althp);
491 exp = exp_max;
492 frac = 0;
493 break;
495 case float_class_qnan:
496 case float_class_snan:
497 assert(!parm->arm_althp);
498 exp = exp_max;
499 frac >>= parm->frac_shift;
500 break;
502 default:
503 g_assert_not_reached();
506 float_raise(flags, s);
507 p.exp = exp;
508 p.frac = frac;
509 return p;
512 /* Explicit FloatFmt version */
513 static FloatParts float16a_unpack_canonical(float16 f, float_status *s,
514 const FloatFmt *params)
516 return canonicalize(float16_unpack_raw(f), params, s);
519 static FloatParts float16_unpack_canonical(float16 f, float_status *s)
521 return float16a_unpack_canonical(f, s, &float16_params);
524 static float16 float16a_round_pack_canonical(FloatParts p, float_status *s,
525 const FloatFmt *params)
527 return float16_pack_raw(round_canonical(p, s, params));
530 static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
532 return float16a_round_pack_canonical(p, s, &float16_params);
535 static FloatParts float32_unpack_canonical(float32 f, float_status *s)
537 return canonicalize(float32_unpack_raw(f), &float32_params, s);
540 static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
542 return float32_pack_raw(round_canonical(p, s, &float32_params));
545 static FloatParts float64_unpack_canonical(float64 f, float_status *s)
547 return canonicalize(float64_unpack_raw(f), &float64_params, s);
550 static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
552 return float64_pack_raw(round_canonical(p, s, &float64_params));
555 static FloatParts return_nan(FloatParts a, float_status *s)
557 switch (a.cls) {
558 case float_class_snan:
559 s->float_exception_flags |= float_flag_invalid;
560 a = parts_silence_nan(a, s);
561 /* fall through */
562 case float_class_qnan:
563 if (s->default_nan_mode) {
564 return parts_default_nan(s);
566 break;
568 default:
569 g_assert_not_reached();
571 return a;
574 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
576 if (is_snan(a.cls) || is_snan(b.cls)) {
577 s->float_exception_flags |= float_flag_invalid;
580 if (s->default_nan_mode) {
581 return parts_default_nan(s);
582 } else {
583 if (pickNaN(a.cls, b.cls,
584 a.frac > b.frac ||
585 (a.frac == b.frac && a.sign < b.sign))) {
586 a = b;
588 if (is_snan(a.cls)) {
589 return parts_silence_nan(a, s);
592 return a;
595 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
596 bool inf_zero, float_status *s)
598 int which;
600 if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
601 s->float_exception_flags |= float_flag_invalid;
604 which = pickNaNMulAdd(a.cls, b.cls, c.cls, inf_zero, s);
606 if (s->default_nan_mode) {
607 /* Note that this check is after pickNaNMulAdd so that function
608 * has an opportunity to set the Invalid flag.
610 which = 3;
613 switch (which) {
614 case 0:
615 break;
616 case 1:
617 a = b;
618 break;
619 case 2:
620 a = c;
621 break;
622 case 3:
623 return parts_default_nan(s);
624 default:
625 g_assert_not_reached();
628 if (is_snan(a.cls)) {
629 return parts_silence_nan(a, s);
631 return a;
635 * Returns the result of adding or subtracting the values of the
636 * floating-point values `a' and `b'. The operation is performed
637 * according to the IEC/IEEE Standard for Binary Floating-Point
638 * Arithmetic.
641 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
642 float_status *s)
644 bool a_sign = a.sign;
645 bool b_sign = b.sign ^ subtract;
647 if (a_sign != b_sign) {
648 /* Subtraction */
650 if (a.cls == float_class_normal && b.cls == float_class_normal) {
651 if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
652 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
653 a.frac = a.frac - b.frac;
654 } else {
655 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
656 a.frac = b.frac - a.frac;
657 a.exp = b.exp;
658 a_sign ^= 1;
661 if (a.frac == 0) {
662 a.cls = float_class_zero;
663 a.sign = s->float_rounding_mode == float_round_down;
664 } else {
665 int shift = clz64(a.frac) - 1;
666 a.frac = a.frac << shift;
667 a.exp = a.exp - shift;
668 a.sign = a_sign;
670 return a;
672 if (is_nan(a.cls) || is_nan(b.cls)) {
673 return pick_nan(a, b, s);
675 if (a.cls == float_class_inf) {
676 if (b.cls == float_class_inf) {
677 float_raise(float_flag_invalid, s);
678 return parts_default_nan(s);
680 return a;
682 if (a.cls == float_class_zero && b.cls == float_class_zero) {
683 a.sign = s->float_rounding_mode == float_round_down;
684 return a;
686 if (a.cls == float_class_zero || b.cls == float_class_inf) {
687 b.sign = a_sign ^ 1;
688 return b;
690 if (b.cls == float_class_zero) {
691 return a;
693 } else {
694 /* Addition */
695 if (a.cls == float_class_normal && b.cls == float_class_normal) {
696 if (a.exp > b.exp) {
697 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
698 } else if (a.exp < b.exp) {
699 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
700 a.exp = b.exp;
702 a.frac += b.frac;
703 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
704 shift64RightJamming(a.frac, 1, &a.frac);
705 a.exp += 1;
707 return a;
709 if (is_nan(a.cls) || is_nan(b.cls)) {
710 return pick_nan(a, b, s);
712 if (a.cls == float_class_inf || b.cls == float_class_zero) {
713 return a;
715 if (b.cls == float_class_inf || a.cls == float_class_zero) {
716 b.sign = b_sign;
717 return b;
720 g_assert_not_reached();
724 * Returns the result of adding or subtracting the floating-point
725 * values `a' and `b'. The operation is performed according to the
726 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
729 float16 __attribute__((flatten)) float16_add(float16 a, float16 b,
730 float_status *status)
732 FloatParts pa = float16_unpack_canonical(a, status);
733 FloatParts pb = float16_unpack_canonical(b, status);
734 FloatParts pr = addsub_floats(pa, pb, false, status);
736 return float16_round_pack_canonical(pr, status);
739 float32 __attribute__((flatten)) float32_add(float32 a, float32 b,
740 float_status *status)
742 FloatParts pa = float32_unpack_canonical(a, status);
743 FloatParts pb = float32_unpack_canonical(b, status);
744 FloatParts pr = addsub_floats(pa, pb, false, status);
746 return float32_round_pack_canonical(pr, status);
749 float64 __attribute__((flatten)) float64_add(float64 a, float64 b,
750 float_status *status)
752 FloatParts pa = float64_unpack_canonical(a, status);
753 FloatParts pb = float64_unpack_canonical(b, status);
754 FloatParts pr = addsub_floats(pa, pb, false, status);
756 return float64_round_pack_canonical(pr, status);
759 float16 __attribute__((flatten)) float16_sub(float16 a, float16 b,
760 float_status *status)
762 FloatParts pa = float16_unpack_canonical(a, status);
763 FloatParts pb = float16_unpack_canonical(b, status);
764 FloatParts pr = addsub_floats(pa, pb, true, status);
766 return float16_round_pack_canonical(pr, status);
769 float32 __attribute__((flatten)) float32_sub(float32 a, float32 b,
770 float_status *status)
772 FloatParts pa = float32_unpack_canonical(a, status);
773 FloatParts pb = float32_unpack_canonical(b, status);
774 FloatParts pr = addsub_floats(pa, pb, true, status);
776 return float32_round_pack_canonical(pr, status);
779 float64 __attribute__((flatten)) float64_sub(float64 a, float64 b,
780 float_status *status)
782 FloatParts pa = float64_unpack_canonical(a, status);
783 FloatParts pb = float64_unpack_canonical(b, status);
784 FloatParts pr = addsub_floats(pa, pb, true, status);
786 return float64_round_pack_canonical(pr, status);
790 * Returns the result of multiplying the floating-point values `a' and
791 * `b'. The operation is performed according to the IEC/IEEE Standard
792 * for Binary Floating-Point Arithmetic.
795 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
797 bool sign = a.sign ^ b.sign;
799 if (a.cls == float_class_normal && b.cls == float_class_normal) {
800 uint64_t hi, lo;
801 int exp = a.exp + b.exp;
803 mul64To128(a.frac, b.frac, &hi, &lo);
804 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
805 if (lo & DECOMPOSED_OVERFLOW_BIT) {
806 shift64RightJamming(lo, 1, &lo);
807 exp += 1;
810 /* Re-use a */
811 a.exp = exp;
812 a.sign = sign;
813 a.frac = lo;
814 return a;
816 /* handle all the NaN cases */
817 if (is_nan(a.cls) || is_nan(b.cls)) {
818 return pick_nan(a, b, s);
820 /* Inf * Zero == NaN */
821 if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
822 (a.cls == float_class_zero && b.cls == float_class_inf)) {
823 s->float_exception_flags |= float_flag_invalid;
824 return parts_default_nan(s);
826 /* Multiply by 0 or Inf */
827 if (a.cls == float_class_inf || a.cls == float_class_zero) {
828 a.sign = sign;
829 return a;
831 if (b.cls == float_class_inf || b.cls == float_class_zero) {
832 b.sign = sign;
833 return b;
835 g_assert_not_reached();
838 float16 __attribute__((flatten)) float16_mul(float16 a, float16 b,
839 float_status *status)
841 FloatParts pa = float16_unpack_canonical(a, status);
842 FloatParts pb = float16_unpack_canonical(b, status);
843 FloatParts pr = mul_floats(pa, pb, status);
845 return float16_round_pack_canonical(pr, status);
848 float32 __attribute__((flatten)) float32_mul(float32 a, float32 b,
849 float_status *status)
851 FloatParts pa = float32_unpack_canonical(a, status);
852 FloatParts pb = float32_unpack_canonical(b, status);
853 FloatParts pr = mul_floats(pa, pb, status);
855 return float32_round_pack_canonical(pr, status);
858 float64 __attribute__((flatten)) float64_mul(float64 a, float64 b,
859 float_status *status)
861 FloatParts pa = float64_unpack_canonical(a, status);
862 FloatParts pb = float64_unpack_canonical(b, status);
863 FloatParts pr = mul_floats(pa, pb, status);
865 return float64_round_pack_canonical(pr, status);
869 * Returns the result of multiplying the floating-point values `a' and
870 * `b' then adding 'c', with no intermediate rounding step after the
871 * multiplication. The operation is performed according to the
872 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
873 * The flags argument allows the caller to select negation of the
874 * addend, the intermediate product, or the final result. (The
875 * difference between this and having the caller do a separate
876 * negation is that negating externally will flip the sign bit on
877 * NaNs.)
880 static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
881 int flags, float_status *s)
883 bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
884 ((1 << float_class_inf) | (1 << float_class_zero));
885 bool p_sign;
886 bool sign_flip = flags & float_muladd_negate_result;
887 FloatClass p_class;
888 uint64_t hi, lo;
889 int p_exp;
891 /* It is implementation-defined whether the cases of (0,inf,qnan)
892 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
893 * they return if they do), so we have to hand this information
894 * off to the target-specific pick-a-NaN routine.
896 if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
897 return pick_nan_muladd(a, b, c, inf_zero, s);
900 if (inf_zero) {
901 s->float_exception_flags |= float_flag_invalid;
902 return parts_default_nan(s);
905 if (flags & float_muladd_negate_c) {
906 c.sign ^= 1;
909 p_sign = a.sign ^ b.sign;
911 if (flags & float_muladd_negate_product) {
912 p_sign ^= 1;
915 if (a.cls == float_class_inf || b.cls == float_class_inf) {
916 p_class = float_class_inf;
917 } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
918 p_class = float_class_zero;
919 } else {
920 p_class = float_class_normal;
923 if (c.cls == float_class_inf) {
924 if (p_class == float_class_inf && p_sign != c.sign) {
925 s->float_exception_flags |= float_flag_invalid;
926 return parts_default_nan(s);
927 } else {
928 a.cls = float_class_inf;
929 a.sign = c.sign ^ sign_flip;
930 return a;
934 if (p_class == float_class_inf) {
935 a.cls = float_class_inf;
936 a.sign = p_sign ^ sign_flip;
937 return a;
940 if (p_class == float_class_zero) {
941 if (c.cls == float_class_zero) {
942 if (p_sign != c.sign) {
943 p_sign = s->float_rounding_mode == float_round_down;
945 c.sign = p_sign;
946 } else if (flags & float_muladd_halve_result) {
947 c.exp -= 1;
949 c.sign ^= sign_flip;
950 return c;
953 /* a & b should be normals now... */
954 assert(a.cls == float_class_normal &&
955 b.cls == float_class_normal);
957 p_exp = a.exp + b.exp;
959 /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
960 * result.
962 mul64To128(a.frac, b.frac, &hi, &lo);
963 /* binary point now at bit 124 */
965 /* check for overflow */
966 if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
967 shift128RightJamming(hi, lo, 1, &hi, &lo);
968 p_exp += 1;
971 /* + add/sub */
972 if (c.cls == float_class_zero) {
973 /* move binary point back to 62 */
974 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
975 } else {
976 int exp_diff = p_exp - c.exp;
977 if (p_sign == c.sign) {
978 /* Addition */
979 if (exp_diff <= 0) {
980 shift128RightJamming(hi, lo,
981 DECOMPOSED_BINARY_POINT - exp_diff,
982 &hi, &lo);
983 lo += c.frac;
984 p_exp = c.exp;
985 } else {
986 uint64_t c_hi, c_lo;
987 /* shift c to the same binary point as the product (124) */
988 c_hi = c.frac >> 2;
989 c_lo = 0;
990 shift128RightJamming(c_hi, c_lo,
991 exp_diff,
992 &c_hi, &c_lo);
993 add128(hi, lo, c_hi, c_lo, &hi, &lo);
994 /* move binary point back to 62 */
995 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
998 if (lo & DECOMPOSED_OVERFLOW_BIT) {
999 shift64RightJamming(lo, 1, &lo);
1000 p_exp += 1;
1003 } else {
1004 /* Subtraction */
1005 uint64_t c_hi, c_lo;
1006 /* make C binary point match product at bit 124 */
1007 c_hi = c.frac >> 2;
1008 c_lo = 0;
1010 if (exp_diff <= 0) {
1011 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
1012 if (exp_diff == 0
1014 (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
1015 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1016 } else {
1017 sub128(c_hi, c_lo, hi, lo, &hi, &lo);
1018 p_sign ^= 1;
1019 p_exp = c.exp;
1021 } else {
1022 shift128RightJamming(c_hi, c_lo,
1023 exp_diff,
1024 &c_hi, &c_lo);
1025 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1028 if (hi == 0 && lo == 0) {
1029 a.cls = float_class_zero;
1030 a.sign = s->float_rounding_mode == float_round_down;
1031 a.sign ^= sign_flip;
1032 return a;
1033 } else {
1034 int shift;
1035 if (hi != 0) {
1036 shift = clz64(hi);
1037 } else {
1038 shift = clz64(lo) + 64;
1040 /* Normalizing to a binary point of 124 is the
1041 correct adjust for the exponent. However since we're
1042 shifting, we might as well put the binary point back
1043 at 62 where we really want it. Therefore shift as
1044 if we're leaving 1 bit at the top of the word, but
1045 adjust the exponent as if we're leaving 3 bits. */
1046 shift -= 1;
1047 if (shift >= 64) {
1048 lo = lo << (shift - 64);
1049 } else {
1050 hi = (hi << shift) | (lo >> (64 - shift));
1051 lo = hi | ((lo << shift) != 0);
1053 p_exp -= shift - 2;
1058 if (flags & float_muladd_halve_result) {
1059 p_exp -= 1;
1062 /* finally prepare our result */
1063 a.cls = float_class_normal;
1064 a.sign = p_sign ^ sign_flip;
1065 a.exp = p_exp;
1066 a.frac = lo;
1068 return a;
1071 float16 __attribute__((flatten)) float16_muladd(float16 a, float16 b, float16 c,
1072 int flags, float_status *status)
1074 FloatParts pa = float16_unpack_canonical(a, status);
1075 FloatParts pb = float16_unpack_canonical(b, status);
1076 FloatParts pc = float16_unpack_canonical(c, status);
1077 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1079 return float16_round_pack_canonical(pr, status);
1082 float32 __attribute__((flatten)) float32_muladd(float32 a, float32 b, float32 c,
1083 int flags, float_status *status)
1085 FloatParts pa = float32_unpack_canonical(a, status);
1086 FloatParts pb = float32_unpack_canonical(b, status);
1087 FloatParts pc = float32_unpack_canonical(c, status);
1088 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1090 return float32_round_pack_canonical(pr, status);
1093 float64 __attribute__((flatten)) float64_muladd(float64 a, float64 b, float64 c,
1094 int flags, float_status *status)
1096 FloatParts pa = float64_unpack_canonical(a, status);
1097 FloatParts pb = float64_unpack_canonical(b, status);
1098 FloatParts pc = float64_unpack_canonical(c, status);
1099 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1101 return float64_round_pack_canonical(pr, status);
1105 * Returns the result of dividing the floating-point value `a' by the
1106 * corresponding value `b'. The operation is performed according to
1107 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1110 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
1112 bool sign = a.sign ^ b.sign;
1114 if (a.cls == float_class_normal && b.cls == float_class_normal) {
1115 uint64_t temp_lo, temp_hi;
1116 int exp = a.exp - b.exp;
1117 if (a.frac < b.frac) {
1118 exp -= 1;
1119 shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1,
1120 &temp_hi, &temp_lo);
1121 } else {
1122 shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT,
1123 &temp_hi, &temp_lo);
1125 /* LSB of quot is set if inexact which roundandpack will use
1126 * to set flags. Yet again we re-use a for the result */
1127 a.frac = div128To64(temp_lo, temp_hi, b.frac);
1128 a.sign = sign;
1129 a.exp = exp;
1130 return a;
1132 /* handle all the NaN cases */
1133 if (is_nan(a.cls) || is_nan(b.cls)) {
1134 return pick_nan(a, b, s);
1136 /* 0/0 or Inf/Inf */
1137 if (a.cls == b.cls
1139 (a.cls == float_class_inf || a.cls == float_class_zero)) {
1140 s->float_exception_flags |= float_flag_invalid;
1141 return parts_default_nan(s);
1143 /* Inf / x or 0 / x */
1144 if (a.cls == float_class_inf || a.cls == float_class_zero) {
1145 a.sign = sign;
1146 return a;
1148 /* Div 0 => Inf */
1149 if (b.cls == float_class_zero) {
1150 s->float_exception_flags |= float_flag_divbyzero;
1151 a.cls = float_class_inf;
1152 a.sign = sign;
1153 return a;
1155 /* Div by Inf */
1156 if (b.cls == float_class_inf) {
1157 a.cls = float_class_zero;
1158 a.sign = sign;
1159 return a;
1161 g_assert_not_reached();
1164 float16 float16_div(float16 a, float16 b, float_status *status)
1166 FloatParts pa = float16_unpack_canonical(a, status);
1167 FloatParts pb = float16_unpack_canonical(b, status);
1168 FloatParts pr = div_floats(pa, pb, status);
1170 return float16_round_pack_canonical(pr, status);
1173 float32 float32_div(float32 a, float32 b, float_status *status)
1175 FloatParts pa = float32_unpack_canonical(a, status);
1176 FloatParts pb = float32_unpack_canonical(b, status);
1177 FloatParts pr = div_floats(pa, pb, status);
1179 return float32_round_pack_canonical(pr, status);
1182 float64 float64_div(float64 a, float64 b, float_status *status)
1184 FloatParts pa = float64_unpack_canonical(a, status);
1185 FloatParts pb = float64_unpack_canonical(b, status);
1186 FloatParts pr = div_floats(pa, pb, status);
1188 return float64_round_pack_canonical(pr, status);
1192 * Float to Float conversions
1194 * Returns the result of converting one float format to another. The
1195 * conversion is performed according to the IEC/IEEE Standard for
1196 * Binary Floating-Point Arithmetic.
1198 * The float_to_float helper only needs to take care of raising
1199 * invalid exceptions and handling the conversion on NaNs.
1202 static FloatParts float_to_float(FloatParts a, const FloatFmt *dstf,
1203 float_status *s)
1205 if (dstf->arm_althp) {
1206 switch (a.cls) {
1207 case float_class_qnan:
1208 case float_class_snan:
1209 /* There is no NaN in the destination format. Raise Invalid
1210 * and return a zero with the sign of the input NaN.
1212 s->float_exception_flags |= float_flag_invalid;
1213 a.cls = float_class_zero;
1214 a.frac = 0;
1215 a.exp = 0;
1216 break;
1218 case float_class_inf:
1219 /* There is no Inf in the destination format. Raise Invalid
1220 * and return the maximum normal with the correct sign.
1222 s->float_exception_flags |= float_flag_invalid;
1223 a.cls = float_class_normal;
1224 a.exp = dstf->exp_max;
1225 a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift;
1226 break;
1228 default:
1229 break;
1231 } else if (is_nan(a.cls)) {
1232 if (is_snan(a.cls)) {
1233 s->float_exception_flags |= float_flag_invalid;
1234 a = parts_silence_nan(a, s);
1236 if (s->default_nan_mode) {
1237 return parts_default_nan(s);
1240 return a;
1243 float32 float16_to_float32(float16 a, bool ieee, float_status *s)
1245 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1246 FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1247 FloatParts pr = float_to_float(p, &float32_params, s);
1248 return float32_round_pack_canonical(pr, s);
1251 float64 float16_to_float64(float16 a, bool ieee, float_status *s)
1253 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1254 FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1255 FloatParts pr = float_to_float(p, &float64_params, s);
1256 return float64_round_pack_canonical(pr, s);
1259 float16 float32_to_float16(float32 a, bool ieee, float_status *s)
1261 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1262 FloatParts p = float32_unpack_canonical(a, s);
1263 FloatParts pr = float_to_float(p, fmt16, s);
1264 return float16a_round_pack_canonical(pr, s, fmt16);
1267 float64 float32_to_float64(float32 a, float_status *s)
1269 FloatParts p = float32_unpack_canonical(a, s);
1270 FloatParts pr = float_to_float(p, &float64_params, s);
1271 return float64_round_pack_canonical(pr, s);
1274 float16 float64_to_float16(float64 a, bool ieee, float_status *s)
1276 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1277 FloatParts p = float64_unpack_canonical(a, s);
1278 FloatParts pr = float_to_float(p, fmt16, s);
1279 return float16a_round_pack_canonical(pr, s, fmt16);
1282 float32 float64_to_float32(float64 a, float_status *s)
1284 FloatParts p = float64_unpack_canonical(a, s);
1285 FloatParts pr = float_to_float(p, &float32_params, s);
1286 return float32_round_pack_canonical(pr, s);
1290 * Rounds the floating-point value `a' to an integer, and returns the
1291 * result as a floating-point value. The operation is performed
1292 * according to the IEC/IEEE Standard for Binary Floating-Point
1293 * Arithmetic.
1296 static FloatParts round_to_int(FloatParts a, int rmode,
1297 int scale, float_status *s)
1299 switch (a.cls) {
1300 case float_class_qnan:
1301 case float_class_snan:
1302 return return_nan(a, s);
1304 case float_class_zero:
1305 case float_class_inf:
1306 /* already "integral" */
1307 break;
1309 case float_class_normal:
1310 scale = MIN(MAX(scale, -0x10000), 0x10000);
1311 a.exp += scale;
1313 if (a.exp >= DECOMPOSED_BINARY_POINT) {
1314 /* already integral */
1315 break;
1317 if (a.exp < 0) {
1318 bool one;
1319 /* all fractional */
1320 s->float_exception_flags |= float_flag_inexact;
1321 switch (rmode) {
1322 case float_round_nearest_even:
1323 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
1324 break;
1325 case float_round_ties_away:
1326 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
1327 break;
1328 case float_round_to_zero:
1329 one = false;
1330 break;
1331 case float_round_up:
1332 one = !a.sign;
1333 break;
1334 case float_round_down:
1335 one = a.sign;
1336 break;
1337 default:
1338 g_assert_not_reached();
1341 if (one) {
1342 a.frac = DECOMPOSED_IMPLICIT_BIT;
1343 a.exp = 0;
1344 } else {
1345 a.cls = float_class_zero;
1347 } else {
1348 uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
1349 uint64_t frac_lsbm1 = frac_lsb >> 1;
1350 uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
1351 uint64_t rnd_mask = rnd_even_mask >> 1;
1352 uint64_t inc;
1354 switch (rmode) {
1355 case float_round_nearest_even:
1356 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
1357 break;
1358 case float_round_ties_away:
1359 inc = frac_lsbm1;
1360 break;
1361 case float_round_to_zero:
1362 inc = 0;
1363 break;
1364 case float_round_up:
1365 inc = a.sign ? 0 : rnd_mask;
1366 break;
1367 case float_round_down:
1368 inc = a.sign ? rnd_mask : 0;
1369 break;
1370 default:
1371 g_assert_not_reached();
1374 if (a.frac & rnd_mask) {
1375 s->float_exception_flags |= float_flag_inexact;
1376 a.frac += inc;
1377 a.frac &= ~rnd_mask;
1378 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
1379 a.frac >>= 1;
1380 a.exp++;
1384 break;
1385 default:
1386 g_assert_not_reached();
1388 return a;
1391 float16 float16_round_to_int(float16 a, float_status *s)
1393 FloatParts pa = float16_unpack_canonical(a, s);
1394 FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
1395 return float16_round_pack_canonical(pr, s);
1398 float32 float32_round_to_int(float32 a, float_status *s)
1400 FloatParts pa = float32_unpack_canonical(a, s);
1401 FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
1402 return float32_round_pack_canonical(pr, s);
1405 float64 float64_round_to_int(float64 a, float_status *s)
1407 FloatParts pa = float64_unpack_canonical(a, s);
1408 FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
1409 return float64_round_pack_canonical(pr, s);
1412 float64 float64_trunc_to_int(float64 a, float_status *s)
1414 FloatParts pa = float64_unpack_canonical(a, s);
1415 FloatParts pr = round_to_int(pa, float_round_to_zero, 0, s);
1416 return float64_round_pack_canonical(pr, s);
1420 * Returns the result of converting the floating-point value `a' to
1421 * the two's complement integer format. The conversion is performed
1422 * according to the IEC/IEEE Standard for Binary Floating-Point
1423 * Arithmetic---which means in particular that the conversion is
1424 * rounded according to the current rounding mode. If `a' is a NaN,
1425 * the largest positive integer is returned. Otherwise, if the
1426 * conversion overflows, the largest integer with the same sign as `a'
1427 * is returned.
1430 static int64_t round_to_int_and_pack(FloatParts in, int rmode, int scale,
1431 int64_t min, int64_t max,
1432 float_status *s)
1434 uint64_t r;
1435 int orig_flags = get_float_exception_flags(s);
1436 FloatParts p = round_to_int(in, rmode, scale, s);
1438 switch (p.cls) {
1439 case float_class_snan:
1440 case float_class_qnan:
1441 s->float_exception_flags = orig_flags | float_flag_invalid;
1442 return max;
1443 case float_class_inf:
1444 s->float_exception_flags = orig_flags | float_flag_invalid;
1445 return p.sign ? min : max;
1446 case float_class_zero:
1447 return 0;
1448 case float_class_normal:
1449 if (p.exp < DECOMPOSED_BINARY_POINT) {
1450 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1451 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1452 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1453 } else {
1454 r = UINT64_MAX;
1456 if (p.sign) {
1457 if (r <= -(uint64_t) min) {
1458 return -r;
1459 } else {
1460 s->float_exception_flags = orig_flags | float_flag_invalid;
1461 return min;
1463 } else {
1464 if (r <= max) {
1465 return r;
1466 } else {
1467 s->float_exception_flags = orig_flags | float_flag_invalid;
1468 return max;
1471 default:
1472 g_assert_not_reached();
1476 int16_t float16_to_int16_scalbn(float16 a, int rmode, int scale,
1477 float_status *s)
1479 return round_to_int_and_pack(float16_unpack_canonical(a, s),
1480 rmode, scale, INT16_MIN, INT16_MAX, s);
1483 int32_t float16_to_int32_scalbn(float16 a, int rmode, int scale,
1484 float_status *s)
1486 return round_to_int_and_pack(float16_unpack_canonical(a, s),
1487 rmode, scale, INT32_MIN, INT32_MAX, s);
1490 int64_t float16_to_int64_scalbn(float16 a, int rmode, int scale,
1491 float_status *s)
1493 return round_to_int_and_pack(float16_unpack_canonical(a, s),
1494 rmode, scale, INT64_MIN, INT64_MAX, s);
1497 int16_t float32_to_int16_scalbn(float32 a, int rmode, int scale,
1498 float_status *s)
1500 return round_to_int_and_pack(float32_unpack_canonical(a, s),
1501 rmode, scale, INT16_MIN, INT16_MAX, s);
1504 int32_t float32_to_int32_scalbn(float32 a, int rmode, int scale,
1505 float_status *s)
1507 return round_to_int_and_pack(float32_unpack_canonical(a, s),
1508 rmode, scale, INT32_MIN, INT32_MAX, s);
1511 int64_t float32_to_int64_scalbn(float32 a, int rmode, int scale,
1512 float_status *s)
1514 return round_to_int_and_pack(float32_unpack_canonical(a, s),
1515 rmode, scale, INT64_MIN, INT64_MAX, s);
1518 int16_t float64_to_int16_scalbn(float64 a, int rmode, int scale,
1519 float_status *s)
1521 return round_to_int_and_pack(float64_unpack_canonical(a, s),
1522 rmode, scale, INT16_MIN, INT16_MAX, s);
1525 int32_t float64_to_int32_scalbn(float64 a, int rmode, int scale,
1526 float_status *s)
1528 return round_to_int_and_pack(float64_unpack_canonical(a, s),
1529 rmode, scale, INT32_MIN, INT32_MAX, s);
1532 int64_t float64_to_int64_scalbn(float64 a, int rmode, int scale,
1533 float_status *s)
1535 return round_to_int_and_pack(float64_unpack_canonical(a, s),
1536 rmode, scale, INT64_MIN, INT64_MAX, s);
1539 int16_t float16_to_int16(float16 a, float_status *s)
1541 return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
1544 int32_t float16_to_int32(float16 a, float_status *s)
1546 return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
1549 int64_t float16_to_int64(float16 a, float_status *s)
1551 return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
1554 int16_t float32_to_int16(float32 a, float_status *s)
1556 return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
1559 int32_t float32_to_int32(float32 a, float_status *s)
1561 return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
1564 int64_t float32_to_int64(float32 a, float_status *s)
1566 return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
1569 int16_t float64_to_int16(float64 a, float_status *s)
1571 return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
1574 int32_t float64_to_int32(float64 a, float_status *s)
1576 return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
1579 int64_t float64_to_int64(float64 a, float_status *s)
1581 return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
1584 int16_t float16_to_int16_round_to_zero(float16 a, float_status *s)
1586 return float16_to_int16_scalbn(a, float_round_to_zero, 0, s);
1589 int32_t float16_to_int32_round_to_zero(float16 a, float_status *s)
1591 return float16_to_int32_scalbn(a, float_round_to_zero, 0, s);
1594 int64_t float16_to_int64_round_to_zero(float16 a, float_status *s)
1596 return float16_to_int64_scalbn(a, float_round_to_zero, 0, s);
1599 int16_t float32_to_int16_round_to_zero(float32 a, float_status *s)
1601 return float32_to_int16_scalbn(a, float_round_to_zero, 0, s);
1604 int32_t float32_to_int32_round_to_zero(float32 a, float_status *s)
1606 return float32_to_int32_scalbn(a, float_round_to_zero, 0, s);
1609 int64_t float32_to_int64_round_to_zero(float32 a, float_status *s)
1611 return float32_to_int64_scalbn(a, float_round_to_zero, 0, s);
1614 int16_t float64_to_int16_round_to_zero(float64 a, float_status *s)
1616 return float64_to_int16_scalbn(a, float_round_to_zero, 0, s);
1619 int32_t float64_to_int32_round_to_zero(float64 a, float_status *s)
1621 return float64_to_int32_scalbn(a, float_round_to_zero, 0, s);
1624 int64_t float64_to_int64_round_to_zero(float64 a, float_status *s)
1626 return float64_to_int64_scalbn(a, float_round_to_zero, 0, s);
1630 * Returns the result of converting the floating-point value `a' to
1631 * the unsigned integer format. The conversion is performed according
1632 * to the IEC/IEEE Standard for Binary Floating-Point
1633 * Arithmetic---which means in particular that the conversion is
1634 * rounded according to the current rounding mode. If `a' is a NaN,
1635 * the largest unsigned integer is returned. Otherwise, if the
1636 * conversion overflows, the largest unsigned integer is returned. If
1637 * the 'a' is negative, the result is rounded and zero is returned;
1638 * values that do not round to zero will raise the inexact exception
1639 * flag.
1642 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, int scale,
1643 uint64_t max, float_status *s)
1645 int orig_flags = get_float_exception_flags(s);
1646 FloatParts p = round_to_int(in, rmode, scale, s);
1647 uint64_t r;
1649 switch (p.cls) {
1650 case float_class_snan:
1651 case float_class_qnan:
1652 s->float_exception_flags = orig_flags | float_flag_invalid;
1653 return max;
1654 case float_class_inf:
1655 s->float_exception_flags = orig_flags | float_flag_invalid;
1656 return p.sign ? 0 : max;
1657 case float_class_zero:
1658 return 0;
1659 case float_class_normal:
1660 if (p.sign) {
1661 s->float_exception_flags = orig_flags | float_flag_invalid;
1662 return 0;
1665 if (p.exp < DECOMPOSED_BINARY_POINT) {
1666 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1667 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1668 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1669 } else {
1670 s->float_exception_flags = orig_flags | float_flag_invalid;
1671 return max;
1674 /* For uint64 this will never trip, but if p.exp is too large
1675 * to shift a decomposed fraction we shall have exited via the
1676 * 3rd leg above.
1678 if (r > max) {
1679 s->float_exception_flags = orig_flags | float_flag_invalid;
1680 return max;
1682 return r;
1683 default:
1684 g_assert_not_reached();
1688 uint16_t float16_to_uint16_scalbn(float16 a, int rmode, int scale,
1689 float_status *s)
1691 return round_to_uint_and_pack(float16_unpack_canonical(a, s),
1692 rmode, scale, UINT16_MAX, s);
1695 uint32_t float16_to_uint32_scalbn(float16 a, int rmode, int scale,
1696 float_status *s)
1698 return round_to_uint_and_pack(float16_unpack_canonical(a, s),
1699 rmode, scale, UINT32_MAX, s);
1702 uint64_t float16_to_uint64_scalbn(float16 a, int rmode, int scale,
1703 float_status *s)
1705 return round_to_uint_and_pack(float16_unpack_canonical(a, s),
1706 rmode, scale, UINT64_MAX, s);
1709 uint16_t float32_to_uint16_scalbn(float32 a, int rmode, int scale,
1710 float_status *s)
1712 return round_to_uint_and_pack(float32_unpack_canonical(a, s),
1713 rmode, scale, UINT16_MAX, s);
1716 uint32_t float32_to_uint32_scalbn(float32 a, int rmode, int scale,
1717 float_status *s)
1719 return round_to_uint_and_pack(float32_unpack_canonical(a, s),
1720 rmode, scale, UINT32_MAX, s);
1723 uint64_t float32_to_uint64_scalbn(float32 a, int rmode, int scale,
1724 float_status *s)
1726 return round_to_uint_and_pack(float32_unpack_canonical(a, s),
1727 rmode, scale, UINT64_MAX, s);
1730 uint16_t float64_to_uint16_scalbn(float64 a, int rmode, int scale,
1731 float_status *s)
1733 return round_to_uint_and_pack(float64_unpack_canonical(a, s),
1734 rmode, scale, UINT16_MAX, s);
1737 uint32_t float64_to_uint32_scalbn(float64 a, int rmode, int scale,
1738 float_status *s)
1740 return round_to_uint_and_pack(float64_unpack_canonical(a, s),
1741 rmode, scale, UINT32_MAX, s);
1744 uint64_t float64_to_uint64_scalbn(float64 a, int rmode, int scale,
1745 float_status *s)
1747 return round_to_uint_and_pack(float64_unpack_canonical(a, s),
1748 rmode, scale, UINT64_MAX, s);
1751 uint16_t float16_to_uint16(float16 a, float_status *s)
1753 return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
1756 uint32_t float16_to_uint32(float16 a, float_status *s)
1758 return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
1761 uint64_t float16_to_uint64(float16 a, float_status *s)
1763 return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
1766 uint16_t float32_to_uint16(float32 a, float_status *s)
1768 return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
1771 uint32_t float32_to_uint32(float32 a, float_status *s)
1773 return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
1776 uint64_t float32_to_uint64(float32 a, float_status *s)
1778 return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
1781 uint16_t float64_to_uint16(float64 a, float_status *s)
1783 return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
1786 uint32_t float64_to_uint32(float64 a, float_status *s)
1788 return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
1791 uint64_t float64_to_uint64(float64 a, float_status *s)
1793 return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
1796 uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s)
1798 return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
1801 uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s)
1803 return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
1806 uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s)
1808 return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
1811 uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s)
1813 return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s);
1816 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s)
1818 return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s);
1821 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s)
1823 return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s);
1826 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s)
1828 return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s);
1831 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s)
1833 return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s);
1836 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s)
1838 return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s);
1842 * Integer to float conversions
1844 * Returns the result of converting the two's complement integer `a'
1845 * to the floating-point format. The conversion is performed according
1846 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1849 static FloatParts int_to_float(int64_t a, int scale, float_status *status)
1851 FloatParts r = { .sign = false };
1853 if (a == 0) {
1854 r.cls = float_class_zero;
1855 } else {
1856 uint64_t f = a;
1857 int shift;
1859 r.cls = float_class_normal;
1860 if (a < 0) {
1861 f = -f;
1862 r.sign = true;
1864 shift = clz64(f) - 1;
1865 scale = MIN(MAX(scale, -0x10000), 0x10000);
1867 r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
1868 r.frac = (shift < 0 ? DECOMPOSED_IMPLICIT_BIT : f << shift);
1871 return r;
1874 float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status)
1876 FloatParts pa = int_to_float(a, scale, status);
1877 return float16_round_pack_canonical(pa, status);
1880 float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status)
1882 return int64_to_float16_scalbn(a, scale, status);
1885 float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status)
1887 return int64_to_float16_scalbn(a, scale, status);
1890 float16 int64_to_float16(int64_t a, float_status *status)
1892 return int64_to_float16_scalbn(a, 0, status);
1895 float16 int32_to_float16(int32_t a, float_status *status)
1897 return int64_to_float16_scalbn(a, 0, status);
1900 float16 int16_to_float16(int16_t a, float_status *status)
1902 return int64_to_float16_scalbn(a, 0, status);
1905 float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status)
1907 FloatParts pa = int_to_float(a, scale, status);
1908 return float32_round_pack_canonical(pa, status);
1911 float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status)
1913 return int64_to_float32_scalbn(a, scale, status);
1916 float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status)
1918 return int64_to_float32_scalbn(a, scale, status);
1921 float32 int64_to_float32(int64_t a, float_status *status)
1923 return int64_to_float32_scalbn(a, 0, status);
1926 float32 int32_to_float32(int32_t a, float_status *status)
1928 return int64_to_float32_scalbn(a, 0, status);
1931 float32 int16_to_float32(int16_t a, float_status *status)
1933 return int64_to_float32_scalbn(a, 0, status);
1936 float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status)
1938 FloatParts pa = int_to_float(a, scale, status);
1939 return float64_round_pack_canonical(pa, status);
1942 float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status)
1944 return int64_to_float64_scalbn(a, scale, status);
1947 float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status)
1949 return int64_to_float64_scalbn(a, scale, status);
1952 float64 int64_to_float64(int64_t a, float_status *status)
1954 return int64_to_float64_scalbn(a, 0, status);
1957 float64 int32_to_float64(int32_t a, float_status *status)
1959 return int64_to_float64_scalbn(a, 0, status);
1962 float64 int16_to_float64(int16_t a, float_status *status)
1964 return int64_to_float64_scalbn(a, 0, status);
1969 * Unsigned Integer to float conversions
1971 * Returns the result of converting the unsigned integer `a' to the
1972 * floating-point format. The conversion is performed according to the
1973 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1976 static FloatParts uint_to_float(uint64_t a, int scale, float_status *status)
1978 FloatParts r = { .sign = false };
1980 if (a == 0) {
1981 r.cls = float_class_zero;
1982 } else {
1983 scale = MIN(MAX(scale, -0x10000), 0x10000);
1984 r.cls = float_class_normal;
1985 if ((int64_t)a < 0) {
1986 r.exp = DECOMPOSED_BINARY_POINT + 1 + scale;
1987 shift64RightJamming(a, 1, &a);
1988 r.frac = a;
1989 } else {
1990 int shift = clz64(a) - 1;
1991 r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
1992 r.frac = a << shift;
1996 return r;
1999 float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status)
2001 FloatParts pa = uint_to_float(a, scale, status);
2002 return float16_round_pack_canonical(pa, status);
2005 float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status)
2007 return uint64_to_float16_scalbn(a, scale, status);
2010 float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status)
2012 return uint64_to_float16_scalbn(a, scale, status);
2015 float16 uint64_to_float16(uint64_t a, float_status *status)
2017 return uint64_to_float16_scalbn(a, 0, status);
2020 float16 uint32_to_float16(uint32_t a, float_status *status)
2022 return uint64_to_float16_scalbn(a, 0, status);
2025 float16 uint16_to_float16(uint16_t a, float_status *status)
2027 return uint64_to_float16_scalbn(a, 0, status);
2030 float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status)
2032 FloatParts pa = uint_to_float(a, scale, status);
2033 return float32_round_pack_canonical(pa, status);
2036 float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status)
2038 return uint64_to_float32_scalbn(a, scale, status);
2041 float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status)
2043 return uint64_to_float32_scalbn(a, scale, status);
2046 float32 uint64_to_float32(uint64_t a, float_status *status)
2048 return uint64_to_float32_scalbn(a, 0, status);
2051 float32 uint32_to_float32(uint32_t a, float_status *status)
2053 return uint64_to_float32_scalbn(a, 0, status);
2056 float32 uint16_to_float32(uint16_t a, float_status *status)
2058 return uint64_to_float32_scalbn(a, 0, status);
2061 float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status)
2063 FloatParts pa = uint_to_float(a, scale, status);
2064 return float64_round_pack_canonical(pa, status);
2067 float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status)
2069 return uint64_to_float64_scalbn(a, scale, status);
2072 float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status)
2074 return uint64_to_float64_scalbn(a, scale, status);
2077 float64 uint64_to_float64(uint64_t a, float_status *status)
2079 return uint64_to_float64_scalbn(a, 0, status);
2082 float64 uint32_to_float64(uint32_t a, float_status *status)
2084 return uint64_to_float64_scalbn(a, 0, status);
2087 float64 uint16_to_float64(uint16_t a, float_status *status)
2089 return uint64_to_float64_scalbn(a, 0, status);
2092 /* Float Min/Max */
2093 /* min() and max() functions. These can't be implemented as
2094 * 'compare and pick one input' because that would mishandle
2095 * NaNs and +0 vs -0.
2097 * minnum() and maxnum() functions. These are similar to the min()
2098 * and max() functions but if one of the arguments is a QNaN and
2099 * the other is numerical then the numerical argument is returned.
2100 * SNaNs will get quietened before being returned.
2101 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
2102 * and maxNum() operations. min() and max() are the typical min/max
2103 * semantics provided by many CPUs which predate that specification.
2105 * minnummag() and maxnummag() functions correspond to minNumMag()
2106 * and minNumMag() from the IEEE-754 2008.
2108 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin,
2109 bool ieee, bool ismag, float_status *s)
2111 if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
2112 if (ieee) {
2113 /* Takes two floating-point values `a' and `b', one of
2114 * which is a NaN, and returns the appropriate NaN
2115 * result. If either `a' or `b' is a signaling NaN,
2116 * the invalid exception is raised.
2118 if (is_snan(a.cls) || is_snan(b.cls)) {
2119 return pick_nan(a, b, s);
2120 } else if (is_nan(a.cls) && !is_nan(b.cls)) {
2121 return b;
2122 } else if (is_nan(b.cls) && !is_nan(a.cls)) {
2123 return a;
2126 return pick_nan(a, b, s);
2127 } else {
2128 int a_exp, b_exp;
2130 switch (a.cls) {
2131 case float_class_normal:
2132 a_exp = a.exp;
2133 break;
2134 case float_class_inf:
2135 a_exp = INT_MAX;
2136 break;
2137 case float_class_zero:
2138 a_exp = INT_MIN;
2139 break;
2140 default:
2141 g_assert_not_reached();
2142 break;
2144 switch (b.cls) {
2145 case float_class_normal:
2146 b_exp = b.exp;
2147 break;
2148 case float_class_inf:
2149 b_exp = INT_MAX;
2150 break;
2151 case float_class_zero:
2152 b_exp = INT_MIN;
2153 break;
2154 default:
2155 g_assert_not_reached();
2156 break;
2159 if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
2160 bool a_less = a_exp < b_exp;
2161 if (a_exp == b_exp) {
2162 a_less = a.frac < b.frac;
2164 return a_less ^ ismin ? b : a;
2167 if (a.sign == b.sign) {
2168 bool a_less = a_exp < b_exp;
2169 if (a_exp == b_exp) {
2170 a_less = a.frac < b.frac;
2172 return a.sign ^ a_less ^ ismin ? b : a;
2173 } else {
2174 return a.sign ^ ismin ? b : a;
2179 #define MINMAX(sz, name, ismin, isiee, ismag) \
2180 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \
2181 float_status *s) \
2183 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
2184 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
2185 FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \
2187 return float ## sz ## _round_pack_canonical(pr, s); \
2190 MINMAX(16, min, true, false, false)
2191 MINMAX(16, minnum, true, true, false)
2192 MINMAX(16, minnummag, true, true, true)
2193 MINMAX(16, max, false, false, false)
2194 MINMAX(16, maxnum, false, true, false)
2195 MINMAX(16, maxnummag, false, true, true)
2197 MINMAX(32, min, true, false, false)
2198 MINMAX(32, minnum, true, true, false)
2199 MINMAX(32, minnummag, true, true, true)
2200 MINMAX(32, max, false, false, false)
2201 MINMAX(32, maxnum, false, true, false)
2202 MINMAX(32, maxnummag, false, true, true)
2204 MINMAX(64, min, true, false, false)
2205 MINMAX(64, minnum, true, true, false)
2206 MINMAX(64, minnummag, true, true, true)
2207 MINMAX(64, max, false, false, false)
2208 MINMAX(64, maxnum, false, true, false)
2209 MINMAX(64, maxnummag, false, true, true)
2211 #undef MINMAX
2213 /* Floating point compare */
2214 static int compare_floats(FloatParts a, FloatParts b, bool is_quiet,
2215 float_status *s)
2217 if (is_nan(a.cls) || is_nan(b.cls)) {
2218 if (!is_quiet ||
2219 a.cls == float_class_snan ||
2220 b.cls == float_class_snan) {
2221 s->float_exception_flags |= float_flag_invalid;
2223 return float_relation_unordered;
2226 if (a.cls == float_class_zero) {
2227 if (b.cls == float_class_zero) {
2228 return float_relation_equal;
2230 return b.sign ? float_relation_greater : float_relation_less;
2231 } else if (b.cls == float_class_zero) {
2232 return a.sign ? float_relation_less : float_relation_greater;
2235 /* The only really important thing about infinity is its sign. If
2236 * both are infinities the sign marks the smallest of the two.
2238 if (a.cls == float_class_inf) {
2239 if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
2240 return float_relation_equal;
2242 return a.sign ? float_relation_less : float_relation_greater;
2243 } else if (b.cls == float_class_inf) {
2244 return b.sign ? float_relation_greater : float_relation_less;
2247 if (a.sign != b.sign) {
2248 return a.sign ? float_relation_less : float_relation_greater;
2251 if (a.exp == b.exp) {
2252 if (a.frac == b.frac) {
2253 return float_relation_equal;
2255 if (a.sign) {
2256 return a.frac > b.frac ?
2257 float_relation_less : float_relation_greater;
2258 } else {
2259 return a.frac > b.frac ?
2260 float_relation_greater : float_relation_less;
2262 } else {
2263 if (a.sign) {
2264 return a.exp > b.exp ? float_relation_less : float_relation_greater;
2265 } else {
2266 return a.exp > b.exp ? float_relation_greater : float_relation_less;
2271 #define COMPARE(sz) \
2272 int float ## sz ## _compare(float ## sz a, float ## sz b, \
2273 float_status *s) \
2275 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
2276 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
2277 return compare_floats(pa, pb, false, s); \
2279 int float ## sz ## _compare_quiet(float ## sz a, float ## sz b, \
2280 float_status *s) \
2282 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
2283 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
2284 return compare_floats(pa, pb, true, s); \
2287 COMPARE(16)
2288 COMPARE(32)
2289 COMPARE(64)
2291 #undef COMPARE
2293 /* Multiply A by 2 raised to the power N. */
2294 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s)
2296 if (unlikely(is_nan(a.cls))) {
2297 return return_nan(a, s);
2299 if (a.cls == float_class_normal) {
2300 /* The largest float type (even though not supported by FloatParts)
2301 * is float128, which has a 15 bit exponent. Bounding N to 16 bits
2302 * still allows rounding to infinity, without allowing overflow
2303 * within the int32_t that backs FloatParts.exp.
2305 n = MIN(MAX(n, -0x10000), 0x10000);
2306 a.exp += n;
2308 return a;
2311 float16 float16_scalbn(float16 a, int n, float_status *status)
2313 FloatParts pa = float16_unpack_canonical(a, status);
2314 FloatParts pr = scalbn_decomposed(pa, n, status);
2315 return float16_round_pack_canonical(pr, status);
2318 float32 float32_scalbn(float32 a, int n, float_status *status)
2320 FloatParts pa = float32_unpack_canonical(a, status);
2321 FloatParts pr = scalbn_decomposed(pa, n, status);
2322 return float32_round_pack_canonical(pr, status);
2325 float64 float64_scalbn(float64 a, int n, float_status *status)
2327 FloatParts pa = float64_unpack_canonical(a, status);
2328 FloatParts pr = scalbn_decomposed(pa, n, status);
2329 return float64_round_pack_canonical(pr, status);
2333 * Square Root
2335 * The old softfloat code did an approximation step before zeroing in
2336 * on the final result. However for simpleness we just compute the
2337 * square root by iterating down from the implicit bit to enough extra
2338 * bits to ensure we get a correctly rounded result.
2340 * This does mean however the calculation is slower than before,
2341 * especially for 64 bit floats.
2344 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p)
2346 uint64_t a_frac, r_frac, s_frac;
2347 int bit, last_bit;
2349 if (is_nan(a.cls)) {
2350 return return_nan(a, s);
2352 if (a.cls == float_class_zero) {
2353 return a; /* sqrt(+-0) = +-0 */
2355 if (a.sign) {
2356 s->float_exception_flags |= float_flag_invalid;
2357 return parts_default_nan(s);
2359 if (a.cls == float_class_inf) {
2360 return a; /* sqrt(+inf) = +inf */
2363 assert(a.cls == float_class_normal);
2365 /* We need two overflow bits at the top. Adding room for that is a
2366 * right shift. If the exponent is odd, we can discard the low bit
2367 * by multiplying the fraction by 2; that's a left shift. Combine
2368 * those and we shift right if the exponent is even.
2370 a_frac = a.frac;
2371 if (!(a.exp & 1)) {
2372 a_frac >>= 1;
2374 a.exp >>= 1;
2376 /* Bit-by-bit computation of sqrt. */
2377 r_frac = 0;
2378 s_frac = 0;
2380 /* Iterate from implicit bit down to the 3 extra bits to compute a
2381 * properly rounded result. Remember we've inserted one more bit
2382 * at the top, so these positions are one less.
2384 bit = DECOMPOSED_BINARY_POINT - 1;
2385 last_bit = MAX(p->frac_shift - 4, 0);
2386 do {
2387 uint64_t q = 1ULL << bit;
2388 uint64_t t_frac = s_frac + q;
2389 if (t_frac <= a_frac) {
2390 s_frac = t_frac + q;
2391 a_frac -= t_frac;
2392 r_frac += q;
2394 a_frac <<= 1;
2395 } while (--bit >= last_bit);
2397 /* Undo the right shift done above. If there is any remaining
2398 * fraction, the result is inexact. Set the sticky bit.
2400 a.frac = (r_frac << 1) + (a_frac != 0);
2402 return a;
2405 float16 __attribute__((flatten)) float16_sqrt(float16 a, float_status *status)
2407 FloatParts pa = float16_unpack_canonical(a, status);
2408 FloatParts pr = sqrt_float(pa, status, &float16_params);
2409 return float16_round_pack_canonical(pr, status);
2412 float32 __attribute__((flatten)) float32_sqrt(float32 a, float_status *status)
2414 FloatParts pa = float32_unpack_canonical(a, status);
2415 FloatParts pr = sqrt_float(pa, status, &float32_params);
2416 return float32_round_pack_canonical(pr, status);
2419 float64 __attribute__((flatten)) float64_sqrt(float64 a, float_status *status)
2421 FloatParts pa = float64_unpack_canonical(a, status);
2422 FloatParts pr = sqrt_float(pa, status, &float64_params);
2423 return float64_round_pack_canonical(pr, status);
2426 /*----------------------------------------------------------------------------
2427 | The pattern for a default generated NaN.
2428 *----------------------------------------------------------------------------*/
2430 float16 float16_default_nan(float_status *status)
2432 FloatParts p = parts_default_nan(status);
2433 p.frac >>= float16_params.frac_shift;
2434 return float16_pack_raw(p);
2437 float32 float32_default_nan(float_status *status)
2439 FloatParts p = parts_default_nan(status);
2440 p.frac >>= float32_params.frac_shift;
2441 return float32_pack_raw(p);
2444 float64 float64_default_nan(float_status *status)
2446 FloatParts p = parts_default_nan(status);
2447 p.frac >>= float64_params.frac_shift;
2448 return float64_pack_raw(p);
2451 float128 float128_default_nan(float_status *status)
2453 FloatParts p = parts_default_nan(status);
2454 float128 r;
2456 /* Extrapolate from the choices made by parts_default_nan to fill
2457 * in the quad-floating format. If the low bit is set, assume we
2458 * want to set all non-snan bits.
2460 r.low = -(p.frac & 1);
2461 r.high = p.frac >> (DECOMPOSED_BINARY_POINT - 48);
2462 r.high |= LIT64(0x7FFF000000000000);
2463 r.high |= (uint64_t)p.sign << 63;
2465 return r;
2468 /*----------------------------------------------------------------------------
2469 | Returns a quiet NaN from a signalling NaN for the floating point value `a'.
2470 *----------------------------------------------------------------------------*/
2472 float16 float16_silence_nan(float16 a, float_status *status)
2474 FloatParts p = float16_unpack_raw(a);
2475 p.frac <<= float16_params.frac_shift;
2476 p = parts_silence_nan(p, status);
2477 p.frac >>= float16_params.frac_shift;
2478 return float16_pack_raw(p);
2481 float32 float32_silence_nan(float32 a, float_status *status)
2483 FloatParts p = float32_unpack_raw(a);
2484 p.frac <<= float32_params.frac_shift;
2485 p = parts_silence_nan(p, status);
2486 p.frac >>= float32_params.frac_shift;
2487 return float32_pack_raw(p);
2490 float64 float64_silence_nan(float64 a, float_status *status)
2492 FloatParts p = float64_unpack_raw(a);
2493 p.frac <<= float64_params.frac_shift;
2494 p = parts_silence_nan(p, status);
2495 p.frac >>= float64_params.frac_shift;
2496 return float64_pack_raw(p);
2499 /*----------------------------------------------------------------------------
2500 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
2501 | and 7, and returns the properly rounded 32-bit integer corresponding to the
2502 | input. If `zSign' is 1, the input is negated before being converted to an
2503 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
2504 | is simply rounded to an integer, with the inexact exception raised if the
2505 | input cannot be represented exactly as an integer. However, if the fixed-
2506 | point input is too large, the invalid exception is raised and the largest
2507 | positive or negative integer is returned.
2508 *----------------------------------------------------------------------------*/
2510 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
2512 int8_t roundingMode;
2513 flag roundNearestEven;
2514 int8_t roundIncrement, roundBits;
2515 int32_t z;
2517 roundingMode = status->float_rounding_mode;
2518 roundNearestEven = ( roundingMode == float_round_nearest_even );
2519 switch (roundingMode) {
2520 case float_round_nearest_even:
2521 case float_round_ties_away:
2522 roundIncrement = 0x40;
2523 break;
2524 case float_round_to_zero:
2525 roundIncrement = 0;
2526 break;
2527 case float_round_up:
2528 roundIncrement = zSign ? 0 : 0x7f;
2529 break;
2530 case float_round_down:
2531 roundIncrement = zSign ? 0x7f : 0;
2532 break;
2533 default:
2534 abort();
2536 roundBits = absZ & 0x7F;
2537 absZ = ( absZ + roundIncrement )>>7;
2538 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2539 z = absZ;
2540 if ( zSign ) z = - z;
2541 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
2542 float_raise(float_flag_invalid, status);
2543 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2545 if (roundBits) {
2546 status->float_exception_flags |= float_flag_inexact;
2548 return z;
2552 /*----------------------------------------------------------------------------
2553 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2554 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2555 | and returns the properly rounded 64-bit integer corresponding to the input.
2556 | If `zSign' is 1, the input is negated before being converted to an integer.
2557 | Ordinarily, the fixed-point input is simply rounded to an integer, with
2558 | the inexact exception raised if the input cannot be represented exactly as
2559 | an integer. However, if the fixed-point input is too large, the invalid
2560 | exception is raised and the largest positive or negative integer is
2561 | returned.
2562 *----------------------------------------------------------------------------*/
2564 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
2565 float_status *status)
2567 int8_t roundingMode;
2568 flag roundNearestEven, increment;
2569 int64_t z;
2571 roundingMode = status->float_rounding_mode;
2572 roundNearestEven = ( roundingMode == float_round_nearest_even );
2573 switch (roundingMode) {
2574 case float_round_nearest_even:
2575 case float_round_ties_away:
2576 increment = ((int64_t) absZ1 < 0);
2577 break;
2578 case float_round_to_zero:
2579 increment = 0;
2580 break;
2581 case float_round_up:
2582 increment = !zSign && absZ1;
2583 break;
2584 case float_round_down:
2585 increment = zSign && absZ1;
2586 break;
2587 default:
2588 abort();
2590 if ( increment ) {
2591 ++absZ0;
2592 if ( absZ0 == 0 ) goto overflow;
2593 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
2595 z = absZ0;
2596 if ( zSign ) z = - z;
2597 if ( z && ( ( z < 0 ) ^ zSign ) ) {
2598 overflow:
2599 float_raise(float_flag_invalid, status);
2600 return
2601 zSign ? (int64_t) LIT64( 0x8000000000000000 )
2602 : LIT64( 0x7FFFFFFFFFFFFFFF );
2604 if (absZ1) {
2605 status->float_exception_flags |= float_flag_inexact;
2607 return z;
2611 /*----------------------------------------------------------------------------
2612 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2613 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2614 | and returns the properly rounded 64-bit unsigned integer corresponding to the
2615 | input. Ordinarily, the fixed-point input is simply rounded to an integer,
2616 | with the inexact exception raised if the input cannot be represented exactly
2617 | as an integer. However, if the fixed-point input is too large, the invalid
2618 | exception is raised and the largest unsigned integer is returned.
2619 *----------------------------------------------------------------------------*/
2621 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
2622 uint64_t absZ1, float_status *status)
2624 int8_t roundingMode;
2625 flag roundNearestEven, increment;
2627 roundingMode = status->float_rounding_mode;
2628 roundNearestEven = (roundingMode == float_round_nearest_even);
2629 switch (roundingMode) {
2630 case float_round_nearest_even:
2631 case float_round_ties_away:
2632 increment = ((int64_t)absZ1 < 0);
2633 break;
2634 case float_round_to_zero:
2635 increment = 0;
2636 break;
2637 case float_round_up:
2638 increment = !zSign && absZ1;
2639 break;
2640 case float_round_down:
2641 increment = zSign && absZ1;
2642 break;
2643 default:
2644 abort();
2646 if (increment) {
2647 ++absZ0;
2648 if (absZ0 == 0) {
2649 float_raise(float_flag_invalid, status);
2650 return LIT64(0xFFFFFFFFFFFFFFFF);
2652 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
2655 if (zSign && absZ0) {
2656 float_raise(float_flag_invalid, status);
2657 return 0;
2660 if (absZ1) {
2661 status->float_exception_flags |= float_flag_inexact;
2663 return absZ0;
2666 /*----------------------------------------------------------------------------
2667 | If `a' is denormal and we are in flush-to-zero mode then set the
2668 | input-denormal exception and return zero. Otherwise just return the value.
2669 *----------------------------------------------------------------------------*/
2670 float32 float32_squash_input_denormal(float32 a, float_status *status)
2672 if (status->flush_inputs_to_zero) {
2673 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
2674 float_raise(float_flag_input_denormal, status);
2675 return make_float32(float32_val(a) & 0x80000000);
2678 return a;
2681 /*----------------------------------------------------------------------------
2682 | Normalizes the subnormal single-precision floating-point value represented
2683 | by the denormalized significand `aSig'. The normalized exponent and
2684 | significand are stored at the locations pointed to by `zExpPtr' and
2685 | `zSigPtr', respectively.
2686 *----------------------------------------------------------------------------*/
2688 static void
2689 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
2691 int8_t shiftCount;
2693 shiftCount = countLeadingZeros32( aSig ) - 8;
2694 *zSigPtr = aSig<<shiftCount;
2695 *zExpPtr = 1 - shiftCount;
2699 /*----------------------------------------------------------------------------
2700 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2701 | and significand `zSig', and returns the proper single-precision floating-
2702 | point value corresponding to the abstract input. Ordinarily, the abstract
2703 | value is simply rounded and packed into the single-precision format, with
2704 | the inexact exception raised if the abstract input cannot be represented
2705 | exactly. However, if the abstract value is too large, the overflow and
2706 | inexact exceptions are raised and an infinity or maximal finite value is
2707 | returned. If the abstract value is too small, the input value is rounded to
2708 | a subnormal number, and the underflow and inexact exceptions are raised if
2709 | the abstract input cannot be represented exactly as a subnormal single-
2710 | precision floating-point number.
2711 | The input significand `zSig' has its binary point between bits 30
2712 | and 29, which is 7 bits to the left of the usual location. This shifted
2713 | significand must be normalized or smaller. If `zSig' is not normalized,
2714 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2715 | and it must not require rounding. In the usual case that `zSig' is
2716 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2717 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2718 | Binary Floating-Point Arithmetic.
2719 *----------------------------------------------------------------------------*/
2721 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
2722 float_status *status)
2724 int8_t roundingMode;
2725 flag roundNearestEven;
2726 int8_t roundIncrement, roundBits;
2727 flag isTiny;
2729 roundingMode = status->float_rounding_mode;
2730 roundNearestEven = ( roundingMode == float_round_nearest_even );
2731 switch (roundingMode) {
2732 case float_round_nearest_even:
2733 case float_round_ties_away:
2734 roundIncrement = 0x40;
2735 break;
2736 case float_round_to_zero:
2737 roundIncrement = 0;
2738 break;
2739 case float_round_up:
2740 roundIncrement = zSign ? 0 : 0x7f;
2741 break;
2742 case float_round_down:
2743 roundIncrement = zSign ? 0x7f : 0;
2744 break;
2745 default:
2746 abort();
2747 break;
2749 roundBits = zSig & 0x7F;
2750 if ( 0xFD <= (uint16_t) zExp ) {
2751 if ( ( 0xFD < zExp )
2752 || ( ( zExp == 0xFD )
2753 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
2755 float_raise(float_flag_overflow | float_flag_inexact, status);
2756 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
2758 if ( zExp < 0 ) {
2759 if (status->flush_to_zero) {
2760 float_raise(float_flag_output_denormal, status);
2761 return packFloat32(zSign, 0, 0);
2763 isTiny =
2764 (status->float_detect_tininess
2765 == float_tininess_before_rounding)
2766 || ( zExp < -1 )
2767 || ( zSig + roundIncrement < 0x80000000 );
2768 shift32RightJamming( zSig, - zExp, &zSig );
2769 zExp = 0;
2770 roundBits = zSig & 0x7F;
2771 if (isTiny && roundBits) {
2772 float_raise(float_flag_underflow, status);
2776 if (roundBits) {
2777 status->float_exception_flags |= float_flag_inexact;
2779 zSig = ( zSig + roundIncrement )>>7;
2780 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2781 if ( zSig == 0 ) zExp = 0;
2782 return packFloat32( zSign, zExp, zSig );
2786 /*----------------------------------------------------------------------------
2787 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2788 | and significand `zSig', and returns the proper single-precision floating-
2789 | point value corresponding to the abstract input. This routine is just like
2790 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
2791 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2792 | floating-point exponent.
2793 *----------------------------------------------------------------------------*/
2795 static float32
2796 normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
2797 float_status *status)
2799 int8_t shiftCount;
2801 shiftCount = countLeadingZeros32( zSig ) - 1;
2802 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
2803 status);
2807 /*----------------------------------------------------------------------------
2808 | If `a' is denormal and we are in flush-to-zero mode then set the
2809 | input-denormal exception and return zero. Otherwise just return the value.
2810 *----------------------------------------------------------------------------*/
2811 float64 float64_squash_input_denormal(float64 a, float_status *status)
2813 if (status->flush_inputs_to_zero) {
2814 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
2815 float_raise(float_flag_input_denormal, status);
2816 return make_float64(float64_val(a) & (1ULL << 63));
2819 return a;
2822 /*----------------------------------------------------------------------------
2823 | Normalizes the subnormal double-precision floating-point value represented
2824 | by the denormalized significand `aSig'. The normalized exponent and
2825 | significand are stored at the locations pointed to by `zExpPtr' and
2826 | `zSigPtr', respectively.
2827 *----------------------------------------------------------------------------*/
2829 static void
2830 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
2832 int8_t shiftCount;
2834 shiftCount = countLeadingZeros64( aSig ) - 11;
2835 *zSigPtr = aSig<<shiftCount;
2836 *zExpPtr = 1 - shiftCount;
2840 /*----------------------------------------------------------------------------
2841 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2842 | double-precision floating-point value, returning the result. After being
2843 | shifted into the proper positions, the three fields are simply added
2844 | together to form the result. This means that any integer portion of `zSig'
2845 | will be added into the exponent. Since a properly normalized significand
2846 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2847 | than the desired result exponent whenever `zSig' is a complete, normalized
2848 | significand.
2849 *----------------------------------------------------------------------------*/
2851 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
2854 return make_float64(
2855 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
2859 /*----------------------------------------------------------------------------
2860 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2861 | and significand `zSig', and returns the proper double-precision floating-
2862 | point value corresponding to the abstract input. Ordinarily, the abstract
2863 | value is simply rounded and packed into the double-precision format, with
2864 | the inexact exception raised if the abstract input cannot be represented
2865 | exactly. However, if the abstract value is too large, the overflow and
2866 | inexact exceptions are raised and an infinity or maximal finite value is
2867 | returned. If the abstract value is too small, the input value is rounded to
2868 | a subnormal number, and the underflow and inexact exceptions are raised if
2869 | the abstract input cannot be represented exactly as a subnormal double-
2870 | precision floating-point number.
2871 | The input significand `zSig' has its binary point between bits 62
2872 | and 61, which is 10 bits to the left of the usual location. This shifted
2873 | significand must be normalized or smaller. If `zSig' is not normalized,
2874 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2875 | and it must not require rounding. In the usual case that `zSig' is
2876 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2877 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2878 | Binary Floating-Point Arithmetic.
2879 *----------------------------------------------------------------------------*/
2881 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2882 float_status *status)
2884 int8_t roundingMode;
2885 flag roundNearestEven;
2886 int roundIncrement, roundBits;
2887 flag isTiny;
2889 roundingMode = status->float_rounding_mode;
2890 roundNearestEven = ( roundingMode == float_round_nearest_even );
2891 switch (roundingMode) {
2892 case float_round_nearest_even:
2893 case float_round_ties_away:
2894 roundIncrement = 0x200;
2895 break;
2896 case float_round_to_zero:
2897 roundIncrement = 0;
2898 break;
2899 case float_round_up:
2900 roundIncrement = zSign ? 0 : 0x3ff;
2901 break;
2902 case float_round_down:
2903 roundIncrement = zSign ? 0x3ff : 0;
2904 break;
2905 case float_round_to_odd:
2906 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2907 break;
2908 default:
2909 abort();
2911 roundBits = zSig & 0x3FF;
2912 if ( 0x7FD <= (uint16_t) zExp ) {
2913 if ( ( 0x7FD < zExp )
2914 || ( ( zExp == 0x7FD )
2915 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
2917 bool overflow_to_inf = roundingMode != float_round_to_odd &&
2918 roundIncrement != 0;
2919 float_raise(float_flag_overflow | float_flag_inexact, status);
2920 return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
2922 if ( zExp < 0 ) {
2923 if (status->flush_to_zero) {
2924 float_raise(float_flag_output_denormal, status);
2925 return packFloat64(zSign, 0, 0);
2927 isTiny =
2928 (status->float_detect_tininess
2929 == float_tininess_before_rounding)
2930 || ( zExp < -1 )
2931 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
2932 shift64RightJamming( zSig, - zExp, &zSig );
2933 zExp = 0;
2934 roundBits = zSig & 0x3FF;
2935 if (isTiny && roundBits) {
2936 float_raise(float_flag_underflow, status);
2938 if (roundingMode == float_round_to_odd) {
2940 * For round-to-odd case, the roundIncrement depends on
2941 * zSig which just changed.
2943 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2947 if (roundBits) {
2948 status->float_exception_flags |= float_flag_inexact;
2950 zSig = ( zSig + roundIncrement )>>10;
2951 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
2952 if ( zSig == 0 ) zExp = 0;
2953 return packFloat64( zSign, zExp, zSig );
2957 /*----------------------------------------------------------------------------
2958 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2959 | and significand `zSig', and returns the proper double-precision floating-
2960 | point value corresponding to the abstract input. This routine is just like
2961 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
2962 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2963 | floating-point exponent.
2964 *----------------------------------------------------------------------------*/
2966 static float64
2967 normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2968 float_status *status)
2970 int8_t shiftCount;
2972 shiftCount = countLeadingZeros64( zSig ) - 1;
2973 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
2974 status);
2978 /*----------------------------------------------------------------------------
2979 | Normalizes the subnormal extended double-precision floating-point value
2980 | represented by the denormalized significand `aSig'. The normalized exponent
2981 | and significand are stored at the locations pointed to by `zExpPtr' and
2982 | `zSigPtr', respectively.
2983 *----------------------------------------------------------------------------*/
2985 void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
2986 uint64_t *zSigPtr)
2988 int8_t shiftCount;
2990 shiftCount = countLeadingZeros64( aSig );
2991 *zSigPtr = aSig<<shiftCount;
2992 *zExpPtr = 1 - shiftCount;
2995 /*----------------------------------------------------------------------------
2996 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2997 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
2998 | and returns the proper extended double-precision floating-point value
2999 | corresponding to the abstract input. Ordinarily, the abstract value is
3000 | rounded and packed into the extended double-precision format, with the
3001 | inexact exception raised if the abstract input cannot be represented
3002 | exactly. However, if the abstract value is too large, the overflow and
3003 | inexact exceptions are raised and an infinity or maximal finite value is
3004 | returned. If the abstract value is too small, the input value is rounded to
3005 | a subnormal number, and the underflow and inexact exceptions are raised if
3006 | the abstract input cannot be represented exactly as a subnormal extended
3007 | double-precision floating-point number.
3008 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
3009 | number of bits as single or double precision, respectively. Otherwise, the
3010 | result is rounded to the full precision of the extended double-precision
3011 | format.
3012 | The input significand must be normalized or smaller. If the input
3013 | significand is not normalized, `zExp' must be 0; in that case, the result
3014 | returned is a subnormal number, and it must not require rounding. The
3015 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
3016 | Floating-Point Arithmetic.
3017 *----------------------------------------------------------------------------*/
3019 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
3020 int32_t zExp, uint64_t zSig0, uint64_t zSig1,
3021 float_status *status)
3023 int8_t roundingMode;
3024 flag roundNearestEven, increment, isTiny;
3025 int64_t roundIncrement, roundMask, roundBits;
3027 roundingMode = status->float_rounding_mode;
3028 roundNearestEven = ( roundingMode == float_round_nearest_even );
3029 if ( roundingPrecision == 80 ) goto precision80;
3030 if ( roundingPrecision == 64 ) {
3031 roundIncrement = LIT64( 0x0000000000000400 );
3032 roundMask = LIT64( 0x00000000000007FF );
3034 else if ( roundingPrecision == 32 ) {
3035 roundIncrement = LIT64( 0x0000008000000000 );
3036 roundMask = LIT64( 0x000000FFFFFFFFFF );
3038 else {
3039 goto precision80;
3041 zSig0 |= ( zSig1 != 0 );
3042 switch (roundingMode) {
3043 case float_round_nearest_even:
3044 case float_round_ties_away:
3045 break;
3046 case float_round_to_zero:
3047 roundIncrement = 0;
3048 break;
3049 case float_round_up:
3050 roundIncrement = zSign ? 0 : roundMask;
3051 break;
3052 case float_round_down:
3053 roundIncrement = zSign ? roundMask : 0;
3054 break;
3055 default:
3056 abort();
3058 roundBits = zSig0 & roundMask;
3059 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
3060 if ( ( 0x7FFE < zExp )
3061 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
3063 goto overflow;
3065 if ( zExp <= 0 ) {
3066 if (status->flush_to_zero) {
3067 float_raise(float_flag_output_denormal, status);
3068 return packFloatx80(zSign, 0, 0);
3070 isTiny =
3071 (status->float_detect_tininess
3072 == float_tininess_before_rounding)
3073 || ( zExp < 0 )
3074 || ( zSig0 <= zSig0 + roundIncrement );
3075 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
3076 zExp = 0;
3077 roundBits = zSig0 & roundMask;
3078 if (isTiny && roundBits) {
3079 float_raise(float_flag_underflow, status);
3081 if (roundBits) {
3082 status->float_exception_flags |= float_flag_inexact;
3084 zSig0 += roundIncrement;
3085 if ( (int64_t) zSig0 < 0 ) zExp = 1;
3086 roundIncrement = roundMask + 1;
3087 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
3088 roundMask |= roundIncrement;
3090 zSig0 &= ~ roundMask;
3091 return packFloatx80( zSign, zExp, zSig0 );
3094 if (roundBits) {
3095 status->float_exception_flags |= float_flag_inexact;
3097 zSig0 += roundIncrement;
3098 if ( zSig0 < roundIncrement ) {
3099 ++zExp;
3100 zSig0 = LIT64( 0x8000000000000000 );
3102 roundIncrement = roundMask + 1;
3103 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
3104 roundMask |= roundIncrement;
3106 zSig0 &= ~ roundMask;
3107 if ( zSig0 == 0 ) zExp = 0;
3108 return packFloatx80( zSign, zExp, zSig0 );
3109 precision80:
3110 switch (roundingMode) {
3111 case float_round_nearest_even:
3112 case float_round_ties_away:
3113 increment = ((int64_t)zSig1 < 0);
3114 break;
3115 case float_round_to_zero:
3116 increment = 0;
3117 break;
3118 case float_round_up:
3119 increment = !zSign && zSig1;
3120 break;
3121 case float_round_down:
3122 increment = zSign && zSig1;
3123 break;
3124 default:
3125 abort();
3127 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
3128 if ( ( 0x7FFE < zExp )
3129 || ( ( zExp == 0x7FFE )
3130 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
3131 && increment
3134 roundMask = 0;
3135 overflow:
3136 float_raise(float_flag_overflow | float_flag_inexact, status);
3137 if ( ( roundingMode == float_round_to_zero )
3138 || ( zSign && ( roundingMode == float_round_up ) )
3139 || ( ! zSign && ( roundingMode == float_round_down ) )
3141 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
3143 return packFloatx80(zSign,
3144 floatx80_infinity_high,
3145 floatx80_infinity_low);
3147 if ( zExp <= 0 ) {
3148 isTiny =
3149 (status->float_detect_tininess
3150 == float_tininess_before_rounding)
3151 || ( zExp < 0 )
3152 || ! increment
3153 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
3154 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
3155 zExp = 0;
3156 if (isTiny && zSig1) {
3157 float_raise(float_flag_underflow, status);
3159 if (zSig1) {
3160 status->float_exception_flags |= float_flag_inexact;
3162 switch (roundingMode) {
3163 case float_round_nearest_even:
3164 case float_round_ties_away:
3165 increment = ((int64_t)zSig1 < 0);
3166 break;
3167 case float_round_to_zero:
3168 increment = 0;
3169 break;
3170 case float_round_up:
3171 increment = !zSign && zSig1;
3172 break;
3173 case float_round_down:
3174 increment = zSign && zSig1;
3175 break;
3176 default:
3177 abort();
3179 if ( increment ) {
3180 ++zSig0;
3181 zSig0 &=
3182 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
3183 if ( (int64_t) zSig0 < 0 ) zExp = 1;
3185 return packFloatx80( zSign, zExp, zSig0 );
3188 if (zSig1) {
3189 status->float_exception_flags |= float_flag_inexact;
3191 if ( increment ) {
3192 ++zSig0;
3193 if ( zSig0 == 0 ) {
3194 ++zExp;
3195 zSig0 = LIT64( 0x8000000000000000 );
3197 else {
3198 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
3201 else {
3202 if ( zSig0 == 0 ) zExp = 0;
3204 return packFloatx80( zSign, zExp, zSig0 );
3208 /*----------------------------------------------------------------------------
3209 | Takes an abstract floating-point value having sign `zSign', exponent
3210 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
3211 | and returns the proper extended double-precision floating-point value
3212 | corresponding to the abstract input. This routine is just like
3213 | `roundAndPackFloatx80' except that the input significand does not have to be
3214 | normalized.
3215 *----------------------------------------------------------------------------*/
3217 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
3218 flag zSign, int32_t zExp,
3219 uint64_t zSig0, uint64_t zSig1,
3220 float_status *status)
3222 int8_t shiftCount;
3224 if ( zSig0 == 0 ) {
3225 zSig0 = zSig1;
3226 zSig1 = 0;
3227 zExp -= 64;
3229 shiftCount = countLeadingZeros64( zSig0 );
3230 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3231 zExp -= shiftCount;
3232 return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
3233 zSig0, zSig1, status);
3237 /*----------------------------------------------------------------------------
3238 | Returns the least-significant 64 fraction bits of the quadruple-precision
3239 | floating-point value `a'.
3240 *----------------------------------------------------------------------------*/
3242 static inline uint64_t extractFloat128Frac1( float128 a )
3245 return a.low;
3249 /*----------------------------------------------------------------------------
3250 | Returns the most-significant 48 fraction bits of the quadruple-precision
3251 | floating-point value `a'.
3252 *----------------------------------------------------------------------------*/
3254 static inline uint64_t extractFloat128Frac0( float128 a )
3257 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
3261 /*----------------------------------------------------------------------------
3262 | Returns the exponent bits of the quadruple-precision floating-point value
3263 | `a'.
3264 *----------------------------------------------------------------------------*/
3266 static inline int32_t extractFloat128Exp( float128 a )
3269 return ( a.high>>48 ) & 0x7FFF;
3273 /*----------------------------------------------------------------------------
3274 | Returns the sign bit of the quadruple-precision floating-point value `a'.
3275 *----------------------------------------------------------------------------*/
3277 static inline flag extractFloat128Sign( float128 a )
3280 return a.high>>63;
3284 /*----------------------------------------------------------------------------
3285 | Normalizes the subnormal quadruple-precision floating-point value
3286 | represented by the denormalized significand formed by the concatenation of
3287 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
3288 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
3289 | significand are stored at the location pointed to by `zSig0Ptr', and the
3290 | least significant 64 bits of the normalized significand are stored at the
3291 | location pointed to by `zSig1Ptr'.
3292 *----------------------------------------------------------------------------*/
3294 static void
3295 normalizeFloat128Subnormal(
3296 uint64_t aSig0,
3297 uint64_t aSig1,
3298 int32_t *zExpPtr,
3299 uint64_t *zSig0Ptr,
3300 uint64_t *zSig1Ptr
3303 int8_t shiftCount;
3305 if ( aSig0 == 0 ) {
3306 shiftCount = countLeadingZeros64( aSig1 ) - 15;
3307 if ( shiftCount < 0 ) {
3308 *zSig0Ptr = aSig1>>( - shiftCount );
3309 *zSig1Ptr = aSig1<<( shiftCount & 63 );
3311 else {
3312 *zSig0Ptr = aSig1<<shiftCount;
3313 *zSig1Ptr = 0;
3315 *zExpPtr = - shiftCount - 63;
3317 else {
3318 shiftCount = countLeadingZeros64( aSig0 ) - 15;
3319 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
3320 *zExpPtr = 1 - shiftCount;
3325 /*----------------------------------------------------------------------------
3326 | Packs the sign `zSign', the exponent `zExp', and the significand formed
3327 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
3328 | floating-point value, returning the result. After being shifted into the
3329 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
3330 | added together to form the most significant 32 bits of the result. This
3331 | means that any integer portion of `zSig0' will be added into the exponent.
3332 | Since a properly normalized significand will have an integer portion equal
3333 | to 1, the `zExp' input should be 1 less than the desired result exponent
3334 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
3335 | significand.
3336 *----------------------------------------------------------------------------*/
3338 static inline float128
3339 packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
3341 float128 z;
3343 z.low = zSig1;
3344 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
3345 return z;
3349 /*----------------------------------------------------------------------------
3350 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3351 | and extended significand formed by the concatenation of `zSig0', `zSig1',
3352 | and `zSig2', and returns the proper quadruple-precision floating-point value
3353 | corresponding to the abstract input. Ordinarily, the abstract value is
3354 | simply rounded and packed into the quadruple-precision format, with the
3355 | inexact exception raised if the abstract input cannot be represented
3356 | exactly. However, if the abstract value is too large, the overflow and
3357 | inexact exceptions are raised and an infinity or maximal finite value is
3358 | returned. If the abstract value is too small, the input value is rounded to
3359 | a subnormal number, and the underflow and inexact exceptions are raised if
3360 | the abstract input cannot be represented exactly as a subnormal quadruple-
3361 | precision floating-point number.
3362 | The input significand must be normalized or smaller. If the input
3363 | significand is not normalized, `zExp' must be 0; in that case, the result
3364 | returned is a subnormal number, and it must not require rounding. In the
3365 | usual case that the input significand is normalized, `zExp' must be 1 less
3366 | than the ``true'' floating-point exponent. The handling of underflow and
3367 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3368 *----------------------------------------------------------------------------*/
3370 static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
3371 uint64_t zSig0, uint64_t zSig1,
3372 uint64_t zSig2, float_status *status)
3374 int8_t roundingMode;
3375 flag roundNearestEven, increment, isTiny;
3377 roundingMode = status->float_rounding_mode;
3378 roundNearestEven = ( roundingMode == float_round_nearest_even );
3379 switch (roundingMode) {
3380 case float_round_nearest_even:
3381 case float_round_ties_away:
3382 increment = ((int64_t)zSig2 < 0);
3383 break;
3384 case float_round_to_zero:
3385 increment = 0;
3386 break;
3387 case float_round_up:
3388 increment = !zSign && zSig2;
3389 break;
3390 case float_round_down:
3391 increment = zSign && zSig2;
3392 break;
3393 case float_round_to_odd:
3394 increment = !(zSig1 & 0x1) && zSig2;
3395 break;
3396 default:
3397 abort();
3399 if ( 0x7FFD <= (uint32_t) zExp ) {
3400 if ( ( 0x7FFD < zExp )
3401 || ( ( zExp == 0x7FFD )
3402 && eq128(
3403 LIT64( 0x0001FFFFFFFFFFFF ),
3404 LIT64( 0xFFFFFFFFFFFFFFFF ),
3405 zSig0,
3406 zSig1
3408 && increment
3411 float_raise(float_flag_overflow | float_flag_inexact, status);
3412 if ( ( roundingMode == float_round_to_zero )
3413 || ( zSign && ( roundingMode == float_round_up ) )
3414 || ( ! zSign && ( roundingMode == float_round_down ) )
3415 || (roundingMode == float_round_to_odd)
3417 return
3418 packFloat128(
3419 zSign,
3420 0x7FFE,
3421 LIT64( 0x0000FFFFFFFFFFFF ),
3422 LIT64( 0xFFFFFFFFFFFFFFFF )
3425 return packFloat128( zSign, 0x7FFF, 0, 0 );
3427 if ( zExp < 0 ) {
3428 if (status->flush_to_zero) {
3429 float_raise(float_flag_output_denormal, status);
3430 return packFloat128(zSign, 0, 0, 0);
3432 isTiny =
3433 (status->float_detect_tininess
3434 == float_tininess_before_rounding)
3435 || ( zExp < -1 )
3436 || ! increment
3437 || lt128(
3438 zSig0,
3439 zSig1,
3440 LIT64( 0x0001FFFFFFFFFFFF ),
3441 LIT64( 0xFFFFFFFFFFFFFFFF )
3443 shift128ExtraRightJamming(
3444 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
3445 zExp = 0;
3446 if (isTiny && zSig2) {
3447 float_raise(float_flag_underflow, status);
3449 switch (roundingMode) {
3450 case float_round_nearest_even:
3451 case float_round_ties_away:
3452 increment = ((int64_t)zSig2 < 0);
3453 break;
3454 case float_round_to_zero:
3455 increment = 0;
3456 break;
3457 case float_round_up:
3458 increment = !zSign && zSig2;
3459 break;
3460 case float_round_down:
3461 increment = zSign && zSig2;
3462 break;
3463 case float_round_to_odd:
3464 increment = !(zSig1 & 0x1) && zSig2;
3465 break;
3466 default:
3467 abort();
3471 if (zSig2) {
3472 status->float_exception_flags |= float_flag_inexact;
3474 if ( increment ) {
3475 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
3476 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
3478 else {
3479 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
3481 return packFloat128( zSign, zExp, zSig0, zSig1 );
3485 /*----------------------------------------------------------------------------
3486 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3487 | and significand formed by the concatenation of `zSig0' and `zSig1', and
3488 | returns the proper quadruple-precision floating-point value corresponding
3489 | to the abstract input. This routine is just like `roundAndPackFloat128'
3490 | except that the input significand has fewer bits and does not have to be
3491 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
3492 | point exponent.
3493 *----------------------------------------------------------------------------*/
3495 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
3496 uint64_t zSig0, uint64_t zSig1,
3497 float_status *status)
3499 int8_t shiftCount;
3500 uint64_t zSig2;
3502 if ( zSig0 == 0 ) {
3503 zSig0 = zSig1;
3504 zSig1 = 0;
3505 zExp -= 64;
3507 shiftCount = countLeadingZeros64( zSig0 ) - 15;
3508 if ( 0 <= shiftCount ) {
3509 zSig2 = 0;
3510 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3512 else {
3513 shift128ExtraRightJamming(
3514 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
3516 zExp -= shiftCount;
3517 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
3522 /*----------------------------------------------------------------------------
3523 | Returns the result of converting the 32-bit two's complement integer `a'
3524 | to the extended double-precision floating-point format. The conversion
3525 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3526 | Arithmetic.
3527 *----------------------------------------------------------------------------*/
3529 floatx80 int32_to_floatx80(int32_t a, float_status *status)
3531 flag zSign;
3532 uint32_t absA;
3533 int8_t shiftCount;
3534 uint64_t zSig;
3536 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3537 zSign = ( a < 0 );
3538 absA = zSign ? - a : a;
3539 shiftCount = countLeadingZeros32( absA ) + 32;
3540 zSig = absA;
3541 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
3545 /*----------------------------------------------------------------------------
3546 | Returns the result of converting the 32-bit two's complement integer `a' to
3547 | the quadruple-precision floating-point format. The conversion is performed
3548 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3549 *----------------------------------------------------------------------------*/
3551 float128 int32_to_float128(int32_t a, float_status *status)
3553 flag zSign;
3554 uint32_t absA;
3555 int8_t shiftCount;
3556 uint64_t zSig0;
3558 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3559 zSign = ( a < 0 );
3560 absA = zSign ? - a : a;
3561 shiftCount = countLeadingZeros32( absA ) + 17;
3562 zSig0 = absA;
3563 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
3567 /*----------------------------------------------------------------------------
3568 | Returns the result of converting the 64-bit two's complement integer `a'
3569 | to the extended double-precision floating-point format. The conversion
3570 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3571 | Arithmetic.
3572 *----------------------------------------------------------------------------*/
3574 floatx80 int64_to_floatx80(int64_t a, float_status *status)
3576 flag zSign;
3577 uint64_t absA;
3578 int8_t shiftCount;
3580 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3581 zSign = ( a < 0 );
3582 absA = zSign ? - a : a;
3583 shiftCount = countLeadingZeros64( absA );
3584 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
3588 /*----------------------------------------------------------------------------
3589 | Returns the result of converting the 64-bit two's complement integer `a' to
3590 | the quadruple-precision floating-point format. The conversion is performed
3591 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3592 *----------------------------------------------------------------------------*/
3594 float128 int64_to_float128(int64_t a, float_status *status)
3596 flag zSign;
3597 uint64_t absA;
3598 int8_t shiftCount;
3599 int32_t zExp;
3600 uint64_t zSig0, zSig1;
3602 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3603 zSign = ( a < 0 );
3604 absA = zSign ? - a : a;
3605 shiftCount = countLeadingZeros64( absA ) + 49;
3606 zExp = 0x406E - shiftCount;
3607 if ( 64 <= shiftCount ) {
3608 zSig1 = 0;
3609 zSig0 = absA;
3610 shiftCount -= 64;
3612 else {
3613 zSig1 = absA;
3614 zSig0 = 0;
3616 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3617 return packFloat128( zSign, zExp, zSig0, zSig1 );
3621 /*----------------------------------------------------------------------------
3622 | Returns the result of converting the 64-bit unsigned integer `a'
3623 | to the quadruple-precision floating-point format. The conversion is performed
3624 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3625 *----------------------------------------------------------------------------*/
3627 float128 uint64_to_float128(uint64_t a, float_status *status)
3629 if (a == 0) {
3630 return float128_zero;
3632 return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status);
3635 /*----------------------------------------------------------------------------
3636 | Returns the result of converting the single-precision floating-point value
3637 | `a' to the extended double-precision floating-point format. The conversion
3638 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3639 | Arithmetic.
3640 *----------------------------------------------------------------------------*/
3642 floatx80 float32_to_floatx80(float32 a, float_status *status)
3644 flag aSign;
3645 int aExp;
3646 uint32_t aSig;
3648 a = float32_squash_input_denormal(a, status);
3649 aSig = extractFloat32Frac( a );
3650 aExp = extractFloat32Exp( a );
3651 aSign = extractFloat32Sign( a );
3652 if ( aExp == 0xFF ) {
3653 if (aSig) {
3654 return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
3656 return packFloatx80(aSign,
3657 floatx80_infinity_high,
3658 floatx80_infinity_low);
3660 if ( aExp == 0 ) {
3661 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3662 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3664 aSig |= 0x00800000;
3665 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
3669 /*----------------------------------------------------------------------------
3670 | Returns the result of converting the single-precision floating-point value
3671 | `a' to the double-precision floating-point format. The conversion is
3672 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3673 | Arithmetic.
3674 *----------------------------------------------------------------------------*/
3676 float128 float32_to_float128(float32 a, float_status *status)
3678 flag aSign;
3679 int aExp;
3680 uint32_t aSig;
3682 a = float32_squash_input_denormal(a, status);
3683 aSig = extractFloat32Frac( a );
3684 aExp = extractFloat32Exp( a );
3685 aSign = extractFloat32Sign( a );
3686 if ( aExp == 0xFF ) {
3687 if (aSig) {
3688 return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
3690 return packFloat128( aSign, 0x7FFF, 0, 0 );
3692 if ( aExp == 0 ) {
3693 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3694 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3695 --aExp;
3697 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
3701 /*----------------------------------------------------------------------------
3702 | Returns the remainder of the single-precision floating-point value `a'
3703 | with respect to the corresponding value `b'. The operation is performed
3704 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3705 *----------------------------------------------------------------------------*/
3707 float32 float32_rem(float32 a, float32 b, float_status *status)
3709 flag aSign, zSign;
3710 int aExp, bExp, expDiff;
3711 uint32_t aSig, bSig;
3712 uint32_t q;
3713 uint64_t aSig64, bSig64, q64;
3714 uint32_t alternateASig;
3715 int32_t sigMean;
3716 a = float32_squash_input_denormal(a, status);
3717 b = float32_squash_input_denormal(b, status);
3719 aSig = extractFloat32Frac( a );
3720 aExp = extractFloat32Exp( a );
3721 aSign = extractFloat32Sign( a );
3722 bSig = extractFloat32Frac( b );
3723 bExp = extractFloat32Exp( b );
3724 if ( aExp == 0xFF ) {
3725 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
3726 return propagateFloat32NaN(a, b, status);
3728 float_raise(float_flag_invalid, status);
3729 return float32_default_nan(status);
3731 if ( bExp == 0xFF ) {
3732 if (bSig) {
3733 return propagateFloat32NaN(a, b, status);
3735 return a;
3737 if ( bExp == 0 ) {
3738 if ( bSig == 0 ) {
3739 float_raise(float_flag_invalid, status);
3740 return float32_default_nan(status);
3742 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
3744 if ( aExp == 0 ) {
3745 if ( aSig == 0 ) return a;
3746 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3748 expDiff = aExp - bExp;
3749 aSig |= 0x00800000;
3750 bSig |= 0x00800000;
3751 if ( expDiff < 32 ) {
3752 aSig <<= 8;
3753 bSig <<= 8;
3754 if ( expDiff < 0 ) {
3755 if ( expDiff < -1 ) return a;
3756 aSig >>= 1;
3758 q = ( bSig <= aSig );
3759 if ( q ) aSig -= bSig;
3760 if ( 0 < expDiff ) {
3761 q = ( ( (uint64_t) aSig )<<32 ) / bSig;
3762 q >>= 32 - expDiff;
3763 bSig >>= 2;
3764 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3766 else {
3767 aSig >>= 2;
3768 bSig >>= 2;
3771 else {
3772 if ( bSig <= aSig ) aSig -= bSig;
3773 aSig64 = ( (uint64_t) aSig )<<40;
3774 bSig64 = ( (uint64_t) bSig )<<40;
3775 expDiff -= 64;
3776 while ( 0 < expDiff ) {
3777 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3778 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3779 aSig64 = - ( ( bSig * q64 )<<38 );
3780 expDiff -= 62;
3782 expDiff += 64;
3783 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3784 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3785 q = q64>>( 64 - expDiff );
3786 bSig <<= 6;
3787 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
3789 do {
3790 alternateASig = aSig;
3791 ++q;
3792 aSig -= bSig;
3793 } while ( 0 <= (int32_t) aSig );
3794 sigMean = aSig + alternateASig;
3795 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3796 aSig = alternateASig;
3798 zSign = ( (int32_t) aSig < 0 );
3799 if ( zSign ) aSig = - aSig;
3800 return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
3805 /*----------------------------------------------------------------------------
3806 | Returns the binary exponential of the single-precision floating-point value
3807 | `a'. The operation is performed according to the IEC/IEEE Standard for
3808 | Binary Floating-Point Arithmetic.
3810 | Uses the following identities:
3812 | 1. -------------------------------------------------------------------------
3813 | x x*ln(2)
3814 | 2 = e
3816 | 2. -------------------------------------------------------------------------
3817 | 2 3 4 5 n
3818 | x x x x x x x
3819 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
3820 | 1! 2! 3! 4! 5! n!
3821 *----------------------------------------------------------------------------*/
3823 static const float64 float32_exp2_coefficients[15] =
3825 const_float64( 0x3ff0000000000000ll ), /* 1 */
3826 const_float64( 0x3fe0000000000000ll ), /* 2 */
3827 const_float64( 0x3fc5555555555555ll ), /* 3 */
3828 const_float64( 0x3fa5555555555555ll ), /* 4 */
3829 const_float64( 0x3f81111111111111ll ), /* 5 */
3830 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */
3831 const_float64( 0x3f2a01a01a01a01all ), /* 7 */
3832 const_float64( 0x3efa01a01a01a01all ), /* 8 */
3833 const_float64( 0x3ec71de3a556c734ll ), /* 9 */
3834 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
3835 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
3836 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
3837 const_float64( 0x3de6124613a86d09ll ), /* 13 */
3838 const_float64( 0x3da93974a8c07c9dll ), /* 14 */
3839 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
3842 float32 float32_exp2(float32 a, float_status *status)
3844 flag aSign;
3845 int aExp;
3846 uint32_t aSig;
3847 float64 r, x, xn;
3848 int i;
3849 a = float32_squash_input_denormal(a, status);
3851 aSig = extractFloat32Frac( a );
3852 aExp = extractFloat32Exp( a );
3853 aSign = extractFloat32Sign( a );
3855 if ( aExp == 0xFF) {
3856 if (aSig) {
3857 return propagateFloat32NaN(a, float32_zero, status);
3859 return (aSign) ? float32_zero : a;
3861 if (aExp == 0) {
3862 if (aSig == 0) return float32_one;
3865 float_raise(float_flag_inexact, status);
3867 /* ******************************* */
3868 /* using float64 for approximation */
3869 /* ******************************* */
3870 x = float32_to_float64(a, status);
3871 x = float64_mul(x, float64_ln2, status);
3873 xn = x;
3874 r = float64_one;
3875 for (i = 0 ; i < 15 ; i++) {
3876 float64 f;
3878 f = float64_mul(xn, float32_exp2_coefficients[i], status);
3879 r = float64_add(r, f, status);
3881 xn = float64_mul(xn, x, status);
3884 return float64_to_float32(r, status);
3887 /*----------------------------------------------------------------------------
3888 | Returns the binary log of the single-precision floating-point value `a'.
3889 | The operation is performed according to the IEC/IEEE Standard for Binary
3890 | Floating-Point Arithmetic.
3891 *----------------------------------------------------------------------------*/
3892 float32 float32_log2(float32 a, float_status *status)
3894 flag aSign, zSign;
3895 int aExp;
3896 uint32_t aSig, zSig, i;
3898 a = float32_squash_input_denormal(a, status);
3899 aSig = extractFloat32Frac( a );
3900 aExp = extractFloat32Exp( a );
3901 aSign = extractFloat32Sign( a );
3903 if ( aExp == 0 ) {
3904 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
3905 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3907 if ( aSign ) {
3908 float_raise(float_flag_invalid, status);
3909 return float32_default_nan(status);
3911 if ( aExp == 0xFF ) {
3912 if (aSig) {
3913 return propagateFloat32NaN(a, float32_zero, status);
3915 return a;
3918 aExp -= 0x7F;
3919 aSig |= 0x00800000;
3920 zSign = aExp < 0;
3921 zSig = aExp << 23;
3923 for (i = 1 << 22; i > 0; i >>= 1) {
3924 aSig = ( (uint64_t)aSig * aSig ) >> 23;
3925 if ( aSig & 0x01000000 ) {
3926 aSig >>= 1;
3927 zSig |= i;
3931 if ( zSign )
3932 zSig = -zSig;
3934 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
3937 /*----------------------------------------------------------------------------
3938 | Returns 1 if the single-precision floating-point value `a' is equal to
3939 | the corresponding value `b', and 0 otherwise. The invalid exception is
3940 | raised if either operand is a NaN. Otherwise, the comparison is performed
3941 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3942 *----------------------------------------------------------------------------*/
3944 int float32_eq(float32 a, float32 b, float_status *status)
3946 uint32_t av, bv;
3947 a = float32_squash_input_denormal(a, status);
3948 b = float32_squash_input_denormal(b, status);
3950 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3951 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3953 float_raise(float_flag_invalid, status);
3954 return 0;
3956 av = float32_val(a);
3957 bv = float32_val(b);
3958 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3961 /*----------------------------------------------------------------------------
3962 | Returns 1 if the single-precision floating-point value `a' is less than
3963 | or equal to the corresponding value `b', and 0 otherwise. The invalid
3964 | exception is raised if either operand is a NaN. The comparison is performed
3965 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3966 *----------------------------------------------------------------------------*/
3968 int float32_le(float32 a, float32 b, float_status *status)
3970 flag aSign, bSign;
3971 uint32_t av, bv;
3972 a = float32_squash_input_denormal(a, status);
3973 b = float32_squash_input_denormal(b, status);
3975 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3976 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3978 float_raise(float_flag_invalid, status);
3979 return 0;
3981 aSign = extractFloat32Sign( a );
3982 bSign = extractFloat32Sign( b );
3983 av = float32_val(a);
3984 bv = float32_val(b);
3985 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3986 return ( av == bv ) || ( aSign ^ ( av < bv ) );
3990 /*----------------------------------------------------------------------------
3991 | Returns 1 if the single-precision floating-point value `a' is less than
3992 | the corresponding value `b', and 0 otherwise. The invalid exception is
3993 | raised if either operand is a NaN. The comparison is performed according
3994 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3995 *----------------------------------------------------------------------------*/
3997 int float32_lt(float32 a, float32 b, float_status *status)
3999 flag aSign, bSign;
4000 uint32_t av, bv;
4001 a = float32_squash_input_denormal(a, status);
4002 b = float32_squash_input_denormal(b, status);
4004 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4005 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4007 float_raise(float_flag_invalid, status);
4008 return 0;
4010 aSign = extractFloat32Sign( a );
4011 bSign = extractFloat32Sign( b );
4012 av = float32_val(a);
4013 bv = float32_val(b);
4014 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
4015 return ( av != bv ) && ( aSign ^ ( av < bv ) );
4019 /*----------------------------------------------------------------------------
4020 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
4021 | be compared, and 0 otherwise. The invalid exception is raised if either
4022 | operand is a NaN. The comparison is performed according to the IEC/IEEE
4023 | Standard for Binary Floating-Point Arithmetic.
4024 *----------------------------------------------------------------------------*/
4026 int float32_unordered(float32 a, float32 b, float_status *status)
4028 a = float32_squash_input_denormal(a, status);
4029 b = float32_squash_input_denormal(b, status);
4031 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4032 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4034 float_raise(float_flag_invalid, status);
4035 return 1;
4037 return 0;
4040 /*----------------------------------------------------------------------------
4041 | Returns 1 if the single-precision floating-point value `a' is equal to
4042 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4043 | exception. The comparison is performed according to the IEC/IEEE Standard
4044 | for Binary Floating-Point Arithmetic.
4045 *----------------------------------------------------------------------------*/
4047 int float32_eq_quiet(float32 a, float32 b, float_status *status)
4049 a = float32_squash_input_denormal(a, status);
4050 b = float32_squash_input_denormal(b, status);
4052 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4053 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4055 if (float32_is_signaling_nan(a, status)
4056 || float32_is_signaling_nan(b, status)) {
4057 float_raise(float_flag_invalid, status);
4059 return 0;
4061 return ( float32_val(a) == float32_val(b) ) ||
4062 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
4065 /*----------------------------------------------------------------------------
4066 | Returns 1 if the single-precision floating-point value `a' is less than or
4067 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4068 | cause an exception. Otherwise, the comparison is performed according to the
4069 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4070 *----------------------------------------------------------------------------*/
4072 int float32_le_quiet(float32 a, float32 b, float_status *status)
4074 flag aSign, bSign;
4075 uint32_t av, bv;
4076 a = float32_squash_input_denormal(a, status);
4077 b = float32_squash_input_denormal(b, status);
4079 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4080 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4082 if (float32_is_signaling_nan(a, status)
4083 || float32_is_signaling_nan(b, status)) {
4084 float_raise(float_flag_invalid, status);
4086 return 0;
4088 aSign = extractFloat32Sign( a );
4089 bSign = extractFloat32Sign( b );
4090 av = float32_val(a);
4091 bv = float32_val(b);
4092 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
4093 return ( av == bv ) || ( aSign ^ ( av < bv ) );
4097 /*----------------------------------------------------------------------------
4098 | Returns 1 if the single-precision floating-point value `a' is less than
4099 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4100 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
4101 | Standard for Binary Floating-Point Arithmetic.
4102 *----------------------------------------------------------------------------*/
4104 int float32_lt_quiet(float32 a, float32 b, float_status *status)
4106 flag aSign, bSign;
4107 uint32_t av, bv;
4108 a = float32_squash_input_denormal(a, status);
4109 b = float32_squash_input_denormal(b, status);
4111 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4112 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4114 if (float32_is_signaling_nan(a, status)
4115 || float32_is_signaling_nan(b, status)) {
4116 float_raise(float_flag_invalid, status);
4118 return 0;
4120 aSign = extractFloat32Sign( a );
4121 bSign = extractFloat32Sign( b );
4122 av = float32_val(a);
4123 bv = float32_val(b);
4124 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
4125 return ( av != bv ) && ( aSign ^ ( av < bv ) );
4129 /*----------------------------------------------------------------------------
4130 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
4131 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
4132 | comparison is performed according to the IEC/IEEE Standard for Binary
4133 | Floating-Point Arithmetic.
4134 *----------------------------------------------------------------------------*/
4136 int float32_unordered_quiet(float32 a, float32 b, float_status *status)
4138 a = float32_squash_input_denormal(a, status);
4139 b = float32_squash_input_denormal(b, status);
4141 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4142 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4144 if (float32_is_signaling_nan(a, status)
4145 || float32_is_signaling_nan(b, status)) {
4146 float_raise(float_flag_invalid, status);
4148 return 1;
4150 return 0;
4153 /*----------------------------------------------------------------------------
4154 | If `a' is denormal and we are in flush-to-zero mode then set the
4155 | input-denormal exception and return zero. Otherwise just return the value.
4156 *----------------------------------------------------------------------------*/
4157 float16 float16_squash_input_denormal(float16 a, float_status *status)
4159 if (status->flush_inputs_to_zero) {
4160 if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) {
4161 float_raise(float_flag_input_denormal, status);
4162 return make_float16(float16_val(a) & 0x8000);
4165 return a;
4168 /*----------------------------------------------------------------------------
4169 | Returns the result of converting the double-precision floating-point value
4170 | `a' to the extended double-precision floating-point format. The conversion
4171 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4172 | Arithmetic.
4173 *----------------------------------------------------------------------------*/
4175 floatx80 float64_to_floatx80(float64 a, float_status *status)
4177 flag aSign;
4178 int aExp;
4179 uint64_t aSig;
4181 a = float64_squash_input_denormal(a, status);
4182 aSig = extractFloat64Frac( a );
4183 aExp = extractFloat64Exp( a );
4184 aSign = extractFloat64Sign( a );
4185 if ( aExp == 0x7FF ) {
4186 if (aSig) {
4187 return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
4189 return packFloatx80(aSign,
4190 floatx80_infinity_high,
4191 floatx80_infinity_low);
4193 if ( aExp == 0 ) {
4194 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4195 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4197 return
4198 packFloatx80(
4199 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
4203 /*----------------------------------------------------------------------------
4204 | Returns the result of converting the double-precision floating-point value
4205 | `a' to the quadruple-precision floating-point format. The conversion is
4206 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4207 | Arithmetic.
4208 *----------------------------------------------------------------------------*/
4210 float128 float64_to_float128(float64 a, float_status *status)
4212 flag aSign;
4213 int aExp;
4214 uint64_t aSig, zSig0, zSig1;
4216 a = float64_squash_input_denormal(a, status);
4217 aSig = extractFloat64Frac( a );
4218 aExp = extractFloat64Exp( a );
4219 aSign = extractFloat64Sign( a );
4220 if ( aExp == 0x7FF ) {
4221 if (aSig) {
4222 return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
4224 return packFloat128( aSign, 0x7FFF, 0, 0 );
4226 if ( aExp == 0 ) {
4227 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4228 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4229 --aExp;
4231 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
4232 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
4237 /*----------------------------------------------------------------------------
4238 | Returns the remainder of the double-precision floating-point value `a'
4239 | with respect to the corresponding value `b'. The operation is performed
4240 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4241 *----------------------------------------------------------------------------*/
4243 float64 float64_rem(float64 a, float64 b, float_status *status)
4245 flag aSign, zSign;
4246 int aExp, bExp, expDiff;
4247 uint64_t aSig, bSig;
4248 uint64_t q, alternateASig;
4249 int64_t sigMean;
4251 a = float64_squash_input_denormal(a, status);
4252 b = float64_squash_input_denormal(b, status);
4253 aSig = extractFloat64Frac( a );
4254 aExp = extractFloat64Exp( a );
4255 aSign = extractFloat64Sign( a );
4256 bSig = extractFloat64Frac( b );
4257 bExp = extractFloat64Exp( b );
4258 if ( aExp == 0x7FF ) {
4259 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4260 return propagateFloat64NaN(a, b, status);
4262 float_raise(float_flag_invalid, status);
4263 return float64_default_nan(status);
4265 if ( bExp == 0x7FF ) {
4266 if (bSig) {
4267 return propagateFloat64NaN(a, b, status);
4269 return a;
4271 if ( bExp == 0 ) {
4272 if ( bSig == 0 ) {
4273 float_raise(float_flag_invalid, status);
4274 return float64_default_nan(status);
4276 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4278 if ( aExp == 0 ) {
4279 if ( aSig == 0 ) return a;
4280 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4282 expDiff = aExp - bExp;
4283 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
4284 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4285 if ( expDiff < 0 ) {
4286 if ( expDiff < -1 ) return a;
4287 aSig >>= 1;
4289 q = ( bSig <= aSig );
4290 if ( q ) aSig -= bSig;
4291 expDiff -= 64;
4292 while ( 0 < expDiff ) {
4293 q = estimateDiv128To64( aSig, 0, bSig );
4294 q = ( 2 < q ) ? q - 2 : 0;
4295 aSig = - ( ( bSig>>2 ) * q );
4296 expDiff -= 62;
4298 expDiff += 64;
4299 if ( 0 < expDiff ) {
4300 q = estimateDiv128To64( aSig, 0, bSig );
4301 q = ( 2 < q ) ? q - 2 : 0;
4302 q >>= 64 - expDiff;
4303 bSig >>= 2;
4304 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4306 else {
4307 aSig >>= 2;
4308 bSig >>= 2;
4310 do {
4311 alternateASig = aSig;
4312 ++q;
4313 aSig -= bSig;
4314 } while ( 0 <= (int64_t) aSig );
4315 sigMean = aSig + alternateASig;
4316 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4317 aSig = alternateASig;
4319 zSign = ( (int64_t) aSig < 0 );
4320 if ( zSign ) aSig = - aSig;
4321 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
4325 /*----------------------------------------------------------------------------
4326 | Returns the binary log of the double-precision floating-point value `a'.
4327 | The operation is performed according to the IEC/IEEE Standard for Binary
4328 | Floating-Point Arithmetic.
4329 *----------------------------------------------------------------------------*/
4330 float64 float64_log2(float64 a, float_status *status)
4332 flag aSign, zSign;
4333 int aExp;
4334 uint64_t aSig, aSig0, aSig1, zSig, i;
4335 a = float64_squash_input_denormal(a, status);
4337 aSig = extractFloat64Frac( a );
4338 aExp = extractFloat64Exp( a );
4339 aSign = extractFloat64Sign( a );
4341 if ( aExp == 0 ) {
4342 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4343 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4345 if ( aSign ) {
4346 float_raise(float_flag_invalid, status);
4347 return float64_default_nan(status);
4349 if ( aExp == 0x7FF ) {
4350 if (aSig) {
4351 return propagateFloat64NaN(a, float64_zero, status);
4353 return a;
4356 aExp -= 0x3FF;
4357 aSig |= LIT64( 0x0010000000000000 );
4358 zSign = aExp < 0;
4359 zSig = (uint64_t)aExp << 52;
4360 for (i = 1LL << 51; i > 0; i >>= 1) {
4361 mul64To128( aSig, aSig, &aSig0, &aSig1 );
4362 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4363 if ( aSig & LIT64( 0x0020000000000000 ) ) {
4364 aSig >>= 1;
4365 zSig |= i;
4369 if ( zSign )
4370 zSig = -zSig;
4371 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
4374 /*----------------------------------------------------------------------------
4375 | Returns 1 if the double-precision floating-point value `a' is equal to the
4376 | corresponding value `b', and 0 otherwise. The invalid exception is raised
4377 | if either operand is a NaN. Otherwise, the comparison is performed
4378 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4379 *----------------------------------------------------------------------------*/
4381 int float64_eq(float64 a, float64 b, float_status *status)
4383 uint64_t av, bv;
4384 a = float64_squash_input_denormal(a, status);
4385 b = float64_squash_input_denormal(b, status);
4387 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4388 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4390 float_raise(float_flag_invalid, status);
4391 return 0;
4393 av = float64_val(a);
4394 bv = float64_val(b);
4395 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4399 /*----------------------------------------------------------------------------
4400 | Returns 1 if the double-precision floating-point value `a' is less than or
4401 | equal to the corresponding value `b', and 0 otherwise. The invalid
4402 | exception is raised if either operand is a NaN. The comparison is performed
4403 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4404 *----------------------------------------------------------------------------*/
4406 int float64_le(float64 a, float64 b, float_status *status)
4408 flag aSign, bSign;
4409 uint64_t av, bv;
4410 a = float64_squash_input_denormal(a, status);
4411 b = float64_squash_input_denormal(b, status);
4413 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4414 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4416 float_raise(float_flag_invalid, status);
4417 return 0;
4419 aSign = extractFloat64Sign( a );
4420 bSign = extractFloat64Sign( b );
4421 av = float64_val(a);
4422 bv = float64_val(b);
4423 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4424 return ( av == bv ) || ( aSign ^ ( av < bv ) );
4428 /*----------------------------------------------------------------------------
4429 | Returns 1 if the double-precision floating-point value `a' is less than
4430 | the corresponding value `b', and 0 otherwise. The invalid exception is
4431 | raised if either operand is a NaN. The comparison is performed according
4432 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4433 *----------------------------------------------------------------------------*/
4435 int float64_lt(float64 a, float64 b, float_status *status)
4437 flag aSign, bSign;
4438 uint64_t av, bv;
4440 a = float64_squash_input_denormal(a, status);
4441 b = float64_squash_input_denormal(b, status);
4442 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4443 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4445 float_raise(float_flag_invalid, status);
4446 return 0;
4448 aSign = extractFloat64Sign( a );
4449 bSign = extractFloat64Sign( b );
4450 av = float64_val(a);
4451 bv = float64_val(b);
4452 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4453 return ( av != bv ) && ( aSign ^ ( av < bv ) );
4457 /*----------------------------------------------------------------------------
4458 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4459 | be compared, and 0 otherwise. The invalid exception is raised if either
4460 | operand is a NaN. The comparison is performed according to the IEC/IEEE
4461 | Standard for Binary Floating-Point Arithmetic.
4462 *----------------------------------------------------------------------------*/
4464 int float64_unordered(float64 a, float64 b, float_status *status)
4466 a = float64_squash_input_denormal(a, status);
4467 b = float64_squash_input_denormal(b, status);
4469 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4470 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4472 float_raise(float_flag_invalid, status);
4473 return 1;
4475 return 0;
4478 /*----------------------------------------------------------------------------
4479 | Returns 1 if the double-precision floating-point value `a' is equal to the
4480 | corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4481 | exception.The comparison is performed according to the IEC/IEEE Standard
4482 | for Binary Floating-Point Arithmetic.
4483 *----------------------------------------------------------------------------*/
4485 int float64_eq_quiet(float64 a, float64 b, float_status *status)
4487 uint64_t av, bv;
4488 a = float64_squash_input_denormal(a, status);
4489 b = float64_squash_input_denormal(b, status);
4491 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4492 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4494 if (float64_is_signaling_nan(a, status)
4495 || float64_is_signaling_nan(b, status)) {
4496 float_raise(float_flag_invalid, status);
4498 return 0;
4500 av = float64_val(a);
4501 bv = float64_val(b);
4502 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4506 /*----------------------------------------------------------------------------
4507 | Returns 1 if the double-precision floating-point value `a' is less than or
4508 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4509 | cause an exception. Otherwise, the comparison is performed according to the
4510 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4511 *----------------------------------------------------------------------------*/
4513 int float64_le_quiet(float64 a, float64 b, float_status *status)
4515 flag aSign, bSign;
4516 uint64_t av, bv;
4517 a = float64_squash_input_denormal(a, status);
4518 b = float64_squash_input_denormal(b, status);
4520 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4521 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4523 if (float64_is_signaling_nan(a, status)
4524 || float64_is_signaling_nan(b, status)) {
4525 float_raise(float_flag_invalid, status);
4527 return 0;
4529 aSign = extractFloat64Sign( a );
4530 bSign = extractFloat64Sign( b );
4531 av = float64_val(a);
4532 bv = float64_val(b);
4533 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4534 return ( av == bv ) || ( aSign ^ ( av < bv ) );
4538 /*----------------------------------------------------------------------------
4539 | Returns 1 if the double-precision floating-point value `a' is less than
4540 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4541 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
4542 | Standard for Binary Floating-Point Arithmetic.
4543 *----------------------------------------------------------------------------*/
4545 int float64_lt_quiet(float64 a, float64 b, float_status *status)
4547 flag aSign, bSign;
4548 uint64_t av, bv;
4549 a = float64_squash_input_denormal(a, status);
4550 b = float64_squash_input_denormal(b, status);
4552 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4553 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4555 if (float64_is_signaling_nan(a, status)
4556 || float64_is_signaling_nan(b, status)) {
4557 float_raise(float_flag_invalid, status);
4559 return 0;
4561 aSign = extractFloat64Sign( a );
4562 bSign = extractFloat64Sign( b );
4563 av = float64_val(a);
4564 bv = float64_val(b);
4565 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4566 return ( av != bv ) && ( aSign ^ ( av < bv ) );
4570 /*----------------------------------------------------------------------------
4571 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4572 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
4573 | comparison is performed according to the IEC/IEEE Standard for Binary
4574 | Floating-Point Arithmetic.
4575 *----------------------------------------------------------------------------*/
4577 int float64_unordered_quiet(float64 a, float64 b, float_status *status)
4579 a = float64_squash_input_denormal(a, status);
4580 b = float64_squash_input_denormal(b, status);
4582 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4583 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4585 if (float64_is_signaling_nan(a, status)
4586 || float64_is_signaling_nan(b, status)) {
4587 float_raise(float_flag_invalid, status);
4589 return 1;
4591 return 0;
4594 /*----------------------------------------------------------------------------
4595 | Returns the result of converting the extended double-precision floating-
4596 | point value `a' to the 32-bit two's complement integer format. The
4597 | conversion is performed according to the IEC/IEEE Standard for Binary
4598 | Floating-Point Arithmetic---which means in particular that the conversion
4599 | is rounded according to the current rounding mode. If `a' is a NaN, the
4600 | largest positive integer is returned. Otherwise, if the conversion
4601 | overflows, the largest integer with the same sign as `a' is returned.
4602 *----------------------------------------------------------------------------*/
4604 int32_t floatx80_to_int32(floatx80 a, float_status *status)
4606 flag aSign;
4607 int32_t aExp, shiftCount;
4608 uint64_t aSig;
4610 if (floatx80_invalid_encoding(a)) {
4611 float_raise(float_flag_invalid, status);
4612 return 1 << 31;
4614 aSig = extractFloatx80Frac( a );
4615 aExp = extractFloatx80Exp( a );
4616 aSign = extractFloatx80Sign( a );
4617 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4618 shiftCount = 0x4037 - aExp;
4619 if ( shiftCount <= 0 ) shiftCount = 1;
4620 shift64RightJamming( aSig, shiftCount, &aSig );
4621 return roundAndPackInt32(aSign, aSig, status);
4625 /*----------------------------------------------------------------------------
4626 | Returns the result of converting the extended double-precision floating-
4627 | point value `a' to the 32-bit two's complement integer format. The
4628 | conversion is performed according to the IEC/IEEE Standard for Binary
4629 | Floating-Point Arithmetic, except that the conversion is always rounded
4630 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4631 | Otherwise, if the conversion overflows, the largest integer with the same
4632 | sign as `a' is returned.
4633 *----------------------------------------------------------------------------*/
4635 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
4637 flag aSign;
4638 int32_t aExp, shiftCount;
4639 uint64_t aSig, savedASig;
4640 int32_t z;
4642 if (floatx80_invalid_encoding(a)) {
4643 float_raise(float_flag_invalid, status);
4644 return 1 << 31;
4646 aSig = extractFloatx80Frac( a );
4647 aExp = extractFloatx80Exp( a );
4648 aSign = extractFloatx80Sign( a );
4649 if ( 0x401E < aExp ) {
4650 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4651 goto invalid;
4653 else if ( aExp < 0x3FFF ) {
4654 if (aExp || aSig) {
4655 status->float_exception_flags |= float_flag_inexact;
4657 return 0;
4659 shiftCount = 0x403E - aExp;
4660 savedASig = aSig;
4661 aSig >>= shiftCount;
4662 z = aSig;
4663 if ( aSign ) z = - z;
4664 if ( ( z < 0 ) ^ aSign ) {
4665 invalid:
4666 float_raise(float_flag_invalid, status);
4667 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4669 if ( ( aSig<<shiftCount ) != savedASig ) {
4670 status->float_exception_flags |= float_flag_inexact;
4672 return z;
4676 /*----------------------------------------------------------------------------
4677 | Returns the result of converting the extended double-precision floating-
4678 | point value `a' to the 64-bit two's complement integer format. The
4679 | conversion is performed according to the IEC/IEEE Standard for Binary
4680 | Floating-Point Arithmetic---which means in particular that the conversion
4681 | is rounded according to the current rounding mode. If `a' is a NaN,
4682 | the largest positive integer is returned. Otherwise, if the conversion
4683 | overflows, the largest integer with the same sign as `a' is returned.
4684 *----------------------------------------------------------------------------*/
4686 int64_t floatx80_to_int64(floatx80 a, float_status *status)
4688 flag aSign;
4689 int32_t aExp, shiftCount;
4690 uint64_t aSig, aSigExtra;
4692 if (floatx80_invalid_encoding(a)) {
4693 float_raise(float_flag_invalid, status);
4694 return 1ULL << 63;
4696 aSig = extractFloatx80Frac( a );
4697 aExp = extractFloatx80Exp( a );
4698 aSign = extractFloatx80Sign( a );
4699 shiftCount = 0x403E - aExp;
4700 if ( shiftCount <= 0 ) {
4701 if ( shiftCount ) {
4702 float_raise(float_flag_invalid, status);
4703 if (!aSign || floatx80_is_any_nan(a)) {
4704 return LIT64( 0x7FFFFFFFFFFFFFFF );
4706 return (int64_t) LIT64( 0x8000000000000000 );
4708 aSigExtra = 0;
4710 else {
4711 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4713 return roundAndPackInt64(aSign, aSig, aSigExtra, status);
4717 /*----------------------------------------------------------------------------
4718 | Returns the result of converting the extended double-precision floating-
4719 | point value `a' to the 64-bit two's complement integer format. The
4720 | conversion is performed according to the IEC/IEEE Standard for Binary
4721 | Floating-Point Arithmetic, except that the conversion is always rounded
4722 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4723 | Otherwise, if the conversion overflows, the largest integer with the same
4724 | sign as `a' is returned.
4725 *----------------------------------------------------------------------------*/
4727 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
4729 flag aSign;
4730 int32_t aExp, shiftCount;
4731 uint64_t aSig;
4732 int64_t z;
4734 if (floatx80_invalid_encoding(a)) {
4735 float_raise(float_flag_invalid, status);
4736 return 1ULL << 63;
4738 aSig = extractFloatx80Frac( a );
4739 aExp = extractFloatx80Exp( a );
4740 aSign = extractFloatx80Sign( a );
4741 shiftCount = aExp - 0x403E;
4742 if ( 0 <= shiftCount ) {
4743 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4744 if ( ( a.high != 0xC03E ) || aSig ) {
4745 float_raise(float_flag_invalid, status);
4746 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4747 return LIT64( 0x7FFFFFFFFFFFFFFF );
4750 return (int64_t) LIT64( 0x8000000000000000 );
4752 else if ( aExp < 0x3FFF ) {
4753 if (aExp | aSig) {
4754 status->float_exception_flags |= float_flag_inexact;
4756 return 0;
4758 z = aSig>>( - shiftCount );
4759 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4760 status->float_exception_flags |= float_flag_inexact;
4762 if ( aSign ) z = - z;
4763 return z;
4767 /*----------------------------------------------------------------------------
4768 | Returns the result of converting the extended double-precision floating-
4769 | point value `a' to the single-precision floating-point format. The
4770 | conversion is performed according to the IEC/IEEE Standard for Binary
4771 | Floating-Point Arithmetic.
4772 *----------------------------------------------------------------------------*/
4774 float32 floatx80_to_float32(floatx80 a, float_status *status)
4776 flag aSign;
4777 int32_t aExp;
4778 uint64_t aSig;
4780 if (floatx80_invalid_encoding(a)) {
4781 float_raise(float_flag_invalid, status);
4782 return float32_default_nan(status);
4784 aSig = extractFloatx80Frac( a );
4785 aExp = extractFloatx80Exp( a );
4786 aSign = extractFloatx80Sign( a );
4787 if ( aExp == 0x7FFF ) {
4788 if ( (uint64_t) ( aSig<<1 ) ) {
4789 return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
4791 return packFloat32( aSign, 0xFF, 0 );
4793 shift64RightJamming( aSig, 33, &aSig );
4794 if ( aExp || aSig ) aExp -= 0x3F81;
4795 return roundAndPackFloat32(aSign, aExp, aSig, status);
4799 /*----------------------------------------------------------------------------
4800 | Returns the result of converting the extended double-precision floating-
4801 | point value `a' to the double-precision floating-point format. The
4802 | conversion is performed according to the IEC/IEEE Standard for Binary
4803 | Floating-Point Arithmetic.
4804 *----------------------------------------------------------------------------*/
4806 float64 floatx80_to_float64(floatx80 a, float_status *status)
4808 flag aSign;
4809 int32_t aExp;
4810 uint64_t aSig, zSig;
4812 if (floatx80_invalid_encoding(a)) {
4813 float_raise(float_flag_invalid, status);
4814 return float64_default_nan(status);
4816 aSig = extractFloatx80Frac( a );
4817 aExp = extractFloatx80Exp( a );
4818 aSign = extractFloatx80Sign( a );
4819 if ( aExp == 0x7FFF ) {
4820 if ( (uint64_t) ( aSig<<1 ) ) {
4821 return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
4823 return packFloat64( aSign, 0x7FF, 0 );
4825 shift64RightJamming( aSig, 1, &zSig );
4826 if ( aExp || aSig ) aExp -= 0x3C01;
4827 return roundAndPackFloat64(aSign, aExp, zSig, status);
4831 /*----------------------------------------------------------------------------
4832 | Returns the result of converting the extended double-precision floating-
4833 | point value `a' to the quadruple-precision floating-point format. The
4834 | conversion is performed according to the IEC/IEEE Standard for Binary
4835 | Floating-Point Arithmetic.
4836 *----------------------------------------------------------------------------*/
4838 float128 floatx80_to_float128(floatx80 a, float_status *status)
4840 flag aSign;
4841 int aExp;
4842 uint64_t aSig, zSig0, zSig1;
4844 if (floatx80_invalid_encoding(a)) {
4845 float_raise(float_flag_invalid, status);
4846 return float128_default_nan(status);
4848 aSig = extractFloatx80Frac( a );
4849 aExp = extractFloatx80Exp( a );
4850 aSign = extractFloatx80Sign( a );
4851 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4852 return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
4854 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4855 return packFloat128( aSign, aExp, zSig0, zSig1 );
4859 /*----------------------------------------------------------------------------
4860 | Rounds the extended double-precision floating-point value `a'
4861 | to the precision provided by floatx80_rounding_precision and returns the
4862 | result as an extended double-precision floating-point value.
4863 | The operation is performed according to the IEC/IEEE Standard for Binary
4864 | Floating-Point Arithmetic.
4865 *----------------------------------------------------------------------------*/
4867 floatx80 floatx80_round(floatx80 a, float_status *status)
4869 return roundAndPackFloatx80(status->floatx80_rounding_precision,
4870 extractFloatx80Sign(a),
4871 extractFloatx80Exp(a),
4872 extractFloatx80Frac(a), 0, status);
4875 /*----------------------------------------------------------------------------
4876 | Rounds the extended double-precision floating-point value `a' to an integer,
4877 | and returns the result as an extended quadruple-precision floating-point
4878 | value. The operation is performed according to the IEC/IEEE Standard for
4879 | Binary Floating-Point Arithmetic.
4880 *----------------------------------------------------------------------------*/
4882 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
4884 flag aSign;
4885 int32_t aExp;
4886 uint64_t lastBitMask, roundBitsMask;
4887 floatx80 z;
4889 if (floatx80_invalid_encoding(a)) {
4890 float_raise(float_flag_invalid, status);
4891 return floatx80_default_nan(status);
4893 aExp = extractFloatx80Exp( a );
4894 if ( 0x403E <= aExp ) {
4895 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4896 return propagateFloatx80NaN(a, a, status);
4898 return a;
4900 if ( aExp < 0x3FFF ) {
4901 if ( ( aExp == 0 )
4902 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4903 return a;
4905 status->float_exception_flags |= float_flag_inexact;
4906 aSign = extractFloatx80Sign( a );
4907 switch (status->float_rounding_mode) {
4908 case float_round_nearest_even:
4909 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4911 return
4912 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4914 break;
4915 case float_round_ties_away:
4916 if (aExp == 0x3FFE) {
4917 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
4919 break;
4920 case float_round_down:
4921 return
4922 aSign ?
4923 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4924 : packFloatx80( 0, 0, 0 );
4925 case float_round_up:
4926 return
4927 aSign ? packFloatx80( 1, 0, 0 )
4928 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4930 return packFloatx80( aSign, 0, 0 );
4932 lastBitMask = 1;
4933 lastBitMask <<= 0x403E - aExp;
4934 roundBitsMask = lastBitMask - 1;
4935 z = a;
4936 switch (status->float_rounding_mode) {
4937 case float_round_nearest_even:
4938 z.low += lastBitMask>>1;
4939 if ((z.low & roundBitsMask) == 0) {
4940 z.low &= ~lastBitMask;
4942 break;
4943 case float_round_ties_away:
4944 z.low += lastBitMask >> 1;
4945 break;
4946 case float_round_to_zero:
4947 break;
4948 case float_round_up:
4949 if (!extractFloatx80Sign(z)) {
4950 z.low += roundBitsMask;
4952 break;
4953 case float_round_down:
4954 if (extractFloatx80Sign(z)) {
4955 z.low += roundBitsMask;
4957 break;
4958 default:
4959 abort();
4961 z.low &= ~ roundBitsMask;
4962 if ( z.low == 0 ) {
4963 ++z.high;
4964 z.low = LIT64( 0x8000000000000000 );
4966 if (z.low != a.low) {
4967 status->float_exception_flags |= float_flag_inexact;
4969 return z;
4973 /*----------------------------------------------------------------------------
4974 | Returns the result of adding the absolute values of the extended double-
4975 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
4976 | negated before being returned. `zSign' is ignored if the result is a NaN.
4977 | The addition is performed according to the IEC/IEEE Standard for Binary
4978 | Floating-Point Arithmetic.
4979 *----------------------------------------------------------------------------*/
4981 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4982 float_status *status)
4984 int32_t aExp, bExp, zExp;
4985 uint64_t aSig, bSig, zSig0, zSig1;
4986 int32_t expDiff;
4988 aSig = extractFloatx80Frac( a );
4989 aExp = extractFloatx80Exp( a );
4990 bSig = extractFloatx80Frac( b );
4991 bExp = extractFloatx80Exp( b );
4992 expDiff = aExp - bExp;
4993 if ( 0 < expDiff ) {
4994 if ( aExp == 0x7FFF ) {
4995 if ((uint64_t)(aSig << 1)) {
4996 return propagateFloatx80NaN(a, b, status);
4998 return a;
5000 if ( bExp == 0 ) --expDiff;
5001 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5002 zExp = aExp;
5004 else if ( expDiff < 0 ) {
5005 if ( bExp == 0x7FFF ) {
5006 if ((uint64_t)(bSig << 1)) {
5007 return propagateFloatx80NaN(a, b, status);
5009 return packFloatx80(zSign,
5010 floatx80_infinity_high,
5011 floatx80_infinity_low);
5013 if ( aExp == 0 ) ++expDiff;
5014 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5015 zExp = bExp;
5017 else {
5018 if ( aExp == 0x7FFF ) {
5019 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5020 return propagateFloatx80NaN(a, b, status);
5022 return a;
5024 zSig1 = 0;
5025 zSig0 = aSig + bSig;
5026 if ( aExp == 0 ) {
5027 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
5028 goto roundAndPack;
5030 zExp = aExp;
5031 goto shiftRight1;
5033 zSig0 = aSig + bSig;
5034 if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
5035 shiftRight1:
5036 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
5037 zSig0 |= LIT64( 0x8000000000000000 );
5038 ++zExp;
5039 roundAndPack:
5040 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5041 zSign, zExp, zSig0, zSig1, status);
5044 /*----------------------------------------------------------------------------
5045 | Returns the result of subtracting the absolute values of the extended
5046 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
5047 | difference is negated before being returned. `zSign' is ignored if the
5048 | result is a NaN. The subtraction is performed according to the IEC/IEEE
5049 | Standard for Binary Floating-Point Arithmetic.
5050 *----------------------------------------------------------------------------*/
5052 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5053 float_status *status)
5055 int32_t aExp, bExp, zExp;
5056 uint64_t aSig, bSig, zSig0, zSig1;
5057 int32_t expDiff;
5059 aSig = extractFloatx80Frac( a );
5060 aExp = extractFloatx80Exp( a );
5061 bSig = extractFloatx80Frac( b );
5062 bExp = extractFloatx80Exp( b );
5063 expDiff = aExp - bExp;
5064 if ( 0 < expDiff ) goto aExpBigger;
5065 if ( expDiff < 0 ) goto bExpBigger;
5066 if ( aExp == 0x7FFF ) {
5067 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5068 return propagateFloatx80NaN(a, b, status);
5070 float_raise(float_flag_invalid, status);
5071 return floatx80_default_nan(status);
5073 if ( aExp == 0 ) {
5074 aExp = 1;
5075 bExp = 1;
5077 zSig1 = 0;
5078 if ( bSig < aSig ) goto aBigger;
5079 if ( aSig < bSig ) goto bBigger;
5080 return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
5081 bExpBigger:
5082 if ( bExp == 0x7FFF ) {
5083 if ((uint64_t)(bSig << 1)) {
5084 return propagateFloatx80NaN(a, b, status);
5086 return packFloatx80(zSign ^ 1, floatx80_infinity_high,
5087 floatx80_infinity_low);
5089 if ( aExp == 0 ) ++expDiff;
5090 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5091 bBigger:
5092 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
5093 zExp = bExp;
5094 zSign ^= 1;
5095 goto normalizeRoundAndPack;
5096 aExpBigger:
5097 if ( aExp == 0x7FFF ) {
5098 if ((uint64_t)(aSig << 1)) {
5099 return propagateFloatx80NaN(a, b, status);
5101 return a;
5103 if ( bExp == 0 ) --expDiff;
5104 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5105 aBigger:
5106 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
5107 zExp = aExp;
5108 normalizeRoundAndPack:
5109 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
5110 zSign, zExp, zSig0, zSig1, status);
5113 /*----------------------------------------------------------------------------
5114 | Returns the result of adding the extended double-precision floating-point
5115 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5116 | Standard for Binary Floating-Point Arithmetic.
5117 *----------------------------------------------------------------------------*/
5119 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
5121 flag aSign, bSign;
5123 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5124 float_raise(float_flag_invalid, status);
5125 return floatx80_default_nan(status);
5127 aSign = extractFloatx80Sign( a );
5128 bSign = extractFloatx80Sign( b );
5129 if ( aSign == bSign ) {
5130 return addFloatx80Sigs(a, b, aSign, status);
5132 else {
5133 return subFloatx80Sigs(a, b, aSign, status);
5138 /*----------------------------------------------------------------------------
5139 | Returns the result of subtracting the extended double-precision floating-
5140 | point values `a' and `b'. The operation is performed according to the
5141 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5142 *----------------------------------------------------------------------------*/
5144 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5146 flag aSign, bSign;
5148 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5149 float_raise(float_flag_invalid, status);
5150 return floatx80_default_nan(status);
5152 aSign = extractFloatx80Sign( a );
5153 bSign = extractFloatx80Sign( b );
5154 if ( aSign == bSign ) {
5155 return subFloatx80Sigs(a, b, aSign, status);
5157 else {
5158 return addFloatx80Sigs(a, b, aSign, status);
5163 /*----------------------------------------------------------------------------
5164 | Returns the result of multiplying the extended double-precision floating-
5165 | point values `a' and `b'. The operation is performed according to the
5166 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5167 *----------------------------------------------------------------------------*/
5169 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
5171 flag aSign, bSign, zSign;
5172 int32_t aExp, bExp, zExp;
5173 uint64_t aSig, bSig, zSig0, zSig1;
5175 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5176 float_raise(float_flag_invalid, status);
5177 return floatx80_default_nan(status);
5179 aSig = extractFloatx80Frac( a );
5180 aExp = extractFloatx80Exp( a );
5181 aSign = extractFloatx80Sign( a );
5182 bSig = extractFloatx80Frac( b );
5183 bExp = extractFloatx80Exp( b );
5184 bSign = extractFloatx80Sign( b );
5185 zSign = aSign ^ bSign;
5186 if ( aExp == 0x7FFF ) {
5187 if ( (uint64_t) ( aSig<<1 )
5188 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5189 return propagateFloatx80NaN(a, b, status);
5191 if ( ( bExp | bSig ) == 0 ) goto invalid;
5192 return packFloatx80(zSign, floatx80_infinity_high,
5193 floatx80_infinity_low);
5195 if ( bExp == 0x7FFF ) {
5196 if ((uint64_t)(bSig << 1)) {
5197 return propagateFloatx80NaN(a, b, status);
5199 if ( ( aExp | aSig ) == 0 ) {
5200 invalid:
5201 float_raise(float_flag_invalid, status);
5202 return floatx80_default_nan(status);
5204 return packFloatx80(zSign, floatx80_infinity_high,
5205 floatx80_infinity_low);
5207 if ( aExp == 0 ) {
5208 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5209 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5211 if ( bExp == 0 ) {
5212 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5213 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5215 zExp = aExp + bExp - 0x3FFE;
5216 mul64To128( aSig, bSig, &zSig0, &zSig1 );
5217 if ( 0 < (int64_t) zSig0 ) {
5218 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5219 --zExp;
5221 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5222 zSign, zExp, zSig0, zSig1, status);
5225 /*----------------------------------------------------------------------------
5226 | Returns the result of dividing the extended double-precision floating-point
5227 | value `a' by the corresponding value `b'. The operation is performed
5228 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5229 *----------------------------------------------------------------------------*/
5231 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
5233 flag aSign, bSign, zSign;
5234 int32_t aExp, bExp, zExp;
5235 uint64_t aSig, bSig, zSig0, zSig1;
5236 uint64_t rem0, rem1, rem2, term0, term1, term2;
5238 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5239 float_raise(float_flag_invalid, status);
5240 return floatx80_default_nan(status);
5242 aSig = extractFloatx80Frac( a );
5243 aExp = extractFloatx80Exp( a );
5244 aSign = extractFloatx80Sign( a );
5245 bSig = extractFloatx80Frac( b );
5246 bExp = extractFloatx80Exp( b );
5247 bSign = extractFloatx80Sign( b );
5248 zSign = aSign ^ bSign;
5249 if ( aExp == 0x7FFF ) {
5250 if ((uint64_t)(aSig << 1)) {
5251 return propagateFloatx80NaN(a, b, status);
5253 if ( bExp == 0x7FFF ) {
5254 if ((uint64_t)(bSig << 1)) {
5255 return propagateFloatx80NaN(a, b, status);
5257 goto invalid;
5259 return packFloatx80(zSign, floatx80_infinity_high,
5260 floatx80_infinity_low);
5262 if ( bExp == 0x7FFF ) {
5263 if ((uint64_t)(bSig << 1)) {
5264 return propagateFloatx80NaN(a, b, status);
5266 return packFloatx80( zSign, 0, 0 );
5268 if ( bExp == 0 ) {
5269 if ( bSig == 0 ) {
5270 if ( ( aExp | aSig ) == 0 ) {
5271 invalid:
5272 float_raise(float_flag_invalid, status);
5273 return floatx80_default_nan(status);
5275 float_raise(float_flag_divbyzero, status);
5276 return packFloatx80(zSign, floatx80_infinity_high,
5277 floatx80_infinity_low);
5279 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5281 if ( aExp == 0 ) {
5282 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5283 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5285 zExp = aExp - bExp + 0x3FFE;
5286 rem1 = 0;
5287 if ( bSig <= aSig ) {
5288 shift128Right( aSig, 0, 1, &aSig, &rem1 );
5289 ++zExp;
5291 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5292 mul64To128( bSig, zSig0, &term0, &term1 );
5293 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
5294 while ( (int64_t) rem0 < 0 ) {
5295 --zSig0;
5296 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5298 zSig1 = estimateDiv128To64( rem1, 0, bSig );
5299 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
5300 mul64To128( bSig, zSig1, &term1, &term2 );
5301 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5302 while ( (int64_t) rem1 < 0 ) {
5303 --zSig1;
5304 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5306 zSig1 |= ( ( rem1 | rem2 ) != 0 );
5308 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5309 zSign, zExp, zSig0, zSig1, status);
5312 /*----------------------------------------------------------------------------
5313 | Returns the remainder of the extended double-precision floating-point value
5314 | `a' with respect to the corresponding value `b'. The operation is performed
5315 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5316 *----------------------------------------------------------------------------*/
5318 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
5320 flag aSign, zSign;
5321 int32_t aExp, bExp, expDiff;
5322 uint64_t aSig0, aSig1, bSig;
5323 uint64_t q, term0, term1, alternateASig0, alternateASig1;
5325 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5326 float_raise(float_flag_invalid, status);
5327 return floatx80_default_nan(status);
5329 aSig0 = extractFloatx80Frac( a );
5330 aExp = extractFloatx80Exp( a );
5331 aSign = extractFloatx80Sign( a );
5332 bSig = extractFloatx80Frac( b );
5333 bExp = extractFloatx80Exp( b );
5334 if ( aExp == 0x7FFF ) {
5335 if ( (uint64_t) ( aSig0<<1 )
5336 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5337 return propagateFloatx80NaN(a, b, status);
5339 goto invalid;
5341 if ( bExp == 0x7FFF ) {
5342 if ((uint64_t)(bSig << 1)) {
5343 return propagateFloatx80NaN(a, b, status);
5345 return a;
5347 if ( bExp == 0 ) {
5348 if ( bSig == 0 ) {
5349 invalid:
5350 float_raise(float_flag_invalid, status);
5351 return floatx80_default_nan(status);
5353 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5355 if ( aExp == 0 ) {
5356 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5357 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5359 bSig |= LIT64( 0x8000000000000000 );
5360 zSign = aSign;
5361 expDiff = aExp - bExp;
5362 aSig1 = 0;
5363 if ( expDiff < 0 ) {
5364 if ( expDiff < -1 ) return a;
5365 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5366 expDiff = 0;
5368 q = ( bSig <= aSig0 );
5369 if ( q ) aSig0 -= bSig;
5370 expDiff -= 64;
5371 while ( 0 < expDiff ) {
5372 q = estimateDiv128To64( aSig0, aSig1, bSig );
5373 q = ( 2 < q ) ? q - 2 : 0;
5374 mul64To128( bSig, q, &term0, &term1 );
5375 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5376 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5377 expDiff -= 62;
5379 expDiff += 64;
5380 if ( 0 < expDiff ) {
5381 q = estimateDiv128To64( aSig0, aSig1, bSig );
5382 q = ( 2 < q ) ? q - 2 : 0;
5383 q >>= 64 - expDiff;
5384 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5385 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5386 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5387 while ( le128( term0, term1, aSig0, aSig1 ) ) {
5388 ++q;
5389 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5392 else {
5393 term1 = 0;
5394 term0 = bSig;
5396 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5397 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5398 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5399 && ( q & 1 ) )
5401 aSig0 = alternateASig0;
5402 aSig1 = alternateASig1;
5403 zSign = ! zSign;
5405 return
5406 normalizeRoundAndPackFloatx80(
5407 80, zSign, bExp + expDiff, aSig0, aSig1, status);
5411 /*----------------------------------------------------------------------------
5412 | Returns the square root of the extended double-precision floating-point
5413 | value `a'. The operation is performed according to the IEC/IEEE Standard
5414 | for Binary Floating-Point Arithmetic.
5415 *----------------------------------------------------------------------------*/
5417 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
5419 flag aSign;
5420 int32_t aExp, zExp;
5421 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5422 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5424 if (floatx80_invalid_encoding(a)) {
5425 float_raise(float_flag_invalid, status);
5426 return floatx80_default_nan(status);
5428 aSig0 = extractFloatx80Frac( a );
5429 aExp = extractFloatx80Exp( a );
5430 aSign = extractFloatx80Sign( a );
5431 if ( aExp == 0x7FFF ) {
5432 if ((uint64_t)(aSig0 << 1)) {
5433 return propagateFloatx80NaN(a, a, status);
5435 if ( ! aSign ) return a;
5436 goto invalid;
5438 if ( aSign ) {
5439 if ( ( aExp | aSig0 ) == 0 ) return a;
5440 invalid:
5441 float_raise(float_flag_invalid, status);
5442 return floatx80_default_nan(status);
5444 if ( aExp == 0 ) {
5445 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5446 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5448 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5449 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5450 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5451 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5452 doubleZSig0 = zSig0<<1;
5453 mul64To128( zSig0, zSig0, &term0, &term1 );
5454 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5455 while ( (int64_t) rem0 < 0 ) {
5456 --zSig0;
5457 doubleZSig0 -= 2;
5458 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5460 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5461 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5462 if ( zSig1 == 0 ) zSig1 = 1;
5463 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5464 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5465 mul64To128( zSig1, zSig1, &term2, &term3 );
5466 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5467 while ( (int64_t) rem1 < 0 ) {
5468 --zSig1;
5469 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5470 term3 |= 1;
5471 term2 |= doubleZSig0;
5472 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5474 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5476 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5477 zSig0 |= doubleZSig0;
5478 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5479 0, zExp, zSig0, zSig1, status);
5482 /*----------------------------------------------------------------------------
5483 | Returns 1 if the extended double-precision floating-point value `a' is equal
5484 | to the corresponding value `b', and 0 otherwise. The invalid exception is
5485 | raised if either operand is a NaN. Otherwise, the comparison is performed
5486 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5487 *----------------------------------------------------------------------------*/
5489 int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
5492 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5493 || (extractFloatx80Exp(a) == 0x7FFF
5494 && (uint64_t) (extractFloatx80Frac(a) << 1))
5495 || (extractFloatx80Exp(b) == 0x7FFF
5496 && (uint64_t) (extractFloatx80Frac(b) << 1))
5498 float_raise(float_flag_invalid, status);
5499 return 0;
5501 return
5502 ( a.low == b.low )
5503 && ( ( a.high == b.high )
5504 || ( ( a.low == 0 )
5505 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5510 /*----------------------------------------------------------------------------
5511 | Returns 1 if the extended double-precision floating-point value `a' is
5512 | less than or equal to the corresponding value `b', and 0 otherwise. The
5513 | invalid exception is raised if either operand is a NaN. The comparison is
5514 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5515 | Arithmetic.
5516 *----------------------------------------------------------------------------*/
5518 int floatx80_le(floatx80 a, floatx80 b, float_status *status)
5520 flag aSign, bSign;
5522 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5523 || (extractFloatx80Exp(a) == 0x7FFF
5524 && (uint64_t) (extractFloatx80Frac(a) << 1))
5525 || (extractFloatx80Exp(b) == 0x7FFF
5526 && (uint64_t) (extractFloatx80Frac(b) << 1))
5528 float_raise(float_flag_invalid, status);
5529 return 0;
5531 aSign = extractFloatx80Sign( a );
5532 bSign = extractFloatx80Sign( b );
5533 if ( aSign != bSign ) {
5534 return
5535 aSign
5536 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5537 == 0 );
5539 return
5540 aSign ? le128( b.high, b.low, a.high, a.low )
5541 : le128( a.high, a.low, b.high, b.low );
5545 /*----------------------------------------------------------------------------
5546 | Returns 1 if the extended double-precision floating-point value `a' is
5547 | less than the corresponding value `b', and 0 otherwise. The invalid
5548 | exception is raised if either operand is a NaN. The comparison is performed
5549 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5550 *----------------------------------------------------------------------------*/
5552 int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
5554 flag aSign, bSign;
5556 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5557 || (extractFloatx80Exp(a) == 0x7FFF
5558 && (uint64_t) (extractFloatx80Frac(a) << 1))
5559 || (extractFloatx80Exp(b) == 0x7FFF
5560 && (uint64_t) (extractFloatx80Frac(b) << 1))
5562 float_raise(float_flag_invalid, status);
5563 return 0;
5565 aSign = extractFloatx80Sign( a );
5566 bSign = extractFloatx80Sign( b );
5567 if ( aSign != bSign ) {
5568 return
5569 aSign
5570 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5571 != 0 );
5573 return
5574 aSign ? lt128( b.high, b.low, a.high, a.low )
5575 : lt128( a.high, a.low, b.high, b.low );
5579 /*----------------------------------------------------------------------------
5580 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5581 | cannot be compared, and 0 otherwise. The invalid exception is raised if
5582 | either operand is a NaN. The comparison is performed according to the
5583 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5584 *----------------------------------------------------------------------------*/
5585 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
5587 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5588 || (extractFloatx80Exp(a) == 0x7FFF
5589 && (uint64_t) (extractFloatx80Frac(a) << 1))
5590 || (extractFloatx80Exp(b) == 0x7FFF
5591 && (uint64_t) (extractFloatx80Frac(b) << 1))
5593 float_raise(float_flag_invalid, status);
5594 return 1;
5596 return 0;
5599 /*----------------------------------------------------------------------------
5600 | Returns 1 if the extended double-precision floating-point value `a' is
5601 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5602 | cause an exception. The comparison is performed according to the IEC/IEEE
5603 | Standard for Binary Floating-Point Arithmetic.
5604 *----------------------------------------------------------------------------*/
5606 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
5609 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5610 float_raise(float_flag_invalid, status);
5611 return 0;
5613 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5614 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5615 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5616 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5618 if (floatx80_is_signaling_nan(a, status)
5619 || floatx80_is_signaling_nan(b, status)) {
5620 float_raise(float_flag_invalid, status);
5622 return 0;
5624 return
5625 ( a.low == b.low )
5626 && ( ( a.high == b.high )
5627 || ( ( a.low == 0 )
5628 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5633 /*----------------------------------------------------------------------------
5634 | Returns 1 if the extended double-precision floating-point value `a' is less
5635 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
5636 | do not cause an exception. Otherwise, the comparison is performed according
5637 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5638 *----------------------------------------------------------------------------*/
5640 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
5642 flag aSign, bSign;
5644 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5645 float_raise(float_flag_invalid, status);
5646 return 0;
5648 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5649 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5650 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5651 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5653 if (floatx80_is_signaling_nan(a, status)
5654 || floatx80_is_signaling_nan(b, status)) {
5655 float_raise(float_flag_invalid, status);
5657 return 0;
5659 aSign = extractFloatx80Sign( a );
5660 bSign = extractFloatx80Sign( b );
5661 if ( aSign != bSign ) {
5662 return
5663 aSign
5664 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5665 == 0 );
5667 return
5668 aSign ? le128( b.high, b.low, a.high, a.low )
5669 : le128( a.high, a.low, b.high, b.low );
5673 /*----------------------------------------------------------------------------
5674 | Returns 1 if the extended double-precision floating-point value `a' is less
5675 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
5676 | an exception. Otherwise, the comparison is performed according to the
5677 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5678 *----------------------------------------------------------------------------*/
5680 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
5682 flag aSign, bSign;
5684 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5685 float_raise(float_flag_invalid, status);
5686 return 0;
5688 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5689 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5690 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5691 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5693 if (floatx80_is_signaling_nan(a, status)
5694 || floatx80_is_signaling_nan(b, status)) {
5695 float_raise(float_flag_invalid, status);
5697 return 0;
5699 aSign = extractFloatx80Sign( a );
5700 bSign = extractFloatx80Sign( b );
5701 if ( aSign != bSign ) {
5702 return
5703 aSign
5704 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5705 != 0 );
5707 return
5708 aSign ? lt128( b.high, b.low, a.high, a.low )
5709 : lt128( a.high, a.low, b.high, b.low );
5713 /*----------------------------------------------------------------------------
5714 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5715 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
5716 | The comparison is performed according to the IEC/IEEE Standard for Binary
5717 | Floating-Point Arithmetic.
5718 *----------------------------------------------------------------------------*/
5719 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
5721 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5722 float_raise(float_flag_invalid, status);
5723 return 1;
5725 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5726 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5727 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5728 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5730 if (floatx80_is_signaling_nan(a, status)
5731 || floatx80_is_signaling_nan(b, status)) {
5732 float_raise(float_flag_invalid, status);
5734 return 1;
5736 return 0;
5739 /*----------------------------------------------------------------------------
5740 | Returns the result of converting the quadruple-precision floating-point
5741 | value `a' to the 32-bit two's complement integer format. The conversion
5742 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5743 | Arithmetic---which means in particular that the conversion is rounded
5744 | according to the current rounding mode. If `a' is a NaN, the largest
5745 | positive integer is returned. Otherwise, if the conversion overflows, the
5746 | largest integer with the same sign as `a' is returned.
5747 *----------------------------------------------------------------------------*/
5749 int32_t float128_to_int32(float128 a, float_status *status)
5751 flag aSign;
5752 int32_t aExp, shiftCount;
5753 uint64_t aSig0, aSig1;
5755 aSig1 = extractFloat128Frac1( a );
5756 aSig0 = extractFloat128Frac0( a );
5757 aExp = extractFloat128Exp( a );
5758 aSign = extractFloat128Sign( a );
5759 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5760 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5761 aSig0 |= ( aSig1 != 0 );
5762 shiftCount = 0x4028 - aExp;
5763 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5764 return roundAndPackInt32(aSign, aSig0, status);
5768 /*----------------------------------------------------------------------------
5769 | Returns the result of converting the quadruple-precision floating-point
5770 | value `a' to the 32-bit two's complement integer format. The conversion
5771 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5772 | Arithmetic, except that the conversion is always rounded toward zero. If
5773 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
5774 | conversion overflows, the largest integer with the same sign as `a' is
5775 | returned.
5776 *----------------------------------------------------------------------------*/
5778 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
5780 flag aSign;
5781 int32_t aExp, shiftCount;
5782 uint64_t aSig0, aSig1, savedASig;
5783 int32_t z;
5785 aSig1 = extractFloat128Frac1( a );
5786 aSig0 = extractFloat128Frac0( a );
5787 aExp = extractFloat128Exp( a );
5788 aSign = extractFloat128Sign( a );
5789 aSig0 |= ( aSig1 != 0 );
5790 if ( 0x401E < aExp ) {
5791 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5792 goto invalid;
5794 else if ( aExp < 0x3FFF ) {
5795 if (aExp || aSig0) {
5796 status->float_exception_flags |= float_flag_inexact;
5798 return 0;
5800 aSig0 |= LIT64( 0x0001000000000000 );
5801 shiftCount = 0x402F - aExp;
5802 savedASig = aSig0;
5803 aSig0 >>= shiftCount;
5804 z = aSig0;
5805 if ( aSign ) z = - z;
5806 if ( ( z < 0 ) ^ aSign ) {
5807 invalid:
5808 float_raise(float_flag_invalid, status);
5809 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5811 if ( ( aSig0<<shiftCount ) != savedASig ) {
5812 status->float_exception_flags |= float_flag_inexact;
5814 return z;
5818 /*----------------------------------------------------------------------------
5819 | Returns the result of converting the quadruple-precision floating-point
5820 | value `a' to the 64-bit two's complement integer format. The conversion
5821 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5822 | Arithmetic---which means in particular that the conversion is rounded
5823 | according to the current rounding mode. If `a' is a NaN, the largest
5824 | positive integer is returned. Otherwise, if the conversion overflows, the
5825 | largest integer with the same sign as `a' is returned.
5826 *----------------------------------------------------------------------------*/
5828 int64_t float128_to_int64(float128 a, float_status *status)
5830 flag aSign;
5831 int32_t aExp, shiftCount;
5832 uint64_t aSig0, aSig1;
5834 aSig1 = extractFloat128Frac1( a );
5835 aSig0 = extractFloat128Frac0( a );
5836 aExp = extractFloat128Exp( a );
5837 aSign = extractFloat128Sign( a );
5838 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5839 shiftCount = 0x402F - aExp;
5840 if ( shiftCount <= 0 ) {
5841 if ( 0x403E < aExp ) {
5842 float_raise(float_flag_invalid, status);
5843 if ( ! aSign
5844 || ( ( aExp == 0x7FFF )
5845 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5848 return LIT64( 0x7FFFFFFFFFFFFFFF );
5850 return (int64_t) LIT64( 0x8000000000000000 );
5852 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5854 else {
5855 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5857 return roundAndPackInt64(aSign, aSig0, aSig1, status);
5861 /*----------------------------------------------------------------------------
5862 | Returns the result of converting the quadruple-precision floating-point
5863 | value `a' to the 64-bit two's complement integer format. The conversion
5864 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5865 | Arithmetic, except that the conversion is always rounded toward zero.
5866 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
5867 | the conversion overflows, the largest integer with the same sign as `a' is
5868 | returned.
5869 *----------------------------------------------------------------------------*/
5871 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
5873 flag aSign;
5874 int32_t aExp, shiftCount;
5875 uint64_t aSig0, aSig1;
5876 int64_t z;
5878 aSig1 = extractFloat128Frac1( a );
5879 aSig0 = extractFloat128Frac0( a );
5880 aExp = extractFloat128Exp( a );
5881 aSign = extractFloat128Sign( a );
5882 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5883 shiftCount = aExp - 0x402F;
5884 if ( 0 < shiftCount ) {
5885 if ( 0x403E <= aExp ) {
5886 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5887 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
5888 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5889 if (aSig1) {
5890 status->float_exception_flags |= float_flag_inexact;
5893 else {
5894 float_raise(float_flag_invalid, status);
5895 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5896 return LIT64( 0x7FFFFFFFFFFFFFFF );
5899 return (int64_t) LIT64( 0x8000000000000000 );
5901 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
5902 if ( (uint64_t) ( aSig1<<shiftCount ) ) {
5903 status->float_exception_flags |= float_flag_inexact;
5906 else {
5907 if ( aExp < 0x3FFF ) {
5908 if ( aExp | aSig0 | aSig1 ) {
5909 status->float_exception_flags |= float_flag_inexact;
5911 return 0;
5913 z = aSig0>>( - shiftCount );
5914 if ( aSig1
5915 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
5916 status->float_exception_flags |= float_flag_inexact;
5919 if ( aSign ) z = - z;
5920 return z;
5924 /*----------------------------------------------------------------------------
5925 | Returns the result of converting the quadruple-precision floating-point value
5926 | `a' to the 64-bit unsigned integer format. The conversion is
5927 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5928 | Arithmetic---which means in particular that the conversion is rounded
5929 | according to the current rounding mode. If `a' is a NaN, the largest
5930 | positive integer is returned. If the conversion overflows, the
5931 | largest unsigned integer is returned. If 'a' is negative, the value is
5932 | rounded and zero is returned; negative values that do not round to zero
5933 | will raise the inexact exception.
5934 *----------------------------------------------------------------------------*/
5936 uint64_t float128_to_uint64(float128 a, float_status *status)
5938 flag aSign;
5939 int aExp;
5940 int shiftCount;
5941 uint64_t aSig0, aSig1;
5943 aSig0 = extractFloat128Frac0(a);
5944 aSig1 = extractFloat128Frac1(a);
5945 aExp = extractFloat128Exp(a);
5946 aSign = extractFloat128Sign(a);
5947 if (aSign && (aExp > 0x3FFE)) {
5948 float_raise(float_flag_invalid, status);
5949 if (float128_is_any_nan(a)) {
5950 return LIT64(0xFFFFFFFFFFFFFFFF);
5951 } else {
5952 return 0;
5955 if (aExp) {
5956 aSig0 |= LIT64(0x0001000000000000);
5958 shiftCount = 0x402F - aExp;
5959 if (shiftCount <= 0) {
5960 if (0x403E < aExp) {
5961 float_raise(float_flag_invalid, status);
5962 return LIT64(0xFFFFFFFFFFFFFFFF);
5964 shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
5965 } else {
5966 shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
5968 return roundAndPackUint64(aSign, aSig0, aSig1, status);
5971 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
5973 uint64_t v;
5974 signed char current_rounding_mode = status->float_rounding_mode;
5976 set_float_rounding_mode(float_round_to_zero, status);
5977 v = float128_to_uint64(a, status);
5978 set_float_rounding_mode(current_rounding_mode, status);
5980 return v;
5983 /*----------------------------------------------------------------------------
5984 | Returns the result of converting the quadruple-precision floating-point
5985 | value `a' to the 32-bit unsigned integer format. The conversion
5986 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5987 | Arithmetic except that the conversion is always rounded toward zero.
5988 | If `a' is a NaN, the largest positive integer is returned. Otherwise,
5989 | if the conversion overflows, the largest unsigned integer is returned.
5990 | If 'a' is negative, the value is rounded and zero is returned; negative
5991 | values that do not round to zero will raise the inexact exception.
5992 *----------------------------------------------------------------------------*/
5994 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
5996 uint64_t v;
5997 uint32_t res;
5998 int old_exc_flags = get_float_exception_flags(status);
6000 v = float128_to_uint64_round_to_zero(a, status);
6001 if (v > 0xffffffff) {
6002 res = 0xffffffff;
6003 } else {
6004 return v;
6006 set_float_exception_flags(old_exc_flags, status);
6007 float_raise(float_flag_invalid, status);
6008 return res;
6011 /*----------------------------------------------------------------------------
6012 | Returns the result of converting the quadruple-precision floating-point
6013 | value `a' to the single-precision floating-point format. The conversion
6014 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6015 | Arithmetic.
6016 *----------------------------------------------------------------------------*/
6018 float32 float128_to_float32(float128 a, float_status *status)
6020 flag aSign;
6021 int32_t aExp;
6022 uint64_t aSig0, aSig1;
6023 uint32_t zSig;
6025 aSig1 = extractFloat128Frac1( a );
6026 aSig0 = extractFloat128Frac0( a );
6027 aExp = extractFloat128Exp( a );
6028 aSign = extractFloat128Sign( a );
6029 if ( aExp == 0x7FFF ) {
6030 if ( aSig0 | aSig1 ) {
6031 return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
6033 return packFloat32( aSign, 0xFF, 0 );
6035 aSig0 |= ( aSig1 != 0 );
6036 shift64RightJamming( aSig0, 18, &aSig0 );
6037 zSig = aSig0;
6038 if ( aExp || zSig ) {
6039 zSig |= 0x40000000;
6040 aExp -= 0x3F81;
6042 return roundAndPackFloat32(aSign, aExp, zSig, status);
6046 /*----------------------------------------------------------------------------
6047 | Returns the result of converting the quadruple-precision floating-point
6048 | value `a' to the double-precision floating-point format. The conversion
6049 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6050 | Arithmetic.
6051 *----------------------------------------------------------------------------*/
6053 float64 float128_to_float64(float128 a, float_status *status)
6055 flag aSign;
6056 int32_t aExp;
6057 uint64_t aSig0, aSig1;
6059 aSig1 = extractFloat128Frac1( a );
6060 aSig0 = extractFloat128Frac0( a );
6061 aExp = extractFloat128Exp( a );
6062 aSign = extractFloat128Sign( a );
6063 if ( aExp == 0x7FFF ) {
6064 if ( aSig0 | aSig1 ) {
6065 return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
6067 return packFloat64( aSign, 0x7FF, 0 );
6069 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6070 aSig0 |= ( aSig1 != 0 );
6071 if ( aExp || aSig0 ) {
6072 aSig0 |= LIT64( 0x4000000000000000 );
6073 aExp -= 0x3C01;
6075 return roundAndPackFloat64(aSign, aExp, aSig0, status);
6079 /*----------------------------------------------------------------------------
6080 | Returns the result of converting the quadruple-precision floating-point
6081 | value `a' to the extended double-precision floating-point format. The
6082 | conversion is performed according to the IEC/IEEE Standard for Binary
6083 | Floating-Point Arithmetic.
6084 *----------------------------------------------------------------------------*/
6086 floatx80 float128_to_floatx80(float128 a, float_status *status)
6088 flag aSign;
6089 int32_t aExp;
6090 uint64_t aSig0, aSig1;
6092 aSig1 = extractFloat128Frac1( a );
6093 aSig0 = extractFloat128Frac0( a );
6094 aExp = extractFloat128Exp( a );
6095 aSign = extractFloat128Sign( a );
6096 if ( aExp == 0x7FFF ) {
6097 if ( aSig0 | aSig1 ) {
6098 return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
6100 return packFloatx80(aSign, floatx80_infinity_high,
6101 floatx80_infinity_low);
6103 if ( aExp == 0 ) {
6104 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
6105 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6107 else {
6108 aSig0 |= LIT64( 0x0001000000000000 );
6110 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
6111 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
6115 /*----------------------------------------------------------------------------
6116 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6117 | returns the result as a quadruple-precision floating-point value. The
6118 | operation is performed according to the IEC/IEEE Standard for Binary
6119 | Floating-Point Arithmetic.
6120 *----------------------------------------------------------------------------*/
6122 float128 float128_round_to_int(float128 a, float_status *status)
6124 flag aSign;
6125 int32_t aExp;
6126 uint64_t lastBitMask, roundBitsMask;
6127 float128 z;
6129 aExp = extractFloat128Exp( a );
6130 if ( 0x402F <= aExp ) {
6131 if ( 0x406F <= aExp ) {
6132 if ( ( aExp == 0x7FFF )
6133 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6135 return propagateFloat128NaN(a, a, status);
6137 return a;
6139 lastBitMask = 1;
6140 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
6141 roundBitsMask = lastBitMask - 1;
6142 z = a;
6143 switch (status->float_rounding_mode) {
6144 case float_round_nearest_even:
6145 if ( lastBitMask ) {
6146 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6147 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6149 else {
6150 if ( (int64_t) z.low < 0 ) {
6151 ++z.high;
6152 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
6155 break;
6156 case float_round_ties_away:
6157 if (lastBitMask) {
6158 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6159 } else {
6160 if ((int64_t) z.low < 0) {
6161 ++z.high;
6164 break;
6165 case float_round_to_zero:
6166 break;
6167 case float_round_up:
6168 if (!extractFloat128Sign(z)) {
6169 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6171 break;
6172 case float_round_down:
6173 if (extractFloat128Sign(z)) {
6174 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6176 break;
6177 default:
6178 abort();
6180 z.low &= ~ roundBitsMask;
6182 else {
6183 if ( aExp < 0x3FFF ) {
6184 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
6185 status->float_exception_flags |= float_flag_inexact;
6186 aSign = extractFloat128Sign( a );
6187 switch (status->float_rounding_mode) {
6188 case float_round_nearest_even:
6189 if ( ( aExp == 0x3FFE )
6190 && ( extractFloat128Frac0( a )
6191 | extractFloat128Frac1( a ) )
6193 return packFloat128( aSign, 0x3FFF, 0, 0 );
6195 break;
6196 case float_round_ties_away:
6197 if (aExp == 0x3FFE) {
6198 return packFloat128(aSign, 0x3FFF, 0, 0);
6200 break;
6201 case float_round_down:
6202 return
6203 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6204 : packFloat128( 0, 0, 0, 0 );
6205 case float_round_up:
6206 return
6207 aSign ? packFloat128( 1, 0, 0, 0 )
6208 : packFloat128( 0, 0x3FFF, 0, 0 );
6210 return packFloat128( aSign, 0, 0, 0 );
6212 lastBitMask = 1;
6213 lastBitMask <<= 0x402F - aExp;
6214 roundBitsMask = lastBitMask - 1;
6215 z.low = 0;
6216 z.high = a.high;
6217 switch (status->float_rounding_mode) {
6218 case float_round_nearest_even:
6219 z.high += lastBitMask>>1;
6220 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6221 z.high &= ~ lastBitMask;
6223 break;
6224 case float_round_ties_away:
6225 z.high += lastBitMask>>1;
6226 break;
6227 case float_round_to_zero:
6228 break;
6229 case float_round_up:
6230 if (!extractFloat128Sign(z)) {
6231 z.high |= ( a.low != 0 );
6232 z.high += roundBitsMask;
6234 break;
6235 case float_round_down:
6236 if (extractFloat128Sign(z)) {
6237 z.high |= (a.low != 0);
6238 z.high += roundBitsMask;
6240 break;
6241 default:
6242 abort();
6244 z.high &= ~ roundBitsMask;
6246 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
6247 status->float_exception_flags |= float_flag_inexact;
6249 return z;
6253 /*----------------------------------------------------------------------------
6254 | Returns the result of adding the absolute values of the quadruple-precision
6255 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
6256 | before being returned. `zSign' is ignored if the result is a NaN.
6257 | The addition is performed according to the IEC/IEEE Standard for Binary
6258 | Floating-Point Arithmetic.
6259 *----------------------------------------------------------------------------*/
6261 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
6262 float_status *status)
6264 int32_t aExp, bExp, zExp;
6265 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6266 int32_t expDiff;
6268 aSig1 = extractFloat128Frac1( a );
6269 aSig0 = extractFloat128Frac0( a );
6270 aExp = extractFloat128Exp( a );
6271 bSig1 = extractFloat128Frac1( b );
6272 bSig0 = extractFloat128Frac0( b );
6273 bExp = extractFloat128Exp( b );
6274 expDiff = aExp - bExp;
6275 if ( 0 < expDiff ) {
6276 if ( aExp == 0x7FFF ) {
6277 if (aSig0 | aSig1) {
6278 return propagateFloat128NaN(a, b, status);
6280 return a;
6282 if ( bExp == 0 ) {
6283 --expDiff;
6285 else {
6286 bSig0 |= LIT64( 0x0001000000000000 );
6288 shift128ExtraRightJamming(
6289 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6290 zExp = aExp;
6292 else if ( expDiff < 0 ) {
6293 if ( bExp == 0x7FFF ) {
6294 if (bSig0 | bSig1) {
6295 return propagateFloat128NaN(a, b, status);
6297 return packFloat128( zSign, 0x7FFF, 0, 0 );
6299 if ( aExp == 0 ) {
6300 ++expDiff;
6302 else {
6303 aSig0 |= LIT64( 0x0001000000000000 );
6305 shift128ExtraRightJamming(
6306 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6307 zExp = bExp;
6309 else {
6310 if ( aExp == 0x7FFF ) {
6311 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6312 return propagateFloat128NaN(a, b, status);
6314 return a;
6316 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6317 if ( aExp == 0 ) {
6318 if (status->flush_to_zero) {
6319 if (zSig0 | zSig1) {
6320 float_raise(float_flag_output_denormal, status);
6322 return packFloat128(zSign, 0, 0, 0);
6324 return packFloat128( zSign, 0, zSig0, zSig1 );
6326 zSig2 = 0;
6327 zSig0 |= LIT64( 0x0002000000000000 );
6328 zExp = aExp;
6329 goto shiftRight1;
6331 aSig0 |= LIT64( 0x0001000000000000 );
6332 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6333 --zExp;
6334 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6335 ++zExp;
6336 shiftRight1:
6337 shift128ExtraRightJamming(
6338 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6339 roundAndPack:
6340 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6344 /*----------------------------------------------------------------------------
6345 | Returns the result of subtracting the absolute values of the quadruple-
6346 | precision floating-point values `a' and `b'. If `zSign' is 1, the
6347 | difference is negated before being returned. `zSign' is ignored if the
6348 | result is a NaN. The subtraction is performed according to the IEC/IEEE
6349 | Standard for Binary Floating-Point Arithmetic.
6350 *----------------------------------------------------------------------------*/
6352 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
6353 float_status *status)
6355 int32_t aExp, bExp, zExp;
6356 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
6357 int32_t expDiff;
6359 aSig1 = extractFloat128Frac1( a );
6360 aSig0 = extractFloat128Frac0( a );
6361 aExp = extractFloat128Exp( a );
6362 bSig1 = extractFloat128Frac1( b );
6363 bSig0 = extractFloat128Frac0( b );
6364 bExp = extractFloat128Exp( b );
6365 expDiff = aExp - bExp;
6366 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6367 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6368 if ( 0 < expDiff ) goto aExpBigger;
6369 if ( expDiff < 0 ) goto bExpBigger;
6370 if ( aExp == 0x7FFF ) {
6371 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6372 return propagateFloat128NaN(a, b, status);
6374 float_raise(float_flag_invalid, status);
6375 return float128_default_nan(status);
6377 if ( aExp == 0 ) {
6378 aExp = 1;
6379 bExp = 1;
6381 if ( bSig0 < aSig0 ) goto aBigger;
6382 if ( aSig0 < bSig0 ) goto bBigger;
6383 if ( bSig1 < aSig1 ) goto aBigger;
6384 if ( aSig1 < bSig1 ) goto bBigger;
6385 return packFloat128(status->float_rounding_mode == float_round_down,
6386 0, 0, 0);
6387 bExpBigger:
6388 if ( bExp == 0x7FFF ) {
6389 if (bSig0 | bSig1) {
6390 return propagateFloat128NaN(a, b, status);
6392 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6394 if ( aExp == 0 ) {
6395 ++expDiff;
6397 else {
6398 aSig0 |= LIT64( 0x4000000000000000 );
6400 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6401 bSig0 |= LIT64( 0x4000000000000000 );
6402 bBigger:
6403 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6404 zExp = bExp;
6405 zSign ^= 1;
6406 goto normalizeRoundAndPack;
6407 aExpBigger:
6408 if ( aExp == 0x7FFF ) {
6409 if (aSig0 | aSig1) {
6410 return propagateFloat128NaN(a, b, status);
6412 return a;
6414 if ( bExp == 0 ) {
6415 --expDiff;
6417 else {
6418 bSig0 |= LIT64( 0x4000000000000000 );
6420 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6421 aSig0 |= LIT64( 0x4000000000000000 );
6422 aBigger:
6423 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6424 zExp = aExp;
6425 normalizeRoundAndPack:
6426 --zExp;
6427 return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
6428 status);
6432 /*----------------------------------------------------------------------------
6433 | Returns the result of adding the quadruple-precision floating-point values
6434 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
6435 | for Binary Floating-Point Arithmetic.
6436 *----------------------------------------------------------------------------*/
6438 float128 float128_add(float128 a, float128 b, float_status *status)
6440 flag aSign, bSign;
6442 aSign = extractFloat128Sign( a );
6443 bSign = extractFloat128Sign( b );
6444 if ( aSign == bSign ) {
6445 return addFloat128Sigs(a, b, aSign, status);
6447 else {
6448 return subFloat128Sigs(a, b, aSign, status);
6453 /*----------------------------------------------------------------------------
6454 | Returns the result of subtracting the quadruple-precision floating-point
6455 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6456 | Standard for Binary Floating-Point Arithmetic.
6457 *----------------------------------------------------------------------------*/
6459 float128 float128_sub(float128 a, float128 b, float_status *status)
6461 flag aSign, bSign;
6463 aSign = extractFloat128Sign( a );
6464 bSign = extractFloat128Sign( b );
6465 if ( aSign == bSign ) {
6466 return subFloat128Sigs(a, b, aSign, status);
6468 else {
6469 return addFloat128Sigs(a, b, aSign, status);
6474 /*----------------------------------------------------------------------------
6475 | Returns the result of multiplying the quadruple-precision floating-point
6476 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6477 | Standard for Binary Floating-Point Arithmetic.
6478 *----------------------------------------------------------------------------*/
6480 float128 float128_mul(float128 a, float128 b, float_status *status)
6482 flag aSign, bSign, zSign;
6483 int32_t aExp, bExp, zExp;
6484 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6486 aSig1 = extractFloat128Frac1( a );
6487 aSig0 = extractFloat128Frac0( a );
6488 aExp = extractFloat128Exp( a );
6489 aSign = extractFloat128Sign( a );
6490 bSig1 = extractFloat128Frac1( b );
6491 bSig0 = extractFloat128Frac0( b );
6492 bExp = extractFloat128Exp( b );
6493 bSign = extractFloat128Sign( b );
6494 zSign = aSign ^ bSign;
6495 if ( aExp == 0x7FFF ) {
6496 if ( ( aSig0 | aSig1 )
6497 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6498 return propagateFloat128NaN(a, b, status);
6500 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6501 return packFloat128( zSign, 0x7FFF, 0, 0 );
6503 if ( bExp == 0x7FFF ) {
6504 if (bSig0 | bSig1) {
6505 return propagateFloat128NaN(a, b, status);
6507 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6508 invalid:
6509 float_raise(float_flag_invalid, status);
6510 return float128_default_nan(status);
6512 return packFloat128( zSign, 0x7FFF, 0, 0 );
6514 if ( aExp == 0 ) {
6515 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6516 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6518 if ( bExp == 0 ) {
6519 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6520 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6522 zExp = aExp + bExp - 0x4000;
6523 aSig0 |= LIT64( 0x0001000000000000 );
6524 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6525 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6526 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6527 zSig2 |= ( zSig3 != 0 );
6528 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6529 shift128ExtraRightJamming(
6530 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6531 ++zExp;
6533 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6537 /*----------------------------------------------------------------------------
6538 | Returns the result of dividing the quadruple-precision floating-point value
6539 | `a' by the corresponding value `b'. The operation is performed according to
6540 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6541 *----------------------------------------------------------------------------*/
6543 float128 float128_div(float128 a, float128 b, float_status *status)
6545 flag aSign, bSign, zSign;
6546 int32_t aExp, bExp, zExp;
6547 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6548 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6550 aSig1 = extractFloat128Frac1( a );
6551 aSig0 = extractFloat128Frac0( a );
6552 aExp = extractFloat128Exp( a );
6553 aSign = extractFloat128Sign( a );
6554 bSig1 = extractFloat128Frac1( b );
6555 bSig0 = extractFloat128Frac0( b );
6556 bExp = extractFloat128Exp( b );
6557 bSign = extractFloat128Sign( b );
6558 zSign = aSign ^ bSign;
6559 if ( aExp == 0x7FFF ) {
6560 if (aSig0 | aSig1) {
6561 return propagateFloat128NaN(a, b, status);
6563 if ( bExp == 0x7FFF ) {
6564 if (bSig0 | bSig1) {
6565 return propagateFloat128NaN(a, b, status);
6567 goto invalid;
6569 return packFloat128( zSign, 0x7FFF, 0, 0 );
6571 if ( bExp == 0x7FFF ) {
6572 if (bSig0 | bSig1) {
6573 return propagateFloat128NaN(a, b, status);
6575 return packFloat128( zSign, 0, 0, 0 );
6577 if ( bExp == 0 ) {
6578 if ( ( bSig0 | bSig1 ) == 0 ) {
6579 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6580 invalid:
6581 float_raise(float_flag_invalid, status);
6582 return float128_default_nan(status);
6584 float_raise(float_flag_divbyzero, status);
6585 return packFloat128( zSign, 0x7FFF, 0, 0 );
6587 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6589 if ( aExp == 0 ) {
6590 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6591 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6593 zExp = aExp - bExp + 0x3FFD;
6594 shortShift128Left(
6595 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6596 shortShift128Left(
6597 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6598 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6599 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6600 ++zExp;
6602 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6603 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6604 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
6605 while ( (int64_t) rem0 < 0 ) {
6606 --zSig0;
6607 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6609 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6610 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6611 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6612 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
6613 while ( (int64_t) rem1 < 0 ) {
6614 --zSig1;
6615 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6617 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6619 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6620 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6624 /*----------------------------------------------------------------------------
6625 | Returns the remainder of the quadruple-precision floating-point value `a'
6626 | with respect to the corresponding value `b'. The operation is performed
6627 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6628 *----------------------------------------------------------------------------*/
6630 float128 float128_rem(float128 a, float128 b, float_status *status)
6632 flag aSign, zSign;
6633 int32_t aExp, bExp, expDiff;
6634 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6635 uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6636 int64_t sigMean0;
6638 aSig1 = extractFloat128Frac1( a );
6639 aSig0 = extractFloat128Frac0( a );
6640 aExp = extractFloat128Exp( a );
6641 aSign = extractFloat128Sign( a );
6642 bSig1 = extractFloat128Frac1( b );
6643 bSig0 = extractFloat128Frac0( b );
6644 bExp = extractFloat128Exp( b );
6645 if ( aExp == 0x7FFF ) {
6646 if ( ( aSig0 | aSig1 )
6647 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6648 return propagateFloat128NaN(a, b, status);
6650 goto invalid;
6652 if ( bExp == 0x7FFF ) {
6653 if (bSig0 | bSig1) {
6654 return propagateFloat128NaN(a, b, status);
6656 return a;
6658 if ( bExp == 0 ) {
6659 if ( ( bSig0 | bSig1 ) == 0 ) {
6660 invalid:
6661 float_raise(float_flag_invalid, status);
6662 return float128_default_nan(status);
6664 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6666 if ( aExp == 0 ) {
6667 if ( ( aSig0 | aSig1 ) == 0 ) return a;
6668 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6670 expDiff = aExp - bExp;
6671 if ( expDiff < -1 ) return a;
6672 shortShift128Left(
6673 aSig0 | LIT64( 0x0001000000000000 ),
6674 aSig1,
6675 15 - ( expDiff < 0 ),
6676 &aSig0,
6677 &aSig1
6679 shortShift128Left(
6680 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6681 q = le128( bSig0, bSig1, aSig0, aSig1 );
6682 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6683 expDiff -= 64;
6684 while ( 0 < expDiff ) {
6685 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6686 q = ( 4 < q ) ? q - 4 : 0;
6687 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6688 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6689 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6690 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6691 expDiff -= 61;
6693 if ( -64 < expDiff ) {
6694 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6695 q = ( 4 < q ) ? q - 4 : 0;
6696 q >>= - expDiff;
6697 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6698 expDiff += 52;
6699 if ( expDiff < 0 ) {
6700 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6702 else {
6703 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6705 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6706 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6708 else {
6709 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6710 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6712 do {
6713 alternateASig0 = aSig0;
6714 alternateASig1 = aSig1;
6715 ++q;
6716 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6717 } while ( 0 <= (int64_t) aSig0 );
6718 add128(
6719 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6720 if ( ( sigMean0 < 0 )
6721 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6722 aSig0 = alternateASig0;
6723 aSig1 = alternateASig1;
6725 zSign = ( (int64_t) aSig0 < 0 );
6726 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6727 return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
6728 status);
6731 /*----------------------------------------------------------------------------
6732 | Returns the square root of the quadruple-precision floating-point value `a'.
6733 | The operation is performed according to the IEC/IEEE Standard for Binary
6734 | Floating-Point Arithmetic.
6735 *----------------------------------------------------------------------------*/
6737 float128 float128_sqrt(float128 a, float_status *status)
6739 flag aSign;
6740 int32_t aExp, zExp;
6741 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6742 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6744 aSig1 = extractFloat128Frac1( a );
6745 aSig0 = extractFloat128Frac0( a );
6746 aExp = extractFloat128Exp( a );
6747 aSign = extractFloat128Sign( a );
6748 if ( aExp == 0x7FFF ) {
6749 if (aSig0 | aSig1) {
6750 return propagateFloat128NaN(a, a, status);
6752 if ( ! aSign ) return a;
6753 goto invalid;
6755 if ( aSign ) {
6756 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6757 invalid:
6758 float_raise(float_flag_invalid, status);
6759 return float128_default_nan(status);
6761 if ( aExp == 0 ) {
6762 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6763 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6765 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6766 aSig0 |= LIT64( 0x0001000000000000 );
6767 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6768 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6769 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6770 doubleZSig0 = zSig0<<1;
6771 mul64To128( zSig0, zSig0, &term0, &term1 );
6772 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6773 while ( (int64_t) rem0 < 0 ) {
6774 --zSig0;
6775 doubleZSig0 -= 2;
6776 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6778 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6779 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6780 if ( zSig1 == 0 ) zSig1 = 1;
6781 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6782 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6783 mul64To128( zSig1, zSig1, &term2, &term3 );
6784 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6785 while ( (int64_t) rem1 < 0 ) {
6786 --zSig1;
6787 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6788 term3 |= 1;
6789 term2 |= doubleZSig0;
6790 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6792 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6794 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6795 return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
6799 /*----------------------------------------------------------------------------
6800 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6801 | the corresponding value `b', and 0 otherwise. The invalid exception is
6802 | raised if either operand is a NaN. Otherwise, the comparison is performed
6803 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6804 *----------------------------------------------------------------------------*/
6806 int float128_eq(float128 a, float128 b, float_status *status)
6809 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6810 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6811 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6812 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6814 float_raise(float_flag_invalid, status);
6815 return 0;
6817 return
6818 ( a.low == b.low )
6819 && ( ( a.high == b.high )
6820 || ( ( a.low == 0 )
6821 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6826 /*----------------------------------------------------------------------------
6827 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6828 | or equal to the corresponding value `b', and 0 otherwise. The invalid
6829 | exception is raised if either operand is a NaN. The comparison is performed
6830 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6831 *----------------------------------------------------------------------------*/
6833 int float128_le(float128 a, float128 b, float_status *status)
6835 flag aSign, bSign;
6837 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6838 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6839 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6840 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6842 float_raise(float_flag_invalid, status);
6843 return 0;
6845 aSign = extractFloat128Sign( a );
6846 bSign = extractFloat128Sign( b );
6847 if ( aSign != bSign ) {
6848 return
6849 aSign
6850 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6851 == 0 );
6853 return
6854 aSign ? le128( b.high, b.low, a.high, a.low )
6855 : le128( a.high, a.low, b.high, b.low );
6859 /*----------------------------------------------------------------------------
6860 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6861 | the corresponding value `b', and 0 otherwise. The invalid exception is
6862 | raised if either operand is a NaN. The comparison is performed according
6863 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6864 *----------------------------------------------------------------------------*/
6866 int float128_lt(float128 a, float128 b, float_status *status)
6868 flag aSign, bSign;
6870 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6871 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6872 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6873 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6875 float_raise(float_flag_invalid, status);
6876 return 0;
6878 aSign = extractFloat128Sign( a );
6879 bSign = extractFloat128Sign( b );
6880 if ( aSign != bSign ) {
6881 return
6882 aSign
6883 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6884 != 0 );
6886 return
6887 aSign ? lt128( b.high, b.low, a.high, a.low )
6888 : lt128( a.high, a.low, b.high, b.low );
6892 /*----------------------------------------------------------------------------
6893 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6894 | be compared, and 0 otherwise. The invalid exception is raised if either
6895 | operand is a NaN. The comparison is performed according to the IEC/IEEE
6896 | Standard for Binary Floating-Point Arithmetic.
6897 *----------------------------------------------------------------------------*/
6899 int float128_unordered(float128 a, float128 b, float_status *status)
6901 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6902 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6903 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6904 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6906 float_raise(float_flag_invalid, status);
6907 return 1;
6909 return 0;
6912 /*----------------------------------------------------------------------------
6913 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6914 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6915 | exception. The comparison is performed according to the IEC/IEEE Standard
6916 | for Binary Floating-Point Arithmetic.
6917 *----------------------------------------------------------------------------*/
6919 int float128_eq_quiet(float128 a, float128 b, float_status *status)
6922 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6923 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6924 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6925 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6927 if (float128_is_signaling_nan(a, status)
6928 || float128_is_signaling_nan(b, status)) {
6929 float_raise(float_flag_invalid, status);
6931 return 0;
6933 return
6934 ( a.low == b.low )
6935 && ( ( a.high == b.high )
6936 || ( ( a.low == 0 )
6937 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6942 /*----------------------------------------------------------------------------
6943 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6944 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
6945 | cause an exception. Otherwise, the comparison is performed according to the
6946 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6947 *----------------------------------------------------------------------------*/
6949 int float128_le_quiet(float128 a, float128 b, float_status *status)
6951 flag aSign, bSign;
6953 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6954 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6955 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6956 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6958 if (float128_is_signaling_nan(a, status)
6959 || float128_is_signaling_nan(b, status)) {
6960 float_raise(float_flag_invalid, status);
6962 return 0;
6964 aSign = extractFloat128Sign( a );
6965 bSign = extractFloat128Sign( b );
6966 if ( aSign != bSign ) {
6967 return
6968 aSign
6969 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6970 == 0 );
6972 return
6973 aSign ? le128( b.high, b.low, a.high, a.low )
6974 : le128( a.high, a.low, b.high, b.low );
6978 /*----------------------------------------------------------------------------
6979 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6980 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6981 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
6982 | Standard for Binary Floating-Point Arithmetic.
6983 *----------------------------------------------------------------------------*/
6985 int float128_lt_quiet(float128 a, float128 b, float_status *status)
6987 flag aSign, bSign;
6989 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6990 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6991 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6992 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6994 if (float128_is_signaling_nan(a, status)
6995 || float128_is_signaling_nan(b, status)) {
6996 float_raise(float_flag_invalid, status);
6998 return 0;
7000 aSign = extractFloat128Sign( a );
7001 bSign = extractFloat128Sign( b );
7002 if ( aSign != bSign ) {
7003 return
7004 aSign
7005 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7006 != 0 );
7008 return
7009 aSign ? lt128( b.high, b.low, a.high, a.low )
7010 : lt128( a.high, a.low, b.high, b.low );
7014 /*----------------------------------------------------------------------------
7015 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7016 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
7017 | comparison is performed according to the IEC/IEEE Standard for Binary
7018 | Floating-Point Arithmetic.
7019 *----------------------------------------------------------------------------*/
7021 int float128_unordered_quiet(float128 a, float128 b, float_status *status)
7023 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
7024 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7025 || ( ( extractFloat128Exp( b ) == 0x7FFF )
7026 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7028 if (float128_is_signaling_nan(a, status)
7029 || float128_is_signaling_nan(b, status)) {
7030 float_raise(float_flag_invalid, status);
7032 return 1;
7034 return 0;
7037 static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
7038 int is_quiet, float_status *status)
7040 flag aSign, bSign;
7042 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
7043 float_raise(float_flag_invalid, status);
7044 return float_relation_unordered;
7046 if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7047 ( extractFloatx80Frac( a )<<1 ) ) ||
7048 ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7049 ( extractFloatx80Frac( b )<<1 ) )) {
7050 if (!is_quiet ||
7051 floatx80_is_signaling_nan(a, status) ||
7052 floatx80_is_signaling_nan(b, status)) {
7053 float_raise(float_flag_invalid, status);
7055 return float_relation_unordered;
7057 aSign = extractFloatx80Sign( a );
7058 bSign = extractFloatx80Sign( b );
7059 if ( aSign != bSign ) {
7061 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7062 ( ( a.low | b.low ) == 0 ) ) {
7063 /* zero case */
7064 return float_relation_equal;
7065 } else {
7066 return 1 - (2 * aSign);
7068 } else {
7069 if (a.low == b.low && a.high == b.high) {
7070 return float_relation_equal;
7071 } else {
7072 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7077 int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
7079 return floatx80_compare_internal(a, b, 0, status);
7082 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
7084 return floatx80_compare_internal(a, b, 1, status);
7087 static inline int float128_compare_internal(float128 a, float128 b,
7088 int is_quiet, float_status *status)
7090 flag aSign, bSign;
7092 if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7093 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7094 ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7095 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7096 if (!is_quiet ||
7097 float128_is_signaling_nan(a, status) ||
7098 float128_is_signaling_nan(b, status)) {
7099 float_raise(float_flag_invalid, status);
7101 return float_relation_unordered;
7103 aSign = extractFloat128Sign( a );
7104 bSign = extractFloat128Sign( b );
7105 if ( aSign != bSign ) {
7106 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7107 /* zero case */
7108 return float_relation_equal;
7109 } else {
7110 return 1 - (2 * aSign);
7112 } else {
7113 if (a.low == b.low && a.high == b.high) {
7114 return float_relation_equal;
7115 } else {
7116 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7121 int float128_compare(float128 a, float128 b, float_status *status)
7123 return float128_compare_internal(a, b, 0, status);
7126 int float128_compare_quiet(float128 a, float128 b, float_status *status)
7128 return float128_compare_internal(a, b, 1, status);
7131 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7133 flag aSign;
7134 int32_t aExp;
7135 uint64_t aSig;
7137 if (floatx80_invalid_encoding(a)) {
7138 float_raise(float_flag_invalid, status);
7139 return floatx80_default_nan(status);
7141 aSig = extractFloatx80Frac( a );
7142 aExp = extractFloatx80Exp( a );
7143 aSign = extractFloatx80Sign( a );
7145 if ( aExp == 0x7FFF ) {
7146 if ( aSig<<1 ) {
7147 return propagateFloatx80NaN(a, a, status);
7149 return a;
7152 if (aExp == 0) {
7153 if (aSig == 0) {
7154 return a;
7156 aExp++;
7159 if (n > 0x10000) {
7160 n = 0x10000;
7161 } else if (n < -0x10000) {
7162 n = -0x10000;
7165 aExp += n;
7166 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7167 aSign, aExp, aSig, 0, status);
7170 float128 float128_scalbn(float128 a, int n, float_status *status)
7172 flag aSign;
7173 int32_t aExp;
7174 uint64_t aSig0, aSig1;
7176 aSig1 = extractFloat128Frac1( a );
7177 aSig0 = extractFloat128Frac0( a );
7178 aExp = extractFloat128Exp( a );
7179 aSign = extractFloat128Sign( a );
7180 if ( aExp == 0x7FFF ) {
7181 if ( aSig0 | aSig1 ) {
7182 return propagateFloat128NaN(a, a, status);
7184 return a;
7186 if (aExp != 0) {
7187 aSig0 |= LIT64( 0x0001000000000000 );
7188 } else if (aSig0 == 0 && aSig1 == 0) {
7189 return a;
7190 } else {
7191 aExp++;
7194 if (n > 0x10000) {
7195 n = 0x10000;
7196 } else if (n < -0x10000) {
7197 n = -0x10000;
7200 aExp += n - 1;
7201 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7202 , status);