s390x: introduce 4.0 compat machine
[qemu.git] / fpu / softfloat.c
blobe1eef954e65283470bee1504df2642058a1f9635
1 /*
2 * QEMU float support
4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
10 * the BSD license
11 * GPL-v2-or-later
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
44 ===============================================================================
47 /* BSD licensing:
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
85 #include "qemu/osdep.h"
86 #include "qemu/bitops.h"
87 #include "fpu/softfloat.h"
89 /* We only need stdlib for abort() */
91 /*----------------------------------------------------------------------------
92 | Primitive arithmetic functions, including multi-word arithmetic, and
93 | division and square root approximations. (Can be specialized to target if
94 | desired.)
95 *----------------------------------------------------------------------------*/
96 #include "fpu/softfloat-macros.h"
98 /*----------------------------------------------------------------------------
99 | Returns the fraction bits of the half-precision floating-point value `a'.
100 *----------------------------------------------------------------------------*/
102 static inline uint32_t extractFloat16Frac(float16 a)
104 return float16_val(a) & 0x3ff;
107 /*----------------------------------------------------------------------------
108 | Returns the exponent bits of the half-precision floating-point value `a'.
109 *----------------------------------------------------------------------------*/
111 static inline int extractFloat16Exp(float16 a)
113 return (float16_val(a) >> 10) & 0x1f;
116 /*----------------------------------------------------------------------------
117 | Returns the fraction bits of the single-precision floating-point value `a'.
118 *----------------------------------------------------------------------------*/
120 static inline uint32_t extractFloat32Frac(float32 a)
122 return float32_val(a) & 0x007FFFFF;
125 /*----------------------------------------------------------------------------
126 | Returns the exponent bits of the single-precision floating-point value `a'.
127 *----------------------------------------------------------------------------*/
129 static inline int extractFloat32Exp(float32 a)
131 return (float32_val(a) >> 23) & 0xFF;
134 /*----------------------------------------------------------------------------
135 | Returns the sign bit of the single-precision floating-point value `a'.
136 *----------------------------------------------------------------------------*/
138 static inline flag extractFloat32Sign(float32 a)
140 return float32_val(a) >> 31;
143 /*----------------------------------------------------------------------------
144 | Returns the fraction bits of the double-precision floating-point value `a'.
145 *----------------------------------------------------------------------------*/
147 static inline uint64_t extractFloat64Frac(float64 a)
149 return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF);
152 /*----------------------------------------------------------------------------
153 | Returns the exponent bits of the double-precision floating-point value `a'.
154 *----------------------------------------------------------------------------*/
156 static inline int extractFloat64Exp(float64 a)
158 return (float64_val(a) >> 52) & 0x7FF;
161 /*----------------------------------------------------------------------------
162 | Returns the sign bit of the double-precision floating-point value `a'.
163 *----------------------------------------------------------------------------*/
165 static inline flag extractFloat64Sign(float64 a)
167 return float64_val(a) >> 63;
171 * Classify a floating point number. Everything above float_class_qnan
172 * is a NaN so cls >= float_class_qnan is any NaN.
175 typedef enum __attribute__ ((__packed__)) {
176 float_class_unclassified,
177 float_class_zero,
178 float_class_normal,
179 float_class_inf,
180 float_class_qnan, /* all NaNs from here */
181 float_class_snan,
182 } FloatClass;
184 /* Simple helpers for checking if, or what kind of, NaN we have */
185 static inline __attribute__((unused)) bool is_nan(FloatClass c)
187 return unlikely(c >= float_class_qnan);
190 static inline __attribute__((unused)) bool is_snan(FloatClass c)
192 return c == float_class_snan;
195 static inline __attribute__((unused)) bool is_qnan(FloatClass c)
197 return c == float_class_qnan;
201 * Structure holding all of the decomposed parts of a float. The
202 * exponent is unbiased and the fraction is normalized. All
203 * calculations are done with a 64 bit fraction and then rounded as
204 * appropriate for the final format.
206 * Thanks to the packed FloatClass a decent compiler should be able to
207 * fit the whole structure into registers and avoid using the stack
208 * for parameter passing.
211 typedef struct {
212 uint64_t frac;
213 int32_t exp;
214 FloatClass cls;
215 bool sign;
216 } FloatParts;
218 #define DECOMPOSED_BINARY_POINT (64 - 2)
219 #define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT)
220 #define DECOMPOSED_OVERFLOW_BIT (DECOMPOSED_IMPLICIT_BIT << 1)
222 /* Structure holding all of the relevant parameters for a format.
223 * exp_size: the size of the exponent field
224 * exp_bias: the offset applied to the exponent field
225 * exp_max: the maximum normalised exponent
226 * frac_size: the size of the fraction field
227 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
228 * The following are computed based the size of fraction
229 * frac_lsb: least significant bit of fraction
230 * frac_lsbm1: the bit below the least significant bit (for rounding)
231 * round_mask/roundeven_mask: masks used for rounding
232 * The following optional modifiers are available:
233 * arm_althp: handle ARM Alternative Half Precision
235 typedef struct {
236 int exp_size;
237 int exp_bias;
238 int exp_max;
239 int frac_size;
240 int frac_shift;
241 uint64_t frac_lsb;
242 uint64_t frac_lsbm1;
243 uint64_t round_mask;
244 uint64_t roundeven_mask;
245 bool arm_althp;
246 } FloatFmt;
248 /* Expand fields based on the size of exponent and fraction */
249 #define FLOAT_PARAMS(E, F) \
250 .exp_size = E, \
251 .exp_bias = ((1 << E) - 1) >> 1, \
252 .exp_max = (1 << E) - 1, \
253 .frac_size = F, \
254 .frac_shift = DECOMPOSED_BINARY_POINT - F, \
255 .frac_lsb = 1ull << (DECOMPOSED_BINARY_POINT - F), \
256 .frac_lsbm1 = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1), \
257 .round_mask = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1, \
258 .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1
260 static const FloatFmt float16_params = {
261 FLOAT_PARAMS(5, 10)
264 static const FloatFmt float16_params_ahp = {
265 FLOAT_PARAMS(5, 10),
266 .arm_althp = true
269 static const FloatFmt float32_params = {
270 FLOAT_PARAMS(8, 23)
273 static const FloatFmt float64_params = {
274 FLOAT_PARAMS(11, 52)
277 /* Unpack a float to parts, but do not canonicalize. */
278 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
280 const int sign_pos = fmt.frac_size + fmt.exp_size;
282 return (FloatParts) {
283 .cls = float_class_unclassified,
284 .sign = extract64(raw, sign_pos, 1),
285 .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
286 .frac = extract64(raw, 0, fmt.frac_size),
290 static inline FloatParts float16_unpack_raw(float16 f)
292 return unpack_raw(float16_params, f);
295 static inline FloatParts float32_unpack_raw(float32 f)
297 return unpack_raw(float32_params, f);
300 static inline FloatParts float64_unpack_raw(float64 f)
302 return unpack_raw(float64_params, f);
305 /* Pack a float from parts, but do not canonicalize. */
306 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
308 const int sign_pos = fmt.frac_size + fmt.exp_size;
309 uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
310 return deposit64(ret, sign_pos, 1, p.sign);
313 static inline float16 float16_pack_raw(FloatParts p)
315 return make_float16(pack_raw(float16_params, p));
318 static inline float32 float32_pack_raw(FloatParts p)
320 return make_float32(pack_raw(float32_params, p));
323 static inline float64 float64_pack_raw(FloatParts p)
325 return make_float64(pack_raw(float64_params, p));
328 /*----------------------------------------------------------------------------
329 | Functions and definitions to determine: (1) whether tininess for underflow
330 | is detected before or after rounding by default, (2) what (if anything)
331 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
332 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
333 | are propagated from function inputs to output. These details are target-
334 | specific.
335 *----------------------------------------------------------------------------*/
336 #include "softfloat-specialize.h"
338 /* Canonicalize EXP and FRAC, setting CLS. */
339 static FloatParts canonicalize(FloatParts part, const FloatFmt *parm,
340 float_status *status)
342 if (part.exp == parm->exp_max && !parm->arm_althp) {
343 if (part.frac == 0) {
344 part.cls = float_class_inf;
345 } else {
346 part.frac <<= parm->frac_shift;
347 part.cls = (parts_is_snan_frac(part.frac, status)
348 ? float_class_snan : float_class_qnan);
350 } else if (part.exp == 0) {
351 if (likely(part.frac == 0)) {
352 part.cls = float_class_zero;
353 } else if (status->flush_inputs_to_zero) {
354 float_raise(float_flag_input_denormal, status);
355 part.cls = float_class_zero;
356 part.frac = 0;
357 } else {
358 int shift = clz64(part.frac) - 1;
359 part.cls = float_class_normal;
360 part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
361 part.frac <<= shift;
363 } else {
364 part.cls = float_class_normal;
365 part.exp -= parm->exp_bias;
366 part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
368 return part;
371 /* Round and uncanonicalize a floating-point number by parts. There
372 * are FRAC_SHIFT bits that may require rounding at the bottom of the
373 * fraction; these bits will be removed. The exponent will be biased
374 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
377 static FloatParts round_canonical(FloatParts p, float_status *s,
378 const FloatFmt *parm)
380 const uint64_t frac_lsbm1 = parm->frac_lsbm1;
381 const uint64_t round_mask = parm->round_mask;
382 const uint64_t roundeven_mask = parm->roundeven_mask;
383 const int exp_max = parm->exp_max;
384 const int frac_shift = parm->frac_shift;
385 uint64_t frac, inc;
386 int exp, flags = 0;
387 bool overflow_norm;
389 frac = p.frac;
390 exp = p.exp;
392 switch (p.cls) {
393 case float_class_normal:
394 switch (s->float_rounding_mode) {
395 case float_round_nearest_even:
396 overflow_norm = false;
397 inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
398 break;
399 case float_round_ties_away:
400 overflow_norm = false;
401 inc = frac_lsbm1;
402 break;
403 case float_round_to_zero:
404 overflow_norm = true;
405 inc = 0;
406 break;
407 case float_round_up:
408 inc = p.sign ? 0 : round_mask;
409 overflow_norm = p.sign;
410 break;
411 case float_round_down:
412 inc = p.sign ? round_mask : 0;
413 overflow_norm = !p.sign;
414 break;
415 default:
416 g_assert_not_reached();
419 exp += parm->exp_bias;
420 if (likely(exp > 0)) {
421 if (frac & round_mask) {
422 flags |= float_flag_inexact;
423 frac += inc;
424 if (frac & DECOMPOSED_OVERFLOW_BIT) {
425 frac >>= 1;
426 exp++;
429 frac >>= frac_shift;
431 if (parm->arm_althp) {
432 /* ARM Alt HP eschews Inf and NaN for a wider exponent. */
433 if (unlikely(exp > exp_max)) {
434 /* Overflow. Return the maximum normal. */
435 flags = float_flag_invalid;
436 exp = exp_max;
437 frac = -1;
439 } else if (unlikely(exp >= exp_max)) {
440 flags |= float_flag_overflow | float_flag_inexact;
441 if (overflow_norm) {
442 exp = exp_max - 1;
443 frac = -1;
444 } else {
445 p.cls = float_class_inf;
446 goto do_inf;
449 } else if (s->flush_to_zero) {
450 flags |= float_flag_output_denormal;
451 p.cls = float_class_zero;
452 goto do_zero;
453 } else {
454 bool is_tiny = (s->float_detect_tininess
455 == float_tininess_before_rounding)
456 || (exp < 0)
457 || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);
459 shift64RightJamming(frac, 1 - exp, &frac);
460 if (frac & round_mask) {
461 /* Need to recompute round-to-even. */
462 if (s->float_rounding_mode == float_round_nearest_even) {
463 inc = ((frac & roundeven_mask) != frac_lsbm1
464 ? frac_lsbm1 : 0);
466 flags |= float_flag_inexact;
467 frac += inc;
470 exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
471 frac >>= frac_shift;
473 if (is_tiny && (flags & float_flag_inexact)) {
474 flags |= float_flag_underflow;
476 if (exp == 0 && frac == 0) {
477 p.cls = float_class_zero;
480 break;
482 case float_class_zero:
483 do_zero:
484 exp = 0;
485 frac = 0;
486 break;
488 case float_class_inf:
489 do_inf:
490 assert(!parm->arm_althp);
491 exp = exp_max;
492 frac = 0;
493 break;
495 case float_class_qnan:
496 case float_class_snan:
497 assert(!parm->arm_althp);
498 exp = exp_max;
499 frac >>= parm->frac_shift;
500 break;
502 default:
503 g_assert_not_reached();
506 float_raise(flags, s);
507 p.exp = exp;
508 p.frac = frac;
509 return p;
512 /* Explicit FloatFmt version */
513 static FloatParts float16a_unpack_canonical(float16 f, float_status *s,
514 const FloatFmt *params)
516 return canonicalize(float16_unpack_raw(f), params, s);
519 static FloatParts float16_unpack_canonical(float16 f, float_status *s)
521 return float16a_unpack_canonical(f, s, &float16_params);
524 static float16 float16a_round_pack_canonical(FloatParts p, float_status *s,
525 const FloatFmt *params)
527 return float16_pack_raw(round_canonical(p, s, params));
530 static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
532 return float16a_round_pack_canonical(p, s, &float16_params);
535 static FloatParts float32_unpack_canonical(float32 f, float_status *s)
537 return canonicalize(float32_unpack_raw(f), &float32_params, s);
540 static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
542 return float32_pack_raw(round_canonical(p, s, &float32_params));
545 static FloatParts float64_unpack_canonical(float64 f, float_status *s)
547 return canonicalize(float64_unpack_raw(f), &float64_params, s);
550 static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
552 return float64_pack_raw(round_canonical(p, s, &float64_params));
555 static FloatParts return_nan(FloatParts a, float_status *s)
557 switch (a.cls) {
558 case float_class_snan:
559 s->float_exception_flags |= float_flag_invalid;
560 a = parts_silence_nan(a, s);
561 /* fall through */
562 case float_class_qnan:
563 if (s->default_nan_mode) {
564 return parts_default_nan(s);
566 break;
568 default:
569 g_assert_not_reached();
571 return a;
574 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
576 if (is_snan(a.cls) || is_snan(b.cls)) {
577 s->float_exception_flags |= float_flag_invalid;
580 if (s->default_nan_mode) {
581 return parts_default_nan(s);
582 } else {
583 if (pickNaN(a.cls, b.cls,
584 a.frac > b.frac ||
585 (a.frac == b.frac && a.sign < b.sign))) {
586 a = b;
588 if (is_snan(a.cls)) {
589 return parts_silence_nan(a, s);
592 return a;
595 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
596 bool inf_zero, float_status *s)
598 int which;
600 if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
601 s->float_exception_flags |= float_flag_invalid;
604 which = pickNaNMulAdd(a.cls, b.cls, c.cls, inf_zero, s);
606 if (s->default_nan_mode) {
607 /* Note that this check is after pickNaNMulAdd so that function
608 * has an opportunity to set the Invalid flag.
610 which = 3;
613 switch (which) {
614 case 0:
615 break;
616 case 1:
617 a = b;
618 break;
619 case 2:
620 a = c;
621 break;
622 case 3:
623 return parts_default_nan(s);
624 default:
625 g_assert_not_reached();
628 if (is_snan(a.cls)) {
629 return parts_silence_nan(a, s);
631 return a;
635 * Returns the result of adding or subtracting the values of the
636 * floating-point values `a' and `b'. The operation is performed
637 * according to the IEC/IEEE Standard for Binary Floating-Point
638 * Arithmetic.
641 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
642 float_status *s)
644 bool a_sign = a.sign;
645 bool b_sign = b.sign ^ subtract;
647 if (a_sign != b_sign) {
648 /* Subtraction */
650 if (a.cls == float_class_normal && b.cls == float_class_normal) {
651 if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
652 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
653 a.frac = a.frac - b.frac;
654 } else {
655 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
656 a.frac = b.frac - a.frac;
657 a.exp = b.exp;
658 a_sign ^= 1;
661 if (a.frac == 0) {
662 a.cls = float_class_zero;
663 a.sign = s->float_rounding_mode == float_round_down;
664 } else {
665 int shift = clz64(a.frac) - 1;
666 a.frac = a.frac << shift;
667 a.exp = a.exp - shift;
668 a.sign = a_sign;
670 return a;
672 if (is_nan(a.cls) || is_nan(b.cls)) {
673 return pick_nan(a, b, s);
675 if (a.cls == float_class_inf) {
676 if (b.cls == float_class_inf) {
677 float_raise(float_flag_invalid, s);
678 return parts_default_nan(s);
680 return a;
682 if (a.cls == float_class_zero && b.cls == float_class_zero) {
683 a.sign = s->float_rounding_mode == float_round_down;
684 return a;
686 if (a.cls == float_class_zero || b.cls == float_class_inf) {
687 b.sign = a_sign ^ 1;
688 return b;
690 if (b.cls == float_class_zero) {
691 return a;
693 } else {
694 /* Addition */
695 if (a.cls == float_class_normal && b.cls == float_class_normal) {
696 if (a.exp > b.exp) {
697 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
698 } else if (a.exp < b.exp) {
699 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
700 a.exp = b.exp;
702 a.frac += b.frac;
703 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
704 shift64RightJamming(a.frac, 1, &a.frac);
705 a.exp += 1;
707 return a;
709 if (is_nan(a.cls) || is_nan(b.cls)) {
710 return pick_nan(a, b, s);
712 if (a.cls == float_class_inf || b.cls == float_class_zero) {
713 return a;
715 if (b.cls == float_class_inf || a.cls == float_class_zero) {
716 b.sign = b_sign;
717 return b;
720 g_assert_not_reached();
724 * Returns the result of adding or subtracting the floating-point
725 * values `a' and `b'. The operation is performed according to the
726 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
729 float16 QEMU_FLATTEN float16_add(float16 a, float16 b, float_status *status)
731 FloatParts pa = float16_unpack_canonical(a, status);
732 FloatParts pb = float16_unpack_canonical(b, status);
733 FloatParts pr = addsub_floats(pa, pb, false, status);
735 return float16_round_pack_canonical(pr, status);
738 float32 QEMU_FLATTEN float32_add(float32 a, float32 b, float_status *status)
740 FloatParts pa = float32_unpack_canonical(a, status);
741 FloatParts pb = float32_unpack_canonical(b, status);
742 FloatParts pr = addsub_floats(pa, pb, false, status);
744 return float32_round_pack_canonical(pr, status);
747 float64 QEMU_FLATTEN float64_add(float64 a, float64 b, float_status *status)
749 FloatParts pa = float64_unpack_canonical(a, status);
750 FloatParts pb = float64_unpack_canonical(b, status);
751 FloatParts pr = addsub_floats(pa, pb, false, status);
753 return float64_round_pack_canonical(pr, status);
756 float16 QEMU_FLATTEN float16_sub(float16 a, float16 b, float_status *status)
758 FloatParts pa = float16_unpack_canonical(a, status);
759 FloatParts pb = float16_unpack_canonical(b, status);
760 FloatParts pr = addsub_floats(pa, pb, true, status);
762 return float16_round_pack_canonical(pr, status);
765 float32 QEMU_FLATTEN float32_sub(float32 a, float32 b, float_status *status)
767 FloatParts pa = float32_unpack_canonical(a, status);
768 FloatParts pb = float32_unpack_canonical(b, status);
769 FloatParts pr = addsub_floats(pa, pb, true, status);
771 return float32_round_pack_canonical(pr, status);
774 float64 QEMU_FLATTEN float64_sub(float64 a, float64 b, float_status *status)
776 FloatParts pa = float64_unpack_canonical(a, status);
777 FloatParts pb = float64_unpack_canonical(b, status);
778 FloatParts pr = addsub_floats(pa, pb, true, status);
780 return float64_round_pack_canonical(pr, status);
784 * Returns the result of multiplying the floating-point values `a' and
785 * `b'. The operation is performed according to the IEC/IEEE Standard
786 * for Binary Floating-Point Arithmetic.
789 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
791 bool sign = a.sign ^ b.sign;
793 if (a.cls == float_class_normal && b.cls == float_class_normal) {
794 uint64_t hi, lo;
795 int exp = a.exp + b.exp;
797 mul64To128(a.frac, b.frac, &hi, &lo);
798 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
799 if (lo & DECOMPOSED_OVERFLOW_BIT) {
800 shift64RightJamming(lo, 1, &lo);
801 exp += 1;
804 /* Re-use a */
805 a.exp = exp;
806 a.sign = sign;
807 a.frac = lo;
808 return a;
810 /* handle all the NaN cases */
811 if (is_nan(a.cls) || is_nan(b.cls)) {
812 return pick_nan(a, b, s);
814 /* Inf * Zero == NaN */
815 if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
816 (a.cls == float_class_zero && b.cls == float_class_inf)) {
817 s->float_exception_flags |= float_flag_invalid;
818 return parts_default_nan(s);
820 /* Multiply by 0 or Inf */
821 if (a.cls == float_class_inf || a.cls == float_class_zero) {
822 a.sign = sign;
823 return a;
825 if (b.cls == float_class_inf || b.cls == float_class_zero) {
826 b.sign = sign;
827 return b;
829 g_assert_not_reached();
832 float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status)
834 FloatParts pa = float16_unpack_canonical(a, status);
835 FloatParts pb = float16_unpack_canonical(b, status);
836 FloatParts pr = mul_floats(pa, pb, status);
838 return float16_round_pack_canonical(pr, status);
841 float32 QEMU_FLATTEN float32_mul(float32 a, float32 b, float_status *status)
843 FloatParts pa = float32_unpack_canonical(a, status);
844 FloatParts pb = float32_unpack_canonical(b, status);
845 FloatParts pr = mul_floats(pa, pb, status);
847 return float32_round_pack_canonical(pr, status);
850 float64 QEMU_FLATTEN float64_mul(float64 a, float64 b, float_status *status)
852 FloatParts pa = float64_unpack_canonical(a, status);
853 FloatParts pb = float64_unpack_canonical(b, status);
854 FloatParts pr = mul_floats(pa, pb, status);
856 return float64_round_pack_canonical(pr, status);
860 * Returns the result of multiplying the floating-point values `a' and
861 * `b' then adding 'c', with no intermediate rounding step after the
862 * multiplication. The operation is performed according to the
863 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
864 * The flags argument allows the caller to select negation of the
865 * addend, the intermediate product, or the final result. (The
866 * difference between this and having the caller do a separate
867 * negation is that negating externally will flip the sign bit on
868 * NaNs.)
871 static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
872 int flags, float_status *s)
874 bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
875 ((1 << float_class_inf) | (1 << float_class_zero));
876 bool p_sign;
877 bool sign_flip = flags & float_muladd_negate_result;
878 FloatClass p_class;
879 uint64_t hi, lo;
880 int p_exp;
882 /* It is implementation-defined whether the cases of (0,inf,qnan)
883 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
884 * they return if they do), so we have to hand this information
885 * off to the target-specific pick-a-NaN routine.
887 if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
888 return pick_nan_muladd(a, b, c, inf_zero, s);
891 if (inf_zero) {
892 s->float_exception_flags |= float_flag_invalid;
893 return parts_default_nan(s);
896 if (flags & float_muladd_negate_c) {
897 c.sign ^= 1;
900 p_sign = a.sign ^ b.sign;
902 if (flags & float_muladd_negate_product) {
903 p_sign ^= 1;
906 if (a.cls == float_class_inf || b.cls == float_class_inf) {
907 p_class = float_class_inf;
908 } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
909 p_class = float_class_zero;
910 } else {
911 p_class = float_class_normal;
914 if (c.cls == float_class_inf) {
915 if (p_class == float_class_inf && p_sign != c.sign) {
916 s->float_exception_flags |= float_flag_invalid;
917 return parts_default_nan(s);
918 } else {
919 a.cls = float_class_inf;
920 a.sign = c.sign ^ sign_flip;
921 return a;
925 if (p_class == float_class_inf) {
926 a.cls = float_class_inf;
927 a.sign = p_sign ^ sign_flip;
928 return a;
931 if (p_class == float_class_zero) {
932 if (c.cls == float_class_zero) {
933 if (p_sign != c.sign) {
934 p_sign = s->float_rounding_mode == float_round_down;
936 c.sign = p_sign;
937 } else if (flags & float_muladd_halve_result) {
938 c.exp -= 1;
940 c.sign ^= sign_flip;
941 return c;
944 /* a & b should be normals now... */
945 assert(a.cls == float_class_normal &&
946 b.cls == float_class_normal);
948 p_exp = a.exp + b.exp;
950 /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
951 * result.
953 mul64To128(a.frac, b.frac, &hi, &lo);
954 /* binary point now at bit 124 */
956 /* check for overflow */
957 if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
958 shift128RightJamming(hi, lo, 1, &hi, &lo);
959 p_exp += 1;
962 /* + add/sub */
963 if (c.cls == float_class_zero) {
964 /* move binary point back to 62 */
965 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
966 } else {
967 int exp_diff = p_exp - c.exp;
968 if (p_sign == c.sign) {
969 /* Addition */
970 if (exp_diff <= 0) {
971 shift128RightJamming(hi, lo,
972 DECOMPOSED_BINARY_POINT - exp_diff,
973 &hi, &lo);
974 lo += c.frac;
975 p_exp = c.exp;
976 } else {
977 uint64_t c_hi, c_lo;
978 /* shift c to the same binary point as the product (124) */
979 c_hi = c.frac >> 2;
980 c_lo = 0;
981 shift128RightJamming(c_hi, c_lo,
982 exp_diff,
983 &c_hi, &c_lo);
984 add128(hi, lo, c_hi, c_lo, &hi, &lo);
985 /* move binary point back to 62 */
986 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
989 if (lo & DECOMPOSED_OVERFLOW_BIT) {
990 shift64RightJamming(lo, 1, &lo);
991 p_exp += 1;
994 } else {
995 /* Subtraction */
996 uint64_t c_hi, c_lo;
997 /* make C binary point match product at bit 124 */
998 c_hi = c.frac >> 2;
999 c_lo = 0;
1001 if (exp_diff <= 0) {
1002 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
1003 if (exp_diff == 0
1005 (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
1006 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1007 } else {
1008 sub128(c_hi, c_lo, hi, lo, &hi, &lo);
1009 p_sign ^= 1;
1010 p_exp = c.exp;
1012 } else {
1013 shift128RightJamming(c_hi, c_lo,
1014 exp_diff,
1015 &c_hi, &c_lo);
1016 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1019 if (hi == 0 && lo == 0) {
1020 a.cls = float_class_zero;
1021 a.sign = s->float_rounding_mode == float_round_down;
1022 a.sign ^= sign_flip;
1023 return a;
1024 } else {
1025 int shift;
1026 if (hi != 0) {
1027 shift = clz64(hi);
1028 } else {
1029 shift = clz64(lo) + 64;
1031 /* Normalizing to a binary point of 124 is the
1032 correct adjust for the exponent. However since we're
1033 shifting, we might as well put the binary point back
1034 at 62 where we really want it. Therefore shift as
1035 if we're leaving 1 bit at the top of the word, but
1036 adjust the exponent as if we're leaving 3 bits. */
1037 shift -= 1;
1038 if (shift >= 64) {
1039 lo = lo << (shift - 64);
1040 } else {
1041 hi = (hi << shift) | (lo >> (64 - shift));
1042 lo = hi | ((lo << shift) != 0);
1044 p_exp -= shift - 2;
1049 if (flags & float_muladd_halve_result) {
1050 p_exp -= 1;
1053 /* finally prepare our result */
1054 a.cls = float_class_normal;
1055 a.sign = p_sign ^ sign_flip;
1056 a.exp = p_exp;
1057 a.frac = lo;
1059 return a;
1062 float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c,
1063 int flags, float_status *status)
1065 FloatParts pa = float16_unpack_canonical(a, status);
1066 FloatParts pb = float16_unpack_canonical(b, status);
1067 FloatParts pc = float16_unpack_canonical(c, status);
1068 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1070 return float16_round_pack_canonical(pr, status);
1073 float32 QEMU_FLATTEN float32_muladd(float32 a, float32 b, float32 c,
1074 int flags, float_status *status)
1076 FloatParts pa = float32_unpack_canonical(a, status);
1077 FloatParts pb = float32_unpack_canonical(b, status);
1078 FloatParts pc = float32_unpack_canonical(c, status);
1079 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1081 return float32_round_pack_canonical(pr, status);
1084 float64 QEMU_FLATTEN float64_muladd(float64 a, float64 b, float64 c,
1085 int flags, float_status *status)
1087 FloatParts pa = float64_unpack_canonical(a, status);
1088 FloatParts pb = float64_unpack_canonical(b, status);
1089 FloatParts pc = float64_unpack_canonical(c, status);
1090 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1092 return float64_round_pack_canonical(pr, status);
1096 * Returns the result of dividing the floating-point value `a' by the
1097 * corresponding value `b'. The operation is performed according to
1098 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1101 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
1103 bool sign = a.sign ^ b.sign;
1105 if (a.cls == float_class_normal && b.cls == float_class_normal) {
1106 uint64_t n0, n1, q, r;
1107 int exp = a.exp - b.exp;
1110 * We want a 2*N / N-bit division to produce exactly an N-bit
1111 * result, so that we do not lose any precision and so that we
1112 * do not have to renormalize afterward. If A.frac < B.frac,
1113 * then division would produce an (N-1)-bit result; shift A left
1114 * by one to produce the an N-bit result, and decrement the
1115 * exponent to match.
1117 * The udiv_qrnnd algorithm that we're using requires normalization,
1118 * i.e. the msb of the denominator must be set. Since we know that
1119 * DECOMPOSED_BINARY_POINT is msb-1, the inputs must be shifted left
1120 * by one (more), and the remainder must be shifted right by one.
1122 if (a.frac < b.frac) {
1123 exp -= 1;
1124 shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 2, &n1, &n0);
1125 } else {
1126 shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, &n1, &n0);
1128 q = udiv_qrnnd(&r, n1, n0, b.frac << 1);
1131 * Set lsb if there is a remainder, to set inexact.
1132 * As mentioned above, to find the actual value of the remainder we
1133 * would need to shift right, but (1) we are only concerned about
1134 * non-zero-ness, and (2) the remainder will always be even because
1135 * both inputs to the division primitive are even.
1137 a.frac = q | (r != 0);
1138 a.sign = sign;
1139 a.exp = exp;
1140 return a;
1142 /* handle all the NaN cases */
1143 if (is_nan(a.cls) || is_nan(b.cls)) {
1144 return pick_nan(a, b, s);
1146 /* 0/0 or Inf/Inf */
1147 if (a.cls == b.cls
1149 (a.cls == float_class_inf || a.cls == float_class_zero)) {
1150 s->float_exception_flags |= float_flag_invalid;
1151 return parts_default_nan(s);
1153 /* Inf / x or 0 / x */
1154 if (a.cls == float_class_inf || a.cls == float_class_zero) {
1155 a.sign = sign;
1156 return a;
1158 /* Div 0 => Inf */
1159 if (b.cls == float_class_zero) {
1160 s->float_exception_flags |= float_flag_divbyzero;
1161 a.cls = float_class_inf;
1162 a.sign = sign;
1163 return a;
1165 /* Div by Inf */
1166 if (b.cls == float_class_inf) {
1167 a.cls = float_class_zero;
1168 a.sign = sign;
1169 return a;
1171 g_assert_not_reached();
1174 float16 float16_div(float16 a, float16 b, float_status *status)
1176 FloatParts pa = float16_unpack_canonical(a, status);
1177 FloatParts pb = float16_unpack_canonical(b, status);
1178 FloatParts pr = div_floats(pa, pb, status);
1180 return float16_round_pack_canonical(pr, status);
1183 float32 float32_div(float32 a, float32 b, float_status *status)
1185 FloatParts pa = float32_unpack_canonical(a, status);
1186 FloatParts pb = float32_unpack_canonical(b, status);
1187 FloatParts pr = div_floats(pa, pb, status);
1189 return float32_round_pack_canonical(pr, status);
1192 float64 float64_div(float64 a, float64 b, float_status *status)
1194 FloatParts pa = float64_unpack_canonical(a, status);
1195 FloatParts pb = float64_unpack_canonical(b, status);
1196 FloatParts pr = div_floats(pa, pb, status);
1198 return float64_round_pack_canonical(pr, status);
1202 * Float to Float conversions
1204 * Returns the result of converting one float format to another. The
1205 * conversion is performed according to the IEC/IEEE Standard for
1206 * Binary Floating-Point Arithmetic.
1208 * The float_to_float helper only needs to take care of raising
1209 * invalid exceptions and handling the conversion on NaNs.
1212 static FloatParts float_to_float(FloatParts a, const FloatFmt *dstf,
1213 float_status *s)
1215 if (dstf->arm_althp) {
1216 switch (a.cls) {
1217 case float_class_qnan:
1218 case float_class_snan:
1219 /* There is no NaN in the destination format. Raise Invalid
1220 * and return a zero with the sign of the input NaN.
1222 s->float_exception_flags |= float_flag_invalid;
1223 a.cls = float_class_zero;
1224 a.frac = 0;
1225 a.exp = 0;
1226 break;
1228 case float_class_inf:
1229 /* There is no Inf in the destination format. Raise Invalid
1230 * and return the maximum normal with the correct sign.
1232 s->float_exception_flags |= float_flag_invalid;
1233 a.cls = float_class_normal;
1234 a.exp = dstf->exp_max;
1235 a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift;
1236 break;
1238 default:
1239 break;
1241 } else if (is_nan(a.cls)) {
1242 if (is_snan(a.cls)) {
1243 s->float_exception_flags |= float_flag_invalid;
1244 a = parts_silence_nan(a, s);
1246 if (s->default_nan_mode) {
1247 return parts_default_nan(s);
1250 return a;
1253 float32 float16_to_float32(float16 a, bool ieee, float_status *s)
1255 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1256 FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1257 FloatParts pr = float_to_float(p, &float32_params, s);
1258 return float32_round_pack_canonical(pr, s);
1261 float64 float16_to_float64(float16 a, bool ieee, float_status *s)
1263 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1264 FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1265 FloatParts pr = float_to_float(p, &float64_params, s);
1266 return float64_round_pack_canonical(pr, s);
1269 float16 float32_to_float16(float32 a, bool ieee, float_status *s)
1271 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1272 FloatParts p = float32_unpack_canonical(a, s);
1273 FloatParts pr = float_to_float(p, fmt16, s);
1274 return float16a_round_pack_canonical(pr, s, fmt16);
1277 float64 float32_to_float64(float32 a, float_status *s)
1279 FloatParts p = float32_unpack_canonical(a, s);
1280 FloatParts pr = float_to_float(p, &float64_params, s);
1281 return float64_round_pack_canonical(pr, s);
1284 float16 float64_to_float16(float64 a, bool ieee, float_status *s)
1286 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1287 FloatParts p = float64_unpack_canonical(a, s);
1288 FloatParts pr = float_to_float(p, fmt16, s);
1289 return float16a_round_pack_canonical(pr, s, fmt16);
1292 float32 float64_to_float32(float64 a, float_status *s)
1294 FloatParts p = float64_unpack_canonical(a, s);
1295 FloatParts pr = float_to_float(p, &float32_params, s);
1296 return float32_round_pack_canonical(pr, s);
1300 * Rounds the floating-point value `a' to an integer, and returns the
1301 * result as a floating-point value. The operation is performed
1302 * according to the IEC/IEEE Standard for Binary Floating-Point
1303 * Arithmetic.
1306 static FloatParts round_to_int(FloatParts a, int rmode,
1307 int scale, float_status *s)
1309 switch (a.cls) {
1310 case float_class_qnan:
1311 case float_class_snan:
1312 return return_nan(a, s);
1314 case float_class_zero:
1315 case float_class_inf:
1316 /* already "integral" */
1317 break;
1319 case float_class_normal:
1320 scale = MIN(MAX(scale, -0x10000), 0x10000);
1321 a.exp += scale;
1323 if (a.exp >= DECOMPOSED_BINARY_POINT) {
1324 /* already integral */
1325 break;
1327 if (a.exp < 0) {
1328 bool one;
1329 /* all fractional */
1330 s->float_exception_flags |= float_flag_inexact;
1331 switch (rmode) {
1332 case float_round_nearest_even:
1333 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
1334 break;
1335 case float_round_ties_away:
1336 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
1337 break;
1338 case float_round_to_zero:
1339 one = false;
1340 break;
1341 case float_round_up:
1342 one = !a.sign;
1343 break;
1344 case float_round_down:
1345 one = a.sign;
1346 break;
1347 default:
1348 g_assert_not_reached();
1351 if (one) {
1352 a.frac = DECOMPOSED_IMPLICIT_BIT;
1353 a.exp = 0;
1354 } else {
1355 a.cls = float_class_zero;
1357 } else {
1358 uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
1359 uint64_t frac_lsbm1 = frac_lsb >> 1;
1360 uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
1361 uint64_t rnd_mask = rnd_even_mask >> 1;
1362 uint64_t inc;
1364 switch (rmode) {
1365 case float_round_nearest_even:
1366 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
1367 break;
1368 case float_round_ties_away:
1369 inc = frac_lsbm1;
1370 break;
1371 case float_round_to_zero:
1372 inc = 0;
1373 break;
1374 case float_round_up:
1375 inc = a.sign ? 0 : rnd_mask;
1376 break;
1377 case float_round_down:
1378 inc = a.sign ? rnd_mask : 0;
1379 break;
1380 default:
1381 g_assert_not_reached();
1384 if (a.frac & rnd_mask) {
1385 s->float_exception_flags |= float_flag_inexact;
1386 a.frac += inc;
1387 a.frac &= ~rnd_mask;
1388 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
1389 a.frac >>= 1;
1390 a.exp++;
1394 break;
1395 default:
1396 g_assert_not_reached();
1398 return a;
1401 float16 float16_round_to_int(float16 a, float_status *s)
1403 FloatParts pa = float16_unpack_canonical(a, s);
1404 FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
1405 return float16_round_pack_canonical(pr, s);
1408 float32 float32_round_to_int(float32 a, float_status *s)
1410 FloatParts pa = float32_unpack_canonical(a, s);
1411 FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
1412 return float32_round_pack_canonical(pr, s);
1415 float64 float64_round_to_int(float64 a, float_status *s)
1417 FloatParts pa = float64_unpack_canonical(a, s);
1418 FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
1419 return float64_round_pack_canonical(pr, s);
1423 * Returns the result of converting the floating-point value `a' to
1424 * the two's complement integer format. The conversion is performed
1425 * according to the IEC/IEEE Standard for Binary Floating-Point
1426 * Arithmetic---which means in particular that the conversion is
1427 * rounded according to the current rounding mode. If `a' is a NaN,
1428 * the largest positive integer is returned. Otherwise, if the
1429 * conversion overflows, the largest integer with the same sign as `a'
1430 * is returned.
1433 static int64_t round_to_int_and_pack(FloatParts in, int rmode, int scale,
1434 int64_t min, int64_t max,
1435 float_status *s)
1437 uint64_t r;
1438 int orig_flags = get_float_exception_flags(s);
1439 FloatParts p = round_to_int(in, rmode, scale, s);
1441 switch (p.cls) {
1442 case float_class_snan:
1443 case float_class_qnan:
1444 s->float_exception_flags = orig_flags | float_flag_invalid;
1445 return max;
1446 case float_class_inf:
1447 s->float_exception_flags = orig_flags | float_flag_invalid;
1448 return p.sign ? min : max;
1449 case float_class_zero:
1450 return 0;
1451 case float_class_normal:
1452 if (p.exp < DECOMPOSED_BINARY_POINT) {
1453 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1454 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1455 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1456 } else {
1457 r = UINT64_MAX;
1459 if (p.sign) {
1460 if (r <= -(uint64_t) min) {
1461 return -r;
1462 } else {
1463 s->float_exception_flags = orig_flags | float_flag_invalid;
1464 return min;
1466 } else {
1467 if (r <= max) {
1468 return r;
1469 } else {
1470 s->float_exception_flags = orig_flags | float_flag_invalid;
1471 return max;
1474 default:
1475 g_assert_not_reached();
1479 int16_t float16_to_int16_scalbn(float16 a, int rmode, int scale,
1480 float_status *s)
1482 return round_to_int_and_pack(float16_unpack_canonical(a, s),
1483 rmode, scale, INT16_MIN, INT16_MAX, s);
1486 int32_t float16_to_int32_scalbn(float16 a, int rmode, int scale,
1487 float_status *s)
1489 return round_to_int_and_pack(float16_unpack_canonical(a, s),
1490 rmode, scale, INT32_MIN, INT32_MAX, s);
1493 int64_t float16_to_int64_scalbn(float16 a, int rmode, int scale,
1494 float_status *s)
1496 return round_to_int_and_pack(float16_unpack_canonical(a, s),
1497 rmode, scale, INT64_MIN, INT64_MAX, s);
1500 int16_t float32_to_int16_scalbn(float32 a, int rmode, int scale,
1501 float_status *s)
1503 return round_to_int_and_pack(float32_unpack_canonical(a, s),
1504 rmode, scale, INT16_MIN, INT16_MAX, s);
1507 int32_t float32_to_int32_scalbn(float32 a, int rmode, int scale,
1508 float_status *s)
1510 return round_to_int_and_pack(float32_unpack_canonical(a, s),
1511 rmode, scale, INT32_MIN, INT32_MAX, s);
1514 int64_t float32_to_int64_scalbn(float32 a, int rmode, int scale,
1515 float_status *s)
1517 return round_to_int_and_pack(float32_unpack_canonical(a, s),
1518 rmode, scale, INT64_MIN, INT64_MAX, s);
1521 int16_t float64_to_int16_scalbn(float64 a, int rmode, int scale,
1522 float_status *s)
1524 return round_to_int_and_pack(float64_unpack_canonical(a, s),
1525 rmode, scale, INT16_MIN, INT16_MAX, s);
1528 int32_t float64_to_int32_scalbn(float64 a, int rmode, int scale,
1529 float_status *s)
1531 return round_to_int_and_pack(float64_unpack_canonical(a, s),
1532 rmode, scale, INT32_MIN, INT32_MAX, s);
1535 int64_t float64_to_int64_scalbn(float64 a, int rmode, int scale,
1536 float_status *s)
1538 return round_to_int_and_pack(float64_unpack_canonical(a, s),
1539 rmode, scale, INT64_MIN, INT64_MAX, s);
1542 int16_t float16_to_int16(float16 a, float_status *s)
1544 return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
1547 int32_t float16_to_int32(float16 a, float_status *s)
1549 return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
1552 int64_t float16_to_int64(float16 a, float_status *s)
1554 return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
1557 int16_t float32_to_int16(float32 a, float_status *s)
1559 return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
1562 int32_t float32_to_int32(float32 a, float_status *s)
1564 return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
1567 int64_t float32_to_int64(float32 a, float_status *s)
1569 return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
1572 int16_t float64_to_int16(float64 a, float_status *s)
1574 return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
1577 int32_t float64_to_int32(float64 a, float_status *s)
1579 return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
1582 int64_t float64_to_int64(float64 a, float_status *s)
1584 return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
1587 int16_t float16_to_int16_round_to_zero(float16 a, float_status *s)
1589 return float16_to_int16_scalbn(a, float_round_to_zero, 0, s);
1592 int32_t float16_to_int32_round_to_zero(float16 a, float_status *s)
1594 return float16_to_int32_scalbn(a, float_round_to_zero, 0, s);
1597 int64_t float16_to_int64_round_to_zero(float16 a, float_status *s)
1599 return float16_to_int64_scalbn(a, float_round_to_zero, 0, s);
1602 int16_t float32_to_int16_round_to_zero(float32 a, float_status *s)
1604 return float32_to_int16_scalbn(a, float_round_to_zero, 0, s);
1607 int32_t float32_to_int32_round_to_zero(float32 a, float_status *s)
1609 return float32_to_int32_scalbn(a, float_round_to_zero, 0, s);
1612 int64_t float32_to_int64_round_to_zero(float32 a, float_status *s)
1614 return float32_to_int64_scalbn(a, float_round_to_zero, 0, s);
1617 int16_t float64_to_int16_round_to_zero(float64 a, float_status *s)
1619 return float64_to_int16_scalbn(a, float_round_to_zero, 0, s);
1622 int32_t float64_to_int32_round_to_zero(float64 a, float_status *s)
1624 return float64_to_int32_scalbn(a, float_round_to_zero, 0, s);
1627 int64_t float64_to_int64_round_to_zero(float64 a, float_status *s)
1629 return float64_to_int64_scalbn(a, float_round_to_zero, 0, s);
1633 * Returns the result of converting the floating-point value `a' to
1634 * the unsigned integer format. The conversion is performed according
1635 * to the IEC/IEEE Standard for Binary Floating-Point
1636 * Arithmetic---which means in particular that the conversion is
1637 * rounded according to the current rounding mode. If `a' is a NaN,
1638 * the largest unsigned integer is returned. Otherwise, if the
1639 * conversion overflows, the largest unsigned integer is returned. If
1640 * the 'a' is negative, the result is rounded and zero is returned;
1641 * values that do not round to zero will raise the inexact exception
1642 * flag.
1645 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, int scale,
1646 uint64_t max, float_status *s)
1648 int orig_flags = get_float_exception_flags(s);
1649 FloatParts p = round_to_int(in, rmode, scale, s);
1650 uint64_t r;
1652 switch (p.cls) {
1653 case float_class_snan:
1654 case float_class_qnan:
1655 s->float_exception_flags = orig_flags | float_flag_invalid;
1656 return max;
1657 case float_class_inf:
1658 s->float_exception_flags = orig_flags | float_flag_invalid;
1659 return p.sign ? 0 : max;
1660 case float_class_zero:
1661 return 0;
1662 case float_class_normal:
1663 if (p.sign) {
1664 s->float_exception_flags = orig_flags | float_flag_invalid;
1665 return 0;
1668 if (p.exp < DECOMPOSED_BINARY_POINT) {
1669 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1670 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1671 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1672 } else {
1673 s->float_exception_flags = orig_flags | float_flag_invalid;
1674 return max;
1677 /* For uint64 this will never trip, but if p.exp is too large
1678 * to shift a decomposed fraction we shall have exited via the
1679 * 3rd leg above.
1681 if (r > max) {
1682 s->float_exception_flags = orig_flags | float_flag_invalid;
1683 return max;
1685 return r;
1686 default:
1687 g_assert_not_reached();
1691 uint16_t float16_to_uint16_scalbn(float16 a, int rmode, int scale,
1692 float_status *s)
1694 return round_to_uint_and_pack(float16_unpack_canonical(a, s),
1695 rmode, scale, UINT16_MAX, s);
1698 uint32_t float16_to_uint32_scalbn(float16 a, int rmode, int scale,
1699 float_status *s)
1701 return round_to_uint_and_pack(float16_unpack_canonical(a, s),
1702 rmode, scale, UINT32_MAX, s);
1705 uint64_t float16_to_uint64_scalbn(float16 a, int rmode, int scale,
1706 float_status *s)
1708 return round_to_uint_and_pack(float16_unpack_canonical(a, s),
1709 rmode, scale, UINT64_MAX, s);
1712 uint16_t float32_to_uint16_scalbn(float32 a, int rmode, int scale,
1713 float_status *s)
1715 return round_to_uint_and_pack(float32_unpack_canonical(a, s),
1716 rmode, scale, UINT16_MAX, s);
1719 uint32_t float32_to_uint32_scalbn(float32 a, int rmode, int scale,
1720 float_status *s)
1722 return round_to_uint_and_pack(float32_unpack_canonical(a, s),
1723 rmode, scale, UINT32_MAX, s);
1726 uint64_t float32_to_uint64_scalbn(float32 a, int rmode, int scale,
1727 float_status *s)
1729 return round_to_uint_and_pack(float32_unpack_canonical(a, s),
1730 rmode, scale, UINT64_MAX, s);
1733 uint16_t float64_to_uint16_scalbn(float64 a, int rmode, int scale,
1734 float_status *s)
1736 return round_to_uint_and_pack(float64_unpack_canonical(a, s),
1737 rmode, scale, UINT16_MAX, s);
1740 uint32_t float64_to_uint32_scalbn(float64 a, int rmode, int scale,
1741 float_status *s)
1743 return round_to_uint_and_pack(float64_unpack_canonical(a, s),
1744 rmode, scale, UINT32_MAX, s);
1747 uint64_t float64_to_uint64_scalbn(float64 a, int rmode, int scale,
1748 float_status *s)
1750 return round_to_uint_and_pack(float64_unpack_canonical(a, s),
1751 rmode, scale, UINT64_MAX, s);
1754 uint16_t float16_to_uint16(float16 a, float_status *s)
1756 return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
1759 uint32_t float16_to_uint32(float16 a, float_status *s)
1761 return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
1764 uint64_t float16_to_uint64(float16 a, float_status *s)
1766 return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
1769 uint16_t float32_to_uint16(float32 a, float_status *s)
1771 return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
1774 uint32_t float32_to_uint32(float32 a, float_status *s)
1776 return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
1779 uint64_t float32_to_uint64(float32 a, float_status *s)
1781 return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
1784 uint16_t float64_to_uint16(float64 a, float_status *s)
1786 return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
1789 uint32_t float64_to_uint32(float64 a, float_status *s)
1791 return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
1794 uint64_t float64_to_uint64(float64 a, float_status *s)
1796 return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
1799 uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s)
1801 return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
1804 uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s)
1806 return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
1809 uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s)
1811 return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
1814 uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s)
1816 return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s);
1819 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s)
1821 return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s);
1824 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s)
1826 return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s);
1829 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s)
1831 return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s);
1834 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s)
1836 return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s);
1839 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s)
1841 return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s);
1845 * Integer to float conversions
1847 * Returns the result of converting the two's complement integer `a'
1848 * to the floating-point format. The conversion is performed according
1849 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1852 static FloatParts int_to_float(int64_t a, int scale, float_status *status)
1854 FloatParts r = { .sign = false };
1856 if (a == 0) {
1857 r.cls = float_class_zero;
1858 } else {
1859 uint64_t f = a;
1860 int shift;
1862 r.cls = float_class_normal;
1863 if (a < 0) {
1864 f = -f;
1865 r.sign = true;
1867 shift = clz64(f) - 1;
1868 scale = MIN(MAX(scale, -0x10000), 0x10000);
1870 r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
1871 r.frac = (shift < 0 ? DECOMPOSED_IMPLICIT_BIT : f << shift);
1874 return r;
1877 float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status)
1879 FloatParts pa = int_to_float(a, scale, status);
1880 return float16_round_pack_canonical(pa, status);
1883 float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status)
1885 return int64_to_float16_scalbn(a, scale, status);
1888 float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status)
1890 return int64_to_float16_scalbn(a, scale, status);
1893 float16 int64_to_float16(int64_t a, float_status *status)
1895 return int64_to_float16_scalbn(a, 0, status);
1898 float16 int32_to_float16(int32_t a, float_status *status)
1900 return int64_to_float16_scalbn(a, 0, status);
1903 float16 int16_to_float16(int16_t a, float_status *status)
1905 return int64_to_float16_scalbn(a, 0, status);
1908 float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status)
1910 FloatParts pa = int_to_float(a, scale, status);
1911 return float32_round_pack_canonical(pa, status);
1914 float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status)
1916 return int64_to_float32_scalbn(a, scale, status);
1919 float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status)
1921 return int64_to_float32_scalbn(a, scale, status);
1924 float32 int64_to_float32(int64_t a, float_status *status)
1926 return int64_to_float32_scalbn(a, 0, status);
1929 float32 int32_to_float32(int32_t a, float_status *status)
1931 return int64_to_float32_scalbn(a, 0, status);
1934 float32 int16_to_float32(int16_t a, float_status *status)
1936 return int64_to_float32_scalbn(a, 0, status);
1939 float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status)
1941 FloatParts pa = int_to_float(a, scale, status);
1942 return float64_round_pack_canonical(pa, status);
1945 float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status)
1947 return int64_to_float64_scalbn(a, scale, status);
1950 float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status)
1952 return int64_to_float64_scalbn(a, scale, status);
1955 float64 int64_to_float64(int64_t a, float_status *status)
1957 return int64_to_float64_scalbn(a, 0, status);
1960 float64 int32_to_float64(int32_t a, float_status *status)
1962 return int64_to_float64_scalbn(a, 0, status);
1965 float64 int16_to_float64(int16_t a, float_status *status)
1967 return int64_to_float64_scalbn(a, 0, status);
1972 * Unsigned Integer to float conversions
1974 * Returns the result of converting the unsigned integer `a' to the
1975 * floating-point format. The conversion is performed according to the
1976 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1979 static FloatParts uint_to_float(uint64_t a, int scale, float_status *status)
1981 FloatParts r = { .sign = false };
1983 if (a == 0) {
1984 r.cls = float_class_zero;
1985 } else {
1986 scale = MIN(MAX(scale, -0x10000), 0x10000);
1987 r.cls = float_class_normal;
1988 if ((int64_t)a < 0) {
1989 r.exp = DECOMPOSED_BINARY_POINT + 1 + scale;
1990 shift64RightJamming(a, 1, &a);
1991 r.frac = a;
1992 } else {
1993 int shift = clz64(a) - 1;
1994 r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
1995 r.frac = a << shift;
1999 return r;
2002 float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status)
2004 FloatParts pa = uint_to_float(a, scale, status);
2005 return float16_round_pack_canonical(pa, status);
2008 float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status)
2010 return uint64_to_float16_scalbn(a, scale, status);
2013 float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status)
2015 return uint64_to_float16_scalbn(a, scale, status);
2018 float16 uint64_to_float16(uint64_t a, float_status *status)
2020 return uint64_to_float16_scalbn(a, 0, status);
2023 float16 uint32_to_float16(uint32_t a, float_status *status)
2025 return uint64_to_float16_scalbn(a, 0, status);
2028 float16 uint16_to_float16(uint16_t a, float_status *status)
2030 return uint64_to_float16_scalbn(a, 0, status);
2033 float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status)
2035 FloatParts pa = uint_to_float(a, scale, status);
2036 return float32_round_pack_canonical(pa, status);
2039 float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status)
2041 return uint64_to_float32_scalbn(a, scale, status);
2044 float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status)
2046 return uint64_to_float32_scalbn(a, scale, status);
2049 float32 uint64_to_float32(uint64_t a, float_status *status)
2051 return uint64_to_float32_scalbn(a, 0, status);
2054 float32 uint32_to_float32(uint32_t a, float_status *status)
2056 return uint64_to_float32_scalbn(a, 0, status);
2059 float32 uint16_to_float32(uint16_t a, float_status *status)
2061 return uint64_to_float32_scalbn(a, 0, status);
2064 float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status)
2066 FloatParts pa = uint_to_float(a, scale, status);
2067 return float64_round_pack_canonical(pa, status);
2070 float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status)
2072 return uint64_to_float64_scalbn(a, scale, status);
2075 float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status)
2077 return uint64_to_float64_scalbn(a, scale, status);
2080 float64 uint64_to_float64(uint64_t a, float_status *status)
2082 return uint64_to_float64_scalbn(a, 0, status);
2085 float64 uint32_to_float64(uint32_t a, float_status *status)
2087 return uint64_to_float64_scalbn(a, 0, status);
2090 float64 uint16_to_float64(uint16_t a, float_status *status)
2092 return uint64_to_float64_scalbn(a, 0, status);
2095 /* Float Min/Max */
2096 /* min() and max() functions. These can't be implemented as
2097 * 'compare and pick one input' because that would mishandle
2098 * NaNs and +0 vs -0.
2100 * minnum() and maxnum() functions. These are similar to the min()
2101 * and max() functions but if one of the arguments is a QNaN and
2102 * the other is numerical then the numerical argument is returned.
2103 * SNaNs will get quietened before being returned.
2104 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
2105 * and maxNum() operations. min() and max() are the typical min/max
2106 * semantics provided by many CPUs which predate that specification.
2108 * minnummag() and maxnummag() functions correspond to minNumMag()
2109 * and minNumMag() from the IEEE-754 2008.
2111 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin,
2112 bool ieee, bool ismag, float_status *s)
2114 if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
2115 if (ieee) {
2116 /* Takes two floating-point values `a' and `b', one of
2117 * which is a NaN, and returns the appropriate NaN
2118 * result. If either `a' or `b' is a signaling NaN,
2119 * the invalid exception is raised.
2121 if (is_snan(a.cls) || is_snan(b.cls)) {
2122 return pick_nan(a, b, s);
2123 } else if (is_nan(a.cls) && !is_nan(b.cls)) {
2124 return b;
2125 } else if (is_nan(b.cls) && !is_nan(a.cls)) {
2126 return a;
2129 return pick_nan(a, b, s);
2130 } else {
2131 int a_exp, b_exp;
2133 switch (a.cls) {
2134 case float_class_normal:
2135 a_exp = a.exp;
2136 break;
2137 case float_class_inf:
2138 a_exp = INT_MAX;
2139 break;
2140 case float_class_zero:
2141 a_exp = INT_MIN;
2142 break;
2143 default:
2144 g_assert_not_reached();
2145 break;
2147 switch (b.cls) {
2148 case float_class_normal:
2149 b_exp = b.exp;
2150 break;
2151 case float_class_inf:
2152 b_exp = INT_MAX;
2153 break;
2154 case float_class_zero:
2155 b_exp = INT_MIN;
2156 break;
2157 default:
2158 g_assert_not_reached();
2159 break;
2162 if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
2163 bool a_less = a_exp < b_exp;
2164 if (a_exp == b_exp) {
2165 a_less = a.frac < b.frac;
2167 return a_less ^ ismin ? b : a;
2170 if (a.sign == b.sign) {
2171 bool a_less = a_exp < b_exp;
2172 if (a_exp == b_exp) {
2173 a_less = a.frac < b.frac;
2175 return a.sign ^ a_less ^ ismin ? b : a;
2176 } else {
2177 return a.sign ^ ismin ? b : a;
2182 #define MINMAX(sz, name, ismin, isiee, ismag) \
2183 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \
2184 float_status *s) \
2186 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
2187 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
2188 FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \
2190 return float ## sz ## _round_pack_canonical(pr, s); \
2193 MINMAX(16, min, true, false, false)
2194 MINMAX(16, minnum, true, true, false)
2195 MINMAX(16, minnummag, true, true, true)
2196 MINMAX(16, max, false, false, false)
2197 MINMAX(16, maxnum, false, true, false)
2198 MINMAX(16, maxnummag, false, true, true)
2200 MINMAX(32, min, true, false, false)
2201 MINMAX(32, minnum, true, true, false)
2202 MINMAX(32, minnummag, true, true, true)
2203 MINMAX(32, max, false, false, false)
2204 MINMAX(32, maxnum, false, true, false)
2205 MINMAX(32, maxnummag, false, true, true)
2207 MINMAX(64, min, true, false, false)
2208 MINMAX(64, minnum, true, true, false)
2209 MINMAX(64, minnummag, true, true, true)
2210 MINMAX(64, max, false, false, false)
2211 MINMAX(64, maxnum, false, true, false)
2212 MINMAX(64, maxnummag, false, true, true)
2214 #undef MINMAX
2216 /* Floating point compare */
2217 static int compare_floats(FloatParts a, FloatParts b, bool is_quiet,
2218 float_status *s)
2220 if (is_nan(a.cls) || is_nan(b.cls)) {
2221 if (!is_quiet ||
2222 a.cls == float_class_snan ||
2223 b.cls == float_class_snan) {
2224 s->float_exception_flags |= float_flag_invalid;
2226 return float_relation_unordered;
2229 if (a.cls == float_class_zero) {
2230 if (b.cls == float_class_zero) {
2231 return float_relation_equal;
2233 return b.sign ? float_relation_greater : float_relation_less;
2234 } else if (b.cls == float_class_zero) {
2235 return a.sign ? float_relation_less : float_relation_greater;
2238 /* The only really important thing about infinity is its sign. If
2239 * both are infinities the sign marks the smallest of the two.
2241 if (a.cls == float_class_inf) {
2242 if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
2243 return float_relation_equal;
2245 return a.sign ? float_relation_less : float_relation_greater;
2246 } else if (b.cls == float_class_inf) {
2247 return b.sign ? float_relation_greater : float_relation_less;
2250 if (a.sign != b.sign) {
2251 return a.sign ? float_relation_less : float_relation_greater;
2254 if (a.exp == b.exp) {
2255 if (a.frac == b.frac) {
2256 return float_relation_equal;
2258 if (a.sign) {
2259 return a.frac > b.frac ?
2260 float_relation_less : float_relation_greater;
2261 } else {
2262 return a.frac > b.frac ?
2263 float_relation_greater : float_relation_less;
2265 } else {
2266 if (a.sign) {
2267 return a.exp > b.exp ? float_relation_less : float_relation_greater;
2268 } else {
2269 return a.exp > b.exp ? float_relation_greater : float_relation_less;
2274 #define COMPARE(sz) \
2275 int float ## sz ## _compare(float ## sz a, float ## sz b, \
2276 float_status *s) \
2278 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
2279 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
2280 return compare_floats(pa, pb, false, s); \
2282 int float ## sz ## _compare_quiet(float ## sz a, float ## sz b, \
2283 float_status *s) \
2285 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
2286 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
2287 return compare_floats(pa, pb, true, s); \
2290 COMPARE(16)
2291 COMPARE(32)
2292 COMPARE(64)
2294 #undef COMPARE
2296 /* Multiply A by 2 raised to the power N. */
2297 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s)
2299 if (unlikely(is_nan(a.cls))) {
2300 return return_nan(a, s);
2302 if (a.cls == float_class_normal) {
2303 /* The largest float type (even though not supported by FloatParts)
2304 * is float128, which has a 15 bit exponent. Bounding N to 16 bits
2305 * still allows rounding to infinity, without allowing overflow
2306 * within the int32_t that backs FloatParts.exp.
2308 n = MIN(MAX(n, -0x10000), 0x10000);
2309 a.exp += n;
2311 return a;
2314 float16 float16_scalbn(float16 a, int n, float_status *status)
2316 FloatParts pa = float16_unpack_canonical(a, status);
2317 FloatParts pr = scalbn_decomposed(pa, n, status);
2318 return float16_round_pack_canonical(pr, status);
2321 float32 float32_scalbn(float32 a, int n, float_status *status)
2323 FloatParts pa = float32_unpack_canonical(a, status);
2324 FloatParts pr = scalbn_decomposed(pa, n, status);
2325 return float32_round_pack_canonical(pr, status);
2328 float64 float64_scalbn(float64 a, int n, float_status *status)
2330 FloatParts pa = float64_unpack_canonical(a, status);
2331 FloatParts pr = scalbn_decomposed(pa, n, status);
2332 return float64_round_pack_canonical(pr, status);
2336 * Square Root
2338 * The old softfloat code did an approximation step before zeroing in
2339 * on the final result. However for simpleness we just compute the
2340 * square root by iterating down from the implicit bit to enough extra
2341 * bits to ensure we get a correctly rounded result.
2343 * This does mean however the calculation is slower than before,
2344 * especially for 64 bit floats.
2347 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p)
2349 uint64_t a_frac, r_frac, s_frac;
2350 int bit, last_bit;
2352 if (is_nan(a.cls)) {
2353 return return_nan(a, s);
2355 if (a.cls == float_class_zero) {
2356 return a; /* sqrt(+-0) = +-0 */
2358 if (a.sign) {
2359 s->float_exception_flags |= float_flag_invalid;
2360 return parts_default_nan(s);
2362 if (a.cls == float_class_inf) {
2363 return a; /* sqrt(+inf) = +inf */
2366 assert(a.cls == float_class_normal);
2368 /* We need two overflow bits at the top. Adding room for that is a
2369 * right shift. If the exponent is odd, we can discard the low bit
2370 * by multiplying the fraction by 2; that's a left shift. Combine
2371 * those and we shift right if the exponent is even.
2373 a_frac = a.frac;
2374 if (!(a.exp & 1)) {
2375 a_frac >>= 1;
2377 a.exp >>= 1;
2379 /* Bit-by-bit computation of sqrt. */
2380 r_frac = 0;
2381 s_frac = 0;
2383 /* Iterate from implicit bit down to the 3 extra bits to compute a
2384 * properly rounded result. Remember we've inserted one more bit
2385 * at the top, so these positions are one less.
2387 bit = DECOMPOSED_BINARY_POINT - 1;
2388 last_bit = MAX(p->frac_shift - 4, 0);
2389 do {
2390 uint64_t q = 1ULL << bit;
2391 uint64_t t_frac = s_frac + q;
2392 if (t_frac <= a_frac) {
2393 s_frac = t_frac + q;
2394 a_frac -= t_frac;
2395 r_frac += q;
2397 a_frac <<= 1;
2398 } while (--bit >= last_bit);
2400 /* Undo the right shift done above. If there is any remaining
2401 * fraction, the result is inexact. Set the sticky bit.
2403 a.frac = (r_frac << 1) + (a_frac != 0);
2405 return a;
2408 float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status)
2410 FloatParts pa = float16_unpack_canonical(a, status);
2411 FloatParts pr = sqrt_float(pa, status, &float16_params);
2412 return float16_round_pack_canonical(pr, status);
2415 float32 QEMU_FLATTEN float32_sqrt(float32 a, float_status *status)
2417 FloatParts pa = float32_unpack_canonical(a, status);
2418 FloatParts pr = sqrt_float(pa, status, &float32_params);
2419 return float32_round_pack_canonical(pr, status);
2422 float64 QEMU_FLATTEN float64_sqrt(float64 a, float_status *status)
2424 FloatParts pa = float64_unpack_canonical(a, status);
2425 FloatParts pr = sqrt_float(pa, status, &float64_params);
2426 return float64_round_pack_canonical(pr, status);
2429 /*----------------------------------------------------------------------------
2430 | The pattern for a default generated NaN.
2431 *----------------------------------------------------------------------------*/
2433 float16 float16_default_nan(float_status *status)
2435 FloatParts p = parts_default_nan(status);
2436 p.frac >>= float16_params.frac_shift;
2437 return float16_pack_raw(p);
2440 float32 float32_default_nan(float_status *status)
2442 FloatParts p = parts_default_nan(status);
2443 p.frac >>= float32_params.frac_shift;
2444 return float32_pack_raw(p);
2447 float64 float64_default_nan(float_status *status)
2449 FloatParts p = parts_default_nan(status);
2450 p.frac >>= float64_params.frac_shift;
2451 return float64_pack_raw(p);
2454 float128 float128_default_nan(float_status *status)
2456 FloatParts p = parts_default_nan(status);
2457 float128 r;
2459 /* Extrapolate from the choices made by parts_default_nan to fill
2460 * in the quad-floating format. If the low bit is set, assume we
2461 * want to set all non-snan bits.
2463 r.low = -(p.frac & 1);
2464 r.high = p.frac >> (DECOMPOSED_BINARY_POINT - 48);
2465 r.high |= LIT64(0x7FFF000000000000);
2466 r.high |= (uint64_t)p.sign << 63;
2468 return r;
2471 /*----------------------------------------------------------------------------
2472 | Returns a quiet NaN from a signalling NaN for the floating point value `a'.
2473 *----------------------------------------------------------------------------*/
2475 float16 float16_silence_nan(float16 a, float_status *status)
2477 FloatParts p = float16_unpack_raw(a);
2478 p.frac <<= float16_params.frac_shift;
2479 p = parts_silence_nan(p, status);
2480 p.frac >>= float16_params.frac_shift;
2481 return float16_pack_raw(p);
2484 float32 float32_silence_nan(float32 a, float_status *status)
2486 FloatParts p = float32_unpack_raw(a);
2487 p.frac <<= float32_params.frac_shift;
2488 p = parts_silence_nan(p, status);
2489 p.frac >>= float32_params.frac_shift;
2490 return float32_pack_raw(p);
2493 float64 float64_silence_nan(float64 a, float_status *status)
2495 FloatParts p = float64_unpack_raw(a);
2496 p.frac <<= float64_params.frac_shift;
2497 p = parts_silence_nan(p, status);
2498 p.frac >>= float64_params.frac_shift;
2499 return float64_pack_raw(p);
2502 /*----------------------------------------------------------------------------
2503 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
2504 | and 7, and returns the properly rounded 32-bit integer corresponding to the
2505 | input. If `zSign' is 1, the input is negated before being converted to an
2506 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
2507 | is simply rounded to an integer, with the inexact exception raised if the
2508 | input cannot be represented exactly as an integer. However, if the fixed-
2509 | point input is too large, the invalid exception is raised and the largest
2510 | positive or negative integer is returned.
2511 *----------------------------------------------------------------------------*/
2513 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
2515 int8_t roundingMode;
2516 flag roundNearestEven;
2517 int8_t roundIncrement, roundBits;
2518 int32_t z;
2520 roundingMode = status->float_rounding_mode;
2521 roundNearestEven = ( roundingMode == float_round_nearest_even );
2522 switch (roundingMode) {
2523 case float_round_nearest_even:
2524 case float_round_ties_away:
2525 roundIncrement = 0x40;
2526 break;
2527 case float_round_to_zero:
2528 roundIncrement = 0;
2529 break;
2530 case float_round_up:
2531 roundIncrement = zSign ? 0 : 0x7f;
2532 break;
2533 case float_round_down:
2534 roundIncrement = zSign ? 0x7f : 0;
2535 break;
2536 default:
2537 abort();
2539 roundBits = absZ & 0x7F;
2540 absZ = ( absZ + roundIncrement )>>7;
2541 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2542 z = absZ;
2543 if ( zSign ) z = - z;
2544 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
2545 float_raise(float_flag_invalid, status);
2546 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2548 if (roundBits) {
2549 status->float_exception_flags |= float_flag_inexact;
2551 return z;
2555 /*----------------------------------------------------------------------------
2556 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2557 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2558 | and returns the properly rounded 64-bit integer corresponding to the input.
2559 | If `zSign' is 1, the input is negated before being converted to an integer.
2560 | Ordinarily, the fixed-point input is simply rounded to an integer, with
2561 | the inexact exception raised if the input cannot be represented exactly as
2562 | an integer. However, if the fixed-point input is too large, the invalid
2563 | exception is raised and the largest positive or negative integer is
2564 | returned.
2565 *----------------------------------------------------------------------------*/
2567 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
2568 float_status *status)
2570 int8_t roundingMode;
2571 flag roundNearestEven, increment;
2572 int64_t z;
2574 roundingMode = status->float_rounding_mode;
2575 roundNearestEven = ( roundingMode == float_round_nearest_even );
2576 switch (roundingMode) {
2577 case float_round_nearest_even:
2578 case float_round_ties_away:
2579 increment = ((int64_t) absZ1 < 0);
2580 break;
2581 case float_round_to_zero:
2582 increment = 0;
2583 break;
2584 case float_round_up:
2585 increment = !zSign && absZ1;
2586 break;
2587 case float_round_down:
2588 increment = zSign && absZ1;
2589 break;
2590 default:
2591 abort();
2593 if ( increment ) {
2594 ++absZ0;
2595 if ( absZ0 == 0 ) goto overflow;
2596 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
2598 z = absZ0;
2599 if ( zSign ) z = - z;
2600 if ( z && ( ( z < 0 ) ^ zSign ) ) {
2601 overflow:
2602 float_raise(float_flag_invalid, status);
2603 return
2604 zSign ? (int64_t) LIT64( 0x8000000000000000 )
2605 : LIT64( 0x7FFFFFFFFFFFFFFF );
2607 if (absZ1) {
2608 status->float_exception_flags |= float_flag_inexact;
2610 return z;
2614 /*----------------------------------------------------------------------------
2615 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2616 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2617 | and returns the properly rounded 64-bit unsigned integer corresponding to the
2618 | input. Ordinarily, the fixed-point input is simply rounded to an integer,
2619 | with the inexact exception raised if the input cannot be represented exactly
2620 | as an integer. However, if the fixed-point input is too large, the invalid
2621 | exception is raised and the largest unsigned integer is returned.
2622 *----------------------------------------------------------------------------*/
2624 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
2625 uint64_t absZ1, float_status *status)
2627 int8_t roundingMode;
2628 flag roundNearestEven, increment;
2630 roundingMode = status->float_rounding_mode;
2631 roundNearestEven = (roundingMode == float_round_nearest_even);
2632 switch (roundingMode) {
2633 case float_round_nearest_even:
2634 case float_round_ties_away:
2635 increment = ((int64_t)absZ1 < 0);
2636 break;
2637 case float_round_to_zero:
2638 increment = 0;
2639 break;
2640 case float_round_up:
2641 increment = !zSign && absZ1;
2642 break;
2643 case float_round_down:
2644 increment = zSign && absZ1;
2645 break;
2646 default:
2647 abort();
2649 if (increment) {
2650 ++absZ0;
2651 if (absZ0 == 0) {
2652 float_raise(float_flag_invalid, status);
2653 return LIT64(0xFFFFFFFFFFFFFFFF);
2655 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
2658 if (zSign && absZ0) {
2659 float_raise(float_flag_invalid, status);
2660 return 0;
2663 if (absZ1) {
2664 status->float_exception_flags |= float_flag_inexact;
2666 return absZ0;
2669 /*----------------------------------------------------------------------------
2670 | If `a' is denormal and we are in flush-to-zero mode then set the
2671 | input-denormal exception and return zero. Otherwise just return the value.
2672 *----------------------------------------------------------------------------*/
2673 float32 float32_squash_input_denormal(float32 a, float_status *status)
2675 if (status->flush_inputs_to_zero) {
2676 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
2677 float_raise(float_flag_input_denormal, status);
2678 return make_float32(float32_val(a) & 0x80000000);
2681 return a;
2684 /*----------------------------------------------------------------------------
2685 | Normalizes the subnormal single-precision floating-point value represented
2686 | by the denormalized significand `aSig'. The normalized exponent and
2687 | significand are stored at the locations pointed to by `zExpPtr' and
2688 | `zSigPtr', respectively.
2689 *----------------------------------------------------------------------------*/
2691 static void
2692 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
2694 int8_t shiftCount;
2696 shiftCount = clz32(aSig) - 8;
2697 *zSigPtr = aSig<<shiftCount;
2698 *zExpPtr = 1 - shiftCount;
2702 /*----------------------------------------------------------------------------
2703 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2704 | and significand `zSig', and returns the proper single-precision floating-
2705 | point value corresponding to the abstract input. Ordinarily, the abstract
2706 | value is simply rounded and packed into the single-precision format, with
2707 | the inexact exception raised if the abstract input cannot be represented
2708 | exactly. However, if the abstract value is too large, the overflow and
2709 | inexact exceptions are raised and an infinity or maximal finite value is
2710 | returned. If the abstract value is too small, the input value is rounded to
2711 | a subnormal number, and the underflow and inexact exceptions are raised if
2712 | the abstract input cannot be represented exactly as a subnormal single-
2713 | precision floating-point number.
2714 | The input significand `zSig' has its binary point between bits 30
2715 | and 29, which is 7 bits to the left of the usual location. This shifted
2716 | significand must be normalized or smaller. If `zSig' is not normalized,
2717 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2718 | and it must not require rounding. In the usual case that `zSig' is
2719 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2720 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2721 | Binary Floating-Point Arithmetic.
2722 *----------------------------------------------------------------------------*/
2724 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
2725 float_status *status)
2727 int8_t roundingMode;
2728 flag roundNearestEven;
2729 int8_t roundIncrement, roundBits;
2730 flag isTiny;
2732 roundingMode = status->float_rounding_mode;
2733 roundNearestEven = ( roundingMode == float_round_nearest_even );
2734 switch (roundingMode) {
2735 case float_round_nearest_even:
2736 case float_round_ties_away:
2737 roundIncrement = 0x40;
2738 break;
2739 case float_round_to_zero:
2740 roundIncrement = 0;
2741 break;
2742 case float_round_up:
2743 roundIncrement = zSign ? 0 : 0x7f;
2744 break;
2745 case float_round_down:
2746 roundIncrement = zSign ? 0x7f : 0;
2747 break;
2748 default:
2749 abort();
2750 break;
2752 roundBits = zSig & 0x7F;
2753 if ( 0xFD <= (uint16_t) zExp ) {
2754 if ( ( 0xFD < zExp )
2755 || ( ( zExp == 0xFD )
2756 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
2758 float_raise(float_flag_overflow | float_flag_inexact, status);
2759 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
2761 if ( zExp < 0 ) {
2762 if (status->flush_to_zero) {
2763 float_raise(float_flag_output_denormal, status);
2764 return packFloat32(zSign, 0, 0);
2766 isTiny =
2767 (status->float_detect_tininess
2768 == float_tininess_before_rounding)
2769 || ( zExp < -1 )
2770 || ( zSig + roundIncrement < 0x80000000 );
2771 shift32RightJamming( zSig, - zExp, &zSig );
2772 zExp = 0;
2773 roundBits = zSig & 0x7F;
2774 if (isTiny && roundBits) {
2775 float_raise(float_flag_underflow, status);
2779 if (roundBits) {
2780 status->float_exception_flags |= float_flag_inexact;
2782 zSig = ( zSig + roundIncrement )>>7;
2783 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2784 if ( zSig == 0 ) zExp = 0;
2785 return packFloat32( zSign, zExp, zSig );
2789 /*----------------------------------------------------------------------------
2790 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2791 | and significand `zSig', and returns the proper single-precision floating-
2792 | point value corresponding to the abstract input. This routine is just like
2793 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
2794 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2795 | floating-point exponent.
2796 *----------------------------------------------------------------------------*/
2798 static float32
2799 normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
2800 float_status *status)
2802 int8_t shiftCount;
2804 shiftCount = clz32(zSig) - 1;
2805 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
2806 status);
2810 /*----------------------------------------------------------------------------
2811 | If `a' is denormal and we are in flush-to-zero mode then set the
2812 | input-denormal exception and return zero. Otherwise just return the value.
2813 *----------------------------------------------------------------------------*/
2814 float64 float64_squash_input_denormal(float64 a, float_status *status)
2816 if (status->flush_inputs_to_zero) {
2817 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
2818 float_raise(float_flag_input_denormal, status);
2819 return make_float64(float64_val(a) & (1ULL << 63));
2822 return a;
2825 /*----------------------------------------------------------------------------
2826 | Normalizes the subnormal double-precision floating-point value represented
2827 | by the denormalized significand `aSig'. The normalized exponent and
2828 | significand are stored at the locations pointed to by `zExpPtr' and
2829 | `zSigPtr', respectively.
2830 *----------------------------------------------------------------------------*/
2832 static void
2833 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
2835 int8_t shiftCount;
2837 shiftCount = clz64(aSig) - 11;
2838 *zSigPtr = aSig<<shiftCount;
2839 *zExpPtr = 1 - shiftCount;
2843 /*----------------------------------------------------------------------------
2844 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2845 | double-precision floating-point value, returning the result. After being
2846 | shifted into the proper positions, the three fields are simply added
2847 | together to form the result. This means that any integer portion of `zSig'
2848 | will be added into the exponent. Since a properly normalized significand
2849 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2850 | than the desired result exponent whenever `zSig' is a complete, normalized
2851 | significand.
2852 *----------------------------------------------------------------------------*/
2854 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
2857 return make_float64(
2858 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
2862 /*----------------------------------------------------------------------------
2863 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2864 | and significand `zSig', and returns the proper double-precision floating-
2865 | point value corresponding to the abstract input. Ordinarily, the abstract
2866 | value is simply rounded and packed into the double-precision format, with
2867 | the inexact exception raised if the abstract input cannot be represented
2868 | exactly. However, if the abstract value is too large, the overflow and
2869 | inexact exceptions are raised and an infinity or maximal finite value is
2870 | returned. If the abstract value is too small, the input value is rounded to
2871 | a subnormal number, and the underflow and inexact exceptions are raised if
2872 | the abstract input cannot be represented exactly as a subnormal double-
2873 | precision floating-point number.
2874 | The input significand `zSig' has its binary point between bits 62
2875 | and 61, which is 10 bits to the left of the usual location. This shifted
2876 | significand must be normalized or smaller. If `zSig' is not normalized,
2877 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2878 | and it must not require rounding. In the usual case that `zSig' is
2879 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2880 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2881 | Binary Floating-Point Arithmetic.
2882 *----------------------------------------------------------------------------*/
2884 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2885 float_status *status)
2887 int8_t roundingMode;
2888 flag roundNearestEven;
2889 int roundIncrement, roundBits;
2890 flag isTiny;
2892 roundingMode = status->float_rounding_mode;
2893 roundNearestEven = ( roundingMode == float_round_nearest_even );
2894 switch (roundingMode) {
2895 case float_round_nearest_even:
2896 case float_round_ties_away:
2897 roundIncrement = 0x200;
2898 break;
2899 case float_round_to_zero:
2900 roundIncrement = 0;
2901 break;
2902 case float_round_up:
2903 roundIncrement = zSign ? 0 : 0x3ff;
2904 break;
2905 case float_round_down:
2906 roundIncrement = zSign ? 0x3ff : 0;
2907 break;
2908 case float_round_to_odd:
2909 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2910 break;
2911 default:
2912 abort();
2914 roundBits = zSig & 0x3FF;
2915 if ( 0x7FD <= (uint16_t) zExp ) {
2916 if ( ( 0x7FD < zExp )
2917 || ( ( zExp == 0x7FD )
2918 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
2920 bool overflow_to_inf = roundingMode != float_round_to_odd &&
2921 roundIncrement != 0;
2922 float_raise(float_flag_overflow | float_flag_inexact, status);
2923 return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
2925 if ( zExp < 0 ) {
2926 if (status->flush_to_zero) {
2927 float_raise(float_flag_output_denormal, status);
2928 return packFloat64(zSign, 0, 0);
2930 isTiny =
2931 (status->float_detect_tininess
2932 == float_tininess_before_rounding)
2933 || ( zExp < -1 )
2934 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
2935 shift64RightJamming( zSig, - zExp, &zSig );
2936 zExp = 0;
2937 roundBits = zSig & 0x3FF;
2938 if (isTiny && roundBits) {
2939 float_raise(float_flag_underflow, status);
2941 if (roundingMode == float_round_to_odd) {
2943 * For round-to-odd case, the roundIncrement depends on
2944 * zSig which just changed.
2946 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2950 if (roundBits) {
2951 status->float_exception_flags |= float_flag_inexact;
2953 zSig = ( zSig + roundIncrement )>>10;
2954 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
2955 if ( zSig == 0 ) zExp = 0;
2956 return packFloat64( zSign, zExp, zSig );
2960 /*----------------------------------------------------------------------------
2961 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2962 | and significand `zSig', and returns the proper double-precision floating-
2963 | point value corresponding to the abstract input. This routine is just like
2964 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
2965 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2966 | floating-point exponent.
2967 *----------------------------------------------------------------------------*/
2969 static float64
2970 normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2971 float_status *status)
2973 int8_t shiftCount;
2975 shiftCount = clz64(zSig) - 1;
2976 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
2977 status);
2981 /*----------------------------------------------------------------------------
2982 | Normalizes the subnormal extended double-precision floating-point value
2983 | represented by the denormalized significand `aSig'. The normalized exponent
2984 | and significand are stored at the locations pointed to by `zExpPtr' and
2985 | `zSigPtr', respectively.
2986 *----------------------------------------------------------------------------*/
2988 void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
2989 uint64_t *zSigPtr)
2991 int8_t shiftCount;
2993 shiftCount = clz64(aSig);
2994 *zSigPtr = aSig<<shiftCount;
2995 *zExpPtr = 1 - shiftCount;
2998 /*----------------------------------------------------------------------------
2999 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3000 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
3001 | and returns the proper extended double-precision floating-point value
3002 | corresponding to the abstract input. Ordinarily, the abstract value is
3003 | rounded and packed into the extended double-precision format, with the
3004 | inexact exception raised if the abstract input cannot be represented
3005 | exactly. However, if the abstract value is too large, the overflow and
3006 | inexact exceptions are raised and an infinity or maximal finite value is
3007 | returned. If the abstract value is too small, the input value is rounded to
3008 | a subnormal number, and the underflow and inexact exceptions are raised if
3009 | the abstract input cannot be represented exactly as a subnormal extended
3010 | double-precision floating-point number.
3011 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
3012 | number of bits as single or double precision, respectively. Otherwise, the
3013 | result is rounded to the full precision of the extended double-precision
3014 | format.
3015 | The input significand must be normalized or smaller. If the input
3016 | significand is not normalized, `zExp' must be 0; in that case, the result
3017 | returned is a subnormal number, and it must not require rounding. The
3018 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
3019 | Floating-Point Arithmetic.
3020 *----------------------------------------------------------------------------*/
3022 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
3023 int32_t zExp, uint64_t zSig0, uint64_t zSig1,
3024 float_status *status)
3026 int8_t roundingMode;
3027 flag roundNearestEven, increment, isTiny;
3028 int64_t roundIncrement, roundMask, roundBits;
3030 roundingMode = status->float_rounding_mode;
3031 roundNearestEven = ( roundingMode == float_round_nearest_even );
3032 if ( roundingPrecision == 80 ) goto precision80;
3033 if ( roundingPrecision == 64 ) {
3034 roundIncrement = LIT64( 0x0000000000000400 );
3035 roundMask = LIT64( 0x00000000000007FF );
3037 else if ( roundingPrecision == 32 ) {
3038 roundIncrement = LIT64( 0x0000008000000000 );
3039 roundMask = LIT64( 0x000000FFFFFFFFFF );
3041 else {
3042 goto precision80;
3044 zSig0 |= ( zSig1 != 0 );
3045 switch (roundingMode) {
3046 case float_round_nearest_even:
3047 case float_round_ties_away:
3048 break;
3049 case float_round_to_zero:
3050 roundIncrement = 0;
3051 break;
3052 case float_round_up:
3053 roundIncrement = zSign ? 0 : roundMask;
3054 break;
3055 case float_round_down:
3056 roundIncrement = zSign ? roundMask : 0;
3057 break;
3058 default:
3059 abort();
3061 roundBits = zSig0 & roundMask;
3062 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
3063 if ( ( 0x7FFE < zExp )
3064 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
3066 goto overflow;
3068 if ( zExp <= 0 ) {
3069 if (status->flush_to_zero) {
3070 float_raise(float_flag_output_denormal, status);
3071 return packFloatx80(zSign, 0, 0);
3073 isTiny =
3074 (status->float_detect_tininess
3075 == float_tininess_before_rounding)
3076 || ( zExp < 0 )
3077 || ( zSig0 <= zSig0 + roundIncrement );
3078 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
3079 zExp = 0;
3080 roundBits = zSig0 & roundMask;
3081 if (isTiny && roundBits) {
3082 float_raise(float_flag_underflow, status);
3084 if (roundBits) {
3085 status->float_exception_flags |= float_flag_inexact;
3087 zSig0 += roundIncrement;
3088 if ( (int64_t) zSig0 < 0 ) zExp = 1;
3089 roundIncrement = roundMask + 1;
3090 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
3091 roundMask |= roundIncrement;
3093 zSig0 &= ~ roundMask;
3094 return packFloatx80( zSign, zExp, zSig0 );
3097 if (roundBits) {
3098 status->float_exception_flags |= float_flag_inexact;
3100 zSig0 += roundIncrement;
3101 if ( zSig0 < roundIncrement ) {
3102 ++zExp;
3103 zSig0 = LIT64( 0x8000000000000000 );
3105 roundIncrement = roundMask + 1;
3106 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
3107 roundMask |= roundIncrement;
3109 zSig0 &= ~ roundMask;
3110 if ( zSig0 == 0 ) zExp = 0;
3111 return packFloatx80( zSign, zExp, zSig0 );
3112 precision80:
3113 switch (roundingMode) {
3114 case float_round_nearest_even:
3115 case float_round_ties_away:
3116 increment = ((int64_t)zSig1 < 0);
3117 break;
3118 case float_round_to_zero:
3119 increment = 0;
3120 break;
3121 case float_round_up:
3122 increment = !zSign && zSig1;
3123 break;
3124 case float_round_down:
3125 increment = zSign && zSig1;
3126 break;
3127 default:
3128 abort();
3130 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
3131 if ( ( 0x7FFE < zExp )
3132 || ( ( zExp == 0x7FFE )
3133 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
3134 && increment
3137 roundMask = 0;
3138 overflow:
3139 float_raise(float_flag_overflow | float_flag_inexact, status);
3140 if ( ( roundingMode == float_round_to_zero )
3141 || ( zSign && ( roundingMode == float_round_up ) )
3142 || ( ! zSign && ( roundingMode == float_round_down ) )
3144 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
3146 return packFloatx80(zSign,
3147 floatx80_infinity_high,
3148 floatx80_infinity_low);
3150 if ( zExp <= 0 ) {
3151 isTiny =
3152 (status->float_detect_tininess
3153 == float_tininess_before_rounding)
3154 || ( zExp < 0 )
3155 || ! increment
3156 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
3157 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
3158 zExp = 0;
3159 if (isTiny && zSig1) {
3160 float_raise(float_flag_underflow, status);
3162 if (zSig1) {
3163 status->float_exception_flags |= float_flag_inexact;
3165 switch (roundingMode) {
3166 case float_round_nearest_even:
3167 case float_round_ties_away:
3168 increment = ((int64_t)zSig1 < 0);
3169 break;
3170 case float_round_to_zero:
3171 increment = 0;
3172 break;
3173 case float_round_up:
3174 increment = !zSign && zSig1;
3175 break;
3176 case float_round_down:
3177 increment = zSign && zSig1;
3178 break;
3179 default:
3180 abort();
3182 if ( increment ) {
3183 ++zSig0;
3184 zSig0 &=
3185 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
3186 if ( (int64_t) zSig0 < 0 ) zExp = 1;
3188 return packFloatx80( zSign, zExp, zSig0 );
3191 if (zSig1) {
3192 status->float_exception_flags |= float_flag_inexact;
3194 if ( increment ) {
3195 ++zSig0;
3196 if ( zSig0 == 0 ) {
3197 ++zExp;
3198 zSig0 = LIT64( 0x8000000000000000 );
3200 else {
3201 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
3204 else {
3205 if ( zSig0 == 0 ) zExp = 0;
3207 return packFloatx80( zSign, zExp, zSig0 );
3211 /*----------------------------------------------------------------------------
3212 | Takes an abstract floating-point value having sign `zSign', exponent
3213 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
3214 | and returns the proper extended double-precision floating-point value
3215 | corresponding to the abstract input. This routine is just like
3216 | `roundAndPackFloatx80' except that the input significand does not have to be
3217 | normalized.
3218 *----------------------------------------------------------------------------*/
3220 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
3221 flag zSign, int32_t zExp,
3222 uint64_t zSig0, uint64_t zSig1,
3223 float_status *status)
3225 int8_t shiftCount;
3227 if ( zSig0 == 0 ) {
3228 zSig0 = zSig1;
3229 zSig1 = 0;
3230 zExp -= 64;
3232 shiftCount = clz64(zSig0);
3233 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3234 zExp -= shiftCount;
3235 return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
3236 zSig0, zSig1, status);
3240 /*----------------------------------------------------------------------------
3241 | Returns the least-significant 64 fraction bits of the quadruple-precision
3242 | floating-point value `a'.
3243 *----------------------------------------------------------------------------*/
3245 static inline uint64_t extractFloat128Frac1( float128 a )
3248 return a.low;
3252 /*----------------------------------------------------------------------------
3253 | Returns the most-significant 48 fraction bits of the quadruple-precision
3254 | floating-point value `a'.
3255 *----------------------------------------------------------------------------*/
3257 static inline uint64_t extractFloat128Frac0( float128 a )
3260 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
3264 /*----------------------------------------------------------------------------
3265 | Returns the exponent bits of the quadruple-precision floating-point value
3266 | `a'.
3267 *----------------------------------------------------------------------------*/
3269 static inline int32_t extractFloat128Exp( float128 a )
3272 return ( a.high>>48 ) & 0x7FFF;
3276 /*----------------------------------------------------------------------------
3277 | Returns the sign bit of the quadruple-precision floating-point value `a'.
3278 *----------------------------------------------------------------------------*/
3280 static inline flag extractFloat128Sign( float128 a )
3283 return a.high>>63;
3287 /*----------------------------------------------------------------------------
3288 | Normalizes the subnormal quadruple-precision floating-point value
3289 | represented by the denormalized significand formed by the concatenation of
3290 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
3291 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
3292 | significand are stored at the location pointed to by `zSig0Ptr', and the
3293 | least significant 64 bits of the normalized significand are stored at the
3294 | location pointed to by `zSig1Ptr'.
3295 *----------------------------------------------------------------------------*/
3297 static void
3298 normalizeFloat128Subnormal(
3299 uint64_t aSig0,
3300 uint64_t aSig1,
3301 int32_t *zExpPtr,
3302 uint64_t *zSig0Ptr,
3303 uint64_t *zSig1Ptr
3306 int8_t shiftCount;
3308 if ( aSig0 == 0 ) {
3309 shiftCount = clz64(aSig1) - 15;
3310 if ( shiftCount < 0 ) {
3311 *zSig0Ptr = aSig1>>( - shiftCount );
3312 *zSig1Ptr = aSig1<<( shiftCount & 63 );
3314 else {
3315 *zSig0Ptr = aSig1<<shiftCount;
3316 *zSig1Ptr = 0;
3318 *zExpPtr = - shiftCount - 63;
3320 else {
3321 shiftCount = clz64(aSig0) - 15;
3322 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
3323 *zExpPtr = 1 - shiftCount;
3328 /*----------------------------------------------------------------------------
3329 | Packs the sign `zSign', the exponent `zExp', and the significand formed
3330 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
3331 | floating-point value, returning the result. After being shifted into the
3332 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
3333 | added together to form the most significant 32 bits of the result. This
3334 | means that any integer portion of `zSig0' will be added into the exponent.
3335 | Since a properly normalized significand will have an integer portion equal
3336 | to 1, the `zExp' input should be 1 less than the desired result exponent
3337 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
3338 | significand.
3339 *----------------------------------------------------------------------------*/
3341 static inline float128
3342 packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
3344 float128 z;
3346 z.low = zSig1;
3347 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
3348 return z;
3352 /*----------------------------------------------------------------------------
3353 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3354 | and extended significand formed by the concatenation of `zSig0', `zSig1',
3355 | and `zSig2', and returns the proper quadruple-precision floating-point value
3356 | corresponding to the abstract input. Ordinarily, the abstract value is
3357 | simply rounded and packed into the quadruple-precision format, with the
3358 | inexact exception raised if the abstract input cannot be represented
3359 | exactly. However, if the abstract value is too large, the overflow and
3360 | inexact exceptions are raised and an infinity or maximal finite value is
3361 | returned. If the abstract value is too small, the input value is rounded to
3362 | a subnormal number, and the underflow and inexact exceptions are raised if
3363 | the abstract input cannot be represented exactly as a subnormal quadruple-
3364 | precision floating-point number.
3365 | The input significand must be normalized or smaller. If the input
3366 | significand is not normalized, `zExp' must be 0; in that case, the result
3367 | returned is a subnormal number, and it must not require rounding. In the
3368 | usual case that the input significand is normalized, `zExp' must be 1 less
3369 | than the ``true'' floating-point exponent. The handling of underflow and
3370 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3371 *----------------------------------------------------------------------------*/
3373 static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
3374 uint64_t zSig0, uint64_t zSig1,
3375 uint64_t zSig2, float_status *status)
3377 int8_t roundingMode;
3378 flag roundNearestEven, increment, isTiny;
3380 roundingMode = status->float_rounding_mode;
3381 roundNearestEven = ( roundingMode == float_round_nearest_even );
3382 switch (roundingMode) {
3383 case float_round_nearest_even:
3384 case float_round_ties_away:
3385 increment = ((int64_t)zSig2 < 0);
3386 break;
3387 case float_round_to_zero:
3388 increment = 0;
3389 break;
3390 case float_round_up:
3391 increment = !zSign && zSig2;
3392 break;
3393 case float_round_down:
3394 increment = zSign && zSig2;
3395 break;
3396 case float_round_to_odd:
3397 increment = !(zSig1 & 0x1) && zSig2;
3398 break;
3399 default:
3400 abort();
3402 if ( 0x7FFD <= (uint32_t) zExp ) {
3403 if ( ( 0x7FFD < zExp )
3404 || ( ( zExp == 0x7FFD )
3405 && eq128(
3406 LIT64( 0x0001FFFFFFFFFFFF ),
3407 LIT64( 0xFFFFFFFFFFFFFFFF ),
3408 zSig0,
3409 zSig1
3411 && increment
3414 float_raise(float_flag_overflow | float_flag_inexact, status);
3415 if ( ( roundingMode == float_round_to_zero )
3416 || ( zSign && ( roundingMode == float_round_up ) )
3417 || ( ! zSign && ( roundingMode == float_round_down ) )
3418 || (roundingMode == float_round_to_odd)
3420 return
3421 packFloat128(
3422 zSign,
3423 0x7FFE,
3424 LIT64( 0x0000FFFFFFFFFFFF ),
3425 LIT64( 0xFFFFFFFFFFFFFFFF )
3428 return packFloat128( zSign, 0x7FFF, 0, 0 );
3430 if ( zExp < 0 ) {
3431 if (status->flush_to_zero) {
3432 float_raise(float_flag_output_denormal, status);
3433 return packFloat128(zSign, 0, 0, 0);
3435 isTiny =
3436 (status->float_detect_tininess
3437 == float_tininess_before_rounding)
3438 || ( zExp < -1 )
3439 || ! increment
3440 || lt128(
3441 zSig0,
3442 zSig1,
3443 LIT64( 0x0001FFFFFFFFFFFF ),
3444 LIT64( 0xFFFFFFFFFFFFFFFF )
3446 shift128ExtraRightJamming(
3447 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
3448 zExp = 0;
3449 if (isTiny && zSig2) {
3450 float_raise(float_flag_underflow, status);
3452 switch (roundingMode) {
3453 case float_round_nearest_even:
3454 case float_round_ties_away:
3455 increment = ((int64_t)zSig2 < 0);
3456 break;
3457 case float_round_to_zero:
3458 increment = 0;
3459 break;
3460 case float_round_up:
3461 increment = !zSign && zSig2;
3462 break;
3463 case float_round_down:
3464 increment = zSign && zSig2;
3465 break;
3466 case float_round_to_odd:
3467 increment = !(zSig1 & 0x1) && zSig2;
3468 break;
3469 default:
3470 abort();
3474 if (zSig2) {
3475 status->float_exception_flags |= float_flag_inexact;
3477 if ( increment ) {
3478 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
3479 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
3481 else {
3482 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
3484 return packFloat128( zSign, zExp, zSig0, zSig1 );
3488 /*----------------------------------------------------------------------------
3489 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3490 | and significand formed by the concatenation of `zSig0' and `zSig1', and
3491 | returns the proper quadruple-precision floating-point value corresponding
3492 | to the abstract input. This routine is just like `roundAndPackFloat128'
3493 | except that the input significand has fewer bits and does not have to be
3494 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
3495 | point exponent.
3496 *----------------------------------------------------------------------------*/
3498 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
3499 uint64_t zSig0, uint64_t zSig1,
3500 float_status *status)
3502 int8_t shiftCount;
3503 uint64_t zSig2;
3505 if ( zSig0 == 0 ) {
3506 zSig0 = zSig1;
3507 zSig1 = 0;
3508 zExp -= 64;
3510 shiftCount = clz64(zSig0) - 15;
3511 if ( 0 <= shiftCount ) {
3512 zSig2 = 0;
3513 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3515 else {
3516 shift128ExtraRightJamming(
3517 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
3519 zExp -= shiftCount;
3520 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
3525 /*----------------------------------------------------------------------------
3526 | Returns the result of converting the 32-bit two's complement integer `a'
3527 | to the extended double-precision floating-point format. The conversion
3528 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3529 | Arithmetic.
3530 *----------------------------------------------------------------------------*/
3532 floatx80 int32_to_floatx80(int32_t a, float_status *status)
3534 flag zSign;
3535 uint32_t absA;
3536 int8_t shiftCount;
3537 uint64_t zSig;
3539 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3540 zSign = ( a < 0 );
3541 absA = zSign ? - a : a;
3542 shiftCount = clz32(absA) + 32;
3543 zSig = absA;
3544 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
3548 /*----------------------------------------------------------------------------
3549 | Returns the result of converting the 32-bit two's complement integer `a' to
3550 | the quadruple-precision floating-point format. The conversion is performed
3551 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3552 *----------------------------------------------------------------------------*/
3554 float128 int32_to_float128(int32_t a, float_status *status)
3556 flag zSign;
3557 uint32_t absA;
3558 int8_t shiftCount;
3559 uint64_t zSig0;
3561 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3562 zSign = ( a < 0 );
3563 absA = zSign ? - a : a;
3564 shiftCount = clz32(absA) + 17;
3565 zSig0 = absA;
3566 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
3570 /*----------------------------------------------------------------------------
3571 | Returns the result of converting the 64-bit two's complement integer `a'
3572 | to the extended double-precision floating-point format. The conversion
3573 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3574 | Arithmetic.
3575 *----------------------------------------------------------------------------*/
3577 floatx80 int64_to_floatx80(int64_t a, float_status *status)
3579 flag zSign;
3580 uint64_t absA;
3581 int8_t shiftCount;
3583 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3584 zSign = ( a < 0 );
3585 absA = zSign ? - a : a;
3586 shiftCount = clz64(absA);
3587 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
3591 /*----------------------------------------------------------------------------
3592 | Returns the result of converting the 64-bit two's complement integer `a' to
3593 | the quadruple-precision floating-point format. The conversion is performed
3594 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3595 *----------------------------------------------------------------------------*/
3597 float128 int64_to_float128(int64_t a, float_status *status)
3599 flag zSign;
3600 uint64_t absA;
3601 int8_t shiftCount;
3602 int32_t zExp;
3603 uint64_t zSig0, zSig1;
3605 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3606 zSign = ( a < 0 );
3607 absA = zSign ? - a : a;
3608 shiftCount = clz64(absA) + 49;
3609 zExp = 0x406E - shiftCount;
3610 if ( 64 <= shiftCount ) {
3611 zSig1 = 0;
3612 zSig0 = absA;
3613 shiftCount -= 64;
3615 else {
3616 zSig1 = absA;
3617 zSig0 = 0;
3619 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3620 return packFloat128( zSign, zExp, zSig0, zSig1 );
3624 /*----------------------------------------------------------------------------
3625 | Returns the result of converting the 64-bit unsigned integer `a'
3626 | to the quadruple-precision floating-point format. The conversion is performed
3627 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3628 *----------------------------------------------------------------------------*/
3630 float128 uint64_to_float128(uint64_t a, float_status *status)
3632 if (a == 0) {
3633 return float128_zero;
3635 return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status);
3638 /*----------------------------------------------------------------------------
3639 | Returns the result of converting the single-precision floating-point value
3640 | `a' to the extended double-precision floating-point format. The conversion
3641 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3642 | Arithmetic.
3643 *----------------------------------------------------------------------------*/
3645 floatx80 float32_to_floatx80(float32 a, float_status *status)
3647 flag aSign;
3648 int aExp;
3649 uint32_t aSig;
3651 a = float32_squash_input_denormal(a, status);
3652 aSig = extractFloat32Frac( a );
3653 aExp = extractFloat32Exp( a );
3654 aSign = extractFloat32Sign( a );
3655 if ( aExp == 0xFF ) {
3656 if (aSig) {
3657 return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
3659 return packFloatx80(aSign,
3660 floatx80_infinity_high,
3661 floatx80_infinity_low);
3663 if ( aExp == 0 ) {
3664 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3665 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3667 aSig |= 0x00800000;
3668 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
3672 /*----------------------------------------------------------------------------
3673 | Returns the result of converting the single-precision floating-point value
3674 | `a' to the double-precision floating-point format. The conversion is
3675 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3676 | Arithmetic.
3677 *----------------------------------------------------------------------------*/
3679 float128 float32_to_float128(float32 a, float_status *status)
3681 flag aSign;
3682 int aExp;
3683 uint32_t aSig;
3685 a = float32_squash_input_denormal(a, status);
3686 aSig = extractFloat32Frac( a );
3687 aExp = extractFloat32Exp( a );
3688 aSign = extractFloat32Sign( a );
3689 if ( aExp == 0xFF ) {
3690 if (aSig) {
3691 return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
3693 return packFloat128( aSign, 0x7FFF, 0, 0 );
3695 if ( aExp == 0 ) {
3696 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3697 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3698 --aExp;
3700 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
3704 /*----------------------------------------------------------------------------
3705 | Returns the remainder of the single-precision floating-point value `a'
3706 | with respect to the corresponding value `b'. The operation is performed
3707 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3708 *----------------------------------------------------------------------------*/
3710 float32 float32_rem(float32 a, float32 b, float_status *status)
3712 flag aSign, zSign;
3713 int aExp, bExp, expDiff;
3714 uint32_t aSig, bSig;
3715 uint32_t q;
3716 uint64_t aSig64, bSig64, q64;
3717 uint32_t alternateASig;
3718 int32_t sigMean;
3719 a = float32_squash_input_denormal(a, status);
3720 b = float32_squash_input_denormal(b, status);
3722 aSig = extractFloat32Frac( a );
3723 aExp = extractFloat32Exp( a );
3724 aSign = extractFloat32Sign( a );
3725 bSig = extractFloat32Frac( b );
3726 bExp = extractFloat32Exp( b );
3727 if ( aExp == 0xFF ) {
3728 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
3729 return propagateFloat32NaN(a, b, status);
3731 float_raise(float_flag_invalid, status);
3732 return float32_default_nan(status);
3734 if ( bExp == 0xFF ) {
3735 if (bSig) {
3736 return propagateFloat32NaN(a, b, status);
3738 return a;
3740 if ( bExp == 0 ) {
3741 if ( bSig == 0 ) {
3742 float_raise(float_flag_invalid, status);
3743 return float32_default_nan(status);
3745 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
3747 if ( aExp == 0 ) {
3748 if ( aSig == 0 ) return a;
3749 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3751 expDiff = aExp - bExp;
3752 aSig |= 0x00800000;
3753 bSig |= 0x00800000;
3754 if ( expDiff < 32 ) {
3755 aSig <<= 8;
3756 bSig <<= 8;
3757 if ( expDiff < 0 ) {
3758 if ( expDiff < -1 ) return a;
3759 aSig >>= 1;
3761 q = ( bSig <= aSig );
3762 if ( q ) aSig -= bSig;
3763 if ( 0 < expDiff ) {
3764 q = ( ( (uint64_t) aSig )<<32 ) / bSig;
3765 q >>= 32 - expDiff;
3766 bSig >>= 2;
3767 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3769 else {
3770 aSig >>= 2;
3771 bSig >>= 2;
3774 else {
3775 if ( bSig <= aSig ) aSig -= bSig;
3776 aSig64 = ( (uint64_t) aSig )<<40;
3777 bSig64 = ( (uint64_t) bSig )<<40;
3778 expDiff -= 64;
3779 while ( 0 < expDiff ) {
3780 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3781 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3782 aSig64 = - ( ( bSig * q64 )<<38 );
3783 expDiff -= 62;
3785 expDiff += 64;
3786 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3787 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3788 q = q64>>( 64 - expDiff );
3789 bSig <<= 6;
3790 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
3792 do {
3793 alternateASig = aSig;
3794 ++q;
3795 aSig -= bSig;
3796 } while ( 0 <= (int32_t) aSig );
3797 sigMean = aSig + alternateASig;
3798 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3799 aSig = alternateASig;
3801 zSign = ( (int32_t) aSig < 0 );
3802 if ( zSign ) aSig = - aSig;
3803 return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
3808 /*----------------------------------------------------------------------------
3809 | Returns the binary exponential of the single-precision floating-point value
3810 | `a'. The operation is performed according to the IEC/IEEE Standard for
3811 | Binary Floating-Point Arithmetic.
3813 | Uses the following identities:
3815 | 1. -------------------------------------------------------------------------
3816 | x x*ln(2)
3817 | 2 = e
3819 | 2. -------------------------------------------------------------------------
3820 | 2 3 4 5 n
3821 | x x x x x x x
3822 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
3823 | 1! 2! 3! 4! 5! n!
3824 *----------------------------------------------------------------------------*/
3826 static const float64 float32_exp2_coefficients[15] =
3828 const_float64( 0x3ff0000000000000ll ), /* 1 */
3829 const_float64( 0x3fe0000000000000ll ), /* 2 */
3830 const_float64( 0x3fc5555555555555ll ), /* 3 */
3831 const_float64( 0x3fa5555555555555ll ), /* 4 */
3832 const_float64( 0x3f81111111111111ll ), /* 5 */
3833 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */
3834 const_float64( 0x3f2a01a01a01a01all ), /* 7 */
3835 const_float64( 0x3efa01a01a01a01all ), /* 8 */
3836 const_float64( 0x3ec71de3a556c734ll ), /* 9 */
3837 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
3838 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
3839 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
3840 const_float64( 0x3de6124613a86d09ll ), /* 13 */
3841 const_float64( 0x3da93974a8c07c9dll ), /* 14 */
3842 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
3845 float32 float32_exp2(float32 a, float_status *status)
3847 flag aSign;
3848 int aExp;
3849 uint32_t aSig;
3850 float64 r, x, xn;
3851 int i;
3852 a = float32_squash_input_denormal(a, status);
3854 aSig = extractFloat32Frac( a );
3855 aExp = extractFloat32Exp( a );
3856 aSign = extractFloat32Sign( a );
3858 if ( aExp == 0xFF) {
3859 if (aSig) {
3860 return propagateFloat32NaN(a, float32_zero, status);
3862 return (aSign) ? float32_zero : a;
3864 if (aExp == 0) {
3865 if (aSig == 0) return float32_one;
3868 float_raise(float_flag_inexact, status);
3870 /* ******************************* */
3871 /* using float64 for approximation */
3872 /* ******************************* */
3873 x = float32_to_float64(a, status);
3874 x = float64_mul(x, float64_ln2, status);
3876 xn = x;
3877 r = float64_one;
3878 for (i = 0 ; i < 15 ; i++) {
3879 float64 f;
3881 f = float64_mul(xn, float32_exp2_coefficients[i], status);
3882 r = float64_add(r, f, status);
3884 xn = float64_mul(xn, x, status);
3887 return float64_to_float32(r, status);
3890 /*----------------------------------------------------------------------------
3891 | Returns the binary log of the single-precision floating-point value `a'.
3892 | The operation is performed according to the IEC/IEEE Standard for Binary
3893 | Floating-Point Arithmetic.
3894 *----------------------------------------------------------------------------*/
3895 float32 float32_log2(float32 a, float_status *status)
3897 flag aSign, zSign;
3898 int aExp;
3899 uint32_t aSig, zSig, i;
3901 a = float32_squash_input_denormal(a, status);
3902 aSig = extractFloat32Frac( a );
3903 aExp = extractFloat32Exp( a );
3904 aSign = extractFloat32Sign( a );
3906 if ( aExp == 0 ) {
3907 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
3908 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3910 if ( aSign ) {
3911 float_raise(float_flag_invalid, status);
3912 return float32_default_nan(status);
3914 if ( aExp == 0xFF ) {
3915 if (aSig) {
3916 return propagateFloat32NaN(a, float32_zero, status);
3918 return a;
3921 aExp -= 0x7F;
3922 aSig |= 0x00800000;
3923 zSign = aExp < 0;
3924 zSig = aExp << 23;
3926 for (i = 1 << 22; i > 0; i >>= 1) {
3927 aSig = ( (uint64_t)aSig * aSig ) >> 23;
3928 if ( aSig & 0x01000000 ) {
3929 aSig >>= 1;
3930 zSig |= i;
3934 if ( zSign )
3935 zSig = -zSig;
3937 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
3940 /*----------------------------------------------------------------------------
3941 | Returns 1 if the single-precision floating-point value `a' is equal to
3942 | the corresponding value `b', and 0 otherwise. The invalid exception is
3943 | raised if either operand is a NaN. Otherwise, the comparison is performed
3944 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3945 *----------------------------------------------------------------------------*/
3947 int float32_eq(float32 a, float32 b, float_status *status)
3949 uint32_t av, bv;
3950 a = float32_squash_input_denormal(a, status);
3951 b = float32_squash_input_denormal(b, status);
3953 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3954 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3956 float_raise(float_flag_invalid, status);
3957 return 0;
3959 av = float32_val(a);
3960 bv = float32_val(b);
3961 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3964 /*----------------------------------------------------------------------------
3965 | Returns 1 if the single-precision floating-point value `a' is less than
3966 | or equal to the corresponding value `b', and 0 otherwise. The invalid
3967 | exception is raised if either operand is a NaN. The comparison is performed
3968 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3969 *----------------------------------------------------------------------------*/
3971 int float32_le(float32 a, float32 b, float_status *status)
3973 flag aSign, bSign;
3974 uint32_t av, bv;
3975 a = float32_squash_input_denormal(a, status);
3976 b = float32_squash_input_denormal(b, status);
3978 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3979 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3981 float_raise(float_flag_invalid, status);
3982 return 0;
3984 aSign = extractFloat32Sign( a );
3985 bSign = extractFloat32Sign( b );
3986 av = float32_val(a);
3987 bv = float32_val(b);
3988 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3989 return ( av == bv ) || ( aSign ^ ( av < bv ) );
3993 /*----------------------------------------------------------------------------
3994 | Returns 1 if the single-precision floating-point value `a' is less than
3995 | the corresponding value `b', and 0 otherwise. The invalid exception is
3996 | raised if either operand is a NaN. The comparison is performed according
3997 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3998 *----------------------------------------------------------------------------*/
4000 int float32_lt(float32 a, float32 b, float_status *status)
4002 flag aSign, bSign;
4003 uint32_t av, bv;
4004 a = float32_squash_input_denormal(a, status);
4005 b = float32_squash_input_denormal(b, status);
4007 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4008 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4010 float_raise(float_flag_invalid, status);
4011 return 0;
4013 aSign = extractFloat32Sign( a );
4014 bSign = extractFloat32Sign( b );
4015 av = float32_val(a);
4016 bv = float32_val(b);
4017 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
4018 return ( av != bv ) && ( aSign ^ ( av < bv ) );
4022 /*----------------------------------------------------------------------------
4023 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
4024 | be compared, and 0 otherwise. The invalid exception is raised if either
4025 | operand is a NaN. The comparison is performed according to the IEC/IEEE
4026 | Standard for Binary Floating-Point Arithmetic.
4027 *----------------------------------------------------------------------------*/
4029 int float32_unordered(float32 a, float32 b, float_status *status)
4031 a = float32_squash_input_denormal(a, status);
4032 b = float32_squash_input_denormal(b, status);
4034 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4035 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4037 float_raise(float_flag_invalid, status);
4038 return 1;
4040 return 0;
4043 /*----------------------------------------------------------------------------
4044 | Returns 1 if the single-precision floating-point value `a' is equal to
4045 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4046 | exception. The comparison is performed according to the IEC/IEEE Standard
4047 | for Binary Floating-Point Arithmetic.
4048 *----------------------------------------------------------------------------*/
4050 int float32_eq_quiet(float32 a, float32 b, float_status *status)
4052 a = float32_squash_input_denormal(a, status);
4053 b = float32_squash_input_denormal(b, status);
4055 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4056 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4058 if (float32_is_signaling_nan(a, status)
4059 || float32_is_signaling_nan(b, status)) {
4060 float_raise(float_flag_invalid, status);
4062 return 0;
4064 return ( float32_val(a) == float32_val(b) ) ||
4065 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
4068 /*----------------------------------------------------------------------------
4069 | Returns 1 if the single-precision floating-point value `a' is less than or
4070 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4071 | cause an exception. Otherwise, the comparison is performed according to the
4072 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4073 *----------------------------------------------------------------------------*/
4075 int float32_le_quiet(float32 a, float32 b, float_status *status)
4077 flag aSign, bSign;
4078 uint32_t av, bv;
4079 a = float32_squash_input_denormal(a, status);
4080 b = float32_squash_input_denormal(b, status);
4082 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4083 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4085 if (float32_is_signaling_nan(a, status)
4086 || float32_is_signaling_nan(b, status)) {
4087 float_raise(float_flag_invalid, status);
4089 return 0;
4091 aSign = extractFloat32Sign( a );
4092 bSign = extractFloat32Sign( b );
4093 av = float32_val(a);
4094 bv = float32_val(b);
4095 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
4096 return ( av == bv ) || ( aSign ^ ( av < bv ) );
4100 /*----------------------------------------------------------------------------
4101 | Returns 1 if the single-precision floating-point value `a' is less than
4102 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4103 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
4104 | Standard for Binary Floating-Point Arithmetic.
4105 *----------------------------------------------------------------------------*/
4107 int float32_lt_quiet(float32 a, float32 b, float_status *status)
4109 flag aSign, bSign;
4110 uint32_t av, bv;
4111 a = float32_squash_input_denormal(a, status);
4112 b = float32_squash_input_denormal(b, status);
4114 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4115 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4117 if (float32_is_signaling_nan(a, status)
4118 || float32_is_signaling_nan(b, status)) {
4119 float_raise(float_flag_invalid, status);
4121 return 0;
4123 aSign = extractFloat32Sign( a );
4124 bSign = extractFloat32Sign( b );
4125 av = float32_val(a);
4126 bv = float32_val(b);
4127 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
4128 return ( av != bv ) && ( aSign ^ ( av < bv ) );
4132 /*----------------------------------------------------------------------------
4133 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
4134 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
4135 | comparison is performed according to the IEC/IEEE Standard for Binary
4136 | Floating-Point Arithmetic.
4137 *----------------------------------------------------------------------------*/
4139 int float32_unordered_quiet(float32 a, float32 b, float_status *status)
4141 a = float32_squash_input_denormal(a, status);
4142 b = float32_squash_input_denormal(b, status);
4144 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4145 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4147 if (float32_is_signaling_nan(a, status)
4148 || float32_is_signaling_nan(b, status)) {
4149 float_raise(float_flag_invalid, status);
4151 return 1;
4153 return 0;
4156 /*----------------------------------------------------------------------------
4157 | If `a' is denormal and we are in flush-to-zero mode then set the
4158 | input-denormal exception and return zero. Otherwise just return the value.
4159 *----------------------------------------------------------------------------*/
4160 float16 float16_squash_input_denormal(float16 a, float_status *status)
4162 if (status->flush_inputs_to_zero) {
4163 if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) {
4164 float_raise(float_flag_input_denormal, status);
4165 return make_float16(float16_val(a) & 0x8000);
4168 return a;
4171 /*----------------------------------------------------------------------------
4172 | Returns the result of converting the double-precision floating-point value
4173 | `a' to the extended double-precision floating-point format. The conversion
4174 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4175 | Arithmetic.
4176 *----------------------------------------------------------------------------*/
4178 floatx80 float64_to_floatx80(float64 a, float_status *status)
4180 flag aSign;
4181 int aExp;
4182 uint64_t aSig;
4184 a = float64_squash_input_denormal(a, status);
4185 aSig = extractFloat64Frac( a );
4186 aExp = extractFloat64Exp( a );
4187 aSign = extractFloat64Sign( a );
4188 if ( aExp == 0x7FF ) {
4189 if (aSig) {
4190 return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
4192 return packFloatx80(aSign,
4193 floatx80_infinity_high,
4194 floatx80_infinity_low);
4196 if ( aExp == 0 ) {
4197 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4198 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4200 return
4201 packFloatx80(
4202 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
4206 /*----------------------------------------------------------------------------
4207 | Returns the result of converting the double-precision floating-point value
4208 | `a' to the quadruple-precision floating-point format. The conversion is
4209 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4210 | Arithmetic.
4211 *----------------------------------------------------------------------------*/
4213 float128 float64_to_float128(float64 a, float_status *status)
4215 flag aSign;
4216 int aExp;
4217 uint64_t aSig, zSig0, zSig1;
4219 a = float64_squash_input_denormal(a, status);
4220 aSig = extractFloat64Frac( a );
4221 aExp = extractFloat64Exp( a );
4222 aSign = extractFloat64Sign( a );
4223 if ( aExp == 0x7FF ) {
4224 if (aSig) {
4225 return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
4227 return packFloat128( aSign, 0x7FFF, 0, 0 );
4229 if ( aExp == 0 ) {
4230 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4231 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4232 --aExp;
4234 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
4235 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
4240 /*----------------------------------------------------------------------------
4241 | Returns the remainder of the double-precision floating-point value `a'
4242 | with respect to the corresponding value `b'. The operation is performed
4243 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4244 *----------------------------------------------------------------------------*/
4246 float64 float64_rem(float64 a, float64 b, float_status *status)
4248 flag aSign, zSign;
4249 int aExp, bExp, expDiff;
4250 uint64_t aSig, bSig;
4251 uint64_t q, alternateASig;
4252 int64_t sigMean;
4254 a = float64_squash_input_denormal(a, status);
4255 b = float64_squash_input_denormal(b, status);
4256 aSig = extractFloat64Frac( a );
4257 aExp = extractFloat64Exp( a );
4258 aSign = extractFloat64Sign( a );
4259 bSig = extractFloat64Frac( b );
4260 bExp = extractFloat64Exp( b );
4261 if ( aExp == 0x7FF ) {
4262 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4263 return propagateFloat64NaN(a, b, status);
4265 float_raise(float_flag_invalid, status);
4266 return float64_default_nan(status);
4268 if ( bExp == 0x7FF ) {
4269 if (bSig) {
4270 return propagateFloat64NaN(a, b, status);
4272 return a;
4274 if ( bExp == 0 ) {
4275 if ( bSig == 0 ) {
4276 float_raise(float_flag_invalid, status);
4277 return float64_default_nan(status);
4279 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4281 if ( aExp == 0 ) {
4282 if ( aSig == 0 ) return a;
4283 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4285 expDiff = aExp - bExp;
4286 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
4287 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4288 if ( expDiff < 0 ) {
4289 if ( expDiff < -1 ) return a;
4290 aSig >>= 1;
4292 q = ( bSig <= aSig );
4293 if ( q ) aSig -= bSig;
4294 expDiff -= 64;
4295 while ( 0 < expDiff ) {
4296 q = estimateDiv128To64( aSig, 0, bSig );
4297 q = ( 2 < q ) ? q - 2 : 0;
4298 aSig = - ( ( bSig>>2 ) * q );
4299 expDiff -= 62;
4301 expDiff += 64;
4302 if ( 0 < expDiff ) {
4303 q = estimateDiv128To64( aSig, 0, bSig );
4304 q = ( 2 < q ) ? q - 2 : 0;
4305 q >>= 64 - expDiff;
4306 bSig >>= 2;
4307 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4309 else {
4310 aSig >>= 2;
4311 bSig >>= 2;
4313 do {
4314 alternateASig = aSig;
4315 ++q;
4316 aSig -= bSig;
4317 } while ( 0 <= (int64_t) aSig );
4318 sigMean = aSig + alternateASig;
4319 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4320 aSig = alternateASig;
4322 zSign = ( (int64_t) aSig < 0 );
4323 if ( zSign ) aSig = - aSig;
4324 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
4328 /*----------------------------------------------------------------------------
4329 | Returns the binary log of the double-precision floating-point value `a'.
4330 | The operation is performed according to the IEC/IEEE Standard for Binary
4331 | Floating-Point Arithmetic.
4332 *----------------------------------------------------------------------------*/
4333 float64 float64_log2(float64 a, float_status *status)
4335 flag aSign, zSign;
4336 int aExp;
4337 uint64_t aSig, aSig0, aSig1, zSig, i;
4338 a = float64_squash_input_denormal(a, status);
4340 aSig = extractFloat64Frac( a );
4341 aExp = extractFloat64Exp( a );
4342 aSign = extractFloat64Sign( a );
4344 if ( aExp == 0 ) {
4345 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4346 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4348 if ( aSign ) {
4349 float_raise(float_flag_invalid, status);
4350 return float64_default_nan(status);
4352 if ( aExp == 0x7FF ) {
4353 if (aSig) {
4354 return propagateFloat64NaN(a, float64_zero, status);
4356 return a;
4359 aExp -= 0x3FF;
4360 aSig |= LIT64( 0x0010000000000000 );
4361 zSign = aExp < 0;
4362 zSig = (uint64_t)aExp << 52;
4363 for (i = 1LL << 51; i > 0; i >>= 1) {
4364 mul64To128( aSig, aSig, &aSig0, &aSig1 );
4365 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4366 if ( aSig & LIT64( 0x0020000000000000 ) ) {
4367 aSig >>= 1;
4368 zSig |= i;
4372 if ( zSign )
4373 zSig = -zSig;
4374 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
4377 /*----------------------------------------------------------------------------
4378 | Returns 1 if the double-precision floating-point value `a' is equal to the
4379 | corresponding value `b', and 0 otherwise. The invalid exception is raised
4380 | if either operand is a NaN. Otherwise, the comparison is performed
4381 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4382 *----------------------------------------------------------------------------*/
4384 int float64_eq(float64 a, float64 b, float_status *status)
4386 uint64_t av, bv;
4387 a = float64_squash_input_denormal(a, status);
4388 b = float64_squash_input_denormal(b, status);
4390 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4391 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4393 float_raise(float_flag_invalid, status);
4394 return 0;
4396 av = float64_val(a);
4397 bv = float64_val(b);
4398 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4402 /*----------------------------------------------------------------------------
4403 | Returns 1 if the double-precision floating-point value `a' is less than or
4404 | equal to the corresponding value `b', and 0 otherwise. The invalid
4405 | exception is raised if either operand is a NaN. The comparison is performed
4406 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4407 *----------------------------------------------------------------------------*/
4409 int float64_le(float64 a, float64 b, float_status *status)
4411 flag aSign, bSign;
4412 uint64_t av, bv;
4413 a = float64_squash_input_denormal(a, status);
4414 b = float64_squash_input_denormal(b, status);
4416 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4417 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4419 float_raise(float_flag_invalid, status);
4420 return 0;
4422 aSign = extractFloat64Sign( a );
4423 bSign = extractFloat64Sign( b );
4424 av = float64_val(a);
4425 bv = float64_val(b);
4426 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4427 return ( av == bv ) || ( aSign ^ ( av < bv ) );
4431 /*----------------------------------------------------------------------------
4432 | Returns 1 if the double-precision floating-point value `a' is less than
4433 | the corresponding value `b', and 0 otherwise. The invalid exception is
4434 | raised if either operand is a NaN. The comparison is performed according
4435 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4436 *----------------------------------------------------------------------------*/
4438 int float64_lt(float64 a, float64 b, float_status *status)
4440 flag aSign, bSign;
4441 uint64_t av, bv;
4443 a = float64_squash_input_denormal(a, status);
4444 b = float64_squash_input_denormal(b, status);
4445 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4446 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4448 float_raise(float_flag_invalid, status);
4449 return 0;
4451 aSign = extractFloat64Sign( a );
4452 bSign = extractFloat64Sign( b );
4453 av = float64_val(a);
4454 bv = float64_val(b);
4455 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4456 return ( av != bv ) && ( aSign ^ ( av < bv ) );
4460 /*----------------------------------------------------------------------------
4461 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4462 | be compared, and 0 otherwise. The invalid exception is raised if either
4463 | operand is a NaN. The comparison is performed according to the IEC/IEEE
4464 | Standard for Binary Floating-Point Arithmetic.
4465 *----------------------------------------------------------------------------*/
4467 int float64_unordered(float64 a, float64 b, float_status *status)
4469 a = float64_squash_input_denormal(a, status);
4470 b = float64_squash_input_denormal(b, status);
4472 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4473 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4475 float_raise(float_flag_invalid, status);
4476 return 1;
4478 return 0;
4481 /*----------------------------------------------------------------------------
4482 | Returns 1 if the double-precision floating-point value `a' is equal to the
4483 | corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4484 | exception.The comparison is performed according to the IEC/IEEE Standard
4485 | for Binary Floating-Point Arithmetic.
4486 *----------------------------------------------------------------------------*/
4488 int float64_eq_quiet(float64 a, float64 b, float_status *status)
4490 uint64_t av, bv;
4491 a = float64_squash_input_denormal(a, status);
4492 b = float64_squash_input_denormal(b, status);
4494 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4495 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4497 if (float64_is_signaling_nan(a, status)
4498 || float64_is_signaling_nan(b, status)) {
4499 float_raise(float_flag_invalid, status);
4501 return 0;
4503 av = float64_val(a);
4504 bv = float64_val(b);
4505 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4509 /*----------------------------------------------------------------------------
4510 | Returns 1 if the double-precision floating-point value `a' is less than or
4511 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4512 | cause an exception. Otherwise, the comparison is performed according to the
4513 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4514 *----------------------------------------------------------------------------*/
4516 int float64_le_quiet(float64 a, float64 b, float_status *status)
4518 flag aSign, bSign;
4519 uint64_t av, bv;
4520 a = float64_squash_input_denormal(a, status);
4521 b = float64_squash_input_denormal(b, status);
4523 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4524 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4526 if (float64_is_signaling_nan(a, status)
4527 || float64_is_signaling_nan(b, status)) {
4528 float_raise(float_flag_invalid, status);
4530 return 0;
4532 aSign = extractFloat64Sign( a );
4533 bSign = extractFloat64Sign( b );
4534 av = float64_val(a);
4535 bv = float64_val(b);
4536 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4537 return ( av == bv ) || ( aSign ^ ( av < bv ) );
4541 /*----------------------------------------------------------------------------
4542 | Returns 1 if the double-precision floating-point value `a' is less than
4543 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4544 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
4545 | Standard for Binary Floating-Point Arithmetic.
4546 *----------------------------------------------------------------------------*/
4548 int float64_lt_quiet(float64 a, float64 b, float_status *status)
4550 flag aSign, bSign;
4551 uint64_t av, bv;
4552 a = float64_squash_input_denormal(a, status);
4553 b = float64_squash_input_denormal(b, status);
4555 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4556 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4558 if (float64_is_signaling_nan(a, status)
4559 || float64_is_signaling_nan(b, status)) {
4560 float_raise(float_flag_invalid, status);
4562 return 0;
4564 aSign = extractFloat64Sign( a );
4565 bSign = extractFloat64Sign( b );
4566 av = float64_val(a);
4567 bv = float64_val(b);
4568 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4569 return ( av != bv ) && ( aSign ^ ( av < bv ) );
4573 /*----------------------------------------------------------------------------
4574 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4575 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
4576 | comparison is performed according to the IEC/IEEE Standard for Binary
4577 | Floating-Point Arithmetic.
4578 *----------------------------------------------------------------------------*/
4580 int float64_unordered_quiet(float64 a, float64 b, float_status *status)
4582 a = float64_squash_input_denormal(a, status);
4583 b = float64_squash_input_denormal(b, status);
4585 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4586 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4588 if (float64_is_signaling_nan(a, status)
4589 || float64_is_signaling_nan(b, status)) {
4590 float_raise(float_flag_invalid, status);
4592 return 1;
4594 return 0;
4597 /*----------------------------------------------------------------------------
4598 | Returns the result of converting the extended double-precision floating-
4599 | point value `a' to the 32-bit two's complement integer format. The
4600 | conversion is performed according to the IEC/IEEE Standard for Binary
4601 | Floating-Point Arithmetic---which means in particular that the conversion
4602 | is rounded according to the current rounding mode. If `a' is a NaN, the
4603 | largest positive integer is returned. Otherwise, if the conversion
4604 | overflows, the largest integer with the same sign as `a' is returned.
4605 *----------------------------------------------------------------------------*/
4607 int32_t floatx80_to_int32(floatx80 a, float_status *status)
4609 flag aSign;
4610 int32_t aExp, shiftCount;
4611 uint64_t aSig;
4613 if (floatx80_invalid_encoding(a)) {
4614 float_raise(float_flag_invalid, status);
4615 return 1 << 31;
4617 aSig = extractFloatx80Frac( a );
4618 aExp = extractFloatx80Exp( a );
4619 aSign = extractFloatx80Sign( a );
4620 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4621 shiftCount = 0x4037 - aExp;
4622 if ( shiftCount <= 0 ) shiftCount = 1;
4623 shift64RightJamming( aSig, shiftCount, &aSig );
4624 return roundAndPackInt32(aSign, aSig, status);
4628 /*----------------------------------------------------------------------------
4629 | Returns the result of converting the extended double-precision floating-
4630 | point value `a' to the 32-bit two's complement integer format. The
4631 | conversion is performed according to the IEC/IEEE Standard for Binary
4632 | Floating-Point Arithmetic, except that the conversion is always rounded
4633 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4634 | Otherwise, if the conversion overflows, the largest integer with the same
4635 | sign as `a' is returned.
4636 *----------------------------------------------------------------------------*/
4638 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
4640 flag aSign;
4641 int32_t aExp, shiftCount;
4642 uint64_t aSig, savedASig;
4643 int32_t z;
4645 if (floatx80_invalid_encoding(a)) {
4646 float_raise(float_flag_invalid, status);
4647 return 1 << 31;
4649 aSig = extractFloatx80Frac( a );
4650 aExp = extractFloatx80Exp( a );
4651 aSign = extractFloatx80Sign( a );
4652 if ( 0x401E < aExp ) {
4653 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4654 goto invalid;
4656 else if ( aExp < 0x3FFF ) {
4657 if (aExp || aSig) {
4658 status->float_exception_flags |= float_flag_inexact;
4660 return 0;
4662 shiftCount = 0x403E - aExp;
4663 savedASig = aSig;
4664 aSig >>= shiftCount;
4665 z = aSig;
4666 if ( aSign ) z = - z;
4667 if ( ( z < 0 ) ^ aSign ) {
4668 invalid:
4669 float_raise(float_flag_invalid, status);
4670 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4672 if ( ( aSig<<shiftCount ) != savedASig ) {
4673 status->float_exception_flags |= float_flag_inexact;
4675 return z;
4679 /*----------------------------------------------------------------------------
4680 | Returns the result of converting the extended double-precision floating-
4681 | point value `a' to the 64-bit two's complement integer format. The
4682 | conversion is performed according to the IEC/IEEE Standard for Binary
4683 | Floating-Point Arithmetic---which means in particular that the conversion
4684 | is rounded according to the current rounding mode. If `a' is a NaN,
4685 | the largest positive integer is returned. Otherwise, if the conversion
4686 | overflows, the largest integer with the same sign as `a' is returned.
4687 *----------------------------------------------------------------------------*/
4689 int64_t floatx80_to_int64(floatx80 a, float_status *status)
4691 flag aSign;
4692 int32_t aExp, shiftCount;
4693 uint64_t aSig, aSigExtra;
4695 if (floatx80_invalid_encoding(a)) {
4696 float_raise(float_flag_invalid, status);
4697 return 1ULL << 63;
4699 aSig = extractFloatx80Frac( a );
4700 aExp = extractFloatx80Exp( a );
4701 aSign = extractFloatx80Sign( a );
4702 shiftCount = 0x403E - aExp;
4703 if ( shiftCount <= 0 ) {
4704 if ( shiftCount ) {
4705 float_raise(float_flag_invalid, status);
4706 if (!aSign || floatx80_is_any_nan(a)) {
4707 return LIT64( 0x7FFFFFFFFFFFFFFF );
4709 return (int64_t) LIT64( 0x8000000000000000 );
4711 aSigExtra = 0;
4713 else {
4714 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4716 return roundAndPackInt64(aSign, aSig, aSigExtra, status);
4720 /*----------------------------------------------------------------------------
4721 | Returns the result of converting the extended double-precision floating-
4722 | point value `a' to the 64-bit two's complement integer format. The
4723 | conversion is performed according to the IEC/IEEE Standard for Binary
4724 | Floating-Point Arithmetic, except that the conversion is always rounded
4725 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4726 | Otherwise, if the conversion overflows, the largest integer with the same
4727 | sign as `a' is returned.
4728 *----------------------------------------------------------------------------*/
4730 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
4732 flag aSign;
4733 int32_t aExp, shiftCount;
4734 uint64_t aSig;
4735 int64_t z;
4737 if (floatx80_invalid_encoding(a)) {
4738 float_raise(float_flag_invalid, status);
4739 return 1ULL << 63;
4741 aSig = extractFloatx80Frac( a );
4742 aExp = extractFloatx80Exp( a );
4743 aSign = extractFloatx80Sign( a );
4744 shiftCount = aExp - 0x403E;
4745 if ( 0 <= shiftCount ) {
4746 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4747 if ( ( a.high != 0xC03E ) || aSig ) {
4748 float_raise(float_flag_invalid, status);
4749 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4750 return LIT64( 0x7FFFFFFFFFFFFFFF );
4753 return (int64_t) LIT64( 0x8000000000000000 );
4755 else if ( aExp < 0x3FFF ) {
4756 if (aExp | aSig) {
4757 status->float_exception_flags |= float_flag_inexact;
4759 return 0;
4761 z = aSig>>( - shiftCount );
4762 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4763 status->float_exception_flags |= float_flag_inexact;
4765 if ( aSign ) z = - z;
4766 return z;
4770 /*----------------------------------------------------------------------------
4771 | Returns the result of converting the extended double-precision floating-
4772 | point value `a' to the single-precision floating-point format. The
4773 | conversion is performed according to the IEC/IEEE Standard for Binary
4774 | Floating-Point Arithmetic.
4775 *----------------------------------------------------------------------------*/
4777 float32 floatx80_to_float32(floatx80 a, float_status *status)
4779 flag aSign;
4780 int32_t aExp;
4781 uint64_t aSig;
4783 if (floatx80_invalid_encoding(a)) {
4784 float_raise(float_flag_invalid, status);
4785 return float32_default_nan(status);
4787 aSig = extractFloatx80Frac( a );
4788 aExp = extractFloatx80Exp( a );
4789 aSign = extractFloatx80Sign( a );
4790 if ( aExp == 0x7FFF ) {
4791 if ( (uint64_t) ( aSig<<1 ) ) {
4792 return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
4794 return packFloat32( aSign, 0xFF, 0 );
4796 shift64RightJamming( aSig, 33, &aSig );
4797 if ( aExp || aSig ) aExp -= 0x3F81;
4798 return roundAndPackFloat32(aSign, aExp, aSig, status);
4802 /*----------------------------------------------------------------------------
4803 | Returns the result of converting the extended double-precision floating-
4804 | point value `a' to the double-precision floating-point format. The
4805 | conversion is performed according to the IEC/IEEE Standard for Binary
4806 | Floating-Point Arithmetic.
4807 *----------------------------------------------------------------------------*/
4809 float64 floatx80_to_float64(floatx80 a, float_status *status)
4811 flag aSign;
4812 int32_t aExp;
4813 uint64_t aSig, zSig;
4815 if (floatx80_invalid_encoding(a)) {
4816 float_raise(float_flag_invalid, status);
4817 return float64_default_nan(status);
4819 aSig = extractFloatx80Frac( a );
4820 aExp = extractFloatx80Exp( a );
4821 aSign = extractFloatx80Sign( a );
4822 if ( aExp == 0x7FFF ) {
4823 if ( (uint64_t) ( aSig<<1 ) ) {
4824 return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
4826 return packFloat64( aSign, 0x7FF, 0 );
4828 shift64RightJamming( aSig, 1, &zSig );
4829 if ( aExp || aSig ) aExp -= 0x3C01;
4830 return roundAndPackFloat64(aSign, aExp, zSig, status);
4834 /*----------------------------------------------------------------------------
4835 | Returns the result of converting the extended double-precision floating-
4836 | point value `a' to the quadruple-precision floating-point format. The
4837 | conversion is performed according to the IEC/IEEE Standard for Binary
4838 | Floating-Point Arithmetic.
4839 *----------------------------------------------------------------------------*/
4841 float128 floatx80_to_float128(floatx80 a, float_status *status)
4843 flag aSign;
4844 int aExp;
4845 uint64_t aSig, zSig0, zSig1;
4847 if (floatx80_invalid_encoding(a)) {
4848 float_raise(float_flag_invalid, status);
4849 return float128_default_nan(status);
4851 aSig = extractFloatx80Frac( a );
4852 aExp = extractFloatx80Exp( a );
4853 aSign = extractFloatx80Sign( a );
4854 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4855 return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
4857 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4858 return packFloat128( aSign, aExp, zSig0, zSig1 );
4862 /*----------------------------------------------------------------------------
4863 | Rounds the extended double-precision floating-point value `a'
4864 | to the precision provided by floatx80_rounding_precision and returns the
4865 | result as an extended double-precision floating-point value.
4866 | The operation is performed according to the IEC/IEEE Standard for Binary
4867 | Floating-Point Arithmetic.
4868 *----------------------------------------------------------------------------*/
4870 floatx80 floatx80_round(floatx80 a, float_status *status)
4872 return roundAndPackFloatx80(status->floatx80_rounding_precision,
4873 extractFloatx80Sign(a),
4874 extractFloatx80Exp(a),
4875 extractFloatx80Frac(a), 0, status);
4878 /*----------------------------------------------------------------------------
4879 | Rounds the extended double-precision floating-point value `a' to an integer,
4880 | and returns the result as an extended quadruple-precision floating-point
4881 | value. The operation is performed according to the IEC/IEEE Standard for
4882 | Binary Floating-Point Arithmetic.
4883 *----------------------------------------------------------------------------*/
4885 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
4887 flag aSign;
4888 int32_t aExp;
4889 uint64_t lastBitMask, roundBitsMask;
4890 floatx80 z;
4892 if (floatx80_invalid_encoding(a)) {
4893 float_raise(float_flag_invalid, status);
4894 return floatx80_default_nan(status);
4896 aExp = extractFloatx80Exp( a );
4897 if ( 0x403E <= aExp ) {
4898 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4899 return propagateFloatx80NaN(a, a, status);
4901 return a;
4903 if ( aExp < 0x3FFF ) {
4904 if ( ( aExp == 0 )
4905 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4906 return a;
4908 status->float_exception_flags |= float_flag_inexact;
4909 aSign = extractFloatx80Sign( a );
4910 switch (status->float_rounding_mode) {
4911 case float_round_nearest_even:
4912 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4914 return
4915 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4917 break;
4918 case float_round_ties_away:
4919 if (aExp == 0x3FFE) {
4920 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
4922 break;
4923 case float_round_down:
4924 return
4925 aSign ?
4926 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4927 : packFloatx80( 0, 0, 0 );
4928 case float_round_up:
4929 return
4930 aSign ? packFloatx80( 1, 0, 0 )
4931 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4933 return packFloatx80( aSign, 0, 0 );
4935 lastBitMask = 1;
4936 lastBitMask <<= 0x403E - aExp;
4937 roundBitsMask = lastBitMask - 1;
4938 z = a;
4939 switch (status->float_rounding_mode) {
4940 case float_round_nearest_even:
4941 z.low += lastBitMask>>1;
4942 if ((z.low & roundBitsMask) == 0) {
4943 z.low &= ~lastBitMask;
4945 break;
4946 case float_round_ties_away:
4947 z.low += lastBitMask >> 1;
4948 break;
4949 case float_round_to_zero:
4950 break;
4951 case float_round_up:
4952 if (!extractFloatx80Sign(z)) {
4953 z.low += roundBitsMask;
4955 break;
4956 case float_round_down:
4957 if (extractFloatx80Sign(z)) {
4958 z.low += roundBitsMask;
4960 break;
4961 default:
4962 abort();
4964 z.low &= ~ roundBitsMask;
4965 if ( z.low == 0 ) {
4966 ++z.high;
4967 z.low = LIT64( 0x8000000000000000 );
4969 if (z.low != a.low) {
4970 status->float_exception_flags |= float_flag_inexact;
4972 return z;
4976 /*----------------------------------------------------------------------------
4977 | Returns the result of adding the absolute values of the extended double-
4978 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
4979 | negated before being returned. `zSign' is ignored if the result is a NaN.
4980 | The addition is performed according to the IEC/IEEE Standard for Binary
4981 | Floating-Point Arithmetic.
4982 *----------------------------------------------------------------------------*/
4984 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4985 float_status *status)
4987 int32_t aExp, bExp, zExp;
4988 uint64_t aSig, bSig, zSig0, zSig1;
4989 int32_t expDiff;
4991 aSig = extractFloatx80Frac( a );
4992 aExp = extractFloatx80Exp( a );
4993 bSig = extractFloatx80Frac( b );
4994 bExp = extractFloatx80Exp( b );
4995 expDiff = aExp - bExp;
4996 if ( 0 < expDiff ) {
4997 if ( aExp == 0x7FFF ) {
4998 if ((uint64_t)(aSig << 1)) {
4999 return propagateFloatx80NaN(a, b, status);
5001 return a;
5003 if ( bExp == 0 ) --expDiff;
5004 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5005 zExp = aExp;
5007 else if ( expDiff < 0 ) {
5008 if ( bExp == 0x7FFF ) {
5009 if ((uint64_t)(bSig << 1)) {
5010 return propagateFloatx80NaN(a, b, status);
5012 return packFloatx80(zSign,
5013 floatx80_infinity_high,
5014 floatx80_infinity_low);
5016 if ( aExp == 0 ) ++expDiff;
5017 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5018 zExp = bExp;
5020 else {
5021 if ( aExp == 0x7FFF ) {
5022 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5023 return propagateFloatx80NaN(a, b, status);
5025 return a;
5027 zSig1 = 0;
5028 zSig0 = aSig + bSig;
5029 if ( aExp == 0 ) {
5030 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
5031 goto roundAndPack;
5033 zExp = aExp;
5034 goto shiftRight1;
5036 zSig0 = aSig + bSig;
5037 if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
5038 shiftRight1:
5039 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
5040 zSig0 |= LIT64( 0x8000000000000000 );
5041 ++zExp;
5042 roundAndPack:
5043 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5044 zSign, zExp, zSig0, zSig1, status);
5047 /*----------------------------------------------------------------------------
5048 | Returns the result of subtracting the absolute values of the extended
5049 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
5050 | difference is negated before being returned. `zSign' is ignored if the
5051 | result is a NaN. The subtraction is performed according to the IEC/IEEE
5052 | Standard for Binary Floating-Point Arithmetic.
5053 *----------------------------------------------------------------------------*/
5055 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5056 float_status *status)
5058 int32_t aExp, bExp, zExp;
5059 uint64_t aSig, bSig, zSig0, zSig1;
5060 int32_t expDiff;
5062 aSig = extractFloatx80Frac( a );
5063 aExp = extractFloatx80Exp( a );
5064 bSig = extractFloatx80Frac( b );
5065 bExp = extractFloatx80Exp( b );
5066 expDiff = aExp - bExp;
5067 if ( 0 < expDiff ) goto aExpBigger;
5068 if ( expDiff < 0 ) goto bExpBigger;
5069 if ( aExp == 0x7FFF ) {
5070 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5071 return propagateFloatx80NaN(a, b, status);
5073 float_raise(float_flag_invalid, status);
5074 return floatx80_default_nan(status);
5076 if ( aExp == 0 ) {
5077 aExp = 1;
5078 bExp = 1;
5080 zSig1 = 0;
5081 if ( bSig < aSig ) goto aBigger;
5082 if ( aSig < bSig ) goto bBigger;
5083 return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
5084 bExpBigger:
5085 if ( bExp == 0x7FFF ) {
5086 if ((uint64_t)(bSig << 1)) {
5087 return propagateFloatx80NaN(a, b, status);
5089 return packFloatx80(zSign ^ 1, floatx80_infinity_high,
5090 floatx80_infinity_low);
5092 if ( aExp == 0 ) ++expDiff;
5093 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5094 bBigger:
5095 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
5096 zExp = bExp;
5097 zSign ^= 1;
5098 goto normalizeRoundAndPack;
5099 aExpBigger:
5100 if ( aExp == 0x7FFF ) {
5101 if ((uint64_t)(aSig << 1)) {
5102 return propagateFloatx80NaN(a, b, status);
5104 return a;
5106 if ( bExp == 0 ) --expDiff;
5107 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5108 aBigger:
5109 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
5110 zExp = aExp;
5111 normalizeRoundAndPack:
5112 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
5113 zSign, zExp, zSig0, zSig1, status);
5116 /*----------------------------------------------------------------------------
5117 | Returns the result of adding the extended double-precision floating-point
5118 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5119 | Standard for Binary Floating-Point Arithmetic.
5120 *----------------------------------------------------------------------------*/
5122 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
5124 flag aSign, bSign;
5126 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5127 float_raise(float_flag_invalid, status);
5128 return floatx80_default_nan(status);
5130 aSign = extractFloatx80Sign( a );
5131 bSign = extractFloatx80Sign( b );
5132 if ( aSign == bSign ) {
5133 return addFloatx80Sigs(a, b, aSign, status);
5135 else {
5136 return subFloatx80Sigs(a, b, aSign, status);
5141 /*----------------------------------------------------------------------------
5142 | Returns the result of subtracting the extended double-precision floating-
5143 | point values `a' and `b'. The operation is performed according to the
5144 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5145 *----------------------------------------------------------------------------*/
5147 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5149 flag aSign, bSign;
5151 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5152 float_raise(float_flag_invalid, status);
5153 return floatx80_default_nan(status);
5155 aSign = extractFloatx80Sign( a );
5156 bSign = extractFloatx80Sign( b );
5157 if ( aSign == bSign ) {
5158 return subFloatx80Sigs(a, b, aSign, status);
5160 else {
5161 return addFloatx80Sigs(a, b, aSign, status);
5166 /*----------------------------------------------------------------------------
5167 | Returns the result of multiplying the extended double-precision floating-
5168 | point values `a' and `b'. The operation is performed according to the
5169 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5170 *----------------------------------------------------------------------------*/
5172 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
5174 flag aSign, bSign, zSign;
5175 int32_t aExp, bExp, zExp;
5176 uint64_t aSig, bSig, zSig0, zSig1;
5178 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5179 float_raise(float_flag_invalid, status);
5180 return floatx80_default_nan(status);
5182 aSig = extractFloatx80Frac( a );
5183 aExp = extractFloatx80Exp( a );
5184 aSign = extractFloatx80Sign( a );
5185 bSig = extractFloatx80Frac( b );
5186 bExp = extractFloatx80Exp( b );
5187 bSign = extractFloatx80Sign( b );
5188 zSign = aSign ^ bSign;
5189 if ( aExp == 0x7FFF ) {
5190 if ( (uint64_t) ( aSig<<1 )
5191 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5192 return propagateFloatx80NaN(a, b, status);
5194 if ( ( bExp | bSig ) == 0 ) goto invalid;
5195 return packFloatx80(zSign, floatx80_infinity_high,
5196 floatx80_infinity_low);
5198 if ( bExp == 0x7FFF ) {
5199 if ((uint64_t)(bSig << 1)) {
5200 return propagateFloatx80NaN(a, b, status);
5202 if ( ( aExp | aSig ) == 0 ) {
5203 invalid:
5204 float_raise(float_flag_invalid, status);
5205 return floatx80_default_nan(status);
5207 return packFloatx80(zSign, floatx80_infinity_high,
5208 floatx80_infinity_low);
5210 if ( aExp == 0 ) {
5211 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5212 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5214 if ( bExp == 0 ) {
5215 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5216 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5218 zExp = aExp + bExp - 0x3FFE;
5219 mul64To128( aSig, bSig, &zSig0, &zSig1 );
5220 if ( 0 < (int64_t) zSig0 ) {
5221 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5222 --zExp;
5224 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5225 zSign, zExp, zSig0, zSig1, status);
5228 /*----------------------------------------------------------------------------
5229 | Returns the result of dividing the extended double-precision floating-point
5230 | value `a' by the corresponding value `b'. The operation is performed
5231 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5232 *----------------------------------------------------------------------------*/
5234 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
5236 flag aSign, bSign, zSign;
5237 int32_t aExp, bExp, zExp;
5238 uint64_t aSig, bSig, zSig0, zSig1;
5239 uint64_t rem0, rem1, rem2, term0, term1, term2;
5241 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5242 float_raise(float_flag_invalid, status);
5243 return floatx80_default_nan(status);
5245 aSig = extractFloatx80Frac( a );
5246 aExp = extractFloatx80Exp( a );
5247 aSign = extractFloatx80Sign( a );
5248 bSig = extractFloatx80Frac( b );
5249 bExp = extractFloatx80Exp( b );
5250 bSign = extractFloatx80Sign( b );
5251 zSign = aSign ^ bSign;
5252 if ( aExp == 0x7FFF ) {
5253 if ((uint64_t)(aSig << 1)) {
5254 return propagateFloatx80NaN(a, b, status);
5256 if ( bExp == 0x7FFF ) {
5257 if ((uint64_t)(bSig << 1)) {
5258 return propagateFloatx80NaN(a, b, status);
5260 goto invalid;
5262 return packFloatx80(zSign, floatx80_infinity_high,
5263 floatx80_infinity_low);
5265 if ( bExp == 0x7FFF ) {
5266 if ((uint64_t)(bSig << 1)) {
5267 return propagateFloatx80NaN(a, b, status);
5269 return packFloatx80( zSign, 0, 0 );
5271 if ( bExp == 0 ) {
5272 if ( bSig == 0 ) {
5273 if ( ( aExp | aSig ) == 0 ) {
5274 invalid:
5275 float_raise(float_flag_invalid, status);
5276 return floatx80_default_nan(status);
5278 float_raise(float_flag_divbyzero, status);
5279 return packFloatx80(zSign, floatx80_infinity_high,
5280 floatx80_infinity_low);
5282 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5284 if ( aExp == 0 ) {
5285 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5286 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5288 zExp = aExp - bExp + 0x3FFE;
5289 rem1 = 0;
5290 if ( bSig <= aSig ) {
5291 shift128Right( aSig, 0, 1, &aSig, &rem1 );
5292 ++zExp;
5294 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5295 mul64To128( bSig, zSig0, &term0, &term1 );
5296 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
5297 while ( (int64_t) rem0 < 0 ) {
5298 --zSig0;
5299 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5301 zSig1 = estimateDiv128To64( rem1, 0, bSig );
5302 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
5303 mul64To128( bSig, zSig1, &term1, &term2 );
5304 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5305 while ( (int64_t) rem1 < 0 ) {
5306 --zSig1;
5307 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5309 zSig1 |= ( ( rem1 | rem2 ) != 0 );
5311 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5312 zSign, zExp, zSig0, zSig1, status);
5315 /*----------------------------------------------------------------------------
5316 | Returns the remainder of the extended double-precision floating-point value
5317 | `a' with respect to the corresponding value `b'. The operation is performed
5318 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5319 *----------------------------------------------------------------------------*/
5321 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
5323 flag aSign, zSign;
5324 int32_t aExp, bExp, expDiff;
5325 uint64_t aSig0, aSig1, bSig;
5326 uint64_t q, term0, term1, alternateASig0, alternateASig1;
5328 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5329 float_raise(float_flag_invalid, status);
5330 return floatx80_default_nan(status);
5332 aSig0 = extractFloatx80Frac( a );
5333 aExp = extractFloatx80Exp( a );
5334 aSign = extractFloatx80Sign( a );
5335 bSig = extractFloatx80Frac( b );
5336 bExp = extractFloatx80Exp( b );
5337 if ( aExp == 0x7FFF ) {
5338 if ( (uint64_t) ( aSig0<<1 )
5339 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5340 return propagateFloatx80NaN(a, b, status);
5342 goto invalid;
5344 if ( bExp == 0x7FFF ) {
5345 if ((uint64_t)(bSig << 1)) {
5346 return propagateFloatx80NaN(a, b, status);
5348 return a;
5350 if ( bExp == 0 ) {
5351 if ( bSig == 0 ) {
5352 invalid:
5353 float_raise(float_flag_invalid, status);
5354 return floatx80_default_nan(status);
5356 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5358 if ( aExp == 0 ) {
5359 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5360 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5362 bSig |= LIT64( 0x8000000000000000 );
5363 zSign = aSign;
5364 expDiff = aExp - bExp;
5365 aSig1 = 0;
5366 if ( expDiff < 0 ) {
5367 if ( expDiff < -1 ) return a;
5368 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5369 expDiff = 0;
5371 q = ( bSig <= aSig0 );
5372 if ( q ) aSig0 -= bSig;
5373 expDiff -= 64;
5374 while ( 0 < expDiff ) {
5375 q = estimateDiv128To64( aSig0, aSig1, bSig );
5376 q = ( 2 < q ) ? q - 2 : 0;
5377 mul64To128( bSig, q, &term0, &term1 );
5378 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5379 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5380 expDiff -= 62;
5382 expDiff += 64;
5383 if ( 0 < expDiff ) {
5384 q = estimateDiv128To64( aSig0, aSig1, bSig );
5385 q = ( 2 < q ) ? q - 2 : 0;
5386 q >>= 64 - expDiff;
5387 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5388 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5389 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5390 while ( le128( term0, term1, aSig0, aSig1 ) ) {
5391 ++q;
5392 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5395 else {
5396 term1 = 0;
5397 term0 = bSig;
5399 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5400 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5401 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5402 && ( q & 1 ) )
5404 aSig0 = alternateASig0;
5405 aSig1 = alternateASig1;
5406 zSign = ! zSign;
5408 return
5409 normalizeRoundAndPackFloatx80(
5410 80, zSign, bExp + expDiff, aSig0, aSig1, status);
5414 /*----------------------------------------------------------------------------
5415 | Returns the square root of the extended double-precision floating-point
5416 | value `a'. The operation is performed according to the IEC/IEEE Standard
5417 | for Binary Floating-Point Arithmetic.
5418 *----------------------------------------------------------------------------*/
5420 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
5422 flag aSign;
5423 int32_t aExp, zExp;
5424 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5425 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5427 if (floatx80_invalid_encoding(a)) {
5428 float_raise(float_flag_invalid, status);
5429 return floatx80_default_nan(status);
5431 aSig0 = extractFloatx80Frac( a );
5432 aExp = extractFloatx80Exp( a );
5433 aSign = extractFloatx80Sign( a );
5434 if ( aExp == 0x7FFF ) {
5435 if ((uint64_t)(aSig0 << 1)) {
5436 return propagateFloatx80NaN(a, a, status);
5438 if ( ! aSign ) return a;
5439 goto invalid;
5441 if ( aSign ) {
5442 if ( ( aExp | aSig0 ) == 0 ) return a;
5443 invalid:
5444 float_raise(float_flag_invalid, status);
5445 return floatx80_default_nan(status);
5447 if ( aExp == 0 ) {
5448 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5449 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5451 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5452 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5453 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5454 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5455 doubleZSig0 = zSig0<<1;
5456 mul64To128( zSig0, zSig0, &term0, &term1 );
5457 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5458 while ( (int64_t) rem0 < 0 ) {
5459 --zSig0;
5460 doubleZSig0 -= 2;
5461 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5463 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5464 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5465 if ( zSig1 == 0 ) zSig1 = 1;
5466 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5467 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5468 mul64To128( zSig1, zSig1, &term2, &term3 );
5469 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5470 while ( (int64_t) rem1 < 0 ) {
5471 --zSig1;
5472 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5473 term3 |= 1;
5474 term2 |= doubleZSig0;
5475 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5477 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5479 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5480 zSig0 |= doubleZSig0;
5481 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5482 0, zExp, zSig0, zSig1, status);
5485 /*----------------------------------------------------------------------------
5486 | Returns 1 if the extended double-precision floating-point value `a' is equal
5487 | to the corresponding value `b', and 0 otherwise. The invalid exception is
5488 | raised if either operand is a NaN. Otherwise, the comparison is performed
5489 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5490 *----------------------------------------------------------------------------*/
5492 int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
5495 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5496 || (extractFloatx80Exp(a) == 0x7FFF
5497 && (uint64_t) (extractFloatx80Frac(a) << 1))
5498 || (extractFloatx80Exp(b) == 0x7FFF
5499 && (uint64_t) (extractFloatx80Frac(b) << 1))
5501 float_raise(float_flag_invalid, status);
5502 return 0;
5504 return
5505 ( a.low == b.low )
5506 && ( ( a.high == b.high )
5507 || ( ( a.low == 0 )
5508 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5513 /*----------------------------------------------------------------------------
5514 | Returns 1 if the extended double-precision floating-point value `a' is
5515 | less than or equal to the corresponding value `b', and 0 otherwise. The
5516 | invalid exception is raised if either operand is a NaN. The comparison is
5517 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5518 | Arithmetic.
5519 *----------------------------------------------------------------------------*/
5521 int floatx80_le(floatx80 a, floatx80 b, float_status *status)
5523 flag aSign, bSign;
5525 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5526 || (extractFloatx80Exp(a) == 0x7FFF
5527 && (uint64_t) (extractFloatx80Frac(a) << 1))
5528 || (extractFloatx80Exp(b) == 0x7FFF
5529 && (uint64_t) (extractFloatx80Frac(b) << 1))
5531 float_raise(float_flag_invalid, status);
5532 return 0;
5534 aSign = extractFloatx80Sign( a );
5535 bSign = extractFloatx80Sign( b );
5536 if ( aSign != bSign ) {
5537 return
5538 aSign
5539 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5540 == 0 );
5542 return
5543 aSign ? le128( b.high, b.low, a.high, a.low )
5544 : le128( a.high, a.low, b.high, b.low );
5548 /*----------------------------------------------------------------------------
5549 | Returns 1 if the extended double-precision floating-point value `a' is
5550 | less than the corresponding value `b', and 0 otherwise. The invalid
5551 | exception is raised if either operand is a NaN. The comparison is performed
5552 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5553 *----------------------------------------------------------------------------*/
5555 int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
5557 flag aSign, bSign;
5559 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5560 || (extractFloatx80Exp(a) == 0x7FFF
5561 && (uint64_t) (extractFloatx80Frac(a) << 1))
5562 || (extractFloatx80Exp(b) == 0x7FFF
5563 && (uint64_t) (extractFloatx80Frac(b) << 1))
5565 float_raise(float_flag_invalid, status);
5566 return 0;
5568 aSign = extractFloatx80Sign( a );
5569 bSign = extractFloatx80Sign( b );
5570 if ( aSign != bSign ) {
5571 return
5572 aSign
5573 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5574 != 0 );
5576 return
5577 aSign ? lt128( b.high, b.low, a.high, a.low )
5578 : lt128( a.high, a.low, b.high, b.low );
5582 /*----------------------------------------------------------------------------
5583 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5584 | cannot be compared, and 0 otherwise. The invalid exception is raised if
5585 | either operand is a NaN. The comparison is performed according to the
5586 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5587 *----------------------------------------------------------------------------*/
5588 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
5590 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5591 || (extractFloatx80Exp(a) == 0x7FFF
5592 && (uint64_t) (extractFloatx80Frac(a) << 1))
5593 || (extractFloatx80Exp(b) == 0x7FFF
5594 && (uint64_t) (extractFloatx80Frac(b) << 1))
5596 float_raise(float_flag_invalid, status);
5597 return 1;
5599 return 0;
5602 /*----------------------------------------------------------------------------
5603 | Returns 1 if the extended double-precision floating-point value `a' is
5604 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5605 | cause an exception. The comparison is performed according to the IEC/IEEE
5606 | Standard for Binary Floating-Point Arithmetic.
5607 *----------------------------------------------------------------------------*/
5609 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
5612 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5613 float_raise(float_flag_invalid, status);
5614 return 0;
5616 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5617 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5618 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5619 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5621 if (floatx80_is_signaling_nan(a, status)
5622 || floatx80_is_signaling_nan(b, status)) {
5623 float_raise(float_flag_invalid, status);
5625 return 0;
5627 return
5628 ( a.low == b.low )
5629 && ( ( a.high == b.high )
5630 || ( ( a.low == 0 )
5631 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5636 /*----------------------------------------------------------------------------
5637 | Returns 1 if the extended double-precision floating-point value `a' is less
5638 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
5639 | do not cause an exception. Otherwise, the comparison is performed according
5640 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5641 *----------------------------------------------------------------------------*/
5643 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
5645 flag aSign, bSign;
5647 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5648 float_raise(float_flag_invalid, status);
5649 return 0;
5651 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5652 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5653 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5654 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5656 if (floatx80_is_signaling_nan(a, status)
5657 || floatx80_is_signaling_nan(b, status)) {
5658 float_raise(float_flag_invalid, status);
5660 return 0;
5662 aSign = extractFloatx80Sign( a );
5663 bSign = extractFloatx80Sign( b );
5664 if ( aSign != bSign ) {
5665 return
5666 aSign
5667 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5668 == 0 );
5670 return
5671 aSign ? le128( b.high, b.low, a.high, a.low )
5672 : le128( a.high, a.low, b.high, b.low );
5676 /*----------------------------------------------------------------------------
5677 | Returns 1 if the extended double-precision floating-point value `a' is less
5678 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
5679 | an exception. Otherwise, the comparison is performed according to the
5680 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5681 *----------------------------------------------------------------------------*/
5683 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
5685 flag aSign, bSign;
5687 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5688 float_raise(float_flag_invalid, status);
5689 return 0;
5691 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5692 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5693 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5694 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5696 if (floatx80_is_signaling_nan(a, status)
5697 || floatx80_is_signaling_nan(b, status)) {
5698 float_raise(float_flag_invalid, status);
5700 return 0;
5702 aSign = extractFloatx80Sign( a );
5703 bSign = extractFloatx80Sign( b );
5704 if ( aSign != bSign ) {
5705 return
5706 aSign
5707 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5708 != 0 );
5710 return
5711 aSign ? lt128( b.high, b.low, a.high, a.low )
5712 : lt128( a.high, a.low, b.high, b.low );
5716 /*----------------------------------------------------------------------------
5717 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5718 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
5719 | The comparison is performed according to the IEC/IEEE Standard for Binary
5720 | Floating-Point Arithmetic.
5721 *----------------------------------------------------------------------------*/
5722 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
5724 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5725 float_raise(float_flag_invalid, status);
5726 return 1;
5728 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5729 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5730 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5731 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5733 if (floatx80_is_signaling_nan(a, status)
5734 || floatx80_is_signaling_nan(b, status)) {
5735 float_raise(float_flag_invalid, status);
5737 return 1;
5739 return 0;
5742 /*----------------------------------------------------------------------------
5743 | Returns the result of converting the quadruple-precision floating-point
5744 | value `a' to the 32-bit two's complement integer format. The conversion
5745 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5746 | Arithmetic---which means in particular that the conversion is rounded
5747 | according to the current rounding mode. If `a' is a NaN, the largest
5748 | positive integer is returned. Otherwise, if the conversion overflows, the
5749 | largest integer with the same sign as `a' is returned.
5750 *----------------------------------------------------------------------------*/
5752 int32_t float128_to_int32(float128 a, float_status *status)
5754 flag aSign;
5755 int32_t aExp, shiftCount;
5756 uint64_t aSig0, aSig1;
5758 aSig1 = extractFloat128Frac1( a );
5759 aSig0 = extractFloat128Frac0( a );
5760 aExp = extractFloat128Exp( a );
5761 aSign = extractFloat128Sign( a );
5762 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5763 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5764 aSig0 |= ( aSig1 != 0 );
5765 shiftCount = 0x4028 - aExp;
5766 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5767 return roundAndPackInt32(aSign, aSig0, status);
5771 /*----------------------------------------------------------------------------
5772 | Returns the result of converting the quadruple-precision floating-point
5773 | value `a' to the 32-bit two's complement integer format. The conversion
5774 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5775 | Arithmetic, except that the conversion is always rounded toward zero. If
5776 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
5777 | conversion overflows, the largest integer with the same sign as `a' is
5778 | returned.
5779 *----------------------------------------------------------------------------*/
5781 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
5783 flag aSign;
5784 int32_t aExp, shiftCount;
5785 uint64_t aSig0, aSig1, savedASig;
5786 int32_t z;
5788 aSig1 = extractFloat128Frac1( a );
5789 aSig0 = extractFloat128Frac0( a );
5790 aExp = extractFloat128Exp( a );
5791 aSign = extractFloat128Sign( a );
5792 aSig0 |= ( aSig1 != 0 );
5793 if ( 0x401E < aExp ) {
5794 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5795 goto invalid;
5797 else if ( aExp < 0x3FFF ) {
5798 if (aExp || aSig0) {
5799 status->float_exception_flags |= float_flag_inexact;
5801 return 0;
5803 aSig0 |= LIT64( 0x0001000000000000 );
5804 shiftCount = 0x402F - aExp;
5805 savedASig = aSig0;
5806 aSig0 >>= shiftCount;
5807 z = aSig0;
5808 if ( aSign ) z = - z;
5809 if ( ( z < 0 ) ^ aSign ) {
5810 invalid:
5811 float_raise(float_flag_invalid, status);
5812 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5814 if ( ( aSig0<<shiftCount ) != savedASig ) {
5815 status->float_exception_flags |= float_flag_inexact;
5817 return z;
5821 /*----------------------------------------------------------------------------
5822 | Returns the result of converting the quadruple-precision floating-point
5823 | value `a' to the 64-bit two's complement integer format. The conversion
5824 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5825 | Arithmetic---which means in particular that the conversion is rounded
5826 | according to the current rounding mode. If `a' is a NaN, the largest
5827 | positive integer is returned. Otherwise, if the conversion overflows, the
5828 | largest integer with the same sign as `a' is returned.
5829 *----------------------------------------------------------------------------*/
5831 int64_t float128_to_int64(float128 a, float_status *status)
5833 flag aSign;
5834 int32_t aExp, shiftCount;
5835 uint64_t aSig0, aSig1;
5837 aSig1 = extractFloat128Frac1( a );
5838 aSig0 = extractFloat128Frac0( a );
5839 aExp = extractFloat128Exp( a );
5840 aSign = extractFloat128Sign( a );
5841 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5842 shiftCount = 0x402F - aExp;
5843 if ( shiftCount <= 0 ) {
5844 if ( 0x403E < aExp ) {
5845 float_raise(float_flag_invalid, status);
5846 if ( ! aSign
5847 || ( ( aExp == 0x7FFF )
5848 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5851 return LIT64( 0x7FFFFFFFFFFFFFFF );
5853 return (int64_t) LIT64( 0x8000000000000000 );
5855 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5857 else {
5858 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5860 return roundAndPackInt64(aSign, aSig0, aSig1, status);
5864 /*----------------------------------------------------------------------------
5865 | Returns the result of converting the quadruple-precision floating-point
5866 | value `a' to the 64-bit two's complement integer format. The conversion
5867 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5868 | Arithmetic, except that the conversion is always rounded toward zero.
5869 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
5870 | the conversion overflows, the largest integer with the same sign as `a' is
5871 | returned.
5872 *----------------------------------------------------------------------------*/
5874 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
5876 flag aSign;
5877 int32_t aExp, shiftCount;
5878 uint64_t aSig0, aSig1;
5879 int64_t z;
5881 aSig1 = extractFloat128Frac1( a );
5882 aSig0 = extractFloat128Frac0( a );
5883 aExp = extractFloat128Exp( a );
5884 aSign = extractFloat128Sign( a );
5885 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5886 shiftCount = aExp - 0x402F;
5887 if ( 0 < shiftCount ) {
5888 if ( 0x403E <= aExp ) {
5889 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5890 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
5891 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5892 if (aSig1) {
5893 status->float_exception_flags |= float_flag_inexact;
5896 else {
5897 float_raise(float_flag_invalid, status);
5898 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5899 return LIT64( 0x7FFFFFFFFFFFFFFF );
5902 return (int64_t) LIT64( 0x8000000000000000 );
5904 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
5905 if ( (uint64_t) ( aSig1<<shiftCount ) ) {
5906 status->float_exception_flags |= float_flag_inexact;
5909 else {
5910 if ( aExp < 0x3FFF ) {
5911 if ( aExp | aSig0 | aSig1 ) {
5912 status->float_exception_flags |= float_flag_inexact;
5914 return 0;
5916 z = aSig0>>( - shiftCount );
5917 if ( aSig1
5918 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
5919 status->float_exception_flags |= float_flag_inexact;
5922 if ( aSign ) z = - z;
5923 return z;
5927 /*----------------------------------------------------------------------------
5928 | Returns the result of converting the quadruple-precision floating-point value
5929 | `a' to the 64-bit unsigned integer format. The conversion is
5930 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5931 | Arithmetic---which means in particular that the conversion is rounded
5932 | according to the current rounding mode. If `a' is a NaN, the largest
5933 | positive integer is returned. If the conversion overflows, the
5934 | largest unsigned integer is returned. If 'a' is negative, the value is
5935 | rounded and zero is returned; negative values that do not round to zero
5936 | will raise the inexact exception.
5937 *----------------------------------------------------------------------------*/
5939 uint64_t float128_to_uint64(float128 a, float_status *status)
5941 flag aSign;
5942 int aExp;
5943 int shiftCount;
5944 uint64_t aSig0, aSig1;
5946 aSig0 = extractFloat128Frac0(a);
5947 aSig1 = extractFloat128Frac1(a);
5948 aExp = extractFloat128Exp(a);
5949 aSign = extractFloat128Sign(a);
5950 if (aSign && (aExp > 0x3FFE)) {
5951 float_raise(float_flag_invalid, status);
5952 if (float128_is_any_nan(a)) {
5953 return LIT64(0xFFFFFFFFFFFFFFFF);
5954 } else {
5955 return 0;
5958 if (aExp) {
5959 aSig0 |= LIT64(0x0001000000000000);
5961 shiftCount = 0x402F - aExp;
5962 if (shiftCount <= 0) {
5963 if (0x403E < aExp) {
5964 float_raise(float_flag_invalid, status);
5965 return LIT64(0xFFFFFFFFFFFFFFFF);
5967 shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
5968 } else {
5969 shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
5971 return roundAndPackUint64(aSign, aSig0, aSig1, status);
5974 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
5976 uint64_t v;
5977 signed char current_rounding_mode = status->float_rounding_mode;
5979 set_float_rounding_mode(float_round_to_zero, status);
5980 v = float128_to_uint64(a, status);
5981 set_float_rounding_mode(current_rounding_mode, status);
5983 return v;
5986 /*----------------------------------------------------------------------------
5987 | Returns the result of converting the quadruple-precision floating-point
5988 | value `a' to the 32-bit unsigned integer format. The conversion
5989 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5990 | Arithmetic except that the conversion is always rounded toward zero.
5991 | If `a' is a NaN, the largest positive integer is returned. Otherwise,
5992 | if the conversion overflows, the largest unsigned integer is returned.
5993 | If 'a' is negative, the value is rounded and zero is returned; negative
5994 | values that do not round to zero will raise the inexact exception.
5995 *----------------------------------------------------------------------------*/
5997 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
5999 uint64_t v;
6000 uint32_t res;
6001 int old_exc_flags = get_float_exception_flags(status);
6003 v = float128_to_uint64_round_to_zero(a, status);
6004 if (v > 0xffffffff) {
6005 res = 0xffffffff;
6006 } else {
6007 return v;
6009 set_float_exception_flags(old_exc_flags, status);
6010 float_raise(float_flag_invalid, status);
6011 return res;
6014 /*----------------------------------------------------------------------------
6015 | Returns the result of converting the quadruple-precision floating-point
6016 | value `a' to the single-precision floating-point format. The conversion
6017 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6018 | Arithmetic.
6019 *----------------------------------------------------------------------------*/
6021 float32 float128_to_float32(float128 a, float_status *status)
6023 flag aSign;
6024 int32_t aExp;
6025 uint64_t aSig0, aSig1;
6026 uint32_t zSig;
6028 aSig1 = extractFloat128Frac1( a );
6029 aSig0 = extractFloat128Frac0( a );
6030 aExp = extractFloat128Exp( a );
6031 aSign = extractFloat128Sign( a );
6032 if ( aExp == 0x7FFF ) {
6033 if ( aSig0 | aSig1 ) {
6034 return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
6036 return packFloat32( aSign, 0xFF, 0 );
6038 aSig0 |= ( aSig1 != 0 );
6039 shift64RightJamming( aSig0, 18, &aSig0 );
6040 zSig = aSig0;
6041 if ( aExp || zSig ) {
6042 zSig |= 0x40000000;
6043 aExp -= 0x3F81;
6045 return roundAndPackFloat32(aSign, aExp, zSig, status);
6049 /*----------------------------------------------------------------------------
6050 | Returns the result of converting the quadruple-precision floating-point
6051 | value `a' to the double-precision floating-point format. The conversion
6052 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6053 | Arithmetic.
6054 *----------------------------------------------------------------------------*/
6056 float64 float128_to_float64(float128 a, float_status *status)
6058 flag aSign;
6059 int32_t aExp;
6060 uint64_t aSig0, aSig1;
6062 aSig1 = extractFloat128Frac1( a );
6063 aSig0 = extractFloat128Frac0( a );
6064 aExp = extractFloat128Exp( a );
6065 aSign = extractFloat128Sign( a );
6066 if ( aExp == 0x7FFF ) {
6067 if ( aSig0 | aSig1 ) {
6068 return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
6070 return packFloat64( aSign, 0x7FF, 0 );
6072 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6073 aSig0 |= ( aSig1 != 0 );
6074 if ( aExp || aSig0 ) {
6075 aSig0 |= LIT64( 0x4000000000000000 );
6076 aExp -= 0x3C01;
6078 return roundAndPackFloat64(aSign, aExp, aSig0, status);
6082 /*----------------------------------------------------------------------------
6083 | Returns the result of converting the quadruple-precision floating-point
6084 | value `a' to the extended double-precision floating-point format. The
6085 | conversion is performed according to the IEC/IEEE Standard for Binary
6086 | Floating-Point Arithmetic.
6087 *----------------------------------------------------------------------------*/
6089 floatx80 float128_to_floatx80(float128 a, float_status *status)
6091 flag aSign;
6092 int32_t aExp;
6093 uint64_t aSig0, aSig1;
6095 aSig1 = extractFloat128Frac1( a );
6096 aSig0 = extractFloat128Frac0( a );
6097 aExp = extractFloat128Exp( a );
6098 aSign = extractFloat128Sign( a );
6099 if ( aExp == 0x7FFF ) {
6100 if ( aSig0 | aSig1 ) {
6101 return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
6103 return packFloatx80(aSign, floatx80_infinity_high,
6104 floatx80_infinity_low);
6106 if ( aExp == 0 ) {
6107 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
6108 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6110 else {
6111 aSig0 |= LIT64( 0x0001000000000000 );
6113 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
6114 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
6118 /*----------------------------------------------------------------------------
6119 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6120 | returns the result as a quadruple-precision floating-point value. The
6121 | operation is performed according to the IEC/IEEE Standard for Binary
6122 | Floating-Point Arithmetic.
6123 *----------------------------------------------------------------------------*/
6125 float128 float128_round_to_int(float128 a, float_status *status)
6127 flag aSign;
6128 int32_t aExp;
6129 uint64_t lastBitMask, roundBitsMask;
6130 float128 z;
6132 aExp = extractFloat128Exp( a );
6133 if ( 0x402F <= aExp ) {
6134 if ( 0x406F <= aExp ) {
6135 if ( ( aExp == 0x7FFF )
6136 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6138 return propagateFloat128NaN(a, a, status);
6140 return a;
6142 lastBitMask = 1;
6143 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
6144 roundBitsMask = lastBitMask - 1;
6145 z = a;
6146 switch (status->float_rounding_mode) {
6147 case float_round_nearest_even:
6148 if ( lastBitMask ) {
6149 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6150 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6152 else {
6153 if ( (int64_t) z.low < 0 ) {
6154 ++z.high;
6155 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
6158 break;
6159 case float_round_ties_away:
6160 if (lastBitMask) {
6161 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6162 } else {
6163 if ((int64_t) z.low < 0) {
6164 ++z.high;
6167 break;
6168 case float_round_to_zero:
6169 break;
6170 case float_round_up:
6171 if (!extractFloat128Sign(z)) {
6172 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6174 break;
6175 case float_round_down:
6176 if (extractFloat128Sign(z)) {
6177 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6179 break;
6180 default:
6181 abort();
6183 z.low &= ~ roundBitsMask;
6185 else {
6186 if ( aExp < 0x3FFF ) {
6187 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
6188 status->float_exception_flags |= float_flag_inexact;
6189 aSign = extractFloat128Sign( a );
6190 switch (status->float_rounding_mode) {
6191 case float_round_nearest_even:
6192 if ( ( aExp == 0x3FFE )
6193 && ( extractFloat128Frac0( a )
6194 | extractFloat128Frac1( a ) )
6196 return packFloat128( aSign, 0x3FFF, 0, 0 );
6198 break;
6199 case float_round_ties_away:
6200 if (aExp == 0x3FFE) {
6201 return packFloat128(aSign, 0x3FFF, 0, 0);
6203 break;
6204 case float_round_down:
6205 return
6206 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6207 : packFloat128( 0, 0, 0, 0 );
6208 case float_round_up:
6209 return
6210 aSign ? packFloat128( 1, 0, 0, 0 )
6211 : packFloat128( 0, 0x3FFF, 0, 0 );
6213 return packFloat128( aSign, 0, 0, 0 );
6215 lastBitMask = 1;
6216 lastBitMask <<= 0x402F - aExp;
6217 roundBitsMask = lastBitMask - 1;
6218 z.low = 0;
6219 z.high = a.high;
6220 switch (status->float_rounding_mode) {
6221 case float_round_nearest_even:
6222 z.high += lastBitMask>>1;
6223 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6224 z.high &= ~ lastBitMask;
6226 break;
6227 case float_round_ties_away:
6228 z.high += lastBitMask>>1;
6229 break;
6230 case float_round_to_zero:
6231 break;
6232 case float_round_up:
6233 if (!extractFloat128Sign(z)) {
6234 z.high |= ( a.low != 0 );
6235 z.high += roundBitsMask;
6237 break;
6238 case float_round_down:
6239 if (extractFloat128Sign(z)) {
6240 z.high |= (a.low != 0);
6241 z.high += roundBitsMask;
6243 break;
6244 default:
6245 abort();
6247 z.high &= ~ roundBitsMask;
6249 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
6250 status->float_exception_flags |= float_flag_inexact;
6252 return z;
6256 /*----------------------------------------------------------------------------
6257 | Returns the result of adding the absolute values of the quadruple-precision
6258 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
6259 | before being returned. `zSign' is ignored if the result is a NaN.
6260 | The addition is performed according to the IEC/IEEE Standard for Binary
6261 | Floating-Point Arithmetic.
6262 *----------------------------------------------------------------------------*/
6264 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
6265 float_status *status)
6267 int32_t aExp, bExp, zExp;
6268 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6269 int32_t expDiff;
6271 aSig1 = extractFloat128Frac1( a );
6272 aSig0 = extractFloat128Frac0( a );
6273 aExp = extractFloat128Exp( a );
6274 bSig1 = extractFloat128Frac1( b );
6275 bSig0 = extractFloat128Frac0( b );
6276 bExp = extractFloat128Exp( b );
6277 expDiff = aExp - bExp;
6278 if ( 0 < expDiff ) {
6279 if ( aExp == 0x7FFF ) {
6280 if (aSig0 | aSig1) {
6281 return propagateFloat128NaN(a, b, status);
6283 return a;
6285 if ( bExp == 0 ) {
6286 --expDiff;
6288 else {
6289 bSig0 |= LIT64( 0x0001000000000000 );
6291 shift128ExtraRightJamming(
6292 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6293 zExp = aExp;
6295 else if ( expDiff < 0 ) {
6296 if ( bExp == 0x7FFF ) {
6297 if (bSig0 | bSig1) {
6298 return propagateFloat128NaN(a, b, status);
6300 return packFloat128( zSign, 0x7FFF, 0, 0 );
6302 if ( aExp == 0 ) {
6303 ++expDiff;
6305 else {
6306 aSig0 |= LIT64( 0x0001000000000000 );
6308 shift128ExtraRightJamming(
6309 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6310 zExp = bExp;
6312 else {
6313 if ( aExp == 0x7FFF ) {
6314 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6315 return propagateFloat128NaN(a, b, status);
6317 return a;
6319 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6320 if ( aExp == 0 ) {
6321 if (status->flush_to_zero) {
6322 if (zSig0 | zSig1) {
6323 float_raise(float_flag_output_denormal, status);
6325 return packFloat128(zSign, 0, 0, 0);
6327 return packFloat128( zSign, 0, zSig0, zSig1 );
6329 zSig2 = 0;
6330 zSig0 |= LIT64( 0x0002000000000000 );
6331 zExp = aExp;
6332 goto shiftRight1;
6334 aSig0 |= LIT64( 0x0001000000000000 );
6335 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6336 --zExp;
6337 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6338 ++zExp;
6339 shiftRight1:
6340 shift128ExtraRightJamming(
6341 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6342 roundAndPack:
6343 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6347 /*----------------------------------------------------------------------------
6348 | Returns the result of subtracting the absolute values of the quadruple-
6349 | precision floating-point values `a' and `b'. If `zSign' is 1, the
6350 | difference is negated before being returned. `zSign' is ignored if the
6351 | result is a NaN. The subtraction is performed according to the IEC/IEEE
6352 | Standard for Binary Floating-Point Arithmetic.
6353 *----------------------------------------------------------------------------*/
6355 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
6356 float_status *status)
6358 int32_t aExp, bExp, zExp;
6359 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
6360 int32_t expDiff;
6362 aSig1 = extractFloat128Frac1( a );
6363 aSig0 = extractFloat128Frac0( a );
6364 aExp = extractFloat128Exp( a );
6365 bSig1 = extractFloat128Frac1( b );
6366 bSig0 = extractFloat128Frac0( b );
6367 bExp = extractFloat128Exp( b );
6368 expDiff = aExp - bExp;
6369 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6370 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6371 if ( 0 < expDiff ) goto aExpBigger;
6372 if ( expDiff < 0 ) goto bExpBigger;
6373 if ( aExp == 0x7FFF ) {
6374 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6375 return propagateFloat128NaN(a, b, status);
6377 float_raise(float_flag_invalid, status);
6378 return float128_default_nan(status);
6380 if ( aExp == 0 ) {
6381 aExp = 1;
6382 bExp = 1;
6384 if ( bSig0 < aSig0 ) goto aBigger;
6385 if ( aSig0 < bSig0 ) goto bBigger;
6386 if ( bSig1 < aSig1 ) goto aBigger;
6387 if ( aSig1 < bSig1 ) goto bBigger;
6388 return packFloat128(status->float_rounding_mode == float_round_down,
6389 0, 0, 0);
6390 bExpBigger:
6391 if ( bExp == 0x7FFF ) {
6392 if (bSig0 | bSig1) {
6393 return propagateFloat128NaN(a, b, status);
6395 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6397 if ( aExp == 0 ) {
6398 ++expDiff;
6400 else {
6401 aSig0 |= LIT64( 0x4000000000000000 );
6403 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6404 bSig0 |= LIT64( 0x4000000000000000 );
6405 bBigger:
6406 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6407 zExp = bExp;
6408 zSign ^= 1;
6409 goto normalizeRoundAndPack;
6410 aExpBigger:
6411 if ( aExp == 0x7FFF ) {
6412 if (aSig0 | aSig1) {
6413 return propagateFloat128NaN(a, b, status);
6415 return a;
6417 if ( bExp == 0 ) {
6418 --expDiff;
6420 else {
6421 bSig0 |= LIT64( 0x4000000000000000 );
6423 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6424 aSig0 |= LIT64( 0x4000000000000000 );
6425 aBigger:
6426 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6427 zExp = aExp;
6428 normalizeRoundAndPack:
6429 --zExp;
6430 return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
6431 status);
6435 /*----------------------------------------------------------------------------
6436 | Returns the result of adding the quadruple-precision floating-point values
6437 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
6438 | for Binary Floating-Point Arithmetic.
6439 *----------------------------------------------------------------------------*/
6441 float128 float128_add(float128 a, float128 b, float_status *status)
6443 flag aSign, bSign;
6445 aSign = extractFloat128Sign( a );
6446 bSign = extractFloat128Sign( b );
6447 if ( aSign == bSign ) {
6448 return addFloat128Sigs(a, b, aSign, status);
6450 else {
6451 return subFloat128Sigs(a, b, aSign, status);
6456 /*----------------------------------------------------------------------------
6457 | Returns the result of subtracting the quadruple-precision floating-point
6458 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6459 | Standard for Binary Floating-Point Arithmetic.
6460 *----------------------------------------------------------------------------*/
6462 float128 float128_sub(float128 a, float128 b, float_status *status)
6464 flag aSign, bSign;
6466 aSign = extractFloat128Sign( a );
6467 bSign = extractFloat128Sign( b );
6468 if ( aSign == bSign ) {
6469 return subFloat128Sigs(a, b, aSign, status);
6471 else {
6472 return addFloat128Sigs(a, b, aSign, status);
6477 /*----------------------------------------------------------------------------
6478 | Returns the result of multiplying the quadruple-precision floating-point
6479 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6480 | Standard for Binary Floating-Point Arithmetic.
6481 *----------------------------------------------------------------------------*/
6483 float128 float128_mul(float128 a, float128 b, float_status *status)
6485 flag aSign, bSign, zSign;
6486 int32_t aExp, bExp, zExp;
6487 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6489 aSig1 = extractFloat128Frac1( a );
6490 aSig0 = extractFloat128Frac0( a );
6491 aExp = extractFloat128Exp( a );
6492 aSign = extractFloat128Sign( a );
6493 bSig1 = extractFloat128Frac1( b );
6494 bSig0 = extractFloat128Frac0( b );
6495 bExp = extractFloat128Exp( b );
6496 bSign = extractFloat128Sign( b );
6497 zSign = aSign ^ bSign;
6498 if ( aExp == 0x7FFF ) {
6499 if ( ( aSig0 | aSig1 )
6500 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6501 return propagateFloat128NaN(a, b, status);
6503 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6504 return packFloat128( zSign, 0x7FFF, 0, 0 );
6506 if ( bExp == 0x7FFF ) {
6507 if (bSig0 | bSig1) {
6508 return propagateFloat128NaN(a, b, status);
6510 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6511 invalid:
6512 float_raise(float_flag_invalid, status);
6513 return float128_default_nan(status);
6515 return packFloat128( zSign, 0x7FFF, 0, 0 );
6517 if ( aExp == 0 ) {
6518 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6519 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6521 if ( bExp == 0 ) {
6522 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6523 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6525 zExp = aExp + bExp - 0x4000;
6526 aSig0 |= LIT64( 0x0001000000000000 );
6527 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6528 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6529 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6530 zSig2 |= ( zSig3 != 0 );
6531 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6532 shift128ExtraRightJamming(
6533 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6534 ++zExp;
6536 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6540 /*----------------------------------------------------------------------------
6541 | Returns the result of dividing the quadruple-precision floating-point value
6542 | `a' by the corresponding value `b'. The operation is performed according to
6543 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6544 *----------------------------------------------------------------------------*/
6546 float128 float128_div(float128 a, float128 b, float_status *status)
6548 flag aSign, bSign, zSign;
6549 int32_t aExp, bExp, zExp;
6550 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6551 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6553 aSig1 = extractFloat128Frac1( a );
6554 aSig0 = extractFloat128Frac0( a );
6555 aExp = extractFloat128Exp( a );
6556 aSign = extractFloat128Sign( a );
6557 bSig1 = extractFloat128Frac1( b );
6558 bSig0 = extractFloat128Frac0( b );
6559 bExp = extractFloat128Exp( b );
6560 bSign = extractFloat128Sign( b );
6561 zSign = aSign ^ bSign;
6562 if ( aExp == 0x7FFF ) {
6563 if (aSig0 | aSig1) {
6564 return propagateFloat128NaN(a, b, status);
6566 if ( bExp == 0x7FFF ) {
6567 if (bSig0 | bSig1) {
6568 return propagateFloat128NaN(a, b, status);
6570 goto invalid;
6572 return packFloat128( zSign, 0x7FFF, 0, 0 );
6574 if ( bExp == 0x7FFF ) {
6575 if (bSig0 | bSig1) {
6576 return propagateFloat128NaN(a, b, status);
6578 return packFloat128( zSign, 0, 0, 0 );
6580 if ( bExp == 0 ) {
6581 if ( ( bSig0 | bSig1 ) == 0 ) {
6582 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6583 invalid:
6584 float_raise(float_flag_invalid, status);
6585 return float128_default_nan(status);
6587 float_raise(float_flag_divbyzero, status);
6588 return packFloat128( zSign, 0x7FFF, 0, 0 );
6590 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6592 if ( aExp == 0 ) {
6593 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6594 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6596 zExp = aExp - bExp + 0x3FFD;
6597 shortShift128Left(
6598 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6599 shortShift128Left(
6600 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6601 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6602 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6603 ++zExp;
6605 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6606 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6607 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
6608 while ( (int64_t) rem0 < 0 ) {
6609 --zSig0;
6610 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6612 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6613 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6614 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6615 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
6616 while ( (int64_t) rem1 < 0 ) {
6617 --zSig1;
6618 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6620 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6622 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6623 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6627 /*----------------------------------------------------------------------------
6628 | Returns the remainder of the quadruple-precision floating-point value `a'
6629 | with respect to the corresponding value `b'. The operation is performed
6630 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6631 *----------------------------------------------------------------------------*/
6633 float128 float128_rem(float128 a, float128 b, float_status *status)
6635 flag aSign, zSign;
6636 int32_t aExp, bExp, expDiff;
6637 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6638 uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6639 int64_t sigMean0;
6641 aSig1 = extractFloat128Frac1( a );
6642 aSig0 = extractFloat128Frac0( a );
6643 aExp = extractFloat128Exp( a );
6644 aSign = extractFloat128Sign( a );
6645 bSig1 = extractFloat128Frac1( b );
6646 bSig0 = extractFloat128Frac0( b );
6647 bExp = extractFloat128Exp( b );
6648 if ( aExp == 0x7FFF ) {
6649 if ( ( aSig0 | aSig1 )
6650 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6651 return propagateFloat128NaN(a, b, status);
6653 goto invalid;
6655 if ( bExp == 0x7FFF ) {
6656 if (bSig0 | bSig1) {
6657 return propagateFloat128NaN(a, b, status);
6659 return a;
6661 if ( bExp == 0 ) {
6662 if ( ( bSig0 | bSig1 ) == 0 ) {
6663 invalid:
6664 float_raise(float_flag_invalid, status);
6665 return float128_default_nan(status);
6667 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6669 if ( aExp == 0 ) {
6670 if ( ( aSig0 | aSig1 ) == 0 ) return a;
6671 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6673 expDiff = aExp - bExp;
6674 if ( expDiff < -1 ) return a;
6675 shortShift128Left(
6676 aSig0 | LIT64( 0x0001000000000000 ),
6677 aSig1,
6678 15 - ( expDiff < 0 ),
6679 &aSig0,
6680 &aSig1
6682 shortShift128Left(
6683 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6684 q = le128( bSig0, bSig1, aSig0, aSig1 );
6685 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6686 expDiff -= 64;
6687 while ( 0 < expDiff ) {
6688 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6689 q = ( 4 < q ) ? q - 4 : 0;
6690 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6691 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6692 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6693 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6694 expDiff -= 61;
6696 if ( -64 < expDiff ) {
6697 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6698 q = ( 4 < q ) ? q - 4 : 0;
6699 q >>= - expDiff;
6700 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6701 expDiff += 52;
6702 if ( expDiff < 0 ) {
6703 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6705 else {
6706 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6708 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6709 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6711 else {
6712 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6713 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6715 do {
6716 alternateASig0 = aSig0;
6717 alternateASig1 = aSig1;
6718 ++q;
6719 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6720 } while ( 0 <= (int64_t) aSig0 );
6721 add128(
6722 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6723 if ( ( sigMean0 < 0 )
6724 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6725 aSig0 = alternateASig0;
6726 aSig1 = alternateASig1;
6728 zSign = ( (int64_t) aSig0 < 0 );
6729 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6730 return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
6731 status);
6734 /*----------------------------------------------------------------------------
6735 | Returns the square root of the quadruple-precision floating-point value `a'.
6736 | The operation is performed according to the IEC/IEEE Standard for Binary
6737 | Floating-Point Arithmetic.
6738 *----------------------------------------------------------------------------*/
6740 float128 float128_sqrt(float128 a, float_status *status)
6742 flag aSign;
6743 int32_t aExp, zExp;
6744 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6745 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6747 aSig1 = extractFloat128Frac1( a );
6748 aSig0 = extractFloat128Frac0( a );
6749 aExp = extractFloat128Exp( a );
6750 aSign = extractFloat128Sign( a );
6751 if ( aExp == 0x7FFF ) {
6752 if (aSig0 | aSig1) {
6753 return propagateFloat128NaN(a, a, status);
6755 if ( ! aSign ) return a;
6756 goto invalid;
6758 if ( aSign ) {
6759 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6760 invalid:
6761 float_raise(float_flag_invalid, status);
6762 return float128_default_nan(status);
6764 if ( aExp == 0 ) {
6765 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6766 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6768 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6769 aSig0 |= LIT64( 0x0001000000000000 );
6770 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6771 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6772 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6773 doubleZSig0 = zSig0<<1;
6774 mul64To128( zSig0, zSig0, &term0, &term1 );
6775 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6776 while ( (int64_t) rem0 < 0 ) {
6777 --zSig0;
6778 doubleZSig0 -= 2;
6779 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6781 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6782 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6783 if ( zSig1 == 0 ) zSig1 = 1;
6784 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6785 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6786 mul64To128( zSig1, zSig1, &term2, &term3 );
6787 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6788 while ( (int64_t) rem1 < 0 ) {
6789 --zSig1;
6790 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6791 term3 |= 1;
6792 term2 |= doubleZSig0;
6793 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6795 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6797 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6798 return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
6802 /*----------------------------------------------------------------------------
6803 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6804 | the corresponding value `b', and 0 otherwise. The invalid exception is
6805 | raised if either operand is a NaN. Otherwise, the comparison is performed
6806 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6807 *----------------------------------------------------------------------------*/
6809 int float128_eq(float128 a, float128 b, float_status *status)
6812 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6813 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6814 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6815 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6817 float_raise(float_flag_invalid, status);
6818 return 0;
6820 return
6821 ( a.low == b.low )
6822 && ( ( a.high == b.high )
6823 || ( ( a.low == 0 )
6824 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6829 /*----------------------------------------------------------------------------
6830 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6831 | or equal to the corresponding value `b', and 0 otherwise. The invalid
6832 | exception is raised if either operand is a NaN. The comparison is performed
6833 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6834 *----------------------------------------------------------------------------*/
6836 int float128_le(float128 a, float128 b, float_status *status)
6838 flag aSign, bSign;
6840 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6841 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6842 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6843 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6845 float_raise(float_flag_invalid, status);
6846 return 0;
6848 aSign = extractFloat128Sign( a );
6849 bSign = extractFloat128Sign( b );
6850 if ( aSign != bSign ) {
6851 return
6852 aSign
6853 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6854 == 0 );
6856 return
6857 aSign ? le128( b.high, b.low, a.high, a.low )
6858 : le128( a.high, a.low, b.high, b.low );
6862 /*----------------------------------------------------------------------------
6863 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6864 | the corresponding value `b', and 0 otherwise. The invalid exception is
6865 | raised if either operand is a NaN. The comparison is performed according
6866 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6867 *----------------------------------------------------------------------------*/
6869 int float128_lt(float128 a, float128 b, float_status *status)
6871 flag aSign, bSign;
6873 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6874 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6875 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6876 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6878 float_raise(float_flag_invalid, status);
6879 return 0;
6881 aSign = extractFloat128Sign( a );
6882 bSign = extractFloat128Sign( b );
6883 if ( aSign != bSign ) {
6884 return
6885 aSign
6886 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6887 != 0 );
6889 return
6890 aSign ? lt128( b.high, b.low, a.high, a.low )
6891 : lt128( a.high, a.low, b.high, b.low );
6895 /*----------------------------------------------------------------------------
6896 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6897 | be compared, and 0 otherwise. The invalid exception is raised if either
6898 | operand is a NaN. The comparison is performed according to the IEC/IEEE
6899 | Standard for Binary Floating-Point Arithmetic.
6900 *----------------------------------------------------------------------------*/
6902 int float128_unordered(float128 a, float128 b, float_status *status)
6904 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6905 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6906 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6907 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6909 float_raise(float_flag_invalid, status);
6910 return 1;
6912 return 0;
6915 /*----------------------------------------------------------------------------
6916 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6917 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6918 | exception. The comparison is performed according to the IEC/IEEE Standard
6919 | for Binary Floating-Point Arithmetic.
6920 *----------------------------------------------------------------------------*/
6922 int float128_eq_quiet(float128 a, float128 b, float_status *status)
6925 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6926 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6927 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6928 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6930 if (float128_is_signaling_nan(a, status)
6931 || float128_is_signaling_nan(b, status)) {
6932 float_raise(float_flag_invalid, status);
6934 return 0;
6936 return
6937 ( a.low == b.low )
6938 && ( ( a.high == b.high )
6939 || ( ( a.low == 0 )
6940 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6945 /*----------------------------------------------------------------------------
6946 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6947 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
6948 | cause an exception. Otherwise, the comparison is performed according to the
6949 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6950 *----------------------------------------------------------------------------*/
6952 int float128_le_quiet(float128 a, float128 b, float_status *status)
6954 flag aSign, bSign;
6956 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6957 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6958 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6959 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6961 if (float128_is_signaling_nan(a, status)
6962 || float128_is_signaling_nan(b, status)) {
6963 float_raise(float_flag_invalid, status);
6965 return 0;
6967 aSign = extractFloat128Sign( a );
6968 bSign = extractFloat128Sign( b );
6969 if ( aSign != bSign ) {
6970 return
6971 aSign
6972 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6973 == 0 );
6975 return
6976 aSign ? le128( b.high, b.low, a.high, a.low )
6977 : le128( a.high, a.low, b.high, b.low );
6981 /*----------------------------------------------------------------------------
6982 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6983 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6984 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
6985 | Standard for Binary Floating-Point Arithmetic.
6986 *----------------------------------------------------------------------------*/
6988 int float128_lt_quiet(float128 a, float128 b, float_status *status)
6990 flag aSign, bSign;
6992 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6993 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6994 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6995 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6997 if (float128_is_signaling_nan(a, status)
6998 || float128_is_signaling_nan(b, status)) {
6999 float_raise(float_flag_invalid, status);
7001 return 0;
7003 aSign = extractFloat128Sign( a );
7004 bSign = extractFloat128Sign( b );
7005 if ( aSign != bSign ) {
7006 return
7007 aSign
7008 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7009 != 0 );
7011 return
7012 aSign ? lt128( b.high, b.low, a.high, a.low )
7013 : lt128( a.high, a.low, b.high, b.low );
7017 /*----------------------------------------------------------------------------
7018 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7019 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
7020 | comparison is performed according to the IEC/IEEE Standard for Binary
7021 | Floating-Point Arithmetic.
7022 *----------------------------------------------------------------------------*/
7024 int float128_unordered_quiet(float128 a, float128 b, float_status *status)
7026 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
7027 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7028 || ( ( extractFloat128Exp( b ) == 0x7FFF )
7029 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7031 if (float128_is_signaling_nan(a, status)
7032 || float128_is_signaling_nan(b, status)) {
7033 float_raise(float_flag_invalid, status);
7035 return 1;
7037 return 0;
7040 static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
7041 int is_quiet, float_status *status)
7043 flag aSign, bSign;
7045 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
7046 float_raise(float_flag_invalid, status);
7047 return float_relation_unordered;
7049 if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7050 ( extractFloatx80Frac( a )<<1 ) ) ||
7051 ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7052 ( extractFloatx80Frac( b )<<1 ) )) {
7053 if (!is_quiet ||
7054 floatx80_is_signaling_nan(a, status) ||
7055 floatx80_is_signaling_nan(b, status)) {
7056 float_raise(float_flag_invalid, status);
7058 return float_relation_unordered;
7060 aSign = extractFloatx80Sign( a );
7061 bSign = extractFloatx80Sign( b );
7062 if ( aSign != bSign ) {
7064 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7065 ( ( a.low | b.low ) == 0 ) ) {
7066 /* zero case */
7067 return float_relation_equal;
7068 } else {
7069 return 1 - (2 * aSign);
7071 } else {
7072 if (a.low == b.low && a.high == b.high) {
7073 return float_relation_equal;
7074 } else {
7075 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7080 int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
7082 return floatx80_compare_internal(a, b, 0, status);
7085 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
7087 return floatx80_compare_internal(a, b, 1, status);
7090 static inline int float128_compare_internal(float128 a, float128 b,
7091 int is_quiet, float_status *status)
7093 flag aSign, bSign;
7095 if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7096 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7097 ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7098 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7099 if (!is_quiet ||
7100 float128_is_signaling_nan(a, status) ||
7101 float128_is_signaling_nan(b, status)) {
7102 float_raise(float_flag_invalid, status);
7104 return float_relation_unordered;
7106 aSign = extractFloat128Sign( a );
7107 bSign = extractFloat128Sign( b );
7108 if ( aSign != bSign ) {
7109 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7110 /* zero case */
7111 return float_relation_equal;
7112 } else {
7113 return 1 - (2 * aSign);
7115 } else {
7116 if (a.low == b.low && a.high == b.high) {
7117 return float_relation_equal;
7118 } else {
7119 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7124 int float128_compare(float128 a, float128 b, float_status *status)
7126 return float128_compare_internal(a, b, 0, status);
7129 int float128_compare_quiet(float128 a, float128 b, float_status *status)
7131 return float128_compare_internal(a, b, 1, status);
7134 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7136 flag aSign;
7137 int32_t aExp;
7138 uint64_t aSig;
7140 if (floatx80_invalid_encoding(a)) {
7141 float_raise(float_flag_invalid, status);
7142 return floatx80_default_nan(status);
7144 aSig = extractFloatx80Frac( a );
7145 aExp = extractFloatx80Exp( a );
7146 aSign = extractFloatx80Sign( a );
7148 if ( aExp == 0x7FFF ) {
7149 if ( aSig<<1 ) {
7150 return propagateFloatx80NaN(a, a, status);
7152 return a;
7155 if (aExp == 0) {
7156 if (aSig == 0) {
7157 return a;
7159 aExp++;
7162 if (n > 0x10000) {
7163 n = 0x10000;
7164 } else if (n < -0x10000) {
7165 n = -0x10000;
7168 aExp += n;
7169 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7170 aSign, aExp, aSig, 0, status);
7173 float128 float128_scalbn(float128 a, int n, float_status *status)
7175 flag aSign;
7176 int32_t aExp;
7177 uint64_t aSig0, aSig1;
7179 aSig1 = extractFloat128Frac1( a );
7180 aSig0 = extractFloat128Frac0( a );
7181 aExp = extractFloat128Exp( a );
7182 aSign = extractFloat128Sign( a );
7183 if ( aExp == 0x7FFF ) {
7184 if ( aSig0 | aSig1 ) {
7185 return propagateFloat128NaN(a, a, status);
7187 return a;
7189 if (aExp != 0) {
7190 aSig0 |= LIT64( 0x0001000000000000 );
7191 } else if (aSig0 == 0 && aSig1 == 0) {
7192 return a;
7193 } else {
7194 aExp++;
7197 if (n > 0x10000) {
7198 n = 0x10000;
7199 } else if (n < -0x10000) {
7200 n = -0x10000;
7203 aExp += n - 1;
7204 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7205 , status);