hw/ppc/spapr: Implement the h_set_xdabr hypercall
[qemu/kevin.git] / fpu / softfloat.c
blobcfa9787d8d47b684f2902b9fd15d989ae5f098e6
1 /*
2 * QEMU float support
4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
10 * the BSD license
11 * GPL-v2-or-later
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
44 ===============================================================================
47 /* BSD licensing:
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
85 #include "qemu/osdep.h"
87 #include "fpu/softfloat.h"
89 /* We only need stdlib for abort() */
91 /*----------------------------------------------------------------------------
92 | Primitive arithmetic functions, including multi-word arithmetic, and
93 | division and square root approximations. (Can be specialized to target if
94 | desired.)
95 *----------------------------------------------------------------------------*/
96 #include "softfloat-macros.h"
98 /*----------------------------------------------------------------------------
99 | Functions and definitions to determine: (1) whether tininess for underflow
100 | is detected before or after rounding by default, (2) what (if anything)
101 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
102 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
103 | are propagated from function inputs to output. These details are target-
104 | specific.
105 *----------------------------------------------------------------------------*/
106 #include "softfloat-specialize.h"
108 /*----------------------------------------------------------------------------
109 | Returns the fraction bits of the half-precision floating-point value `a'.
110 *----------------------------------------------------------------------------*/
112 static inline uint32_t extractFloat16Frac(float16 a)
114 return float16_val(a) & 0x3ff;
117 /*----------------------------------------------------------------------------
118 | Returns the exponent bits of the half-precision floating-point value `a'.
119 *----------------------------------------------------------------------------*/
121 static inline int_fast16_t extractFloat16Exp(float16 a)
123 return (float16_val(a) >> 10) & 0x1f;
126 /*----------------------------------------------------------------------------
127 | Returns the sign bit of the single-precision floating-point value `a'.
128 *----------------------------------------------------------------------------*/
130 static inline flag extractFloat16Sign(float16 a)
132 return float16_val(a)>>15;
135 /*----------------------------------------------------------------------------
136 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
137 | and 7, and returns the properly rounded 32-bit integer corresponding to the
138 | input. If `zSign' is 1, the input is negated before being converted to an
139 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
140 | is simply rounded to an integer, with the inexact exception raised if the
141 | input cannot be represented exactly as an integer. However, if the fixed-
142 | point input is too large, the invalid exception is raised and the largest
143 | positive or negative integer is returned.
144 *----------------------------------------------------------------------------*/
146 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
148 int8_t roundingMode;
149 flag roundNearestEven;
150 int8_t roundIncrement, roundBits;
151 int32_t z;
153 roundingMode = status->float_rounding_mode;
154 roundNearestEven = ( roundingMode == float_round_nearest_even );
155 switch (roundingMode) {
156 case float_round_nearest_even:
157 case float_round_ties_away:
158 roundIncrement = 0x40;
159 break;
160 case float_round_to_zero:
161 roundIncrement = 0;
162 break;
163 case float_round_up:
164 roundIncrement = zSign ? 0 : 0x7f;
165 break;
166 case float_round_down:
167 roundIncrement = zSign ? 0x7f : 0;
168 break;
169 default:
170 abort();
172 roundBits = absZ & 0x7F;
173 absZ = ( absZ + roundIncrement )>>7;
174 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
175 z = absZ;
176 if ( zSign ) z = - z;
177 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
178 float_raise(float_flag_invalid, status);
179 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
181 if (roundBits) {
182 status->float_exception_flags |= float_flag_inexact;
184 return z;
188 /*----------------------------------------------------------------------------
189 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
190 | `absZ1', with binary point between bits 63 and 64 (between the input words),
191 | and returns the properly rounded 64-bit integer corresponding to the input.
192 | If `zSign' is 1, the input is negated before being converted to an integer.
193 | Ordinarily, the fixed-point input is simply rounded to an integer, with
194 | the inexact exception raised if the input cannot be represented exactly as
195 | an integer. However, if the fixed-point input is too large, the invalid
196 | exception is raised and the largest positive or negative integer is
197 | returned.
198 *----------------------------------------------------------------------------*/
200 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
201 float_status *status)
203 int8_t roundingMode;
204 flag roundNearestEven, increment;
205 int64_t z;
207 roundingMode = status->float_rounding_mode;
208 roundNearestEven = ( roundingMode == float_round_nearest_even );
209 switch (roundingMode) {
210 case float_round_nearest_even:
211 case float_round_ties_away:
212 increment = ((int64_t) absZ1 < 0);
213 break;
214 case float_round_to_zero:
215 increment = 0;
216 break;
217 case float_round_up:
218 increment = !zSign && absZ1;
219 break;
220 case float_round_down:
221 increment = zSign && absZ1;
222 break;
223 default:
224 abort();
226 if ( increment ) {
227 ++absZ0;
228 if ( absZ0 == 0 ) goto overflow;
229 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
231 z = absZ0;
232 if ( zSign ) z = - z;
233 if ( z && ( ( z < 0 ) ^ zSign ) ) {
234 overflow:
235 float_raise(float_flag_invalid, status);
236 return
237 zSign ? (int64_t) LIT64( 0x8000000000000000 )
238 : LIT64( 0x7FFFFFFFFFFFFFFF );
240 if (absZ1) {
241 status->float_exception_flags |= float_flag_inexact;
243 return z;
247 /*----------------------------------------------------------------------------
248 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
249 | `absZ1', with binary point between bits 63 and 64 (between the input words),
250 | and returns the properly rounded 64-bit unsigned integer corresponding to the
251 | input. Ordinarily, the fixed-point input is simply rounded to an integer,
252 | with the inexact exception raised if the input cannot be represented exactly
253 | as an integer. However, if the fixed-point input is too large, the invalid
254 | exception is raised and the largest unsigned integer is returned.
255 *----------------------------------------------------------------------------*/
257 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
258 uint64_t absZ1, float_status *status)
260 int8_t roundingMode;
261 flag roundNearestEven, increment;
263 roundingMode = status->float_rounding_mode;
264 roundNearestEven = (roundingMode == float_round_nearest_even);
265 switch (roundingMode) {
266 case float_round_nearest_even:
267 case float_round_ties_away:
268 increment = ((int64_t)absZ1 < 0);
269 break;
270 case float_round_to_zero:
271 increment = 0;
272 break;
273 case float_round_up:
274 increment = !zSign && absZ1;
275 break;
276 case float_round_down:
277 increment = zSign && absZ1;
278 break;
279 default:
280 abort();
282 if (increment) {
283 ++absZ0;
284 if (absZ0 == 0) {
285 float_raise(float_flag_invalid, status);
286 return LIT64(0xFFFFFFFFFFFFFFFF);
288 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
291 if (zSign && absZ0) {
292 float_raise(float_flag_invalid, status);
293 return 0;
296 if (absZ1) {
297 status->float_exception_flags |= float_flag_inexact;
299 return absZ0;
302 /*----------------------------------------------------------------------------
303 | Returns the fraction bits of the single-precision floating-point value `a'.
304 *----------------------------------------------------------------------------*/
306 static inline uint32_t extractFloat32Frac( float32 a )
309 return float32_val(a) & 0x007FFFFF;
313 /*----------------------------------------------------------------------------
314 | Returns the exponent bits of the single-precision floating-point value `a'.
315 *----------------------------------------------------------------------------*/
317 static inline int_fast16_t extractFloat32Exp(float32 a)
320 return ( float32_val(a)>>23 ) & 0xFF;
324 /*----------------------------------------------------------------------------
325 | Returns the sign bit of the single-precision floating-point value `a'.
326 *----------------------------------------------------------------------------*/
328 static inline flag extractFloat32Sign( float32 a )
331 return float32_val(a)>>31;
335 /*----------------------------------------------------------------------------
336 | If `a' is denormal and we are in flush-to-zero mode then set the
337 | input-denormal exception and return zero. Otherwise just return the value.
338 *----------------------------------------------------------------------------*/
339 float32 float32_squash_input_denormal(float32 a, float_status *status)
341 if (status->flush_inputs_to_zero) {
342 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
343 float_raise(float_flag_input_denormal, status);
344 return make_float32(float32_val(a) & 0x80000000);
347 return a;
350 /*----------------------------------------------------------------------------
351 | Normalizes the subnormal single-precision floating-point value represented
352 | by the denormalized significand `aSig'. The normalized exponent and
353 | significand are stored at the locations pointed to by `zExpPtr' and
354 | `zSigPtr', respectively.
355 *----------------------------------------------------------------------------*/
357 static void
358 normalizeFloat32Subnormal(uint32_t aSig, int_fast16_t *zExpPtr, uint32_t *zSigPtr)
360 int8_t shiftCount;
362 shiftCount = countLeadingZeros32( aSig ) - 8;
363 *zSigPtr = aSig<<shiftCount;
364 *zExpPtr = 1 - shiftCount;
368 /*----------------------------------------------------------------------------
369 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
370 | single-precision floating-point value, returning the result. After being
371 | shifted into the proper positions, the three fields are simply added
372 | together to form the result. This means that any integer portion of `zSig'
373 | will be added into the exponent. Since a properly normalized significand
374 | will have an integer portion equal to 1, the `zExp' input should be 1 less
375 | than the desired result exponent whenever `zSig' is a complete, normalized
376 | significand.
377 *----------------------------------------------------------------------------*/
379 static inline float32 packFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig)
382 return make_float32(
383 ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
387 /*----------------------------------------------------------------------------
388 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
389 | and significand `zSig', and returns the proper single-precision floating-
390 | point value corresponding to the abstract input. Ordinarily, the abstract
391 | value is simply rounded and packed into the single-precision format, with
392 | the inexact exception raised if the abstract input cannot be represented
393 | exactly. However, if the abstract value is too large, the overflow and
394 | inexact exceptions are raised and an infinity or maximal finite value is
395 | returned. If the abstract value is too small, the input value is rounded to
396 | a subnormal number, and the underflow and inexact exceptions are raised if
397 | the abstract input cannot be represented exactly as a subnormal single-
398 | precision floating-point number.
399 | The input significand `zSig' has its binary point between bits 30
400 | and 29, which is 7 bits to the left of the usual location. This shifted
401 | significand must be normalized or smaller. If `zSig' is not normalized,
402 | `zExp' must be 0; in that case, the result returned is a subnormal number,
403 | and it must not require rounding. In the usual case that `zSig' is
404 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
405 | The handling of underflow and overflow follows the IEC/IEEE Standard for
406 | Binary Floating-Point Arithmetic.
407 *----------------------------------------------------------------------------*/
409 static float32 roundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig,
410 float_status *status)
412 int8_t roundingMode;
413 flag roundNearestEven;
414 int8_t roundIncrement, roundBits;
415 flag isTiny;
417 roundingMode = status->float_rounding_mode;
418 roundNearestEven = ( roundingMode == float_round_nearest_even );
419 switch (roundingMode) {
420 case float_round_nearest_even:
421 case float_round_ties_away:
422 roundIncrement = 0x40;
423 break;
424 case float_round_to_zero:
425 roundIncrement = 0;
426 break;
427 case float_round_up:
428 roundIncrement = zSign ? 0 : 0x7f;
429 break;
430 case float_round_down:
431 roundIncrement = zSign ? 0x7f : 0;
432 break;
433 default:
434 abort();
435 break;
437 roundBits = zSig & 0x7F;
438 if ( 0xFD <= (uint16_t) zExp ) {
439 if ( ( 0xFD < zExp )
440 || ( ( zExp == 0xFD )
441 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
443 float_raise(float_flag_overflow | float_flag_inexact, status);
444 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
446 if ( zExp < 0 ) {
447 if (status->flush_to_zero) {
448 float_raise(float_flag_output_denormal, status);
449 return packFloat32(zSign, 0, 0);
451 isTiny =
452 (status->float_detect_tininess
453 == float_tininess_before_rounding)
454 || ( zExp < -1 )
455 || ( zSig + roundIncrement < 0x80000000 );
456 shift32RightJamming( zSig, - zExp, &zSig );
457 zExp = 0;
458 roundBits = zSig & 0x7F;
459 if (isTiny && roundBits) {
460 float_raise(float_flag_underflow, status);
464 if (roundBits) {
465 status->float_exception_flags |= float_flag_inexact;
467 zSig = ( zSig + roundIncrement )>>7;
468 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
469 if ( zSig == 0 ) zExp = 0;
470 return packFloat32( zSign, zExp, zSig );
474 /*----------------------------------------------------------------------------
475 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
476 | and significand `zSig', and returns the proper single-precision floating-
477 | point value corresponding to the abstract input. This routine is just like
478 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
479 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
480 | floating-point exponent.
481 *----------------------------------------------------------------------------*/
483 static float32
484 normalizeRoundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig,
485 float_status *status)
487 int8_t shiftCount;
489 shiftCount = countLeadingZeros32( zSig ) - 1;
490 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
491 status);
495 /*----------------------------------------------------------------------------
496 | Returns the fraction bits of the double-precision floating-point value `a'.
497 *----------------------------------------------------------------------------*/
499 static inline uint64_t extractFloat64Frac( float64 a )
502 return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
506 /*----------------------------------------------------------------------------
507 | Returns the exponent bits of the double-precision floating-point value `a'.
508 *----------------------------------------------------------------------------*/
510 static inline int_fast16_t extractFloat64Exp(float64 a)
513 return ( float64_val(a)>>52 ) & 0x7FF;
517 /*----------------------------------------------------------------------------
518 | Returns the sign bit of the double-precision floating-point value `a'.
519 *----------------------------------------------------------------------------*/
521 static inline flag extractFloat64Sign( float64 a )
524 return float64_val(a)>>63;
528 /*----------------------------------------------------------------------------
529 | If `a' is denormal and we are in flush-to-zero mode then set the
530 | input-denormal exception and return zero. Otherwise just return the value.
531 *----------------------------------------------------------------------------*/
532 float64 float64_squash_input_denormal(float64 a, float_status *status)
534 if (status->flush_inputs_to_zero) {
535 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
536 float_raise(float_flag_input_denormal, status);
537 return make_float64(float64_val(a) & (1ULL << 63));
540 return a;
543 /*----------------------------------------------------------------------------
544 | Normalizes the subnormal double-precision floating-point value represented
545 | by the denormalized significand `aSig'. The normalized exponent and
546 | significand are stored at the locations pointed to by `zExpPtr' and
547 | `zSigPtr', respectively.
548 *----------------------------------------------------------------------------*/
550 static void
551 normalizeFloat64Subnormal(uint64_t aSig, int_fast16_t *zExpPtr, uint64_t *zSigPtr)
553 int8_t shiftCount;
555 shiftCount = countLeadingZeros64( aSig ) - 11;
556 *zSigPtr = aSig<<shiftCount;
557 *zExpPtr = 1 - shiftCount;
561 /*----------------------------------------------------------------------------
562 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
563 | double-precision floating-point value, returning the result. After being
564 | shifted into the proper positions, the three fields are simply added
565 | together to form the result. This means that any integer portion of `zSig'
566 | will be added into the exponent. Since a properly normalized significand
567 | will have an integer portion equal to 1, the `zExp' input should be 1 less
568 | than the desired result exponent whenever `zSig' is a complete, normalized
569 | significand.
570 *----------------------------------------------------------------------------*/
572 static inline float64 packFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig)
575 return make_float64(
576 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
580 /*----------------------------------------------------------------------------
581 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
582 | and significand `zSig', and returns the proper double-precision floating-
583 | point value corresponding to the abstract input. Ordinarily, the abstract
584 | value is simply rounded and packed into the double-precision format, with
585 | the inexact exception raised if the abstract input cannot be represented
586 | exactly. However, if the abstract value is too large, the overflow and
587 | inexact exceptions are raised and an infinity or maximal finite value is
588 | returned. If the abstract value is too small, the input value is rounded to
589 | a subnormal number, and the underflow and inexact exceptions are raised if
590 | the abstract input cannot be represented exactly as a subnormal double-
591 | precision floating-point number.
592 | The input significand `zSig' has its binary point between bits 62
593 | and 61, which is 10 bits to the left of the usual location. This shifted
594 | significand must be normalized or smaller. If `zSig' is not normalized,
595 | `zExp' must be 0; in that case, the result returned is a subnormal number,
596 | and it must not require rounding. In the usual case that `zSig' is
597 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
598 | The handling of underflow and overflow follows the IEC/IEEE Standard for
599 | Binary Floating-Point Arithmetic.
600 *----------------------------------------------------------------------------*/
602 static float64 roundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig,
603 float_status *status)
605 int8_t roundingMode;
606 flag roundNearestEven;
607 int_fast16_t roundIncrement, roundBits;
608 flag isTiny;
610 roundingMode = status->float_rounding_mode;
611 roundNearestEven = ( roundingMode == float_round_nearest_even );
612 switch (roundingMode) {
613 case float_round_nearest_even:
614 case float_round_ties_away:
615 roundIncrement = 0x200;
616 break;
617 case float_round_to_zero:
618 roundIncrement = 0;
619 break;
620 case float_round_up:
621 roundIncrement = zSign ? 0 : 0x3ff;
622 break;
623 case float_round_down:
624 roundIncrement = zSign ? 0x3ff : 0;
625 break;
626 default:
627 abort();
629 roundBits = zSig & 0x3FF;
630 if ( 0x7FD <= (uint16_t) zExp ) {
631 if ( ( 0x7FD < zExp )
632 || ( ( zExp == 0x7FD )
633 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
635 float_raise(float_flag_overflow | float_flag_inexact, status);
636 return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
638 if ( zExp < 0 ) {
639 if (status->flush_to_zero) {
640 float_raise(float_flag_output_denormal, status);
641 return packFloat64(zSign, 0, 0);
643 isTiny =
644 (status->float_detect_tininess
645 == float_tininess_before_rounding)
646 || ( zExp < -1 )
647 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
648 shift64RightJamming( zSig, - zExp, &zSig );
649 zExp = 0;
650 roundBits = zSig & 0x3FF;
651 if (isTiny && roundBits) {
652 float_raise(float_flag_underflow, status);
656 if (roundBits) {
657 status->float_exception_flags |= float_flag_inexact;
659 zSig = ( zSig + roundIncrement )>>10;
660 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
661 if ( zSig == 0 ) zExp = 0;
662 return packFloat64( zSign, zExp, zSig );
666 /*----------------------------------------------------------------------------
667 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
668 | and significand `zSig', and returns the proper double-precision floating-
669 | point value corresponding to the abstract input. This routine is just like
670 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
671 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
672 | floating-point exponent.
673 *----------------------------------------------------------------------------*/
675 static float64
676 normalizeRoundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig,
677 float_status *status)
679 int8_t shiftCount;
681 shiftCount = countLeadingZeros64( zSig ) - 1;
682 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
683 status);
687 /*----------------------------------------------------------------------------
688 | Returns the fraction bits of the extended double-precision floating-point
689 | value `a'.
690 *----------------------------------------------------------------------------*/
692 static inline uint64_t extractFloatx80Frac( floatx80 a )
695 return a.low;
699 /*----------------------------------------------------------------------------
700 | Returns the exponent bits of the extended double-precision floating-point
701 | value `a'.
702 *----------------------------------------------------------------------------*/
704 static inline int32_t extractFloatx80Exp( floatx80 a )
707 return a.high & 0x7FFF;
711 /*----------------------------------------------------------------------------
712 | Returns the sign bit of the extended double-precision floating-point value
713 | `a'.
714 *----------------------------------------------------------------------------*/
716 static inline flag extractFloatx80Sign( floatx80 a )
719 return a.high>>15;
723 /*----------------------------------------------------------------------------
724 | Normalizes the subnormal extended double-precision floating-point value
725 | represented by the denormalized significand `aSig'. The normalized exponent
726 | and significand are stored at the locations pointed to by `zExpPtr' and
727 | `zSigPtr', respectively.
728 *----------------------------------------------------------------------------*/
730 static void
731 normalizeFloatx80Subnormal( uint64_t aSig, int32_t *zExpPtr, uint64_t *zSigPtr )
733 int8_t shiftCount;
735 shiftCount = countLeadingZeros64( aSig );
736 *zSigPtr = aSig<<shiftCount;
737 *zExpPtr = 1 - shiftCount;
741 /*----------------------------------------------------------------------------
742 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
743 | extended double-precision floating-point value, returning the result.
744 *----------------------------------------------------------------------------*/
746 static inline floatx80 packFloatx80( flag zSign, int32_t zExp, uint64_t zSig )
748 floatx80 z;
750 z.low = zSig;
751 z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
752 return z;
756 /*----------------------------------------------------------------------------
757 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
758 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
759 | and returns the proper extended double-precision floating-point value
760 | corresponding to the abstract input. Ordinarily, the abstract value is
761 | rounded and packed into the extended double-precision format, with the
762 | inexact exception raised if the abstract input cannot be represented
763 | exactly. However, if the abstract value is too large, the overflow and
764 | inexact exceptions are raised and an infinity or maximal finite value is
765 | returned. If the abstract value is too small, the input value is rounded to
766 | a subnormal number, and the underflow and inexact exceptions are raised if
767 | the abstract input cannot be represented exactly as a subnormal extended
768 | double-precision floating-point number.
769 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
770 | number of bits as single or double precision, respectively. Otherwise, the
771 | result is rounded to the full precision of the extended double-precision
772 | format.
773 | The input significand must be normalized or smaller. If the input
774 | significand is not normalized, `zExp' must be 0; in that case, the result
775 | returned is a subnormal number, and it must not require rounding. The
776 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
777 | Floating-Point Arithmetic.
778 *----------------------------------------------------------------------------*/
780 static floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
781 int32_t zExp, uint64_t zSig0, uint64_t zSig1,
782 float_status *status)
784 int8_t roundingMode;
785 flag roundNearestEven, increment, isTiny;
786 int64_t roundIncrement, roundMask, roundBits;
788 roundingMode = status->float_rounding_mode;
789 roundNearestEven = ( roundingMode == float_round_nearest_even );
790 if ( roundingPrecision == 80 ) goto precision80;
791 if ( roundingPrecision == 64 ) {
792 roundIncrement = LIT64( 0x0000000000000400 );
793 roundMask = LIT64( 0x00000000000007FF );
795 else if ( roundingPrecision == 32 ) {
796 roundIncrement = LIT64( 0x0000008000000000 );
797 roundMask = LIT64( 0x000000FFFFFFFFFF );
799 else {
800 goto precision80;
802 zSig0 |= ( zSig1 != 0 );
803 switch (roundingMode) {
804 case float_round_nearest_even:
805 case float_round_ties_away:
806 break;
807 case float_round_to_zero:
808 roundIncrement = 0;
809 break;
810 case float_round_up:
811 roundIncrement = zSign ? 0 : roundMask;
812 break;
813 case float_round_down:
814 roundIncrement = zSign ? roundMask : 0;
815 break;
816 default:
817 abort();
819 roundBits = zSig0 & roundMask;
820 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
821 if ( ( 0x7FFE < zExp )
822 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
824 goto overflow;
826 if ( zExp <= 0 ) {
827 if (status->flush_to_zero) {
828 float_raise(float_flag_output_denormal, status);
829 return packFloatx80(zSign, 0, 0);
831 isTiny =
832 (status->float_detect_tininess
833 == float_tininess_before_rounding)
834 || ( zExp < 0 )
835 || ( zSig0 <= zSig0 + roundIncrement );
836 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
837 zExp = 0;
838 roundBits = zSig0 & roundMask;
839 if (isTiny && roundBits) {
840 float_raise(float_flag_underflow, status);
842 if (roundBits) {
843 status->float_exception_flags |= float_flag_inexact;
845 zSig0 += roundIncrement;
846 if ( (int64_t) zSig0 < 0 ) zExp = 1;
847 roundIncrement = roundMask + 1;
848 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
849 roundMask |= roundIncrement;
851 zSig0 &= ~ roundMask;
852 return packFloatx80( zSign, zExp, zSig0 );
855 if (roundBits) {
856 status->float_exception_flags |= float_flag_inexact;
858 zSig0 += roundIncrement;
859 if ( zSig0 < roundIncrement ) {
860 ++zExp;
861 zSig0 = LIT64( 0x8000000000000000 );
863 roundIncrement = roundMask + 1;
864 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
865 roundMask |= roundIncrement;
867 zSig0 &= ~ roundMask;
868 if ( zSig0 == 0 ) zExp = 0;
869 return packFloatx80( zSign, zExp, zSig0 );
870 precision80:
871 switch (roundingMode) {
872 case float_round_nearest_even:
873 case float_round_ties_away:
874 increment = ((int64_t)zSig1 < 0);
875 break;
876 case float_round_to_zero:
877 increment = 0;
878 break;
879 case float_round_up:
880 increment = !zSign && zSig1;
881 break;
882 case float_round_down:
883 increment = zSign && zSig1;
884 break;
885 default:
886 abort();
888 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
889 if ( ( 0x7FFE < zExp )
890 || ( ( zExp == 0x7FFE )
891 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
892 && increment
895 roundMask = 0;
896 overflow:
897 float_raise(float_flag_overflow | float_flag_inexact, status);
898 if ( ( roundingMode == float_round_to_zero )
899 || ( zSign && ( roundingMode == float_round_up ) )
900 || ( ! zSign && ( roundingMode == float_round_down ) )
902 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
904 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
906 if ( zExp <= 0 ) {
907 isTiny =
908 (status->float_detect_tininess
909 == float_tininess_before_rounding)
910 || ( zExp < 0 )
911 || ! increment
912 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
913 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
914 zExp = 0;
915 if (isTiny && zSig1) {
916 float_raise(float_flag_underflow, status);
918 if (zSig1) {
919 status->float_exception_flags |= float_flag_inexact;
921 switch (roundingMode) {
922 case float_round_nearest_even:
923 case float_round_ties_away:
924 increment = ((int64_t)zSig1 < 0);
925 break;
926 case float_round_to_zero:
927 increment = 0;
928 break;
929 case float_round_up:
930 increment = !zSign && zSig1;
931 break;
932 case float_round_down:
933 increment = zSign && zSig1;
934 break;
935 default:
936 abort();
938 if ( increment ) {
939 ++zSig0;
940 zSig0 &=
941 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
942 if ( (int64_t) zSig0 < 0 ) zExp = 1;
944 return packFloatx80( zSign, zExp, zSig0 );
947 if (zSig1) {
948 status->float_exception_flags |= float_flag_inexact;
950 if ( increment ) {
951 ++zSig0;
952 if ( zSig0 == 0 ) {
953 ++zExp;
954 zSig0 = LIT64( 0x8000000000000000 );
956 else {
957 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
960 else {
961 if ( zSig0 == 0 ) zExp = 0;
963 return packFloatx80( zSign, zExp, zSig0 );
967 /*----------------------------------------------------------------------------
968 | Takes an abstract floating-point value having sign `zSign', exponent
969 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
970 | and returns the proper extended double-precision floating-point value
971 | corresponding to the abstract input. This routine is just like
972 | `roundAndPackFloatx80' except that the input significand does not have to be
973 | normalized.
974 *----------------------------------------------------------------------------*/
976 static floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
977 flag zSign, int32_t zExp,
978 uint64_t zSig0, uint64_t zSig1,
979 float_status *status)
981 int8_t shiftCount;
983 if ( zSig0 == 0 ) {
984 zSig0 = zSig1;
985 zSig1 = 0;
986 zExp -= 64;
988 shiftCount = countLeadingZeros64( zSig0 );
989 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
990 zExp -= shiftCount;
991 return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
992 zSig0, zSig1, status);
996 /*----------------------------------------------------------------------------
997 | Returns the least-significant 64 fraction bits of the quadruple-precision
998 | floating-point value `a'.
999 *----------------------------------------------------------------------------*/
1001 static inline uint64_t extractFloat128Frac1( float128 a )
1004 return a.low;
1008 /*----------------------------------------------------------------------------
1009 | Returns the most-significant 48 fraction bits of the quadruple-precision
1010 | floating-point value `a'.
1011 *----------------------------------------------------------------------------*/
1013 static inline uint64_t extractFloat128Frac0( float128 a )
1016 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
1020 /*----------------------------------------------------------------------------
1021 | Returns the exponent bits of the quadruple-precision floating-point value
1022 | `a'.
1023 *----------------------------------------------------------------------------*/
1025 static inline int32_t extractFloat128Exp( float128 a )
1028 return ( a.high>>48 ) & 0x7FFF;
1032 /*----------------------------------------------------------------------------
1033 | Returns the sign bit of the quadruple-precision floating-point value `a'.
1034 *----------------------------------------------------------------------------*/
1036 static inline flag extractFloat128Sign( float128 a )
1039 return a.high>>63;
1043 /*----------------------------------------------------------------------------
1044 | Normalizes the subnormal quadruple-precision floating-point value
1045 | represented by the denormalized significand formed by the concatenation of
1046 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
1047 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
1048 | significand are stored at the location pointed to by `zSig0Ptr', and the
1049 | least significant 64 bits of the normalized significand are stored at the
1050 | location pointed to by `zSig1Ptr'.
1051 *----------------------------------------------------------------------------*/
1053 static void
1054 normalizeFloat128Subnormal(
1055 uint64_t aSig0,
1056 uint64_t aSig1,
1057 int32_t *zExpPtr,
1058 uint64_t *zSig0Ptr,
1059 uint64_t *zSig1Ptr
1062 int8_t shiftCount;
1064 if ( aSig0 == 0 ) {
1065 shiftCount = countLeadingZeros64( aSig1 ) - 15;
1066 if ( shiftCount < 0 ) {
1067 *zSig0Ptr = aSig1>>( - shiftCount );
1068 *zSig1Ptr = aSig1<<( shiftCount & 63 );
1070 else {
1071 *zSig0Ptr = aSig1<<shiftCount;
1072 *zSig1Ptr = 0;
1074 *zExpPtr = - shiftCount - 63;
1076 else {
1077 shiftCount = countLeadingZeros64( aSig0 ) - 15;
1078 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
1079 *zExpPtr = 1 - shiftCount;
1084 /*----------------------------------------------------------------------------
1085 | Packs the sign `zSign', the exponent `zExp', and the significand formed
1086 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
1087 | floating-point value, returning the result. After being shifted into the
1088 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
1089 | added together to form the most significant 32 bits of the result. This
1090 | means that any integer portion of `zSig0' will be added into the exponent.
1091 | Since a properly normalized significand will have an integer portion equal
1092 | to 1, the `zExp' input should be 1 less than the desired result exponent
1093 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
1094 | significand.
1095 *----------------------------------------------------------------------------*/
1097 static inline float128
1098 packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
1100 float128 z;
1102 z.low = zSig1;
1103 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
1104 return z;
1108 /*----------------------------------------------------------------------------
1109 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1110 | and extended significand formed by the concatenation of `zSig0', `zSig1',
1111 | and `zSig2', and returns the proper quadruple-precision floating-point value
1112 | corresponding to the abstract input. Ordinarily, the abstract value is
1113 | simply rounded and packed into the quadruple-precision format, with the
1114 | inexact exception raised if the abstract input cannot be represented
1115 | exactly. However, if the abstract value is too large, the overflow and
1116 | inexact exceptions are raised and an infinity or maximal finite value is
1117 | returned. If the abstract value is too small, the input value is rounded to
1118 | a subnormal number, and the underflow and inexact exceptions are raised if
1119 | the abstract input cannot be represented exactly as a subnormal quadruple-
1120 | precision floating-point number.
1121 | The input significand must be normalized or smaller. If the input
1122 | significand is not normalized, `zExp' must be 0; in that case, the result
1123 | returned is a subnormal number, and it must not require rounding. In the
1124 | usual case that the input significand is normalized, `zExp' must be 1 less
1125 | than the ``true'' floating-point exponent. The handling of underflow and
1126 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1127 *----------------------------------------------------------------------------*/
1129 static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
1130 uint64_t zSig0, uint64_t zSig1,
1131 uint64_t zSig2, float_status *status)
1133 int8_t roundingMode;
1134 flag roundNearestEven, increment, isTiny;
1136 roundingMode = status->float_rounding_mode;
1137 roundNearestEven = ( roundingMode == float_round_nearest_even );
1138 switch (roundingMode) {
1139 case float_round_nearest_even:
1140 case float_round_ties_away:
1141 increment = ((int64_t)zSig2 < 0);
1142 break;
1143 case float_round_to_zero:
1144 increment = 0;
1145 break;
1146 case float_round_up:
1147 increment = !zSign && zSig2;
1148 break;
1149 case float_round_down:
1150 increment = zSign && zSig2;
1151 break;
1152 default:
1153 abort();
1155 if ( 0x7FFD <= (uint32_t) zExp ) {
1156 if ( ( 0x7FFD < zExp )
1157 || ( ( zExp == 0x7FFD )
1158 && eq128(
1159 LIT64( 0x0001FFFFFFFFFFFF ),
1160 LIT64( 0xFFFFFFFFFFFFFFFF ),
1161 zSig0,
1162 zSig1
1164 && increment
1167 float_raise(float_flag_overflow | float_flag_inexact, status);
1168 if ( ( roundingMode == float_round_to_zero )
1169 || ( zSign && ( roundingMode == float_round_up ) )
1170 || ( ! zSign && ( roundingMode == float_round_down ) )
1172 return
1173 packFloat128(
1174 zSign,
1175 0x7FFE,
1176 LIT64( 0x0000FFFFFFFFFFFF ),
1177 LIT64( 0xFFFFFFFFFFFFFFFF )
1180 return packFloat128( zSign, 0x7FFF, 0, 0 );
1182 if ( zExp < 0 ) {
1183 if (status->flush_to_zero) {
1184 float_raise(float_flag_output_denormal, status);
1185 return packFloat128(zSign, 0, 0, 0);
1187 isTiny =
1188 (status->float_detect_tininess
1189 == float_tininess_before_rounding)
1190 || ( zExp < -1 )
1191 || ! increment
1192 || lt128(
1193 zSig0,
1194 zSig1,
1195 LIT64( 0x0001FFFFFFFFFFFF ),
1196 LIT64( 0xFFFFFFFFFFFFFFFF )
1198 shift128ExtraRightJamming(
1199 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1200 zExp = 0;
1201 if (isTiny && zSig2) {
1202 float_raise(float_flag_underflow, status);
1204 switch (roundingMode) {
1205 case float_round_nearest_even:
1206 case float_round_ties_away:
1207 increment = ((int64_t)zSig2 < 0);
1208 break;
1209 case float_round_to_zero:
1210 increment = 0;
1211 break;
1212 case float_round_up:
1213 increment = !zSign && zSig2;
1214 break;
1215 case float_round_down:
1216 increment = zSign && zSig2;
1217 break;
1218 default:
1219 abort();
1223 if (zSig2) {
1224 status->float_exception_flags |= float_flag_inexact;
1226 if ( increment ) {
1227 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1228 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1230 else {
1231 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1233 return packFloat128( zSign, zExp, zSig0, zSig1 );
1237 /*----------------------------------------------------------------------------
1238 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1239 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1240 | returns the proper quadruple-precision floating-point value corresponding
1241 | to the abstract input. This routine is just like `roundAndPackFloat128'
1242 | except that the input significand has fewer bits and does not have to be
1243 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1244 | point exponent.
1245 *----------------------------------------------------------------------------*/
1247 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
1248 uint64_t zSig0, uint64_t zSig1,
1249 float_status *status)
1251 int8_t shiftCount;
1252 uint64_t zSig2;
1254 if ( zSig0 == 0 ) {
1255 zSig0 = zSig1;
1256 zSig1 = 0;
1257 zExp -= 64;
1259 shiftCount = countLeadingZeros64( zSig0 ) - 15;
1260 if ( 0 <= shiftCount ) {
1261 zSig2 = 0;
1262 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1264 else {
1265 shift128ExtraRightJamming(
1266 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1268 zExp -= shiftCount;
1269 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
1273 /*----------------------------------------------------------------------------
1274 | Returns the result of converting the 32-bit two's complement integer `a'
1275 | to the single-precision floating-point format. The conversion is performed
1276 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1277 *----------------------------------------------------------------------------*/
1279 float32 int32_to_float32(int32_t a, float_status *status)
1281 flag zSign;
1283 if ( a == 0 ) return float32_zero;
1284 if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1285 zSign = ( a < 0 );
1286 return normalizeRoundAndPackFloat32(zSign, 0x9C, zSign ? -a : a, status);
1289 /*----------------------------------------------------------------------------
1290 | Returns the result of converting the 32-bit two's complement integer `a'
1291 | to the double-precision floating-point format. The conversion is performed
1292 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1293 *----------------------------------------------------------------------------*/
1295 float64 int32_to_float64(int32_t a, float_status *status)
1297 flag zSign;
1298 uint32_t absA;
1299 int8_t shiftCount;
1300 uint64_t zSig;
1302 if ( a == 0 ) return float64_zero;
1303 zSign = ( a < 0 );
1304 absA = zSign ? - a : a;
1305 shiftCount = countLeadingZeros32( absA ) + 21;
1306 zSig = absA;
1307 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1311 /*----------------------------------------------------------------------------
1312 | Returns the result of converting the 32-bit two's complement integer `a'
1313 | to the extended double-precision floating-point format. The conversion
1314 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1315 | Arithmetic.
1316 *----------------------------------------------------------------------------*/
1318 floatx80 int32_to_floatx80(int32_t a, float_status *status)
1320 flag zSign;
1321 uint32_t absA;
1322 int8_t shiftCount;
1323 uint64_t zSig;
1325 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1326 zSign = ( a < 0 );
1327 absA = zSign ? - a : a;
1328 shiftCount = countLeadingZeros32( absA ) + 32;
1329 zSig = absA;
1330 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1334 /*----------------------------------------------------------------------------
1335 | Returns the result of converting the 32-bit two's complement integer `a' to
1336 | the quadruple-precision floating-point format. The conversion is performed
1337 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1338 *----------------------------------------------------------------------------*/
1340 float128 int32_to_float128(int32_t a, float_status *status)
1342 flag zSign;
1343 uint32_t absA;
1344 int8_t shiftCount;
1345 uint64_t zSig0;
1347 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1348 zSign = ( a < 0 );
1349 absA = zSign ? - a : a;
1350 shiftCount = countLeadingZeros32( absA ) + 17;
1351 zSig0 = absA;
1352 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1356 /*----------------------------------------------------------------------------
1357 | Returns the result of converting the 64-bit two's complement integer `a'
1358 | to the single-precision floating-point format. The conversion is performed
1359 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1360 *----------------------------------------------------------------------------*/
1362 float32 int64_to_float32(int64_t a, float_status *status)
1364 flag zSign;
1365 uint64_t absA;
1366 int8_t shiftCount;
1368 if ( a == 0 ) return float32_zero;
1369 zSign = ( a < 0 );
1370 absA = zSign ? - a : a;
1371 shiftCount = countLeadingZeros64( absA ) - 40;
1372 if ( 0 <= shiftCount ) {
1373 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1375 else {
1376 shiftCount += 7;
1377 if ( shiftCount < 0 ) {
1378 shift64RightJamming( absA, - shiftCount, &absA );
1380 else {
1381 absA <<= shiftCount;
1383 return roundAndPackFloat32(zSign, 0x9C - shiftCount, absA, status);
1388 /*----------------------------------------------------------------------------
1389 | Returns the result of converting the 64-bit two's complement integer `a'
1390 | to the double-precision floating-point format. The conversion is performed
1391 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1392 *----------------------------------------------------------------------------*/
1394 float64 int64_to_float64(int64_t a, float_status *status)
1396 flag zSign;
1398 if ( a == 0 ) return float64_zero;
1399 if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
1400 return packFloat64( 1, 0x43E, 0 );
1402 zSign = ( a < 0 );
1403 return normalizeRoundAndPackFloat64(zSign, 0x43C, zSign ? -a : a, status);
1406 /*----------------------------------------------------------------------------
1407 | Returns the result of converting the 64-bit two's complement integer `a'
1408 | to the extended double-precision floating-point format. The conversion
1409 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1410 | Arithmetic.
1411 *----------------------------------------------------------------------------*/
1413 floatx80 int64_to_floatx80(int64_t a, float_status *status)
1415 flag zSign;
1416 uint64_t absA;
1417 int8_t shiftCount;
1419 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1420 zSign = ( a < 0 );
1421 absA = zSign ? - a : a;
1422 shiftCount = countLeadingZeros64( absA );
1423 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1427 /*----------------------------------------------------------------------------
1428 | Returns the result of converting the 64-bit two's complement integer `a' to
1429 | the quadruple-precision floating-point format. The conversion is performed
1430 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1431 *----------------------------------------------------------------------------*/
1433 float128 int64_to_float128(int64_t a, float_status *status)
1435 flag zSign;
1436 uint64_t absA;
1437 int8_t shiftCount;
1438 int32_t zExp;
1439 uint64_t zSig0, zSig1;
1441 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1442 zSign = ( a < 0 );
1443 absA = zSign ? - a : a;
1444 shiftCount = countLeadingZeros64( absA ) + 49;
1445 zExp = 0x406E - shiftCount;
1446 if ( 64 <= shiftCount ) {
1447 zSig1 = 0;
1448 zSig0 = absA;
1449 shiftCount -= 64;
1451 else {
1452 zSig1 = absA;
1453 zSig0 = 0;
1455 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1456 return packFloat128( zSign, zExp, zSig0, zSig1 );
1460 /*----------------------------------------------------------------------------
1461 | Returns the result of converting the 64-bit unsigned integer `a'
1462 | to the single-precision floating-point format. The conversion is performed
1463 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1464 *----------------------------------------------------------------------------*/
1466 float32 uint64_to_float32(uint64_t a, float_status *status)
1468 int shiftcount;
1470 if (a == 0) {
1471 return float32_zero;
1474 /* Determine (left) shift needed to put first set bit into bit posn 23
1475 * (since packFloat32() expects the binary point between bits 23 and 22);
1476 * this is the fast case for smallish numbers.
1478 shiftcount = countLeadingZeros64(a) - 40;
1479 if (shiftcount >= 0) {
1480 return packFloat32(0, 0x95 - shiftcount, a << shiftcount);
1482 /* Otherwise we need to do a round-and-pack. roundAndPackFloat32()
1483 * expects the binary point between bits 30 and 29, hence the + 7.
1485 shiftcount += 7;
1486 if (shiftcount < 0) {
1487 shift64RightJamming(a, -shiftcount, &a);
1488 } else {
1489 a <<= shiftcount;
1492 return roundAndPackFloat32(0, 0x9c - shiftcount, a, status);
1495 /*----------------------------------------------------------------------------
1496 | Returns the result of converting the 64-bit unsigned integer `a'
1497 | to the double-precision floating-point format. The conversion is performed
1498 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1499 *----------------------------------------------------------------------------*/
1501 float64 uint64_to_float64(uint64_t a, float_status *status)
1503 int exp = 0x43C;
1504 int shiftcount;
1506 if (a == 0) {
1507 return float64_zero;
1510 shiftcount = countLeadingZeros64(a) - 1;
1511 if (shiftcount < 0) {
1512 shift64RightJamming(a, -shiftcount, &a);
1513 } else {
1514 a <<= shiftcount;
1516 return roundAndPackFloat64(0, exp - shiftcount, a, status);
1519 /*----------------------------------------------------------------------------
1520 | Returns the result of converting the 64-bit unsigned integer `a'
1521 | to the quadruple-precision floating-point format. The conversion is performed
1522 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1523 *----------------------------------------------------------------------------*/
1525 float128 uint64_to_float128(uint64_t a, float_status *status)
1527 if (a == 0) {
1528 return float128_zero;
1530 return normalizeRoundAndPackFloat128(0, 0x406E, a, 0, status);
1533 /*----------------------------------------------------------------------------
1534 | Returns the result of converting the single-precision floating-point value
1535 | `a' to the 32-bit two's complement integer format. The conversion is
1536 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1537 | Arithmetic---which means in particular that the conversion is rounded
1538 | according to the current rounding mode. If `a' is a NaN, the largest
1539 | positive integer is returned. Otherwise, if the conversion overflows, the
1540 | largest integer with the same sign as `a' is returned.
1541 *----------------------------------------------------------------------------*/
1543 int32_t float32_to_int32(float32 a, float_status *status)
1545 flag aSign;
1546 int_fast16_t aExp, shiftCount;
1547 uint32_t aSig;
1548 uint64_t aSig64;
1550 a = float32_squash_input_denormal(a, status);
1551 aSig = extractFloat32Frac( a );
1552 aExp = extractFloat32Exp( a );
1553 aSign = extractFloat32Sign( a );
1554 if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1555 if ( aExp ) aSig |= 0x00800000;
1556 shiftCount = 0xAF - aExp;
1557 aSig64 = aSig;
1558 aSig64 <<= 32;
1559 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1560 return roundAndPackInt32(aSign, aSig64, status);
1564 /*----------------------------------------------------------------------------
1565 | Returns the result of converting the single-precision floating-point value
1566 | `a' to the 32-bit two's complement integer format. The conversion is
1567 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1568 | Arithmetic, except that the conversion is always rounded toward zero.
1569 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1570 | the conversion overflows, the largest integer with the same sign as `a' is
1571 | returned.
1572 *----------------------------------------------------------------------------*/
1574 int32_t float32_to_int32_round_to_zero(float32 a, float_status *status)
1576 flag aSign;
1577 int_fast16_t aExp, shiftCount;
1578 uint32_t aSig;
1579 int32_t z;
1580 a = float32_squash_input_denormal(a, status);
1582 aSig = extractFloat32Frac( a );
1583 aExp = extractFloat32Exp( a );
1584 aSign = extractFloat32Sign( a );
1585 shiftCount = aExp - 0x9E;
1586 if ( 0 <= shiftCount ) {
1587 if ( float32_val(a) != 0xCF000000 ) {
1588 float_raise(float_flag_invalid, status);
1589 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1591 return (int32_t) 0x80000000;
1593 else if ( aExp <= 0x7E ) {
1594 if (aExp | aSig) {
1595 status->float_exception_flags |= float_flag_inexact;
1597 return 0;
1599 aSig = ( aSig | 0x00800000 )<<8;
1600 z = aSig>>( - shiftCount );
1601 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1602 status->float_exception_flags |= float_flag_inexact;
1604 if ( aSign ) z = - z;
1605 return z;
1609 /*----------------------------------------------------------------------------
1610 | Returns the result of converting the single-precision floating-point value
1611 | `a' to the 16-bit two's complement integer format. The conversion is
1612 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1613 | Arithmetic, except that the conversion is always rounded toward zero.
1614 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1615 | the conversion overflows, the largest integer with the same sign as `a' is
1616 | returned.
1617 *----------------------------------------------------------------------------*/
1619 int_fast16_t float32_to_int16_round_to_zero(float32 a, float_status *status)
1621 flag aSign;
1622 int_fast16_t aExp, shiftCount;
1623 uint32_t aSig;
1624 int32_t z;
1626 aSig = extractFloat32Frac( a );
1627 aExp = extractFloat32Exp( a );
1628 aSign = extractFloat32Sign( a );
1629 shiftCount = aExp - 0x8E;
1630 if ( 0 <= shiftCount ) {
1631 if ( float32_val(a) != 0xC7000000 ) {
1632 float_raise(float_flag_invalid, status);
1633 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1634 return 0x7FFF;
1637 return (int32_t) 0xffff8000;
1639 else if ( aExp <= 0x7E ) {
1640 if ( aExp | aSig ) {
1641 status->float_exception_flags |= float_flag_inexact;
1643 return 0;
1645 shiftCount -= 0x10;
1646 aSig = ( aSig | 0x00800000 )<<8;
1647 z = aSig>>( - shiftCount );
1648 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1649 status->float_exception_flags |= float_flag_inexact;
1651 if ( aSign ) {
1652 z = - z;
1654 return z;
1658 /*----------------------------------------------------------------------------
1659 | Returns the result of converting the single-precision floating-point value
1660 | `a' to the 64-bit two's complement integer format. The conversion is
1661 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1662 | Arithmetic---which means in particular that the conversion is rounded
1663 | according to the current rounding mode. If `a' is a NaN, the largest
1664 | positive integer is returned. Otherwise, if the conversion overflows, the
1665 | largest integer with the same sign as `a' is returned.
1666 *----------------------------------------------------------------------------*/
1668 int64_t float32_to_int64(float32 a, float_status *status)
1670 flag aSign;
1671 int_fast16_t aExp, shiftCount;
1672 uint32_t aSig;
1673 uint64_t aSig64, aSigExtra;
1674 a = float32_squash_input_denormal(a, status);
1676 aSig = extractFloat32Frac( a );
1677 aExp = extractFloat32Exp( a );
1678 aSign = extractFloat32Sign( a );
1679 shiftCount = 0xBE - aExp;
1680 if ( shiftCount < 0 ) {
1681 float_raise(float_flag_invalid, status);
1682 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1683 return LIT64( 0x7FFFFFFFFFFFFFFF );
1685 return (int64_t) LIT64( 0x8000000000000000 );
1687 if ( aExp ) aSig |= 0x00800000;
1688 aSig64 = aSig;
1689 aSig64 <<= 40;
1690 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1691 return roundAndPackInt64(aSign, aSig64, aSigExtra, status);
1695 /*----------------------------------------------------------------------------
1696 | Returns the result of converting the single-precision floating-point value
1697 | `a' to the 64-bit unsigned integer format. The conversion is
1698 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1699 | Arithmetic---which means in particular that the conversion is rounded
1700 | according to the current rounding mode. If `a' is a NaN, the largest
1701 | unsigned integer is returned. Otherwise, if the conversion overflows, the
1702 | largest unsigned integer is returned. If the 'a' is negative, the result
1703 | is rounded and zero is returned; values that do not round to zero will
1704 | raise the inexact exception flag.
1705 *----------------------------------------------------------------------------*/
1707 uint64_t float32_to_uint64(float32 a, float_status *status)
1709 flag aSign;
1710 int_fast16_t aExp, shiftCount;
1711 uint32_t aSig;
1712 uint64_t aSig64, aSigExtra;
1713 a = float32_squash_input_denormal(a, status);
1715 aSig = extractFloat32Frac(a);
1716 aExp = extractFloat32Exp(a);
1717 aSign = extractFloat32Sign(a);
1718 if ((aSign) && (aExp > 126)) {
1719 float_raise(float_flag_invalid, status);
1720 if (float32_is_any_nan(a)) {
1721 return LIT64(0xFFFFFFFFFFFFFFFF);
1722 } else {
1723 return 0;
1726 shiftCount = 0xBE - aExp;
1727 if (aExp) {
1728 aSig |= 0x00800000;
1730 if (shiftCount < 0) {
1731 float_raise(float_flag_invalid, status);
1732 return LIT64(0xFFFFFFFFFFFFFFFF);
1735 aSig64 = aSig;
1736 aSig64 <<= 40;
1737 shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra);
1738 return roundAndPackUint64(aSign, aSig64, aSigExtra, status);
1741 /*----------------------------------------------------------------------------
1742 | Returns the result of converting the single-precision floating-point value
1743 | `a' to the 64-bit unsigned integer format. The conversion is
1744 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1745 | Arithmetic, except that the conversion is always rounded toward zero. If
1746 | `a' is a NaN, the largest unsigned integer is returned. Otherwise, if the
1747 | conversion overflows, the largest unsigned integer is returned. If the
1748 | 'a' is negative, the result is rounded and zero is returned; values that do
1749 | not round to zero will raise the inexact flag.
1750 *----------------------------------------------------------------------------*/
1752 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *status)
1754 signed char current_rounding_mode = status->float_rounding_mode;
1755 set_float_rounding_mode(float_round_to_zero, status);
1756 int64_t v = float32_to_uint64(a, status);
1757 set_float_rounding_mode(current_rounding_mode, status);
1758 return v;
1761 /*----------------------------------------------------------------------------
1762 | Returns the result of converting the single-precision floating-point value
1763 | `a' to the 64-bit two's complement integer format. The conversion is
1764 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1765 | Arithmetic, except that the conversion is always rounded toward zero. If
1766 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1767 | conversion overflows, the largest integer with the same sign as `a' is
1768 | returned.
1769 *----------------------------------------------------------------------------*/
1771 int64_t float32_to_int64_round_to_zero(float32 a, float_status *status)
1773 flag aSign;
1774 int_fast16_t aExp, shiftCount;
1775 uint32_t aSig;
1776 uint64_t aSig64;
1777 int64_t z;
1778 a = float32_squash_input_denormal(a, status);
1780 aSig = extractFloat32Frac( a );
1781 aExp = extractFloat32Exp( a );
1782 aSign = extractFloat32Sign( a );
1783 shiftCount = aExp - 0xBE;
1784 if ( 0 <= shiftCount ) {
1785 if ( float32_val(a) != 0xDF000000 ) {
1786 float_raise(float_flag_invalid, status);
1787 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1788 return LIT64( 0x7FFFFFFFFFFFFFFF );
1791 return (int64_t) LIT64( 0x8000000000000000 );
1793 else if ( aExp <= 0x7E ) {
1794 if (aExp | aSig) {
1795 status->float_exception_flags |= float_flag_inexact;
1797 return 0;
1799 aSig64 = aSig | 0x00800000;
1800 aSig64 <<= 40;
1801 z = aSig64>>( - shiftCount );
1802 if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
1803 status->float_exception_flags |= float_flag_inexact;
1805 if ( aSign ) z = - z;
1806 return z;
1810 /*----------------------------------------------------------------------------
1811 | Returns the result of converting the single-precision floating-point value
1812 | `a' to the double-precision floating-point format. The conversion is
1813 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1814 | Arithmetic.
1815 *----------------------------------------------------------------------------*/
1817 float64 float32_to_float64(float32 a, float_status *status)
1819 flag aSign;
1820 int_fast16_t aExp;
1821 uint32_t aSig;
1822 a = float32_squash_input_denormal(a, status);
1824 aSig = extractFloat32Frac( a );
1825 aExp = extractFloat32Exp( a );
1826 aSign = extractFloat32Sign( a );
1827 if ( aExp == 0xFF ) {
1828 if (aSig) {
1829 return commonNaNToFloat64(float32ToCommonNaN(a, status), status);
1831 return packFloat64( aSign, 0x7FF, 0 );
1833 if ( aExp == 0 ) {
1834 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1835 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1836 --aExp;
1838 return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
1842 /*----------------------------------------------------------------------------
1843 | Returns the result of converting the single-precision floating-point value
1844 | `a' to the extended double-precision floating-point format. The conversion
1845 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1846 | Arithmetic.
1847 *----------------------------------------------------------------------------*/
1849 floatx80 float32_to_floatx80(float32 a, float_status *status)
1851 flag aSign;
1852 int_fast16_t aExp;
1853 uint32_t aSig;
1855 a = float32_squash_input_denormal(a, status);
1856 aSig = extractFloat32Frac( a );
1857 aExp = extractFloat32Exp( a );
1858 aSign = extractFloat32Sign( a );
1859 if ( aExp == 0xFF ) {
1860 if (aSig) {
1861 return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
1863 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1865 if ( aExp == 0 ) {
1866 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1867 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1869 aSig |= 0x00800000;
1870 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
1874 /*----------------------------------------------------------------------------
1875 | Returns the result of converting the single-precision floating-point value
1876 | `a' to the double-precision floating-point format. The conversion is
1877 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1878 | Arithmetic.
1879 *----------------------------------------------------------------------------*/
1881 float128 float32_to_float128(float32 a, float_status *status)
1883 flag aSign;
1884 int_fast16_t aExp;
1885 uint32_t aSig;
1887 a = float32_squash_input_denormal(a, status);
1888 aSig = extractFloat32Frac( a );
1889 aExp = extractFloat32Exp( a );
1890 aSign = extractFloat32Sign( a );
1891 if ( aExp == 0xFF ) {
1892 if (aSig) {
1893 return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
1895 return packFloat128( aSign, 0x7FFF, 0, 0 );
1897 if ( aExp == 0 ) {
1898 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1899 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1900 --aExp;
1902 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
1906 /*----------------------------------------------------------------------------
1907 | Rounds the single-precision floating-point value `a' to an integer, and
1908 | returns the result as a single-precision floating-point value. The
1909 | operation is performed according to the IEC/IEEE Standard for Binary
1910 | Floating-Point Arithmetic.
1911 *----------------------------------------------------------------------------*/
1913 float32 float32_round_to_int(float32 a, float_status *status)
1915 flag aSign;
1916 int_fast16_t aExp;
1917 uint32_t lastBitMask, roundBitsMask;
1918 uint32_t z;
1919 a = float32_squash_input_denormal(a, status);
1921 aExp = extractFloat32Exp( a );
1922 if ( 0x96 <= aExp ) {
1923 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1924 return propagateFloat32NaN(a, a, status);
1926 return a;
1928 if ( aExp <= 0x7E ) {
1929 if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
1930 status->float_exception_flags |= float_flag_inexact;
1931 aSign = extractFloat32Sign( a );
1932 switch (status->float_rounding_mode) {
1933 case float_round_nearest_even:
1934 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1935 return packFloat32( aSign, 0x7F, 0 );
1937 break;
1938 case float_round_ties_away:
1939 if (aExp == 0x7E) {
1940 return packFloat32(aSign, 0x7F, 0);
1942 break;
1943 case float_round_down:
1944 return make_float32(aSign ? 0xBF800000 : 0);
1945 case float_round_up:
1946 return make_float32(aSign ? 0x80000000 : 0x3F800000);
1948 return packFloat32( aSign, 0, 0 );
1950 lastBitMask = 1;
1951 lastBitMask <<= 0x96 - aExp;
1952 roundBitsMask = lastBitMask - 1;
1953 z = float32_val(a);
1954 switch (status->float_rounding_mode) {
1955 case float_round_nearest_even:
1956 z += lastBitMask>>1;
1957 if ((z & roundBitsMask) == 0) {
1958 z &= ~lastBitMask;
1960 break;
1961 case float_round_ties_away:
1962 z += lastBitMask >> 1;
1963 break;
1964 case float_round_to_zero:
1965 break;
1966 case float_round_up:
1967 if (!extractFloat32Sign(make_float32(z))) {
1968 z += roundBitsMask;
1970 break;
1971 case float_round_down:
1972 if (extractFloat32Sign(make_float32(z))) {
1973 z += roundBitsMask;
1975 break;
1976 default:
1977 abort();
1979 z &= ~ roundBitsMask;
1980 if (z != float32_val(a)) {
1981 status->float_exception_flags |= float_flag_inexact;
1983 return make_float32(z);
1987 /*----------------------------------------------------------------------------
1988 | Returns the result of adding the absolute values of the single-precision
1989 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1990 | before being returned. `zSign' is ignored if the result is a NaN.
1991 | The addition is performed according to the IEC/IEEE Standard for Binary
1992 | Floating-Point Arithmetic.
1993 *----------------------------------------------------------------------------*/
1995 static float32 addFloat32Sigs(float32 a, float32 b, flag zSign,
1996 float_status *status)
1998 int_fast16_t aExp, bExp, zExp;
1999 uint32_t aSig, bSig, zSig;
2000 int_fast16_t expDiff;
2002 aSig = extractFloat32Frac( a );
2003 aExp = extractFloat32Exp( a );
2004 bSig = extractFloat32Frac( b );
2005 bExp = extractFloat32Exp( b );
2006 expDiff = aExp - bExp;
2007 aSig <<= 6;
2008 bSig <<= 6;
2009 if ( 0 < expDiff ) {
2010 if ( aExp == 0xFF ) {
2011 if (aSig) {
2012 return propagateFloat32NaN(a, b, status);
2014 return a;
2016 if ( bExp == 0 ) {
2017 --expDiff;
2019 else {
2020 bSig |= 0x20000000;
2022 shift32RightJamming( bSig, expDiff, &bSig );
2023 zExp = aExp;
2025 else if ( expDiff < 0 ) {
2026 if ( bExp == 0xFF ) {
2027 if (bSig) {
2028 return propagateFloat32NaN(a, b, status);
2030 return packFloat32( zSign, 0xFF, 0 );
2032 if ( aExp == 0 ) {
2033 ++expDiff;
2035 else {
2036 aSig |= 0x20000000;
2038 shift32RightJamming( aSig, - expDiff, &aSig );
2039 zExp = bExp;
2041 else {
2042 if ( aExp == 0xFF ) {
2043 if (aSig | bSig) {
2044 return propagateFloat32NaN(a, b, status);
2046 return a;
2048 if ( aExp == 0 ) {
2049 if (status->flush_to_zero) {
2050 if (aSig | bSig) {
2051 float_raise(float_flag_output_denormal, status);
2053 return packFloat32(zSign, 0, 0);
2055 return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
2057 zSig = 0x40000000 + aSig + bSig;
2058 zExp = aExp;
2059 goto roundAndPack;
2061 aSig |= 0x20000000;
2062 zSig = ( aSig + bSig )<<1;
2063 --zExp;
2064 if ( (int32_t) zSig < 0 ) {
2065 zSig = aSig + bSig;
2066 ++zExp;
2068 roundAndPack:
2069 return roundAndPackFloat32(zSign, zExp, zSig, status);
2073 /*----------------------------------------------------------------------------
2074 | Returns the result of subtracting the absolute values of the single-
2075 | precision floating-point values `a' and `b'. If `zSign' is 1, the
2076 | difference is negated before being returned. `zSign' is ignored if the
2077 | result is a NaN. The subtraction is performed according to the IEC/IEEE
2078 | Standard for Binary Floating-Point Arithmetic.
2079 *----------------------------------------------------------------------------*/
2081 static float32 subFloat32Sigs(float32 a, float32 b, flag zSign,
2082 float_status *status)
2084 int_fast16_t aExp, bExp, zExp;
2085 uint32_t aSig, bSig, zSig;
2086 int_fast16_t expDiff;
2088 aSig = extractFloat32Frac( a );
2089 aExp = extractFloat32Exp( a );
2090 bSig = extractFloat32Frac( b );
2091 bExp = extractFloat32Exp( b );
2092 expDiff = aExp - bExp;
2093 aSig <<= 7;
2094 bSig <<= 7;
2095 if ( 0 < expDiff ) goto aExpBigger;
2096 if ( expDiff < 0 ) goto bExpBigger;
2097 if ( aExp == 0xFF ) {
2098 if (aSig | bSig) {
2099 return propagateFloat32NaN(a, b, status);
2101 float_raise(float_flag_invalid, status);
2102 return float32_default_nan;
2104 if ( aExp == 0 ) {
2105 aExp = 1;
2106 bExp = 1;
2108 if ( bSig < aSig ) goto aBigger;
2109 if ( aSig < bSig ) goto bBigger;
2110 return packFloat32(status->float_rounding_mode == float_round_down, 0, 0);
2111 bExpBigger:
2112 if ( bExp == 0xFF ) {
2113 if (bSig) {
2114 return propagateFloat32NaN(a, b, status);
2116 return packFloat32( zSign ^ 1, 0xFF, 0 );
2118 if ( aExp == 0 ) {
2119 ++expDiff;
2121 else {
2122 aSig |= 0x40000000;
2124 shift32RightJamming( aSig, - expDiff, &aSig );
2125 bSig |= 0x40000000;
2126 bBigger:
2127 zSig = bSig - aSig;
2128 zExp = bExp;
2129 zSign ^= 1;
2130 goto normalizeRoundAndPack;
2131 aExpBigger:
2132 if ( aExp == 0xFF ) {
2133 if (aSig) {
2134 return propagateFloat32NaN(a, b, status);
2136 return a;
2138 if ( bExp == 0 ) {
2139 --expDiff;
2141 else {
2142 bSig |= 0x40000000;
2144 shift32RightJamming( bSig, expDiff, &bSig );
2145 aSig |= 0x40000000;
2146 aBigger:
2147 zSig = aSig - bSig;
2148 zExp = aExp;
2149 normalizeRoundAndPack:
2150 --zExp;
2151 return normalizeRoundAndPackFloat32(zSign, zExp, zSig, status);
2155 /*----------------------------------------------------------------------------
2156 | Returns the result of adding the single-precision floating-point values `a'
2157 | and `b'. The operation is performed according to the IEC/IEEE Standard for
2158 | Binary Floating-Point Arithmetic.
2159 *----------------------------------------------------------------------------*/
2161 float32 float32_add(float32 a, float32 b, float_status *status)
2163 flag aSign, bSign;
2164 a = float32_squash_input_denormal(a, status);
2165 b = float32_squash_input_denormal(b, status);
2167 aSign = extractFloat32Sign( a );
2168 bSign = extractFloat32Sign( b );
2169 if ( aSign == bSign ) {
2170 return addFloat32Sigs(a, b, aSign, status);
2172 else {
2173 return subFloat32Sigs(a, b, aSign, status);
2178 /*----------------------------------------------------------------------------
2179 | Returns the result of subtracting the single-precision floating-point values
2180 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2181 | for Binary Floating-Point Arithmetic.
2182 *----------------------------------------------------------------------------*/
2184 float32 float32_sub(float32 a, float32 b, float_status *status)
2186 flag aSign, bSign;
2187 a = float32_squash_input_denormal(a, status);
2188 b = float32_squash_input_denormal(b, status);
2190 aSign = extractFloat32Sign( a );
2191 bSign = extractFloat32Sign( b );
2192 if ( aSign == bSign ) {
2193 return subFloat32Sigs(a, b, aSign, status);
2195 else {
2196 return addFloat32Sigs(a, b, aSign, status);
2201 /*----------------------------------------------------------------------------
2202 | Returns the result of multiplying the single-precision floating-point values
2203 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2204 | for Binary Floating-Point Arithmetic.
2205 *----------------------------------------------------------------------------*/
2207 float32 float32_mul(float32 a, float32 b, float_status *status)
2209 flag aSign, bSign, zSign;
2210 int_fast16_t aExp, bExp, zExp;
2211 uint32_t aSig, bSig;
2212 uint64_t zSig64;
2213 uint32_t zSig;
2215 a = float32_squash_input_denormal(a, status);
2216 b = float32_squash_input_denormal(b, status);
2218 aSig = extractFloat32Frac( a );
2219 aExp = extractFloat32Exp( a );
2220 aSign = extractFloat32Sign( a );
2221 bSig = extractFloat32Frac( b );
2222 bExp = extractFloat32Exp( b );
2223 bSign = extractFloat32Sign( b );
2224 zSign = aSign ^ bSign;
2225 if ( aExp == 0xFF ) {
2226 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2227 return propagateFloat32NaN(a, b, status);
2229 if ( ( bExp | bSig ) == 0 ) {
2230 float_raise(float_flag_invalid, status);
2231 return float32_default_nan;
2233 return packFloat32( zSign, 0xFF, 0 );
2235 if ( bExp == 0xFF ) {
2236 if (bSig) {
2237 return propagateFloat32NaN(a, b, status);
2239 if ( ( aExp | aSig ) == 0 ) {
2240 float_raise(float_flag_invalid, status);
2241 return float32_default_nan;
2243 return packFloat32( zSign, 0xFF, 0 );
2245 if ( aExp == 0 ) {
2246 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2247 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2249 if ( bExp == 0 ) {
2250 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
2251 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2253 zExp = aExp + bExp - 0x7F;
2254 aSig = ( aSig | 0x00800000 )<<7;
2255 bSig = ( bSig | 0x00800000 )<<8;
2256 shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
2257 zSig = zSig64;
2258 if ( 0 <= (int32_t) ( zSig<<1 ) ) {
2259 zSig <<= 1;
2260 --zExp;
2262 return roundAndPackFloat32(zSign, zExp, zSig, status);
2266 /*----------------------------------------------------------------------------
2267 | Returns the result of dividing the single-precision floating-point value `a'
2268 | by the corresponding value `b'. The operation is performed according to the
2269 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2270 *----------------------------------------------------------------------------*/
2272 float32 float32_div(float32 a, float32 b, float_status *status)
2274 flag aSign, bSign, zSign;
2275 int_fast16_t aExp, bExp, zExp;
2276 uint32_t aSig, bSig, zSig;
2277 a = float32_squash_input_denormal(a, status);
2278 b = float32_squash_input_denormal(b, status);
2280 aSig = extractFloat32Frac( a );
2281 aExp = extractFloat32Exp( a );
2282 aSign = extractFloat32Sign( a );
2283 bSig = extractFloat32Frac( b );
2284 bExp = extractFloat32Exp( b );
2285 bSign = extractFloat32Sign( b );
2286 zSign = aSign ^ bSign;
2287 if ( aExp == 0xFF ) {
2288 if (aSig) {
2289 return propagateFloat32NaN(a, b, status);
2291 if ( bExp == 0xFF ) {
2292 if (bSig) {
2293 return propagateFloat32NaN(a, b, status);
2295 float_raise(float_flag_invalid, status);
2296 return float32_default_nan;
2298 return packFloat32( zSign, 0xFF, 0 );
2300 if ( bExp == 0xFF ) {
2301 if (bSig) {
2302 return propagateFloat32NaN(a, b, status);
2304 return packFloat32( zSign, 0, 0 );
2306 if ( bExp == 0 ) {
2307 if ( bSig == 0 ) {
2308 if ( ( aExp | aSig ) == 0 ) {
2309 float_raise(float_flag_invalid, status);
2310 return float32_default_nan;
2312 float_raise(float_flag_divbyzero, status);
2313 return packFloat32( zSign, 0xFF, 0 );
2315 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2317 if ( aExp == 0 ) {
2318 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2319 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2321 zExp = aExp - bExp + 0x7D;
2322 aSig = ( aSig | 0x00800000 )<<7;
2323 bSig = ( bSig | 0x00800000 )<<8;
2324 if ( bSig <= ( aSig + aSig ) ) {
2325 aSig >>= 1;
2326 ++zExp;
2328 zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
2329 if ( ( zSig & 0x3F ) == 0 ) {
2330 zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
2332 return roundAndPackFloat32(zSign, zExp, zSig, status);
2336 /*----------------------------------------------------------------------------
2337 | Returns the remainder of the single-precision floating-point value `a'
2338 | with respect to the corresponding value `b'. The operation is performed
2339 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2340 *----------------------------------------------------------------------------*/
2342 float32 float32_rem(float32 a, float32 b, float_status *status)
2344 flag aSign, zSign;
2345 int_fast16_t aExp, bExp, expDiff;
2346 uint32_t aSig, bSig;
2347 uint32_t q;
2348 uint64_t aSig64, bSig64, q64;
2349 uint32_t alternateASig;
2350 int32_t sigMean;
2351 a = float32_squash_input_denormal(a, status);
2352 b = float32_squash_input_denormal(b, status);
2354 aSig = extractFloat32Frac( a );
2355 aExp = extractFloat32Exp( a );
2356 aSign = extractFloat32Sign( a );
2357 bSig = extractFloat32Frac( b );
2358 bExp = extractFloat32Exp( b );
2359 if ( aExp == 0xFF ) {
2360 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2361 return propagateFloat32NaN(a, b, status);
2363 float_raise(float_flag_invalid, status);
2364 return float32_default_nan;
2366 if ( bExp == 0xFF ) {
2367 if (bSig) {
2368 return propagateFloat32NaN(a, b, status);
2370 return a;
2372 if ( bExp == 0 ) {
2373 if ( bSig == 0 ) {
2374 float_raise(float_flag_invalid, status);
2375 return float32_default_nan;
2377 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2379 if ( aExp == 0 ) {
2380 if ( aSig == 0 ) return a;
2381 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2383 expDiff = aExp - bExp;
2384 aSig |= 0x00800000;
2385 bSig |= 0x00800000;
2386 if ( expDiff < 32 ) {
2387 aSig <<= 8;
2388 bSig <<= 8;
2389 if ( expDiff < 0 ) {
2390 if ( expDiff < -1 ) return a;
2391 aSig >>= 1;
2393 q = ( bSig <= aSig );
2394 if ( q ) aSig -= bSig;
2395 if ( 0 < expDiff ) {
2396 q = ( ( (uint64_t) aSig )<<32 ) / bSig;
2397 q >>= 32 - expDiff;
2398 bSig >>= 2;
2399 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2401 else {
2402 aSig >>= 2;
2403 bSig >>= 2;
2406 else {
2407 if ( bSig <= aSig ) aSig -= bSig;
2408 aSig64 = ( (uint64_t) aSig )<<40;
2409 bSig64 = ( (uint64_t) bSig )<<40;
2410 expDiff -= 64;
2411 while ( 0 < expDiff ) {
2412 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2413 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2414 aSig64 = - ( ( bSig * q64 )<<38 );
2415 expDiff -= 62;
2417 expDiff += 64;
2418 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2419 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2420 q = q64>>( 64 - expDiff );
2421 bSig <<= 6;
2422 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2424 do {
2425 alternateASig = aSig;
2426 ++q;
2427 aSig -= bSig;
2428 } while ( 0 <= (int32_t) aSig );
2429 sigMean = aSig + alternateASig;
2430 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2431 aSig = alternateASig;
2433 zSign = ( (int32_t) aSig < 0 );
2434 if ( zSign ) aSig = - aSig;
2435 return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
2438 /*----------------------------------------------------------------------------
2439 | Returns the result of multiplying the single-precision floating-point values
2440 | `a' and `b' then adding 'c', with no intermediate rounding step after the
2441 | multiplication. The operation is performed according to the IEC/IEEE
2442 | Standard for Binary Floating-Point Arithmetic 754-2008.
2443 | The flags argument allows the caller to select negation of the
2444 | addend, the intermediate product, or the final result. (The difference
2445 | between this and having the caller do a separate negation is that negating
2446 | externally will flip the sign bit on NaNs.)
2447 *----------------------------------------------------------------------------*/
2449 float32 float32_muladd(float32 a, float32 b, float32 c, int flags,
2450 float_status *status)
2452 flag aSign, bSign, cSign, zSign;
2453 int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
2454 uint32_t aSig, bSig, cSig;
2455 flag pInf, pZero, pSign;
2456 uint64_t pSig64, cSig64, zSig64;
2457 uint32_t pSig;
2458 int shiftcount;
2459 flag signflip, infzero;
2461 a = float32_squash_input_denormal(a, status);
2462 b = float32_squash_input_denormal(b, status);
2463 c = float32_squash_input_denormal(c, status);
2464 aSig = extractFloat32Frac(a);
2465 aExp = extractFloat32Exp(a);
2466 aSign = extractFloat32Sign(a);
2467 bSig = extractFloat32Frac(b);
2468 bExp = extractFloat32Exp(b);
2469 bSign = extractFloat32Sign(b);
2470 cSig = extractFloat32Frac(c);
2471 cExp = extractFloat32Exp(c);
2472 cSign = extractFloat32Sign(c);
2474 infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||
2475 (aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));
2477 /* It is implementation-defined whether the cases of (0,inf,qnan)
2478 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
2479 * they return if they do), so we have to hand this information
2480 * off to the target-specific pick-a-NaN routine.
2482 if (((aExp == 0xff) && aSig) ||
2483 ((bExp == 0xff) && bSig) ||
2484 ((cExp == 0xff) && cSig)) {
2485 return propagateFloat32MulAddNaN(a, b, c, infzero, status);
2488 if (infzero) {
2489 float_raise(float_flag_invalid, status);
2490 return float32_default_nan;
2493 if (flags & float_muladd_negate_c) {
2494 cSign ^= 1;
2497 signflip = (flags & float_muladd_negate_result) ? 1 : 0;
2499 /* Work out the sign and type of the product */
2500 pSign = aSign ^ bSign;
2501 if (flags & float_muladd_negate_product) {
2502 pSign ^= 1;
2504 pInf = (aExp == 0xff) || (bExp == 0xff);
2505 pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
2507 if (cExp == 0xff) {
2508 if (pInf && (pSign ^ cSign)) {
2509 /* addition of opposite-signed infinities => InvalidOperation */
2510 float_raise(float_flag_invalid, status);
2511 return float32_default_nan;
2513 /* Otherwise generate an infinity of the same sign */
2514 return packFloat32(cSign ^ signflip, 0xff, 0);
2517 if (pInf) {
2518 return packFloat32(pSign ^ signflip, 0xff, 0);
2521 if (pZero) {
2522 if (cExp == 0) {
2523 if (cSig == 0) {
2524 /* Adding two exact zeroes */
2525 if (pSign == cSign) {
2526 zSign = pSign;
2527 } else if (status->float_rounding_mode == float_round_down) {
2528 zSign = 1;
2529 } else {
2530 zSign = 0;
2532 return packFloat32(zSign ^ signflip, 0, 0);
2534 /* Exact zero plus a denorm */
2535 if (status->flush_to_zero) {
2536 float_raise(float_flag_output_denormal, status);
2537 return packFloat32(cSign ^ signflip, 0, 0);
2540 /* Zero plus something non-zero : just return the something */
2541 if (flags & float_muladd_halve_result) {
2542 if (cExp == 0) {
2543 normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2545 /* Subtract one to halve, and one again because roundAndPackFloat32
2546 * wants one less than the true exponent.
2548 cExp -= 2;
2549 cSig = (cSig | 0x00800000) << 7;
2550 return roundAndPackFloat32(cSign ^ signflip, cExp, cSig, status);
2552 return packFloat32(cSign ^ signflip, cExp, cSig);
2555 if (aExp == 0) {
2556 normalizeFloat32Subnormal(aSig, &aExp, &aSig);
2558 if (bExp == 0) {
2559 normalizeFloat32Subnormal(bSig, &bExp, &bSig);
2562 /* Calculate the actual result a * b + c */
2564 /* Multiply first; this is easy. */
2565 /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
2566 * because we want the true exponent, not the "one-less-than"
2567 * flavour that roundAndPackFloat32() takes.
2569 pExp = aExp + bExp - 0x7e;
2570 aSig = (aSig | 0x00800000) << 7;
2571 bSig = (bSig | 0x00800000) << 8;
2572 pSig64 = (uint64_t)aSig * bSig;
2573 if ((int64_t)(pSig64 << 1) >= 0) {
2574 pSig64 <<= 1;
2575 pExp--;
2578 zSign = pSign ^ signflip;
2580 /* Now pSig64 is the significand of the multiply, with the explicit bit in
2581 * position 62.
2583 if (cExp == 0) {
2584 if (!cSig) {
2585 /* Throw out the special case of c being an exact zero now */
2586 shift64RightJamming(pSig64, 32, &pSig64);
2587 pSig = pSig64;
2588 if (flags & float_muladd_halve_result) {
2589 pExp--;
2591 return roundAndPackFloat32(zSign, pExp - 1,
2592 pSig, status);
2594 normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2597 cSig64 = (uint64_t)cSig << (62 - 23);
2598 cSig64 |= LIT64(0x4000000000000000);
2599 expDiff = pExp - cExp;
2601 if (pSign == cSign) {
2602 /* Addition */
2603 if (expDiff > 0) {
2604 /* scale c to match p */
2605 shift64RightJamming(cSig64, expDiff, &cSig64);
2606 zExp = pExp;
2607 } else if (expDiff < 0) {
2608 /* scale p to match c */
2609 shift64RightJamming(pSig64, -expDiff, &pSig64);
2610 zExp = cExp;
2611 } else {
2612 /* no scaling needed */
2613 zExp = cExp;
2615 /* Add significands and make sure explicit bit ends up in posn 62 */
2616 zSig64 = pSig64 + cSig64;
2617 if ((int64_t)zSig64 < 0) {
2618 shift64RightJamming(zSig64, 1, &zSig64);
2619 } else {
2620 zExp--;
2622 } else {
2623 /* Subtraction */
2624 if (expDiff > 0) {
2625 shift64RightJamming(cSig64, expDiff, &cSig64);
2626 zSig64 = pSig64 - cSig64;
2627 zExp = pExp;
2628 } else if (expDiff < 0) {
2629 shift64RightJamming(pSig64, -expDiff, &pSig64);
2630 zSig64 = cSig64 - pSig64;
2631 zExp = cExp;
2632 zSign ^= 1;
2633 } else {
2634 zExp = pExp;
2635 if (cSig64 < pSig64) {
2636 zSig64 = pSig64 - cSig64;
2637 } else if (pSig64 < cSig64) {
2638 zSig64 = cSig64 - pSig64;
2639 zSign ^= 1;
2640 } else {
2641 /* Exact zero */
2642 zSign = signflip;
2643 if (status->float_rounding_mode == float_round_down) {
2644 zSign ^= 1;
2646 return packFloat32(zSign, 0, 0);
2649 --zExp;
2650 /* Normalize to put the explicit bit back into bit 62. */
2651 shiftcount = countLeadingZeros64(zSig64) - 1;
2652 zSig64 <<= shiftcount;
2653 zExp -= shiftcount;
2655 if (flags & float_muladd_halve_result) {
2656 zExp--;
2659 shift64RightJamming(zSig64, 32, &zSig64);
2660 return roundAndPackFloat32(zSign, zExp, zSig64, status);
2664 /*----------------------------------------------------------------------------
2665 | Returns the square root of the single-precision floating-point value `a'.
2666 | The operation is performed according to the IEC/IEEE Standard for Binary
2667 | Floating-Point Arithmetic.
2668 *----------------------------------------------------------------------------*/
2670 float32 float32_sqrt(float32 a, float_status *status)
2672 flag aSign;
2673 int_fast16_t aExp, zExp;
2674 uint32_t aSig, zSig;
2675 uint64_t rem, term;
2676 a = float32_squash_input_denormal(a, status);
2678 aSig = extractFloat32Frac( a );
2679 aExp = extractFloat32Exp( a );
2680 aSign = extractFloat32Sign( a );
2681 if ( aExp == 0xFF ) {
2682 if (aSig) {
2683 return propagateFloat32NaN(a, float32_zero, status);
2685 if ( ! aSign ) return a;
2686 float_raise(float_flag_invalid, status);
2687 return float32_default_nan;
2689 if ( aSign ) {
2690 if ( ( aExp | aSig ) == 0 ) return a;
2691 float_raise(float_flag_invalid, status);
2692 return float32_default_nan;
2694 if ( aExp == 0 ) {
2695 if ( aSig == 0 ) return float32_zero;
2696 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2698 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2699 aSig = ( aSig | 0x00800000 )<<8;
2700 zSig = estimateSqrt32( aExp, aSig ) + 2;
2701 if ( ( zSig & 0x7F ) <= 5 ) {
2702 if ( zSig < 2 ) {
2703 zSig = 0x7FFFFFFF;
2704 goto roundAndPack;
2706 aSig >>= aExp & 1;
2707 term = ( (uint64_t) zSig ) * zSig;
2708 rem = ( ( (uint64_t) aSig )<<32 ) - term;
2709 while ( (int64_t) rem < 0 ) {
2710 --zSig;
2711 rem += ( ( (uint64_t) zSig )<<1 ) | 1;
2713 zSig |= ( rem != 0 );
2715 shift32RightJamming( zSig, 1, &zSig );
2716 roundAndPack:
2717 return roundAndPackFloat32(0, zExp, zSig, status);
2721 /*----------------------------------------------------------------------------
2722 | Returns the binary exponential of the single-precision floating-point value
2723 | `a'. The operation is performed according to the IEC/IEEE Standard for
2724 | Binary Floating-Point Arithmetic.
2726 | Uses the following identities:
2728 | 1. -------------------------------------------------------------------------
2729 | x x*ln(2)
2730 | 2 = e
2732 | 2. -------------------------------------------------------------------------
2733 | 2 3 4 5 n
2734 | x x x x x x x
2735 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2736 | 1! 2! 3! 4! 5! n!
2737 *----------------------------------------------------------------------------*/
2739 static const float64 float32_exp2_coefficients[15] =
2741 const_float64( 0x3ff0000000000000ll ), /* 1 */
2742 const_float64( 0x3fe0000000000000ll ), /* 2 */
2743 const_float64( 0x3fc5555555555555ll ), /* 3 */
2744 const_float64( 0x3fa5555555555555ll ), /* 4 */
2745 const_float64( 0x3f81111111111111ll ), /* 5 */
2746 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */
2747 const_float64( 0x3f2a01a01a01a01all ), /* 7 */
2748 const_float64( 0x3efa01a01a01a01all ), /* 8 */
2749 const_float64( 0x3ec71de3a556c734ll ), /* 9 */
2750 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2751 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2752 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2753 const_float64( 0x3de6124613a86d09ll ), /* 13 */
2754 const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2755 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2758 float32 float32_exp2(float32 a, float_status *status)
2760 flag aSign;
2761 int_fast16_t aExp;
2762 uint32_t aSig;
2763 float64 r, x, xn;
2764 int i;
2765 a = float32_squash_input_denormal(a, status);
2767 aSig = extractFloat32Frac( a );
2768 aExp = extractFloat32Exp( a );
2769 aSign = extractFloat32Sign( a );
2771 if ( aExp == 0xFF) {
2772 if (aSig) {
2773 return propagateFloat32NaN(a, float32_zero, status);
2775 return (aSign) ? float32_zero : a;
2777 if (aExp == 0) {
2778 if (aSig == 0) return float32_one;
2781 float_raise(float_flag_inexact, status);
2783 /* ******************************* */
2784 /* using float64 for approximation */
2785 /* ******************************* */
2786 x = float32_to_float64(a, status);
2787 x = float64_mul(x, float64_ln2, status);
2789 xn = x;
2790 r = float64_one;
2791 for (i = 0 ; i < 15 ; i++) {
2792 float64 f;
2794 f = float64_mul(xn, float32_exp2_coefficients[i], status);
2795 r = float64_add(r, f, status);
2797 xn = float64_mul(xn, x, status);
2800 return float64_to_float32(r, status);
2803 /*----------------------------------------------------------------------------
2804 | Returns the binary log of the single-precision floating-point value `a'.
2805 | The operation is performed according to the IEC/IEEE Standard for Binary
2806 | Floating-Point Arithmetic.
2807 *----------------------------------------------------------------------------*/
2808 float32 float32_log2(float32 a, float_status *status)
2810 flag aSign, zSign;
2811 int_fast16_t aExp;
2812 uint32_t aSig, zSig, i;
2814 a = float32_squash_input_denormal(a, status);
2815 aSig = extractFloat32Frac( a );
2816 aExp = extractFloat32Exp( a );
2817 aSign = extractFloat32Sign( a );
2819 if ( aExp == 0 ) {
2820 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2821 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2823 if ( aSign ) {
2824 float_raise(float_flag_invalid, status);
2825 return float32_default_nan;
2827 if ( aExp == 0xFF ) {
2828 if (aSig) {
2829 return propagateFloat32NaN(a, float32_zero, status);
2831 return a;
2834 aExp -= 0x7F;
2835 aSig |= 0x00800000;
2836 zSign = aExp < 0;
2837 zSig = aExp << 23;
2839 for (i = 1 << 22; i > 0; i >>= 1) {
2840 aSig = ( (uint64_t)aSig * aSig ) >> 23;
2841 if ( aSig & 0x01000000 ) {
2842 aSig >>= 1;
2843 zSig |= i;
2847 if ( zSign )
2848 zSig = -zSig;
2850 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
2853 /*----------------------------------------------------------------------------
2854 | Returns 1 if the single-precision floating-point value `a' is equal to
2855 | the corresponding value `b', and 0 otherwise. The invalid exception is
2856 | raised if either operand is a NaN. Otherwise, the comparison is performed
2857 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2858 *----------------------------------------------------------------------------*/
2860 int float32_eq(float32 a, float32 b, float_status *status)
2862 uint32_t av, bv;
2863 a = float32_squash_input_denormal(a, status);
2864 b = float32_squash_input_denormal(b, status);
2866 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2867 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2869 float_raise(float_flag_invalid, status);
2870 return 0;
2872 av = float32_val(a);
2873 bv = float32_val(b);
2874 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2877 /*----------------------------------------------------------------------------
2878 | Returns 1 if the single-precision floating-point value `a' is less than
2879 | or equal to the corresponding value `b', and 0 otherwise. The invalid
2880 | exception is raised if either operand is a NaN. The comparison is performed
2881 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2882 *----------------------------------------------------------------------------*/
2884 int float32_le(float32 a, float32 b, float_status *status)
2886 flag aSign, bSign;
2887 uint32_t av, bv;
2888 a = float32_squash_input_denormal(a, status);
2889 b = float32_squash_input_denormal(b, status);
2891 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2892 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2894 float_raise(float_flag_invalid, status);
2895 return 0;
2897 aSign = extractFloat32Sign( a );
2898 bSign = extractFloat32Sign( b );
2899 av = float32_val(a);
2900 bv = float32_val(b);
2901 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2902 return ( av == bv ) || ( aSign ^ ( av < bv ) );
2906 /*----------------------------------------------------------------------------
2907 | Returns 1 if the single-precision floating-point value `a' is less than
2908 | the corresponding value `b', and 0 otherwise. The invalid exception is
2909 | raised if either operand is a NaN. The comparison is performed according
2910 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2911 *----------------------------------------------------------------------------*/
2913 int float32_lt(float32 a, float32 b, float_status *status)
2915 flag aSign, bSign;
2916 uint32_t av, bv;
2917 a = float32_squash_input_denormal(a, status);
2918 b = float32_squash_input_denormal(b, status);
2920 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2921 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2923 float_raise(float_flag_invalid, status);
2924 return 0;
2926 aSign = extractFloat32Sign( a );
2927 bSign = extractFloat32Sign( b );
2928 av = float32_val(a);
2929 bv = float32_val(b);
2930 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2931 return ( av != bv ) && ( aSign ^ ( av < bv ) );
2935 /*----------------------------------------------------------------------------
2936 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2937 | be compared, and 0 otherwise. The invalid exception is raised if either
2938 | operand is a NaN. The comparison is performed according to the IEC/IEEE
2939 | Standard for Binary Floating-Point Arithmetic.
2940 *----------------------------------------------------------------------------*/
2942 int float32_unordered(float32 a, float32 b, float_status *status)
2944 a = float32_squash_input_denormal(a, status);
2945 b = float32_squash_input_denormal(b, status);
2947 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2948 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2950 float_raise(float_flag_invalid, status);
2951 return 1;
2953 return 0;
2956 /*----------------------------------------------------------------------------
2957 | Returns 1 if the single-precision floating-point value `a' is equal to
2958 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2959 | exception. The comparison is performed according to the IEC/IEEE Standard
2960 | for Binary Floating-Point Arithmetic.
2961 *----------------------------------------------------------------------------*/
2963 int float32_eq_quiet(float32 a, float32 b, float_status *status)
2965 a = float32_squash_input_denormal(a, status);
2966 b = float32_squash_input_denormal(b, status);
2968 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2969 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2971 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2972 float_raise(float_flag_invalid, status);
2974 return 0;
2976 return ( float32_val(a) == float32_val(b) ) ||
2977 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2980 /*----------------------------------------------------------------------------
2981 | Returns 1 if the single-precision floating-point value `a' is less than or
2982 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2983 | cause an exception. Otherwise, the comparison is performed according to the
2984 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2985 *----------------------------------------------------------------------------*/
2987 int float32_le_quiet(float32 a, float32 b, float_status *status)
2989 flag aSign, bSign;
2990 uint32_t av, bv;
2991 a = float32_squash_input_denormal(a, status);
2992 b = float32_squash_input_denormal(b, status);
2994 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2995 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2997 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2998 float_raise(float_flag_invalid, status);
3000 return 0;
3002 aSign = extractFloat32Sign( a );
3003 bSign = extractFloat32Sign( b );
3004 av = float32_val(a);
3005 bv = float32_val(b);
3006 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3007 return ( av == bv ) || ( aSign ^ ( av < bv ) );
3011 /*----------------------------------------------------------------------------
3012 | Returns 1 if the single-precision floating-point value `a' is less than
3013 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3014 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
3015 | Standard for Binary Floating-Point Arithmetic.
3016 *----------------------------------------------------------------------------*/
3018 int float32_lt_quiet(float32 a, float32 b, float_status *status)
3020 flag aSign, bSign;
3021 uint32_t av, bv;
3022 a = float32_squash_input_denormal(a, status);
3023 b = float32_squash_input_denormal(b, status);
3025 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3026 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3028 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
3029 float_raise(float_flag_invalid, status);
3031 return 0;
3033 aSign = extractFloat32Sign( a );
3034 bSign = extractFloat32Sign( b );
3035 av = float32_val(a);
3036 bv = float32_val(b);
3037 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3038 return ( av != bv ) && ( aSign ^ ( av < bv ) );
3042 /*----------------------------------------------------------------------------
3043 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3044 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
3045 | comparison is performed according to the IEC/IEEE Standard for Binary
3046 | Floating-Point Arithmetic.
3047 *----------------------------------------------------------------------------*/
3049 int float32_unordered_quiet(float32 a, float32 b, float_status *status)
3051 a = float32_squash_input_denormal(a, status);
3052 b = float32_squash_input_denormal(b, status);
3054 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3055 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3057 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
3058 float_raise(float_flag_invalid, status);
3060 return 1;
3062 return 0;
3065 /*----------------------------------------------------------------------------
3066 | Returns the result of converting the double-precision floating-point value
3067 | `a' to the 32-bit two's complement integer format. The conversion is
3068 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3069 | Arithmetic---which means in particular that the conversion is rounded
3070 | according to the current rounding mode. If `a' is a NaN, the largest
3071 | positive integer is returned. Otherwise, if the conversion overflows, the
3072 | largest integer with the same sign as `a' is returned.
3073 *----------------------------------------------------------------------------*/
3075 int32_t float64_to_int32(float64 a, float_status *status)
3077 flag aSign;
3078 int_fast16_t aExp, shiftCount;
3079 uint64_t aSig;
3080 a = float64_squash_input_denormal(a, status);
3082 aSig = extractFloat64Frac( a );
3083 aExp = extractFloat64Exp( a );
3084 aSign = extractFloat64Sign( a );
3085 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
3086 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3087 shiftCount = 0x42C - aExp;
3088 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
3089 return roundAndPackInt32(aSign, aSig, status);
3093 /*----------------------------------------------------------------------------
3094 | Returns the result of converting the double-precision floating-point value
3095 | `a' to the 32-bit two's complement integer format. The conversion is
3096 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3097 | Arithmetic, except that the conversion is always rounded toward zero.
3098 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
3099 | the conversion overflows, the largest integer with the same sign as `a' is
3100 | returned.
3101 *----------------------------------------------------------------------------*/
3103 int32_t float64_to_int32_round_to_zero(float64 a, float_status *status)
3105 flag aSign;
3106 int_fast16_t aExp, shiftCount;
3107 uint64_t aSig, savedASig;
3108 int32_t z;
3109 a = float64_squash_input_denormal(a, status);
3111 aSig = extractFloat64Frac( a );
3112 aExp = extractFloat64Exp( a );
3113 aSign = extractFloat64Sign( a );
3114 if ( 0x41E < aExp ) {
3115 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
3116 goto invalid;
3118 else if ( aExp < 0x3FF ) {
3119 if (aExp || aSig) {
3120 status->float_exception_flags |= float_flag_inexact;
3122 return 0;
3124 aSig |= LIT64( 0x0010000000000000 );
3125 shiftCount = 0x433 - aExp;
3126 savedASig = aSig;
3127 aSig >>= shiftCount;
3128 z = aSig;
3129 if ( aSign ) z = - z;
3130 if ( ( z < 0 ) ^ aSign ) {
3131 invalid:
3132 float_raise(float_flag_invalid, status);
3133 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
3135 if ( ( aSig<<shiftCount ) != savedASig ) {
3136 status->float_exception_flags |= float_flag_inexact;
3138 return z;
3142 /*----------------------------------------------------------------------------
3143 | Returns the result of converting the double-precision floating-point value
3144 | `a' to the 16-bit two's complement integer format. The conversion is
3145 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3146 | Arithmetic, except that the conversion is always rounded toward zero.
3147 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
3148 | the conversion overflows, the largest integer with the same sign as `a' is
3149 | returned.
3150 *----------------------------------------------------------------------------*/
3152 int_fast16_t float64_to_int16_round_to_zero(float64 a, float_status *status)
3154 flag aSign;
3155 int_fast16_t aExp, shiftCount;
3156 uint64_t aSig, savedASig;
3157 int32_t z;
3159 aSig = extractFloat64Frac( a );
3160 aExp = extractFloat64Exp( a );
3161 aSign = extractFloat64Sign( a );
3162 if ( 0x40E < aExp ) {
3163 if ( ( aExp == 0x7FF ) && aSig ) {
3164 aSign = 0;
3166 goto invalid;
3168 else if ( aExp < 0x3FF ) {
3169 if ( aExp || aSig ) {
3170 status->float_exception_flags |= float_flag_inexact;
3172 return 0;
3174 aSig |= LIT64( 0x0010000000000000 );
3175 shiftCount = 0x433 - aExp;
3176 savedASig = aSig;
3177 aSig >>= shiftCount;
3178 z = aSig;
3179 if ( aSign ) {
3180 z = - z;
3182 if ( ( (int16_t)z < 0 ) ^ aSign ) {
3183 invalid:
3184 float_raise(float_flag_invalid, status);
3185 return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
3187 if ( ( aSig<<shiftCount ) != savedASig ) {
3188 status->float_exception_flags |= float_flag_inexact;
3190 return z;
3193 /*----------------------------------------------------------------------------
3194 | Returns the result of converting the double-precision floating-point value
3195 | `a' to the 64-bit two's complement integer format. The conversion is
3196 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3197 | Arithmetic---which means in particular that the conversion is rounded
3198 | according to the current rounding mode. If `a' is a NaN, the largest
3199 | positive integer is returned. Otherwise, if the conversion overflows, the
3200 | largest integer with the same sign as `a' is returned.
3201 *----------------------------------------------------------------------------*/
3203 int64_t float64_to_int64(float64 a, float_status *status)
3205 flag aSign;
3206 int_fast16_t aExp, shiftCount;
3207 uint64_t aSig, aSigExtra;
3208 a = float64_squash_input_denormal(a, status);
3210 aSig = extractFloat64Frac( a );
3211 aExp = extractFloat64Exp( a );
3212 aSign = extractFloat64Sign( a );
3213 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3214 shiftCount = 0x433 - aExp;
3215 if ( shiftCount <= 0 ) {
3216 if ( 0x43E < aExp ) {
3217 float_raise(float_flag_invalid, status);
3218 if ( ! aSign
3219 || ( ( aExp == 0x7FF )
3220 && ( aSig != LIT64( 0x0010000000000000 ) ) )
3222 return LIT64( 0x7FFFFFFFFFFFFFFF );
3224 return (int64_t) LIT64( 0x8000000000000000 );
3226 aSigExtra = 0;
3227 aSig <<= - shiftCount;
3229 else {
3230 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3232 return roundAndPackInt64(aSign, aSig, aSigExtra, status);
3236 /*----------------------------------------------------------------------------
3237 | Returns the result of converting the double-precision floating-point value
3238 | `a' to the 64-bit two's complement integer format. The conversion is
3239 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3240 | Arithmetic, except that the conversion is always rounded toward zero.
3241 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
3242 | the conversion overflows, the largest integer with the same sign as `a' is
3243 | returned.
3244 *----------------------------------------------------------------------------*/
3246 int64_t float64_to_int64_round_to_zero(float64 a, float_status *status)
3248 flag aSign;
3249 int_fast16_t aExp, shiftCount;
3250 uint64_t aSig;
3251 int64_t z;
3252 a = float64_squash_input_denormal(a, status);
3254 aSig = extractFloat64Frac( a );
3255 aExp = extractFloat64Exp( a );
3256 aSign = extractFloat64Sign( a );
3257 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3258 shiftCount = aExp - 0x433;
3259 if ( 0 <= shiftCount ) {
3260 if ( 0x43E <= aExp ) {
3261 if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
3262 float_raise(float_flag_invalid, status);
3263 if ( ! aSign
3264 || ( ( aExp == 0x7FF )
3265 && ( aSig != LIT64( 0x0010000000000000 ) ) )
3267 return LIT64( 0x7FFFFFFFFFFFFFFF );
3270 return (int64_t) LIT64( 0x8000000000000000 );
3272 z = aSig<<shiftCount;
3274 else {
3275 if ( aExp < 0x3FE ) {
3276 if (aExp | aSig) {
3277 status->float_exception_flags |= float_flag_inexact;
3279 return 0;
3281 z = aSig>>( - shiftCount );
3282 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
3283 status->float_exception_flags |= float_flag_inexact;
3286 if ( aSign ) z = - z;
3287 return z;
3291 /*----------------------------------------------------------------------------
3292 | Returns the result of converting the double-precision floating-point value
3293 | `a' to the single-precision floating-point format. The conversion is
3294 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3295 | Arithmetic.
3296 *----------------------------------------------------------------------------*/
3298 float32 float64_to_float32(float64 a, float_status *status)
3300 flag aSign;
3301 int_fast16_t aExp;
3302 uint64_t aSig;
3303 uint32_t zSig;
3304 a = float64_squash_input_denormal(a, status);
3306 aSig = extractFloat64Frac( a );
3307 aExp = extractFloat64Exp( a );
3308 aSign = extractFloat64Sign( a );
3309 if ( aExp == 0x7FF ) {
3310 if (aSig) {
3311 return commonNaNToFloat32(float64ToCommonNaN(a, status), status);
3313 return packFloat32( aSign, 0xFF, 0 );
3315 shift64RightJamming( aSig, 22, &aSig );
3316 zSig = aSig;
3317 if ( aExp || zSig ) {
3318 zSig |= 0x40000000;
3319 aExp -= 0x381;
3321 return roundAndPackFloat32(aSign, aExp, zSig, status);
3326 /*----------------------------------------------------------------------------
3327 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3328 | half-precision floating-point value, returning the result. After being
3329 | shifted into the proper positions, the three fields are simply added
3330 | together to form the result. This means that any integer portion of `zSig'
3331 | will be added into the exponent. Since a properly normalized significand
3332 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3333 | than the desired result exponent whenever `zSig' is a complete, normalized
3334 | significand.
3335 *----------------------------------------------------------------------------*/
3336 static float16 packFloat16(flag zSign, int_fast16_t zExp, uint16_t zSig)
3338 return make_float16(
3339 (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
3342 /*----------------------------------------------------------------------------
3343 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3344 | and significand `zSig', and returns the proper half-precision floating-
3345 | point value corresponding to the abstract input. Ordinarily, the abstract
3346 | value is simply rounded and packed into the half-precision format, with
3347 | the inexact exception raised if the abstract input cannot be represented
3348 | exactly. However, if the abstract value is too large, the overflow and
3349 | inexact exceptions are raised and an infinity or maximal finite value is
3350 | returned. If the abstract value is too small, the input value is rounded to
3351 | a subnormal number, and the underflow and inexact exceptions are raised if
3352 | the abstract input cannot be represented exactly as a subnormal half-
3353 | precision floating-point number.
3354 | The `ieee' flag indicates whether to use IEEE standard half precision, or
3355 | ARM-style "alternative representation", which omits the NaN and Inf
3356 | encodings in order to raise the maximum representable exponent by one.
3357 | The input significand `zSig' has its binary point between bits 22
3358 | and 23, which is 13 bits to the left of the usual location. This shifted
3359 | significand must be normalized or smaller. If `zSig' is not normalized,
3360 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3361 | and it must not require rounding. In the usual case that `zSig' is
3362 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3363 | Note the slightly odd position of the binary point in zSig compared with the
3364 | other roundAndPackFloat functions. This should probably be fixed if we
3365 | need to implement more float16 routines than just conversion.
3366 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3367 | Binary Floating-Point Arithmetic.
3368 *----------------------------------------------------------------------------*/
3370 static float16 roundAndPackFloat16(flag zSign, int_fast16_t zExp,
3371 uint32_t zSig, flag ieee,
3372 float_status *status)
3374 int maxexp = ieee ? 29 : 30;
3375 uint32_t mask;
3376 uint32_t increment;
3377 bool rounding_bumps_exp;
3378 bool is_tiny = false;
3380 /* Calculate the mask of bits of the mantissa which are not
3381 * representable in half-precision and will be lost.
3383 if (zExp < 1) {
3384 /* Will be denormal in halfprec */
3385 mask = 0x00ffffff;
3386 if (zExp >= -11) {
3387 mask >>= 11 + zExp;
3389 } else {
3390 /* Normal number in halfprec */
3391 mask = 0x00001fff;
3394 switch (status->float_rounding_mode) {
3395 case float_round_nearest_even:
3396 increment = (mask + 1) >> 1;
3397 if ((zSig & mask) == increment) {
3398 increment = zSig & (increment << 1);
3400 break;
3401 case float_round_ties_away:
3402 increment = (mask + 1) >> 1;
3403 break;
3404 case float_round_up:
3405 increment = zSign ? 0 : mask;
3406 break;
3407 case float_round_down:
3408 increment = zSign ? mask : 0;
3409 break;
3410 default: /* round_to_zero */
3411 increment = 0;
3412 break;
3415 rounding_bumps_exp = (zSig + increment >= 0x01000000);
3417 if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) {
3418 if (ieee) {
3419 float_raise(float_flag_overflow | float_flag_inexact, status);
3420 return packFloat16(zSign, 0x1f, 0);
3421 } else {
3422 float_raise(float_flag_invalid, status);
3423 return packFloat16(zSign, 0x1f, 0x3ff);
3427 if (zExp < 0) {
3428 /* Note that flush-to-zero does not affect half-precision results */
3429 is_tiny =
3430 (status->float_detect_tininess == float_tininess_before_rounding)
3431 || (zExp < -1)
3432 || (!rounding_bumps_exp);
3434 if (zSig & mask) {
3435 float_raise(float_flag_inexact, status);
3436 if (is_tiny) {
3437 float_raise(float_flag_underflow, status);
3441 zSig += increment;
3442 if (rounding_bumps_exp) {
3443 zSig >>= 1;
3444 zExp++;
3447 if (zExp < -10) {
3448 return packFloat16(zSign, 0, 0);
3450 if (zExp < 0) {
3451 zSig >>= -zExp;
3452 zExp = 0;
3454 return packFloat16(zSign, zExp, zSig >> 13);
3457 static void normalizeFloat16Subnormal(uint32_t aSig, int_fast16_t *zExpPtr,
3458 uint32_t *zSigPtr)
3460 int8_t shiftCount = countLeadingZeros32(aSig) - 21;
3461 *zSigPtr = aSig << shiftCount;
3462 *zExpPtr = 1 - shiftCount;
3465 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
3466 The latter gains extra exponent range by omitting the NaN/Inf encodings. */
3468 float32 float16_to_float32(float16 a, flag ieee, float_status *status)
3470 flag aSign;
3471 int_fast16_t aExp;
3472 uint32_t aSig;
3474 aSign = extractFloat16Sign(a);
3475 aExp = extractFloat16Exp(a);
3476 aSig = extractFloat16Frac(a);
3478 if (aExp == 0x1f && ieee) {
3479 if (aSig) {
3480 return commonNaNToFloat32(float16ToCommonNaN(a, status), status);
3482 return packFloat32(aSign, 0xff, 0);
3484 if (aExp == 0) {
3485 if (aSig == 0) {
3486 return packFloat32(aSign, 0, 0);
3489 normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3490 aExp--;
3492 return packFloat32( aSign, aExp + 0x70, aSig << 13);
3495 float16 float32_to_float16(float32 a, flag ieee, float_status *status)
3497 flag aSign;
3498 int_fast16_t aExp;
3499 uint32_t aSig;
3501 a = float32_squash_input_denormal(a, status);
3503 aSig = extractFloat32Frac( a );
3504 aExp = extractFloat32Exp( a );
3505 aSign = extractFloat32Sign( a );
3506 if ( aExp == 0xFF ) {
3507 if (aSig) {
3508 /* Input is a NaN */
3509 if (!ieee) {
3510 float_raise(float_flag_invalid, status);
3511 return packFloat16(aSign, 0, 0);
3513 return commonNaNToFloat16(
3514 float32ToCommonNaN(a, status), status);
3516 /* Infinity */
3517 if (!ieee) {
3518 float_raise(float_flag_invalid, status);
3519 return packFloat16(aSign, 0x1f, 0x3ff);
3521 return packFloat16(aSign, 0x1f, 0);
3523 if (aExp == 0 && aSig == 0) {
3524 return packFloat16(aSign, 0, 0);
3526 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3527 * even if the input is denormal; however this is harmless because
3528 * the largest possible single-precision denormal is still smaller
3529 * than the smallest representable half-precision denormal, and so we
3530 * will end up ignoring aSig and returning via the "always return zero"
3531 * codepath.
3533 aSig |= 0x00800000;
3534 aExp -= 0x71;
3536 return roundAndPackFloat16(aSign, aExp, aSig, ieee, status);
3539 float64 float16_to_float64(float16 a, flag ieee, float_status *status)
3541 flag aSign;
3542 int_fast16_t aExp;
3543 uint32_t aSig;
3545 aSign = extractFloat16Sign(a);
3546 aExp = extractFloat16Exp(a);
3547 aSig = extractFloat16Frac(a);
3549 if (aExp == 0x1f && ieee) {
3550 if (aSig) {
3551 return commonNaNToFloat64(
3552 float16ToCommonNaN(a, status), status);
3554 return packFloat64(aSign, 0x7ff, 0);
3556 if (aExp == 0) {
3557 if (aSig == 0) {
3558 return packFloat64(aSign, 0, 0);
3561 normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3562 aExp--;
3564 return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42);
3567 float16 float64_to_float16(float64 a, flag ieee, float_status *status)
3569 flag aSign;
3570 int_fast16_t aExp;
3571 uint64_t aSig;
3572 uint32_t zSig;
3574 a = float64_squash_input_denormal(a, status);
3576 aSig = extractFloat64Frac(a);
3577 aExp = extractFloat64Exp(a);
3578 aSign = extractFloat64Sign(a);
3579 if (aExp == 0x7FF) {
3580 if (aSig) {
3581 /* Input is a NaN */
3582 if (!ieee) {
3583 float_raise(float_flag_invalid, status);
3584 return packFloat16(aSign, 0, 0);
3586 return commonNaNToFloat16(
3587 float64ToCommonNaN(a, status), status);
3589 /* Infinity */
3590 if (!ieee) {
3591 float_raise(float_flag_invalid, status);
3592 return packFloat16(aSign, 0x1f, 0x3ff);
3594 return packFloat16(aSign, 0x1f, 0);
3596 shift64RightJamming(aSig, 29, &aSig);
3597 zSig = aSig;
3598 if (aExp == 0 && zSig == 0) {
3599 return packFloat16(aSign, 0, 0);
3601 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3602 * even if the input is denormal; however this is harmless because
3603 * the largest possible single-precision denormal is still smaller
3604 * than the smallest representable half-precision denormal, and so we
3605 * will end up ignoring aSig and returning via the "always return zero"
3606 * codepath.
3608 zSig |= 0x00800000;
3609 aExp -= 0x3F1;
3611 return roundAndPackFloat16(aSign, aExp, zSig, ieee, status);
3614 /*----------------------------------------------------------------------------
3615 | Returns the result of converting the double-precision floating-point value
3616 | `a' to the extended double-precision floating-point format. The conversion
3617 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3618 | Arithmetic.
3619 *----------------------------------------------------------------------------*/
3621 floatx80 float64_to_floatx80(float64 a, float_status *status)
3623 flag aSign;
3624 int_fast16_t aExp;
3625 uint64_t aSig;
3627 a = float64_squash_input_denormal(a, status);
3628 aSig = extractFloat64Frac( a );
3629 aExp = extractFloat64Exp( a );
3630 aSign = extractFloat64Sign( a );
3631 if ( aExp == 0x7FF ) {
3632 if (aSig) {
3633 return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
3635 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3637 if ( aExp == 0 ) {
3638 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3639 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3641 return
3642 packFloatx80(
3643 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
3647 /*----------------------------------------------------------------------------
3648 | Returns the result of converting the double-precision floating-point value
3649 | `a' to the quadruple-precision floating-point format. The conversion is
3650 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3651 | Arithmetic.
3652 *----------------------------------------------------------------------------*/
3654 float128 float64_to_float128(float64 a, float_status *status)
3656 flag aSign;
3657 int_fast16_t aExp;
3658 uint64_t aSig, zSig0, zSig1;
3660 a = float64_squash_input_denormal(a, status);
3661 aSig = extractFloat64Frac( a );
3662 aExp = extractFloat64Exp( a );
3663 aSign = extractFloat64Sign( a );
3664 if ( aExp == 0x7FF ) {
3665 if (aSig) {
3666 return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
3668 return packFloat128( aSign, 0x7FFF, 0, 0 );
3670 if ( aExp == 0 ) {
3671 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3672 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3673 --aExp;
3675 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
3676 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
3680 /*----------------------------------------------------------------------------
3681 | Rounds the double-precision floating-point value `a' to an integer, and
3682 | returns the result as a double-precision floating-point value. The
3683 | operation is performed according to the IEC/IEEE Standard for Binary
3684 | Floating-Point Arithmetic.
3685 *----------------------------------------------------------------------------*/
3687 float64 float64_round_to_int(float64 a, float_status *status)
3689 flag aSign;
3690 int_fast16_t aExp;
3691 uint64_t lastBitMask, roundBitsMask;
3692 uint64_t z;
3693 a = float64_squash_input_denormal(a, status);
3695 aExp = extractFloat64Exp( a );
3696 if ( 0x433 <= aExp ) {
3697 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
3698 return propagateFloat64NaN(a, a, status);
3700 return a;
3702 if ( aExp < 0x3FF ) {
3703 if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
3704 status->float_exception_flags |= float_flag_inexact;
3705 aSign = extractFloat64Sign( a );
3706 switch (status->float_rounding_mode) {
3707 case float_round_nearest_even:
3708 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
3709 return packFloat64( aSign, 0x3FF, 0 );
3711 break;
3712 case float_round_ties_away:
3713 if (aExp == 0x3FE) {
3714 return packFloat64(aSign, 0x3ff, 0);
3716 break;
3717 case float_round_down:
3718 return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
3719 case float_round_up:
3720 return make_float64(
3721 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
3723 return packFloat64( aSign, 0, 0 );
3725 lastBitMask = 1;
3726 lastBitMask <<= 0x433 - aExp;
3727 roundBitsMask = lastBitMask - 1;
3728 z = float64_val(a);
3729 switch (status->float_rounding_mode) {
3730 case float_round_nearest_even:
3731 z += lastBitMask >> 1;
3732 if ((z & roundBitsMask) == 0) {
3733 z &= ~lastBitMask;
3735 break;
3736 case float_round_ties_away:
3737 z += lastBitMask >> 1;
3738 break;
3739 case float_round_to_zero:
3740 break;
3741 case float_round_up:
3742 if (!extractFloat64Sign(make_float64(z))) {
3743 z += roundBitsMask;
3745 break;
3746 case float_round_down:
3747 if (extractFloat64Sign(make_float64(z))) {
3748 z += roundBitsMask;
3750 break;
3751 default:
3752 abort();
3754 z &= ~ roundBitsMask;
3755 if (z != float64_val(a)) {
3756 status->float_exception_flags |= float_flag_inexact;
3758 return make_float64(z);
3762 float64 float64_trunc_to_int(float64 a, float_status *status)
3764 int oldmode;
3765 float64 res;
3766 oldmode = status->float_rounding_mode;
3767 status->float_rounding_mode = float_round_to_zero;
3768 res = float64_round_to_int(a, status);
3769 status->float_rounding_mode = oldmode;
3770 return res;
3773 /*----------------------------------------------------------------------------
3774 | Returns the result of adding the absolute values of the double-precision
3775 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
3776 | before being returned. `zSign' is ignored if the result is a NaN.
3777 | The addition is performed according to the IEC/IEEE Standard for Binary
3778 | Floating-Point Arithmetic.
3779 *----------------------------------------------------------------------------*/
3781 static float64 addFloat64Sigs(float64 a, float64 b, flag zSign,
3782 float_status *status)
3784 int_fast16_t aExp, bExp, zExp;
3785 uint64_t aSig, bSig, zSig;
3786 int_fast16_t expDiff;
3788 aSig = extractFloat64Frac( a );
3789 aExp = extractFloat64Exp( a );
3790 bSig = extractFloat64Frac( b );
3791 bExp = extractFloat64Exp( b );
3792 expDiff = aExp - bExp;
3793 aSig <<= 9;
3794 bSig <<= 9;
3795 if ( 0 < expDiff ) {
3796 if ( aExp == 0x7FF ) {
3797 if (aSig) {
3798 return propagateFloat64NaN(a, b, status);
3800 return a;
3802 if ( bExp == 0 ) {
3803 --expDiff;
3805 else {
3806 bSig |= LIT64( 0x2000000000000000 );
3808 shift64RightJamming( bSig, expDiff, &bSig );
3809 zExp = aExp;
3811 else if ( expDiff < 0 ) {
3812 if ( bExp == 0x7FF ) {
3813 if (bSig) {
3814 return propagateFloat64NaN(a, b, status);
3816 return packFloat64( zSign, 0x7FF, 0 );
3818 if ( aExp == 0 ) {
3819 ++expDiff;
3821 else {
3822 aSig |= LIT64( 0x2000000000000000 );
3824 shift64RightJamming( aSig, - expDiff, &aSig );
3825 zExp = bExp;
3827 else {
3828 if ( aExp == 0x7FF ) {
3829 if (aSig | bSig) {
3830 return propagateFloat64NaN(a, b, status);
3832 return a;
3834 if ( aExp == 0 ) {
3835 if (status->flush_to_zero) {
3836 if (aSig | bSig) {
3837 float_raise(float_flag_output_denormal, status);
3839 return packFloat64(zSign, 0, 0);
3841 return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3843 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3844 zExp = aExp;
3845 goto roundAndPack;
3847 aSig |= LIT64( 0x2000000000000000 );
3848 zSig = ( aSig + bSig )<<1;
3849 --zExp;
3850 if ( (int64_t) zSig < 0 ) {
3851 zSig = aSig + bSig;
3852 ++zExp;
3854 roundAndPack:
3855 return roundAndPackFloat64(zSign, zExp, zSig, status);
3859 /*----------------------------------------------------------------------------
3860 | Returns the result of subtracting the absolute values of the double-
3861 | precision floating-point values `a' and `b'. If `zSign' is 1, the
3862 | difference is negated before being returned. `zSign' is ignored if the
3863 | result is a NaN. The subtraction is performed according to the IEC/IEEE
3864 | Standard for Binary Floating-Point Arithmetic.
3865 *----------------------------------------------------------------------------*/
3867 static float64 subFloat64Sigs(float64 a, float64 b, flag zSign,
3868 float_status *status)
3870 int_fast16_t aExp, bExp, zExp;
3871 uint64_t aSig, bSig, zSig;
3872 int_fast16_t expDiff;
3874 aSig = extractFloat64Frac( a );
3875 aExp = extractFloat64Exp( a );
3876 bSig = extractFloat64Frac( b );
3877 bExp = extractFloat64Exp( b );
3878 expDiff = aExp - bExp;
3879 aSig <<= 10;
3880 bSig <<= 10;
3881 if ( 0 < expDiff ) goto aExpBigger;
3882 if ( expDiff < 0 ) goto bExpBigger;
3883 if ( aExp == 0x7FF ) {
3884 if (aSig | bSig) {
3885 return propagateFloat64NaN(a, b, status);
3887 float_raise(float_flag_invalid, status);
3888 return float64_default_nan;
3890 if ( aExp == 0 ) {
3891 aExp = 1;
3892 bExp = 1;
3894 if ( bSig < aSig ) goto aBigger;
3895 if ( aSig < bSig ) goto bBigger;
3896 return packFloat64(status->float_rounding_mode == float_round_down, 0, 0);
3897 bExpBigger:
3898 if ( bExp == 0x7FF ) {
3899 if (bSig) {
3900 return propagateFloat64NaN(a, b, status);
3902 return packFloat64( zSign ^ 1, 0x7FF, 0 );
3904 if ( aExp == 0 ) {
3905 ++expDiff;
3907 else {
3908 aSig |= LIT64( 0x4000000000000000 );
3910 shift64RightJamming( aSig, - expDiff, &aSig );
3911 bSig |= LIT64( 0x4000000000000000 );
3912 bBigger:
3913 zSig = bSig - aSig;
3914 zExp = bExp;
3915 zSign ^= 1;
3916 goto normalizeRoundAndPack;
3917 aExpBigger:
3918 if ( aExp == 0x7FF ) {
3919 if (aSig) {
3920 return propagateFloat64NaN(a, b, status);
3922 return a;
3924 if ( bExp == 0 ) {
3925 --expDiff;
3927 else {
3928 bSig |= LIT64( 0x4000000000000000 );
3930 shift64RightJamming( bSig, expDiff, &bSig );
3931 aSig |= LIT64( 0x4000000000000000 );
3932 aBigger:
3933 zSig = aSig - bSig;
3934 zExp = aExp;
3935 normalizeRoundAndPack:
3936 --zExp;
3937 return normalizeRoundAndPackFloat64(zSign, zExp, zSig, status);
3941 /*----------------------------------------------------------------------------
3942 | Returns the result of adding the double-precision floating-point values `a'
3943 | and `b'. The operation is performed according to the IEC/IEEE Standard for
3944 | Binary Floating-Point Arithmetic.
3945 *----------------------------------------------------------------------------*/
3947 float64 float64_add(float64 a, float64 b, float_status *status)
3949 flag aSign, bSign;
3950 a = float64_squash_input_denormal(a, status);
3951 b = float64_squash_input_denormal(b, status);
3953 aSign = extractFloat64Sign( a );
3954 bSign = extractFloat64Sign( b );
3955 if ( aSign == bSign ) {
3956 return addFloat64Sigs(a, b, aSign, status);
3958 else {
3959 return subFloat64Sigs(a, b, aSign, status);
3964 /*----------------------------------------------------------------------------
3965 | Returns the result of subtracting the double-precision floating-point values
3966 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3967 | for Binary Floating-Point Arithmetic.
3968 *----------------------------------------------------------------------------*/
3970 float64 float64_sub(float64 a, float64 b, float_status *status)
3972 flag aSign, bSign;
3973 a = float64_squash_input_denormal(a, status);
3974 b = float64_squash_input_denormal(b, status);
3976 aSign = extractFloat64Sign( a );
3977 bSign = extractFloat64Sign( b );
3978 if ( aSign == bSign ) {
3979 return subFloat64Sigs(a, b, aSign, status);
3981 else {
3982 return addFloat64Sigs(a, b, aSign, status);
3987 /*----------------------------------------------------------------------------
3988 | Returns the result of multiplying the double-precision floating-point values
3989 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3990 | for Binary Floating-Point Arithmetic.
3991 *----------------------------------------------------------------------------*/
3993 float64 float64_mul(float64 a, float64 b, float_status *status)
3995 flag aSign, bSign, zSign;
3996 int_fast16_t aExp, bExp, zExp;
3997 uint64_t aSig, bSig, zSig0, zSig1;
3999 a = float64_squash_input_denormal(a, status);
4000 b = float64_squash_input_denormal(b, status);
4002 aSig = extractFloat64Frac( a );
4003 aExp = extractFloat64Exp( a );
4004 aSign = extractFloat64Sign( a );
4005 bSig = extractFloat64Frac( b );
4006 bExp = extractFloat64Exp( b );
4007 bSign = extractFloat64Sign( b );
4008 zSign = aSign ^ bSign;
4009 if ( aExp == 0x7FF ) {
4010 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4011 return propagateFloat64NaN(a, b, status);
4013 if ( ( bExp | bSig ) == 0 ) {
4014 float_raise(float_flag_invalid, status);
4015 return float64_default_nan;
4017 return packFloat64( zSign, 0x7FF, 0 );
4019 if ( bExp == 0x7FF ) {
4020 if (bSig) {
4021 return propagateFloat64NaN(a, b, status);
4023 if ( ( aExp | aSig ) == 0 ) {
4024 float_raise(float_flag_invalid, status);
4025 return float64_default_nan;
4027 return packFloat64( zSign, 0x7FF, 0 );
4029 if ( aExp == 0 ) {
4030 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
4031 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4033 if ( bExp == 0 ) {
4034 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
4035 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4037 zExp = aExp + bExp - 0x3FF;
4038 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
4039 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4040 mul64To128( aSig, bSig, &zSig0, &zSig1 );
4041 zSig0 |= ( zSig1 != 0 );
4042 if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
4043 zSig0 <<= 1;
4044 --zExp;
4046 return roundAndPackFloat64(zSign, zExp, zSig0, status);
4050 /*----------------------------------------------------------------------------
4051 | Returns the result of dividing the double-precision floating-point value `a'
4052 | by the corresponding value `b'. The operation is performed according to
4053 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4054 *----------------------------------------------------------------------------*/
4056 float64 float64_div(float64 a, float64 b, float_status *status)
4058 flag aSign, bSign, zSign;
4059 int_fast16_t aExp, bExp, zExp;
4060 uint64_t aSig, bSig, zSig;
4061 uint64_t rem0, rem1;
4062 uint64_t term0, term1;
4063 a = float64_squash_input_denormal(a, status);
4064 b = float64_squash_input_denormal(b, status);
4066 aSig = extractFloat64Frac( a );
4067 aExp = extractFloat64Exp( a );
4068 aSign = extractFloat64Sign( a );
4069 bSig = extractFloat64Frac( b );
4070 bExp = extractFloat64Exp( b );
4071 bSign = extractFloat64Sign( b );
4072 zSign = aSign ^ bSign;
4073 if ( aExp == 0x7FF ) {
4074 if (aSig) {
4075 return propagateFloat64NaN(a, b, status);
4077 if ( bExp == 0x7FF ) {
4078 if (bSig) {
4079 return propagateFloat64NaN(a, b, status);
4081 float_raise(float_flag_invalid, status);
4082 return float64_default_nan;
4084 return packFloat64( zSign, 0x7FF, 0 );
4086 if ( bExp == 0x7FF ) {
4087 if (bSig) {
4088 return propagateFloat64NaN(a, b, status);
4090 return packFloat64( zSign, 0, 0 );
4092 if ( bExp == 0 ) {
4093 if ( bSig == 0 ) {
4094 if ( ( aExp | aSig ) == 0 ) {
4095 float_raise(float_flag_invalid, status);
4096 return float64_default_nan;
4098 float_raise(float_flag_divbyzero, status);
4099 return packFloat64( zSign, 0x7FF, 0 );
4101 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4103 if ( aExp == 0 ) {
4104 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
4105 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4107 zExp = aExp - bExp + 0x3FD;
4108 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
4109 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4110 if ( bSig <= ( aSig + aSig ) ) {
4111 aSig >>= 1;
4112 ++zExp;
4114 zSig = estimateDiv128To64( aSig, 0, bSig );
4115 if ( ( zSig & 0x1FF ) <= 2 ) {
4116 mul64To128( bSig, zSig, &term0, &term1 );
4117 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
4118 while ( (int64_t) rem0 < 0 ) {
4119 --zSig;
4120 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4122 zSig |= ( rem1 != 0 );
4124 return roundAndPackFloat64(zSign, zExp, zSig, status);
4128 /*----------------------------------------------------------------------------
4129 | Returns the remainder of the double-precision floating-point value `a'
4130 | with respect to the corresponding value `b'. The operation is performed
4131 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4132 *----------------------------------------------------------------------------*/
4134 float64 float64_rem(float64 a, float64 b, float_status *status)
4136 flag aSign, zSign;
4137 int_fast16_t aExp, bExp, expDiff;
4138 uint64_t aSig, bSig;
4139 uint64_t q, alternateASig;
4140 int64_t sigMean;
4142 a = float64_squash_input_denormal(a, status);
4143 b = float64_squash_input_denormal(b, status);
4144 aSig = extractFloat64Frac( a );
4145 aExp = extractFloat64Exp( a );
4146 aSign = extractFloat64Sign( a );
4147 bSig = extractFloat64Frac( b );
4148 bExp = extractFloat64Exp( b );
4149 if ( aExp == 0x7FF ) {
4150 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4151 return propagateFloat64NaN(a, b, status);
4153 float_raise(float_flag_invalid, status);
4154 return float64_default_nan;
4156 if ( bExp == 0x7FF ) {
4157 if (bSig) {
4158 return propagateFloat64NaN(a, b, status);
4160 return a;
4162 if ( bExp == 0 ) {
4163 if ( bSig == 0 ) {
4164 float_raise(float_flag_invalid, status);
4165 return float64_default_nan;
4167 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4169 if ( aExp == 0 ) {
4170 if ( aSig == 0 ) return a;
4171 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4173 expDiff = aExp - bExp;
4174 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
4175 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4176 if ( expDiff < 0 ) {
4177 if ( expDiff < -1 ) return a;
4178 aSig >>= 1;
4180 q = ( bSig <= aSig );
4181 if ( q ) aSig -= bSig;
4182 expDiff -= 64;
4183 while ( 0 < expDiff ) {
4184 q = estimateDiv128To64( aSig, 0, bSig );
4185 q = ( 2 < q ) ? q - 2 : 0;
4186 aSig = - ( ( bSig>>2 ) * q );
4187 expDiff -= 62;
4189 expDiff += 64;
4190 if ( 0 < expDiff ) {
4191 q = estimateDiv128To64( aSig, 0, bSig );
4192 q = ( 2 < q ) ? q - 2 : 0;
4193 q >>= 64 - expDiff;
4194 bSig >>= 2;
4195 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4197 else {
4198 aSig >>= 2;
4199 bSig >>= 2;
4201 do {
4202 alternateASig = aSig;
4203 ++q;
4204 aSig -= bSig;
4205 } while ( 0 <= (int64_t) aSig );
4206 sigMean = aSig + alternateASig;
4207 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4208 aSig = alternateASig;
4210 zSign = ( (int64_t) aSig < 0 );
4211 if ( zSign ) aSig = - aSig;
4212 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
4216 /*----------------------------------------------------------------------------
4217 | Returns the result of multiplying the double-precision floating-point values
4218 | `a' and `b' then adding 'c', with no intermediate rounding step after the
4219 | multiplication. The operation is performed according to the IEC/IEEE
4220 | Standard for Binary Floating-Point Arithmetic 754-2008.
4221 | The flags argument allows the caller to select negation of the
4222 | addend, the intermediate product, or the final result. (The difference
4223 | between this and having the caller do a separate negation is that negating
4224 | externally will flip the sign bit on NaNs.)
4225 *----------------------------------------------------------------------------*/
4227 float64 float64_muladd(float64 a, float64 b, float64 c, int flags,
4228 float_status *status)
4230 flag aSign, bSign, cSign, zSign;
4231 int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
4232 uint64_t aSig, bSig, cSig;
4233 flag pInf, pZero, pSign;
4234 uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1;
4235 int shiftcount;
4236 flag signflip, infzero;
4238 a = float64_squash_input_denormal(a, status);
4239 b = float64_squash_input_denormal(b, status);
4240 c = float64_squash_input_denormal(c, status);
4241 aSig = extractFloat64Frac(a);
4242 aExp = extractFloat64Exp(a);
4243 aSign = extractFloat64Sign(a);
4244 bSig = extractFloat64Frac(b);
4245 bExp = extractFloat64Exp(b);
4246 bSign = extractFloat64Sign(b);
4247 cSig = extractFloat64Frac(c);
4248 cExp = extractFloat64Exp(c);
4249 cSign = extractFloat64Sign(c);
4251 infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) ||
4252 (aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0));
4254 /* It is implementation-defined whether the cases of (0,inf,qnan)
4255 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
4256 * they return if they do), so we have to hand this information
4257 * off to the target-specific pick-a-NaN routine.
4259 if (((aExp == 0x7ff) && aSig) ||
4260 ((bExp == 0x7ff) && bSig) ||
4261 ((cExp == 0x7ff) && cSig)) {
4262 return propagateFloat64MulAddNaN(a, b, c, infzero, status);
4265 if (infzero) {
4266 float_raise(float_flag_invalid, status);
4267 return float64_default_nan;
4270 if (flags & float_muladd_negate_c) {
4271 cSign ^= 1;
4274 signflip = (flags & float_muladd_negate_result) ? 1 : 0;
4276 /* Work out the sign and type of the product */
4277 pSign = aSign ^ bSign;
4278 if (flags & float_muladd_negate_product) {
4279 pSign ^= 1;
4281 pInf = (aExp == 0x7ff) || (bExp == 0x7ff);
4282 pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
4284 if (cExp == 0x7ff) {
4285 if (pInf && (pSign ^ cSign)) {
4286 /* addition of opposite-signed infinities => InvalidOperation */
4287 float_raise(float_flag_invalid, status);
4288 return float64_default_nan;
4290 /* Otherwise generate an infinity of the same sign */
4291 return packFloat64(cSign ^ signflip, 0x7ff, 0);
4294 if (pInf) {
4295 return packFloat64(pSign ^ signflip, 0x7ff, 0);
4298 if (pZero) {
4299 if (cExp == 0) {
4300 if (cSig == 0) {
4301 /* Adding two exact zeroes */
4302 if (pSign == cSign) {
4303 zSign = pSign;
4304 } else if (status->float_rounding_mode == float_round_down) {
4305 zSign = 1;
4306 } else {
4307 zSign = 0;
4309 return packFloat64(zSign ^ signflip, 0, 0);
4311 /* Exact zero plus a denorm */
4312 if (status->flush_to_zero) {
4313 float_raise(float_flag_output_denormal, status);
4314 return packFloat64(cSign ^ signflip, 0, 0);
4317 /* Zero plus something non-zero : just return the something */
4318 if (flags & float_muladd_halve_result) {
4319 if (cExp == 0) {
4320 normalizeFloat64Subnormal(cSig, &cExp, &cSig);
4322 /* Subtract one to halve, and one again because roundAndPackFloat64
4323 * wants one less than the true exponent.
4325 cExp -= 2;
4326 cSig = (cSig | 0x0010000000000000ULL) << 10;
4327 return roundAndPackFloat64(cSign ^ signflip, cExp, cSig, status);
4329 return packFloat64(cSign ^ signflip, cExp, cSig);
4332 if (aExp == 0) {
4333 normalizeFloat64Subnormal(aSig, &aExp, &aSig);
4335 if (bExp == 0) {
4336 normalizeFloat64Subnormal(bSig, &bExp, &bSig);
4339 /* Calculate the actual result a * b + c */
4341 /* Multiply first; this is easy. */
4342 /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff
4343 * because we want the true exponent, not the "one-less-than"
4344 * flavour that roundAndPackFloat64() takes.
4346 pExp = aExp + bExp - 0x3fe;
4347 aSig = (aSig | LIT64(0x0010000000000000))<<10;
4348 bSig = (bSig | LIT64(0x0010000000000000))<<11;
4349 mul64To128(aSig, bSig, &pSig0, &pSig1);
4350 if ((int64_t)(pSig0 << 1) >= 0) {
4351 shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1);
4352 pExp--;
4355 zSign = pSign ^ signflip;
4357 /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit
4358 * bit in position 126.
4360 if (cExp == 0) {
4361 if (!cSig) {
4362 /* Throw out the special case of c being an exact zero now */
4363 shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1);
4364 if (flags & float_muladd_halve_result) {
4365 pExp--;
4367 return roundAndPackFloat64(zSign, pExp - 1,
4368 pSig1, status);
4370 normalizeFloat64Subnormal(cSig, &cExp, &cSig);
4373 /* Shift cSig and add the explicit bit so [cSig0:cSig1] is the
4374 * significand of the addend, with the explicit bit in position 126.
4376 cSig0 = cSig << (126 - 64 - 52);
4377 cSig1 = 0;
4378 cSig0 |= LIT64(0x4000000000000000);
4379 expDiff = pExp - cExp;
4381 if (pSign == cSign) {
4382 /* Addition */
4383 if (expDiff > 0) {
4384 /* scale c to match p */
4385 shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
4386 zExp = pExp;
4387 } else if (expDiff < 0) {
4388 /* scale p to match c */
4389 shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
4390 zExp = cExp;
4391 } else {
4392 /* no scaling needed */
4393 zExp = cExp;
4395 /* Add significands and make sure explicit bit ends up in posn 126 */
4396 add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4397 if ((int64_t)zSig0 < 0) {
4398 shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);
4399 } else {
4400 zExp--;
4402 shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1);
4403 if (flags & float_muladd_halve_result) {
4404 zExp--;
4406 return roundAndPackFloat64(zSign, zExp, zSig1, status);
4407 } else {
4408 /* Subtraction */
4409 if (expDiff > 0) {
4410 shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
4411 sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4412 zExp = pExp;
4413 } else if (expDiff < 0) {
4414 shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
4415 sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4416 zExp = cExp;
4417 zSign ^= 1;
4418 } else {
4419 zExp = pExp;
4420 if (lt128(cSig0, cSig1, pSig0, pSig1)) {
4421 sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4422 } else if (lt128(pSig0, pSig1, cSig0, cSig1)) {
4423 sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4424 zSign ^= 1;
4425 } else {
4426 /* Exact zero */
4427 zSign = signflip;
4428 if (status->float_rounding_mode == float_round_down) {
4429 zSign ^= 1;
4431 return packFloat64(zSign, 0, 0);
4434 --zExp;
4435 /* Do the equivalent of normalizeRoundAndPackFloat64() but
4436 * starting with the significand in a pair of uint64_t.
4438 if (zSig0) {
4439 shiftcount = countLeadingZeros64(zSig0) - 1;
4440 shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1);
4441 if (zSig1) {
4442 zSig0 |= 1;
4444 zExp -= shiftcount;
4445 } else {
4446 shiftcount = countLeadingZeros64(zSig1);
4447 if (shiftcount == 0) {
4448 zSig0 = (zSig1 >> 1) | (zSig1 & 1);
4449 zExp -= 63;
4450 } else {
4451 shiftcount--;
4452 zSig0 = zSig1 << shiftcount;
4453 zExp -= (shiftcount + 64);
4456 if (flags & float_muladd_halve_result) {
4457 zExp--;
4459 return roundAndPackFloat64(zSign, zExp, zSig0, status);
4463 /*----------------------------------------------------------------------------
4464 | Returns the square root of the double-precision floating-point value `a'.
4465 | The operation is performed according to the IEC/IEEE Standard for Binary
4466 | Floating-Point Arithmetic.
4467 *----------------------------------------------------------------------------*/
4469 float64 float64_sqrt(float64 a, float_status *status)
4471 flag aSign;
4472 int_fast16_t aExp, zExp;
4473 uint64_t aSig, zSig, doubleZSig;
4474 uint64_t rem0, rem1, term0, term1;
4475 a = float64_squash_input_denormal(a, status);
4477 aSig = extractFloat64Frac( a );
4478 aExp = extractFloat64Exp( a );
4479 aSign = extractFloat64Sign( a );
4480 if ( aExp == 0x7FF ) {
4481 if (aSig) {
4482 return propagateFloat64NaN(a, a, status);
4484 if ( ! aSign ) return a;
4485 float_raise(float_flag_invalid, status);
4486 return float64_default_nan;
4488 if ( aSign ) {
4489 if ( ( aExp | aSig ) == 0 ) return a;
4490 float_raise(float_flag_invalid, status);
4491 return float64_default_nan;
4493 if ( aExp == 0 ) {
4494 if ( aSig == 0 ) return float64_zero;
4495 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4497 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
4498 aSig |= LIT64( 0x0010000000000000 );
4499 zSig = estimateSqrt32( aExp, aSig>>21 );
4500 aSig <<= 9 - ( aExp & 1 );
4501 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
4502 if ( ( zSig & 0x1FF ) <= 5 ) {
4503 doubleZSig = zSig<<1;
4504 mul64To128( zSig, zSig, &term0, &term1 );
4505 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
4506 while ( (int64_t) rem0 < 0 ) {
4507 --zSig;
4508 doubleZSig -= 2;
4509 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
4511 zSig |= ( ( rem0 | rem1 ) != 0 );
4513 return roundAndPackFloat64(0, zExp, zSig, status);
4517 /*----------------------------------------------------------------------------
4518 | Returns the binary log of the double-precision floating-point value `a'.
4519 | The operation is performed according to the IEC/IEEE Standard for Binary
4520 | Floating-Point Arithmetic.
4521 *----------------------------------------------------------------------------*/
4522 float64 float64_log2(float64 a, float_status *status)
4524 flag aSign, zSign;
4525 int_fast16_t aExp;
4526 uint64_t aSig, aSig0, aSig1, zSig, i;
4527 a = float64_squash_input_denormal(a, status);
4529 aSig = extractFloat64Frac( a );
4530 aExp = extractFloat64Exp( a );
4531 aSign = extractFloat64Sign( a );
4533 if ( aExp == 0 ) {
4534 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4535 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4537 if ( aSign ) {
4538 float_raise(float_flag_invalid, status);
4539 return float64_default_nan;
4541 if ( aExp == 0x7FF ) {
4542 if (aSig) {
4543 return propagateFloat64NaN(a, float64_zero, status);
4545 return a;
4548 aExp -= 0x3FF;
4549 aSig |= LIT64( 0x0010000000000000 );
4550 zSign = aExp < 0;
4551 zSig = (uint64_t)aExp << 52;
4552 for (i = 1LL << 51; i > 0; i >>= 1) {
4553 mul64To128( aSig, aSig, &aSig0, &aSig1 );
4554 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4555 if ( aSig & LIT64( 0x0020000000000000 ) ) {
4556 aSig >>= 1;
4557 zSig |= i;
4561 if ( zSign )
4562 zSig = -zSig;
4563 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
4566 /*----------------------------------------------------------------------------
4567 | Returns 1 if the double-precision floating-point value `a' is equal to the
4568 | corresponding value `b', and 0 otherwise. The invalid exception is raised
4569 | if either operand is a NaN. Otherwise, the comparison is performed
4570 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4571 *----------------------------------------------------------------------------*/
4573 int float64_eq(float64 a, float64 b, float_status *status)
4575 uint64_t av, bv;
4576 a = float64_squash_input_denormal(a, status);
4577 b = float64_squash_input_denormal(b, status);
4579 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4580 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4582 float_raise(float_flag_invalid, status);
4583 return 0;
4585 av = float64_val(a);
4586 bv = float64_val(b);
4587 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4591 /*----------------------------------------------------------------------------
4592 | Returns 1 if the double-precision floating-point value `a' is less than or
4593 | equal to the corresponding value `b', and 0 otherwise. The invalid
4594 | exception is raised if either operand is a NaN. The comparison is performed
4595 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4596 *----------------------------------------------------------------------------*/
4598 int float64_le(float64 a, float64 b, float_status *status)
4600 flag aSign, bSign;
4601 uint64_t av, bv;
4602 a = float64_squash_input_denormal(a, status);
4603 b = float64_squash_input_denormal(b, status);
4605 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4606 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4608 float_raise(float_flag_invalid, status);
4609 return 0;
4611 aSign = extractFloat64Sign( a );
4612 bSign = extractFloat64Sign( b );
4613 av = float64_val(a);
4614 bv = float64_val(b);
4615 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4616 return ( av == bv ) || ( aSign ^ ( av < bv ) );
4620 /*----------------------------------------------------------------------------
4621 | Returns 1 if the double-precision floating-point value `a' is less than
4622 | the corresponding value `b', and 0 otherwise. The invalid exception is
4623 | raised if either operand is a NaN. The comparison is performed according
4624 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4625 *----------------------------------------------------------------------------*/
4627 int float64_lt(float64 a, float64 b, float_status *status)
4629 flag aSign, bSign;
4630 uint64_t av, bv;
4632 a = float64_squash_input_denormal(a, status);
4633 b = float64_squash_input_denormal(b, status);
4634 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4635 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4637 float_raise(float_flag_invalid, status);
4638 return 0;
4640 aSign = extractFloat64Sign( a );
4641 bSign = extractFloat64Sign( b );
4642 av = float64_val(a);
4643 bv = float64_val(b);
4644 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4645 return ( av != bv ) && ( aSign ^ ( av < bv ) );
4649 /*----------------------------------------------------------------------------
4650 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4651 | be compared, and 0 otherwise. The invalid exception is raised if either
4652 | operand is a NaN. The comparison is performed according to the IEC/IEEE
4653 | Standard for Binary Floating-Point Arithmetic.
4654 *----------------------------------------------------------------------------*/
4656 int float64_unordered(float64 a, float64 b, float_status *status)
4658 a = float64_squash_input_denormal(a, status);
4659 b = float64_squash_input_denormal(b, status);
4661 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4662 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4664 float_raise(float_flag_invalid, status);
4665 return 1;
4667 return 0;
4670 /*----------------------------------------------------------------------------
4671 | Returns 1 if the double-precision floating-point value `a' is equal to the
4672 | corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4673 | exception.The comparison is performed according to the IEC/IEEE Standard
4674 | for Binary Floating-Point Arithmetic.
4675 *----------------------------------------------------------------------------*/
4677 int float64_eq_quiet(float64 a, float64 b, float_status *status)
4679 uint64_t av, bv;
4680 a = float64_squash_input_denormal(a, status);
4681 b = float64_squash_input_denormal(b, status);
4683 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4684 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4686 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4687 float_raise(float_flag_invalid, status);
4689 return 0;
4691 av = float64_val(a);
4692 bv = float64_val(b);
4693 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4697 /*----------------------------------------------------------------------------
4698 | Returns 1 if the double-precision floating-point value `a' is less than or
4699 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4700 | cause an exception. Otherwise, the comparison is performed according to the
4701 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4702 *----------------------------------------------------------------------------*/
4704 int float64_le_quiet(float64 a, float64 b, float_status *status)
4706 flag aSign, bSign;
4707 uint64_t av, bv;
4708 a = float64_squash_input_denormal(a, status);
4709 b = float64_squash_input_denormal(b, status);
4711 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4712 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4714 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4715 float_raise(float_flag_invalid, status);
4717 return 0;
4719 aSign = extractFloat64Sign( a );
4720 bSign = extractFloat64Sign( b );
4721 av = float64_val(a);
4722 bv = float64_val(b);
4723 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4724 return ( av == bv ) || ( aSign ^ ( av < bv ) );
4728 /*----------------------------------------------------------------------------
4729 | Returns 1 if the double-precision floating-point value `a' is less than
4730 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4731 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
4732 | Standard for Binary Floating-Point Arithmetic.
4733 *----------------------------------------------------------------------------*/
4735 int float64_lt_quiet(float64 a, float64 b, float_status *status)
4737 flag aSign, bSign;
4738 uint64_t av, bv;
4739 a = float64_squash_input_denormal(a, status);
4740 b = float64_squash_input_denormal(b, status);
4742 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4743 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4745 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4746 float_raise(float_flag_invalid, status);
4748 return 0;
4750 aSign = extractFloat64Sign( a );
4751 bSign = extractFloat64Sign( b );
4752 av = float64_val(a);
4753 bv = float64_val(b);
4754 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4755 return ( av != bv ) && ( aSign ^ ( av < bv ) );
4759 /*----------------------------------------------------------------------------
4760 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4761 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
4762 | comparison is performed according to the IEC/IEEE Standard for Binary
4763 | Floating-Point Arithmetic.
4764 *----------------------------------------------------------------------------*/
4766 int float64_unordered_quiet(float64 a, float64 b, float_status *status)
4768 a = float64_squash_input_denormal(a, status);
4769 b = float64_squash_input_denormal(b, status);
4771 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4772 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4774 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4775 float_raise(float_flag_invalid, status);
4777 return 1;
4779 return 0;
4782 /*----------------------------------------------------------------------------
4783 | Returns the result of converting the extended double-precision floating-
4784 | point value `a' to the 32-bit two's complement integer format. The
4785 | conversion is performed according to the IEC/IEEE Standard for Binary
4786 | Floating-Point Arithmetic---which means in particular that the conversion
4787 | is rounded according to the current rounding mode. If `a' is a NaN, the
4788 | largest positive integer is returned. Otherwise, if the conversion
4789 | overflows, the largest integer with the same sign as `a' is returned.
4790 *----------------------------------------------------------------------------*/
4792 int32_t floatx80_to_int32(floatx80 a, float_status *status)
4794 flag aSign;
4795 int32_t aExp, shiftCount;
4796 uint64_t aSig;
4798 aSig = extractFloatx80Frac( a );
4799 aExp = extractFloatx80Exp( a );
4800 aSign = extractFloatx80Sign( a );
4801 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4802 shiftCount = 0x4037 - aExp;
4803 if ( shiftCount <= 0 ) shiftCount = 1;
4804 shift64RightJamming( aSig, shiftCount, &aSig );
4805 return roundAndPackInt32(aSign, aSig, status);
4809 /*----------------------------------------------------------------------------
4810 | Returns the result of converting the extended double-precision floating-
4811 | point value `a' to the 32-bit two's complement integer format. The
4812 | conversion is performed according to the IEC/IEEE Standard for Binary
4813 | Floating-Point Arithmetic, except that the conversion is always rounded
4814 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4815 | Otherwise, if the conversion overflows, the largest integer with the same
4816 | sign as `a' is returned.
4817 *----------------------------------------------------------------------------*/
4819 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
4821 flag aSign;
4822 int32_t aExp, shiftCount;
4823 uint64_t aSig, savedASig;
4824 int32_t z;
4826 aSig = extractFloatx80Frac( a );
4827 aExp = extractFloatx80Exp( a );
4828 aSign = extractFloatx80Sign( a );
4829 if ( 0x401E < aExp ) {
4830 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4831 goto invalid;
4833 else if ( aExp < 0x3FFF ) {
4834 if (aExp || aSig) {
4835 status->float_exception_flags |= float_flag_inexact;
4837 return 0;
4839 shiftCount = 0x403E - aExp;
4840 savedASig = aSig;
4841 aSig >>= shiftCount;
4842 z = aSig;
4843 if ( aSign ) z = - z;
4844 if ( ( z < 0 ) ^ aSign ) {
4845 invalid:
4846 float_raise(float_flag_invalid, status);
4847 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4849 if ( ( aSig<<shiftCount ) != savedASig ) {
4850 status->float_exception_flags |= float_flag_inexact;
4852 return z;
4856 /*----------------------------------------------------------------------------
4857 | Returns the result of converting the extended double-precision floating-
4858 | point value `a' to the 64-bit two's complement integer format. The
4859 | conversion is performed according to the IEC/IEEE Standard for Binary
4860 | Floating-Point Arithmetic---which means in particular that the conversion
4861 | is rounded according to the current rounding mode. If `a' is a NaN,
4862 | the largest positive integer is returned. Otherwise, if the conversion
4863 | overflows, the largest integer with the same sign as `a' is returned.
4864 *----------------------------------------------------------------------------*/
4866 int64_t floatx80_to_int64(floatx80 a, float_status *status)
4868 flag aSign;
4869 int32_t aExp, shiftCount;
4870 uint64_t aSig, aSigExtra;
4872 aSig = extractFloatx80Frac( a );
4873 aExp = extractFloatx80Exp( a );
4874 aSign = extractFloatx80Sign( a );
4875 shiftCount = 0x403E - aExp;
4876 if ( shiftCount <= 0 ) {
4877 if ( shiftCount ) {
4878 float_raise(float_flag_invalid, status);
4879 if ( ! aSign
4880 || ( ( aExp == 0x7FFF )
4881 && ( aSig != LIT64( 0x8000000000000000 ) ) )
4883 return LIT64( 0x7FFFFFFFFFFFFFFF );
4885 return (int64_t) LIT64( 0x8000000000000000 );
4887 aSigExtra = 0;
4889 else {
4890 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4892 return roundAndPackInt64(aSign, aSig, aSigExtra, status);
4896 /*----------------------------------------------------------------------------
4897 | Returns the result of converting the extended double-precision floating-
4898 | point value `a' to the 64-bit two's complement integer format. The
4899 | conversion is performed according to the IEC/IEEE Standard for Binary
4900 | Floating-Point Arithmetic, except that the conversion is always rounded
4901 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4902 | Otherwise, if the conversion overflows, the largest integer with the same
4903 | sign as `a' is returned.
4904 *----------------------------------------------------------------------------*/
4906 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
4908 flag aSign;
4909 int32_t aExp, shiftCount;
4910 uint64_t aSig;
4911 int64_t z;
4913 aSig = extractFloatx80Frac( a );
4914 aExp = extractFloatx80Exp( a );
4915 aSign = extractFloatx80Sign( a );
4916 shiftCount = aExp - 0x403E;
4917 if ( 0 <= shiftCount ) {
4918 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4919 if ( ( a.high != 0xC03E ) || aSig ) {
4920 float_raise(float_flag_invalid, status);
4921 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4922 return LIT64( 0x7FFFFFFFFFFFFFFF );
4925 return (int64_t) LIT64( 0x8000000000000000 );
4927 else if ( aExp < 0x3FFF ) {
4928 if (aExp | aSig) {
4929 status->float_exception_flags |= float_flag_inexact;
4931 return 0;
4933 z = aSig>>( - shiftCount );
4934 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4935 status->float_exception_flags |= float_flag_inexact;
4937 if ( aSign ) z = - z;
4938 return z;
4942 /*----------------------------------------------------------------------------
4943 | Returns the result of converting the extended double-precision floating-
4944 | point value `a' to the single-precision floating-point format. The
4945 | conversion is performed according to the IEC/IEEE Standard for Binary
4946 | Floating-Point Arithmetic.
4947 *----------------------------------------------------------------------------*/
4949 float32 floatx80_to_float32(floatx80 a, float_status *status)
4951 flag aSign;
4952 int32_t aExp;
4953 uint64_t aSig;
4955 aSig = extractFloatx80Frac( a );
4956 aExp = extractFloatx80Exp( a );
4957 aSign = extractFloatx80Sign( a );
4958 if ( aExp == 0x7FFF ) {
4959 if ( (uint64_t) ( aSig<<1 ) ) {
4960 return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
4962 return packFloat32( aSign, 0xFF, 0 );
4964 shift64RightJamming( aSig, 33, &aSig );
4965 if ( aExp || aSig ) aExp -= 0x3F81;
4966 return roundAndPackFloat32(aSign, aExp, aSig, status);
4970 /*----------------------------------------------------------------------------
4971 | Returns the result of converting the extended double-precision floating-
4972 | point value `a' to the double-precision floating-point format. The
4973 | conversion is performed according to the IEC/IEEE Standard for Binary
4974 | Floating-Point Arithmetic.
4975 *----------------------------------------------------------------------------*/
4977 float64 floatx80_to_float64(floatx80 a, float_status *status)
4979 flag aSign;
4980 int32_t aExp;
4981 uint64_t aSig, zSig;
4983 aSig = extractFloatx80Frac( a );
4984 aExp = extractFloatx80Exp( a );
4985 aSign = extractFloatx80Sign( a );
4986 if ( aExp == 0x7FFF ) {
4987 if ( (uint64_t) ( aSig<<1 ) ) {
4988 return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
4990 return packFloat64( aSign, 0x7FF, 0 );
4992 shift64RightJamming( aSig, 1, &zSig );
4993 if ( aExp || aSig ) aExp -= 0x3C01;
4994 return roundAndPackFloat64(aSign, aExp, zSig, status);
4998 /*----------------------------------------------------------------------------
4999 | Returns the result of converting the extended double-precision floating-
5000 | point value `a' to the quadruple-precision floating-point format. The
5001 | conversion is performed according to the IEC/IEEE Standard for Binary
5002 | Floating-Point Arithmetic.
5003 *----------------------------------------------------------------------------*/
5005 float128 floatx80_to_float128(floatx80 a, float_status *status)
5007 flag aSign;
5008 int_fast16_t aExp;
5009 uint64_t aSig, zSig0, zSig1;
5011 aSig = extractFloatx80Frac( a );
5012 aExp = extractFloatx80Exp( a );
5013 aSign = extractFloatx80Sign( a );
5014 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
5015 return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
5017 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
5018 return packFloat128( aSign, aExp, zSig0, zSig1 );
5022 /*----------------------------------------------------------------------------
5023 | Rounds the extended double-precision floating-point value `a' to an integer,
5024 | and returns the result as an extended quadruple-precision floating-point
5025 | value. The operation is performed according to the IEC/IEEE Standard for
5026 | Binary Floating-Point Arithmetic.
5027 *----------------------------------------------------------------------------*/
5029 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
5031 flag aSign;
5032 int32_t aExp;
5033 uint64_t lastBitMask, roundBitsMask;
5034 floatx80 z;
5036 aExp = extractFloatx80Exp( a );
5037 if ( 0x403E <= aExp ) {
5038 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
5039 return propagateFloatx80NaN(a, a, status);
5041 return a;
5043 if ( aExp < 0x3FFF ) {
5044 if ( ( aExp == 0 )
5045 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
5046 return a;
5048 status->float_exception_flags |= float_flag_inexact;
5049 aSign = extractFloatx80Sign( a );
5050 switch (status->float_rounding_mode) {
5051 case float_round_nearest_even:
5052 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
5054 return
5055 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
5057 break;
5058 case float_round_ties_away:
5059 if (aExp == 0x3FFE) {
5060 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
5062 break;
5063 case float_round_down:
5064 return
5065 aSign ?
5066 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
5067 : packFloatx80( 0, 0, 0 );
5068 case float_round_up:
5069 return
5070 aSign ? packFloatx80( 1, 0, 0 )
5071 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
5073 return packFloatx80( aSign, 0, 0 );
5075 lastBitMask = 1;
5076 lastBitMask <<= 0x403E - aExp;
5077 roundBitsMask = lastBitMask - 1;
5078 z = a;
5079 switch (status->float_rounding_mode) {
5080 case float_round_nearest_even:
5081 z.low += lastBitMask>>1;
5082 if ((z.low & roundBitsMask) == 0) {
5083 z.low &= ~lastBitMask;
5085 break;
5086 case float_round_ties_away:
5087 z.low += lastBitMask >> 1;
5088 break;
5089 case float_round_to_zero:
5090 break;
5091 case float_round_up:
5092 if (!extractFloatx80Sign(z)) {
5093 z.low += roundBitsMask;
5095 break;
5096 case float_round_down:
5097 if (extractFloatx80Sign(z)) {
5098 z.low += roundBitsMask;
5100 break;
5101 default:
5102 abort();
5104 z.low &= ~ roundBitsMask;
5105 if ( z.low == 0 ) {
5106 ++z.high;
5107 z.low = LIT64( 0x8000000000000000 );
5109 if (z.low != a.low) {
5110 status->float_exception_flags |= float_flag_inexact;
5112 return z;
5116 /*----------------------------------------------------------------------------
5117 | Returns the result of adding the absolute values of the extended double-
5118 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
5119 | negated before being returned. `zSign' is ignored if the result is a NaN.
5120 | The addition is performed according to the IEC/IEEE Standard for Binary
5121 | Floating-Point Arithmetic.
5122 *----------------------------------------------------------------------------*/
5124 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5125 float_status *status)
5127 int32_t aExp, bExp, zExp;
5128 uint64_t aSig, bSig, zSig0, zSig1;
5129 int32_t expDiff;
5131 aSig = extractFloatx80Frac( a );
5132 aExp = extractFloatx80Exp( a );
5133 bSig = extractFloatx80Frac( b );
5134 bExp = extractFloatx80Exp( b );
5135 expDiff = aExp - bExp;
5136 if ( 0 < expDiff ) {
5137 if ( aExp == 0x7FFF ) {
5138 if ((uint64_t)(aSig << 1)) {
5139 return propagateFloatx80NaN(a, b, status);
5141 return a;
5143 if ( bExp == 0 ) --expDiff;
5144 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5145 zExp = aExp;
5147 else if ( expDiff < 0 ) {
5148 if ( bExp == 0x7FFF ) {
5149 if ((uint64_t)(bSig << 1)) {
5150 return propagateFloatx80NaN(a, b, status);
5152 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5154 if ( aExp == 0 ) ++expDiff;
5155 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5156 zExp = bExp;
5158 else {
5159 if ( aExp == 0x7FFF ) {
5160 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5161 return propagateFloatx80NaN(a, b, status);
5163 return a;
5165 zSig1 = 0;
5166 zSig0 = aSig + bSig;
5167 if ( aExp == 0 ) {
5168 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
5169 goto roundAndPack;
5171 zExp = aExp;
5172 goto shiftRight1;
5174 zSig0 = aSig + bSig;
5175 if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
5176 shiftRight1:
5177 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
5178 zSig0 |= LIT64( 0x8000000000000000 );
5179 ++zExp;
5180 roundAndPack:
5181 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5182 zSign, zExp, zSig0, zSig1, status);
5185 /*----------------------------------------------------------------------------
5186 | Returns the result of subtracting the absolute values of the extended
5187 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
5188 | difference is negated before being returned. `zSign' is ignored if the
5189 | result is a NaN. The subtraction is performed according to the IEC/IEEE
5190 | Standard for Binary Floating-Point Arithmetic.
5191 *----------------------------------------------------------------------------*/
5193 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5194 float_status *status)
5196 int32_t aExp, bExp, zExp;
5197 uint64_t aSig, bSig, zSig0, zSig1;
5198 int32_t expDiff;
5199 floatx80 z;
5201 aSig = extractFloatx80Frac( a );
5202 aExp = extractFloatx80Exp( a );
5203 bSig = extractFloatx80Frac( b );
5204 bExp = extractFloatx80Exp( b );
5205 expDiff = aExp - bExp;
5206 if ( 0 < expDiff ) goto aExpBigger;
5207 if ( expDiff < 0 ) goto bExpBigger;
5208 if ( aExp == 0x7FFF ) {
5209 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5210 return propagateFloatx80NaN(a, b, status);
5212 float_raise(float_flag_invalid, status);
5213 z.low = floatx80_default_nan_low;
5214 z.high = floatx80_default_nan_high;
5215 return z;
5217 if ( aExp == 0 ) {
5218 aExp = 1;
5219 bExp = 1;
5221 zSig1 = 0;
5222 if ( bSig < aSig ) goto aBigger;
5223 if ( aSig < bSig ) goto bBigger;
5224 return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
5225 bExpBigger:
5226 if ( bExp == 0x7FFF ) {
5227 if ((uint64_t)(bSig << 1)) {
5228 return propagateFloatx80NaN(a, b, status);
5230 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
5232 if ( aExp == 0 ) ++expDiff;
5233 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5234 bBigger:
5235 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
5236 zExp = bExp;
5237 zSign ^= 1;
5238 goto normalizeRoundAndPack;
5239 aExpBigger:
5240 if ( aExp == 0x7FFF ) {
5241 if ((uint64_t)(aSig << 1)) {
5242 return propagateFloatx80NaN(a, b, status);
5244 return a;
5246 if ( bExp == 0 ) --expDiff;
5247 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5248 aBigger:
5249 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
5250 zExp = aExp;
5251 normalizeRoundAndPack:
5252 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
5253 zSign, zExp, zSig0, zSig1, status);
5256 /*----------------------------------------------------------------------------
5257 | Returns the result of adding the extended double-precision floating-point
5258 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5259 | Standard for Binary Floating-Point Arithmetic.
5260 *----------------------------------------------------------------------------*/
5262 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
5264 flag aSign, bSign;
5266 aSign = extractFloatx80Sign( a );
5267 bSign = extractFloatx80Sign( b );
5268 if ( aSign == bSign ) {
5269 return addFloatx80Sigs(a, b, aSign, status);
5271 else {
5272 return subFloatx80Sigs(a, b, aSign, status);
5277 /*----------------------------------------------------------------------------
5278 | Returns the result of subtracting the extended double-precision floating-
5279 | point values `a' and `b'. The operation is performed according to the
5280 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5281 *----------------------------------------------------------------------------*/
5283 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5285 flag aSign, bSign;
5287 aSign = extractFloatx80Sign( a );
5288 bSign = extractFloatx80Sign( b );
5289 if ( aSign == bSign ) {
5290 return subFloatx80Sigs(a, b, aSign, status);
5292 else {
5293 return addFloatx80Sigs(a, b, aSign, status);
5298 /*----------------------------------------------------------------------------
5299 | Returns the result of multiplying the extended double-precision floating-
5300 | point values `a' and `b'. The operation is performed according to the
5301 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5302 *----------------------------------------------------------------------------*/
5304 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
5306 flag aSign, bSign, zSign;
5307 int32_t aExp, bExp, zExp;
5308 uint64_t aSig, bSig, zSig0, zSig1;
5309 floatx80 z;
5311 aSig = extractFloatx80Frac( a );
5312 aExp = extractFloatx80Exp( a );
5313 aSign = extractFloatx80Sign( a );
5314 bSig = extractFloatx80Frac( b );
5315 bExp = extractFloatx80Exp( b );
5316 bSign = extractFloatx80Sign( b );
5317 zSign = aSign ^ bSign;
5318 if ( aExp == 0x7FFF ) {
5319 if ( (uint64_t) ( aSig<<1 )
5320 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5321 return propagateFloatx80NaN(a, b, status);
5323 if ( ( bExp | bSig ) == 0 ) goto invalid;
5324 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5326 if ( bExp == 0x7FFF ) {
5327 if ((uint64_t)(bSig << 1)) {
5328 return propagateFloatx80NaN(a, b, status);
5330 if ( ( aExp | aSig ) == 0 ) {
5331 invalid:
5332 float_raise(float_flag_invalid, status);
5333 z.low = floatx80_default_nan_low;
5334 z.high = floatx80_default_nan_high;
5335 return z;
5337 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5339 if ( aExp == 0 ) {
5340 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5341 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5343 if ( bExp == 0 ) {
5344 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5345 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5347 zExp = aExp + bExp - 0x3FFE;
5348 mul64To128( aSig, bSig, &zSig0, &zSig1 );
5349 if ( 0 < (int64_t) zSig0 ) {
5350 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5351 --zExp;
5353 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5354 zSign, zExp, zSig0, zSig1, status);
5357 /*----------------------------------------------------------------------------
5358 | Returns the result of dividing the extended double-precision floating-point
5359 | value `a' by the corresponding value `b'. The operation is performed
5360 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5361 *----------------------------------------------------------------------------*/
5363 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
5365 flag aSign, bSign, zSign;
5366 int32_t aExp, bExp, zExp;
5367 uint64_t aSig, bSig, zSig0, zSig1;
5368 uint64_t rem0, rem1, rem2, term0, term1, term2;
5369 floatx80 z;
5371 aSig = extractFloatx80Frac( a );
5372 aExp = extractFloatx80Exp( a );
5373 aSign = extractFloatx80Sign( a );
5374 bSig = extractFloatx80Frac( b );
5375 bExp = extractFloatx80Exp( b );
5376 bSign = extractFloatx80Sign( b );
5377 zSign = aSign ^ bSign;
5378 if ( aExp == 0x7FFF ) {
5379 if ((uint64_t)(aSig << 1)) {
5380 return propagateFloatx80NaN(a, b, status);
5382 if ( bExp == 0x7FFF ) {
5383 if ((uint64_t)(bSig << 1)) {
5384 return propagateFloatx80NaN(a, b, status);
5386 goto invalid;
5388 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5390 if ( bExp == 0x7FFF ) {
5391 if ((uint64_t)(bSig << 1)) {
5392 return propagateFloatx80NaN(a, b, status);
5394 return packFloatx80( zSign, 0, 0 );
5396 if ( bExp == 0 ) {
5397 if ( bSig == 0 ) {
5398 if ( ( aExp | aSig ) == 0 ) {
5399 invalid:
5400 float_raise(float_flag_invalid, status);
5401 z.low = floatx80_default_nan_low;
5402 z.high = floatx80_default_nan_high;
5403 return z;
5405 float_raise(float_flag_divbyzero, status);
5406 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5408 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5410 if ( aExp == 0 ) {
5411 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5412 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5414 zExp = aExp - bExp + 0x3FFE;
5415 rem1 = 0;
5416 if ( bSig <= aSig ) {
5417 shift128Right( aSig, 0, 1, &aSig, &rem1 );
5418 ++zExp;
5420 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5421 mul64To128( bSig, zSig0, &term0, &term1 );
5422 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
5423 while ( (int64_t) rem0 < 0 ) {
5424 --zSig0;
5425 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5427 zSig1 = estimateDiv128To64( rem1, 0, bSig );
5428 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
5429 mul64To128( bSig, zSig1, &term1, &term2 );
5430 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5431 while ( (int64_t) rem1 < 0 ) {
5432 --zSig1;
5433 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5435 zSig1 |= ( ( rem1 | rem2 ) != 0 );
5437 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5438 zSign, zExp, zSig0, zSig1, status);
5441 /*----------------------------------------------------------------------------
5442 | Returns the remainder of the extended double-precision floating-point value
5443 | `a' with respect to the corresponding value `b'. The operation is performed
5444 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5445 *----------------------------------------------------------------------------*/
5447 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
5449 flag aSign, zSign;
5450 int32_t aExp, bExp, expDiff;
5451 uint64_t aSig0, aSig1, bSig;
5452 uint64_t q, term0, term1, alternateASig0, alternateASig1;
5453 floatx80 z;
5455 aSig0 = extractFloatx80Frac( a );
5456 aExp = extractFloatx80Exp( a );
5457 aSign = extractFloatx80Sign( a );
5458 bSig = extractFloatx80Frac( b );
5459 bExp = extractFloatx80Exp( b );
5460 if ( aExp == 0x7FFF ) {
5461 if ( (uint64_t) ( aSig0<<1 )
5462 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5463 return propagateFloatx80NaN(a, b, status);
5465 goto invalid;
5467 if ( bExp == 0x7FFF ) {
5468 if ((uint64_t)(bSig << 1)) {
5469 return propagateFloatx80NaN(a, b, status);
5471 return a;
5473 if ( bExp == 0 ) {
5474 if ( bSig == 0 ) {
5475 invalid:
5476 float_raise(float_flag_invalid, status);
5477 z.low = floatx80_default_nan_low;
5478 z.high = floatx80_default_nan_high;
5479 return z;
5481 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5483 if ( aExp == 0 ) {
5484 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5485 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5487 bSig |= LIT64( 0x8000000000000000 );
5488 zSign = aSign;
5489 expDiff = aExp - bExp;
5490 aSig1 = 0;
5491 if ( expDiff < 0 ) {
5492 if ( expDiff < -1 ) return a;
5493 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5494 expDiff = 0;
5496 q = ( bSig <= aSig0 );
5497 if ( q ) aSig0 -= bSig;
5498 expDiff -= 64;
5499 while ( 0 < expDiff ) {
5500 q = estimateDiv128To64( aSig0, aSig1, bSig );
5501 q = ( 2 < q ) ? q - 2 : 0;
5502 mul64To128( bSig, q, &term0, &term1 );
5503 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5504 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5505 expDiff -= 62;
5507 expDiff += 64;
5508 if ( 0 < expDiff ) {
5509 q = estimateDiv128To64( aSig0, aSig1, bSig );
5510 q = ( 2 < q ) ? q - 2 : 0;
5511 q >>= 64 - expDiff;
5512 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5513 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5514 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5515 while ( le128( term0, term1, aSig0, aSig1 ) ) {
5516 ++q;
5517 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5520 else {
5521 term1 = 0;
5522 term0 = bSig;
5524 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5525 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5526 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5527 && ( q & 1 ) )
5529 aSig0 = alternateASig0;
5530 aSig1 = alternateASig1;
5531 zSign = ! zSign;
5533 return
5534 normalizeRoundAndPackFloatx80(
5535 80, zSign, bExp + expDiff, aSig0, aSig1, status);
5539 /*----------------------------------------------------------------------------
5540 | Returns the square root of the extended double-precision floating-point
5541 | value `a'. The operation is performed according to the IEC/IEEE Standard
5542 | for Binary Floating-Point Arithmetic.
5543 *----------------------------------------------------------------------------*/
5545 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
5547 flag aSign;
5548 int32_t aExp, zExp;
5549 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5550 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5551 floatx80 z;
5553 aSig0 = extractFloatx80Frac( a );
5554 aExp = extractFloatx80Exp( a );
5555 aSign = extractFloatx80Sign( a );
5556 if ( aExp == 0x7FFF ) {
5557 if ((uint64_t)(aSig0 << 1)) {
5558 return propagateFloatx80NaN(a, a, status);
5560 if ( ! aSign ) return a;
5561 goto invalid;
5563 if ( aSign ) {
5564 if ( ( aExp | aSig0 ) == 0 ) return a;
5565 invalid:
5566 float_raise(float_flag_invalid, status);
5567 z.low = floatx80_default_nan_low;
5568 z.high = floatx80_default_nan_high;
5569 return z;
5571 if ( aExp == 0 ) {
5572 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5573 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5575 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5576 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5577 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5578 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5579 doubleZSig0 = zSig0<<1;
5580 mul64To128( zSig0, zSig0, &term0, &term1 );
5581 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5582 while ( (int64_t) rem0 < 0 ) {
5583 --zSig0;
5584 doubleZSig0 -= 2;
5585 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5587 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5588 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5589 if ( zSig1 == 0 ) zSig1 = 1;
5590 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5591 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5592 mul64To128( zSig1, zSig1, &term2, &term3 );
5593 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5594 while ( (int64_t) rem1 < 0 ) {
5595 --zSig1;
5596 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5597 term3 |= 1;
5598 term2 |= doubleZSig0;
5599 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5601 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5603 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5604 zSig0 |= doubleZSig0;
5605 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5606 0, zExp, zSig0, zSig1, status);
5609 /*----------------------------------------------------------------------------
5610 | Returns 1 if the extended double-precision floating-point value `a' is equal
5611 | to the corresponding value `b', and 0 otherwise. The invalid exception is
5612 | raised if either operand is a NaN. Otherwise, the comparison is performed
5613 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5614 *----------------------------------------------------------------------------*/
5616 int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
5619 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5620 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5621 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5622 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5624 float_raise(float_flag_invalid, status);
5625 return 0;
5627 return
5628 ( a.low == b.low )
5629 && ( ( a.high == b.high )
5630 || ( ( a.low == 0 )
5631 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5636 /*----------------------------------------------------------------------------
5637 | Returns 1 if the extended double-precision floating-point value `a' is
5638 | less than or equal to the corresponding value `b', and 0 otherwise. The
5639 | invalid exception is raised if either operand is a NaN. The comparison is
5640 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5641 | Arithmetic.
5642 *----------------------------------------------------------------------------*/
5644 int floatx80_le(floatx80 a, floatx80 b, float_status *status)
5646 flag aSign, bSign;
5648 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5649 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5650 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5651 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5653 float_raise(float_flag_invalid, status);
5654 return 0;
5656 aSign = extractFloatx80Sign( a );
5657 bSign = extractFloatx80Sign( b );
5658 if ( aSign != bSign ) {
5659 return
5660 aSign
5661 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5662 == 0 );
5664 return
5665 aSign ? le128( b.high, b.low, a.high, a.low )
5666 : le128( a.high, a.low, b.high, b.low );
5670 /*----------------------------------------------------------------------------
5671 | Returns 1 if the extended double-precision floating-point value `a' is
5672 | less than the corresponding value `b', and 0 otherwise. The invalid
5673 | exception is raised if either operand is a NaN. The comparison is performed
5674 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5675 *----------------------------------------------------------------------------*/
5677 int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
5679 flag aSign, bSign;
5681 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5682 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5683 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5684 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5686 float_raise(float_flag_invalid, status);
5687 return 0;
5689 aSign = extractFloatx80Sign( a );
5690 bSign = extractFloatx80Sign( b );
5691 if ( aSign != bSign ) {
5692 return
5693 aSign
5694 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5695 != 0 );
5697 return
5698 aSign ? lt128( b.high, b.low, a.high, a.low )
5699 : lt128( a.high, a.low, b.high, b.low );
5703 /*----------------------------------------------------------------------------
5704 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5705 | cannot be compared, and 0 otherwise. The invalid exception is raised if
5706 | either operand is a NaN. The comparison is performed according to the
5707 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5708 *----------------------------------------------------------------------------*/
5709 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
5711 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5712 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5713 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5714 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5716 float_raise(float_flag_invalid, status);
5717 return 1;
5719 return 0;
5722 /*----------------------------------------------------------------------------
5723 | Returns 1 if the extended double-precision floating-point value `a' is
5724 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5725 | cause an exception. The comparison is performed according to the IEC/IEEE
5726 | Standard for Binary Floating-Point Arithmetic.
5727 *----------------------------------------------------------------------------*/
5729 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
5732 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5733 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5734 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5735 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5737 if ( floatx80_is_signaling_nan( a )
5738 || floatx80_is_signaling_nan( b ) ) {
5739 float_raise(float_flag_invalid, status);
5741 return 0;
5743 return
5744 ( a.low == b.low )
5745 && ( ( a.high == b.high )
5746 || ( ( a.low == 0 )
5747 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5752 /*----------------------------------------------------------------------------
5753 | Returns 1 if the extended double-precision floating-point value `a' is less
5754 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
5755 | do not cause an exception. Otherwise, the comparison is performed according
5756 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5757 *----------------------------------------------------------------------------*/
5759 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
5761 flag aSign, bSign;
5763 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5764 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5765 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5766 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5768 if ( floatx80_is_signaling_nan( a )
5769 || floatx80_is_signaling_nan( b ) ) {
5770 float_raise(float_flag_invalid, status);
5772 return 0;
5774 aSign = extractFloatx80Sign( a );
5775 bSign = extractFloatx80Sign( b );
5776 if ( aSign != bSign ) {
5777 return
5778 aSign
5779 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5780 == 0 );
5782 return
5783 aSign ? le128( b.high, b.low, a.high, a.low )
5784 : le128( a.high, a.low, b.high, b.low );
5788 /*----------------------------------------------------------------------------
5789 | Returns 1 if the extended double-precision floating-point value `a' is less
5790 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
5791 | an exception. Otherwise, the comparison is performed according to the
5792 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5793 *----------------------------------------------------------------------------*/
5795 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
5797 flag aSign, bSign;
5799 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5800 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5801 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5802 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5804 if ( floatx80_is_signaling_nan( a )
5805 || floatx80_is_signaling_nan( b ) ) {
5806 float_raise(float_flag_invalid, status);
5808 return 0;
5810 aSign = extractFloatx80Sign( a );
5811 bSign = extractFloatx80Sign( b );
5812 if ( aSign != bSign ) {
5813 return
5814 aSign
5815 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5816 != 0 );
5818 return
5819 aSign ? lt128( b.high, b.low, a.high, a.low )
5820 : lt128( a.high, a.low, b.high, b.low );
5824 /*----------------------------------------------------------------------------
5825 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5826 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
5827 | The comparison is performed according to the IEC/IEEE Standard for Binary
5828 | Floating-Point Arithmetic.
5829 *----------------------------------------------------------------------------*/
5830 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
5832 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5833 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5834 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5835 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5837 if ( floatx80_is_signaling_nan( a )
5838 || floatx80_is_signaling_nan( b ) ) {
5839 float_raise(float_flag_invalid, status);
5841 return 1;
5843 return 0;
5846 /*----------------------------------------------------------------------------
5847 | Returns the result of converting the quadruple-precision floating-point
5848 | value `a' to the 32-bit two's complement integer format. The conversion
5849 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5850 | Arithmetic---which means in particular that the conversion is rounded
5851 | according to the current rounding mode. If `a' is a NaN, the largest
5852 | positive integer is returned. Otherwise, if the conversion overflows, the
5853 | largest integer with the same sign as `a' is returned.
5854 *----------------------------------------------------------------------------*/
5856 int32_t float128_to_int32(float128 a, float_status *status)
5858 flag aSign;
5859 int32_t aExp, shiftCount;
5860 uint64_t aSig0, aSig1;
5862 aSig1 = extractFloat128Frac1( a );
5863 aSig0 = extractFloat128Frac0( a );
5864 aExp = extractFloat128Exp( a );
5865 aSign = extractFloat128Sign( a );
5866 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5867 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5868 aSig0 |= ( aSig1 != 0 );
5869 shiftCount = 0x4028 - aExp;
5870 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5871 return roundAndPackInt32(aSign, aSig0, status);
5875 /*----------------------------------------------------------------------------
5876 | Returns the result of converting the quadruple-precision floating-point
5877 | value `a' to the 32-bit two's complement integer format. The conversion
5878 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5879 | Arithmetic, except that the conversion is always rounded toward zero. If
5880 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
5881 | conversion overflows, the largest integer with the same sign as `a' is
5882 | returned.
5883 *----------------------------------------------------------------------------*/
5885 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
5887 flag aSign;
5888 int32_t aExp, shiftCount;
5889 uint64_t aSig0, aSig1, savedASig;
5890 int32_t z;
5892 aSig1 = extractFloat128Frac1( a );
5893 aSig0 = extractFloat128Frac0( a );
5894 aExp = extractFloat128Exp( a );
5895 aSign = extractFloat128Sign( a );
5896 aSig0 |= ( aSig1 != 0 );
5897 if ( 0x401E < aExp ) {
5898 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5899 goto invalid;
5901 else if ( aExp < 0x3FFF ) {
5902 if (aExp || aSig0) {
5903 status->float_exception_flags |= float_flag_inexact;
5905 return 0;
5907 aSig0 |= LIT64( 0x0001000000000000 );
5908 shiftCount = 0x402F - aExp;
5909 savedASig = aSig0;
5910 aSig0 >>= shiftCount;
5911 z = aSig0;
5912 if ( aSign ) z = - z;
5913 if ( ( z < 0 ) ^ aSign ) {
5914 invalid:
5915 float_raise(float_flag_invalid, status);
5916 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5918 if ( ( aSig0<<shiftCount ) != savedASig ) {
5919 status->float_exception_flags |= float_flag_inexact;
5921 return z;
5925 /*----------------------------------------------------------------------------
5926 | Returns the result of converting the quadruple-precision floating-point
5927 | value `a' to the 64-bit two's complement integer format. The conversion
5928 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5929 | Arithmetic---which means in particular that the conversion is rounded
5930 | according to the current rounding mode. If `a' is a NaN, the largest
5931 | positive integer is returned. Otherwise, if the conversion overflows, the
5932 | largest integer with the same sign as `a' is returned.
5933 *----------------------------------------------------------------------------*/
5935 int64_t float128_to_int64(float128 a, float_status *status)
5937 flag aSign;
5938 int32_t aExp, shiftCount;
5939 uint64_t aSig0, aSig1;
5941 aSig1 = extractFloat128Frac1( a );
5942 aSig0 = extractFloat128Frac0( a );
5943 aExp = extractFloat128Exp( a );
5944 aSign = extractFloat128Sign( a );
5945 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5946 shiftCount = 0x402F - aExp;
5947 if ( shiftCount <= 0 ) {
5948 if ( 0x403E < aExp ) {
5949 float_raise(float_flag_invalid, status);
5950 if ( ! aSign
5951 || ( ( aExp == 0x7FFF )
5952 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5955 return LIT64( 0x7FFFFFFFFFFFFFFF );
5957 return (int64_t) LIT64( 0x8000000000000000 );
5959 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5961 else {
5962 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5964 return roundAndPackInt64(aSign, aSig0, aSig1, status);
5968 /*----------------------------------------------------------------------------
5969 | Returns the result of converting the quadruple-precision floating-point
5970 | value `a' to the 64-bit two's complement integer format. The conversion
5971 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5972 | Arithmetic, except that the conversion is always rounded toward zero.
5973 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
5974 | the conversion overflows, the largest integer with the same sign as `a' is
5975 | returned.
5976 *----------------------------------------------------------------------------*/
5978 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
5980 flag aSign;
5981 int32_t aExp, shiftCount;
5982 uint64_t aSig0, aSig1;
5983 int64_t z;
5985 aSig1 = extractFloat128Frac1( a );
5986 aSig0 = extractFloat128Frac0( a );
5987 aExp = extractFloat128Exp( a );
5988 aSign = extractFloat128Sign( a );
5989 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5990 shiftCount = aExp - 0x402F;
5991 if ( 0 < shiftCount ) {
5992 if ( 0x403E <= aExp ) {
5993 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5994 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
5995 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5996 if (aSig1) {
5997 status->float_exception_flags |= float_flag_inexact;
6000 else {
6001 float_raise(float_flag_invalid, status);
6002 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
6003 return LIT64( 0x7FFFFFFFFFFFFFFF );
6006 return (int64_t) LIT64( 0x8000000000000000 );
6008 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
6009 if ( (uint64_t) ( aSig1<<shiftCount ) ) {
6010 status->float_exception_flags |= float_flag_inexact;
6013 else {
6014 if ( aExp < 0x3FFF ) {
6015 if ( aExp | aSig0 | aSig1 ) {
6016 status->float_exception_flags |= float_flag_inexact;
6018 return 0;
6020 z = aSig0>>( - shiftCount );
6021 if ( aSig1
6022 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
6023 status->float_exception_flags |= float_flag_inexact;
6026 if ( aSign ) z = - z;
6027 return z;
6031 /*----------------------------------------------------------------------------
6032 | Returns the result of converting the quadruple-precision floating-point
6033 | value `a' to the single-precision floating-point format. The conversion
6034 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6035 | Arithmetic.
6036 *----------------------------------------------------------------------------*/
6038 float32 float128_to_float32(float128 a, float_status *status)
6040 flag aSign;
6041 int32_t aExp;
6042 uint64_t aSig0, aSig1;
6043 uint32_t zSig;
6045 aSig1 = extractFloat128Frac1( a );
6046 aSig0 = extractFloat128Frac0( a );
6047 aExp = extractFloat128Exp( a );
6048 aSign = extractFloat128Sign( a );
6049 if ( aExp == 0x7FFF ) {
6050 if ( aSig0 | aSig1 ) {
6051 return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
6053 return packFloat32( aSign, 0xFF, 0 );
6055 aSig0 |= ( aSig1 != 0 );
6056 shift64RightJamming( aSig0, 18, &aSig0 );
6057 zSig = aSig0;
6058 if ( aExp || zSig ) {
6059 zSig |= 0x40000000;
6060 aExp -= 0x3F81;
6062 return roundAndPackFloat32(aSign, aExp, zSig, status);
6066 /*----------------------------------------------------------------------------
6067 | Returns the result of converting the quadruple-precision floating-point
6068 | value `a' to the double-precision floating-point format. The conversion
6069 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6070 | Arithmetic.
6071 *----------------------------------------------------------------------------*/
6073 float64 float128_to_float64(float128 a, float_status *status)
6075 flag aSign;
6076 int32_t aExp;
6077 uint64_t aSig0, aSig1;
6079 aSig1 = extractFloat128Frac1( a );
6080 aSig0 = extractFloat128Frac0( a );
6081 aExp = extractFloat128Exp( a );
6082 aSign = extractFloat128Sign( a );
6083 if ( aExp == 0x7FFF ) {
6084 if ( aSig0 | aSig1 ) {
6085 return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
6087 return packFloat64( aSign, 0x7FF, 0 );
6089 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6090 aSig0 |= ( aSig1 != 0 );
6091 if ( aExp || aSig0 ) {
6092 aSig0 |= LIT64( 0x4000000000000000 );
6093 aExp -= 0x3C01;
6095 return roundAndPackFloat64(aSign, aExp, aSig0, status);
6099 /*----------------------------------------------------------------------------
6100 | Returns the result of converting the quadruple-precision floating-point
6101 | value `a' to the extended double-precision floating-point format. The
6102 | conversion is performed according to the IEC/IEEE Standard for Binary
6103 | Floating-Point Arithmetic.
6104 *----------------------------------------------------------------------------*/
6106 floatx80 float128_to_floatx80(float128 a, float_status *status)
6108 flag aSign;
6109 int32_t aExp;
6110 uint64_t aSig0, aSig1;
6112 aSig1 = extractFloat128Frac1( a );
6113 aSig0 = extractFloat128Frac0( a );
6114 aExp = extractFloat128Exp( a );
6115 aSign = extractFloat128Sign( a );
6116 if ( aExp == 0x7FFF ) {
6117 if ( aSig0 | aSig1 ) {
6118 return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
6120 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
6122 if ( aExp == 0 ) {
6123 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
6124 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6126 else {
6127 aSig0 |= LIT64( 0x0001000000000000 );
6129 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
6130 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
6134 /*----------------------------------------------------------------------------
6135 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6136 | returns the result as a quadruple-precision floating-point value. The
6137 | operation is performed according to the IEC/IEEE Standard for Binary
6138 | Floating-Point Arithmetic.
6139 *----------------------------------------------------------------------------*/
6141 float128 float128_round_to_int(float128 a, float_status *status)
6143 flag aSign;
6144 int32_t aExp;
6145 uint64_t lastBitMask, roundBitsMask;
6146 float128 z;
6148 aExp = extractFloat128Exp( a );
6149 if ( 0x402F <= aExp ) {
6150 if ( 0x406F <= aExp ) {
6151 if ( ( aExp == 0x7FFF )
6152 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6154 return propagateFloat128NaN(a, a, status);
6156 return a;
6158 lastBitMask = 1;
6159 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
6160 roundBitsMask = lastBitMask - 1;
6161 z = a;
6162 switch (status->float_rounding_mode) {
6163 case float_round_nearest_even:
6164 if ( lastBitMask ) {
6165 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6166 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6168 else {
6169 if ( (int64_t) z.low < 0 ) {
6170 ++z.high;
6171 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
6174 break;
6175 case float_round_ties_away:
6176 if (lastBitMask) {
6177 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6178 } else {
6179 if ((int64_t) z.low < 0) {
6180 ++z.high;
6183 break;
6184 case float_round_to_zero:
6185 break;
6186 case float_round_up:
6187 if (!extractFloat128Sign(z)) {
6188 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6190 break;
6191 case float_round_down:
6192 if (extractFloat128Sign(z)) {
6193 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6195 break;
6196 default:
6197 abort();
6199 z.low &= ~ roundBitsMask;
6201 else {
6202 if ( aExp < 0x3FFF ) {
6203 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
6204 status->float_exception_flags |= float_flag_inexact;
6205 aSign = extractFloat128Sign( a );
6206 switch (status->float_rounding_mode) {
6207 case float_round_nearest_even:
6208 if ( ( aExp == 0x3FFE )
6209 && ( extractFloat128Frac0( a )
6210 | extractFloat128Frac1( a ) )
6212 return packFloat128( aSign, 0x3FFF, 0, 0 );
6214 break;
6215 case float_round_ties_away:
6216 if (aExp == 0x3FFE) {
6217 return packFloat128(aSign, 0x3FFF, 0, 0);
6219 break;
6220 case float_round_down:
6221 return
6222 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6223 : packFloat128( 0, 0, 0, 0 );
6224 case float_round_up:
6225 return
6226 aSign ? packFloat128( 1, 0, 0, 0 )
6227 : packFloat128( 0, 0x3FFF, 0, 0 );
6229 return packFloat128( aSign, 0, 0, 0 );
6231 lastBitMask = 1;
6232 lastBitMask <<= 0x402F - aExp;
6233 roundBitsMask = lastBitMask - 1;
6234 z.low = 0;
6235 z.high = a.high;
6236 switch (status->float_rounding_mode) {
6237 case float_round_nearest_even:
6238 z.high += lastBitMask>>1;
6239 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6240 z.high &= ~ lastBitMask;
6242 break;
6243 case float_round_ties_away:
6244 z.high += lastBitMask>>1;
6245 break;
6246 case float_round_to_zero:
6247 break;
6248 case float_round_up:
6249 if (!extractFloat128Sign(z)) {
6250 z.high |= ( a.low != 0 );
6251 z.high += roundBitsMask;
6253 break;
6254 case float_round_down:
6255 if (extractFloat128Sign(z)) {
6256 z.high |= (a.low != 0);
6257 z.high += roundBitsMask;
6259 break;
6260 default:
6261 abort();
6263 z.high &= ~ roundBitsMask;
6265 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
6266 status->float_exception_flags |= float_flag_inexact;
6268 return z;
6272 /*----------------------------------------------------------------------------
6273 | Returns the result of adding the absolute values of the quadruple-precision
6274 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
6275 | before being returned. `zSign' is ignored if the result is a NaN.
6276 | The addition is performed according to the IEC/IEEE Standard for Binary
6277 | Floating-Point Arithmetic.
6278 *----------------------------------------------------------------------------*/
6280 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
6281 float_status *status)
6283 int32_t aExp, bExp, zExp;
6284 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6285 int32_t expDiff;
6287 aSig1 = extractFloat128Frac1( a );
6288 aSig0 = extractFloat128Frac0( a );
6289 aExp = extractFloat128Exp( a );
6290 bSig1 = extractFloat128Frac1( b );
6291 bSig0 = extractFloat128Frac0( b );
6292 bExp = extractFloat128Exp( b );
6293 expDiff = aExp - bExp;
6294 if ( 0 < expDiff ) {
6295 if ( aExp == 0x7FFF ) {
6296 if (aSig0 | aSig1) {
6297 return propagateFloat128NaN(a, b, status);
6299 return a;
6301 if ( bExp == 0 ) {
6302 --expDiff;
6304 else {
6305 bSig0 |= LIT64( 0x0001000000000000 );
6307 shift128ExtraRightJamming(
6308 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6309 zExp = aExp;
6311 else if ( expDiff < 0 ) {
6312 if ( bExp == 0x7FFF ) {
6313 if (bSig0 | bSig1) {
6314 return propagateFloat128NaN(a, b, status);
6316 return packFloat128( zSign, 0x7FFF, 0, 0 );
6318 if ( aExp == 0 ) {
6319 ++expDiff;
6321 else {
6322 aSig0 |= LIT64( 0x0001000000000000 );
6324 shift128ExtraRightJamming(
6325 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6326 zExp = bExp;
6328 else {
6329 if ( aExp == 0x7FFF ) {
6330 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6331 return propagateFloat128NaN(a, b, status);
6333 return a;
6335 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6336 if ( aExp == 0 ) {
6337 if (status->flush_to_zero) {
6338 if (zSig0 | zSig1) {
6339 float_raise(float_flag_output_denormal, status);
6341 return packFloat128(zSign, 0, 0, 0);
6343 return packFloat128( zSign, 0, zSig0, zSig1 );
6345 zSig2 = 0;
6346 zSig0 |= LIT64( 0x0002000000000000 );
6347 zExp = aExp;
6348 goto shiftRight1;
6350 aSig0 |= LIT64( 0x0001000000000000 );
6351 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6352 --zExp;
6353 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6354 ++zExp;
6355 shiftRight1:
6356 shift128ExtraRightJamming(
6357 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6358 roundAndPack:
6359 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6363 /*----------------------------------------------------------------------------
6364 | Returns the result of subtracting the absolute values of the quadruple-
6365 | precision floating-point values `a' and `b'. If `zSign' is 1, the
6366 | difference is negated before being returned. `zSign' is ignored if the
6367 | result is a NaN. The subtraction is performed according to the IEC/IEEE
6368 | Standard for Binary Floating-Point Arithmetic.
6369 *----------------------------------------------------------------------------*/
6371 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
6372 float_status *status)
6374 int32_t aExp, bExp, zExp;
6375 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
6376 int32_t expDiff;
6377 float128 z;
6379 aSig1 = extractFloat128Frac1( a );
6380 aSig0 = extractFloat128Frac0( a );
6381 aExp = extractFloat128Exp( a );
6382 bSig1 = extractFloat128Frac1( b );
6383 bSig0 = extractFloat128Frac0( b );
6384 bExp = extractFloat128Exp( b );
6385 expDiff = aExp - bExp;
6386 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6387 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6388 if ( 0 < expDiff ) goto aExpBigger;
6389 if ( expDiff < 0 ) goto bExpBigger;
6390 if ( aExp == 0x7FFF ) {
6391 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6392 return propagateFloat128NaN(a, b, status);
6394 float_raise(float_flag_invalid, status);
6395 z.low = float128_default_nan_low;
6396 z.high = float128_default_nan_high;
6397 return z;
6399 if ( aExp == 0 ) {
6400 aExp = 1;
6401 bExp = 1;
6403 if ( bSig0 < aSig0 ) goto aBigger;
6404 if ( aSig0 < bSig0 ) goto bBigger;
6405 if ( bSig1 < aSig1 ) goto aBigger;
6406 if ( aSig1 < bSig1 ) goto bBigger;
6407 return packFloat128(status->float_rounding_mode == float_round_down,
6408 0, 0, 0);
6409 bExpBigger:
6410 if ( bExp == 0x7FFF ) {
6411 if (bSig0 | bSig1) {
6412 return propagateFloat128NaN(a, b, status);
6414 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6416 if ( aExp == 0 ) {
6417 ++expDiff;
6419 else {
6420 aSig0 |= LIT64( 0x4000000000000000 );
6422 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6423 bSig0 |= LIT64( 0x4000000000000000 );
6424 bBigger:
6425 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6426 zExp = bExp;
6427 zSign ^= 1;
6428 goto normalizeRoundAndPack;
6429 aExpBigger:
6430 if ( aExp == 0x7FFF ) {
6431 if (aSig0 | aSig1) {
6432 return propagateFloat128NaN(a, b, status);
6434 return a;
6436 if ( bExp == 0 ) {
6437 --expDiff;
6439 else {
6440 bSig0 |= LIT64( 0x4000000000000000 );
6442 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6443 aSig0 |= LIT64( 0x4000000000000000 );
6444 aBigger:
6445 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6446 zExp = aExp;
6447 normalizeRoundAndPack:
6448 --zExp;
6449 return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
6450 status);
6454 /*----------------------------------------------------------------------------
6455 | Returns the result of adding the quadruple-precision floating-point values
6456 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
6457 | for Binary Floating-Point Arithmetic.
6458 *----------------------------------------------------------------------------*/
6460 float128 float128_add(float128 a, float128 b, float_status *status)
6462 flag aSign, bSign;
6464 aSign = extractFloat128Sign( a );
6465 bSign = extractFloat128Sign( b );
6466 if ( aSign == bSign ) {
6467 return addFloat128Sigs(a, b, aSign, status);
6469 else {
6470 return subFloat128Sigs(a, b, aSign, status);
6475 /*----------------------------------------------------------------------------
6476 | Returns the result of subtracting the quadruple-precision floating-point
6477 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6478 | Standard for Binary Floating-Point Arithmetic.
6479 *----------------------------------------------------------------------------*/
6481 float128 float128_sub(float128 a, float128 b, float_status *status)
6483 flag aSign, bSign;
6485 aSign = extractFloat128Sign( a );
6486 bSign = extractFloat128Sign( b );
6487 if ( aSign == bSign ) {
6488 return subFloat128Sigs(a, b, aSign, status);
6490 else {
6491 return addFloat128Sigs(a, b, aSign, status);
6496 /*----------------------------------------------------------------------------
6497 | Returns the result of multiplying the quadruple-precision floating-point
6498 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6499 | Standard for Binary Floating-Point Arithmetic.
6500 *----------------------------------------------------------------------------*/
6502 float128 float128_mul(float128 a, float128 b, float_status *status)
6504 flag aSign, bSign, zSign;
6505 int32_t aExp, bExp, zExp;
6506 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6507 float128 z;
6509 aSig1 = extractFloat128Frac1( a );
6510 aSig0 = extractFloat128Frac0( a );
6511 aExp = extractFloat128Exp( a );
6512 aSign = extractFloat128Sign( a );
6513 bSig1 = extractFloat128Frac1( b );
6514 bSig0 = extractFloat128Frac0( b );
6515 bExp = extractFloat128Exp( b );
6516 bSign = extractFloat128Sign( b );
6517 zSign = aSign ^ bSign;
6518 if ( aExp == 0x7FFF ) {
6519 if ( ( aSig0 | aSig1 )
6520 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6521 return propagateFloat128NaN(a, b, status);
6523 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6524 return packFloat128( zSign, 0x7FFF, 0, 0 );
6526 if ( bExp == 0x7FFF ) {
6527 if (bSig0 | bSig1) {
6528 return propagateFloat128NaN(a, b, status);
6530 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6531 invalid:
6532 float_raise(float_flag_invalid, status);
6533 z.low = float128_default_nan_low;
6534 z.high = float128_default_nan_high;
6535 return z;
6537 return packFloat128( zSign, 0x7FFF, 0, 0 );
6539 if ( aExp == 0 ) {
6540 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6541 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6543 if ( bExp == 0 ) {
6544 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6545 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6547 zExp = aExp + bExp - 0x4000;
6548 aSig0 |= LIT64( 0x0001000000000000 );
6549 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6550 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6551 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6552 zSig2 |= ( zSig3 != 0 );
6553 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6554 shift128ExtraRightJamming(
6555 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6556 ++zExp;
6558 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6562 /*----------------------------------------------------------------------------
6563 | Returns the result of dividing the quadruple-precision floating-point value
6564 | `a' by the corresponding value `b'. The operation is performed according to
6565 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6566 *----------------------------------------------------------------------------*/
6568 float128 float128_div(float128 a, float128 b, float_status *status)
6570 flag aSign, bSign, zSign;
6571 int32_t aExp, bExp, zExp;
6572 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6573 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6574 float128 z;
6576 aSig1 = extractFloat128Frac1( a );
6577 aSig0 = extractFloat128Frac0( a );
6578 aExp = extractFloat128Exp( a );
6579 aSign = extractFloat128Sign( a );
6580 bSig1 = extractFloat128Frac1( b );
6581 bSig0 = extractFloat128Frac0( b );
6582 bExp = extractFloat128Exp( b );
6583 bSign = extractFloat128Sign( b );
6584 zSign = aSign ^ bSign;
6585 if ( aExp == 0x7FFF ) {
6586 if (aSig0 | aSig1) {
6587 return propagateFloat128NaN(a, b, status);
6589 if ( bExp == 0x7FFF ) {
6590 if (bSig0 | bSig1) {
6591 return propagateFloat128NaN(a, b, status);
6593 goto invalid;
6595 return packFloat128( zSign, 0x7FFF, 0, 0 );
6597 if ( bExp == 0x7FFF ) {
6598 if (bSig0 | bSig1) {
6599 return propagateFloat128NaN(a, b, status);
6601 return packFloat128( zSign, 0, 0, 0 );
6603 if ( bExp == 0 ) {
6604 if ( ( bSig0 | bSig1 ) == 0 ) {
6605 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6606 invalid:
6607 float_raise(float_flag_invalid, status);
6608 z.low = float128_default_nan_low;
6609 z.high = float128_default_nan_high;
6610 return z;
6612 float_raise(float_flag_divbyzero, status);
6613 return packFloat128( zSign, 0x7FFF, 0, 0 );
6615 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6617 if ( aExp == 0 ) {
6618 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6619 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6621 zExp = aExp - bExp + 0x3FFD;
6622 shortShift128Left(
6623 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6624 shortShift128Left(
6625 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6626 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6627 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6628 ++zExp;
6630 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6631 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6632 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
6633 while ( (int64_t) rem0 < 0 ) {
6634 --zSig0;
6635 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6637 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6638 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6639 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6640 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
6641 while ( (int64_t) rem1 < 0 ) {
6642 --zSig1;
6643 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6645 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6647 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6648 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6652 /*----------------------------------------------------------------------------
6653 | Returns the remainder of the quadruple-precision floating-point value `a'
6654 | with respect to the corresponding value `b'. The operation is performed
6655 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6656 *----------------------------------------------------------------------------*/
6658 float128 float128_rem(float128 a, float128 b, float_status *status)
6660 flag aSign, zSign;
6661 int32_t aExp, bExp, expDiff;
6662 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6663 uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6664 int64_t sigMean0;
6665 float128 z;
6667 aSig1 = extractFloat128Frac1( a );
6668 aSig0 = extractFloat128Frac0( a );
6669 aExp = extractFloat128Exp( a );
6670 aSign = extractFloat128Sign( a );
6671 bSig1 = extractFloat128Frac1( b );
6672 bSig0 = extractFloat128Frac0( b );
6673 bExp = extractFloat128Exp( b );
6674 if ( aExp == 0x7FFF ) {
6675 if ( ( aSig0 | aSig1 )
6676 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6677 return propagateFloat128NaN(a, b, status);
6679 goto invalid;
6681 if ( bExp == 0x7FFF ) {
6682 if (bSig0 | bSig1) {
6683 return propagateFloat128NaN(a, b, status);
6685 return a;
6687 if ( bExp == 0 ) {
6688 if ( ( bSig0 | bSig1 ) == 0 ) {
6689 invalid:
6690 float_raise(float_flag_invalid, status);
6691 z.low = float128_default_nan_low;
6692 z.high = float128_default_nan_high;
6693 return z;
6695 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6697 if ( aExp == 0 ) {
6698 if ( ( aSig0 | aSig1 ) == 0 ) return a;
6699 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6701 expDiff = aExp - bExp;
6702 if ( expDiff < -1 ) return a;
6703 shortShift128Left(
6704 aSig0 | LIT64( 0x0001000000000000 ),
6705 aSig1,
6706 15 - ( expDiff < 0 ),
6707 &aSig0,
6708 &aSig1
6710 shortShift128Left(
6711 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6712 q = le128( bSig0, bSig1, aSig0, aSig1 );
6713 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6714 expDiff -= 64;
6715 while ( 0 < expDiff ) {
6716 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6717 q = ( 4 < q ) ? q - 4 : 0;
6718 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6719 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6720 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6721 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6722 expDiff -= 61;
6724 if ( -64 < expDiff ) {
6725 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6726 q = ( 4 < q ) ? q - 4 : 0;
6727 q >>= - expDiff;
6728 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6729 expDiff += 52;
6730 if ( expDiff < 0 ) {
6731 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6733 else {
6734 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6736 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6737 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6739 else {
6740 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6741 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6743 do {
6744 alternateASig0 = aSig0;
6745 alternateASig1 = aSig1;
6746 ++q;
6747 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6748 } while ( 0 <= (int64_t) aSig0 );
6749 add128(
6750 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6751 if ( ( sigMean0 < 0 )
6752 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6753 aSig0 = alternateASig0;
6754 aSig1 = alternateASig1;
6756 zSign = ( (int64_t) aSig0 < 0 );
6757 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6758 return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
6759 status);
6762 /*----------------------------------------------------------------------------
6763 | Returns the square root of the quadruple-precision floating-point value `a'.
6764 | The operation is performed according to the IEC/IEEE Standard for Binary
6765 | Floating-Point Arithmetic.
6766 *----------------------------------------------------------------------------*/
6768 float128 float128_sqrt(float128 a, float_status *status)
6770 flag aSign;
6771 int32_t aExp, zExp;
6772 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6773 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6774 float128 z;
6776 aSig1 = extractFloat128Frac1( a );
6777 aSig0 = extractFloat128Frac0( a );
6778 aExp = extractFloat128Exp( a );
6779 aSign = extractFloat128Sign( a );
6780 if ( aExp == 0x7FFF ) {
6781 if (aSig0 | aSig1) {
6782 return propagateFloat128NaN(a, a, status);
6784 if ( ! aSign ) return a;
6785 goto invalid;
6787 if ( aSign ) {
6788 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6789 invalid:
6790 float_raise(float_flag_invalid, status);
6791 z.low = float128_default_nan_low;
6792 z.high = float128_default_nan_high;
6793 return z;
6795 if ( aExp == 0 ) {
6796 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6797 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6799 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6800 aSig0 |= LIT64( 0x0001000000000000 );
6801 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6802 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6803 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6804 doubleZSig0 = zSig0<<1;
6805 mul64To128( zSig0, zSig0, &term0, &term1 );
6806 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6807 while ( (int64_t) rem0 < 0 ) {
6808 --zSig0;
6809 doubleZSig0 -= 2;
6810 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6812 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6813 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6814 if ( zSig1 == 0 ) zSig1 = 1;
6815 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6816 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6817 mul64To128( zSig1, zSig1, &term2, &term3 );
6818 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6819 while ( (int64_t) rem1 < 0 ) {
6820 --zSig1;
6821 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6822 term3 |= 1;
6823 term2 |= doubleZSig0;
6824 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6826 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6828 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6829 return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
6833 /*----------------------------------------------------------------------------
6834 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6835 | the corresponding value `b', and 0 otherwise. The invalid exception is
6836 | raised if either operand is a NaN. Otherwise, the comparison is performed
6837 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6838 *----------------------------------------------------------------------------*/
6840 int float128_eq(float128 a, float128 b, float_status *status)
6843 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6844 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6845 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6846 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6848 float_raise(float_flag_invalid, status);
6849 return 0;
6851 return
6852 ( a.low == b.low )
6853 && ( ( a.high == b.high )
6854 || ( ( a.low == 0 )
6855 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6860 /*----------------------------------------------------------------------------
6861 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6862 | or equal to the corresponding value `b', and 0 otherwise. The invalid
6863 | exception is raised if either operand is a NaN. The comparison is performed
6864 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6865 *----------------------------------------------------------------------------*/
6867 int float128_le(float128 a, float128 b, float_status *status)
6869 flag aSign, bSign;
6871 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6872 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6873 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6874 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6876 float_raise(float_flag_invalid, status);
6877 return 0;
6879 aSign = extractFloat128Sign( a );
6880 bSign = extractFloat128Sign( b );
6881 if ( aSign != bSign ) {
6882 return
6883 aSign
6884 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6885 == 0 );
6887 return
6888 aSign ? le128( b.high, b.low, a.high, a.low )
6889 : le128( a.high, a.low, b.high, b.low );
6893 /*----------------------------------------------------------------------------
6894 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6895 | the corresponding value `b', and 0 otherwise. The invalid exception is
6896 | raised if either operand is a NaN. The comparison is performed according
6897 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6898 *----------------------------------------------------------------------------*/
6900 int float128_lt(float128 a, float128 b, float_status *status)
6902 flag aSign, bSign;
6904 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6905 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6906 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6907 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6909 float_raise(float_flag_invalid, status);
6910 return 0;
6912 aSign = extractFloat128Sign( a );
6913 bSign = extractFloat128Sign( b );
6914 if ( aSign != bSign ) {
6915 return
6916 aSign
6917 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6918 != 0 );
6920 return
6921 aSign ? lt128( b.high, b.low, a.high, a.low )
6922 : lt128( a.high, a.low, b.high, b.low );
6926 /*----------------------------------------------------------------------------
6927 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6928 | be compared, and 0 otherwise. The invalid exception is raised if either
6929 | operand is a NaN. The comparison is performed according to the IEC/IEEE
6930 | Standard for Binary Floating-Point Arithmetic.
6931 *----------------------------------------------------------------------------*/
6933 int float128_unordered(float128 a, float128 b, float_status *status)
6935 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6936 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6937 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6938 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6940 float_raise(float_flag_invalid, status);
6941 return 1;
6943 return 0;
6946 /*----------------------------------------------------------------------------
6947 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6948 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6949 | exception. The comparison is performed according to the IEC/IEEE Standard
6950 | for Binary Floating-Point Arithmetic.
6951 *----------------------------------------------------------------------------*/
6953 int float128_eq_quiet(float128 a, float128 b, float_status *status)
6956 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6957 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6958 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6959 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6961 if ( float128_is_signaling_nan( a )
6962 || float128_is_signaling_nan( b ) ) {
6963 float_raise(float_flag_invalid, status);
6965 return 0;
6967 return
6968 ( a.low == b.low )
6969 && ( ( a.high == b.high )
6970 || ( ( a.low == 0 )
6971 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6976 /*----------------------------------------------------------------------------
6977 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6978 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
6979 | cause an exception. Otherwise, the comparison is performed according to the
6980 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6981 *----------------------------------------------------------------------------*/
6983 int float128_le_quiet(float128 a, float128 b, float_status *status)
6985 flag aSign, bSign;
6987 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6988 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6989 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6990 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6992 if ( float128_is_signaling_nan( a )
6993 || float128_is_signaling_nan( b ) ) {
6994 float_raise(float_flag_invalid, status);
6996 return 0;
6998 aSign = extractFloat128Sign( a );
6999 bSign = extractFloat128Sign( b );
7000 if ( aSign != bSign ) {
7001 return
7002 aSign
7003 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7004 == 0 );
7006 return
7007 aSign ? le128( b.high, b.low, a.high, a.low )
7008 : le128( a.high, a.low, b.high, b.low );
7012 /*----------------------------------------------------------------------------
7013 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7014 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
7015 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
7016 | Standard for Binary Floating-Point Arithmetic.
7017 *----------------------------------------------------------------------------*/
7019 int float128_lt_quiet(float128 a, float128 b, float_status *status)
7021 flag aSign, bSign;
7023 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
7024 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7025 || ( ( extractFloat128Exp( b ) == 0x7FFF )
7026 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7028 if ( float128_is_signaling_nan( a )
7029 || float128_is_signaling_nan( b ) ) {
7030 float_raise(float_flag_invalid, status);
7032 return 0;
7034 aSign = extractFloat128Sign( a );
7035 bSign = extractFloat128Sign( b );
7036 if ( aSign != bSign ) {
7037 return
7038 aSign
7039 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7040 != 0 );
7042 return
7043 aSign ? lt128( b.high, b.low, a.high, a.low )
7044 : lt128( a.high, a.low, b.high, b.low );
7048 /*----------------------------------------------------------------------------
7049 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7050 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
7051 | comparison is performed according to the IEC/IEEE Standard for Binary
7052 | Floating-Point Arithmetic.
7053 *----------------------------------------------------------------------------*/
7055 int float128_unordered_quiet(float128 a, float128 b, float_status *status)
7057 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
7058 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7059 || ( ( extractFloat128Exp( b ) == 0x7FFF )
7060 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7062 if ( float128_is_signaling_nan( a )
7063 || float128_is_signaling_nan( b ) ) {
7064 float_raise(float_flag_invalid, status);
7066 return 1;
7068 return 0;
7071 /* misc functions */
7072 float32 uint32_to_float32(uint32_t a, float_status *status)
7074 return int64_to_float32(a, status);
7077 float64 uint32_to_float64(uint32_t a, float_status *status)
7079 return int64_to_float64(a, status);
7082 uint32_t float32_to_uint32(float32 a, float_status *status)
7084 int64_t v;
7085 uint32_t res;
7086 int old_exc_flags = get_float_exception_flags(status);
7088 v = float32_to_int64(a, status);
7089 if (v < 0) {
7090 res = 0;
7091 } else if (v > 0xffffffff) {
7092 res = 0xffffffff;
7093 } else {
7094 return v;
7096 set_float_exception_flags(old_exc_flags, status);
7097 float_raise(float_flag_invalid, status);
7098 return res;
7101 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *status)
7103 int64_t v;
7104 uint32_t res;
7105 int old_exc_flags = get_float_exception_flags(status);
7107 v = float32_to_int64_round_to_zero(a, status);
7108 if (v < 0) {
7109 res = 0;
7110 } else if (v > 0xffffffff) {
7111 res = 0xffffffff;
7112 } else {
7113 return v;
7115 set_float_exception_flags(old_exc_flags, status);
7116 float_raise(float_flag_invalid, status);
7117 return res;
7120 int_fast16_t float32_to_int16(float32 a, float_status *status)
7122 int32_t v;
7123 int_fast16_t res;
7124 int old_exc_flags = get_float_exception_flags(status);
7126 v = float32_to_int32(a, status);
7127 if (v < -0x8000) {
7128 res = -0x8000;
7129 } else if (v > 0x7fff) {
7130 res = 0x7fff;
7131 } else {
7132 return v;
7135 set_float_exception_flags(old_exc_flags, status);
7136 float_raise(float_flag_invalid, status);
7137 return res;
7140 uint_fast16_t float32_to_uint16(float32 a, float_status *status)
7142 int32_t v;
7143 uint_fast16_t res;
7144 int old_exc_flags = get_float_exception_flags(status);
7146 v = float32_to_int32(a, status);
7147 if (v < 0) {
7148 res = 0;
7149 } else if (v > 0xffff) {
7150 res = 0xffff;
7151 } else {
7152 return v;
7155 set_float_exception_flags(old_exc_flags, status);
7156 float_raise(float_flag_invalid, status);
7157 return res;
7160 uint_fast16_t float32_to_uint16_round_to_zero(float32 a, float_status *status)
7162 int64_t v;
7163 uint_fast16_t res;
7164 int old_exc_flags = get_float_exception_flags(status);
7166 v = float32_to_int64_round_to_zero(a, status);
7167 if (v < 0) {
7168 res = 0;
7169 } else if (v > 0xffff) {
7170 res = 0xffff;
7171 } else {
7172 return v;
7174 set_float_exception_flags(old_exc_flags, status);
7175 float_raise(float_flag_invalid, status);
7176 return res;
7179 uint32_t float64_to_uint32(float64 a, float_status *status)
7181 uint64_t v;
7182 uint32_t res;
7183 int old_exc_flags = get_float_exception_flags(status);
7185 v = float64_to_uint64(a, status);
7186 if (v > 0xffffffff) {
7187 res = 0xffffffff;
7188 } else {
7189 return v;
7191 set_float_exception_flags(old_exc_flags, status);
7192 float_raise(float_flag_invalid, status);
7193 return res;
7196 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *status)
7198 uint64_t v;
7199 uint32_t res;
7200 int old_exc_flags = get_float_exception_flags(status);
7202 v = float64_to_uint64_round_to_zero(a, status);
7203 if (v > 0xffffffff) {
7204 res = 0xffffffff;
7205 } else {
7206 return v;
7208 set_float_exception_flags(old_exc_flags, status);
7209 float_raise(float_flag_invalid, status);
7210 return res;
7213 int_fast16_t float64_to_int16(float64 a, float_status *status)
7215 int64_t v;
7216 int_fast16_t res;
7217 int old_exc_flags = get_float_exception_flags(status);
7219 v = float64_to_int32(a, status);
7220 if (v < -0x8000) {
7221 res = -0x8000;
7222 } else if (v > 0x7fff) {
7223 res = 0x7fff;
7224 } else {
7225 return v;
7228 set_float_exception_flags(old_exc_flags, status);
7229 float_raise(float_flag_invalid, status);
7230 return res;
7233 uint_fast16_t float64_to_uint16(float64 a, float_status *status)
7235 int64_t v;
7236 uint_fast16_t res;
7237 int old_exc_flags = get_float_exception_flags(status);
7239 v = float64_to_int32(a, status);
7240 if (v < 0) {
7241 res = 0;
7242 } else if (v > 0xffff) {
7243 res = 0xffff;
7244 } else {
7245 return v;
7248 set_float_exception_flags(old_exc_flags, status);
7249 float_raise(float_flag_invalid, status);
7250 return res;
7253 uint_fast16_t float64_to_uint16_round_to_zero(float64 a, float_status *status)
7255 int64_t v;
7256 uint_fast16_t res;
7257 int old_exc_flags = get_float_exception_flags(status);
7259 v = float64_to_int64_round_to_zero(a, status);
7260 if (v < 0) {
7261 res = 0;
7262 } else if (v > 0xffff) {
7263 res = 0xffff;
7264 } else {
7265 return v;
7267 set_float_exception_flags(old_exc_flags, status);
7268 float_raise(float_flag_invalid, status);
7269 return res;
7272 /*----------------------------------------------------------------------------
7273 | Returns the result of converting the double-precision floating-point value
7274 | `a' to the 64-bit unsigned integer format. The conversion is
7275 | performed according to the IEC/IEEE Standard for Binary Floating-Point
7276 | Arithmetic---which means in particular that the conversion is rounded
7277 | according to the current rounding mode. If `a' is a NaN, the largest
7278 | positive integer is returned. If the conversion overflows, the
7279 | largest unsigned integer is returned. If 'a' is negative, the value is
7280 | rounded and zero is returned; negative values that do not round to zero
7281 | will raise the inexact exception.
7282 *----------------------------------------------------------------------------*/
7284 uint64_t float64_to_uint64(float64 a, float_status *status)
7286 flag aSign;
7287 int_fast16_t aExp, shiftCount;
7288 uint64_t aSig, aSigExtra;
7289 a = float64_squash_input_denormal(a, status);
7291 aSig = extractFloat64Frac(a);
7292 aExp = extractFloat64Exp(a);
7293 aSign = extractFloat64Sign(a);
7294 if (aSign && (aExp > 1022)) {
7295 float_raise(float_flag_invalid, status);
7296 if (float64_is_any_nan(a)) {
7297 return LIT64(0xFFFFFFFFFFFFFFFF);
7298 } else {
7299 return 0;
7302 if (aExp) {
7303 aSig |= LIT64(0x0010000000000000);
7305 shiftCount = 0x433 - aExp;
7306 if (shiftCount <= 0) {
7307 if (0x43E < aExp) {
7308 float_raise(float_flag_invalid, status);
7309 return LIT64(0xFFFFFFFFFFFFFFFF);
7311 aSigExtra = 0;
7312 aSig <<= -shiftCount;
7313 } else {
7314 shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra);
7316 return roundAndPackUint64(aSign, aSig, aSigExtra, status);
7319 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *status)
7321 signed char current_rounding_mode = status->float_rounding_mode;
7322 set_float_rounding_mode(float_round_to_zero, status);
7323 int64_t v = float64_to_uint64(a, status);
7324 set_float_rounding_mode(current_rounding_mode, status);
7325 return v;
7328 #define COMPARE(s, nan_exp) \
7329 static inline int float ## s ## _compare_internal(float ## s a, float ## s b,\
7330 int is_quiet, float_status *status) \
7332 flag aSign, bSign; \
7333 uint ## s ## _t av, bv; \
7334 a = float ## s ## _squash_input_denormal(a, status); \
7335 b = float ## s ## _squash_input_denormal(b, status); \
7337 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \
7338 extractFloat ## s ## Frac( a ) ) || \
7339 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \
7340 extractFloat ## s ## Frac( b ) )) { \
7341 if (!is_quiet || \
7342 float ## s ## _is_signaling_nan( a ) || \
7343 float ## s ## _is_signaling_nan( b ) ) { \
7344 float_raise(float_flag_invalid, status); \
7346 return float_relation_unordered; \
7348 aSign = extractFloat ## s ## Sign( a ); \
7349 bSign = extractFloat ## s ## Sign( b ); \
7350 av = float ## s ## _val(a); \
7351 bv = float ## s ## _val(b); \
7352 if ( aSign != bSign ) { \
7353 if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) { \
7354 /* zero case */ \
7355 return float_relation_equal; \
7356 } else { \
7357 return 1 - (2 * aSign); \
7359 } else { \
7360 if (av == bv) { \
7361 return float_relation_equal; \
7362 } else { \
7363 return 1 - 2 * (aSign ^ ( av < bv )); \
7368 int float ## s ## _compare(float ## s a, float ## s b, float_status *status) \
7370 return float ## s ## _compare_internal(a, b, 0, status); \
7373 int float ## s ## _compare_quiet(float ## s a, float ## s b, \
7374 float_status *status) \
7376 return float ## s ## _compare_internal(a, b, 1, status); \
7379 COMPARE(32, 0xff)
7380 COMPARE(64, 0x7ff)
7382 static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
7383 int is_quiet, float_status *status)
7385 flag aSign, bSign;
7387 if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7388 ( extractFloatx80Frac( a )<<1 ) ) ||
7389 ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7390 ( extractFloatx80Frac( b )<<1 ) )) {
7391 if (!is_quiet ||
7392 floatx80_is_signaling_nan( a ) ||
7393 floatx80_is_signaling_nan( b ) ) {
7394 float_raise(float_flag_invalid, status);
7396 return float_relation_unordered;
7398 aSign = extractFloatx80Sign( a );
7399 bSign = extractFloatx80Sign( b );
7400 if ( aSign != bSign ) {
7402 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7403 ( ( a.low | b.low ) == 0 ) ) {
7404 /* zero case */
7405 return float_relation_equal;
7406 } else {
7407 return 1 - (2 * aSign);
7409 } else {
7410 if (a.low == b.low && a.high == b.high) {
7411 return float_relation_equal;
7412 } else {
7413 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7418 int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
7420 return floatx80_compare_internal(a, b, 0, status);
7423 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
7425 return floatx80_compare_internal(a, b, 1, status);
7428 static inline int float128_compare_internal(float128 a, float128 b,
7429 int is_quiet, float_status *status)
7431 flag aSign, bSign;
7433 if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7434 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7435 ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7436 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7437 if (!is_quiet ||
7438 float128_is_signaling_nan( a ) ||
7439 float128_is_signaling_nan( b ) ) {
7440 float_raise(float_flag_invalid, status);
7442 return float_relation_unordered;
7444 aSign = extractFloat128Sign( a );
7445 bSign = extractFloat128Sign( b );
7446 if ( aSign != bSign ) {
7447 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7448 /* zero case */
7449 return float_relation_equal;
7450 } else {
7451 return 1 - (2 * aSign);
7453 } else {
7454 if (a.low == b.low && a.high == b.high) {
7455 return float_relation_equal;
7456 } else {
7457 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7462 int float128_compare(float128 a, float128 b, float_status *status)
7464 return float128_compare_internal(a, b, 0, status);
7467 int float128_compare_quiet(float128 a, float128 b, float_status *status)
7469 return float128_compare_internal(a, b, 1, status);
7472 /* min() and max() functions. These can't be implemented as
7473 * 'compare and pick one input' because that would mishandle
7474 * NaNs and +0 vs -0.
7476 * minnum() and maxnum() functions. These are similar to the min()
7477 * and max() functions but if one of the arguments is a QNaN and
7478 * the other is numerical then the numerical argument is returned.
7479 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
7480 * and maxNum() operations. min() and max() are the typical min/max
7481 * semantics provided by many CPUs which predate that specification.
7483 * minnummag() and maxnummag() functions correspond to minNumMag()
7484 * and minNumMag() from the IEEE-754 2008.
7486 #define MINMAX(s) \
7487 static inline float ## s float ## s ## _minmax(float ## s a, float ## s b, \
7488 int ismin, int isieee, \
7489 int ismag, \
7490 float_status *status) \
7492 flag aSign, bSign; \
7493 uint ## s ## _t av, bv, aav, abv; \
7494 a = float ## s ## _squash_input_denormal(a, status); \
7495 b = float ## s ## _squash_input_denormal(b, status); \
7496 if (float ## s ## _is_any_nan(a) || \
7497 float ## s ## _is_any_nan(b)) { \
7498 if (isieee) { \
7499 if (float ## s ## _is_quiet_nan(a) && \
7500 !float ## s ##_is_any_nan(b)) { \
7501 return b; \
7502 } else if (float ## s ## _is_quiet_nan(b) && \
7503 !float ## s ## _is_any_nan(a)) { \
7504 return a; \
7507 return propagateFloat ## s ## NaN(a, b, status); \
7509 aSign = extractFloat ## s ## Sign(a); \
7510 bSign = extractFloat ## s ## Sign(b); \
7511 av = float ## s ## _val(a); \
7512 bv = float ## s ## _val(b); \
7513 if (ismag) { \
7514 aav = float ## s ## _abs(av); \
7515 abv = float ## s ## _abs(bv); \
7516 if (aav != abv) { \
7517 if (ismin) { \
7518 return (aav < abv) ? a : b; \
7519 } else { \
7520 return (aav < abv) ? b : a; \
7524 if (aSign != bSign) { \
7525 if (ismin) { \
7526 return aSign ? a : b; \
7527 } else { \
7528 return aSign ? b : a; \
7530 } else { \
7531 if (ismin) { \
7532 return (aSign ^ (av < bv)) ? a : b; \
7533 } else { \
7534 return (aSign ^ (av < bv)) ? b : a; \
7539 float ## s float ## s ## _min(float ## s a, float ## s b, \
7540 float_status *status) \
7542 return float ## s ## _minmax(a, b, 1, 0, 0, status); \
7545 float ## s float ## s ## _max(float ## s a, float ## s b, \
7546 float_status *status) \
7548 return float ## s ## _minmax(a, b, 0, 0, 0, status); \
7551 float ## s float ## s ## _minnum(float ## s a, float ## s b, \
7552 float_status *status) \
7554 return float ## s ## _minmax(a, b, 1, 1, 0, status); \
7557 float ## s float ## s ## _maxnum(float ## s a, float ## s b, \
7558 float_status *status) \
7560 return float ## s ## _minmax(a, b, 0, 1, 0, status); \
7563 float ## s float ## s ## _minnummag(float ## s a, float ## s b, \
7564 float_status *status) \
7566 return float ## s ## _minmax(a, b, 1, 1, 1, status); \
7569 float ## s float ## s ## _maxnummag(float ## s a, float ## s b, \
7570 float_status *status) \
7572 return float ## s ## _minmax(a, b, 0, 1, 1, status); \
7575 MINMAX(32)
7576 MINMAX(64)
7579 /* Multiply A by 2 raised to the power N. */
7580 float32 float32_scalbn(float32 a, int n, float_status *status)
7582 flag aSign;
7583 int16_t aExp;
7584 uint32_t aSig;
7586 a = float32_squash_input_denormal(a, status);
7587 aSig = extractFloat32Frac( a );
7588 aExp = extractFloat32Exp( a );
7589 aSign = extractFloat32Sign( a );
7591 if ( aExp == 0xFF ) {
7592 if ( aSig ) {
7593 return propagateFloat32NaN(a, a, status);
7595 return a;
7597 if (aExp != 0) {
7598 aSig |= 0x00800000;
7599 } else if (aSig == 0) {
7600 return a;
7601 } else {
7602 aExp++;
7605 if (n > 0x200) {
7606 n = 0x200;
7607 } else if (n < -0x200) {
7608 n = -0x200;
7611 aExp += n - 1;
7612 aSig <<= 7;
7613 return normalizeRoundAndPackFloat32(aSign, aExp, aSig, status);
7616 float64 float64_scalbn(float64 a, int n, float_status *status)
7618 flag aSign;
7619 int16_t aExp;
7620 uint64_t aSig;
7622 a = float64_squash_input_denormal(a, status);
7623 aSig = extractFloat64Frac( a );
7624 aExp = extractFloat64Exp( a );
7625 aSign = extractFloat64Sign( a );
7627 if ( aExp == 0x7FF ) {
7628 if ( aSig ) {
7629 return propagateFloat64NaN(a, a, status);
7631 return a;
7633 if (aExp != 0) {
7634 aSig |= LIT64( 0x0010000000000000 );
7635 } else if (aSig == 0) {
7636 return a;
7637 } else {
7638 aExp++;
7641 if (n > 0x1000) {
7642 n = 0x1000;
7643 } else if (n < -0x1000) {
7644 n = -0x1000;
7647 aExp += n - 1;
7648 aSig <<= 10;
7649 return normalizeRoundAndPackFloat64(aSign, aExp, aSig, status);
7652 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7654 flag aSign;
7655 int32_t aExp;
7656 uint64_t aSig;
7658 aSig = extractFloatx80Frac( a );
7659 aExp = extractFloatx80Exp( a );
7660 aSign = extractFloatx80Sign( a );
7662 if ( aExp == 0x7FFF ) {
7663 if ( aSig<<1 ) {
7664 return propagateFloatx80NaN(a, a, status);
7666 return a;
7669 if (aExp == 0) {
7670 if (aSig == 0) {
7671 return a;
7673 aExp++;
7676 if (n > 0x10000) {
7677 n = 0x10000;
7678 } else if (n < -0x10000) {
7679 n = -0x10000;
7682 aExp += n;
7683 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7684 aSign, aExp, aSig, 0, status);
7687 float128 float128_scalbn(float128 a, int n, float_status *status)
7689 flag aSign;
7690 int32_t aExp;
7691 uint64_t aSig0, aSig1;
7693 aSig1 = extractFloat128Frac1( a );
7694 aSig0 = extractFloat128Frac0( a );
7695 aExp = extractFloat128Exp( a );
7696 aSign = extractFloat128Sign( a );
7697 if ( aExp == 0x7FFF ) {
7698 if ( aSig0 | aSig1 ) {
7699 return propagateFloat128NaN(a, a, status);
7701 return a;
7703 if (aExp != 0) {
7704 aSig0 |= LIT64( 0x0001000000000000 );
7705 } else if (aSig0 == 0 && aSig1 == 0) {
7706 return a;
7707 } else {
7708 aExp++;
7711 if (n > 0x10000) {
7712 n = 0x10000;
7713 } else if (n < -0x10000) {
7714 n = -0x10000;
7717 aExp += n - 1;
7718 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7719 , status);