4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
44 ===============================================================================
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
85 #include "qemu/osdep.h"
87 #include "fpu/softfloat.h"
89 /* We only need stdlib for abort() */
91 /*----------------------------------------------------------------------------
92 | Primitive arithmetic functions, including multi-word arithmetic, and
93 | division and square root approximations. (Can be specialized to target if
95 *----------------------------------------------------------------------------*/
96 #include "softfloat-macros.h"
98 /*----------------------------------------------------------------------------
99 | Functions and definitions to determine: (1) whether tininess for underflow
100 | is detected before or after rounding by default, (2) what (if anything)
101 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
102 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
103 | are propagated from function inputs to output. These details are target-
105 *----------------------------------------------------------------------------*/
106 #include "softfloat-specialize.h"
108 /*----------------------------------------------------------------------------
109 | Returns the fraction bits of the half-precision floating-point value `a'.
110 *----------------------------------------------------------------------------*/
112 static inline uint32_t extractFloat16Frac(float16 a
)
114 return float16_val(a
) & 0x3ff;
117 /*----------------------------------------------------------------------------
118 | Returns the exponent bits of the half-precision floating-point value `a'.
119 *----------------------------------------------------------------------------*/
121 static inline int extractFloat16Exp(float16 a
)
123 return (float16_val(a
) >> 10) & 0x1f;
126 /*----------------------------------------------------------------------------
127 | Returns the sign bit of the single-precision floating-point value `a'.
128 *----------------------------------------------------------------------------*/
130 static inline flag
extractFloat16Sign(float16 a
)
132 return float16_val(a
)>>15;
135 /*----------------------------------------------------------------------------
136 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
137 | and 7, and returns the properly rounded 32-bit integer corresponding to the
138 | input. If `zSign' is 1, the input is negated before being converted to an
139 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
140 | is simply rounded to an integer, with the inexact exception raised if the
141 | input cannot be represented exactly as an integer. However, if the fixed-
142 | point input is too large, the invalid exception is raised and the largest
143 | positive or negative integer is returned.
144 *----------------------------------------------------------------------------*/
146 static int32_t roundAndPackInt32(flag zSign
, uint64_t absZ
, float_status
*status
)
149 flag roundNearestEven
;
150 int8_t roundIncrement
, roundBits
;
153 roundingMode
= status
->float_rounding_mode
;
154 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
155 switch (roundingMode
) {
156 case float_round_nearest_even
:
157 case float_round_ties_away
:
158 roundIncrement
= 0x40;
160 case float_round_to_zero
:
164 roundIncrement
= zSign
? 0 : 0x7f;
166 case float_round_down
:
167 roundIncrement
= zSign
? 0x7f : 0;
172 roundBits
= absZ
& 0x7F;
173 absZ
= ( absZ
+ roundIncrement
)>>7;
174 absZ
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
176 if ( zSign
) z
= - z
;
177 if ( ( absZ
>>32 ) || ( z
&& ( ( z
< 0 ) ^ zSign
) ) ) {
178 float_raise(float_flag_invalid
, status
);
179 return zSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
182 status
->float_exception_flags
|= float_flag_inexact
;
188 /*----------------------------------------------------------------------------
189 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
190 | `absZ1', with binary point between bits 63 and 64 (between the input words),
191 | and returns the properly rounded 64-bit integer corresponding to the input.
192 | If `zSign' is 1, the input is negated before being converted to an integer.
193 | Ordinarily, the fixed-point input is simply rounded to an integer, with
194 | the inexact exception raised if the input cannot be represented exactly as
195 | an integer. However, if the fixed-point input is too large, the invalid
196 | exception is raised and the largest positive or negative integer is
198 *----------------------------------------------------------------------------*/
200 static int64_t roundAndPackInt64(flag zSign
, uint64_t absZ0
, uint64_t absZ1
,
201 float_status
*status
)
204 flag roundNearestEven
, increment
;
207 roundingMode
= status
->float_rounding_mode
;
208 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
209 switch (roundingMode
) {
210 case float_round_nearest_even
:
211 case float_round_ties_away
:
212 increment
= ((int64_t) absZ1
< 0);
214 case float_round_to_zero
:
218 increment
= !zSign
&& absZ1
;
220 case float_round_down
:
221 increment
= zSign
&& absZ1
;
228 if ( absZ0
== 0 ) goto overflow
;
229 absZ0
&= ~ ( ( (uint64_t) ( absZ1
<<1 ) == 0 ) & roundNearestEven
);
232 if ( zSign
) z
= - z
;
233 if ( z
&& ( ( z
< 0 ) ^ zSign
) ) {
235 float_raise(float_flag_invalid
, status
);
237 zSign
? (int64_t) LIT64( 0x8000000000000000 )
238 : LIT64( 0x7FFFFFFFFFFFFFFF );
241 status
->float_exception_flags
|= float_flag_inexact
;
247 /*----------------------------------------------------------------------------
248 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
249 | `absZ1', with binary point between bits 63 and 64 (between the input words),
250 | and returns the properly rounded 64-bit unsigned integer corresponding to the
251 | input. Ordinarily, the fixed-point input is simply rounded to an integer,
252 | with the inexact exception raised if the input cannot be represented exactly
253 | as an integer. However, if the fixed-point input is too large, the invalid
254 | exception is raised and the largest unsigned integer is returned.
255 *----------------------------------------------------------------------------*/
257 static int64_t roundAndPackUint64(flag zSign
, uint64_t absZ0
,
258 uint64_t absZ1
, float_status
*status
)
261 flag roundNearestEven
, increment
;
263 roundingMode
= status
->float_rounding_mode
;
264 roundNearestEven
= (roundingMode
== float_round_nearest_even
);
265 switch (roundingMode
) {
266 case float_round_nearest_even
:
267 case float_round_ties_away
:
268 increment
= ((int64_t)absZ1
< 0);
270 case float_round_to_zero
:
274 increment
= !zSign
&& absZ1
;
276 case float_round_down
:
277 increment
= zSign
&& absZ1
;
285 float_raise(float_flag_invalid
, status
);
286 return LIT64(0xFFFFFFFFFFFFFFFF);
288 absZ0
&= ~(((uint64_t)(absZ1
<<1) == 0) & roundNearestEven
);
291 if (zSign
&& absZ0
) {
292 float_raise(float_flag_invalid
, status
);
297 status
->float_exception_flags
|= float_flag_inexact
;
302 /*----------------------------------------------------------------------------
303 | Returns the fraction bits of the single-precision floating-point value `a'.
304 *----------------------------------------------------------------------------*/
306 static inline uint32_t extractFloat32Frac( float32 a
)
309 return float32_val(a
) & 0x007FFFFF;
313 /*----------------------------------------------------------------------------
314 | Returns the exponent bits of the single-precision floating-point value `a'.
315 *----------------------------------------------------------------------------*/
317 static inline int extractFloat32Exp(float32 a
)
320 return ( float32_val(a
)>>23 ) & 0xFF;
324 /*----------------------------------------------------------------------------
325 | Returns the sign bit of the single-precision floating-point value `a'.
326 *----------------------------------------------------------------------------*/
328 static inline flag
extractFloat32Sign( float32 a
)
331 return float32_val(a
)>>31;
335 /*----------------------------------------------------------------------------
336 | If `a' is denormal and we are in flush-to-zero mode then set the
337 | input-denormal exception and return zero. Otherwise just return the value.
338 *----------------------------------------------------------------------------*/
339 float32
float32_squash_input_denormal(float32 a
, float_status
*status
)
341 if (status
->flush_inputs_to_zero
) {
342 if (extractFloat32Exp(a
) == 0 && extractFloat32Frac(a
) != 0) {
343 float_raise(float_flag_input_denormal
, status
);
344 return make_float32(float32_val(a
) & 0x80000000);
350 /*----------------------------------------------------------------------------
351 | Normalizes the subnormal single-precision floating-point value represented
352 | by the denormalized significand `aSig'. The normalized exponent and
353 | significand are stored at the locations pointed to by `zExpPtr' and
354 | `zSigPtr', respectively.
355 *----------------------------------------------------------------------------*/
358 normalizeFloat32Subnormal(uint32_t aSig
, int *zExpPtr
, uint32_t *zSigPtr
)
362 shiftCount
= countLeadingZeros32( aSig
) - 8;
363 *zSigPtr
= aSig
<<shiftCount
;
364 *zExpPtr
= 1 - shiftCount
;
368 /*----------------------------------------------------------------------------
369 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
370 | single-precision floating-point value, returning the result. After being
371 | shifted into the proper positions, the three fields are simply added
372 | together to form the result. This means that any integer portion of `zSig'
373 | will be added into the exponent. Since a properly normalized significand
374 | will have an integer portion equal to 1, the `zExp' input should be 1 less
375 | than the desired result exponent whenever `zSig' is a complete, normalized
377 *----------------------------------------------------------------------------*/
379 static inline float32
packFloat32(flag zSign
, int zExp
, uint32_t zSig
)
383 ( ( (uint32_t) zSign
)<<31 ) + ( ( (uint32_t) zExp
)<<23 ) + zSig
);
387 /*----------------------------------------------------------------------------
388 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
389 | and significand `zSig', and returns the proper single-precision floating-
390 | point value corresponding to the abstract input. Ordinarily, the abstract
391 | value is simply rounded and packed into the single-precision format, with
392 | the inexact exception raised if the abstract input cannot be represented
393 | exactly. However, if the abstract value is too large, the overflow and
394 | inexact exceptions are raised and an infinity or maximal finite value is
395 | returned. If the abstract value is too small, the input value is rounded to
396 | a subnormal number, and the underflow and inexact exceptions are raised if
397 | the abstract input cannot be represented exactly as a subnormal single-
398 | precision floating-point number.
399 | The input significand `zSig' has its binary point between bits 30
400 | and 29, which is 7 bits to the left of the usual location. This shifted
401 | significand must be normalized or smaller. If `zSig' is not normalized,
402 | `zExp' must be 0; in that case, the result returned is a subnormal number,
403 | and it must not require rounding. In the usual case that `zSig' is
404 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
405 | The handling of underflow and overflow follows the IEC/IEEE Standard for
406 | Binary Floating-Point Arithmetic.
407 *----------------------------------------------------------------------------*/
409 static float32
roundAndPackFloat32(flag zSign
, int zExp
, uint32_t zSig
,
410 float_status
*status
)
413 flag roundNearestEven
;
414 int8_t roundIncrement
, roundBits
;
417 roundingMode
= status
->float_rounding_mode
;
418 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
419 switch (roundingMode
) {
420 case float_round_nearest_even
:
421 case float_round_ties_away
:
422 roundIncrement
= 0x40;
424 case float_round_to_zero
:
428 roundIncrement
= zSign
? 0 : 0x7f;
430 case float_round_down
:
431 roundIncrement
= zSign
? 0x7f : 0;
437 roundBits
= zSig
& 0x7F;
438 if ( 0xFD <= (uint16_t) zExp
) {
440 || ( ( zExp
== 0xFD )
441 && ( (int32_t) ( zSig
+ roundIncrement
) < 0 ) )
443 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
444 return packFloat32( zSign
, 0xFF, - ( roundIncrement
== 0 ));
447 if (status
->flush_to_zero
) {
448 float_raise(float_flag_output_denormal
, status
);
449 return packFloat32(zSign
, 0, 0);
452 (status
->float_detect_tininess
453 == float_tininess_before_rounding
)
455 || ( zSig
+ roundIncrement
< 0x80000000 );
456 shift32RightJamming( zSig
, - zExp
, &zSig
);
458 roundBits
= zSig
& 0x7F;
459 if (isTiny
&& roundBits
) {
460 float_raise(float_flag_underflow
, status
);
465 status
->float_exception_flags
|= float_flag_inexact
;
467 zSig
= ( zSig
+ roundIncrement
)>>7;
468 zSig
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
469 if ( zSig
== 0 ) zExp
= 0;
470 return packFloat32( zSign
, zExp
, zSig
);
474 /*----------------------------------------------------------------------------
475 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
476 | and significand `zSig', and returns the proper single-precision floating-
477 | point value corresponding to the abstract input. This routine is just like
478 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
479 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
480 | floating-point exponent.
481 *----------------------------------------------------------------------------*/
484 normalizeRoundAndPackFloat32(flag zSign
, int zExp
, uint32_t zSig
,
485 float_status
*status
)
489 shiftCount
= countLeadingZeros32( zSig
) - 1;
490 return roundAndPackFloat32(zSign
, zExp
- shiftCount
, zSig
<<shiftCount
,
495 /*----------------------------------------------------------------------------
496 | Returns the fraction bits of the double-precision floating-point value `a'.
497 *----------------------------------------------------------------------------*/
499 static inline uint64_t extractFloat64Frac( float64 a
)
502 return float64_val(a
) & LIT64( 0x000FFFFFFFFFFFFF );
506 /*----------------------------------------------------------------------------
507 | Returns the exponent bits of the double-precision floating-point value `a'.
508 *----------------------------------------------------------------------------*/
510 static inline int extractFloat64Exp(float64 a
)
513 return ( float64_val(a
)>>52 ) & 0x7FF;
517 /*----------------------------------------------------------------------------
518 | Returns the sign bit of the double-precision floating-point value `a'.
519 *----------------------------------------------------------------------------*/
521 static inline flag
extractFloat64Sign( float64 a
)
524 return float64_val(a
)>>63;
528 /*----------------------------------------------------------------------------
529 | If `a' is denormal and we are in flush-to-zero mode then set the
530 | input-denormal exception and return zero. Otherwise just return the value.
531 *----------------------------------------------------------------------------*/
532 float64
float64_squash_input_denormal(float64 a
, float_status
*status
)
534 if (status
->flush_inputs_to_zero
) {
535 if (extractFloat64Exp(a
) == 0 && extractFloat64Frac(a
) != 0) {
536 float_raise(float_flag_input_denormal
, status
);
537 return make_float64(float64_val(a
) & (1ULL << 63));
543 /*----------------------------------------------------------------------------
544 | Normalizes the subnormal double-precision floating-point value represented
545 | by the denormalized significand `aSig'. The normalized exponent and
546 | significand are stored at the locations pointed to by `zExpPtr' and
547 | `zSigPtr', respectively.
548 *----------------------------------------------------------------------------*/
551 normalizeFloat64Subnormal(uint64_t aSig
, int *zExpPtr
, uint64_t *zSigPtr
)
555 shiftCount
= countLeadingZeros64( aSig
) - 11;
556 *zSigPtr
= aSig
<<shiftCount
;
557 *zExpPtr
= 1 - shiftCount
;
561 /*----------------------------------------------------------------------------
562 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
563 | double-precision floating-point value, returning the result. After being
564 | shifted into the proper positions, the three fields are simply added
565 | together to form the result. This means that any integer portion of `zSig'
566 | will be added into the exponent. Since a properly normalized significand
567 | will have an integer portion equal to 1, the `zExp' input should be 1 less
568 | than the desired result exponent whenever `zSig' is a complete, normalized
570 *----------------------------------------------------------------------------*/
572 static inline float64
packFloat64(flag zSign
, int zExp
, uint64_t zSig
)
576 ( ( (uint64_t) zSign
)<<63 ) + ( ( (uint64_t) zExp
)<<52 ) + zSig
);
580 /*----------------------------------------------------------------------------
581 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
582 | and significand `zSig', and returns the proper double-precision floating-
583 | point value corresponding to the abstract input. Ordinarily, the abstract
584 | value is simply rounded and packed into the double-precision format, with
585 | the inexact exception raised if the abstract input cannot be represented
586 | exactly. However, if the abstract value is too large, the overflow and
587 | inexact exceptions are raised and an infinity or maximal finite value is
588 | returned. If the abstract value is too small, the input value is rounded to
589 | a subnormal number, and the underflow and inexact exceptions are raised if
590 | the abstract input cannot be represented exactly as a subnormal double-
591 | precision floating-point number.
592 | The input significand `zSig' has its binary point between bits 62
593 | and 61, which is 10 bits to the left of the usual location. This shifted
594 | significand must be normalized or smaller. If `zSig' is not normalized,
595 | `zExp' must be 0; in that case, the result returned is a subnormal number,
596 | and it must not require rounding. In the usual case that `zSig' is
597 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
598 | The handling of underflow and overflow follows the IEC/IEEE Standard for
599 | Binary Floating-Point Arithmetic.
600 *----------------------------------------------------------------------------*/
602 static float64
roundAndPackFloat64(flag zSign
, int zExp
, uint64_t zSig
,
603 float_status
*status
)
606 flag roundNearestEven
;
607 int roundIncrement
, roundBits
;
610 roundingMode
= status
->float_rounding_mode
;
611 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
612 switch (roundingMode
) {
613 case float_round_nearest_even
:
614 case float_round_ties_away
:
615 roundIncrement
= 0x200;
617 case float_round_to_zero
:
621 roundIncrement
= zSign
? 0 : 0x3ff;
623 case float_round_down
:
624 roundIncrement
= zSign
? 0x3ff : 0;
626 case float_round_to_odd
:
627 roundIncrement
= (zSig
& 0x400) ? 0 : 0x3ff;
632 roundBits
= zSig
& 0x3FF;
633 if ( 0x7FD <= (uint16_t) zExp
) {
634 if ( ( 0x7FD < zExp
)
635 || ( ( zExp
== 0x7FD )
636 && ( (int64_t) ( zSig
+ roundIncrement
) < 0 ) )
638 bool overflow_to_inf
= roundingMode
!= float_round_to_odd
&&
640 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
641 return packFloat64(zSign
, 0x7FF, -(!overflow_to_inf
));
644 if (status
->flush_to_zero
) {
645 float_raise(float_flag_output_denormal
, status
);
646 return packFloat64(zSign
, 0, 0);
649 (status
->float_detect_tininess
650 == float_tininess_before_rounding
)
652 || ( zSig
+ roundIncrement
< LIT64( 0x8000000000000000 ) );
653 shift64RightJamming( zSig
, - zExp
, &zSig
);
655 roundBits
= zSig
& 0x3FF;
656 if (isTiny
&& roundBits
) {
657 float_raise(float_flag_underflow
, status
);
659 if (roundingMode
== float_round_to_odd
) {
661 * For round-to-odd case, the roundIncrement depends on
662 * zSig which just changed.
664 roundIncrement
= (zSig
& 0x400) ? 0 : 0x3ff;
669 status
->float_exception_flags
|= float_flag_inexact
;
671 zSig
= ( zSig
+ roundIncrement
)>>10;
672 zSig
&= ~ ( ( ( roundBits
^ 0x200 ) == 0 ) & roundNearestEven
);
673 if ( zSig
== 0 ) zExp
= 0;
674 return packFloat64( zSign
, zExp
, zSig
);
678 /*----------------------------------------------------------------------------
679 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
680 | and significand `zSig', and returns the proper double-precision floating-
681 | point value corresponding to the abstract input. This routine is just like
682 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
683 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
684 | floating-point exponent.
685 *----------------------------------------------------------------------------*/
688 normalizeRoundAndPackFloat64(flag zSign
, int zExp
, uint64_t zSig
,
689 float_status
*status
)
693 shiftCount
= countLeadingZeros64( zSig
) - 1;
694 return roundAndPackFloat64(zSign
, zExp
- shiftCount
, zSig
<<shiftCount
,
699 /*----------------------------------------------------------------------------
700 | Returns the fraction bits of the extended double-precision floating-point
702 *----------------------------------------------------------------------------*/
704 static inline uint64_t extractFloatx80Frac( floatx80 a
)
711 /*----------------------------------------------------------------------------
712 | Returns the exponent bits of the extended double-precision floating-point
714 *----------------------------------------------------------------------------*/
716 static inline int32_t extractFloatx80Exp( floatx80 a
)
719 return a
.high
& 0x7FFF;
723 /*----------------------------------------------------------------------------
724 | Returns the sign bit of the extended double-precision floating-point value
726 *----------------------------------------------------------------------------*/
728 static inline flag
extractFloatx80Sign( floatx80 a
)
735 /*----------------------------------------------------------------------------
736 | Normalizes the subnormal extended double-precision floating-point value
737 | represented by the denormalized significand `aSig'. The normalized exponent
738 | and significand are stored at the locations pointed to by `zExpPtr' and
739 | `zSigPtr', respectively.
740 *----------------------------------------------------------------------------*/
743 normalizeFloatx80Subnormal( uint64_t aSig
, int32_t *zExpPtr
, uint64_t *zSigPtr
)
747 shiftCount
= countLeadingZeros64( aSig
);
748 *zSigPtr
= aSig
<<shiftCount
;
749 *zExpPtr
= 1 - shiftCount
;
753 /*----------------------------------------------------------------------------
754 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
755 | extended double-precision floating-point value, returning the result.
756 *----------------------------------------------------------------------------*/
758 static inline floatx80
packFloatx80( flag zSign
, int32_t zExp
, uint64_t zSig
)
763 z
.high
= ( ( (uint16_t) zSign
)<<15 ) + zExp
;
768 /*----------------------------------------------------------------------------
769 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
770 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
771 | and returns the proper extended double-precision floating-point value
772 | corresponding to the abstract input. Ordinarily, the abstract value is
773 | rounded and packed into the extended double-precision format, with the
774 | inexact exception raised if the abstract input cannot be represented
775 | exactly. However, if the abstract value is too large, the overflow and
776 | inexact exceptions are raised and an infinity or maximal finite value is
777 | returned. If the abstract value is too small, the input value is rounded to
778 | a subnormal number, and the underflow and inexact exceptions are raised if
779 | the abstract input cannot be represented exactly as a subnormal extended
780 | double-precision floating-point number.
781 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
782 | number of bits as single or double precision, respectively. Otherwise, the
783 | result is rounded to the full precision of the extended double-precision
785 | The input significand must be normalized or smaller. If the input
786 | significand is not normalized, `zExp' must be 0; in that case, the result
787 | returned is a subnormal number, and it must not require rounding. The
788 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
789 | Floating-Point Arithmetic.
790 *----------------------------------------------------------------------------*/
792 static floatx80
roundAndPackFloatx80(int8_t roundingPrecision
, flag zSign
,
793 int32_t zExp
, uint64_t zSig0
, uint64_t zSig1
,
794 float_status
*status
)
797 flag roundNearestEven
, increment
, isTiny
;
798 int64_t roundIncrement
, roundMask
, roundBits
;
800 roundingMode
= status
->float_rounding_mode
;
801 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
802 if ( roundingPrecision
== 80 ) goto precision80
;
803 if ( roundingPrecision
== 64 ) {
804 roundIncrement
= LIT64( 0x0000000000000400 );
805 roundMask
= LIT64( 0x00000000000007FF );
807 else if ( roundingPrecision
== 32 ) {
808 roundIncrement
= LIT64( 0x0000008000000000 );
809 roundMask
= LIT64( 0x000000FFFFFFFFFF );
814 zSig0
|= ( zSig1
!= 0 );
815 switch (roundingMode
) {
816 case float_round_nearest_even
:
817 case float_round_ties_away
:
819 case float_round_to_zero
:
823 roundIncrement
= zSign
? 0 : roundMask
;
825 case float_round_down
:
826 roundIncrement
= zSign
? roundMask
: 0;
831 roundBits
= zSig0
& roundMask
;
832 if ( 0x7FFD <= (uint32_t) ( zExp
- 1 ) ) {
833 if ( ( 0x7FFE < zExp
)
834 || ( ( zExp
== 0x7FFE ) && ( zSig0
+ roundIncrement
< zSig0
) )
839 if (status
->flush_to_zero
) {
840 float_raise(float_flag_output_denormal
, status
);
841 return packFloatx80(zSign
, 0, 0);
844 (status
->float_detect_tininess
845 == float_tininess_before_rounding
)
847 || ( zSig0
<= zSig0
+ roundIncrement
);
848 shift64RightJamming( zSig0
, 1 - zExp
, &zSig0
);
850 roundBits
= zSig0
& roundMask
;
851 if (isTiny
&& roundBits
) {
852 float_raise(float_flag_underflow
, status
);
855 status
->float_exception_flags
|= float_flag_inexact
;
857 zSig0
+= roundIncrement
;
858 if ( (int64_t) zSig0
< 0 ) zExp
= 1;
859 roundIncrement
= roundMask
+ 1;
860 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
861 roundMask
|= roundIncrement
;
863 zSig0
&= ~ roundMask
;
864 return packFloatx80( zSign
, zExp
, zSig0
);
868 status
->float_exception_flags
|= float_flag_inexact
;
870 zSig0
+= roundIncrement
;
871 if ( zSig0
< roundIncrement
) {
873 zSig0
= LIT64( 0x8000000000000000 );
875 roundIncrement
= roundMask
+ 1;
876 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
877 roundMask
|= roundIncrement
;
879 zSig0
&= ~ roundMask
;
880 if ( zSig0
== 0 ) zExp
= 0;
881 return packFloatx80( zSign
, zExp
, zSig0
);
883 switch (roundingMode
) {
884 case float_round_nearest_even
:
885 case float_round_ties_away
:
886 increment
= ((int64_t)zSig1
< 0);
888 case float_round_to_zero
:
892 increment
= !zSign
&& zSig1
;
894 case float_round_down
:
895 increment
= zSign
&& zSig1
;
900 if ( 0x7FFD <= (uint32_t) ( zExp
- 1 ) ) {
901 if ( ( 0x7FFE < zExp
)
902 || ( ( zExp
== 0x7FFE )
903 && ( zSig0
== LIT64( 0xFFFFFFFFFFFFFFFF ) )
909 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
910 if ( ( roundingMode
== float_round_to_zero
)
911 || ( zSign
&& ( roundingMode
== float_round_up
) )
912 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
914 return packFloatx80( zSign
, 0x7FFE, ~ roundMask
);
916 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
920 (status
->float_detect_tininess
921 == float_tininess_before_rounding
)
924 || ( zSig0
< LIT64( 0xFFFFFFFFFFFFFFFF ) );
925 shift64ExtraRightJamming( zSig0
, zSig1
, 1 - zExp
, &zSig0
, &zSig1
);
927 if (isTiny
&& zSig1
) {
928 float_raise(float_flag_underflow
, status
);
931 status
->float_exception_flags
|= float_flag_inexact
;
933 switch (roundingMode
) {
934 case float_round_nearest_even
:
935 case float_round_ties_away
:
936 increment
= ((int64_t)zSig1
< 0);
938 case float_round_to_zero
:
942 increment
= !zSign
&& zSig1
;
944 case float_round_down
:
945 increment
= zSign
&& zSig1
;
953 ~ ( ( (uint64_t) ( zSig1
<<1 ) == 0 ) & roundNearestEven
);
954 if ( (int64_t) zSig0
< 0 ) zExp
= 1;
956 return packFloatx80( zSign
, zExp
, zSig0
);
960 status
->float_exception_flags
|= float_flag_inexact
;
966 zSig0
= LIT64( 0x8000000000000000 );
969 zSig0
&= ~ ( ( (uint64_t) ( zSig1
<<1 ) == 0 ) & roundNearestEven
);
973 if ( zSig0
== 0 ) zExp
= 0;
975 return packFloatx80( zSign
, zExp
, zSig0
);
979 /*----------------------------------------------------------------------------
980 | Takes an abstract floating-point value having sign `zSign', exponent
981 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
982 | and returns the proper extended double-precision floating-point value
983 | corresponding to the abstract input. This routine is just like
984 | `roundAndPackFloatx80' except that the input significand does not have to be
986 *----------------------------------------------------------------------------*/
988 static floatx80
normalizeRoundAndPackFloatx80(int8_t roundingPrecision
,
989 flag zSign
, int32_t zExp
,
990 uint64_t zSig0
, uint64_t zSig1
,
991 float_status
*status
)
1000 shiftCount
= countLeadingZeros64( zSig0
);
1001 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
1003 return roundAndPackFloatx80(roundingPrecision
, zSign
, zExp
,
1004 zSig0
, zSig1
, status
);
1008 /*----------------------------------------------------------------------------
1009 | Returns the least-significant 64 fraction bits of the quadruple-precision
1010 | floating-point value `a'.
1011 *----------------------------------------------------------------------------*/
1013 static inline uint64_t extractFloat128Frac1( float128 a
)
1020 /*----------------------------------------------------------------------------
1021 | Returns the most-significant 48 fraction bits of the quadruple-precision
1022 | floating-point value `a'.
1023 *----------------------------------------------------------------------------*/
1025 static inline uint64_t extractFloat128Frac0( float128 a
)
1028 return a
.high
& LIT64( 0x0000FFFFFFFFFFFF );
1032 /*----------------------------------------------------------------------------
1033 | Returns the exponent bits of the quadruple-precision floating-point value
1035 *----------------------------------------------------------------------------*/
1037 static inline int32_t extractFloat128Exp( float128 a
)
1040 return ( a
.high
>>48 ) & 0x7FFF;
1044 /*----------------------------------------------------------------------------
1045 | Returns the sign bit of the quadruple-precision floating-point value `a'.
1046 *----------------------------------------------------------------------------*/
1048 static inline flag
extractFloat128Sign( float128 a
)
1055 /*----------------------------------------------------------------------------
1056 | Normalizes the subnormal quadruple-precision floating-point value
1057 | represented by the denormalized significand formed by the concatenation of
1058 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
1059 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
1060 | significand are stored at the location pointed to by `zSig0Ptr', and the
1061 | least significant 64 bits of the normalized significand are stored at the
1062 | location pointed to by `zSig1Ptr'.
1063 *----------------------------------------------------------------------------*/
1066 normalizeFloat128Subnormal(
1077 shiftCount
= countLeadingZeros64( aSig1
) - 15;
1078 if ( shiftCount
< 0 ) {
1079 *zSig0Ptr
= aSig1
>>( - shiftCount
);
1080 *zSig1Ptr
= aSig1
<<( shiftCount
& 63 );
1083 *zSig0Ptr
= aSig1
<<shiftCount
;
1086 *zExpPtr
= - shiftCount
- 63;
1089 shiftCount
= countLeadingZeros64( aSig0
) - 15;
1090 shortShift128Left( aSig0
, aSig1
, shiftCount
, zSig0Ptr
, zSig1Ptr
);
1091 *zExpPtr
= 1 - shiftCount
;
1096 /*----------------------------------------------------------------------------
1097 | Packs the sign `zSign', the exponent `zExp', and the significand formed
1098 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
1099 | floating-point value, returning the result. After being shifted into the
1100 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
1101 | added together to form the most significant 32 bits of the result. This
1102 | means that any integer portion of `zSig0' will be added into the exponent.
1103 | Since a properly normalized significand will have an integer portion equal
1104 | to 1, the `zExp' input should be 1 less than the desired result exponent
1105 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
1107 *----------------------------------------------------------------------------*/
1109 static inline float128
1110 packFloat128( flag zSign
, int32_t zExp
, uint64_t zSig0
, uint64_t zSig1
)
1115 z
.high
= ( ( (uint64_t) zSign
)<<63 ) + ( ( (uint64_t) zExp
)<<48 ) + zSig0
;
1120 /*----------------------------------------------------------------------------
1121 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1122 | and extended significand formed by the concatenation of `zSig0', `zSig1',
1123 | and `zSig2', and returns the proper quadruple-precision floating-point value
1124 | corresponding to the abstract input. Ordinarily, the abstract value is
1125 | simply rounded and packed into the quadruple-precision format, with the
1126 | inexact exception raised if the abstract input cannot be represented
1127 | exactly. However, if the abstract value is too large, the overflow and
1128 | inexact exceptions are raised and an infinity or maximal finite value is
1129 | returned. If the abstract value is too small, the input value is rounded to
1130 | a subnormal number, and the underflow and inexact exceptions are raised if
1131 | the abstract input cannot be represented exactly as a subnormal quadruple-
1132 | precision floating-point number.
1133 | The input significand must be normalized or smaller. If the input
1134 | significand is not normalized, `zExp' must be 0; in that case, the result
1135 | returned is a subnormal number, and it must not require rounding. In the
1136 | usual case that the input significand is normalized, `zExp' must be 1 less
1137 | than the ``true'' floating-point exponent. The handling of underflow and
1138 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1139 *----------------------------------------------------------------------------*/
1141 static float128
roundAndPackFloat128(flag zSign
, int32_t zExp
,
1142 uint64_t zSig0
, uint64_t zSig1
,
1143 uint64_t zSig2
, float_status
*status
)
1145 int8_t roundingMode
;
1146 flag roundNearestEven
, increment
, isTiny
;
1148 roundingMode
= status
->float_rounding_mode
;
1149 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
1150 switch (roundingMode
) {
1151 case float_round_nearest_even
:
1152 case float_round_ties_away
:
1153 increment
= ((int64_t)zSig2
< 0);
1155 case float_round_to_zero
:
1158 case float_round_up
:
1159 increment
= !zSign
&& zSig2
;
1161 case float_round_down
:
1162 increment
= zSign
&& zSig2
;
1164 case float_round_to_odd
:
1165 increment
= !(zSig1
& 0x1) && zSig2
;
1170 if ( 0x7FFD <= (uint32_t) zExp
) {
1171 if ( ( 0x7FFD < zExp
)
1172 || ( ( zExp
== 0x7FFD )
1174 LIT64( 0x0001FFFFFFFFFFFF ),
1175 LIT64( 0xFFFFFFFFFFFFFFFF ),
1182 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
1183 if ( ( roundingMode
== float_round_to_zero
)
1184 || ( zSign
&& ( roundingMode
== float_round_up
) )
1185 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
1186 || (roundingMode
== float_round_to_odd
)
1192 LIT64( 0x0000FFFFFFFFFFFF ),
1193 LIT64( 0xFFFFFFFFFFFFFFFF )
1196 return packFloat128( zSign
, 0x7FFF, 0, 0 );
1199 if (status
->flush_to_zero
) {
1200 float_raise(float_flag_output_denormal
, status
);
1201 return packFloat128(zSign
, 0, 0, 0);
1204 (status
->float_detect_tininess
1205 == float_tininess_before_rounding
)
1211 LIT64( 0x0001FFFFFFFFFFFF ),
1212 LIT64( 0xFFFFFFFFFFFFFFFF )
1214 shift128ExtraRightJamming(
1215 zSig0
, zSig1
, zSig2
, - zExp
, &zSig0
, &zSig1
, &zSig2
);
1217 if (isTiny
&& zSig2
) {
1218 float_raise(float_flag_underflow
, status
);
1220 switch (roundingMode
) {
1221 case float_round_nearest_even
:
1222 case float_round_ties_away
:
1223 increment
= ((int64_t)zSig2
< 0);
1225 case float_round_to_zero
:
1228 case float_round_up
:
1229 increment
= !zSign
&& zSig2
;
1231 case float_round_down
:
1232 increment
= zSign
&& zSig2
;
1234 case float_round_to_odd
:
1235 increment
= !(zSig1
& 0x1) && zSig2
;
1243 status
->float_exception_flags
|= float_flag_inexact
;
1246 add128( zSig0
, zSig1
, 0, 1, &zSig0
, &zSig1
);
1247 zSig1
&= ~ ( ( zSig2
+ zSig2
== 0 ) & roundNearestEven
);
1250 if ( ( zSig0
| zSig1
) == 0 ) zExp
= 0;
1252 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
1256 /*----------------------------------------------------------------------------
1257 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1258 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1259 | returns the proper quadruple-precision floating-point value corresponding
1260 | to the abstract input. This routine is just like `roundAndPackFloat128'
1261 | except that the input significand has fewer bits and does not have to be
1262 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1264 *----------------------------------------------------------------------------*/
1266 static float128
normalizeRoundAndPackFloat128(flag zSign
, int32_t zExp
,
1267 uint64_t zSig0
, uint64_t zSig1
,
1268 float_status
*status
)
1278 shiftCount
= countLeadingZeros64( zSig0
) - 15;
1279 if ( 0 <= shiftCount
) {
1281 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
1284 shift128ExtraRightJamming(
1285 zSig0
, zSig1
, 0, - shiftCount
, &zSig0
, &zSig1
, &zSig2
);
1288 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
1292 /*----------------------------------------------------------------------------
1293 | Returns the result of converting the 32-bit two's complement integer `a'
1294 | to the single-precision floating-point format. The conversion is performed
1295 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1296 *----------------------------------------------------------------------------*/
1298 float32
int32_to_float32(int32_t a
, float_status
*status
)
1302 if ( a
== 0 ) return float32_zero
;
1303 if ( a
== (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1305 return normalizeRoundAndPackFloat32(zSign
, 0x9C, zSign
? -a
: a
, status
);
1308 /*----------------------------------------------------------------------------
1309 | Returns the result of converting the 32-bit two's complement integer `a'
1310 | to the double-precision floating-point format. The conversion is performed
1311 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1312 *----------------------------------------------------------------------------*/
1314 float64
int32_to_float64(int32_t a
, float_status
*status
)
1321 if ( a
== 0 ) return float64_zero
;
1323 absA
= zSign
? - a
: a
;
1324 shiftCount
= countLeadingZeros32( absA
) + 21;
1326 return packFloat64( zSign
, 0x432 - shiftCount
, zSig
<<shiftCount
);
1330 /*----------------------------------------------------------------------------
1331 | Returns the result of converting the 32-bit two's complement integer `a'
1332 | to the extended double-precision floating-point format. The conversion
1333 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1335 *----------------------------------------------------------------------------*/
1337 floatx80
int32_to_floatx80(int32_t a
, float_status
*status
)
1344 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
1346 absA
= zSign
? - a
: a
;
1347 shiftCount
= countLeadingZeros32( absA
) + 32;
1349 return packFloatx80( zSign
, 0x403E - shiftCount
, zSig
<<shiftCount
);
1353 /*----------------------------------------------------------------------------
1354 | Returns the result of converting the 32-bit two's complement integer `a' to
1355 | the quadruple-precision floating-point format. The conversion is performed
1356 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1357 *----------------------------------------------------------------------------*/
1359 float128
int32_to_float128(int32_t a
, float_status
*status
)
1366 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
1368 absA
= zSign
? - a
: a
;
1369 shiftCount
= countLeadingZeros32( absA
) + 17;
1371 return packFloat128( zSign
, 0x402E - shiftCount
, zSig0
<<shiftCount
, 0 );
1375 /*----------------------------------------------------------------------------
1376 | Returns the result of converting the 64-bit two's complement integer `a'
1377 | to the single-precision floating-point format. The conversion is performed
1378 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1379 *----------------------------------------------------------------------------*/
1381 float32
int64_to_float32(int64_t a
, float_status
*status
)
1387 if ( a
== 0 ) return float32_zero
;
1389 absA
= zSign
? - a
: a
;
1390 shiftCount
= countLeadingZeros64( absA
) - 40;
1391 if ( 0 <= shiftCount
) {
1392 return packFloat32( zSign
, 0x95 - shiftCount
, absA
<<shiftCount
);
1396 if ( shiftCount
< 0 ) {
1397 shift64RightJamming( absA
, - shiftCount
, &absA
);
1400 absA
<<= shiftCount
;
1402 return roundAndPackFloat32(zSign
, 0x9C - shiftCount
, absA
, status
);
1407 /*----------------------------------------------------------------------------
1408 | Returns the result of converting the 64-bit two's complement integer `a'
1409 | to the double-precision floating-point format. The conversion is performed
1410 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1411 *----------------------------------------------------------------------------*/
1413 float64
int64_to_float64(int64_t a
, float_status
*status
)
1417 if ( a
== 0 ) return float64_zero
;
1418 if ( a
== (int64_t) LIT64( 0x8000000000000000 ) ) {
1419 return packFloat64( 1, 0x43E, 0 );
1422 return normalizeRoundAndPackFloat64(zSign
, 0x43C, zSign
? -a
: a
, status
);
1425 /*----------------------------------------------------------------------------
1426 | Returns the result of converting the 64-bit two's complement integer `a'
1427 | to the extended double-precision floating-point format. The conversion
1428 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1430 *----------------------------------------------------------------------------*/
1432 floatx80
int64_to_floatx80(int64_t a
, float_status
*status
)
1438 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
1440 absA
= zSign
? - a
: a
;
1441 shiftCount
= countLeadingZeros64( absA
);
1442 return packFloatx80( zSign
, 0x403E - shiftCount
, absA
<<shiftCount
);
1446 /*----------------------------------------------------------------------------
1447 | Returns the result of converting the 64-bit two's complement integer `a' to
1448 | the quadruple-precision floating-point format. The conversion is performed
1449 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1450 *----------------------------------------------------------------------------*/
1452 float128
int64_to_float128(int64_t a
, float_status
*status
)
1458 uint64_t zSig0
, zSig1
;
1460 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
1462 absA
= zSign
? - a
: a
;
1463 shiftCount
= countLeadingZeros64( absA
) + 49;
1464 zExp
= 0x406E - shiftCount
;
1465 if ( 64 <= shiftCount
) {
1474 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
1475 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
1479 /*----------------------------------------------------------------------------
1480 | Returns the result of converting the 64-bit unsigned integer `a'
1481 | to the single-precision floating-point format. The conversion is performed
1482 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1483 *----------------------------------------------------------------------------*/
1485 float32
uint64_to_float32(uint64_t a
, float_status
*status
)
1490 return float32_zero
;
1493 /* Determine (left) shift needed to put first set bit into bit posn 23
1494 * (since packFloat32() expects the binary point between bits 23 and 22);
1495 * this is the fast case for smallish numbers.
1497 shiftcount
= countLeadingZeros64(a
) - 40;
1498 if (shiftcount
>= 0) {
1499 return packFloat32(0, 0x95 - shiftcount
, a
<< shiftcount
);
1501 /* Otherwise we need to do a round-and-pack. roundAndPackFloat32()
1502 * expects the binary point between bits 30 and 29, hence the + 7.
1505 if (shiftcount
< 0) {
1506 shift64RightJamming(a
, -shiftcount
, &a
);
1511 return roundAndPackFloat32(0, 0x9c - shiftcount
, a
, status
);
1514 /*----------------------------------------------------------------------------
1515 | Returns the result of converting the 64-bit unsigned integer `a'
1516 | to the double-precision floating-point format. The conversion is performed
1517 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1518 *----------------------------------------------------------------------------*/
1520 float64
uint64_to_float64(uint64_t a
, float_status
*status
)
1526 return float64_zero
;
1529 shiftcount
= countLeadingZeros64(a
) - 1;
1530 if (shiftcount
< 0) {
1531 shift64RightJamming(a
, -shiftcount
, &a
);
1535 return roundAndPackFloat64(0, exp
- shiftcount
, a
, status
);
1538 /*----------------------------------------------------------------------------
1539 | Returns the result of converting the 64-bit unsigned integer `a'
1540 | to the quadruple-precision floating-point format. The conversion is performed
1541 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1542 *----------------------------------------------------------------------------*/
1544 float128
uint64_to_float128(uint64_t a
, float_status
*status
)
1547 return float128_zero
;
1549 return normalizeRoundAndPackFloat128(0, 0x406E, a
, 0, status
);
1552 /*----------------------------------------------------------------------------
1553 | Returns the result of converting the single-precision floating-point value
1554 | `a' to the 32-bit two's complement integer format. The conversion is
1555 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1556 | Arithmetic---which means in particular that the conversion is rounded
1557 | according to the current rounding mode. If `a' is a NaN, the largest
1558 | positive integer is returned. Otherwise, if the conversion overflows, the
1559 | largest integer with the same sign as `a' is returned.
1560 *----------------------------------------------------------------------------*/
1562 int32_t float32_to_int32(float32 a
, float_status
*status
)
1570 a
= float32_squash_input_denormal(a
, status
);
1571 aSig
= extractFloat32Frac( a
);
1572 aExp
= extractFloat32Exp( a
);
1573 aSign
= extractFloat32Sign( a
);
1574 if ( ( aExp
== 0xFF ) && aSig
) aSign
= 0;
1575 if ( aExp
) aSig
|= 0x00800000;
1576 shiftCount
= 0xAF - aExp
;
1579 if ( 0 < shiftCount
) shift64RightJamming( aSig64
, shiftCount
, &aSig64
);
1580 return roundAndPackInt32(aSign
, aSig64
, status
);
1584 /*----------------------------------------------------------------------------
1585 | Returns the result of converting the single-precision floating-point value
1586 | `a' to the 32-bit two's complement integer format. The conversion is
1587 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1588 | Arithmetic, except that the conversion is always rounded toward zero.
1589 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1590 | the conversion overflows, the largest integer with the same sign as `a' is
1592 *----------------------------------------------------------------------------*/
1594 int32_t float32_to_int32_round_to_zero(float32 a
, float_status
*status
)
1601 a
= float32_squash_input_denormal(a
, status
);
1603 aSig
= extractFloat32Frac( a
);
1604 aExp
= extractFloat32Exp( a
);
1605 aSign
= extractFloat32Sign( a
);
1606 shiftCount
= aExp
- 0x9E;
1607 if ( 0 <= shiftCount
) {
1608 if ( float32_val(a
) != 0xCF000000 ) {
1609 float_raise(float_flag_invalid
, status
);
1610 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) return 0x7FFFFFFF;
1612 return (int32_t) 0x80000000;
1614 else if ( aExp
<= 0x7E ) {
1616 status
->float_exception_flags
|= float_flag_inexact
;
1620 aSig
= ( aSig
| 0x00800000 )<<8;
1621 z
= aSig
>>( - shiftCount
);
1622 if ( (uint32_t) ( aSig
<<( shiftCount
& 31 ) ) ) {
1623 status
->float_exception_flags
|= float_flag_inexact
;
1625 if ( aSign
) z
= - z
;
1630 /*----------------------------------------------------------------------------
1631 | Returns the result of converting the single-precision floating-point value
1632 | `a' to the 16-bit two's complement integer format. The conversion is
1633 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1634 | Arithmetic, except that the conversion is always rounded toward zero.
1635 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1636 | the conversion overflows, the largest integer with the same sign as `a' is
1638 *----------------------------------------------------------------------------*/
1640 int16_t float32_to_int16_round_to_zero(float32 a
, float_status
*status
)
1648 aSig
= extractFloat32Frac( a
);
1649 aExp
= extractFloat32Exp( a
);
1650 aSign
= extractFloat32Sign( a
);
1651 shiftCount
= aExp
- 0x8E;
1652 if ( 0 <= shiftCount
) {
1653 if ( float32_val(a
) != 0xC7000000 ) {
1654 float_raise(float_flag_invalid
, status
);
1655 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) {
1659 return (int32_t) 0xffff8000;
1661 else if ( aExp
<= 0x7E ) {
1662 if ( aExp
| aSig
) {
1663 status
->float_exception_flags
|= float_flag_inexact
;
1668 aSig
= ( aSig
| 0x00800000 )<<8;
1669 z
= aSig
>>( - shiftCount
);
1670 if ( (uint32_t) ( aSig
<<( shiftCount
& 31 ) ) ) {
1671 status
->float_exception_flags
|= float_flag_inexact
;
1680 /*----------------------------------------------------------------------------
1681 | Returns the result of converting the single-precision floating-point value
1682 | `a' to the 64-bit two's complement integer format. The conversion is
1683 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1684 | Arithmetic---which means in particular that the conversion is rounded
1685 | according to the current rounding mode. If `a' is a NaN, the largest
1686 | positive integer is returned. Otherwise, if the conversion overflows, the
1687 | largest integer with the same sign as `a' is returned.
1688 *----------------------------------------------------------------------------*/
1690 int64_t float32_to_int64(float32 a
, float_status
*status
)
1696 uint64_t aSig64
, aSigExtra
;
1697 a
= float32_squash_input_denormal(a
, status
);
1699 aSig
= extractFloat32Frac( a
);
1700 aExp
= extractFloat32Exp( a
);
1701 aSign
= extractFloat32Sign( a
);
1702 shiftCount
= 0xBE - aExp
;
1703 if ( shiftCount
< 0 ) {
1704 float_raise(float_flag_invalid
, status
);
1705 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) {
1706 return LIT64( 0x7FFFFFFFFFFFFFFF );
1708 return (int64_t) LIT64( 0x8000000000000000 );
1710 if ( aExp
) aSig
|= 0x00800000;
1713 shift64ExtraRightJamming( aSig64
, 0, shiftCount
, &aSig64
, &aSigExtra
);
1714 return roundAndPackInt64(aSign
, aSig64
, aSigExtra
, status
);
1718 /*----------------------------------------------------------------------------
1719 | Returns the result of converting the single-precision floating-point value
1720 | `a' to the 64-bit unsigned integer format. The conversion is
1721 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1722 | Arithmetic---which means in particular that the conversion is rounded
1723 | according to the current rounding mode. If `a' is a NaN, the largest
1724 | unsigned integer is returned. Otherwise, if the conversion overflows, the
1725 | largest unsigned integer is returned. If the 'a' is negative, the result
1726 | is rounded and zero is returned; values that do not round to zero will
1727 | raise the inexact exception flag.
1728 *----------------------------------------------------------------------------*/
1730 uint64_t float32_to_uint64(float32 a
, float_status
*status
)
1736 uint64_t aSig64
, aSigExtra
;
1737 a
= float32_squash_input_denormal(a
, status
);
1739 aSig
= extractFloat32Frac(a
);
1740 aExp
= extractFloat32Exp(a
);
1741 aSign
= extractFloat32Sign(a
);
1742 if ((aSign
) && (aExp
> 126)) {
1743 float_raise(float_flag_invalid
, status
);
1744 if (float32_is_any_nan(a
)) {
1745 return LIT64(0xFFFFFFFFFFFFFFFF);
1750 shiftCount
= 0xBE - aExp
;
1754 if (shiftCount
< 0) {
1755 float_raise(float_flag_invalid
, status
);
1756 return LIT64(0xFFFFFFFFFFFFFFFF);
1761 shift64ExtraRightJamming(aSig64
, 0, shiftCount
, &aSig64
, &aSigExtra
);
1762 return roundAndPackUint64(aSign
, aSig64
, aSigExtra
, status
);
1765 /*----------------------------------------------------------------------------
1766 | Returns the result of converting the single-precision floating-point value
1767 | `a' to the 64-bit unsigned integer format. The conversion is
1768 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1769 | Arithmetic, except that the conversion is always rounded toward zero. If
1770 | `a' is a NaN, the largest unsigned integer is returned. Otherwise, if the
1771 | conversion overflows, the largest unsigned integer is returned. If the
1772 | 'a' is negative, the result is rounded and zero is returned; values that do
1773 | not round to zero will raise the inexact flag.
1774 *----------------------------------------------------------------------------*/
1776 uint64_t float32_to_uint64_round_to_zero(float32 a
, float_status
*status
)
1778 signed char current_rounding_mode
= status
->float_rounding_mode
;
1779 set_float_rounding_mode(float_round_to_zero
, status
);
1780 int64_t v
= float32_to_uint64(a
, status
);
1781 set_float_rounding_mode(current_rounding_mode
, status
);
1785 /*----------------------------------------------------------------------------
1786 | Returns the result of converting the single-precision floating-point value
1787 | `a' to the 64-bit two's complement integer format. The conversion is
1788 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1789 | Arithmetic, except that the conversion is always rounded toward zero. If
1790 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1791 | conversion overflows, the largest integer with the same sign as `a' is
1793 *----------------------------------------------------------------------------*/
1795 int64_t float32_to_int64_round_to_zero(float32 a
, float_status
*status
)
1803 a
= float32_squash_input_denormal(a
, status
);
1805 aSig
= extractFloat32Frac( a
);
1806 aExp
= extractFloat32Exp( a
);
1807 aSign
= extractFloat32Sign( a
);
1808 shiftCount
= aExp
- 0xBE;
1809 if ( 0 <= shiftCount
) {
1810 if ( float32_val(a
) != 0xDF000000 ) {
1811 float_raise(float_flag_invalid
, status
);
1812 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) {
1813 return LIT64( 0x7FFFFFFFFFFFFFFF );
1816 return (int64_t) LIT64( 0x8000000000000000 );
1818 else if ( aExp
<= 0x7E ) {
1820 status
->float_exception_flags
|= float_flag_inexact
;
1824 aSig64
= aSig
| 0x00800000;
1826 z
= aSig64
>>( - shiftCount
);
1827 if ( (uint64_t) ( aSig64
<<( shiftCount
& 63 ) ) ) {
1828 status
->float_exception_flags
|= float_flag_inexact
;
1830 if ( aSign
) z
= - z
;
1835 /*----------------------------------------------------------------------------
1836 | Returns the result of converting the single-precision floating-point value
1837 | `a' to the double-precision floating-point format. The conversion is
1838 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1840 *----------------------------------------------------------------------------*/
1842 float64
float32_to_float64(float32 a
, float_status
*status
)
1847 a
= float32_squash_input_denormal(a
, status
);
1849 aSig
= extractFloat32Frac( a
);
1850 aExp
= extractFloat32Exp( a
);
1851 aSign
= extractFloat32Sign( a
);
1852 if ( aExp
== 0xFF ) {
1854 return commonNaNToFloat64(float32ToCommonNaN(a
, status
), status
);
1856 return packFloat64( aSign
, 0x7FF, 0 );
1859 if ( aSig
== 0 ) return packFloat64( aSign
, 0, 0 );
1860 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1863 return packFloat64( aSign
, aExp
+ 0x380, ( (uint64_t) aSig
)<<29 );
1867 /*----------------------------------------------------------------------------
1868 | Returns the result of converting the single-precision floating-point value
1869 | `a' to the extended double-precision floating-point format. The conversion
1870 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1872 *----------------------------------------------------------------------------*/
1874 floatx80
float32_to_floatx80(float32 a
, float_status
*status
)
1880 a
= float32_squash_input_denormal(a
, status
);
1881 aSig
= extractFloat32Frac( a
);
1882 aExp
= extractFloat32Exp( a
);
1883 aSign
= extractFloat32Sign( a
);
1884 if ( aExp
== 0xFF ) {
1886 return commonNaNToFloatx80(float32ToCommonNaN(a
, status
), status
);
1888 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
1891 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
1892 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1895 return packFloatx80( aSign
, aExp
+ 0x3F80, ( (uint64_t) aSig
)<<40 );
1899 /*----------------------------------------------------------------------------
1900 | Returns the result of converting the single-precision floating-point value
1901 | `a' to the double-precision floating-point format. The conversion is
1902 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1904 *----------------------------------------------------------------------------*/
1906 float128
float32_to_float128(float32 a
, float_status
*status
)
1912 a
= float32_squash_input_denormal(a
, status
);
1913 aSig
= extractFloat32Frac( a
);
1914 aExp
= extractFloat32Exp( a
);
1915 aSign
= extractFloat32Sign( a
);
1916 if ( aExp
== 0xFF ) {
1918 return commonNaNToFloat128(float32ToCommonNaN(a
, status
), status
);
1920 return packFloat128( aSign
, 0x7FFF, 0, 0 );
1923 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
1924 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1927 return packFloat128( aSign
, aExp
+ 0x3F80, ( (uint64_t) aSig
)<<25, 0 );
1931 /*----------------------------------------------------------------------------
1932 | Rounds the single-precision floating-point value `a' to an integer, and
1933 | returns the result as a single-precision floating-point value. The
1934 | operation is performed according to the IEC/IEEE Standard for Binary
1935 | Floating-Point Arithmetic.
1936 *----------------------------------------------------------------------------*/
1938 float32
float32_round_to_int(float32 a
, float_status
*status
)
1942 uint32_t lastBitMask
, roundBitsMask
;
1944 a
= float32_squash_input_denormal(a
, status
);
1946 aExp
= extractFloat32Exp( a
);
1947 if ( 0x96 <= aExp
) {
1948 if ( ( aExp
== 0xFF ) && extractFloat32Frac( a
) ) {
1949 return propagateFloat32NaN(a
, a
, status
);
1953 if ( aExp
<= 0x7E ) {
1954 if ( (uint32_t) ( float32_val(a
)<<1 ) == 0 ) return a
;
1955 status
->float_exception_flags
|= float_flag_inexact
;
1956 aSign
= extractFloat32Sign( a
);
1957 switch (status
->float_rounding_mode
) {
1958 case float_round_nearest_even
:
1959 if ( ( aExp
== 0x7E ) && extractFloat32Frac( a
) ) {
1960 return packFloat32( aSign
, 0x7F, 0 );
1963 case float_round_ties_away
:
1965 return packFloat32(aSign
, 0x7F, 0);
1968 case float_round_down
:
1969 return make_float32(aSign
? 0xBF800000 : 0);
1970 case float_round_up
:
1971 return make_float32(aSign
? 0x80000000 : 0x3F800000);
1973 return packFloat32( aSign
, 0, 0 );
1976 lastBitMask
<<= 0x96 - aExp
;
1977 roundBitsMask
= lastBitMask
- 1;
1979 switch (status
->float_rounding_mode
) {
1980 case float_round_nearest_even
:
1981 z
+= lastBitMask
>>1;
1982 if ((z
& roundBitsMask
) == 0) {
1986 case float_round_ties_away
:
1987 z
+= lastBitMask
>> 1;
1989 case float_round_to_zero
:
1991 case float_round_up
:
1992 if (!extractFloat32Sign(make_float32(z
))) {
1996 case float_round_down
:
1997 if (extractFloat32Sign(make_float32(z
))) {
2004 z
&= ~ roundBitsMask
;
2005 if (z
!= float32_val(a
)) {
2006 status
->float_exception_flags
|= float_flag_inexact
;
2008 return make_float32(z
);
2012 /*----------------------------------------------------------------------------
2013 | Returns the result of adding the absolute values of the single-precision
2014 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
2015 | before being returned. `zSign' is ignored if the result is a NaN.
2016 | The addition is performed according to the IEC/IEEE Standard for Binary
2017 | Floating-Point Arithmetic.
2018 *----------------------------------------------------------------------------*/
2020 static float32
addFloat32Sigs(float32 a
, float32 b
, flag zSign
,
2021 float_status
*status
)
2023 int aExp
, bExp
, zExp
;
2024 uint32_t aSig
, bSig
, zSig
;
2027 aSig
= extractFloat32Frac( a
);
2028 aExp
= extractFloat32Exp( a
);
2029 bSig
= extractFloat32Frac( b
);
2030 bExp
= extractFloat32Exp( b
);
2031 expDiff
= aExp
- bExp
;
2034 if ( 0 < expDiff
) {
2035 if ( aExp
== 0xFF ) {
2037 return propagateFloat32NaN(a
, b
, status
);
2047 shift32RightJamming( bSig
, expDiff
, &bSig
);
2050 else if ( expDiff
< 0 ) {
2051 if ( bExp
== 0xFF ) {
2053 return propagateFloat32NaN(a
, b
, status
);
2055 return packFloat32( zSign
, 0xFF, 0 );
2063 shift32RightJamming( aSig
, - expDiff
, &aSig
);
2067 if ( aExp
== 0xFF ) {
2069 return propagateFloat32NaN(a
, b
, status
);
2074 if (status
->flush_to_zero
) {
2076 float_raise(float_flag_output_denormal
, status
);
2078 return packFloat32(zSign
, 0, 0);
2080 return packFloat32( zSign
, 0, ( aSig
+ bSig
)>>6 );
2082 zSig
= 0x40000000 + aSig
+ bSig
;
2087 zSig
= ( aSig
+ bSig
)<<1;
2089 if ( (int32_t) zSig
< 0 ) {
2094 return roundAndPackFloat32(zSign
, zExp
, zSig
, status
);
2098 /*----------------------------------------------------------------------------
2099 | Returns the result of subtracting the absolute values of the single-
2100 | precision floating-point values `a' and `b'. If `zSign' is 1, the
2101 | difference is negated before being returned. `zSign' is ignored if the
2102 | result is a NaN. The subtraction is performed according to the IEC/IEEE
2103 | Standard for Binary Floating-Point Arithmetic.
2104 *----------------------------------------------------------------------------*/
2106 static float32
subFloat32Sigs(float32 a
, float32 b
, flag zSign
,
2107 float_status
*status
)
2109 int aExp
, bExp
, zExp
;
2110 uint32_t aSig
, bSig
, zSig
;
2113 aSig
= extractFloat32Frac( a
);
2114 aExp
= extractFloat32Exp( a
);
2115 bSig
= extractFloat32Frac( b
);
2116 bExp
= extractFloat32Exp( b
);
2117 expDiff
= aExp
- bExp
;
2120 if ( 0 < expDiff
) goto aExpBigger
;
2121 if ( expDiff
< 0 ) goto bExpBigger
;
2122 if ( aExp
== 0xFF ) {
2124 return propagateFloat32NaN(a
, b
, status
);
2126 float_raise(float_flag_invalid
, status
);
2127 return float32_default_nan(status
);
2133 if ( bSig
< aSig
) goto aBigger
;
2134 if ( aSig
< bSig
) goto bBigger
;
2135 return packFloat32(status
->float_rounding_mode
== float_round_down
, 0, 0);
2137 if ( bExp
== 0xFF ) {
2139 return propagateFloat32NaN(a
, b
, status
);
2141 return packFloat32( zSign
^ 1, 0xFF, 0 );
2149 shift32RightJamming( aSig
, - expDiff
, &aSig
);
2155 goto normalizeRoundAndPack
;
2157 if ( aExp
== 0xFF ) {
2159 return propagateFloat32NaN(a
, b
, status
);
2169 shift32RightJamming( bSig
, expDiff
, &bSig
);
2174 normalizeRoundAndPack
:
2176 return normalizeRoundAndPackFloat32(zSign
, zExp
, zSig
, status
);
2180 /*----------------------------------------------------------------------------
2181 | Returns the result of adding the single-precision floating-point values `a'
2182 | and `b'. The operation is performed according to the IEC/IEEE Standard for
2183 | Binary Floating-Point Arithmetic.
2184 *----------------------------------------------------------------------------*/
2186 float32
float32_add(float32 a
, float32 b
, float_status
*status
)
2189 a
= float32_squash_input_denormal(a
, status
);
2190 b
= float32_squash_input_denormal(b
, status
);
2192 aSign
= extractFloat32Sign( a
);
2193 bSign
= extractFloat32Sign( b
);
2194 if ( aSign
== bSign
) {
2195 return addFloat32Sigs(a
, b
, aSign
, status
);
2198 return subFloat32Sigs(a
, b
, aSign
, status
);
2203 /*----------------------------------------------------------------------------
2204 | Returns the result of subtracting the single-precision floating-point values
2205 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2206 | for Binary Floating-Point Arithmetic.
2207 *----------------------------------------------------------------------------*/
2209 float32
float32_sub(float32 a
, float32 b
, float_status
*status
)
2212 a
= float32_squash_input_denormal(a
, status
);
2213 b
= float32_squash_input_denormal(b
, status
);
2215 aSign
= extractFloat32Sign( a
);
2216 bSign
= extractFloat32Sign( b
);
2217 if ( aSign
== bSign
) {
2218 return subFloat32Sigs(a
, b
, aSign
, status
);
2221 return addFloat32Sigs(a
, b
, aSign
, status
);
2226 /*----------------------------------------------------------------------------
2227 | Returns the result of multiplying the single-precision floating-point values
2228 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2229 | for Binary Floating-Point Arithmetic.
2230 *----------------------------------------------------------------------------*/
2232 float32
float32_mul(float32 a
, float32 b
, float_status
*status
)
2234 flag aSign
, bSign
, zSign
;
2235 int aExp
, bExp
, zExp
;
2236 uint32_t aSig
, bSig
;
2240 a
= float32_squash_input_denormal(a
, status
);
2241 b
= float32_squash_input_denormal(b
, status
);
2243 aSig
= extractFloat32Frac( a
);
2244 aExp
= extractFloat32Exp( a
);
2245 aSign
= extractFloat32Sign( a
);
2246 bSig
= extractFloat32Frac( b
);
2247 bExp
= extractFloat32Exp( b
);
2248 bSign
= extractFloat32Sign( b
);
2249 zSign
= aSign
^ bSign
;
2250 if ( aExp
== 0xFF ) {
2251 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
2252 return propagateFloat32NaN(a
, b
, status
);
2254 if ( ( bExp
| bSig
) == 0 ) {
2255 float_raise(float_flag_invalid
, status
);
2256 return float32_default_nan(status
);
2258 return packFloat32( zSign
, 0xFF, 0 );
2260 if ( bExp
== 0xFF ) {
2262 return propagateFloat32NaN(a
, b
, status
);
2264 if ( ( aExp
| aSig
) == 0 ) {
2265 float_raise(float_flag_invalid
, status
);
2266 return float32_default_nan(status
);
2268 return packFloat32( zSign
, 0xFF, 0 );
2271 if ( aSig
== 0 ) return packFloat32( zSign
, 0, 0 );
2272 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
2275 if ( bSig
== 0 ) return packFloat32( zSign
, 0, 0 );
2276 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
2278 zExp
= aExp
+ bExp
- 0x7F;
2279 aSig
= ( aSig
| 0x00800000 )<<7;
2280 bSig
= ( bSig
| 0x00800000 )<<8;
2281 shift64RightJamming( ( (uint64_t) aSig
) * bSig
, 32, &zSig64
);
2283 if ( 0 <= (int32_t) ( zSig
<<1 ) ) {
2287 return roundAndPackFloat32(zSign
, zExp
, zSig
, status
);
2291 /*----------------------------------------------------------------------------
2292 | Returns the result of dividing the single-precision floating-point value `a'
2293 | by the corresponding value `b'. The operation is performed according to the
2294 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2295 *----------------------------------------------------------------------------*/
2297 float32
float32_div(float32 a
, float32 b
, float_status
*status
)
2299 flag aSign
, bSign
, zSign
;
2300 int aExp
, bExp
, zExp
;
2301 uint32_t aSig
, bSig
, zSig
;
2302 a
= float32_squash_input_denormal(a
, status
);
2303 b
= float32_squash_input_denormal(b
, status
);
2305 aSig
= extractFloat32Frac( a
);
2306 aExp
= extractFloat32Exp( a
);
2307 aSign
= extractFloat32Sign( a
);
2308 bSig
= extractFloat32Frac( b
);
2309 bExp
= extractFloat32Exp( b
);
2310 bSign
= extractFloat32Sign( b
);
2311 zSign
= aSign
^ bSign
;
2312 if ( aExp
== 0xFF ) {
2314 return propagateFloat32NaN(a
, b
, status
);
2316 if ( bExp
== 0xFF ) {
2318 return propagateFloat32NaN(a
, b
, status
);
2320 float_raise(float_flag_invalid
, status
);
2321 return float32_default_nan(status
);
2323 return packFloat32( zSign
, 0xFF, 0 );
2325 if ( bExp
== 0xFF ) {
2327 return propagateFloat32NaN(a
, b
, status
);
2329 return packFloat32( zSign
, 0, 0 );
2333 if ( ( aExp
| aSig
) == 0 ) {
2334 float_raise(float_flag_invalid
, status
);
2335 return float32_default_nan(status
);
2337 float_raise(float_flag_divbyzero
, status
);
2338 return packFloat32( zSign
, 0xFF, 0 );
2340 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
2343 if ( aSig
== 0 ) return packFloat32( zSign
, 0, 0 );
2344 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
2346 zExp
= aExp
- bExp
+ 0x7D;
2347 aSig
= ( aSig
| 0x00800000 )<<7;
2348 bSig
= ( bSig
| 0x00800000 )<<8;
2349 if ( bSig
<= ( aSig
+ aSig
) ) {
2353 zSig
= ( ( (uint64_t) aSig
)<<32 ) / bSig
;
2354 if ( ( zSig
& 0x3F ) == 0 ) {
2355 zSig
|= ( (uint64_t) bSig
* zSig
!= ( (uint64_t) aSig
)<<32 );
2357 return roundAndPackFloat32(zSign
, zExp
, zSig
, status
);
2361 /*----------------------------------------------------------------------------
2362 | Returns the remainder of the single-precision floating-point value `a'
2363 | with respect to the corresponding value `b'. The operation is performed
2364 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2365 *----------------------------------------------------------------------------*/
2367 float32
float32_rem(float32 a
, float32 b
, float_status
*status
)
2370 int aExp
, bExp
, expDiff
;
2371 uint32_t aSig
, bSig
;
2373 uint64_t aSig64
, bSig64
, q64
;
2374 uint32_t alternateASig
;
2376 a
= float32_squash_input_denormal(a
, status
);
2377 b
= float32_squash_input_denormal(b
, status
);
2379 aSig
= extractFloat32Frac( a
);
2380 aExp
= extractFloat32Exp( a
);
2381 aSign
= extractFloat32Sign( a
);
2382 bSig
= extractFloat32Frac( b
);
2383 bExp
= extractFloat32Exp( b
);
2384 if ( aExp
== 0xFF ) {
2385 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
2386 return propagateFloat32NaN(a
, b
, status
);
2388 float_raise(float_flag_invalid
, status
);
2389 return float32_default_nan(status
);
2391 if ( bExp
== 0xFF ) {
2393 return propagateFloat32NaN(a
, b
, status
);
2399 float_raise(float_flag_invalid
, status
);
2400 return float32_default_nan(status
);
2402 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
2405 if ( aSig
== 0 ) return a
;
2406 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
2408 expDiff
= aExp
- bExp
;
2411 if ( expDiff
< 32 ) {
2414 if ( expDiff
< 0 ) {
2415 if ( expDiff
< -1 ) return a
;
2418 q
= ( bSig
<= aSig
);
2419 if ( q
) aSig
-= bSig
;
2420 if ( 0 < expDiff
) {
2421 q
= ( ( (uint64_t) aSig
)<<32 ) / bSig
;
2424 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
2432 if ( bSig
<= aSig
) aSig
-= bSig
;
2433 aSig64
= ( (uint64_t) aSig
)<<40;
2434 bSig64
= ( (uint64_t) bSig
)<<40;
2436 while ( 0 < expDiff
) {
2437 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
2438 q64
= ( 2 < q64
) ? q64
- 2 : 0;
2439 aSig64
= - ( ( bSig
* q64
)<<38 );
2443 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
2444 q64
= ( 2 < q64
) ? q64
- 2 : 0;
2445 q
= q64
>>( 64 - expDiff
);
2447 aSig
= ( ( aSig64
>>33 )<<( expDiff
- 1 ) ) - bSig
* q
;
2450 alternateASig
= aSig
;
2453 } while ( 0 <= (int32_t) aSig
);
2454 sigMean
= aSig
+ alternateASig
;
2455 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
2456 aSig
= alternateASig
;
2458 zSign
= ( (int32_t) aSig
< 0 );
2459 if ( zSign
) aSig
= - aSig
;
2460 return normalizeRoundAndPackFloat32(aSign
^ zSign
, bExp
, aSig
, status
);
2463 /*----------------------------------------------------------------------------
2464 | Returns the result of multiplying the single-precision floating-point values
2465 | `a' and `b' then adding 'c', with no intermediate rounding step after the
2466 | multiplication. The operation is performed according to the IEC/IEEE
2467 | Standard for Binary Floating-Point Arithmetic 754-2008.
2468 | The flags argument allows the caller to select negation of the
2469 | addend, the intermediate product, or the final result. (The difference
2470 | between this and having the caller do a separate negation is that negating
2471 | externally will flip the sign bit on NaNs.)
2472 *----------------------------------------------------------------------------*/
2474 float32
float32_muladd(float32 a
, float32 b
, float32 c
, int flags
,
2475 float_status
*status
)
2477 flag aSign
, bSign
, cSign
, zSign
;
2478 int aExp
, bExp
, cExp
, pExp
, zExp
, expDiff
;
2479 uint32_t aSig
, bSig
, cSig
;
2480 flag pInf
, pZero
, pSign
;
2481 uint64_t pSig64
, cSig64
, zSig64
;
2484 flag signflip
, infzero
;
2486 a
= float32_squash_input_denormal(a
, status
);
2487 b
= float32_squash_input_denormal(b
, status
);
2488 c
= float32_squash_input_denormal(c
, status
);
2489 aSig
= extractFloat32Frac(a
);
2490 aExp
= extractFloat32Exp(a
);
2491 aSign
= extractFloat32Sign(a
);
2492 bSig
= extractFloat32Frac(b
);
2493 bExp
= extractFloat32Exp(b
);
2494 bSign
= extractFloat32Sign(b
);
2495 cSig
= extractFloat32Frac(c
);
2496 cExp
= extractFloat32Exp(c
);
2497 cSign
= extractFloat32Sign(c
);
2499 infzero
= ((aExp
== 0 && aSig
== 0 && bExp
== 0xff && bSig
== 0) ||
2500 (aExp
== 0xff && aSig
== 0 && bExp
== 0 && bSig
== 0));
2502 /* It is implementation-defined whether the cases of (0,inf,qnan)
2503 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
2504 * they return if they do), so we have to hand this information
2505 * off to the target-specific pick-a-NaN routine.
2507 if (((aExp
== 0xff) && aSig
) ||
2508 ((bExp
== 0xff) && bSig
) ||
2509 ((cExp
== 0xff) && cSig
)) {
2510 return propagateFloat32MulAddNaN(a
, b
, c
, infzero
, status
);
2514 float_raise(float_flag_invalid
, status
);
2515 return float32_default_nan(status
);
2518 if (flags
& float_muladd_negate_c
) {
2522 signflip
= (flags
& float_muladd_negate_result
) ? 1 : 0;
2524 /* Work out the sign and type of the product */
2525 pSign
= aSign
^ bSign
;
2526 if (flags
& float_muladd_negate_product
) {
2529 pInf
= (aExp
== 0xff) || (bExp
== 0xff);
2530 pZero
= ((aExp
| aSig
) == 0) || ((bExp
| bSig
) == 0);
2533 if (pInf
&& (pSign
^ cSign
)) {
2534 /* addition of opposite-signed infinities => InvalidOperation */
2535 float_raise(float_flag_invalid
, status
);
2536 return float32_default_nan(status
);
2538 /* Otherwise generate an infinity of the same sign */
2539 return packFloat32(cSign
^ signflip
, 0xff, 0);
2543 return packFloat32(pSign
^ signflip
, 0xff, 0);
2549 /* Adding two exact zeroes */
2550 if (pSign
== cSign
) {
2552 } else if (status
->float_rounding_mode
== float_round_down
) {
2557 return packFloat32(zSign
^ signflip
, 0, 0);
2559 /* Exact zero plus a denorm */
2560 if (status
->flush_to_zero
) {
2561 float_raise(float_flag_output_denormal
, status
);
2562 return packFloat32(cSign
^ signflip
, 0, 0);
2565 /* Zero plus something non-zero : just return the something */
2566 if (flags
& float_muladd_halve_result
) {
2568 normalizeFloat32Subnormal(cSig
, &cExp
, &cSig
);
2570 /* Subtract one to halve, and one again because roundAndPackFloat32
2571 * wants one less than the true exponent.
2574 cSig
= (cSig
| 0x00800000) << 7;
2575 return roundAndPackFloat32(cSign
^ signflip
, cExp
, cSig
, status
);
2577 return packFloat32(cSign
^ signflip
, cExp
, cSig
);
2581 normalizeFloat32Subnormal(aSig
, &aExp
, &aSig
);
2584 normalizeFloat32Subnormal(bSig
, &bExp
, &bSig
);
2587 /* Calculate the actual result a * b + c */
2589 /* Multiply first; this is easy. */
2590 /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
2591 * because we want the true exponent, not the "one-less-than"
2592 * flavour that roundAndPackFloat32() takes.
2594 pExp
= aExp
+ bExp
- 0x7e;
2595 aSig
= (aSig
| 0x00800000) << 7;
2596 bSig
= (bSig
| 0x00800000) << 8;
2597 pSig64
= (uint64_t)aSig
* bSig
;
2598 if ((int64_t)(pSig64
<< 1) >= 0) {
2603 zSign
= pSign
^ signflip
;
2605 /* Now pSig64 is the significand of the multiply, with the explicit bit in
2610 /* Throw out the special case of c being an exact zero now */
2611 shift64RightJamming(pSig64
, 32, &pSig64
);
2613 if (flags
& float_muladd_halve_result
) {
2616 return roundAndPackFloat32(zSign
, pExp
- 1,
2619 normalizeFloat32Subnormal(cSig
, &cExp
, &cSig
);
2622 cSig64
= (uint64_t)cSig
<< (62 - 23);
2623 cSig64
|= LIT64(0x4000000000000000);
2624 expDiff
= pExp
- cExp
;
2626 if (pSign
== cSign
) {
2629 /* scale c to match p */
2630 shift64RightJamming(cSig64
, expDiff
, &cSig64
);
2632 } else if (expDiff
< 0) {
2633 /* scale p to match c */
2634 shift64RightJamming(pSig64
, -expDiff
, &pSig64
);
2637 /* no scaling needed */
2640 /* Add significands and make sure explicit bit ends up in posn 62 */
2641 zSig64
= pSig64
+ cSig64
;
2642 if ((int64_t)zSig64
< 0) {
2643 shift64RightJamming(zSig64
, 1, &zSig64
);
2650 shift64RightJamming(cSig64
, expDiff
, &cSig64
);
2651 zSig64
= pSig64
- cSig64
;
2653 } else if (expDiff
< 0) {
2654 shift64RightJamming(pSig64
, -expDiff
, &pSig64
);
2655 zSig64
= cSig64
- pSig64
;
2660 if (cSig64
< pSig64
) {
2661 zSig64
= pSig64
- cSig64
;
2662 } else if (pSig64
< cSig64
) {
2663 zSig64
= cSig64
- pSig64
;
2668 if (status
->float_rounding_mode
== float_round_down
) {
2671 return packFloat32(zSign
, 0, 0);
2675 /* Normalize to put the explicit bit back into bit 62. */
2676 shiftcount
= countLeadingZeros64(zSig64
) - 1;
2677 zSig64
<<= shiftcount
;
2680 if (flags
& float_muladd_halve_result
) {
2684 shift64RightJamming(zSig64
, 32, &zSig64
);
2685 return roundAndPackFloat32(zSign
, zExp
, zSig64
, status
);
2689 /*----------------------------------------------------------------------------
2690 | Returns the square root of the single-precision floating-point value `a'.
2691 | The operation is performed according to the IEC/IEEE Standard for Binary
2692 | Floating-Point Arithmetic.
2693 *----------------------------------------------------------------------------*/
2695 float32
float32_sqrt(float32 a
, float_status
*status
)
2699 uint32_t aSig
, zSig
;
2701 a
= float32_squash_input_denormal(a
, status
);
2703 aSig
= extractFloat32Frac( a
);
2704 aExp
= extractFloat32Exp( a
);
2705 aSign
= extractFloat32Sign( a
);
2706 if ( aExp
== 0xFF ) {
2708 return propagateFloat32NaN(a
, float32_zero
, status
);
2710 if ( ! aSign
) return a
;
2711 float_raise(float_flag_invalid
, status
);
2712 return float32_default_nan(status
);
2715 if ( ( aExp
| aSig
) == 0 ) return a
;
2716 float_raise(float_flag_invalid
, status
);
2717 return float32_default_nan(status
);
2720 if ( aSig
== 0 ) return float32_zero
;
2721 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
2723 zExp
= ( ( aExp
- 0x7F )>>1 ) + 0x7E;
2724 aSig
= ( aSig
| 0x00800000 )<<8;
2725 zSig
= estimateSqrt32( aExp
, aSig
) + 2;
2726 if ( ( zSig
& 0x7F ) <= 5 ) {
2732 term
= ( (uint64_t) zSig
) * zSig
;
2733 rem
= ( ( (uint64_t) aSig
)<<32 ) - term
;
2734 while ( (int64_t) rem
< 0 ) {
2736 rem
+= ( ( (uint64_t) zSig
)<<1 ) | 1;
2738 zSig
|= ( rem
!= 0 );
2740 shift32RightJamming( zSig
, 1, &zSig
);
2742 return roundAndPackFloat32(0, zExp
, zSig
, status
);
2746 /*----------------------------------------------------------------------------
2747 | Returns the binary exponential of the single-precision floating-point value
2748 | `a'. The operation is performed according to the IEC/IEEE Standard for
2749 | Binary Floating-Point Arithmetic.
2751 | Uses the following identities:
2753 | 1. -------------------------------------------------------------------------
2757 | 2. -------------------------------------------------------------------------
2760 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2762 *----------------------------------------------------------------------------*/
2764 static const float64 float32_exp2_coefficients
[15] =
2766 const_float64( 0x3ff0000000000000ll
), /* 1 */
2767 const_float64( 0x3fe0000000000000ll
), /* 2 */
2768 const_float64( 0x3fc5555555555555ll
), /* 3 */
2769 const_float64( 0x3fa5555555555555ll
), /* 4 */
2770 const_float64( 0x3f81111111111111ll
), /* 5 */
2771 const_float64( 0x3f56c16c16c16c17ll
), /* 6 */
2772 const_float64( 0x3f2a01a01a01a01all
), /* 7 */
2773 const_float64( 0x3efa01a01a01a01all
), /* 8 */
2774 const_float64( 0x3ec71de3a556c734ll
), /* 9 */
2775 const_float64( 0x3e927e4fb7789f5cll
), /* 10 */
2776 const_float64( 0x3e5ae64567f544e4ll
), /* 11 */
2777 const_float64( 0x3e21eed8eff8d898ll
), /* 12 */
2778 const_float64( 0x3de6124613a86d09ll
), /* 13 */
2779 const_float64( 0x3da93974a8c07c9dll
), /* 14 */
2780 const_float64( 0x3d6ae7f3e733b81fll
), /* 15 */
2783 float32
float32_exp2(float32 a
, float_status
*status
)
2790 a
= float32_squash_input_denormal(a
, status
);
2792 aSig
= extractFloat32Frac( a
);
2793 aExp
= extractFloat32Exp( a
);
2794 aSign
= extractFloat32Sign( a
);
2796 if ( aExp
== 0xFF) {
2798 return propagateFloat32NaN(a
, float32_zero
, status
);
2800 return (aSign
) ? float32_zero
: a
;
2803 if (aSig
== 0) return float32_one
;
2806 float_raise(float_flag_inexact
, status
);
2808 /* ******************************* */
2809 /* using float64 for approximation */
2810 /* ******************************* */
2811 x
= float32_to_float64(a
, status
);
2812 x
= float64_mul(x
, float64_ln2
, status
);
2816 for (i
= 0 ; i
< 15 ; i
++) {
2819 f
= float64_mul(xn
, float32_exp2_coefficients
[i
], status
);
2820 r
= float64_add(r
, f
, status
);
2822 xn
= float64_mul(xn
, x
, status
);
2825 return float64_to_float32(r
, status
);
2828 /*----------------------------------------------------------------------------
2829 | Returns the binary log of the single-precision floating-point value `a'.
2830 | The operation is performed according to the IEC/IEEE Standard for Binary
2831 | Floating-Point Arithmetic.
2832 *----------------------------------------------------------------------------*/
2833 float32
float32_log2(float32 a
, float_status
*status
)
2837 uint32_t aSig
, zSig
, i
;
2839 a
= float32_squash_input_denormal(a
, status
);
2840 aSig
= extractFloat32Frac( a
);
2841 aExp
= extractFloat32Exp( a
);
2842 aSign
= extractFloat32Sign( a
);
2845 if ( aSig
== 0 ) return packFloat32( 1, 0xFF, 0 );
2846 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
2849 float_raise(float_flag_invalid
, status
);
2850 return float32_default_nan(status
);
2852 if ( aExp
== 0xFF ) {
2854 return propagateFloat32NaN(a
, float32_zero
, status
);
2864 for (i
= 1 << 22; i
> 0; i
>>= 1) {
2865 aSig
= ( (uint64_t)aSig
* aSig
) >> 23;
2866 if ( aSig
& 0x01000000 ) {
2875 return normalizeRoundAndPackFloat32(zSign
, 0x85, zSig
, status
);
2878 /*----------------------------------------------------------------------------
2879 | Returns 1 if the single-precision floating-point value `a' is equal to
2880 | the corresponding value `b', and 0 otherwise. The invalid exception is
2881 | raised if either operand is a NaN. Otherwise, the comparison is performed
2882 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2883 *----------------------------------------------------------------------------*/
2885 int float32_eq(float32 a
, float32 b
, float_status
*status
)
2888 a
= float32_squash_input_denormal(a
, status
);
2889 b
= float32_squash_input_denormal(b
, status
);
2891 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2892 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2894 float_raise(float_flag_invalid
, status
);
2897 av
= float32_val(a
);
2898 bv
= float32_val(b
);
2899 return ( av
== bv
) || ( (uint32_t) ( ( av
| bv
)<<1 ) == 0 );
2902 /*----------------------------------------------------------------------------
2903 | Returns 1 if the single-precision floating-point value `a' is less than
2904 | or equal to the corresponding value `b', and 0 otherwise. The invalid
2905 | exception is raised if either operand is a NaN. The comparison is performed
2906 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2907 *----------------------------------------------------------------------------*/
2909 int float32_le(float32 a
, float32 b
, float_status
*status
)
2913 a
= float32_squash_input_denormal(a
, status
);
2914 b
= float32_squash_input_denormal(b
, status
);
2916 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2917 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2919 float_raise(float_flag_invalid
, status
);
2922 aSign
= extractFloat32Sign( a
);
2923 bSign
= extractFloat32Sign( b
);
2924 av
= float32_val(a
);
2925 bv
= float32_val(b
);
2926 if ( aSign
!= bSign
) return aSign
|| ( (uint32_t) ( ( av
| bv
)<<1 ) == 0 );
2927 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
2931 /*----------------------------------------------------------------------------
2932 | Returns 1 if the single-precision floating-point value `a' is less than
2933 | the corresponding value `b', and 0 otherwise. The invalid exception is
2934 | raised if either operand is a NaN. The comparison is performed according
2935 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2936 *----------------------------------------------------------------------------*/
2938 int float32_lt(float32 a
, float32 b
, float_status
*status
)
2942 a
= float32_squash_input_denormal(a
, status
);
2943 b
= float32_squash_input_denormal(b
, status
);
2945 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2946 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2948 float_raise(float_flag_invalid
, status
);
2951 aSign
= extractFloat32Sign( a
);
2952 bSign
= extractFloat32Sign( b
);
2953 av
= float32_val(a
);
2954 bv
= float32_val(b
);
2955 if ( aSign
!= bSign
) return aSign
&& ( (uint32_t) ( ( av
| bv
)<<1 ) != 0 );
2956 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
2960 /*----------------------------------------------------------------------------
2961 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2962 | be compared, and 0 otherwise. The invalid exception is raised if either
2963 | operand is a NaN. The comparison is performed according to the IEC/IEEE
2964 | Standard for Binary Floating-Point Arithmetic.
2965 *----------------------------------------------------------------------------*/
2967 int float32_unordered(float32 a
, float32 b
, float_status
*status
)
2969 a
= float32_squash_input_denormal(a
, status
);
2970 b
= float32_squash_input_denormal(b
, status
);
2972 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2973 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2975 float_raise(float_flag_invalid
, status
);
2981 /*----------------------------------------------------------------------------
2982 | Returns 1 if the single-precision floating-point value `a' is equal to
2983 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2984 | exception. The comparison is performed according to the IEC/IEEE Standard
2985 | for Binary Floating-Point Arithmetic.
2986 *----------------------------------------------------------------------------*/
2988 int float32_eq_quiet(float32 a
, float32 b
, float_status
*status
)
2990 a
= float32_squash_input_denormal(a
, status
);
2991 b
= float32_squash_input_denormal(b
, status
);
2993 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2994 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2996 if (float32_is_signaling_nan(a
, status
)
2997 || float32_is_signaling_nan(b
, status
)) {
2998 float_raise(float_flag_invalid
, status
);
3002 return ( float32_val(a
) == float32_val(b
) ) ||
3003 ( (uint32_t) ( ( float32_val(a
) | float32_val(b
) )<<1 ) == 0 );
3006 /*----------------------------------------------------------------------------
3007 | Returns 1 if the single-precision floating-point value `a' is less than or
3008 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3009 | cause an exception. Otherwise, the comparison is performed according to the
3010 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3011 *----------------------------------------------------------------------------*/
3013 int float32_le_quiet(float32 a
, float32 b
, float_status
*status
)
3017 a
= float32_squash_input_denormal(a
, status
);
3018 b
= float32_squash_input_denormal(b
, status
);
3020 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
3021 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
3023 if (float32_is_signaling_nan(a
, status
)
3024 || float32_is_signaling_nan(b
, status
)) {
3025 float_raise(float_flag_invalid
, status
);
3029 aSign
= extractFloat32Sign( a
);
3030 bSign
= extractFloat32Sign( b
);
3031 av
= float32_val(a
);
3032 bv
= float32_val(b
);
3033 if ( aSign
!= bSign
) return aSign
|| ( (uint32_t) ( ( av
| bv
)<<1 ) == 0 );
3034 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
3038 /*----------------------------------------------------------------------------
3039 | Returns 1 if the single-precision floating-point value `a' is less than
3040 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3041 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
3042 | Standard for Binary Floating-Point Arithmetic.
3043 *----------------------------------------------------------------------------*/
3045 int float32_lt_quiet(float32 a
, float32 b
, float_status
*status
)
3049 a
= float32_squash_input_denormal(a
, status
);
3050 b
= float32_squash_input_denormal(b
, status
);
3052 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
3053 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
3055 if (float32_is_signaling_nan(a
, status
)
3056 || float32_is_signaling_nan(b
, status
)) {
3057 float_raise(float_flag_invalid
, status
);
3061 aSign
= extractFloat32Sign( a
);
3062 bSign
= extractFloat32Sign( b
);
3063 av
= float32_val(a
);
3064 bv
= float32_val(b
);
3065 if ( aSign
!= bSign
) return aSign
&& ( (uint32_t) ( ( av
| bv
)<<1 ) != 0 );
3066 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
3070 /*----------------------------------------------------------------------------
3071 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3072 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
3073 | comparison is performed according to the IEC/IEEE Standard for Binary
3074 | Floating-Point Arithmetic.
3075 *----------------------------------------------------------------------------*/
3077 int float32_unordered_quiet(float32 a
, float32 b
, float_status
*status
)
3079 a
= float32_squash_input_denormal(a
, status
);
3080 b
= float32_squash_input_denormal(b
, status
);
3082 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
3083 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
3085 if (float32_is_signaling_nan(a
, status
)
3086 || float32_is_signaling_nan(b
, status
)) {
3087 float_raise(float_flag_invalid
, status
);
3094 /*----------------------------------------------------------------------------
3095 | Returns the result of converting the double-precision floating-point value
3096 | `a' to the 32-bit two's complement integer format. The conversion is
3097 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3098 | Arithmetic---which means in particular that the conversion is rounded
3099 | according to the current rounding mode. If `a' is a NaN, the largest
3100 | positive integer is returned. Otherwise, if the conversion overflows, the
3101 | largest integer with the same sign as `a' is returned.
3102 *----------------------------------------------------------------------------*/
3104 int32_t float64_to_int32(float64 a
, float_status
*status
)
3110 a
= float64_squash_input_denormal(a
, status
);
3112 aSig
= extractFloat64Frac( a
);
3113 aExp
= extractFloat64Exp( a
);
3114 aSign
= extractFloat64Sign( a
);
3115 if ( ( aExp
== 0x7FF ) && aSig
) aSign
= 0;
3116 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
3117 shiftCount
= 0x42C - aExp
;
3118 if ( 0 < shiftCount
) shift64RightJamming( aSig
, shiftCount
, &aSig
);
3119 return roundAndPackInt32(aSign
, aSig
, status
);
3123 /*----------------------------------------------------------------------------
3124 | Returns the result of converting the double-precision floating-point value
3125 | `a' to the 32-bit two's complement integer format. The conversion is
3126 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3127 | Arithmetic, except that the conversion is always rounded toward zero.
3128 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
3129 | the conversion overflows, the largest integer with the same sign as `a' is
3131 *----------------------------------------------------------------------------*/
3133 int32_t float64_to_int32_round_to_zero(float64 a
, float_status
*status
)
3138 uint64_t aSig
, savedASig
;
3140 a
= float64_squash_input_denormal(a
, status
);
3142 aSig
= extractFloat64Frac( a
);
3143 aExp
= extractFloat64Exp( a
);
3144 aSign
= extractFloat64Sign( a
);
3145 if ( 0x41E < aExp
) {
3146 if ( ( aExp
== 0x7FF ) && aSig
) aSign
= 0;
3149 else if ( aExp
< 0x3FF ) {
3151 status
->float_exception_flags
|= float_flag_inexact
;
3155 aSig
|= LIT64( 0x0010000000000000 );
3156 shiftCount
= 0x433 - aExp
;
3158 aSig
>>= shiftCount
;
3160 if ( aSign
) z
= - z
;
3161 if ( ( z
< 0 ) ^ aSign
) {
3163 float_raise(float_flag_invalid
, status
);
3164 return aSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
3166 if ( ( aSig
<<shiftCount
) != savedASig
) {
3167 status
->float_exception_flags
|= float_flag_inexact
;
3173 /*----------------------------------------------------------------------------
3174 | Returns the result of converting the double-precision floating-point value
3175 | `a' to the 16-bit two's complement integer format. The conversion is
3176 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3177 | Arithmetic, except that the conversion is always rounded toward zero.
3178 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
3179 | the conversion overflows, the largest integer with the same sign as `a' is
3181 *----------------------------------------------------------------------------*/
3183 int16_t float64_to_int16_round_to_zero(float64 a
, float_status
*status
)
3188 uint64_t aSig
, savedASig
;
3191 aSig
= extractFloat64Frac( a
);
3192 aExp
= extractFloat64Exp( a
);
3193 aSign
= extractFloat64Sign( a
);
3194 if ( 0x40E < aExp
) {
3195 if ( ( aExp
== 0x7FF ) && aSig
) {
3200 else if ( aExp
< 0x3FF ) {
3201 if ( aExp
|| aSig
) {
3202 status
->float_exception_flags
|= float_flag_inexact
;
3206 aSig
|= LIT64( 0x0010000000000000 );
3207 shiftCount
= 0x433 - aExp
;
3209 aSig
>>= shiftCount
;
3214 if ( ( (int16_t)z
< 0 ) ^ aSign
) {
3216 float_raise(float_flag_invalid
, status
);
3217 return aSign
? (int32_t) 0xffff8000 : 0x7FFF;
3219 if ( ( aSig
<<shiftCount
) != savedASig
) {
3220 status
->float_exception_flags
|= float_flag_inexact
;
3225 /*----------------------------------------------------------------------------
3226 | Returns the result of converting the double-precision floating-point value
3227 | `a' to the 64-bit two's complement integer format. The conversion is
3228 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3229 | Arithmetic---which means in particular that the conversion is rounded
3230 | according to the current rounding mode. If `a' is a NaN, the largest
3231 | positive integer is returned. Otherwise, if the conversion overflows, the
3232 | largest integer with the same sign as `a' is returned.
3233 *----------------------------------------------------------------------------*/
3235 int64_t float64_to_int64(float64 a
, float_status
*status
)
3240 uint64_t aSig
, aSigExtra
;
3241 a
= float64_squash_input_denormal(a
, status
);
3243 aSig
= extractFloat64Frac( a
);
3244 aExp
= extractFloat64Exp( a
);
3245 aSign
= extractFloat64Sign( a
);
3246 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
3247 shiftCount
= 0x433 - aExp
;
3248 if ( shiftCount
<= 0 ) {
3249 if ( 0x43E < aExp
) {
3250 float_raise(float_flag_invalid
, status
);
3252 || ( ( aExp
== 0x7FF )
3253 && ( aSig
!= LIT64( 0x0010000000000000 ) ) )
3255 return LIT64( 0x7FFFFFFFFFFFFFFF );
3257 return (int64_t) LIT64( 0x8000000000000000 );
3260 aSig
<<= - shiftCount
;
3263 shift64ExtraRightJamming( aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
3265 return roundAndPackInt64(aSign
, aSig
, aSigExtra
, status
);
3269 /*----------------------------------------------------------------------------
3270 | Returns the result of converting the double-precision floating-point value
3271 | `a' to the 64-bit two's complement integer format. The conversion is
3272 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3273 | Arithmetic, except that the conversion is always rounded toward zero.
3274 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
3275 | the conversion overflows, the largest integer with the same sign as `a' is
3277 *----------------------------------------------------------------------------*/
3279 int64_t float64_to_int64_round_to_zero(float64 a
, float_status
*status
)
3286 a
= float64_squash_input_denormal(a
, status
);
3288 aSig
= extractFloat64Frac( a
);
3289 aExp
= extractFloat64Exp( a
);
3290 aSign
= extractFloat64Sign( a
);
3291 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
3292 shiftCount
= aExp
- 0x433;
3293 if ( 0 <= shiftCount
) {
3294 if ( 0x43E <= aExp
) {
3295 if ( float64_val(a
) != LIT64( 0xC3E0000000000000 ) ) {
3296 float_raise(float_flag_invalid
, status
);
3298 || ( ( aExp
== 0x7FF )
3299 && ( aSig
!= LIT64( 0x0010000000000000 ) ) )
3301 return LIT64( 0x7FFFFFFFFFFFFFFF );
3304 return (int64_t) LIT64( 0x8000000000000000 );
3306 z
= aSig
<<shiftCount
;
3309 if ( aExp
< 0x3FE ) {
3311 status
->float_exception_flags
|= float_flag_inexact
;
3315 z
= aSig
>>( - shiftCount
);
3316 if ( (uint64_t) ( aSig
<<( shiftCount
& 63 ) ) ) {
3317 status
->float_exception_flags
|= float_flag_inexact
;
3320 if ( aSign
) z
= - z
;
3325 /*----------------------------------------------------------------------------
3326 | Returns the result of converting the double-precision floating-point value
3327 | `a' to the single-precision floating-point format. The conversion is
3328 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3330 *----------------------------------------------------------------------------*/
3332 float32
float64_to_float32(float64 a
, float_status
*status
)
3338 a
= float64_squash_input_denormal(a
, status
);
3340 aSig
= extractFloat64Frac( a
);
3341 aExp
= extractFloat64Exp( a
);
3342 aSign
= extractFloat64Sign( a
);
3343 if ( aExp
== 0x7FF ) {
3345 return commonNaNToFloat32(float64ToCommonNaN(a
, status
), status
);
3347 return packFloat32( aSign
, 0xFF, 0 );
3349 shift64RightJamming( aSig
, 22, &aSig
);
3351 if ( aExp
|| zSig
) {
3355 return roundAndPackFloat32(aSign
, aExp
, zSig
, status
);
3360 /*----------------------------------------------------------------------------
3361 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3362 | half-precision floating-point value, returning the result. After being
3363 | shifted into the proper positions, the three fields are simply added
3364 | together to form the result. This means that any integer portion of `zSig'
3365 | will be added into the exponent. Since a properly normalized significand
3366 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3367 | than the desired result exponent whenever `zSig' is a complete, normalized
3369 *----------------------------------------------------------------------------*/
3370 static float16
packFloat16(flag zSign
, int zExp
, uint16_t zSig
)
3372 return make_float16(
3373 (((uint32_t)zSign
) << 15) + (((uint32_t)zExp
) << 10) + zSig
);
3376 /*----------------------------------------------------------------------------
3377 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3378 | and significand `zSig', and returns the proper half-precision floating-
3379 | point value corresponding to the abstract input. Ordinarily, the abstract
3380 | value is simply rounded and packed into the half-precision format, with
3381 | the inexact exception raised if the abstract input cannot be represented
3382 | exactly. However, if the abstract value is too large, the overflow and
3383 | inexact exceptions are raised and an infinity or maximal finite value is
3384 | returned. If the abstract value is too small, the input value is rounded to
3385 | a subnormal number, and the underflow and inexact exceptions are raised if
3386 | the abstract input cannot be represented exactly as a subnormal half-
3387 | precision floating-point number.
3388 | The `ieee' flag indicates whether to use IEEE standard half precision, or
3389 | ARM-style "alternative representation", which omits the NaN and Inf
3390 | encodings in order to raise the maximum representable exponent by one.
3391 | The input significand `zSig' has its binary point between bits 22
3392 | and 23, which is 13 bits to the left of the usual location. This shifted
3393 | significand must be normalized or smaller. If `zSig' is not normalized,
3394 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3395 | and it must not require rounding. In the usual case that `zSig' is
3396 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3397 | Note the slightly odd position of the binary point in zSig compared with the
3398 | other roundAndPackFloat functions. This should probably be fixed if we
3399 | need to implement more float16 routines than just conversion.
3400 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3401 | Binary Floating-Point Arithmetic.
3402 *----------------------------------------------------------------------------*/
3404 static float16
roundAndPackFloat16(flag zSign
, int zExp
,
3405 uint32_t zSig
, flag ieee
,
3406 float_status
*status
)
3408 int maxexp
= ieee
? 29 : 30;
3411 bool rounding_bumps_exp
;
3412 bool is_tiny
= false;
3414 /* Calculate the mask of bits of the mantissa which are not
3415 * representable in half-precision and will be lost.
3418 /* Will be denormal in halfprec */
3424 /* Normal number in halfprec */
3428 switch (status
->float_rounding_mode
) {
3429 case float_round_nearest_even
:
3430 increment
= (mask
+ 1) >> 1;
3431 if ((zSig
& mask
) == increment
) {
3432 increment
= zSig
& (increment
<< 1);
3435 case float_round_ties_away
:
3436 increment
= (mask
+ 1) >> 1;
3438 case float_round_up
:
3439 increment
= zSign
? 0 : mask
;
3441 case float_round_down
:
3442 increment
= zSign
? mask
: 0;
3444 default: /* round_to_zero */
3449 rounding_bumps_exp
= (zSig
+ increment
>= 0x01000000);
3451 if (zExp
> maxexp
|| (zExp
== maxexp
&& rounding_bumps_exp
)) {
3453 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
3454 return packFloat16(zSign
, 0x1f, 0);
3456 float_raise(float_flag_invalid
, status
);
3457 return packFloat16(zSign
, 0x1f, 0x3ff);
3462 /* Note that flush-to-zero does not affect half-precision results */
3464 (status
->float_detect_tininess
== float_tininess_before_rounding
)
3466 || (!rounding_bumps_exp
);
3469 float_raise(float_flag_inexact
, status
);
3471 float_raise(float_flag_underflow
, status
);
3476 if (rounding_bumps_exp
) {
3482 return packFloat16(zSign
, 0, 0);
3488 return packFloat16(zSign
, zExp
, zSig
>> 13);
3491 static void normalizeFloat16Subnormal(uint32_t aSig
, int *zExpPtr
,
3494 int8_t shiftCount
= countLeadingZeros32(aSig
) - 21;
3495 *zSigPtr
= aSig
<< shiftCount
;
3496 *zExpPtr
= 1 - shiftCount
;
3499 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
3500 The latter gains extra exponent range by omitting the NaN/Inf encodings. */
3502 float32
float16_to_float32(float16 a
, flag ieee
, float_status
*status
)
3508 aSign
= extractFloat16Sign(a
);
3509 aExp
= extractFloat16Exp(a
);
3510 aSig
= extractFloat16Frac(a
);
3512 if (aExp
== 0x1f && ieee
) {
3514 return commonNaNToFloat32(float16ToCommonNaN(a
, status
), status
);
3516 return packFloat32(aSign
, 0xff, 0);
3520 return packFloat32(aSign
, 0, 0);
3523 normalizeFloat16Subnormal(aSig
, &aExp
, &aSig
);
3526 return packFloat32( aSign
, aExp
+ 0x70, aSig
<< 13);
3529 float16
float32_to_float16(float32 a
, flag ieee
, float_status
*status
)
3535 a
= float32_squash_input_denormal(a
, status
);
3537 aSig
= extractFloat32Frac( a
);
3538 aExp
= extractFloat32Exp( a
);
3539 aSign
= extractFloat32Sign( a
);
3540 if ( aExp
== 0xFF ) {
3542 /* Input is a NaN */
3544 float_raise(float_flag_invalid
, status
);
3545 return packFloat16(aSign
, 0, 0);
3547 return commonNaNToFloat16(
3548 float32ToCommonNaN(a
, status
), status
);
3552 float_raise(float_flag_invalid
, status
);
3553 return packFloat16(aSign
, 0x1f, 0x3ff);
3555 return packFloat16(aSign
, 0x1f, 0);
3557 if (aExp
== 0 && aSig
== 0) {
3558 return packFloat16(aSign
, 0, 0);
3560 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3561 * even if the input is denormal; however this is harmless because
3562 * the largest possible single-precision denormal is still smaller
3563 * than the smallest representable half-precision denormal, and so we
3564 * will end up ignoring aSig and returning via the "always return zero"
3570 return roundAndPackFloat16(aSign
, aExp
, aSig
, ieee
, status
);
3573 float64
float16_to_float64(float16 a
, flag ieee
, float_status
*status
)
3579 aSign
= extractFloat16Sign(a
);
3580 aExp
= extractFloat16Exp(a
);
3581 aSig
= extractFloat16Frac(a
);
3583 if (aExp
== 0x1f && ieee
) {
3585 return commonNaNToFloat64(
3586 float16ToCommonNaN(a
, status
), status
);
3588 return packFloat64(aSign
, 0x7ff, 0);
3592 return packFloat64(aSign
, 0, 0);
3595 normalizeFloat16Subnormal(aSig
, &aExp
, &aSig
);
3598 return packFloat64(aSign
, aExp
+ 0x3f0, ((uint64_t)aSig
) << 42);
3601 float16
float64_to_float16(float64 a
, flag ieee
, float_status
*status
)
3608 a
= float64_squash_input_denormal(a
, status
);
3610 aSig
= extractFloat64Frac(a
);
3611 aExp
= extractFloat64Exp(a
);
3612 aSign
= extractFloat64Sign(a
);
3613 if (aExp
== 0x7FF) {
3615 /* Input is a NaN */
3617 float_raise(float_flag_invalid
, status
);
3618 return packFloat16(aSign
, 0, 0);
3620 return commonNaNToFloat16(
3621 float64ToCommonNaN(a
, status
), status
);
3625 float_raise(float_flag_invalid
, status
);
3626 return packFloat16(aSign
, 0x1f, 0x3ff);
3628 return packFloat16(aSign
, 0x1f, 0);
3630 shift64RightJamming(aSig
, 29, &aSig
);
3632 if (aExp
== 0 && zSig
== 0) {
3633 return packFloat16(aSign
, 0, 0);
3635 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3636 * even if the input is denormal; however this is harmless because
3637 * the largest possible single-precision denormal is still smaller
3638 * than the smallest representable half-precision denormal, and so we
3639 * will end up ignoring aSig and returning via the "always return zero"
3645 return roundAndPackFloat16(aSign
, aExp
, zSig
, ieee
, status
);
3648 /*----------------------------------------------------------------------------
3649 | Returns the result of converting the double-precision floating-point value
3650 | `a' to the extended double-precision floating-point format. The conversion
3651 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3653 *----------------------------------------------------------------------------*/
3655 floatx80
float64_to_floatx80(float64 a
, float_status
*status
)
3661 a
= float64_squash_input_denormal(a
, status
);
3662 aSig
= extractFloat64Frac( a
);
3663 aExp
= extractFloat64Exp( a
);
3664 aSign
= extractFloat64Sign( a
);
3665 if ( aExp
== 0x7FF ) {
3667 return commonNaNToFloatx80(float64ToCommonNaN(a
, status
), status
);
3669 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
3672 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
3673 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
3677 aSign
, aExp
+ 0x3C00, ( aSig
| LIT64( 0x0010000000000000 ) )<<11 );
3681 /*----------------------------------------------------------------------------
3682 | Returns the result of converting the double-precision floating-point value
3683 | `a' to the quadruple-precision floating-point format. The conversion is
3684 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3686 *----------------------------------------------------------------------------*/
3688 float128
float64_to_float128(float64 a
, float_status
*status
)
3692 uint64_t aSig
, zSig0
, zSig1
;
3694 a
= float64_squash_input_denormal(a
, status
);
3695 aSig
= extractFloat64Frac( a
);
3696 aExp
= extractFloat64Exp( a
);
3697 aSign
= extractFloat64Sign( a
);
3698 if ( aExp
== 0x7FF ) {
3700 return commonNaNToFloat128(float64ToCommonNaN(a
, status
), status
);
3702 return packFloat128( aSign
, 0x7FFF, 0, 0 );
3705 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
3706 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
3709 shift128Right( aSig
, 0, 4, &zSig0
, &zSig1
);
3710 return packFloat128( aSign
, aExp
+ 0x3C00, zSig0
, zSig1
);
3714 /*----------------------------------------------------------------------------
3715 | Rounds the double-precision floating-point value `a' to an integer, and
3716 | returns the result as a double-precision floating-point value. The
3717 | operation is performed according to the IEC/IEEE Standard for Binary
3718 | Floating-Point Arithmetic.
3719 *----------------------------------------------------------------------------*/
3721 float64
float64_round_to_int(float64 a
, float_status
*status
)
3725 uint64_t lastBitMask
, roundBitsMask
;
3727 a
= float64_squash_input_denormal(a
, status
);
3729 aExp
= extractFloat64Exp( a
);
3730 if ( 0x433 <= aExp
) {
3731 if ( ( aExp
== 0x7FF ) && extractFloat64Frac( a
) ) {
3732 return propagateFloat64NaN(a
, a
, status
);
3736 if ( aExp
< 0x3FF ) {
3737 if ( (uint64_t) ( float64_val(a
)<<1 ) == 0 ) return a
;
3738 status
->float_exception_flags
|= float_flag_inexact
;
3739 aSign
= extractFloat64Sign( a
);
3740 switch (status
->float_rounding_mode
) {
3741 case float_round_nearest_even
:
3742 if ( ( aExp
== 0x3FE ) && extractFloat64Frac( a
) ) {
3743 return packFloat64( aSign
, 0x3FF, 0 );
3746 case float_round_ties_away
:
3747 if (aExp
== 0x3FE) {
3748 return packFloat64(aSign
, 0x3ff, 0);
3751 case float_round_down
:
3752 return make_float64(aSign
? LIT64( 0xBFF0000000000000 ) : 0);
3753 case float_round_up
:
3754 return make_float64(
3755 aSign
? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
3757 return packFloat64( aSign
, 0, 0 );
3760 lastBitMask
<<= 0x433 - aExp
;
3761 roundBitsMask
= lastBitMask
- 1;
3763 switch (status
->float_rounding_mode
) {
3764 case float_round_nearest_even
:
3765 z
+= lastBitMask
>> 1;
3766 if ((z
& roundBitsMask
) == 0) {
3770 case float_round_ties_away
:
3771 z
+= lastBitMask
>> 1;
3773 case float_round_to_zero
:
3775 case float_round_up
:
3776 if (!extractFloat64Sign(make_float64(z
))) {
3780 case float_round_down
:
3781 if (extractFloat64Sign(make_float64(z
))) {
3788 z
&= ~ roundBitsMask
;
3789 if (z
!= float64_val(a
)) {
3790 status
->float_exception_flags
|= float_flag_inexact
;
3792 return make_float64(z
);
3796 float64
float64_trunc_to_int(float64 a
, float_status
*status
)
3800 oldmode
= status
->float_rounding_mode
;
3801 status
->float_rounding_mode
= float_round_to_zero
;
3802 res
= float64_round_to_int(a
, status
);
3803 status
->float_rounding_mode
= oldmode
;
3807 /*----------------------------------------------------------------------------
3808 | Returns the result of adding the absolute values of the double-precision
3809 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
3810 | before being returned. `zSign' is ignored if the result is a NaN.
3811 | The addition is performed according to the IEC/IEEE Standard for Binary
3812 | Floating-Point Arithmetic.
3813 *----------------------------------------------------------------------------*/
3815 static float64
addFloat64Sigs(float64 a
, float64 b
, flag zSign
,
3816 float_status
*status
)
3818 int aExp
, bExp
, zExp
;
3819 uint64_t aSig
, bSig
, zSig
;
3822 aSig
= extractFloat64Frac( a
);
3823 aExp
= extractFloat64Exp( a
);
3824 bSig
= extractFloat64Frac( b
);
3825 bExp
= extractFloat64Exp( b
);
3826 expDiff
= aExp
- bExp
;
3829 if ( 0 < expDiff
) {
3830 if ( aExp
== 0x7FF ) {
3832 return propagateFloat64NaN(a
, b
, status
);
3840 bSig
|= LIT64( 0x2000000000000000 );
3842 shift64RightJamming( bSig
, expDiff
, &bSig
);
3845 else if ( expDiff
< 0 ) {
3846 if ( bExp
== 0x7FF ) {
3848 return propagateFloat64NaN(a
, b
, status
);
3850 return packFloat64( zSign
, 0x7FF, 0 );
3856 aSig
|= LIT64( 0x2000000000000000 );
3858 shift64RightJamming( aSig
, - expDiff
, &aSig
);
3862 if ( aExp
== 0x7FF ) {
3864 return propagateFloat64NaN(a
, b
, status
);
3869 if (status
->flush_to_zero
) {
3871 float_raise(float_flag_output_denormal
, status
);
3873 return packFloat64(zSign
, 0, 0);
3875 return packFloat64( zSign
, 0, ( aSig
+ bSig
)>>9 );
3877 zSig
= LIT64( 0x4000000000000000 ) + aSig
+ bSig
;
3881 aSig
|= LIT64( 0x2000000000000000 );
3882 zSig
= ( aSig
+ bSig
)<<1;
3884 if ( (int64_t) zSig
< 0 ) {
3889 return roundAndPackFloat64(zSign
, zExp
, zSig
, status
);
3893 /*----------------------------------------------------------------------------
3894 | Returns the result of subtracting the absolute values of the double-
3895 | precision floating-point values `a' and `b'. If `zSign' is 1, the
3896 | difference is negated before being returned. `zSign' is ignored if the
3897 | result is a NaN. The subtraction is performed according to the IEC/IEEE
3898 | Standard for Binary Floating-Point Arithmetic.
3899 *----------------------------------------------------------------------------*/
3901 static float64
subFloat64Sigs(float64 a
, float64 b
, flag zSign
,
3902 float_status
*status
)
3904 int aExp
, bExp
, zExp
;
3905 uint64_t aSig
, bSig
, zSig
;
3908 aSig
= extractFloat64Frac( a
);
3909 aExp
= extractFloat64Exp( a
);
3910 bSig
= extractFloat64Frac( b
);
3911 bExp
= extractFloat64Exp( b
);
3912 expDiff
= aExp
- bExp
;
3915 if ( 0 < expDiff
) goto aExpBigger
;
3916 if ( expDiff
< 0 ) goto bExpBigger
;
3917 if ( aExp
== 0x7FF ) {
3919 return propagateFloat64NaN(a
, b
, status
);
3921 float_raise(float_flag_invalid
, status
);
3922 return float64_default_nan(status
);
3928 if ( bSig
< aSig
) goto aBigger
;
3929 if ( aSig
< bSig
) goto bBigger
;
3930 return packFloat64(status
->float_rounding_mode
== float_round_down
, 0, 0);
3932 if ( bExp
== 0x7FF ) {
3934 return propagateFloat64NaN(a
, b
, status
);
3936 return packFloat64( zSign
^ 1, 0x7FF, 0 );
3942 aSig
|= LIT64( 0x4000000000000000 );
3944 shift64RightJamming( aSig
, - expDiff
, &aSig
);
3945 bSig
|= LIT64( 0x4000000000000000 );
3950 goto normalizeRoundAndPack
;
3952 if ( aExp
== 0x7FF ) {
3954 return propagateFloat64NaN(a
, b
, status
);
3962 bSig
|= LIT64( 0x4000000000000000 );
3964 shift64RightJamming( bSig
, expDiff
, &bSig
);
3965 aSig
|= LIT64( 0x4000000000000000 );
3969 normalizeRoundAndPack
:
3971 return normalizeRoundAndPackFloat64(zSign
, zExp
, zSig
, status
);
3975 /*----------------------------------------------------------------------------
3976 | Returns the result of adding the double-precision floating-point values `a'
3977 | and `b'. The operation is performed according to the IEC/IEEE Standard for
3978 | Binary Floating-Point Arithmetic.
3979 *----------------------------------------------------------------------------*/
3981 float64
float64_add(float64 a
, float64 b
, float_status
*status
)
3984 a
= float64_squash_input_denormal(a
, status
);
3985 b
= float64_squash_input_denormal(b
, status
);
3987 aSign
= extractFloat64Sign( a
);
3988 bSign
= extractFloat64Sign( b
);
3989 if ( aSign
== bSign
) {
3990 return addFloat64Sigs(a
, b
, aSign
, status
);
3993 return subFloat64Sigs(a
, b
, aSign
, status
);
3998 /*----------------------------------------------------------------------------
3999 | Returns the result of subtracting the double-precision floating-point values
4000 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
4001 | for Binary Floating-Point Arithmetic.
4002 *----------------------------------------------------------------------------*/
4004 float64
float64_sub(float64 a
, float64 b
, float_status
*status
)
4007 a
= float64_squash_input_denormal(a
, status
);
4008 b
= float64_squash_input_denormal(b
, status
);
4010 aSign
= extractFloat64Sign( a
);
4011 bSign
= extractFloat64Sign( b
);
4012 if ( aSign
== bSign
) {
4013 return subFloat64Sigs(a
, b
, aSign
, status
);
4016 return addFloat64Sigs(a
, b
, aSign
, status
);
4021 /*----------------------------------------------------------------------------
4022 | Returns the result of multiplying the double-precision floating-point values
4023 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
4024 | for Binary Floating-Point Arithmetic.
4025 *----------------------------------------------------------------------------*/
4027 float64
float64_mul(float64 a
, float64 b
, float_status
*status
)
4029 flag aSign
, bSign
, zSign
;
4030 int aExp
, bExp
, zExp
;
4031 uint64_t aSig
, bSig
, zSig0
, zSig1
;
4033 a
= float64_squash_input_denormal(a
, status
);
4034 b
= float64_squash_input_denormal(b
, status
);
4036 aSig
= extractFloat64Frac( a
);
4037 aExp
= extractFloat64Exp( a
);
4038 aSign
= extractFloat64Sign( a
);
4039 bSig
= extractFloat64Frac( b
);
4040 bExp
= extractFloat64Exp( b
);
4041 bSign
= extractFloat64Sign( b
);
4042 zSign
= aSign
^ bSign
;
4043 if ( aExp
== 0x7FF ) {
4044 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
4045 return propagateFloat64NaN(a
, b
, status
);
4047 if ( ( bExp
| bSig
) == 0 ) {
4048 float_raise(float_flag_invalid
, status
);
4049 return float64_default_nan(status
);
4051 return packFloat64( zSign
, 0x7FF, 0 );
4053 if ( bExp
== 0x7FF ) {
4055 return propagateFloat64NaN(a
, b
, status
);
4057 if ( ( aExp
| aSig
) == 0 ) {
4058 float_raise(float_flag_invalid
, status
);
4059 return float64_default_nan(status
);
4061 return packFloat64( zSign
, 0x7FF, 0 );
4064 if ( aSig
== 0 ) return packFloat64( zSign
, 0, 0 );
4065 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
4068 if ( bSig
== 0 ) return packFloat64( zSign
, 0, 0 );
4069 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
4071 zExp
= aExp
+ bExp
- 0x3FF;
4072 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<10;
4073 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
4074 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
4075 zSig0
|= ( zSig1
!= 0 );
4076 if ( 0 <= (int64_t) ( zSig0
<<1 ) ) {
4080 return roundAndPackFloat64(zSign
, zExp
, zSig0
, status
);
4084 /*----------------------------------------------------------------------------
4085 | Returns the result of dividing the double-precision floating-point value `a'
4086 | by the corresponding value `b'. The operation is performed according to
4087 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4088 *----------------------------------------------------------------------------*/
4090 float64
float64_div(float64 a
, float64 b
, float_status
*status
)
4092 flag aSign
, bSign
, zSign
;
4093 int aExp
, bExp
, zExp
;
4094 uint64_t aSig
, bSig
, zSig
;
4095 uint64_t rem0
, rem1
;
4096 uint64_t term0
, term1
;
4097 a
= float64_squash_input_denormal(a
, status
);
4098 b
= float64_squash_input_denormal(b
, status
);
4100 aSig
= extractFloat64Frac( a
);
4101 aExp
= extractFloat64Exp( a
);
4102 aSign
= extractFloat64Sign( a
);
4103 bSig
= extractFloat64Frac( b
);
4104 bExp
= extractFloat64Exp( b
);
4105 bSign
= extractFloat64Sign( b
);
4106 zSign
= aSign
^ bSign
;
4107 if ( aExp
== 0x7FF ) {
4109 return propagateFloat64NaN(a
, b
, status
);
4111 if ( bExp
== 0x7FF ) {
4113 return propagateFloat64NaN(a
, b
, status
);
4115 float_raise(float_flag_invalid
, status
);
4116 return float64_default_nan(status
);
4118 return packFloat64( zSign
, 0x7FF, 0 );
4120 if ( bExp
== 0x7FF ) {
4122 return propagateFloat64NaN(a
, b
, status
);
4124 return packFloat64( zSign
, 0, 0 );
4128 if ( ( aExp
| aSig
) == 0 ) {
4129 float_raise(float_flag_invalid
, status
);
4130 return float64_default_nan(status
);
4132 float_raise(float_flag_divbyzero
, status
);
4133 return packFloat64( zSign
, 0x7FF, 0 );
4135 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
4138 if ( aSig
== 0 ) return packFloat64( zSign
, 0, 0 );
4139 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
4141 zExp
= aExp
- bExp
+ 0x3FD;
4142 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<10;
4143 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
4144 if ( bSig
<= ( aSig
+ aSig
) ) {
4148 zSig
= estimateDiv128To64( aSig
, 0, bSig
);
4149 if ( ( zSig
& 0x1FF ) <= 2 ) {
4150 mul64To128( bSig
, zSig
, &term0
, &term1
);
4151 sub128( aSig
, 0, term0
, term1
, &rem0
, &rem1
);
4152 while ( (int64_t) rem0
< 0 ) {
4154 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
4156 zSig
|= ( rem1
!= 0 );
4158 return roundAndPackFloat64(zSign
, zExp
, zSig
, status
);
4162 /*----------------------------------------------------------------------------
4163 | Returns the remainder of the double-precision floating-point value `a'
4164 | with respect to the corresponding value `b'. The operation is performed
4165 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4166 *----------------------------------------------------------------------------*/
4168 float64
float64_rem(float64 a
, float64 b
, float_status
*status
)
4171 int aExp
, bExp
, expDiff
;
4172 uint64_t aSig
, bSig
;
4173 uint64_t q
, alternateASig
;
4176 a
= float64_squash_input_denormal(a
, status
);
4177 b
= float64_squash_input_denormal(b
, status
);
4178 aSig
= extractFloat64Frac( a
);
4179 aExp
= extractFloat64Exp( a
);
4180 aSign
= extractFloat64Sign( a
);
4181 bSig
= extractFloat64Frac( b
);
4182 bExp
= extractFloat64Exp( b
);
4183 if ( aExp
== 0x7FF ) {
4184 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
4185 return propagateFloat64NaN(a
, b
, status
);
4187 float_raise(float_flag_invalid
, status
);
4188 return float64_default_nan(status
);
4190 if ( bExp
== 0x7FF ) {
4192 return propagateFloat64NaN(a
, b
, status
);
4198 float_raise(float_flag_invalid
, status
);
4199 return float64_default_nan(status
);
4201 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
4204 if ( aSig
== 0 ) return a
;
4205 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
4207 expDiff
= aExp
- bExp
;
4208 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<11;
4209 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
4210 if ( expDiff
< 0 ) {
4211 if ( expDiff
< -1 ) return a
;
4214 q
= ( bSig
<= aSig
);
4215 if ( q
) aSig
-= bSig
;
4217 while ( 0 < expDiff
) {
4218 q
= estimateDiv128To64( aSig
, 0, bSig
);
4219 q
= ( 2 < q
) ? q
- 2 : 0;
4220 aSig
= - ( ( bSig
>>2 ) * q
);
4224 if ( 0 < expDiff
) {
4225 q
= estimateDiv128To64( aSig
, 0, bSig
);
4226 q
= ( 2 < q
) ? q
- 2 : 0;
4229 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
4236 alternateASig
= aSig
;
4239 } while ( 0 <= (int64_t) aSig
);
4240 sigMean
= aSig
+ alternateASig
;
4241 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
4242 aSig
= alternateASig
;
4244 zSign
= ( (int64_t) aSig
< 0 );
4245 if ( zSign
) aSig
= - aSig
;
4246 return normalizeRoundAndPackFloat64(aSign
^ zSign
, bExp
, aSig
, status
);
4250 /*----------------------------------------------------------------------------
4251 | Returns the result of multiplying the double-precision floating-point values
4252 | `a' and `b' then adding 'c', with no intermediate rounding step after the
4253 | multiplication. The operation is performed according to the IEC/IEEE
4254 | Standard for Binary Floating-Point Arithmetic 754-2008.
4255 | The flags argument allows the caller to select negation of the
4256 | addend, the intermediate product, or the final result. (The difference
4257 | between this and having the caller do a separate negation is that negating
4258 | externally will flip the sign bit on NaNs.)
4259 *----------------------------------------------------------------------------*/
4261 float64
float64_muladd(float64 a
, float64 b
, float64 c
, int flags
,
4262 float_status
*status
)
4264 flag aSign
, bSign
, cSign
, zSign
;
4265 int aExp
, bExp
, cExp
, pExp
, zExp
, expDiff
;
4266 uint64_t aSig
, bSig
, cSig
;
4267 flag pInf
, pZero
, pSign
;
4268 uint64_t pSig0
, pSig1
, cSig0
, cSig1
, zSig0
, zSig1
;
4270 flag signflip
, infzero
;
4272 a
= float64_squash_input_denormal(a
, status
);
4273 b
= float64_squash_input_denormal(b
, status
);
4274 c
= float64_squash_input_denormal(c
, status
);
4275 aSig
= extractFloat64Frac(a
);
4276 aExp
= extractFloat64Exp(a
);
4277 aSign
= extractFloat64Sign(a
);
4278 bSig
= extractFloat64Frac(b
);
4279 bExp
= extractFloat64Exp(b
);
4280 bSign
= extractFloat64Sign(b
);
4281 cSig
= extractFloat64Frac(c
);
4282 cExp
= extractFloat64Exp(c
);
4283 cSign
= extractFloat64Sign(c
);
4285 infzero
= ((aExp
== 0 && aSig
== 0 && bExp
== 0x7ff && bSig
== 0) ||
4286 (aExp
== 0x7ff && aSig
== 0 && bExp
== 0 && bSig
== 0));
4288 /* It is implementation-defined whether the cases of (0,inf,qnan)
4289 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
4290 * they return if they do), so we have to hand this information
4291 * off to the target-specific pick-a-NaN routine.
4293 if (((aExp
== 0x7ff) && aSig
) ||
4294 ((bExp
== 0x7ff) && bSig
) ||
4295 ((cExp
== 0x7ff) && cSig
)) {
4296 return propagateFloat64MulAddNaN(a
, b
, c
, infzero
, status
);
4300 float_raise(float_flag_invalid
, status
);
4301 return float64_default_nan(status
);
4304 if (flags
& float_muladd_negate_c
) {
4308 signflip
= (flags
& float_muladd_negate_result
) ? 1 : 0;
4310 /* Work out the sign and type of the product */
4311 pSign
= aSign
^ bSign
;
4312 if (flags
& float_muladd_negate_product
) {
4315 pInf
= (aExp
== 0x7ff) || (bExp
== 0x7ff);
4316 pZero
= ((aExp
| aSig
) == 0) || ((bExp
| bSig
) == 0);
4318 if (cExp
== 0x7ff) {
4319 if (pInf
&& (pSign
^ cSign
)) {
4320 /* addition of opposite-signed infinities => InvalidOperation */
4321 float_raise(float_flag_invalid
, status
);
4322 return float64_default_nan(status
);
4324 /* Otherwise generate an infinity of the same sign */
4325 return packFloat64(cSign
^ signflip
, 0x7ff, 0);
4329 return packFloat64(pSign
^ signflip
, 0x7ff, 0);
4335 /* Adding two exact zeroes */
4336 if (pSign
== cSign
) {
4338 } else if (status
->float_rounding_mode
== float_round_down
) {
4343 return packFloat64(zSign
^ signflip
, 0, 0);
4345 /* Exact zero plus a denorm */
4346 if (status
->flush_to_zero
) {
4347 float_raise(float_flag_output_denormal
, status
);
4348 return packFloat64(cSign
^ signflip
, 0, 0);
4351 /* Zero plus something non-zero : just return the something */
4352 if (flags
& float_muladd_halve_result
) {
4354 normalizeFloat64Subnormal(cSig
, &cExp
, &cSig
);
4356 /* Subtract one to halve, and one again because roundAndPackFloat64
4357 * wants one less than the true exponent.
4360 cSig
= (cSig
| 0x0010000000000000ULL
) << 10;
4361 return roundAndPackFloat64(cSign
^ signflip
, cExp
, cSig
, status
);
4363 return packFloat64(cSign
^ signflip
, cExp
, cSig
);
4367 normalizeFloat64Subnormal(aSig
, &aExp
, &aSig
);
4370 normalizeFloat64Subnormal(bSig
, &bExp
, &bSig
);
4373 /* Calculate the actual result a * b + c */
4375 /* Multiply first; this is easy. */
4376 /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff
4377 * because we want the true exponent, not the "one-less-than"
4378 * flavour that roundAndPackFloat64() takes.
4380 pExp
= aExp
+ bExp
- 0x3fe;
4381 aSig
= (aSig
| LIT64(0x0010000000000000))<<10;
4382 bSig
= (bSig
| LIT64(0x0010000000000000))<<11;
4383 mul64To128(aSig
, bSig
, &pSig0
, &pSig1
);
4384 if ((int64_t)(pSig0
<< 1) >= 0) {
4385 shortShift128Left(pSig0
, pSig1
, 1, &pSig0
, &pSig1
);
4389 zSign
= pSign
^ signflip
;
4391 /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit
4392 * bit in position 126.
4396 /* Throw out the special case of c being an exact zero now */
4397 shift128RightJamming(pSig0
, pSig1
, 64, &pSig0
, &pSig1
);
4398 if (flags
& float_muladd_halve_result
) {
4401 return roundAndPackFloat64(zSign
, pExp
- 1,
4404 normalizeFloat64Subnormal(cSig
, &cExp
, &cSig
);
4407 /* Shift cSig and add the explicit bit so [cSig0:cSig1] is the
4408 * significand of the addend, with the explicit bit in position 126.
4410 cSig0
= cSig
<< (126 - 64 - 52);
4412 cSig0
|= LIT64(0x4000000000000000);
4413 expDiff
= pExp
- cExp
;
4415 if (pSign
== cSign
) {
4418 /* scale c to match p */
4419 shift128RightJamming(cSig0
, cSig1
, expDiff
, &cSig0
, &cSig1
);
4421 } else if (expDiff
< 0) {
4422 /* scale p to match c */
4423 shift128RightJamming(pSig0
, pSig1
, -expDiff
, &pSig0
, &pSig1
);
4426 /* no scaling needed */
4429 /* Add significands and make sure explicit bit ends up in posn 126 */
4430 add128(pSig0
, pSig1
, cSig0
, cSig1
, &zSig0
, &zSig1
);
4431 if ((int64_t)zSig0
< 0) {
4432 shift128RightJamming(zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
4436 shift128RightJamming(zSig0
, zSig1
, 64, &zSig0
, &zSig1
);
4437 if (flags
& float_muladd_halve_result
) {
4440 return roundAndPackFloat64(zSign
, zExp
, zSig1
, status
);
4444 shift128RightJamming(cSig0
, cSig1
, expDiff
, &cSig0
, &cSig1
);
4445 sub128(pSig0
, pSig1
, cSig0
, cSig1
, &zSig0
, &zSig1
);
4447 } else if (expDiff
< 0) {
4448 shift128RightJamming(pSig0
, pSig1
, -expDiff
, &pSig0
, &pSig1
);
4449 sub128(cSig0
, cSig1
, pSig0
, pSig1
, &zSig0
, &zSig1
);
4454 if (lt128(cSig0
, cSig1
, pSig0
, pSig1
)) {
4455 sub128(pSig0
, pSig1
, cSig0
, cSig1
, &zSig0
, &zSig1
);
4456 } else if (lt128(pSig0
, pSig1
, cSig0
, cSig1
)) {
4457 sub128(cSig0
, cSig1
, pSig0
, pSig1
, &zSig0
, &zSig1
);
4462 if (status
->float_rounding_mode
== float_round_down
) {
4465 return packFloat64(zSign
, 0, 0);
4469 /* Do the equivalent of normalizeRoundAndPackFloat64() but
4470 * starting with the significand in a pair of uint64_t.
4473 shiftcount
= countLeadingZeros64(zSig0
) - 1;
4474 shortShift128Left(zSig0
, zSig1
, shiftcount
, &zSig0
, &zSig1
);
4480 shiftcount
= countLeadingZeros64(zSig1
);
4481 if (shiftcount
== 0) {
4482 zSig0
= (zSig1
>> 1) | (zSig1
& 1);
4486 zSig0
= zSig1
<< shiftcount
;
4487 zExp
-= (shiftcount
+ 64);
4490 if (flags
& float_muladd_halve_result
) {
4493 return roundAndPackFloat64(zSign
, zExp
, zSig0
, status
);
4497 /*----------------------------------------------------------------------------
4498 | Returns the square root of the double-precision floating-point value `a'.
4499 | The operation is performed according to the IEC/IEEE Standard for Binary
4500 | Floating-Point Arithmetic.
4501 *----------------------------------------------------------------------------*/
4503 float64
float64_sqrt(float64 a
, float_status
*status
)
4507 uint64_t aSig
, zSig
, doubleZSig
;
4508 uint64_t rem0
, rem1
, term0
, term1
;
4509 a
= float64_squash_input_denormal(a
, status
);
4511 aSig
= extractFloat64Frac( a
);
4512 aExp
= extractFloat64Exp( a
);
4513 aSign
= extractFloat64Sign( a
);
4514 if ( aExp
== 0x7FF ) {
4516 return propagateFloat64NaN(a
, a
, status
);
4518 if ( ! aSign
) return a
;
4519 float_raise(float_flag_invalid
, status
);
4520 return float64_default_nan(status
);
4523 if ( ( aExp
| aSig
) == 0 ) return a
;
4524 float_raise(float_flag_invalid
, status
);
4525 return float64_default_nan(status
);
4528 if ( aSig
== 0 ) return float64_zero
;
4529 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
4531 zExp
= ( ( aExp
- 0x3FF )>>1 ) + 0x3FE;
4532 aSig
|= LIT64( 0x0010000000000000 );
4533 zSig
= estimateSqrt32( aExp
, aSig
>>21 );
4534 aSig
<<= 9 - ( aExp
& 1 );
4535 zSig
= estimateDiv128To64( aSig
, 0, zSig
<<32 ) + ( zSig
<<30 );
4536 if ( ( zSig
& 0x1FF ) <= 5 ) {
4537 doubleZSig
= zSig
<<1;
4538 mul64To128( zSig
, zSig
, &term0
, &term1
);
4539 sub128( aSig
, 0, term0
, term1
, &rem0
, &rem1
);
4540 while ( (int64_t) rem0
< 0 ) {
4543 add128( rem0
, rem1
, zSig
>>63, doubleZSig
| 1, &rem0
, &rem1
);
4545 zSig
|= ( ( rem0
| rem1
) != 0 );
4547 return roundAndPackFloat64(0, zExp
, zSig
, status
);
4551 /*----------------------------------------------------------------------------
4552 | Returns the binary log of the double-precision floating-point value `a'.
4553 | The operation is performed according to the IEC/IEEE Standard for Binary
4554 | Floating-Point Arithmetic.
4555 *----------------------------------------------------------------------------*/
4556 float64
float64_log2(float64 a
, float_status
*status
)
4560 uint64_t aSig
, aSig0
, aSig1
, zSig
, i
;
4561 a
= float64_squash_input_denormal(a
, status
);
4563 aSig
= extractFloat64Frac( a
);
4564 aExp
= extractFloat64Exp( a
);
4565 aSign
= extractFloat64Sign( a
);
4568 if ( aSig
== 0 ) return packFloat64( 1, 0x7FF, 0 );
4569 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
4572 float_raise(float_flag_invalid
, status
);
4573 return float64_default_nan(status
);
4575 if ( aExp
== 0x7FF ) {
4577 return propagateFloat64NaN(a
, float64_zero
, status
);
4583 aSig
|= LIT64( 0x0010000000000000 );
4585 zSig
= (uint64_t)aExp
<< 52;
4586 for (i
= 1LL << 51; i
> 0; i
>>= 1) {
4587 mul64To128( aSig
, aSig
, &aSig0
, &aSig1
);
4588 aSig
= ( aSig0
<< 12 ) | ( aSig1
>> 52 );
4589 if ( aSig
& LIT64( 0x0020000000000000 ) ) {
4597 return normalizeRoundAndPackFloat64(zSign
, 0x408, zSig
, status
);
4600 /*----------------------------------------------------------------------------
4601 | Returns 1 if the double-precision floating-point value `a' is equal to the
4602 | corresponding value `b', and 0 otherwise. The invalid exception is raised
4603 | if either operand is a NaN. Otherwise, the comparison is performed
4604 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4605 *----------------------------------------------------------------------------*/
4607 int float64_eq(float64 a
, float64 b
, float_status
*status
)
4610 a
= float64_squash_input_denormal(a
, status
);
4611 b
= float64_squash_input_denormal(b
, status
);
4613 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4614 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4616 float_raise(float_flag_invalid
, status
);
4619 av
= float64_val(a
);
4620 bv
= float64_val(b
);
4621 return ( av
== bv
) || ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
4625 /*----------------------------------------------------------------------------
4626 | Returns 1 if the double-precision floating-point value `a' is less than or
4627 | equal to the corresponding value `b', and 0 otherwise. The invalid
4628 | exception is raised if either operand is a NaN. The comparison is performed
4629 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4630 *----------------------------------------------------------------------------*/
4632 int float64_le(float64 a
, float64 b
, float_status
*status
)
4636 a
= float64_squash_input_denormal(a
, status
);
4637 b
= float64_squash_input_denormal(b
, status
);
4639 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4640 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4642 float_raise(float_flag_invalid
, status
);
4645 aSign
= extractFloat64Sign( a
);
4646 bSign
= extractFloat64Sign( b
);
4647 av
= float64_val(a
);
4648 bv
= float64_val(b
);
4649 if ( aSign
!= bSign
) return aSign
|| ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
4650 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
4654 /*----------------------------------------------------------------------------
4655 | Returns 1 if the double-precision floating-point value `a' is less than
4656 | the corresponding value `b', and 0 otherwise. The invalid exception is
4657 | raised if either operand is a NaN. The comparison is performed according
4658 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4659 *----------------------------------------------------------------------------*/
4661 int float64_lt(float64 a
, float64 b
, float_status
*status
)
4666 a
= float64_squash_input_denormal(a
, status
);
4667 b
= float64_squash_input_denormal(b
, status
);
4668 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4669 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4671 float_raise(float_flag_invalid
, status
);
4674 aSign
= extractFloat64Sign( a
);
4675 bSign
= extractFloat64Sign( b
);
4676 av
= float64_val(a
);
4677 bv
= float64_val(b
);
4678 if ( aSign
!= bSign
) return aSign
&& ( (uint64_t) ( ( av
| bv
)<<1 ) != 0 );
4679 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
4683 /*----------------------------------------------------------------------------
4684 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4685 | be compared, and 0 otherwise. The invalid exception is raised if either
4686 | operand is a NaN. The comparison is performed according to the IEC/IEEE
4687 | Standard for Binary Floating-Point Arithmetic.
4688 *----------------------------------------------------------------------------*/
4690 int float64_unordered(float64 a
, float64 b
, float_status
*status
)
4692 a
= float64_squash_input_denormal(a
, status
);
4693 b
= float64_squash_input_denormal(b
, status
);
4695 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4696 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4698 float_raise(float_flag_invalid
, status
);
4704 /*----------------------------------------------------------------------------
4705 | Returns 1 if the double-precision floating-point value `a' is equal to the
4706 | corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4707 | exception.The comparison is performed according to the IEC/IEEE Standard
4708 | for Binary Floating-Point Arithmetic.
4709 *----------------------------------------------------------------------------*/
4711 int float64_eq_quiet(float64 a
, float64 b
, float_status
*status
)
4714 a
= float64_squash_input_denormal(a
, status
);
4715 b
= float64_squash_input_denormal(b
, status
);
4717 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4718 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4720 if (float64_is_signaling_nan(a
, status
)
4721 || float64_is_signaling_nan(b
, status
)) {
4722 float_raise(float_flag_invalid
, status
);
4726 av
= float64_val(a
);
4727 bv
= float64_val(b
);
4728 return ( av
== bv
) || ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
4732 /*----------------------------------------------------------------------------
4733 | Returns 1 if the double-precision floating-point value `a' is less than or
4734 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4735 | cause an exception. Otherwise, the comparison is performed according to the
4736 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4737 *----------------------------------------------------------------------------*/
4739 int float64_le_quiet(float64 a
, float64 b
, float_status
*status
)
4743 a
= float64_squash_input_denormal(a
, status
);
4744 b
= float64_squash_input_denormal(b
, status
);
4746 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4747 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4749 if (float64_is_signaling_nan(a
, status
)
4750 || float64_is_signaling_nan(b
, status
)) {
4751 float_raise(float_flag_invalid
, status
);
4755 aSign
= extractFloat64Sign( a
);
4756 bSign
= extractFloat64Sign( b
);
4757 av
= float64_val(a
);
4758 bv
= float64_val(b
);
4759 if ( aSign
!= bSign
) return aSign
|| ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
4760 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
4764 /*----------------------------------------------------------------------------
4765 | Returns 1 if the double-precision floating-point value `a' is less than
4766 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4767 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
4768 | Standard for Binary Floating-Point Arithmetic.
4769 *----------------------------------------------------------------------------*/
4771 int float64_lt_quiet(float64 a
, float64 b
, float_status
*status
)
4775 a
= float64_squash_input_denormal(a
, status
);
4776 b
= float64_squash_input_denormal(b
, status
);
4778 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4779 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4781 if (float64_is_signaling_nan(a
, status
)
4782 || float64_is_signaling_nan(b
, status
)) {
4783 float_raise(float_flag_invalid
, status
);
4787 aSign
= extractFloat64Sign( a
);
4788 bSign
= extractFloat64Sign( b
);
4789 av
= float64_val(a
);
4790 bv
= float64_val(b
);
4791 if ( aSign
!= bSign
) return aSign
&& ( (uint64_t) ( ( av
| bv
)<<1 ) != 0 );
4792 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
4796 /*----------------------------------------------------------------------------
4797 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4798 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
4799 | comparison is performed according to the IEC/IEEE Standard for Binary
4800 | Floating-Point Arithmetic.
4801 *----------------------------------------------------------------------------*/
4803 int float64_unordered_quiet(float64 a
, float64 b
, float_status
*status
)
4805 a
= float64_squash_input_denormal(a
, status
);
4806 b
= float64_squash_input_denormal(b
, status
);
4808 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4809 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4811 if (float64_is_signaling_nan(a
, status
)
4812 || float64_is_signaling_nan(b
, status
)) {
4813 float_raise(float_flag_invalid
, status
);
4820 /*----------------------------------------------------------------------------
4821 | Returns the result of converting the extended double-precision floating-
4822 | point value `a' to the 32-bit two's complement integer format. The
4823 | conversion is performed according to the IEC/IEEE Standard for Binary
4824 | Floating-Point Arithmetic---which means in particular that the conversion
4825 | is rounded according to the current rounding mode. If `a' is a NaN, the
4826 | largest positive integer is returned. Otherwise, if the conversion
4827 | overflows, the largest integer with the same sign as `a' is returned.
4828 *----------------------------------------------------------------------------*/
4830 int32_t floatx80_to_int32(floatx80 a
, float_status
*status
)
4833 int32_t aExp
, shiftCount
;
4836 if (floatx80_invalid_encoding(a
)) {
4837 float_raise(float_flag_invalid
, status
);
4840 aSig
= extractFloatx80Frac( a
);
4841 aExp
= extractFloatx80Exp( a
);
4842 aSign
= extractFloatx80Sign( a
);
4843 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) aSign
= 0;
4844 shiftCount
= 0x4037 - aExp
;
4845 if ( shiftCount
<= 0 ) shiftCount
= 1;
4846 shift64RightJamming( aSig
, shiftCount
, &aSig
);
4847 return roundAndPackInt32(aSign
, aSig
, status
);
4851 /*----------------------------------------------------------------------------
4852 | Returns the result of converting the extended double-precision floating-
4853 | point value `a' to the 32-bit two's complement integer format. The
4854 | conversion is performed according to the IEC/IEEE Standard for Binary
4855 | Floating-Point Arithmetic, except that the conversion is always rounded
4856 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4857 | Otherwise, if the conversion overflows, the largest integer with the same
4858 | sign as `a' is returned.
4859 *----------------------------------------------------------------------------*/
4861 int32_t floatx80_to_int32_round_to_zero(floatx80 a
, float_status
*status
)
4864 int32_t aExp
, shiftCount
;
4865 uint64_t aSig
, savedASig
;
4868 if (floatx80_invalid_encoding(a
)) {
4869 float_raise(float_flag_invalid
, status
);
4872 aSig
= extractFloatx80Frac( a
);
4873 aExp
= extractFloatx80Exp( a
);
4874 aSign
= extractFloatx80Sign( a
);
4875 if ( 0x401E < aExp
) {
4876 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) aSign
= 0;
4879 else if ( aExp
< 0x3FFF ) {
4881 status
->float_exception_flags
|= float_flag_inexact
;
4885 shiftCount
= 0x403E - aExp
;
4887 aSig
>>= shiftCount
;
4889 if ( aSign
) z
= - z
;
4890 if ( ( z
< 0 ) ^ aSign
) {
4892 float_raise(float_flag_invalid
, status
);
4893 return aSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
4895 if ( ( aSig
<<shiftCount
) != savedASig
) {
4896 status
->float_exception_flags
|= float_flag_inexact
;
4902 /*----------------------------------------------------------------------------
4903 | Returns the result of converting the extended double-precision floating-
4904 | point value `a' to the 64-bit two's complement integer format. The
4905 | conversion is performed according to the IEC/IEEE Standard for Binary
4906 | Floating-Point Arithmetic---which means in particular that the conversion
4907 | is rounded according to the current rounding mode. If `a' is a NaN,
4908 | the largest positive integer is returned. Otherwise, if the conversion
4909 | overflows, the largest integer with the same sign as `a' is returned.
4910 *----------------------------------------------------------------------------*/
4912 int64_t floatx80_to_int64(floatx80 a
, float_status
*status
)
4915 int32_t aExp
, shiftCount
;
4916 uint64_t aSig
, aSigExtra
;
4918 if (floatx80_invalid_encoding(a
)) {
4919 float_raise(float_flag_invalid
, status
);
4922 aSig
= extractFloatx80Frac( a
);
4923 aExp
= extractFloatx80Exp( a
);
4924 aSign
= extractFloatx80Sign( a
);
4925 shiftCount
= 0x403E - aExp
;
4926 if ( shiftCount
<= 0 ) {
4928 float_raise(float_flag_invalid
, status
);
4930 || ( ( aExp
== 0x7FFF )
4931 && ( aSig
!= LIT64( 0x8000000000000000 ) ) )
4933 return LIT64( 0x7FFFFFFFFFFFFFFF );
4935 return (int64_t) LIT64( 0x8000000000000000 );
4940 shift64ExtraRightJamming( aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
4942 return roundAndPackInt64(aSign
, aSig
, aSigExtra
, status
);
4946 /*----------------------------------------------------------------------------
4947 | Returns the result of converting the extended double-precision floating-
4948 | point value `a' to the 64-bit two's complement integer format. The
4949 | conversion is performed according to the IEC/IEEE Standard for Binary
4950 | Floating-Point Arithmetic, except that the conversion is always rounded
4951 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4952 | Otherwise, if the conversion overflows, the largest integer with the same
4953 | sign as `a' is returned.
4954 *----------------------------------------------------------------------------*/
4956 int64_t floatx80_to_int64_round_to_zero(floatx80 a
, float_status
*status
)
4959 int32_t aExp
, shiftCount
;
4963 if (floatx80_invalid_encoding(a
)) {
4964 float_raise(float_flag_invalid
, status
);
4967 aSig
= extractFloatx80Frac( a
);
4968 aExp
= extractFloatx80Exp( a
);
4969 aSign
= extractFloatx80Sign( a
);
4970 shiftCount
= aExp
- 0x403E;
4971 if ( 0 <= shiftCount
) {
4972 aSig
&= LIT64( 0x7FFFFFFFFFFFFFFF );
4973 if ( ( a
.high
!= 0xC03E ) || aSig
) {
4974 float_raise(float_flag_invalid
, status
);
4975 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && aSig
) ) {
4976 return LIT64( 0x7FFFFFFFFFFFFFFF );
4979 return (int64_t) LIT64( 0x8000000000000000 );
4981 else if ( aExp
< 0x3FFF ) {
4983 status
->float_exception_flags
|= float_flag_inexact
;
4987 z
= aSig
>>( - shiftCount
);
4988 if ( (uint64_t) ( aSig
<<( shiftCount
& 63 ) ) ) {
4989 status
->float_exception_flags
|= float_flag_inexact
;
4991 if ( aSign
) z
= - z
;
4996 /*----------------------------------------------------------------------------
4997 | Returns the result of converting the extended double-precision floating-
4998 | point value `a' to the single-precision floating-point format. The
4999 | conversion is performed according to the IEC/IEEE Standard for Binary
5000 | Floating-Point Arithmetic.
5001 *----------------------------------------------------------------------------*/
5003 float32
floatx80_to_float32(floatx80 a
, float_status
*status
)
5009 if (floatx80_invalid_encoding(a
)) {
5010 float_raise(float_flag_invalid
, status
);
5011 return float32_default_nan(status
);
5013 aSig
= extractFloatx80Frac( a
);
5014 aExp
= extractFloatx80Exp( a
);
5015 aSign
= extractFloatx80Sign( a
);
5016 if ( aExp
== 0x7FFF ) {
5017 if ( (uint64_t) ( aSig
<<1 ) ) {
5018 return commonNaNToFloat32(floatx80ToCommonNaN(a
, status
), status
);
5020 return packFloat32( aSign
, 0xFF, 0 );
5022 shift64RightJamming( aSig
, 33, &aSig
);
5023 if ( aExp
|| aSig
) aExp
-= 0x3F81;
5024 return roundAndPackFloat32(aSign
, aExp
, aSig
, status
);
5028 /*----------------------------------------------------------------------------
5029 | Returns the result of converting the extended double-precision floating-
5030 | point value `a' to the double-precision floating-point format. The
5031 | conversion is performed according to the IEC/IEEE Standard for Binary
5032 | Floating-Point Arithmetic.
5033 *----------------------------------------------------------------------------*/
5035 float64
floatx80_to_float64(floatx80 a
, float_status
*status
)
5039 uint64_t aSig
, zSig
;
5041 if (floatx80_invalid_encoding(a
)) {
5042 float_raise(float_flag_invalid
, status
);
5043 return float64_default_nan(status
);
5045 aSig
= extractFloatx80Frac( a
);
5046 aExp
= extractFloatx80Exp( a
);
5047 aSign
= extractFloatx80Sign( a
);
5048 if ( aExp
== 0x7FFF ) {
5049 if ( (uint64_t) ( aSig
<<1 ) ) {
5050 return commonNaNToFloat64(floatx80ToCommonNaN(a
, status
), status
);
5052 return packFloat64( aSign
, 0x7FF, 0 );
5054 shift64RightJamming( aSig
, 1, &zSig
);
5055 if ( aExp
|| aSig
) aExp
-= 0x3C01;
5056 return roundAndPackFloat64(aSign
, aExp
, zSig
, status
);
5060 /*----------------------------------------------------------------------------
5061 | Returns the result of converting the extended double-precision floating-
5062 | point value `a' to the quadruple-precision floating-point format. The
5063 | conversion is performed according to the IEC/IEEE Standard for Binary
5064 | Floating-Point Arithmetic.
5065 *----------------------------------------------------------------------------*/
5067 float128
floatx80_to_float128(floatx80 a
, float_status
*status
)
5071 uint64_t aSig
, zSig0
, zSig1
;
5073 if (floatx80_invalid_encoding(a
)) {
5074 float_raise(float_flag_invalid
, status
);
5075 return float128_default_nan(status
);
5077 aSig
= extractFloatx80Frac( a
);
5078 aExp
= extractFloatx80Exp( a
);
5079 aSign
= extractFloatx80Sign( a
);
5080 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) {
5081 return commonNaNToFloat128(floatx80ToCommonNaN(a
, status
), status
);
5083 shift128Right( aSig
<<1, 0, 16, &zSig0
, &zSig1
);
5084 return packFloat128( aSign
, aExp
, zSig0
, zSig1
);
5088 /*----------------------------------------------------------------------------
5089 | Rounds the extended double-precision floating-point value `a'
5090 | to the precision provided by floatx80_rounding_precision and returns the
5091 | result as an extended double-precision floating-point value.
5092 | The operation is performed according to the IEC/IEEE Standard for Binary
5093 | Floating-Point Arithmetic.
5094 *----------------------------------------------------------------------------*/
5096 floatx80
floatx80_round(floatx80 a
, float_status
*status
)
5098 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5099 extractFloatx80Sign(a
),
5100 extractFloatx80Exp(a
),
5101 extractFloatx80Frac(a
), 0, status
);
5104 /*----------------------------------------------------------------------------
5105 | Rounds the extended double-precision floating-point value `a' to an integer,
5106 | and returns the result as an extended quadruple-precision floating-point
5107 | value. The operation is performed according to the IEC/IEEE Standard for
5108 | Binary Floating-Point Arithmetic.
5109 *----------------------------------------------------------------------------*/
5111 floatx80
floatx80_round_to_int(floatx80 a
, float_status
*status
)
5115 uint64_t lastBitMask
, roundBitsMask
;
5118 if (floatx80_invalid_encoding(a
)) {
5119 float_raise(float_flag_invalid
, status
);
5120 return floatx80_default_nan(status
);
5122 aExp
= extractFloatx80Exp( a
);
5123 if ( 0x403E <= aExp
) {
5124 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) ) {
5125 return propagateFloatx80NaN(a
, a
, status
);
5129 if ( aExp
< 0x3FFF ) {
5131 && ( (uint64_t) ( extractFloatx80Frac( a
)<<1 ) == 0 ) ) {
5134 status
->float_exception_flags
|= float_flag_inexact
;
5135 aSign
= extractFloatx80Sign( a
);
5136 switch (status
->float_rounding_mode
) {
5137 case float_round_nearest_even
:
5138 if ( ( aExp
== 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a
)<<1 )
5141 packFloatx80( aSign
, 0x3FFF, LIT64( 0x8000000000000000 ) );
5144 case float_round_ties_away
:
5145 if (aExp
== 0x3FFE) {
5146 return packFloatx80(aSign
, 0x3FFF, LIT64(0x8000000000000000));
5149 case float_round_down
:
5152 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
5153 : packFloatx80( 0, 0, 0 );
5154 case float_round_up
:
5156 aSign
? packFloatx80( 1, 0, 0 )
5157 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
5159 return packFloatx80( aSign
, 0, 0 );
5162 lastBitMask
<<= 0x403E - aExp
;
5163 roundBitsMask
= lastBitMask
- 1;
5165 switch (status
->float_rounding_mode
) {
5166 case float_round_nearest_even
:
5167 z
.low
+= lastBitMask
>>1;
5168 if ((z
.low
& roundBitsMask
) == 0) {
5169 z
.low
&= ~lastBitMask
;
5172 case float_round_ties_away
:
5173 z
.low
+= lastBitMask
>> 1;
5175 case float_round_to_zero
:
5177 case float_round_up
:
5178 if (!extractFloatx80Sign(z
)) {
5179 z
.low
+= roundBitsMask
;
5182 case float_round_down
:
5183 if (extractFloatx80Sign(z
)) {
5184 z
.low
+= roundBitsMask
;
5190 z
.low
&= ~ roundBitsMask
;
5193 z
.low
= LIT64( 0x8000000000000000 );
5195 if (z
.low
!= a
.low
) {
5196 status
->float_exception_flags
|= float_flag_inexact
;
5202 /*----------------------------------------------------------------------------
5203 | Returns the result of adding the absolute values of the extended double-
5204 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
5205 | negated before being returned. `zSign' is ignored if the result is a NaN.
5206 | The addition is performed according to the IEC/IEEE Standard for Binary
5207 | Floating-Point Arithmetic.
5208 *----------------------------------------------------------------------------*/
5210 static floatx80
addFloatx80Sigs(floatx80 a
, floatx80 b
, flag zSign
,
5211 float_status
*status
)
5213 int32_t aExp
, bExp
, zExp
;
5214 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5217 aSig
= extractFloatx80Frac( a
);
5218 aExp
= extractFloatx80Exp( a
);
5219 bSig
= extractFloatx80Frac( b
);
5220 bExp
= extractFloatx80Exp( b
);
5221 expDiff
= aExp
- bExp
;
5222 if ( 0 < expDiff
) {
5223 if ( aExp
== 0x7FFF ) {
5224 if ((uint64_t)(aSig
<< 1)) {
5225 return propagateFloatx80NaN(a
, b
, status
);
5229 if ( bExp
== 0 ) --expDiff
;
5230 shift64ExtraRightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
5233 else if ( expDiff
< 0 ) {
5234 if ( bExp
== 0x7FFF ) {
5235 if ((uint64_t)(bSig
<< 1)) {
5236 return propagateFloatx80NaN(a
, b
, status
);
5238 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
5240 if ( aExp
== 0 ) ++expDiff
;
5241 shift64ExtraRightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
5245 if ( aExp
== 0x7FFF ) {
5246 if ( (uint64_t) ( ( aSig
| bSig
)<<1 ) ) {
5247 return propagateFloatx80NaN(a
, b
, status
);
5252 zSig0
= aSig
+ bSig
;
5254 normalizeFloatx80Subnormal( zSig0
, &zExp
, &zSig0
);
5260 zSig0
= aSig
+ bSig
;
5261 if ( (int64_t) zSig0
< 0 ) goto roundAndPack
;
5263 shift64ExtraRightJamming( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
5264 zSig0
|= LIT64( 0x8000000000000000 );
5267 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5268 zSign
, zExp
, zSig0
, zSig1
, status
);
5271 /*----------------------------------------------------------------------------
5272 | Returns the result of subtracting the absolute values of the extended
5273 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
5274 | difference is negated before being returned. `zSign' is ignored if the
5275 | result is a NaN. The subtraction is performed according to the IEC/IEEE
5276 | Standard for Binary Floating-Point Arithmetic.
5277 *----------------------------------------------------------------------------*/
5279 static floatx80
subFloatx80Sigs(floatx80 a
, floatx80 b
, flag zSign
,
5280 float_status
*status
)
5282 int32_t aExp
, bExp
, zExp
;
5283 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5286 aSig
= extractFloatx80Frac( a
);
5287 aExp
= extractFloatx80Exp( a
);
5288 bSig
= extractFloatx80Frac( b
);
5289 bExp
= extractFloatx80Exp( b
);
5290 expDiff
= aExp
- bExp
;
5291 if ( 0 < expDiff
) goto aExpBigger
;
5292 if ( expDiff
< 0 ) goto bExpBigger
;
5293 if ( aExp
== 0x7FFF ) {
5294 if ( (uint64_t) ( ( aSig
| bSig
)<<1 ) ) {
5295 return propagateFloatx80NaN(a
, b
, status
);
5297 float_raise(float_flag_invalid
, status
);
5298 return floatx80_default_nan(status
);
5305 if ( bSig
< aSig
) goto aBigger
;
5306 if ( aSig
< bSig
) goto bBigger
;
5307 return packFloatx80(status
->float_rounding_mode
== float_round_down
, 0, 0);
5309 if ( bExp
== 0x7FFF ) {
5310 if ((uint64_t)(bSig
<< 1)) {
5311 return propagateFloatx80NaN(a
, b
, status
);
5313 return packFloatx80( zSign
^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
5315 if ( aExp
== 0 ) ++expDiff
;
5316 shift128RightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
5318 sub128( bSig
, 0, aSig
, zSig1
, &zSig0
, &zSig1
);
5321 goto normalizeRoundAndPack
;
5323 if ( aExp
== 0x7FFF ) {
5324 if ((uint64_t)(aSig
<< 1)) {
5325 return propagateFloatx80NaN(a
, b
, status
);
5329 if ( bExp
== 0 ) --expDiff
;
5330 shift128RightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
5332 sub128( aSig
, 0, bSig
, zSig1
, &zSig0
, &zSig1
);
5334 normalizeRoundAndPack
:
5335 return normalizeRoundAndPackFloatx80(status
->floatx80_rounding_precision
,
5336 zSign
, zExp
, zSig0
, zSig1
, status
);
5339 /*----------------------------------------------------------------------------
5340 | Returns the result of adding the extended double-precision floating-point
5341 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5342 | Standard for Binary Floating-Point Arithmetic.
5343 *----------------------------------------------------------------------------*/
5345 floatx80
floatx80_add(floatx80 a
, floatx80 b
, float_status
*status
)
5349 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5350 float_raise(float_flag_invalid
, status
);
5351 return floatx80_default_nan(status
);
5353 aSign
= extractFloatx80Sign( a
);
5354 bSign
= extractFloatx80Sign( b
);
5355 if ( aSign
== bSign
) {
5356 return addFloatx80Sigs(a
, b
, aSign
, status
);
5359 return subFloatx80Sigs(a
, b
, aSign
, status
);
5364 /*----------------------------------------------------------------------------
5365 | Returns the result of subtracting the extended double-precision floating-
5366 | point values `a' and `b'. The operation is performed according to the
5367 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5368 *----------------------------------------------------------------------------*/
5370 floatx80
floatx80_sub(floatx80 a
, floatx80 b
, float_status
*status
)
5374 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5375 float_raise(float_flag_invalid
, status
);
5376 return floatx80_default_nan(status
);
5378 aSign
= extractFloatx80Sign( a
);
5379 bSign
= extractFloatx80Sign( b
);
5380 if ( aSign
== bSign
) {
5381 return subFloatx80Sigs(a
, b
, aSign
, status
);
5384 return addFloatx80Sigs(a
, b
, aSign
, status
);
5389 /*----------------------------------------------------------------------------
5390 | Returns the result of multiplying the extended double-precision floating-
5391 | point values `a' and `b'. The operation is performed according to the
5392 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5393 *----------------------------------------------------------------------------*/
5395 floatx80
floatx80_mul(floatx80 a
, floatx80 b
, float_status
*status
)
5397 flag aSign
, bSign
, zSign
;
5398 int32_t aExp
, bExp
, zExp
;
5399 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5401 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5402 float_raise(float_flag_invalid
, status
);
5403 return floatx80_default_nan(status
);
5405 aSig
= extractFloatx80Frac( a
);
5406 aExp
= extractFloatx80Exp( a
);
5407 aSign
= extractFloatx80Sign( a
);
5408 bSig
= extractFloatx80Frac( b
);
5409 bExp
= extractFloatx80Exp( b
);
5410 bSign
= extractFloatx80Sign( b
);
5411 zSign
= aSign
^ bSign
;
5412 if ( aExp
== 0x7FFF ) {
5413 if ( (uint64_t) ( aSig
<<1 )
5414 || ( ( bExp
== 0x7FFF ) && (uint64_t) ( bSig
<<1 ) ) ) {
5415 return propagateFloatx80NaN(a
, b
, status
);
5417 if ( ( bExp
| bSig
) == 0 ) goto invalid
;
5418 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
5420 if ( bExp
== 0x7FFF ) {
5421 if ((uint64_t)(bSig
<< 1)) {
5422 return propagateFloatx80NaN(a
, b
, status
);
5424 if ( ( aExp
| aSig
) == 0 ) {
5426 float_raise(float_flag_invalid
, status
);
5427 return floatx80_default_nan(status
);
5429 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
5432 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
5433 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
5436 if ( bSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
5437 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
5439 zExp
= aExp
+ bExp
- 0x3FFE;
5440 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
5441 if ( 0 < (int64_t) zSig0
) {
5442 shortShift128Left( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
5445 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5446 zSign
, zExp
, zSig0
, zSig1
, status
);
5449 /*----------------------------------------------------------------------------
5450 | Returns the result of dividing the extended double-precision floating-point
5451 | value `a' by the corresponding value `b'. The operation is performed
5452 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5453 *----------------------------------------------------------------------------*/
5455 floatx80
floatx80_div(floatx80 a
, floatx80 b
, float_status
*status
)
5457 flag aSign
, bSign
, zSign
;
5458 int32_t aExp
, bExp
, zExp
;
5459 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5460 uint64_t rem0
, rem1
, rem2
, term0
, term1
, term2
;
5462 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5463 float_raise(float_flag_invalid
, status
);
5464 return floatx80_default_nan(status
);
5466 aSig
= extractFloatx80Frac( a
);
5467 aExp
= extractFloatx80Exp( a
);
5468 aSign
= extractFloatx80Sign( a
);
5469 bSig
= extractFloatx80Frac( b
);
5470 bExp
= extractFloatx80Exp( b
);
5471 bSign
= extractFloatx80Sign( b
);
5472 zSign
= aSign
^ bSign
;
5473 if ( aExp
== 0x7FFF ) {
5474 if ((uint64_t)(aSig
<< 1)) {
5475 return propagateFloatx80NaN(a
, b
, status
);
5477 if ( bExp
== 0x7FFF ) {
5478 if ((uint64_t)(bSig
<< 1)) {
5479 return propagateFloatx80NaN(a
, b
, status
);
5483 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
5485 if ( bExp
== 0x7FFF ) {
5486 if ((uint64_t)(bSig
<< 1)) {
5487 return propagateFloatx80NaN(a
, b
, status
);
5489 return packFloatx80( zSign
, 0, 0 );
5493 if ( ( aExp
| aSig
) == 0 ) {
5495 float_raise(float_flag_invalid
, status
);
5496 return floatx80_default_nan(status
);
5498 float_raise(float_flag_divbyzero
, status
);
5499 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
5501 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
5504 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
5505 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
5507 zExp
= aExp
- bExp
+ 0x3FFE;
5509 if ( bSig
<= aSig
) {
5510 shift128Right( aSig
, 0, 1, &aSig
, &rem1
);
5513 zSig0
= estimateDiv128To64( aSig
, rem1
, bSig
);
5514 mul64To128( bSig
, zSig0
, &term0
, &term1
);
5515 sub128( aSig
, rem1
, term0
, term1
, &rem0
, &rem1
);
5516 while ( (int64_t) rem0
< 0 ) {
5518 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
5520 zSig1
= estimateDiv128To64( rem1
, 0, bSig
);
5521 if ( (uint64_t) ( zSig1
<<1 ) <= 8 ) {
5522 mul64To128( bSig
, zSig1
, &term1
, &term2
);
5523 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
5524 while ( (int64_t) rem1
< 0 ) {
5526 add128( rem1
, rem2
, 0, bSig
, &rem1
, &rem2
);
5528 zSig1
|= ( ( rem1
| rem2
) != 0 );
5530 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5531 zSign
, zExp
, zSig0
, zSig1
, status
);
5534 /*----------------------------------------------------------------------------
5535 | Returns the remainder of the extended double-precision floating-point value
5536 | `a' with respect to the corresponding value `b'. The operation is performed
5537 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5538 *----------------------------------------------------------------------------*/
5540 floatx80
floatx80_rem(floatx80 a
, floatx80 b
, float_status
*status
)
5543 int32_t aExp
, bExp
, expDiff
;
5544 uint64_t aSig0
, aSig1
, bSig
;
5545 uint64_t q
, term0
, term1
, alternateASig0
, alternateASig1
;
5547 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5548 float_raise(float_flag_invalid
, status
);
5549 return floatx80_default_nan(status
);
5551 aSig0
= extractFloatx80Frac( a
);
5552 aExp
= extractFloatx80Exp( a
);
5553 aSign
= extractFloatx80Sign( a
);
5554 bSig
= extractFloatx80Frac( b
);
5555 bExp
= extractFloatx80Exp( b
);
5556 if ( aExp
== 0x7FFF ) {
5557 if ( (uint64_t) ( aSig0
<<1 )
5558 || ( ( bExp
== 0x7FFF ) && (uint64_t) ( bSig
<<1 ) ) ) {
5559 return propagateFloatx80NaN(a
, b
, status
);
5563 if ( bExp
== 0x7FFF ) {
5564 if ((uint64_t)(bSig
<< 1)) {
5565 return propagateFloatx80NaN(a
, b
, status
);
5572 float_raise(float_flag_invalid
, status
);
5573 return floatx80_default_nan(status
);
5575 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
5578 if ( (uint64_t) ( aSig0
<<1 ) == 0 ) return a
;
5579 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
5581 bSig
|= LIT64( 0x8000000000000000 );
5583 expDiff
= aExp
- bExp
;
5585 if ( expDiff
< 0 ) {
5586 if ( expDiff
< -1 ) return a
;
5587 shift128Right( aSig0
, 0, 1, &aSig0
, &aSig1
);
5590 q
= ( bSig
<= aSig0
);
5591 if ( q
) aSig0
-= bSig
;
5593 while ( 0 < expDiff
) {
5594 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
5595 q
= ( 2 < q
) ? q
- 2 : 0;
5596 mul64To128( bSig
, q
, &term0
, &term1
);
5597 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
5598 shortShift128Left( aSig0
, aSig1
, 62, &aSig0
, &aSig1
);
5602 if ( 0 < expDiff
) {
5603 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
5604 q
= ( 2 < q
) ? q
- 2 : 0;
5606 mul64To128( bSig
, q
<<( 64 - expDiff
), &term0
, &term1
);
5607 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
5608 shortShift128Left( 0, bSig
, 64 - expDiff
, &term0
, &term1
);
5609 while ( le128( term0
, term1
, aSig0
, aSig1
) ) {
5611 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
5618 sub128( term0
, term1
, aSig0
, aSig1
, &alternateASig0
, &alternateASig1
);
5619 if ( lt128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
5620 || ( eq128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
5623 aSig0
= alternateASig0
;
5624 aSig1
= alternateASig1
;
5628 normalizeRoundAndPackFloatx80(
5629 80, zSign
, bExp
+ expDiff
, aSig0
, aSig1
, status
);
5633 /*----------------------------------------------------------------------------
5634 | Returns the square root of the extended double-precision floating-point
5635 | value `a'. The operation is performed according to the IEC/IEEE Standard
5636 | for Binary Floating-Point Arithmetic.
5637 *----------------------------------------------------------------------------*/
5639 floatx80
floatx80_sqrt(floatx80 a
, float_status
*status
)
5643 uint64_t aSig0
, aSig1
, zSig0
, zSig1
, doubleZSig0
;
5644 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
5646 if (floatx80_invalid_encoding(a
)) {
5647 float_raise(float_flag_invalid
, status
);
5648 return floatx80_default_nan(status
);
5650 aSig0
= extractFloatx80Frac( a
);
5651 aExp
= extractFloatx80Exp( a
);
5652 aSign
= extractFloatx80Sign( a
);
5653 if ( aExp
== 0x7FFF ) {
5654 if ((uint64_t)(aSig0
<< 1)) {
5655 return propagateFloatx80NaN(a
, a
, status
);
5657 if ( ! aSign
) return a
;
5661 if ( ( aExp
| aSig0
) == 0 ) return a
;
5663 float_raise(float_flag_invalid
, status
);
5664 return floatx80_default_nan(status
);
5667 if ( aSig0
== 0 ) return packFloatx80( 0, 0, 0 );
5668 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
5670 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFF;
5671 zSig0
= estimateSqrt32( aExp
, aSig0
>>32 );
5672 shift128Right( aSig0
, 0, 2 + ( aExp
& 1 ), &aSig0
, &aSig1
);
5673 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
5674 doubleZSig0
= zSig0
<<1;
5675 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
5676 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
5677 while ( (int64_t) rem0
< 0 ) {
5680 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
5682 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
5683 if ( ( zSig1
& LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5684 if ( zSig1
== 0 ) zSig1
= 1;
5685 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
5686 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
5687 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
5688 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
5689 while ( (int64_t) rem1
< 0 ) {
5691 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
5693 term2
|= doubleZSig0
;
5694 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
5696 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
5698 shortShift128Left( 0, zSig1
, 1, &zSig0
, &zSig1
);
5699 zSig0
|= doubleZSig0
;
5700 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5701 0, zExp
, zSig0
, zSig1
, status
);
5704 /*----------------------------------------------------------------------------
5705 | Returns 1 if the extended double-precision floating-point value `a' is equal
5706 | to the corresponding value `b', and 0 otherwise. The invalid exception is
5707 | raised if either operand is a NaN. Otherwise, the comparison is performed
5708 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5709 *----------------------------------------------------------------------------*/
5711 int floatx80_eq(floatx80 a
, floatx80 b
, float_status
*status
)
5714 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)
5715 || (extractFloatx80Exp(a
) == 0x7FFF
5716 && (uint64_t) (extractFloatx80Frac(a
) << 1))
5717 || (extractFloatx80Exp(b
) == 0x7FFF
5718 && (uint64_t) (extractFloatx80Frac(b
) << 1))
5720 float_raise(float_flag_invalid
, status
);
5725 && ( ( a
.high
== b
.high
)
5727 && ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
5732 /*----------------------------------------------------------------------------
5733 | Returns 1 if the extended double-precision floating-point value `a' is
5734 | less than or equal to the corresponding value `b', and 0 otherwise. The
5735 | invalid exception is raised if either operand is a NaN. The comparison is
5736 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5738 *----------------------------------------------------------------------------*/
5740 int floatx80_le(floatx80 a
, floatx80 b
, float_status
*status
)
5744 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)
5745 || (extractFloatx80Exp(a
) == 0x7FFF
5746 && (uint64_t) (extractFloatx80Frac(a
) << 1))
5747 || (extractFloatx80Exp(b
) == 0x7FFF
5748 && (uint64_t) (extractFloatx80Frac(b
) << 1))
5750 float_raise(float_flag_invalid
, status
);
5753 aSign
= extractFloatx80Sign( a
);
5754 bSign
= extractFloatx80Sign( b
);
5755 if ( aSign
!= bSign
) {
5758 || ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5762 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
5763 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
5767 /*----------------------------------------------------------------------------
5768 | Returns 1 if the extended double-precision floating-point value `a' is
5769 | less than the corresponding value `b', and 0 otherwise. The invalid
5770 | exception is raised if either operand is a NaN. The comparison is performed
5771 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5772 *----------------------------------------------------------------------------*/
5774 int floatx80_lt(floatx80 a
, floatx80 b
, float_status
*status
)
5778 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)
5779 || (extractFloatx80Exp(a
) == 0x7FFF
5780 && (uint64_t) (extractFloatx80Frac(a
) << 1))
5781 || (extractFloatx80Exp(b
) == 0x7FFF
5782 && (uint64_t) (extractFloatx80Frac(b
) << 1))
5784 float_raise(float_flag_invalid
, status
);
5787 aSign
= extractFloatx80Sign( a
);
5788 bSign
= extractFloatx80Sign( b
);
5789 if ( aSign
!= bSign
) {
5792 && ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5796 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
5797 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
5801 /*----------------------------------------------------------------------------
5802 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5803 | cannot be compared, and 0 otherwise. The invalid exception is raised if
5804 | either operand is a NaN. The comparison is performed according to the
5805 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5806 *----------------------------------------------------------------------------*/
5807 int floatx80_unordered(floatx80 a
, floatx80 b
, float_status
*status
)
5809 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)
5810 || (extractFloatx80Exp(a
) == 0x7FFF
5811 && (uint64_t) (extractFloatx80Frac(a
) << 1))
5812 || (extractFloatx80Exp(b
) == 0x7FFF
5813 && (uint64_t) (extractFloatx80Frac(b
) << 1))
5815 float_raise(float_flag_invalid
, status
);
5821 /*----------------------------------------------------------------------------
5822 | Returns 1 if the extended double-precision floating-point value `a' is
5823 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5824 | cause an exception. The comparison is performed according to the IEC/IEEE
5825 | Standard for Binary Floating-Point Arithmetic.
5826 *----------------------------------------------------------------------------*/
5828 int floatx80_eq_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
5831 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5832 float_raise(float_flag_invalid
, status
);
5835 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5836 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5837 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5838 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5840 if (floatx80_is_signaling_nan(a
, status
)
5841 || floatx80_is_signaling_nan(b
, status
)) {
5842 float_raise(float_flag_invalid
, status
);
5848 && ( ( a
.high
== b
.high
)
5850 && ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
5855 /*----------------------------------------------------------------------------
5856 | Returns 1 if the extended double-precision floating-point value `a' is less
5857 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
5858 | do not cause an exception. Otherwise, the comparison is performed according
5859 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5860 *----------------------------------------------------------------------------*/
5862 int floatx80_le_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
5866 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5867 float_raise(float_flag_invalid
, status
);
5870 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5871 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5872 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5873 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5875 if (floatx80_is_signaling_nan(a
, status
)
5876 || floatx80_is_signaling_nan(b
, status
)) {
5877 float_raise(float_flag_invalid
, status
);
5881 aSign
= extractFloatx80Sign( a
);
5882 bSign
= extractFloatx80Sign( b
);
5883 if ( aSign
!= bSign
) {
5886 || ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5890 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
5891 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
5895 /*----------------------------------------------------------------------------
5896 | Returns 1 if the extended double-precision floating-point value `a' is less
5897 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
5898 | an exception. Otherwise, the comparison is performed according to the
5899 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5900 *----------------------------------------------------------------------------*/
5902 int floatx80_lt_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
5906 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5907 float_raise(float_flag_invalid
, status
);
5910 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5911 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5912 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5913 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5915 if (floatx80_is_signaling_nan(a
, status
)
5916 || floatx80_is_signaling_nan(b
, status
)) {
5917 float_raise(float_flag_invalid
, status
);
5921 aSign
= extractFloatx80Sign( a
);
5922 bSign
= extractFloatx80Sign( b
);
5923 if ( aSign
!= bSign
) {
5926 && ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5930 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
5931 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
5935 /*----------------------------------------------------------------------------
5936 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5937 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
5938 | The comparison is performed according to the IEC/IEEE Standard for Binary
5939 | Floating-Point Arithmetic.
5940 *----------------------------------------------------------------------------*/
5941 int floatx80_unordered_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
5943 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5944 float_raise(float_flag_invalid
, status
);
5947 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5948 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5949 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5950 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5952 if (floatx80_is_signaling_nan(a
, status
)
5953 || floatx80_is_signaling_nan(b
, status
)) {
5954 float_raise(float_flag_invalid
, status
);
5961 /*----------------------------------------------------------------------------
5962 | Returns the result of converting the quadruple-precision floating-point
5963 | value `a' to the 32-bit two's complement integer format. The conversion
5964 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5965 | Arithmetic---which means in particular that the conversion is rounded
5966 | according to the current rounding mode. If `a' is a NaN, the largest
5967 | positive integer is returned. Otherwise, if the conversion overflows, the
5968 | largest integer with the same sign as `a' is returned.
5969 *----------------------------------------------------------------------------*/
5971 int32_t float128_to_int32(float128 a
, float_status
*status
)
5974 int32_t aExp
, shiftCount
;
5975 uint64_t aSig0
, aSig1
;
5977 aSig1
= extractFloat128Frac1( a
);
5978 aSig0
= extractFloat128Frac0( a
);
5979 aExp
= extractFloat128Exp( a
);
5980 aSign
= extractFloat128Sign( a
);
5981 if ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) aSign
= 0;
5982 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
5983 aSig0
|= ( aSig1
!= 0 );
5984 shiftCount
= 0x4028 - aExp
;
5985 if ( 0 < shiftCount
) shift64RightJamming( aSig0
, shiftCount
, &aSig0
);
5986 return roundAndPackInt32(aSign
, aSig0
, status
);
5990 /*----------------------------------------------------------------------------
5991 | Returns the result of converting the quadruple-precision floating-point
5992 | value `a' to the 32-bit two's complement integer format. The conversion
5993 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5994 | Arithmetic, except that the conversion is always rounded toward zero. If
5995 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
5996 | conversion overflows, the largest integer with the same sign as `a' is
5998 *----------------------------------------------------------------------------*/
6000 int32_t float128_to_int32_round_to_zero(float128 a
, float_status
*status
)
6003 int32_t aExp
, shiftCount
;
6004 uint64_t aSig0
, aSig1
, savedASig
;
6007 aSig1
= extractFloat128Frac1( a
);
6008 aSig0
= extractFloat128Frac0( a
);
6009 aExp
= extractFloat128Exp( a
);
6010 aSign
= extractFloat128Sign( a
);
6011 aSig0
|= ( aSig1
!= 0 );
6012 if ( 0x401E < aExp
) {
6013 if ( ( aExp
== 0x7FFF ) && aSig0
) aSign
= 0;
6016 else if ( aExp
< 0x3FFF ) {
6017 if (aExp
|| aSig0
) {
6018 status
->float_exception_flags
|= float_flag_inexact
;
6022 aSig0
|= LIT64( 0x0001000000000000 );
6023 shiftCount
= 0x402F - aExp
;
6025 aSig0
>>= shiftCount
;
6027 if ( aSign
) z
= - z
;
6028 if ( ( z
< 0 ) ^ aSign
) {
6030 float_raise(float_flag_invalid
, status
);
6031 return aSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
6033 if ( ( aSig0
<<shiftCount
) != savedASig
) {
6034 status
->float_exception_flags
|= float_flag_inexact
;
6040 /*----------------------------------------------------------------------------
6041 | Returns the result of converting the quadruple-precision floating-point
6042 | value `a' to the 64-bit two's complement integer format. The conversion
6043 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6044 | Arithmetic---which means in particular that the conversion is rounded
6045 | according to the current rounding mode. If `a' is a NaN, the largest
6046 | positive integer is returned. Otherwise, if the conversion overflows, the
6047 | largest integer with the same sign as `a' is returned.
6048 *----------------------------------------------------------------------------*/
6050 int64_t float128_to_int64(float128 a
, float_status
*status
)
6053 int32_t aExp
, shiftCount
;
6054 uint64_t aSig0
, aSig1
;
6056 aSig1
= extractFloat128Frac1( a
);
6057 aSig0
= extractFloat128Frac0( a
);
6058 aExp
= extractFloat128Exp( a
);
6059 aSign
= extractFloat128Sign( a
);
6060 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
6061 shiftCount
= 0x402F - aExp
;
6062 if ( shiftCount
<= 0 ) {
6063 if ( 0x403E < aExp
) {
6064 float_raise(float_flag_invalid
, status
);
6066 || ( ( aExp
== 0x7FFF )
6067 && ( aSig1
|| ( aSig0
!= LIT64( 0x0001000000000000 ) ) )
6070 return LIT64( 0x7FFFFFFFFFFFFFFF );
6072 return (int64_t) LIT64( 0x8000000000000000 );
6074 shortShift128Left( aSig0
, aSig1
, - shiftCount
, &aSig0
, &aSig1
);
6077 shift64ExtraRightJamming( aSig0
, aSig1
, shiftCount
, &aSig0
, &aSig1
);
6079 return roundAndPackInt64(aSign
, aSig0
, aSig1
, status
);
6083 /*----------------------------------------------------------------------------
6084 | Returns the result of converting the quadruple-precision floating-point
6085 | value `a' to the 64-bit two's complement integer format. The conversion
6086 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6087 | Arithmetic, except that the conversion is always rounded toward zero.
6088 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
6089 | the conversion overflows, the largest integer with the same sign as `a' is
6091 *----------------------------------------------------------------------------*/
6093 int64_t float128_to_int64_round_to_zero(float128 a
, float_status
*status
)
6096 int32_t aExp
, shiftCount
;
6097 uint64_t aSig0
, aSig1
;
6100 aSig1
= extractFloat128Frac1( a
);
6101 aSig0
= extractFloat128Frac0( a
);
6102 aExp
= extractFloat128Exp( a
);
6103 aSign
= extractFloat128Sign( a
);
6104 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
6105 shiftCount
= aExp
- 0x402F;
6106 if ( 0 < shiftCount
) {
6107 if ( 0x403E <= aExp
) {
6108 aSig0
&= LIT64( 0x0000FFFFFFFFFFFF );
6109 if ( ( a
.high
== LIT64( 0xC03E000000000000 ) )
6110 && ( aSig1
< LIT64( 0x0002000000000000 ) ) ) {
6112 status
->float_exception_flags
|= float_flag_inexact
;
6116 float_raise(float_flag_invalid
, status
);
6117 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) ) {
6118 return LIT64( 0x7FFFFFFFFFFFFFFF );
6121 return (int64_t) LIT64( 0x8000000000000000 );
6123 z
= ( aSig0
<<shiftCount
) | ( aSig1
>>( ( - shiftCount
) & 63 ) );
6124 if ( (uint64_t) ( aSig1
<<shiftCount
) ) {
6125 status
->float_exception_flags
|= float_flag_inexact
;
6129 if ( aExp
< 0x3FFF ) {
6130 if ( aExp
| aSig0
| aSig1
) {
6131 status
->float_exception_flags
|= float_flag_inexact
;
6135 z
= aSig0
>>( - shiftCount
);
6137 || ( shiftCount
&& (uint64_t) ( aSig0
<<( shiftCount
& 63 ) ) ) ) {
6138 status
->float_exception_flags
|= float_flag_inexact
;
6141 if ( aSign
) z
= - z
;
6146 /*----------------------------------------------------------------------------
6147 | Returns the result of converting the quadruple-precision floating-point value
6148 | `a' to the 64-bit unsigned integer format. The conversion is
6149 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6150 | Arithmetic---which means in particular that the conversion is rounded
6151 | according to the current rounding mode. If `a' is a NaN, the largest
6152 | positive integer is returned. If the conversion overflows, the
6153 | largest unsigned integer is returned. If 'a' is negative, the value is
6154 | rounded and zero is returned; negative values that do not round to zero
6155 | will raise the inexact exception.
6156 *----------------------------------------------------------------------------*/
6158 uint64_t float128_to_uint64(float128 a
, float_status
*status
)
6163 uint64_t aSig0
, aSig1
;
6165 aSig0
= extractFloat128Frac0(a
);
6166 aSig1
= extractFloat128Frac1(a
);
6167 aExp
= extractFloat128Exp(a
);
6168 aSign
= extractFloat128Sign(a
);
6169 if (aSign
&& (aExp
> 0x3FFE)) {
6170 float_raise(float_flag_invalid
, status
);
6171 if (float128_is_any_nan(a
)) {
6172 return LIT64(0xFFFFFFFFFFFFFFFF);
6178 aSig0
|= LIT64(0x0001000000000000);
6180 shiftCount
= 0x402F - aExp
;
6181 if (shiftCount
<= 0) {
6182 if (0x403E < aExp
) {
6183 float_raise(float_flag_invalid
, status
);
6184 return LIT64(0xFFFFFFFFFFFFFFFF);
6186 shortShift128Left(aSig0
, aSig1
, -shiftCount
, &aSig0
, &aSig1
);
6188 shift64ExtraRightJamming(aSig0
, aSig1
, shiftCount
, &aSig0
, &aSig1
);
6190 return roundAndPackUint64(aSign
, aSig0
, aSig1
, status
);
6193 uint64_t float128_to_uint64_round_to_zero(float128 a
, float_status
*status
)
6196 signed char current_rounding_mode
= status
->float_rounding_mode
;
6198 set_float_rounding_mode(float_round_to_zero
, status
);
6199 v
= float128_to_uint64(a
, status
);
6200 set_float_rounding_mode(current_rounding_mode
, status
);
6205 /*----------------------------------------------------------------------------
6206 | Returns the result of converting the quadruple-precision floating-point
6207 | value `a' to the 32-bit unsigned integer format. The conversion
6208 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6209 | Arithmetic except that the conversion is always rounded toward zero.
6210 | If `a' is a NaN, the largest positive integer is returned. Otherwise,
6211 | if the conversion overflows, the largest unsigned integer is returned.
6212 | If 'a' is negative, the value is rounded and zero is returned; negative
6213 | values that do not round to zero will raise the inexact exception.
6214 *----------------------------------------------------------------------------*/
6216 uint32_t float128_to_uint32_round_to_zero(float128 a
, float_status
*status
)
6220 int old_exc_flags
= get_float_exception_flags(status
);
6222 v
= float128_to_uint64_round_to_zero(a
, status
);
6223 if (v
> 0xffffffff) {
6228 set_float_exception_flags(old_exc_flags
, status
);
6229 float_raise(float_flag_invalid
, status
);
6233 /*----------------------------------------------------------------------------
6234 | Returns the result of converting the quadruple-precision floating-point
6235 | value `a' to the single-precision floating-point format. The conversion
6236 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6238 *----------------------------------------------------------------------------*/
6240 float32
float128_to_float32(float128 a
, float_status
*status
)
6244 uint64_t aSig0
, aSig1
;
6247 aSig1
= extractFloat128Frac1( a
);
6248 aSig0
= extractFloat128Frac0( a
);
6249 aExp
= extractFloat128Exp( a
);
6250 aSign
= extractFloat128Sign( a
);
6251 if ( aExp
== 0x7FFF ) {
6252 if ( aSig0
| aSig1
) {
6253 return commonNaNToFloat32(float128ToCommonNaN(a
, status
), status
);
6255 return packFloat32( aSign
, 0xFF, 0 );
6257 aSig0
|= ( aSig1
!= 0 );
6258 shift64RightJamming( aSig0
, 18, &aSig0
);
6260 if ( aExp
|| zSig
) {
6264 return roundAndPackFloat32(aSign
, aExp
, zSig
, status
);
6268 /*----------------------------------------------------------------------------
6269 | Returns the result of converting the quadruple-precision floating-point
6270 | value `a' to the double-precision floating-point format. The conversion
6271 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6273 *----------------------------------------------------------------------------*/
6275 float64
float128_to_float64(float128 a
, float_status
*status
)
6279 uint64_t aSig0
, aSig1
;
6281 aSig1
= extractFloat128Frac1( a
);
6282 aSig0
= extractFloat128Frac0( a
);
6283 aExp
= extractFloat128Exp( a
);
6284 aSign
= extractFloat128Sign( a
);
6285 if ( aExp
== 0x7FFF ) {
6286 if ( aSig0
| aSig1
) {
6287 return commonNaNToFloat64(float128ToCommonNaN(a
, status
), status
);
6289 return packFloat64( aSign
, 0x7FF, 0 );
6291 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
6292 aSig0
|= ( aSig1
!= 0 );
6293 if ( aExp
|| aSig0
) {
6294 aSig0
|= LIT64( 0x4000000000000000 );
6297 return roundAndPackFloat64(aSign
, aExp
, aSig0
, status
);
6301 /*----------------------------------------------------------------------------
6302 | Returns the result of converting the quadruple-precision floating-point
6303 | value `a' to the extended double-precision floating-point format. The
6304 | conversion is performed according to the IEC/IEEE Standard for Binary
6305 | Floating-Point Arithmetic.
6306 *----------------------------------------------------------------------------*/
6308 floatx80
float128_to_floatx80(float128 a
, float_status
*status
)
6312 uint64_t aSig0
, aSig1
;
6314 aSig1
= extractFloat128Frac1( a
);
6315 aSig0
= extractFloat128Frac0( a
);
6316 aExp
= extractFloat128Exp( a
);
6317 aSign
= extractFloat128Sign( a
);
6318 if ( aExp
== 0x7FFF ) {
6319 if ( aSig0
| aSig1
) {
6320 return commonNaNToFloatx80(float128ToCommonNaN(a
, status
), status
);
6322 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
6325 if ( ( aSig0
| aSig1
) == 0 ) return packFloatx80( aSign
, 0, 0 );
6326 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6329 aSig0
|= LIT64( 0x0001000000000000 );
6331 shortShift128Left( aSig0
, aSig1
, 15, &aSig0
, &aSig1
);
6332 return roundAndPackFloatx80(80, aSign
, aExp
, aSig0
, aSig1
, status
);
6336 /*----------------------------------------------------------------------------
6337 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6338 | returns the result as a quadruple-precision floating-point value. The
6339 | operation is performed according to the IEC/IEEE Standard for Binary
6340 | Floating-Point Arithmetic.
6341 *----------------------------------------------------------------------------*/
6343 float128
float128_round_to_int(float128 a
, float_status
*status
)
6347 uint64_t lastBitMask
, roundBitsMask
;
6350 aExp
= extractFloat128Exp( a
);
6351 if ( 0x402F <= aExp
) {
6352 if ( 0x406F <= aExp
) {
6353 if ( ( aExp
== 0x7FFF )
6354 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) )
6356 return propagateFloat128NaN(a
, a
, status
);
6361 lastBitMask
= ( lastBitMask
<<( 0x406E - aExp
) )<<1;
6362 roundBitsMask
= lastBitMask
- 1;
6364 switch (status
->float_rounding_mode
) {
6365 case float_round_nearest_even
:
6366 if ( lastBitMask
) {
6367 add128( z
.high
, z
.low
, 0, lastBitMask
>>1, &z
.high
, &z
.low
);
6368 if ( ( z
.low
& roundBitsMask
) == 0 ) z
.low
&= ~ lastBitMask
;
6371 if ( (int64_t) z
.low
< 0 ) {
6373 if ( (uint64_t) ( z
.low
<<1 ) == 0 ) z
.high
&= ~1;
6377 case float_round_ties_away
:
6379 add128(z
.high
, z
.low
, 0, lastBitMask
>> 1, &z
.high
, &z
.low
);
6381 if ((int64_t) z
.low
< 0) {
6386 case float_round_to_zero
:
6388 case float_round_up
:
6389 if (!extractFloat128Sign(z
)) {
6390 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
6393 case float_round_down
:
6394 if (extractFloat128Sign(z
)) {
6395 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
6401 z
.low
&= ~ roundBitsMask
;
6404 if ( aExp
< 0x3FFF ) {
6405 if ( ( ( (uint64_t) ( a
.high
<<1 ) ) | a
.low
) == 0 ) return a
;
6406 status
->float_exception_flags
|= float_flag_inexact
;
6407 aSign
= extractFloat128Sign( a
);
6408 switch (status
->float_rounding_mode
) {
6409 case float_round_nearest_even
:
6410 if ( ( aExp
== 0x3FFE )
6411 && ( extractFloat128Frac0( a
)
6412 | extractFloat128Frac1( a
) )
6414 return packFloat128( aSign
, 0x3FFF, 0, 0 );
6417 case float_round_ties_away
:
6418 if (aExp
== 0x3FFE) {
6419 return packFloat128(aSign
, 0x3FFF, 0, 0);
6422 case float_round_down
:
6424 aSign
? packFloat128( 1, 0x3FFF, 0, 0 )
6425 : packFloat128( 0, 0, 0, 0 );
6426 case float_round_up
:
6428 aSign
? packFloat128( 1, 0, 0, 0 )
6429 : packFloat128( 0, 0x3FFF, 0, 0 );
6431 return packFloat128( aSign
, 0, 0, 0 );
6434 lastBitMask
<<= 0x402F - aExp
;
6435 roundBitsMask
= lastBitMask
- 1;
6438 switch (status
->float_rounding_mode
) {
6439 case float_round_nearest_even
:
6440 z
.high
+= lastBitMask
>>1;
6441 if ( ( ( z
.high
& roundBitsMask
) | a
.low
) == 0 ) {
6442 z
.high
&= ~ lastBitMask
;
6445 case float_round_ties_away
:
6446 z
.high
+= lastBitMask
>>1;
6448 case float_round_to_zero
:
6450 case float_round_up
:
6451 if (!extractFloat128Sign(z
)) {
6452 z
.high
|= ( a
.low
!= 0 );
6453 z
.high
+= roundBitsMask
;
6456 case float_round_down
:
6457 if (extractFloat128Sign(z
)) {
6458 z
.high
|= (a
.low
!= 0);
6459 z
.high
+= roundBitsMask
;
6465 z
.high
&= ~ roundBitsMask
;
6467 if ( ( z
.low
!= a
.low
) || ( z
.high
!= a
.high
) ) {
6468 status
->float_exception_flags
|= float_flag_inexact
;
6474 /*----------------------------------------------------------------------------
6475 | Returns the result of adding the absolute values of the quadruple-precision
6476 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
6477 | before being returned. `zSign' is ignored if the result is a NaN.
6478 | The addition is performed according to the IEC/IEEE Standard for Binary
6479 | Floating-Point Arithmetic.
6480 *----------------------------------------------------------------------------*/
6482 static float128
addFloat128Sigs(float128 a
, float128 b
, flag zSign
,
6483 float_status
*status
)
6485 int32_t aExp
, bExp
, zExp
;
6486 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
6489 aSig1
= extractFloat128Frac1( a
);
6490 aSig0
= extractFloat128Frac0( a
);
6491 aExp
= extractFloat128Exp( a
);
6492 bSig1
= extractFloat128Frac1( b
);
6493 bSig0
= extractFloat128Frac0( b
);
6494 bExp
= extractFloat128Exp( b
);
6495 expDiff
= aExp
- bExp
;
6496 if ( 0 < expDiff
) {
6497 if ( aExp
== 0x7FFF ) {
6498 if (aSig0
| aSig1
) {
6499 return propagateFloat128NaN(a
, b
, status
);
6507 bSig0
|= LIT64( 0x0001000000000000 );
6509 shift128ExtraRightJamming(
6510 bSig0
, bSig1
, 0, expDiff
, &bSig0
, &bSig1
, &zSig2
);
6513 else if ( expDiff
< 0 ) {
6514 if ( bExp
== 0x7FFF ) {
6515 if (bSig0
| bSig1
) {
6516 return propagateFloat128NaN(a
, b
, status
);
6518 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6524 aSig0
|= LIT64( 0x0001000000000000 );
6526 shift128ExtraRightJamming(
6527 aSig0
, aSig1
, 0, - expDiff
, &aSig0
, &aSig1
, &zSig2
);
6531 if ( aExp
== 0x7FFF ) {
6532 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
6533 return propagateFloat128NaN(a
, b
, status
);
6537 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
6539 if (status
->flush_to_zero
) {
6540 if (zSig0
| zSig1
) {
6541 float_raise(float_flag_output_denormal
, status
);
6543 return packFloat128(zSign
, 0, 0, 0);
6545 return packFloat128( zSign
, 0, zSig0
, zSig1
);
6548 zSig0
|= LIT64( 0x0002000000000000 );
6552 aSig0
|= LIT64( 0x0001000000000000 );
6553 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
6555 if ( zSig0
< LIT64( 0x0002000000000000 ) ) goto roundAndPack
;
6558 shift128ExtraRightJamming(
6559 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
6561 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
6565 /*----------------------------------------------------------------------------
6566 | Returns the result of subtracting the absolute values of the quadruple-
6567 | precision floating-point values `a' and `b'. If `zSign' is 1, the
6568 | difference is negated before being returned. `zSign' is ignored if the
6569 | result is a NaN. The subtraction is performed according to the IEC/IEEE
6570 | Standard for Binary Floating-Point Arithmetic.
6571 *----------------------------------------------------------------------------*/
6573 static float128
subFloat128Sigs(float128 a
, float128 b
, flag zSign
,
6574 float_status
*status
)
6576 int32_t aExp
, bExp
, zExp
;
6577 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
;
6580 aSig1
= extractFloat128Frac1( a
);
6581 aSig0
= extractFloat128Frac0( a
);
6582 aExp
= extractFloat128Exp( a
);
6583 bSig1
= extractFloat128Frac1( b
);
6584 bSig0
= extractFloat128Frac0( b
);
6585 bExp
= extractFloat128Exp( b
);
6586 expDiff
= aExp
- bExp
;
6587 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
6588 shortShift128Left( bSig0
, bSig1
, 14, &bSig0
, &bSig1
);
6589 if ( 0 < expDiff
) goto aExpBigger
;
6590 if ( expDiff
< 0 ) goto bExpBigger
;
6591 if ( aExp
== 0x7FFF ) {
6592 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
6593 return propagateFloat128NaN(a
, b
, status
);
6595 float_raise(float_flag_invalid
, status
);
6596 return float128_default_nan(status
);
6602 if ( bSig0
< aSig0
) goto aBigger
;
6603 if ( aSig0
< bSig0
) goto bBigger
;
6604 if ( bSig1
< aSig1
) goto aBigger
;
6605 if ( aSig1
< bSig1
) goto bBigger
;
6606 return packFloat128(status
->float_rounding_mode
== float_round_down
,
6609 if ( bExp
== 0x7FFF ) {
6610 if (bSig0
| bSig1
) {
6611 return propagateFloat128NaN(a
, b
, status
);
6613 return packFloat128( zSign
^ 1, 0x7FFF, 0, 0 );
6619 aSig0
|= LIT64( 0x4000000000000000 );
6621 shift128RightJamming( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
6622 bSig0
|= LIT64( 0x4000000000000000 );
6624 sub128( bSig0
, bSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
6627 goto normalizeRoundAndPack
;
6629 if ( aExp
== 0x7FFF ) {
6630 if (aSig0
| aSig1
) {
6631 return propagateFloat128NaN(a
, b
, status
);
6639 bSig0
|= LIT64( 0x4000000000000000 );
6641 shift128RightJamming( bSig0
, bSig1
, expDiff
, &bSig0
, &bSig1
);
6642 aSig0
|= LIT64( 0x4000000000000000 );
6644 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
6646 normalizeRoundAndPack
:
6648 return normalizeRoundAndPackFloat128(zSign
, zExp
- 14, zSig0
, zSig1
,
6653 /*----------------------------------------------------------------------------
6654 | Returns the result of adding the quadruple-precision floating-point values
6655 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
6656 | for Binary Floating-Point Arithmetic.
6657 *----------------------------------------------------------------------------*/
6659 float128
float128_add(float128 a
, float128 b
, float_status
*status
)
6663 aSign
= extractFloat128Sign( a
);
6664 bSign
= extractFloat128Sign( b
);
6665 if ( aSign
== bSign
) {
6666 return addFloat128Sigs(a
, b
, aSign
, status
);
6669 return subFloat128Sigs(a
, b
, aSign
, status
);
6674 /*----------------------------------------------------------------------------
6675 | Returns the result of subtracting the quadruple-precision floating-point
6676 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6677 | Standard for Binary Floating-Point Arithmetic.
6678 *----------------------------------------------------------------------------*/
6680 float128
float128_sub(float128 a
, float128 b
, float_status
*status
)
6684 aSign
= extractFloat128Sign( a
);
6685 bSign
= extractFloat128Sign( b
);
6686 if ( aSign
== bSign
) {
6687 return subFloat128Sigs(a
, b
, aSign
, status
);
6690 return addFloat128Sigs(a
, b
, aSign
, status
);
6695 /*----------------------------------------------------------------------------
6696 | Returns the result of multiplying the quadruple-precision floating-point
6697 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6698 | Standard for Binary Floating-Point Arithmetic.
6699 *----------------------------------------------------------------------------*/
6701 float128
float128_mul(float128 a
, float128 b
, float_status
*status
)
6703 flag aSign
, bSign
, zSign
;
6704 int32_t aExp
, bExp
, zExp
;
6705 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
, zSig3
;
6707 aSig1
= extractFloat128Frac1( a
);
6708 aSig0
= extractFloat128Frac0( a
);
6709 aExp
= extractFloat128Exp( a
);
6710 aSign
= extractFloat128Sign( a
);
6711 bSig1
= extractFloat128Frac1( b
);
6712 bSig0
= extractFloat128Frac0( b
);
6713 bExp
= extractFloat128Exp( b
);
6714 bSign
= extractFloat128Sign( b
);
6715 zSign
= aSign
^ bSign
;
6716 if ( aExp
== 0x7FFF ) {
6717 if ( ( aSig0
| aSig1
)
6718 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
6719 return propagateFloat128NaN(a
, b
, status
);
6721 if ( ( bExp
| bSig0
| bSig1
) == 0 ) goto invalid
;
6722 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6724 if ( bExp
== 0x7FFF ) {
6725 if (bSig0
| bSig1
) {
6726 return propagateFloat128NaN(a
, b
, status
);
6728 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
6730 float_raise(float_flag_invalid
, status
);
6731 return float128_default_nan(status
);
6733 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6736 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
6737 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6740 if ( ( bSig0
| bSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
6741 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
6743 zExp
= aExp
+ bExp
- 0x4000;
6744 aSig0
|= LIT64( 0x0001000000000000 );
6745 shortShift128Left( bSig0
, bSig1
, 16, &bSig0
, &bSig1
);
6746 mul128To256( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
, &zSig2
, &zSig3
);
6747 add128( zSig0
, zSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
6748 zSig2
|= ( zSig3
!= 0 );
6749 if ( LIT64( 0x0002000000000000 ) <= zSig0
) {
6750 shift128ExtraRightJamming(
6751 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
6754 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
6758 /*----------------------------------------------------------------------------
6759 | Returns the result of dividing the quadruple-precision floating-point value
6760 | `a' by the corresponding value `b'. The operation is performed according to
6761 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6762 *----------------------------------------------------------------------------*/
6764 float128
float128_div(float128 a
, float128 b
, float_status
*status
)
6766 flag aSign
, bSign
, zSign
;
6767 int32_t aExp
, bExp
, zExp
;
6768 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
6769 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
6771 aSig1
= extractFloat128Frac1( a
);
6772 aSig0
= extractFloat128Frac0( a
);
6773 aExp
= extractFloat128Exp( a
);
6774 aSign
= extractFloat128Sign( a
);
6775 bSig1
= extractFloat128Frac1( b
);
6776 bSig0
= extractFloat128Frac0( b
);
6777 bExp
= extractFloat128Exp( b
);
6778 bSign
= extractFloat128Sign( b
);
6779 zSign
= aSign
^ bSign
;
6780 if ( aExp
== 0x7FFF ) {
6781 if (aSig0
| aSig1
) {
6782 return propagateFloat128NaN(a
, b
, status
);
6784 if ( bExp
== 0x7FFF ) {
6785 if (bSig0
| bSig1
) {
6786 return propagateFloat128NaN(a
, b
, status
);
6790 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6792 if ( bExp
== 0x7FFF ) {
6793 if (bSig0
| bSig1
) {
6794 return propagateFloat128NaN(a
, b
, status
);
6796 return packFloat128( zSign
, 0, 0, 0 );
6799 if ( ( bSig0
| bSig1
) == 0 ) {
6800 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
6802 float_raise(float_flag_invalid
, status
);
6803 return float128_default_nan(status
);
6805 float_raise(float_flag_divbyzero
, status
);
6806 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6808 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
6811 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
6812 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6814 zExp
= aExp
- bExp
+ 0x3FFD;
6816 aSig0
| LIT64( 0x0001000000000000 ), aSig1
, 15, &aSig0
, &aSig1
);
6818 bSig0
| LIT64( 0x0001000000000000 ), bSig1
, 15, &bSig0
, &bSig1
);
6819 if ( le128( bSig0
, bSig1
, aSig0
, aSig1
) ) {
6820 shift128Right( aSig0
, aSig1
, 1, &aSig0
, &aSig1
);
6823 zSig0
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
6824 mul128By64To192( bSig0
, bSig1
, zSig0
, &term0
, &term1
, &term2
);
6825 sub192( aSig0
, aSig1
, 0, term0
, term1
, term2
, &rem0
, &rem1
, &rem2
);
6826 while ( (int64_t) rem0
< 0 ) {
6828 add192( rem0
, rem1
, rem2
, 0, bSig0
, bSig1
, &rem0
, &rem1
, &rem2
);
6830 zSig1
= estimateDiv128To64( rem1
, rem2
, bSig0
);
6831 if ( ( zSig1
& 0x3FFF ) <= 4 ) {
6832 mul128By64To192( bSig0
, bSig1
, zSig1
, &term1
, &term2
, &term3
);
6833 sub192( rem1
, rem2
, 0, term1
, term2
, term3
, &rem1
, &rem2
, &rem3
);
6834 while ( (int64_t) rem1
< 0 ) {
6836 add192( rem1
, rem2
, rem3
, 0, bSig0
, bSig1
, &rem1
, &rem2
, &rem3
);
6838 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
6840 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 15, &zSig0
, &zSig1
, &zSig2
);
6841 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
6845 /*----------------------------------------------------------------------------
6846 | Returns the remainder of the quadruple-precision floating-point value `a'
6847 | with respect to the corresponding value `b'. The operation is performed
6848 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6849 *----------------------------------------------------------------------------*/
6851 float128
float128_rem(float128 a
, float128 b
, float_status
*status
)
6854 int32_t aExp
, bExp
, expDiff
;
6855 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, q
, term0
, term1
, term2
;
6856 uint64_t allZero
, alternateASig0
, alternateASig1
, sigMean1
;
6859 aSig1
= extractFloat128Frac1( a
);
6860 aSig0
= extractFloat128Frac0( a
);
6861 aExp
= extractFloat128Exp( a
);
6862 aSign
= extractFloat128Sign( a
);
6863 bSig1
= extractFloat128Frac1( b
);
6864 bSig0
= extractFloat128Frac0( b
);
6865 bExp
= extractFloat128Exp( b
);
6866 if ( aExp
== 0x7FFF ) {
6867 if ( ( aSig0
| aSig1
)
6868 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
6869 return propagateFloat128NaN(a
, b
, status
);
6873 if ( bExp
== 0x7FFF ) {
6874 if (bSig0
| bSig1
) {
6875 return propagateFloat128NaN(a
, b
, status
);
6880 if ( ( bSig0
| bSig1
) == 0 ) {
6882 float_raise(float_flag_invalid
, status
);
6883 return float128_default_nan(status
);
6885 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
6888 if ( ( aSig0
| aSig1
) == 0 ) return a
;
6889 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6891 expDiff
= aExp
- bExp
;
6892 if ( expDiff
< -1 ) return a
;
6894 aSig0
| LIT64( 0x0001000000000000 ),
6896 15 - ( expDiff
< 0 ),
6901 bSig0
| LIT64( 0x0001000000000000 ), bSig1
, 15, &bSig0
, &bSig1
);
6902 q
= le128( bSig0
, bSig1
, aSig0
, aSig1
);
6903 if ( q
) sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
6905 while ( 0 < expDiff
) {
6906 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
6907 q
= ( 4 < q
) ? q
- 4 : 0;
6908 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
6909 shortShift192Left( term0
, term1
, term2
, 61, &term1
, &term2
, &allZero
);
6910 shortShift128Left( aSig0
, aSig1
, 61, &aSig0
, &allZero
);
6911 sub128( aSig0
, 0, term1
, term2
, &aSig0
, &aSig1
);
6914 if ( -64 < expDiff
) {
6915 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
6916 q
= ( 4 < q
) ? q
- 4 : 0;
6918 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
6920 if ( expDiff
< 0 ) {
6921 shift128Right( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
6924 shortShift128Left( aSig0
, aSig1
, expDiff
, &aSig0
, &aSig1
);
6926 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
6927 sub128( aSig0
, aSig1
, term1
, term2
, &aSig0
, &aSig1
);
6930 shift128Right( aSig0
, aSig1
, 12, &aSig0
, &aSig1
);
6931 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
6934 alternateASig0
= aSig0
;
6935 alternateASig1
= aSig1
;
6937 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
6938 } while ( 0 <= (int64_t) aSig0
);
6940 aSig0
, aSig1
, alternateASig0
, alternateASig1
, (uint64_t *)&sigMean0
, &sigMean1
);
6941 if ( ( sigMean0
< 0 )
6942 || ( ( ( sigMean0
| sigMean1
) == 0 ) && ( q
& 1 ) ) ) {
6943 aSig0
= alternateASig0
;
6944 aSig1
= alternateASig1
;
6946 zSign
= ( (int64_t) aSig0
< 0 );
6947 if ( zSign
) sub128( 0, 0, aSig0
, aSig1
, &aSig0
, &aSig1
);
6948 return normalizeRoundAndPackFloat128(aSign
^ zSign
, bExp
- 4, aSig0
, aSig1
,
6952 /*----------------------------------------------------------------------------
6953 | Returns the square root of the quadruple-precision floating-point value `a'.
6954 | The operation is performed according to the IEC/IEEE Standard for Binary
6955 | Floating-Point Arithmetic.
6956 *----------------------------------------------------------------------------*/
6958 float128
float128_sqrt(float128 a
, float_status
*status
)
6962 uint64_t aSig0
, aSig1
, zSig0
, zSig1
, zSig2
, doubleZSig0
;
6963 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
6965 aSig1
= extractFloat128Frac1( a
);
6966 aSig0
= extractFloat128Frac0( a
);
6967 aExp
= extractFloat128Exp( a
);
6968 aSign
= extractFloat128Sign( a
);
6969 if ( aExp
== 0x7FFF ) {
6970 if (aSig0
| aSig1
) {
6971 return propagateFloat128NaN(a
, a
, status
);
6973 if ( ! aSign
) return a
;
6977 if ( ( aExp
| aSig0
| aSig1
) == 0 ) return a
;
6979 float_raise(float_flag_invalid
, status
);
6980 return float128_default_nan(status
);
6983 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( 0, 0, 0, 0 );
6984 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6986 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFE;
6987 aSig0
|= LIT64( 0x0001000000000000 );
6988 zSig0
= estimateSqrt32( aExp
, aSig0
>>17 );
6989 shortShift128Left( aSig0
, aSig1
, 13 - ( aExp
& 1 ), &aSig0
, &aSig1
);
6990 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
6991 doubleZSig0
= zSig0
<<1;
6992 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
6993 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
6994 while ( (int64_t) rem0
< 0 ) {
6997 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
6999 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
7000 if ( ( zSig1
& 0x1FFF ) <= 5 ) {
7001 if ( zSig1
== 0 ) zSig1
= 1;
7002 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
7003 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
7004 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
7005 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
7006 while ( (int64_t) rem1
< 0 ) {
7008 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
7010 term2
|= doubleZSig0
;
7011 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
7013 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
7015 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 14, &zSig0
, &zSig1
, &zSig2
);
7016 return roundAndPackFloat128(0, zExp
, zSig0
, zSig1
, zSig2
, status
);
7020 /*----------------------------------------------------------------------------
7021 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
7022 | the corresponding value `b', and 0 otherwise. The invalid exception is
7023 | raised if either operand is a NaN. Otherwise, the comparison is performed
7024 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7025 *----------------------------------------------------------------------------*/
7027 int float128_eq(float128 a
, float128 b
, float_status
*status
)
7030 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
7031 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
7032 || ( ( extractFloat128Exp( b
) == 0x7FFF )
7033 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
7035 float_raise(float_flag_invalid
, status
);
7040 && ( ( a
.high
== b
.high
)
7042 && ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
7047 /*----------------------------------------------------------------------------
7048 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7049 | or equal to the corresponding value `b', and 0 otherwise. The invalid
7050 | exception is raised if either operand is a NaN. The comparison is performed
7051 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7052 *----------------------------------------------------------------------------*/
7054 int float128_le(float128 a
, float128 b
, float_status
*status
)
7058 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
7059 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
7060 || ( ( extractFloat128Exp( b
) == 0x7FFF )
7061 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
7063 float_raise(float_flag_invalid
, status
);
7066 aSign
= extractFloat128Sign( a
);
7067 bSign
= extractFloat128Sign( b
);
7068 if ( aSign
!= bSign
) {
7071 || ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
7075 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
7076 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
7080 /*----------------------------------------------------------------------------
7081 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7082 | the corresponding value `b', and 0 otherwise. The invalid exception is
7083 | raised if either operand is a NaN. The comparison is performed according
7084 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7085 *----------------------------------------------------------------------------*/
7087 int float128_lt(float128 a
, float128 b
, float_status
*status
)
7091 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
7092 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
7093 || ( ( extractFloat128Exp( b
) == 0x7FFF )
7094 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
7096 float_raise(float_flag_invalid
, status
);
7099 aSign
= extractFloat128Sign( a
);
7100 bSign
= extractFloat128Sign( b
);
7101 if ( aSign
!= bSign
) {
7104 && ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
7108 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
7109 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
7113 /*----------------------------------------------------------------------------
7114 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7115 | be compared, and 0 otherwise. The invalid exception is raised if either
7116 | operand is a NaN. The comparison is performed according to the IEC/IEEE
7117 | Standard for Binary Floating-Point Arithmetic.
7118 *----------------------------------------------------------------------------*/
7120 int float128_unordered(float128 a
, float128 b
, float_status
*status
)
7122 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
7123 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
7124 || ( ( extractFloat128Exp( b
) == 0x7FFF )
7125 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
7127 float_raise(float_flag_invalid
, status
);
7133 /*----------------------------------------------------------------------------
7134 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
7135 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
7136 | exception. The comparison is performed according to the IEC/IEEE Standard
7137 | for Binary Floating-Point Arithmetic.
7138 *----------------------------------------------------------------------------*/
7140 int float128_eq_quiet(float128 a
, float128 b
, float_status
*status
)
7143 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
7144 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
7145 || ( ( extractFloat128Exp( b
) == 0x7FFF )
7146 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
7148 if (float128_is_signaling_nan(a
, status
)
7149 || float128_is_signaling_nan(b
, status
)) {
7150 float_raise(float_flag_invalid
, status
);
7156 && ( ( a
.high
== b
.high
)
7158 && ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
7163 /*----------------------------------------------------------------------------
7164 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7165 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
7166 | cause an exception. Otherwise, the comparison is performed according to the
7167 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7168 *----------------------------------------------------------------------------*/
7170 int float128_le_quiet(float128 a
, float128 b
, float_status
*status
)
7174 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
7175 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
7176 || ( ( extractFloat128Exp( b
) == 0x7FFF )
7177 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
7179 if (float128_is_signaling_nan(a
, status
)
7180 || float128_is_signaling_nan(b
, status
)) {
7181 float_raise(float_flag_invalid
, status
);
7185 aSign
= extractFloat128Sign( a
);
7186 bSign
= extractFloat128Sign( b
);
7187 if ( aSign
!= bSign
) {
7190 || ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
7194 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
7195 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
7199 /*----------------------------------------------------------------------------
7200 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7201 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
7202 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
7203 | Standard for Binary Floating-Point Arithmetic.
7204 *----------------------------------------------------------------------------*/
7206 int float128_lt_quiet(float128 a
, float128 b
, float_status
*status
)
7210 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
7211 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
7212 || ( ( extractFloat128Exp( b
) == 0x7FFF )
7213 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
7215 if (float128_is_signaling_nan(a
, status
)
7216 || float128_is_signaling_nan(b
, status
)) {
7217 float_raise(float_flag_invalid
, status
);
7221 aSign
= extractFloat128Sign( a
);
7222 bSign
= extractFloat128Sign( b
);
7223 if ( aSign
!= bSign
) {
7226 && ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
7230 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
7231 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
7235 /*----------------------------------------------------------------------------
7236 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7237 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
7238 | comparison is performed according to the IEC/IEEE Standard for Binary
7239 | Floating-Point Arithmetic.
7240 *----------------------------------------------------------------------------*/
7242 int float128_unordered_quiet(float128 a
, float128 b
, float_status
*status
)
7244 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
7245 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
7246 || ( ( extractFloat128Exp( b
) == 0x7FFF )
7247 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
7249 if (float128_is_signaling_nan(a
, status
)
7250 || float128_is_signaling_nan(b
, status
)) {
7251 float_raise(float_flag_invalid
, status
);
7258 /* misc functions */
7259 float32
uint32_to_float32(uint32_t a
, float_status
*status
)
7261 return int64_to_float32(a
, status
);
7264 float64
uint32_to_float64(uint32_t a
, float_status
*status
)
7266 return int64_to_float64(a
, status
);
7269 uint32_t float32_to_uint32(float32 a
, float_status
*status
)
7273 int old_exc_flags
= get_float_exception_flags(status
);
7275 v
= float32_to_int64(a
, status
);
7278 } else if (v
> 0xffffffff) {
7283 set_float_exception_flags(old_exc_flags
, status
);
7284 float_raise(float_flag_invalid
, status
);
7288 uint32_t float32_to_uint32_round_to_zero(float32 a
, float_status
*status
)
7292 int old_exc_flags
= get_float_exception_flags(status
);
7294 v
= float32_to_int64_round_to_zero(a
, status
);
7297 } else if (v
> 0xffffffff) {
7302 set_float_exception_flags(old_exc_flags
, status
);
7303 float_raise(float_flag_invalid
, status
);
7307 int16_t float32_to_int16(float32 a
, float_status
*status
)
7311 int old_exc_flags
= get_float_exception_flags(status
);
7313 v
= float32_to_int32(a
, status
);
7316 } else if (v
> 0x7fff) {
7322 set_float_exception_flags(old_exc_flags
, status
);
7323 float_raise(float_flag_invalid
, status
);
7327 uint16_t float32_to_uint16(float32 a
, float_status
*status
)
7331 int old_exc_flags
= get_float_exception_flags(status
);
7333 v
= float32_to_int32(a
, status
);
7336 } else if (v
> 0xffff) {
7342 set_float_exception_flags(old_exc_flags
, status
);
7343 float_raise(float_flag_invalid
, status
);
7347 uint16_t float32_to_uint16_round_to_zero(float32 a
, float_status
*status
)
7351 int old_exc_flags
= get_float_exception_flags(status
);
7353 v
= float32_to_int64_round_to_zero(a
, status
);
7356 } else if (v
> 0xffff) {
7361 set_float_exception_flags(old_exc_flags
, status
);
7362 float_raise(float_flag_invalid
, status
);
7366 uint32_t float64_to_uint32(float64 a
, float_status
*status
)
7370 int old_exc_flags
= get_float_exception_flags(status
);
7372 v
= float64_to_uint64(a
, status
);
7373 if (v
> 0xffffffff) {
7378 set_float_exception_flags(old_exc_flags
, status
);
7379 float_raise(float_flag_invalid
, status
);
7383 uint32_t float64_to_uint32_round_to_zero(float64 a
, float_status
*status
)
7387 int old_exc_flags
= get_float_exception_flags(status
);
7389 v
= float64_to_uint64_round_to_zero(a
, status
);
7390 if (v
> 0xffffffff) {
7395 set_float_exception_flags(old_exc_flags
, status
);
7396 float_raise(float_flag_invalid
, status
);
7400 int16_t float64_to_int16(float64 a
, float_status
*status
)
7404 int old_exc_flags
= get_float_exception_flags(status
);
7406 v
= float64_to_int32(a
, status
);
7409 } else if (v
> 0x7fff) {
7415 set_float_exception_flags(old_exc_flags
, status
);
7416 float_raise(float_flag_invalid
, status
);
7420 uint16_t float64_to_uint16(float64 a
, float_status
*status
)
7424 int old_exc_flags
= get_float_exception_flags(status
);
7426 v
= float64_to_int32(a
, status
);
7429 } else if (v
> 0xffff) {
7435 set_float_exception_flags(old_exc_flags
, status
);
7436 float_raise(float_flag_invalid
, status
);
7440 uint16_t float64_to_uint16_round_to_zero(float64 a
, float_status
*status
)
7444 int old_exc_flags
= get_float_exception_flags(status
);
7446 v
= float64_to_int64_round_to_zero(a
, status
);
7449 } else if (v
> 0xffff) {
7454 set_float_exception_flags(old_exc_flags
, status
);
7455 float_raise(float_flag_invalid
, status
);
7459 /*----------------------------------------------------------------------------
7460 | Returns the result of converting the double-precision floating-point value
7461 | `a' to the 64-bit unsigned integer format. The conversion is
7462 | performed according to the IEC/IEEE Standard for Binary Floating-Point
7463 | Arithmetic---which means in particular that the conversion is rounded
7464 | according to the current rounding mode. If `a' is a NaN, the largest
7465 | positive integer is returned. If the conversion overflows, the
7466 | largest unsigned integer is returned. If 'a' is negative, the value is
7467 | rounded and zero is returned; negative values that do not round to zero
7468 | will raise the inexact exception.
7469 *----------------------------------------------------------------------------*/
7471 uint64_t float64_to_uint64(float64 a
, float_status
*status
)
7476 uint64_t aSig
, aSigExtra
;
7477 a
= float64_squash_input_denormal(a
, status
);
7479 aSig
= extractFloat64Frac(a
);
7480 aExp
= extractFloat64Exp(a
);
7481 aSign
= extractFloat64Sign(a
);
7482 if (aSign
&& (aExp
> 1022)) {
7483 float_raise(float_flag_invalid
, status
);
7484 if (float64_is_any_nan(a
)) {
7485 return LIT64(0xFFFFFFFFFFFFFFFF);
7491 aSig
|= LIT64(0x0010000000000000);
7493 shiftCount
= 0x433 - aExp
;
7494 if (shiftCount
<= 0) {
7496 float_raise(float_flag_invalid
, status
);
7497 return LIT64(0xFFFFFFFFFFFFFFFF);
7500 aSig
<<= -shiftCount
;
7502 shift64ExtraRightJamming(aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
7504 return roundAndPackUint64(aSign
, aSig
, aSigExtra
, status
);
7507 uint64_t float64_to_uint64_round_to_zero(float64 a
, float_status
*status
)
7509 signed char current_rounding_mode
= status
->float_rounding_mode
;
7510 set_float_rounding_mode(float_round_to_zero
, status
);
7511 uint64_t v
= float64_to_uint64(a
, status
);
7512 set_float_rounding_mode(current_rounding_mode
, status
);
7516 #define COMPARE(s, nan_exp) \
7517 static inline int float ## s ## _compare_internal(float ## s a, float ## s b,\
7518 int is_quiet, float_status *status) \
7520 flag aSign, bSign; \
7521 uint ## s ## _t av, bv; \
7522 a = float ## s ## _squash_input_denormal(a, status); \
7523 b = float ## s ## _squash_input_denormal(b, status); \
7525 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \
7526 extractFloat ## s ## Frac( a ) ) || \
7527 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \
7528 extractFloat ## s ## Frac( b ) )) { \
7530 float ## s ## _is_signaling_nan(a, status) || \
7531 float ## s ## _is_signaling_nan(b, status)) { \
7532 float_raise(float_flag_invalid, status); \
7534 return float_relation_unordered; \
7536 aSign = extractFloat ## s ## Sign( a ); \
7537 bSign = extractFloat ## s ## Sign( b ); \
7538 av = float ## s ## _val(a); \
7539 bv = float ## s ## _val(b); \
7540 if ( aSign != bSign ) { \
7541 if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) { \
7543 return float_relation_equal; \
7545 return 1 - (2 * aSign); \
7549 return float_relation_equal; \
7551 return 1 - 2 * (aSign ^ ( av < bv )); \
7556 int float ## s ## _compare(float ## s a, float ## s b, float_status *status) \
7558 return float ## s ## _compare_internal(a, b, 0, status); \
7561 int float ## s ## _compare_quiet(float ## s a, float ## s b, \
7562 float_status *status) \
7564 return float ## s ## _compare_internal(a, b, 1, status); \
7570 static inline int floatx80_compare_internal(floatx80 a
, floatx80 b
,
7571 int is_quiet
, float_status
*status
)
7575 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
7576 float_raise(float_flag_invalid
, status
);
7577 return float_relation_unordered
;
7579 if (( ( extractFloatx80Exp( a
) == 0x7fff ) &&
7580 ( extractFloatx80Frac( a
)<<1 ) ) ||
7581 ( ( extractFloatx80Exp( b
) == 0x7fff ) &&
7582 ( extractFloatx80Frac( b
)<<1 ) )) {
7584 floatx80_is_signaling_nan(a
, status
) ||
7585 floatx80_is_signaling_nan(b
, status
)) {
7586 float_raise(float_flag_invalid
, status
);
7588 return float_relation_unordered
;
7590 aSign
= extractFloatx80Sign( a
);
7591 bSign
= extractFloatx80Sign( b
);
7592 if ( aSign
!= bSign
) {
7594 if ( ( ( (uint16_t) ( ( a
.high
| b
.high
) << 1 ) ) == 0) &&
7595 ( ( a
.low
| b
.low
) == 0 ) ) {
7597 return float_relation_equal
;
7599 return 1 - (2 * aSign
);
7602 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
7603 return float_relation_equal
;
7605 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
7610 int floatx80_compare(floatx80 a
, floatx80 b
, float_status
*status
)
7612 return floatx80_compare_internal(a
, b
, 0, status
);
7615 int floatx80_compare_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
7617 return floatx80_compare_internal(a
, b
, 1, status
);
7620 static inline int float128_compare_internal(float128 a
, float128 b
,
7621 int is_quiet
, float_status
*status
)
7625 if (( ( extractFloat128Exp( a
) == 0x7fff ) &&
7626 ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) ) ||
7627 ( ( extractFloat128Exp( b
) == 0x7fff ) &&
7628 ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )) {
7630 float128_is_signaling_nan(a
, status
) ||
7631 float128_is_signaling_nan(b
, status
)) {
7632 float_raise(float_flag_invalid
, status
);
7634 return float_relation_unordered
;
7636 aSign
= extractFloat128Sign( a
);
7637 bSign
= extractFloat128Sign( b
);
7638 if ( aSign
!= bSign
) {
7639 if ( ( ( ( a
.high
| b
.high
)<<1 ) | a
.low
| b
.low
) == 0 ) {
7641 return float_relation_equal
;
7643 return 1 - (2 * aSign
);
7646 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
7647 return float_relation_equal
;
7649 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
7654 int float128_compare(float128 a
, float128 b
, float_status
*status
)
7656 return float128_compare_internal(a
, b
, 0, status
);
7659 int float128_compare_quiet(float128 a
, float128 b
, float_status
*status
)
7661 return float128_compare_internal(a
, b
, 1, status
);
7664 /* min() and max() functions. These can't be implemented as
7665 * 'compare and pick one input' because that would mishandle
7666 * NaNs and +0 vs -0.
7668 * minnum() and maxnum() functions. These are similar to the min()
7669 * and max() functions but if one of the arguments is a QNaN and
7670 * the other is numerical then the numerical argument is returned.
7671 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
7672 * and maxNum() operations. min() and max() are the typical min/max
7673 * semantics provided by many CPUs which predate that specification.
7675 * minnummag() and maxnummag() functions correspond to minNumMag()
7676 * and minNumMag() from the IEEE-754 2008.
7679 static inline float ## s float ## s ## _minmax(float ## s a, float ## s b, \
7680 int ismin, int isieee, \
7682 float_status *status) \
7684 flag aSign, bSign; \
7685 uint ## s ## _t av, bv, aav, abv; \
7686 a = float ## s ## _squash_input_denormal(a, status); \
7687 b = float ## s ## _squash_input_denormal(b, status); \
7688 if (float ## s ## _is_any_nan(a) || \
7689 float ## s ## _is_any_nan(b)) { \
7691 if (float ## s ## _is_quiet_nan(a, status) && \
7692 !float ## s ##_is_any_nan(b)) { \
7694 } else if (float ## s ## _is_quiet_nan(b, status) && \
7695 !float ## s ## _is_any_nan(a)) { \
7699 return propagateFloat ## s ## NaN(a, b, status); \
7701 aSign = extractFloat ## s ## Sign(a); \
7702 bSign = extractFloat ## s ## Sign(b); \
7703 av = float ## s ## _val(a); \
7704 bv = float ## s ## _val(b); \
7706 aav = float ## s ## _abs(av); \
7707 abv = float ## s ## _abs(bv); \
7710 return (aav < abv) ? a : b; \
7712 return (aav < abv) ? b : a; \
7716 if (aSign != bSign) { \
7718 return aSign ? a : b; \
7720 return aSign ? b : a; \
7724 return (aSign ^ (av < bv)) ? a : b; \
7726 return (aSign ^ (av < bv)) ? b : a; \
7731 float ## s float ## s ## _min(float ## s a, float ## s b, \
7732 float_status *status) \
7734 return float ## s ## _minmax(a, b, 1, 0, 0, status); \
7737 float ## s float ## s ## _max(float ## s a, float ## s b, \
7738 float_status *status) \
7740 return float ## s ## _minmax(a, b, 0, 0, 0, status); \
7743 float ## s float ## s ## _minnum(float ## s a, float ## s b, \
7744 float_status *status) \
7746 return float ## s ## _minmax(a, b, 1, 1, 0, status); \
7749 float ## s float ## s ## _maxnum(float ## s a, float ## s b, \
7750 float_status *status) \
7752 return float ## s ## _minmax(a, b, 0, 1, 0, status); \
7755 float ## s float ## s ## _minnummag(float ## s a, float ## s b, \
7756 float_status *status) \
7758 return float ## s ## _minmax(a, b, 1, 1, 1, status); \
7761 float ## s float ## s ## _maxnummag(float ## s a, float ## s b, \
7762 float_status *status) \
7764 return float ## s ## _minmax(a, b, 0, 1, 1, status); \
7771 /* Multiply A by 2 raised to the power N. */
7772 float32
float32_scalbn(float32 a
, int n
, float_status
*status
)
7778 a
= float32_squash_input_denormal(a
, status
);
7779 aSig
= extractFloat32Frac( a
);
7780 aExp
= extractFloat32Exp( a
);
7781 aSign
= extractFloat32Sign( a
);
7783 if ( aExp
== 0xFF ) {
7785 return propagateFloat32NaN(a
, a
, status
);
7791 } else if (aSig
== 0) {
7799 } else if (n
< -0x200) {
7805 return normalizeRoundAndPackFloat32(aSign
, aExp
, aSig
, status
);
7808 float64
float64_scalbn(float64 a
, int n
, float_status
*status
)
7814 a
= float64_squash_input_denormal(a
, status
);
7815 aSig
= extractFloat64Frac( a
);
7816 aExp
= extractFloat64Exp( a
);
7817 aSign
= extractFloat64Sign( a
);
7819 if ( aExp
== 0x7FF ) {
7821 return propagateFloat64NaN(a
, a
, status
);
7826 aSig
|= LIT64( 0x0010000000000000 );
7827 } else if (aSig
== 0) {
7835 } else if (n
< -0x1000) {
7841 return normalizeRoundAndPackFloat64(aSign
, aExp
, aSig
, status
);
7844 floatx80
floatx80_scalbn(floatx80 a
, int n
, float_status
*status
)
7850 if (floatx80_invalid_encoding(a
)) {
7851 float_raise(float_flag_invalid
, status
);
7852 return floatx80_default_nan(status
);
7854 aSig
= extractFloatx80Frac( a
);
7855 aExp
= extractFloatx80Exp( a
);
7856 aSign
= extractFloatx80Sign( a
);
7858 if ( aExp
== 0x7FFF ) {
7860 return propagateFloatx80NaN(a
, a
, status
);
7874 } else if (n
< -0x10000) {
7879 return normalizeRoundAndPackFloatx80(status
->floatx80_rounding_precision
,
7880 aSign
, aExp
, aSig
, 0, status
);
7883 float128
float128_scalbn(float128 a
, int n
, float_status
*status
)
7887 uint64_t aSig0
, aSig1
;
7889 aSig1
= extractFloat128Frac1( a
);
7890 aSig0
= extractFloat128Frac0( a
);
7891 aExp
= extractFloat128Exp( a
);
7892 aSign
= extractFloat128Sign( a
);
7893 if ( aExp
== 0x7FFF ) {
7894 if ( aSig0
| aSig1
) {
7895 return propagateFloat128NaN(a
, a
, status
);
7900 aSig0
|= LIT64( 0x0001000000000000 );
7901 } else if (aSig0
== 0 && aSig1
== 0) {
7909 } else if (n
< -0x10000) {
7914 return normalizeRoundAndPackFloat128( aSign
, aExp
, aSig0
, aSig1