4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
44 ===============================================================================
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
85 #include "qemu/osdep.h"
87 #include "fpu/softfloat.h"
89 /* We only need stdlib for abort() */
91 /*----------------------------------------------------------------------------
92 | Primitive arithmetic functions, including multi-word arithmetic, and
93 | division and square root approximations. (Can be specialized to target if
95 *----------------------------------------------------------------------------*/
96 #include "softfloat-macros.h"
98 /*----------------------------------------------------------------------------
99 | Functions and definitions to determine: (1) whether tininess for underflow
100 | is detected before or after rounding by default, (2) what (if anything)
101 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
102 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
103 | are propagated from function inputs to output. These details are target-
105 *----------------------------------------------------------------------------*/
106 #include "softfloat-specialize.h"
108 /*----------------------------------------------------------------------------
109 | Returns the fraction bits of the half-precision floating-point value `a'.
110 *----------------------------------------------------------------------------*/
112 static inline uint32_t extractFloat16Frac(float16 a
)
114 return float16_val(a
) & 0x3ff;
117 /*----------------------------------------------------------------------------
118 | Returns the exponent bits of the half-precision floating-point value `a'.
119 *----------------------------------------------------------------------------*/
121 static inline int extractFloat16Exp(float16 a
)
123 return (float16_val(a
) >> 10) & 0x1f;
126 /*----------------------------------------------------------------------------
127 | Returns the sign bit of the single-precision floating-point value `a'.
128 *----------------------------------------------------------------------------*/
130 static inline flag
extractFloat16Sign(float16 a
)
132 return float16_val(a
)>>15;
135 /*----------------------------------------------------------------------------
136 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
137 | and 7, and returns the properly rounded 32-bit integer corresponding to the
138 | input. If `zSign' is 1, the input is negated before being converted to an
139 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
140 | is simply rounded to an integer, with the inexact exception raised if the
141 | input cannot be represented exactly as an integer. However, if the fixed-
142 | point input is too large, the invalid exception is raised and the largest
143 | positive or negative integer is returned.
144 *----------------------------------------------------------------------------*/
146 static int32_t roundAndPackInt32(flag zSign
, uint64_t absZ
, float_status
*status
)
149 flag roundNearestEven
;
150 int8_t roundIncrement
, roundBits
;
153 roundingMode
= status
->float_rounding_mode
;
154 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
155 switch (roundingMode
) {
156 case float_round_nearest_even
:
157 case float_round_ties_away
:
158 roundIncrement
= 0x40;
160 case float_round_to_zero
:
164 roundIncrement
= zSign
? 0 : 0x7f;
166 case float_round_down
:
167 roundIncrement
= zSign
? 0x7f : 0;
172 roundBits
= absZ
& 0x7F;
173 absZ
= ( absZ
+ roundIncrement
)>>7;
174 absZ
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
176 if ( zSign
) z
= - z
;
177 if ( ( absZ
>>32 ) || ( z
&& ( ( z
< 0 ) ^ zSign
) ) ) {
178 float_raise(float_flag_invalid
, status
);
179 return zSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
182 status
->float_exception_flags
|= float_flag_inexact
;
188 /*----------------------------------------------------------------------------
189 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
190 | `absZ1', with binary point between bits 63 and 64 (between the input words),
191 | and returns the properly rounded 64-bit integer corresponding to the input.
192 | If `zSign' is 1, the input is negated before being converted to an integer.
193 | Ordinarily, the fixed-point input is simply rounded to an integer, with
194 | the inexact exception raised if the input cannot be represented exactly as
195 | an integer. However, if the fixed-point input is too large, the invalid
196 | exception is raised and the largest positive or negative integer is
198 *----------------------------------------------------------------------------*/
200 static int64_t roundAndPackInt64(flag zSign
, uint64_t absZ0
, uint64_t absZ1
,
201 float_status
*status
)
204 flag roundNearestEven
, increment
;
207 roundingMode
= status
->float_rounding_mode
;
208 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
209 switch (roundingMode
) {
210 case float_round_nearest_even
:
211 case float_round_ties_away
:
212 increment
= ((int64_t) absZ1
< 0);
214 case float_round_to_zero
:
218 increment
= !zSign
&& absZ1
;
220 case float_round_down
:
221 increment
= zSign
&& absZ1
;
228 if ( absZ0
== 0 ) goto overflow
;
229 absZ0
&= ~ ( ( (uint64_t) ( absZ1
<<1 ) == 0 ) & roundNearestEven
);
232 if ( zSign
) z
= - z
;
233 if ( z
&& ( ( z
< 0 ) ^ zSign
) ) {
235 float_raise(float_flag_invalid
, status
);
237 zSign
? (int64_t) LIT64( 0x8000000000000000 )
238 : LIT64( 0x7FFFFFFFFFFFFFFF );
241 status
->float_exception_flags
|= float_flag_inexact
;
247 /*----------------------------------------------------------------------------
248 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
249 | `absZ1', with binary point between bits 63 and 64 (between the input words),
250 | and returns the properly rounded 64-bit unsigned integer corresponding to the
251 | input. Ordinarily, the fixed-point input is simply rounded to an integer,
252 | with the inexact exception raised if the input cannot be represented exactly
253 | as an integer. However, if the fixed-point input is too large, the invalid
254 | exception is raised and the largest unsigned integer is returned.
255 *----------------------------------------------------------------------------*/
257 static int64_t roundAndPackUint64(flag zSign
, uint64_t absZ0
,
258 uint64_t absZ1
, float_status
*status
)
261 flag roundNearestEven
, increment
;
263 roundingMode
= status
->float_rounding_mode
;
264 roundNearestEven
= (roundingMode
== float_round_nearest_even
);
265 switch (roundingMode
) {
266 case float_round_nearest_even
:
267 case float_round_ties_away
:
268 increment
= ((int64_t)absZ1
< 0);
270 case float_round_to_zero
:
274 increment
= !zSign
&& absZ1
;
276 case float_round_down
:
277 increment
= zSign
&& absZ1
;
285 float_raise(float_flag_invalid
, status
);
286 return LIT64(0xFFFFFFFFFFFFFFFF);
288 absZ0
&= ~(((uint64_t)(absZ1
<<1) == 0) & roundNearestEven
);
291 if (zSign
&& absZ0
) {
292 float_raise(float_flag_invalid
, status
);
297 status
->float_exception_flags
|= float_flag_inexact
;
302 /*----------------------------------------------------------------------------
303 | Returns the fraction bits of the single-precision floating-point value `a'.
304 *----------------------------------------------------------------------------*/
306 static inline uint32_t extractFloat32Frac( float32 a
)
309 return float32_val(a
) & 0x007FFFFF;
313 /*----------------------------------------------------------------------------
314 | Returns the exponent bits of the single-precision floating-point value `a'.
315 *----------------------------------------------------------------------------*/
317 static inline int extractFloat32Exp(float32 a
)
320 return ( float32_val(a
)>>23 ) & 0xFF;
324 /*----------------------------------------------------------------------------
325 | Returns the sign bit of the single-precision floating-point value `a'.
326 *----------------------------------------------------------------------------*/
328 static inline flag
extractFloat32Sign( float32 a
)
331 return float32_val(a
)>>31;
335 /*----------------------------------------------------------------------------
336 | If `a' is denormal and we are in flush-to-zero mode then set the
337 | input-denormal exception and return zero. Otherwise just return the value.
338 *----------------------------------------------------------------------------*/
339 float32
float32_squash_input_denormal(float32 a
, float_status
*status
)
341 if (status
->flush_inputs_to_zero
) {
342 if (extractFloat32Exp(a
) == 0 && extractFloat32Frac(a
) != 0) {
343 float_raise(float_flag_input_denormal
, status
);
344 return make_float32(float32_val(a
) & 0x80000000);
350 /*----------------------------------------------------------------------------
351 | Normalizes the subnormal single-precision floating-point value represented
352 | by the denormalized significand `aSig'. The normalized exponent and
353 | significand are stored at the locations pointed to by `zExpPtr' and
354 | `zSigPtr', respectively.
355 *----------------------------------------------------------------------------*/
358 normalizeFloat32Subnormal(uint32_t aSig
, int *zExpPtr
, uint32_t *zSigPtr
)
362 shiftCount
= countLeadingZeros32( aSig
) - 8;
363 *zSigPtr
= aSig
<<shiftCount
;
364 *zExpPtr
= 1 - shiftCount
;
368 /*----------------------------------------------------------------------------
369 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
370 | single-precision floating-point value, returning the result. After being
371 | shifted into the proper positions, the three fields are simply added
372 | together to form the result. This means that any integer portion of `zSig'
373 | will be added into the exponent. Since a properly normalized significand
374 | will have an integer portion equal to 1, the `zExp' input should be 1 less
375 | than the desired result exponent whenever `zSig' is a complete, normalized
377 *----------------------------------------------------------------------------*/
379 static inline float32
packFloat32(flag zSign
, int zExp
, uint32_t zSig
)
383 ( ( (uint32_t) zSign
)<<31 ) + ( ( (uint32_t) zExp
)<<23 ) + zSig
);
387 /*----------------------------------------------------------------------------
388 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
389 | and significand `zSig', and returns the proper single-precision floating-
390 | point value corresponding to the abstract input. Ordinarily, the abstract
391 | value is simply rounded and packed into the single-precision format, with
392 | the inexact exception raised if the abstract input cannot be represented
393 | exactly. However, if the abstract value is too large, the overflow and
394 | inexact exceptions are raised and an infinity or maximal finite value is
395 | returned. If the abstract value is too small, the input value is rounded to
396 | a subnormal number, and the underflow and inexact exceptions are raised if
397 | the abstract input cannot be represented exactly as a subnormal single-
398 | precision floating-point number.
399 | The input significand `zSig' has its binary point between bits 30
400 | and 29, which is 7 bits to the left of the usual location. This shifted
401 | significand must be normalized or smaller. If `zSig' is not normalized,
402 | `zExp' must be 0; in that case, the result returned is a subnormal number,
403 | and it must not require rounding. In the usual case that `zSig' is
404 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
405 | The handling of underflow and overflow follows the IEC/IEEE Standard for
406 | Binary Floating-Point Arithmetic.
407 *----------------------------------------------------------------------------*/
409 static float32
roundAndPackFloat32(flag zSign
, int zExp
, uint32_t zSig
,
410 float_status
*status
)
413 flag roundNearestEven
;
414 int8_t roundIncrement
, roundBits
;
417 roundingMode
= status
->float_rounding_mode
;
418 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
419 switch (roundingMode
) {
420 case float_round_nearest_even
:
421 case float_round_ties_away
:
422 roundIncrement
= 0x40;
424 case float_round_to_zero
:
428 roundIncrement
= zSign
? 0 : 0x7f;
430 case float_round_down
:
431 roundIncrement
= zSign
? 0x7f : 0;
437 roundBits
= zSig
& 0x7F;
438 if ( 0xFD <= (uint16_t) zExp
) {
440 || ( ( zExp
== 0xFD )
441 && ( (int32_t) ( zSig
+ roundIncrement
) < 0 ) )
443 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
444 return packFloat32( zSign
, 0xFF, - ( roundIncrement
== 0 ));
447 if (status
->flush_to_zero
) {
448 float_raise(float_flag_output_denormal
, status
);
449 return packFloat32(zSign
, 0, 0);
452 (status
->float_detect_tininess
453 == float_tininess_before_rounding
)
455 || ( zSig
+ roundIncrement
< 0x80000000 );
456 shift32RightJamming( zSig
, - zExp
, &zSig
);
458 roundBits
= zSig
& 0x7F;
459 if (isTiny
&& roundBits
) {
460 float_raise(float_flag_underflow
, status
);
465 status
->float_exception_flags
|= float_flag_inexact
;
467 zSig
= ( zSig
+ roundIncrement
)>>7;
468 zSig
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
469 if ( zSig
== 0 ) zExp
= 0;
470 return packFloat32( zSign
, zExp
, zSig
);
474 /*----------------------------------------------------------------------------
475 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
476 | and significand `zSig', and returns the proper single-precision floating-
477 | point value corresponding to the abstract input. This routine is just like
478 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
479 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
480 | floating-point exponent.
481 *----------------------------------------------------------------------------*/
484 normalizeRoundAndPackFloat32(flag zSign
, int zExp
, uint32_t zSig
,
485 float_status
*status
)
489 shiftCount
= countLeadingZeros32( zSig
) - 1;
490 return roundAndPackFloat32(zSign
, zExp
- shiftCount
, zSig
<<shiftCount
,
495 /*----------------------------------------------------------------------------
496 | Returns the fraction bits of the double-precision floating-point value `a'.
497 *----------------------------------------------------------------------------*/
499 static inline uint64_t extractFloat64Frac( float64 a
)
502 return float64_val(a
) & LIT64( 0x000FFFFFFFFFFFFF );
506 /*----------------------------------------------------------------------------
507 | Returns the exponent bits of the double-precision floating-point value `a'.
508 *----------------------------------------------------------------------------*/
510 static inline int extractFloat64Exp(float64 a
)
513 return ( float64_val(a
)>>52 ) & 0x7FF;
517 /*----------------------------------------------------------------------------
518 | Returns the sign bit of the double-precision floating-point value `a'.
519 *----------------------------------------------------------------------------*/
521 static inline flag
extractFloat64Sign( float64 a
)
524 return float64_val(a
)>>63;
528 /*----------------------------------------------------------------------------
529 | If `a' is denormal and we are in flush-to-zero mode then set the
530 | input-denormal exception and return zero. Otherwise just return the value.
531 *----------------------------------------------------------------------------*/
532 float64
float64_squash_input_denormal(float64 a
, float_status
*status
)
534 if (status
->flush_inputs_to_zero
) {
535 if (extractFloat64Exp(a
) == 0 && extractFloat64Frac(a
) != 0) {
536 float_raise(float_flag_input_denormal
, status
);
537 return make_float64(float64_val(a
) & (1ULL << 63));
543 /*----------------------------------------------------------------------------
544 | Normalizes the subnormal double-precision floating-point value represented
545 | by the denormalized significand `aSig'. The normalized exponent and
546 | significand are stored at the locations pointed to by `zExpPtr' and
547 | `zSigPtr', respectively.
548 *----------------------------------------------------------------------------*/
551 normalizeFloat64Subnormal(uint64_t aSig
, int *zExpPtr
, uint64_t *zSigPtr
)
555 shiftCount
= countLeadingZeros64( aSig
) - 11;
556 *zSigPtr
= aSig
<<shiftCount
;
557 *zExpPtr
= 1 - shiftCount
;
561 /*----------------------------------------------------------------------------
562 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
563 | double-precision floating-point value, returning the result. After being
564 | shifted into the proper positions, the three fields are simply added
565 | together to form the result. This means that any integer portion of `zSig'
566 | will be added into the exponent. Since a properly normalized significand
567 | will have an integer portion equal to 1, the `zExp' input should be 1 less
568 | than the desired result exponent whenever `zSig' is a complete, normalized
570 *----------------------------------------------------------------------------*/
572 static inline float64
packFloat64(flag zSign
, int zExp
, uint64_t zSig
)
576 ( ( (uint64_t) zSign
)<<63 ) + ( ( (uint64_t) zExp
)<<52 ) + zSig
);
580 /*----------------------------------------------------------------------------
581 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
582 | and significand `zSig', and returns the proper double-precision floating-
583 | point value corresponding to the abstract input. Ordinarily, the abstract
584 | value is simply rounded and packed into the double-precision format, with
585 | the inexact exception raised if the abstract input cannot be represented
586 | exactly. However, if the abstract value is too large, the overflow and
587 | inexact exceptions are raised and an infinity or maximal finite value is
588 | returned. If the abstract value is too small, the input value is rounded to
589 | a subnormal number, and the underflow and inexact exceptions are raised if
590 | the abstract input cannot be represented exactly as a subnormal double-
591 | precision floating-point number.
592 | The input significand `zSig' has its binary point between bits 62
593 | and 61, which is 10 bits to the left of the usual location. This shifted
594 | significand must be normalized or smaller. If `zSig' is not normalized,
595 | `zExp' must be 0; in that case, the result returned is a subnormal number,
596 | and it must not require rounding. In the usual case that `zSig' is
597 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
598 | The handling of underflow and overflow follows the IEC/IEEE Standard for
599 | Binary Floating-Point Arithmetic.
600 *----------------------------------------------------------------------------*/
602 static float64
roundAndPackFloat64(flag zSign
, int zExp
, uint64_t zSig
,
603 float_status
*status
)
606 flag roundNearestEven
;
607 int roundIncrement
, roundBits
;
610 roundingMode
= status
->float_rounding_mode
;
611 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
612 switch (roundingMode
) {
613 case float_round_nearest_even
:
614 case float_round_ties_away
:
615 roundIncrement
= 0x200;
617 case float_round_to_zero
:
621 roundIncrement
= zSign
? 0 : 0x3ff;
623 case float_round_down
:
624 roundIncrement
= zSign
? 0x3ff : 0;
629 roundBits
= zSig
& 0x3FF;
630 if ( 0x7FD <= (uint16_t) zExp
) {
631 if ( ( 0x7FD < zExp
)
632 || ( ( zExp
== 0x7FD )
633 && ( (int64_t) ( zSig
+ roundIncrement
) < 0 ) )
635 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
636 return packFloat64( zSign
, 0x7FF, - ( roundIncrement
== 0 ));
639 if (status
->flush_to_zero
) {
640 float_raise(float_flag_output_denormal
, status
);
641 return packFloat64(zSign
, 0, 0);
644 (status
->float_detect_tininess
645 == float_tininess_before_rounding
)
647 || ( zSig
+ roundIncrement
< LIT64( 0x8000000000000000 ) );
648 shift64RightJamming( zSig
, - zExp
, &zSig
);
650 roundBits
= zSig
& 0x3FF;
651 if (isTiny
&& roundBits
) {
652 float_raise(float_flag_underflow
, status
);
657 status
->float_exception_flags
|= float_flag_inexact
;
659 zSig
= ( zSig
+ roundIncrement
)>>10;
660 zSig
&= ~ ( ( ( roundBits
^ 0x200 ) == 0 ) & roundNearestEven
);
661 if ( zSig
== 0 ) zExp
= 0;
662 return packFloat64( zSign
, zExp
, zSig
);
666 /*----------------------------------------------------------------------------
667 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
668 | and significand `zSig', and returns the proper double-precision floating-
669 | point value corresponding to the abstract input. This routine is just like
670 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
671 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
672 | floating-point exponent.
673 *----------------------------------------------------------------------------*/
676 normalizeRoundAndPackFloat64(flag zSign
, int zExp
, uint64_t zSig
,
677 float_status
*status
)
681 shiftCount
= countLeadingZeros64( zSig
) - 1;
682 return roundAndPackFloat64(zSign
, zExp
- shiftCount
, zSig
<<shiftCount
,
687 /*----------------------------------------------------------------------------
688 | Returns the fraction bits of the extended double-precision floating-point
690 *----------------------------------------------------------------------------*/
692 static inline uint64_t extractFloatx80Frac( floatx80 a
)
699 /*----------------------------------------------------------------------------
700 | Returns the exponent bits of the extended double-precision floating-point
702 *----------------------------------------------------------------------------*/
704 static inline int32_t extractFloatx80Exp( floatx80 a
)
707 return a
.high
& 0x7FFF;
711 /*----------------------------------------------------------------------------
712 | Returns the sign bit of the extended double-precision floating-point value
714 *----------------------------------------------------------------------------*/
716 static inline flag
extractFloatx80Sign( floatx80 a
)
723 /*----------------------------------------------------------------------------
724 | Normalizes the subnormal extended double-precision floating-point value
725 | represented by the denormalized significand `aSig'. The normalized exponent
726 | and significand are stored at the locations pointed to by `zExpPtr' and
727 | `zSigPtr', respectively.
728 *----------------------------------------------------------------------------*/
731 normalizeFloatx80Subnormal( uint64_t aSig
, int32_t *zExpPtr
, uint64_t *zSigPtr
)
735 shiftCount
= countLeadingZeros64( aSig
);
736 *zSigPtr
= aSig
<<shiftCount
;
737 *zExpPtr
= 1 - shiftCount
;
741 /*----------------------------------------------------------------------------
742 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
743 | extended double-precision floating-point value, returning the result.
744 *----------------------------------------------------------------------------*/
746 static inline floatx80
packFloatx80( flag zSign
, int32_t zExp
, uint64_t zSig
)
751 z
.high
= ( ( (uint16_t) zSign
)<<15 ) + zExp
;
756 /*----------------------------------------------------------------------------
757 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
758 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
759 | and returns the proper extended double-precision floating-point value
760 | corresponding to the abstract input. Ordinarily, the abstract value is
761 | rounded and packed into the extended double-precision format, with the
762 | inexact exception raised if the abstract input cannot be represented
763 | exactly. However, if the abstract value is too large, the overflow and
764 | inexact exceptions are raised and an infinity or maximal finite value is
765 | returned. If the abstract value is too small, the input value is rounded to
766 | a subnormal number, and the underflow and inexact exceptions are raised if
767 | the abstract input cannot be represented exactly as a subnormal extended
768 | double-precision floating-point number.
769 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
770 | number of bits as single or double precision, respectively. Otherwise, the
771 | result is rounded to the full precision of the extended double-precision
773 | The input significand must be normalized or smaller. If the input
774 | significand is not normalized, `zExp' must be 0; in that case, the result
775 | returned is a subnormal number, and it must not require rounding. The
776 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
777 | Floating-Point Arithmetic.
778 *----------------------------------------------------------------------------*/
780 static floatx80
roundAndPackFloatx80(int8_t roundingPrecision
, flag zSign
,
781 int32_t zExp
, uint64_t zSig0
, uint64_t zSig1
,
782 float_status
*status
)
785 flag roundNearestEven
, increment
, isTiny
;
786 int64_t roundIncrement
, roundMask
, roundBits
;
788 roundingMode
= status
->float_rounding_mode
;
789 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
790 if ( roundingPrecision
== 80 ) goto precision80
;
791 if ( roundingPrecision
== 64 ) {
792 roundIncrement
= LIT64( 0x0000000000000400 );
793 roundMask
= LIT64( 0x00000000000007FF );
795 else if ( roundingPrecision
== 32 ) {
796 roundIncrement
= LIT64( 0x0000008000000000 );
797 roundMask
= LIT64( 0x000000FFFFFFFFFF );
802 zSig0
|= ( zSig1
!= 0 );
803 switch (roundingMode
) {
804 case float_round_nearest_even
:
805 case float_round_ties_away
:
807 case float_round_to_zero
:
811 roundIncrement
= zSign
? 0 : roundMask
;
813 case float_round_down
:
814 roundIncrement
= zSign
? roundMask
: 0;
819 roundBits
= zSig0
& roundMask
;
820 if ( 0x7FFD <= (uint32_t) ( zExp
- 1 ) ) {
821 if ( ( 0x7FFE < zExp
)
822 || ( ( zExp
== 0x7FFE ) && ( zSig0
+ roundIncrement
< zSig0
) )
827 if (status
->flush_to_zero
) {
828 float_raise(float_flag_output_denormal
, status
);
829 return packFloatx80(zSign
, 0, 0);
832 (status
->float_detect_tininess
833 == float_tininess_before_rounding
)
835 || ( zSig0
<= zSig0
+ roundIncrement
);
836 shift64RightJamming( zSig0
, 1 - zExp
, &zSig0
);
838 roundBits
= zSig0
& roundMask
;
839 if (isTiny
&& roundBits
) {
840 float_raise(float_flag_underflow
, status
);
843 status
->float_exception_flags
|= float_flag_inexact
;
845 zSig0
+= roundIncrement
;
846 if ( (int64_t) zSig0
< 0 ) zExp
= 1;
847 roundIncrement
= roundMask
+ 1;
848 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
849 roundMask
|= roundIncrement
;
851 zSig0
&= ~ roundMask
;
852 return packFloatx80( zSign
, zExp
, zSig0
);
856 status
->float_exception_flags
|= float_flag_inexact
;
858 zSig0
+= roundIncrement
;
859 if ( zSig0
< roundIncrement
) {
861 zSig0
= LIT64( 0x8000000000000000 );
863 roundIncrement
= roundMask
+ 1;
864 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
865 roundMask
|= roundIncrement
;
867 zSig0
&= ~ roundMask
;
868 if ( zSig0
== 0 ) zExp
= 0;
869 return packFloatx80( zSign
, zExp
, zSig0
);
871 switch (roundingMode
) {
872 case float_round_nearest_even
:
873 case float_round_ties_away
:
874 increment
= ((int64_t)zSig1
< 0);
876 case float_round_to_zero
:
880 increment
= !zSign
&& zSig1
;
882 case float_round_down
:
883 increment
= zSign
&& zSig1
;
888 if ( 0x7FFD <= (uint32_t) ( zExp
- 1 ) ) {
889 if ( ( 0x7FFE < zExp
)
890 || ( ( zExp
== 0x7FFE )
891 && ( zSig0
== LIT64( 0xFFFFFFFFFFFFFFFF ) )
897 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
898 if ( ( roundingMode
== float_round_to_zero
)
899 || ( zSign
&& ( roundingMode
== float_round_up
) )
900 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
902 return packFloatx80( zSign
, 0x7FFE, ~ roundMask
);
904 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
908 (status
->float_detect_tininess
909 == float_tininess_before_rounding
)
912 || ( zSig0
< LIT64( 0xFFFFFFFFFFFFFFFF ) );
913 shift64ExtraRightJamming( zSig0
, zSig1
, 1 - zExp
, &zSig0
, &zSig1
);
915 if (isTiny
&& zSig1
) {
916 float_raise(float_flag_underflow
, status
);
919 status
->float_exception_flags
|= float_flag_inexact
;
921 switch (roundingMode
) {
922 case float_round_nearest_even
:
923 case float_round_ties_away
:
924 increment
= ((int64_t)zSig1
< 0);
926 case float_round_to_zero
:
930 increment
= !zSign
&& zSig1
;
932 case float_round_down
:
933 increment
= zSign
&& zSig1
;
941 ~ ( ( (uint64_t) ( zSig1
<<1 ) == 0 ) & roundNearestEven
);
942 if ( (int64_t) zSig0
< 0 ) zExp
= 1;
944 return packFloatx80( zSign
, zExp
, zSig0
);
948 status
->float_exception_flags
|= float_flag_inexact
;
954 zSig0
= LIT64( 0x8000000000000000 );
957 zSig0
&= ~ ( ( (uint64_t) ( zSig1
<<1 ) == 0 ) & roundNearestEven
);
961 if ( zSig0
== 0 ) zExp
= 0;
963 return packFloatx80( zSign
, zExp
, zSig0
);
967 /*----------------------------------------------------------------------------
968 | Takes an abstract floating-point value having sign `zSign', exponent
969 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
970 | and returns the proper extended double-precision floating-point value
971 | corresponding to the abstract input. This routine is just like
972 | `roundAndPackFloatx80' except that the input significand does not have to be
974 *----------------------------------------------------------------------------*/
976 static floatx80
normalizeRoundAndPackFloatx80(int8_t roundingPrecision
,
977 flag zSign
, int32_t zExp
,
978 uint64_t zSig0
, uint64_t zSig1
,
979 float_status
*status
)
988 shiftCount
= countLeadingZeros64( zSig0
);
989 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
991 return roundAndPackFloatx80(roundingPrecision
, zSign
, zExp
,
992 zSig0
, zSig1
, status
);
996 /*----------------------------------------------------------------------------
997 | Returns the least-significant 64 fraction bits of the quadruple-precision
998 | floating-point value `a'.
999 *----------------------------------------------------------------------------*/
1001 static inline uint64_t extractFloat128Frac1( float128 a
)
1008 /*----------------------------------------------------------------------------
1009 | Returns the most-significant 48 fraction bits of the quadruple-precision
1010 | floating-point value `a'.
1011 *----------------------------------------------------------------------------*/
1013 static inline uint64_t extractFloat128Frac0( float128 a
)
1016 return a
.high
& LIT64( 0x0000FFFFFFFFFFFF );
1020 /*----------------------------------------------------------------------------
1021 | Returns the exponent bits of the quadruple-precision floating-point value
1023 *----------------------------------------------------------------------------*/
1025 static inline int32_t extractFloat128Exp( float128 a
)
1028 return ( a
.high
>>48 ) & 0x7FFF;
1032 /*----------------------------------------------------------------------------
1033 | Returns the sign bit of the quadruple-precision floating-point value `a'.
1034 *----------------------------------------------------------------------------*/
1036 static inline flag
extractFloat128Sign( float128 a
)
1043 /*----------------------------------------------------------------------------
1044 | Normalizes the subnormal quadruple-precision floating-point value
1045 | represented by the denormalized significand formed by the concatenation of
1046 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
1047 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
1048 | significand are stored at the location pointed to by `zSig0Ptr', and the
1049 | least significant 64 bits of the normalized significand are stored at the
1050 | location pointed to by `zSig1Ptr'.
1051 *----------------------------------------------------------------------------*/
1054 normalizeFloat128Subnormal(
1065 shiftCount
= countLeadingZeros64( aSig1
) - 15;
1066 if ( shiftCount
< 0 ) {
1067 *zSig0Ptr
= aSig1
>>( - shiftCount
);
1068 *zSig1Ptr
= aSig1
<<( shiftCount
& 63 );
1071 *zSig0Ptr
= aSig1
<<shiftCount
;
1074 *zExpPtr
= - shiftCount
- 63;
1077 shiftCount
= countLeadingZeros64( aSig0
) - 15;
1078 shortShift128Left( aSig0
, aSig1
, shiftCount
, zSig0Ptr
, zSig1Ptr
);
1079 *zExpPtr
= 1 - shiftCount
;
1084 /*----------------------------------------------------------------------------
1085 | Packs the sign `zSign', the exponent `zExp', and the significand formed
1086 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
1087 | floating-point value, returning the result. After being shifted into the
1088 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
1089 | added together to form the most significant 32 bits of the result. This
1090 | means that any integer portion of `zSig0' will be added into the exponent.
1091 | Since a properly normalized significand will have an integer portion equal
1092 | to 1, the `zExp' input should be 1 less than the desired result exponent
1093 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
1095 *----------------------------------------------------------------------------*/
1097 static inline float128
1098 packFloat128( flag zSign
, int32_t zExp
, uint64_t zSig0
, uint64_t zSig1
)
1103 z
.high
= ( ( (uint64_t) zSign
)<<63 ) + ( ( (uint64_t) zExp
)<<48 ) + zSig0
;
1108 /*----------------------------------------------------------------------------
1109 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1110 | and extended significand formed by the concatenation of `zSig0', `zSig1',
1111 | and `zSig2', and returns the proper quadruple-precision floating-point value
1112 | corresponding to the abstract input. Ordinarily, the abstract value is
1113 | simply rounded and packed into the quadruple-precision format, with the
1114 | inexact exception raised if the abstract input cannot be represented
1115 | exactly. However, if the abstract value is too large, the overflow and
1116 | inexact exceptions are raised and an infinity or maximal finite value is
1117 | returned. If the abstract value is too small, the input value is rounded to
1118 | a subnormal number, and the underflow and inexact exceptions are raised if
1119 | the abstract input cannot be represented exactly as a subnormal quadruple-
1120 | precision floating-point number.
1121 | The input significand must be normalized or smaller. If the input
1122 | significand is not normalized, `zExp' must be 0; in that case, the result
1123 | returned is a subnormal number, and it must not require rounding. In the
1124 | usual case that the input significand is normalized, `zExp' must be 1 less
1125 | than the ``true'' floating-point exponent. The handling of underflow and
1126 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1127 *----------------------------------------------------------------------------*/
1129 static float128
roundAndPackFloat128(flag zSign
, int32_t zExp
,
1130 uint64_t zSig0
, uint64_t zSig1
,
1131 uint64_t zSig2
, float_status
*status
)
1133 int8_t roundingMode
;
1134 flag roundNearestEven
, increment
, isTiny
;
1136 roundingMode
= status
->float_rounding_mode
;
1137 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
1138 switch (roundingMode
) {
1139 case float_round_nearest_even
:
1140 case float_round_ties_away
:
1141 increment
= ((int64_t)zSig2
< 0);
1143 case float_round_to_zero
:
1146 case float_round_up
:
1147 increment
= !zSign
&& zSig2
;
1149 case float_round_down
:
1150 increment
= zSign
&& zSig2
;
1155 if ( 0x7FFD <= (uint32_t) zExp
) {
1156 if ( ( 0x7FFD < zExp
)
1157 || ( ( zExp
== 0x7FFD )
1159 LIT64( 0x0001FFFFFFFFFFFF ),
1160 LIT64( 0xFFFFFFFFFFFFFFFF ),
1167 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
1168 if ( ( roundingMode
== float_round_to_zero
)
1169 || ( zSign
&& ( roundingMode
== float_round_up
) )
1170 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
1176 LIT64( 0x0000FFFFFFFFFFFF ),
1177 LIT64( 0xFFFFFFFFFFFFFFFF )
1180 return packFloat128( zSign
, 0x7FFF, 0, 0 );
1183 if (status
->flush_to_zero
) {
1184 float_raise(float_flag_output_denormal
, status
);
1185 return packFloat128(zSign
, 0, 0, 0);
1188 (status
->float_detect_tininess
1189 == float_tininess_before_rounding
)
1195 LIT64( 0x0001FFFFFFFFFFFF ),
1196 LIT64( 0xFFFFFFFFFFFFFFFF )
1198 shift128ExtraRightJamming(
1199 zSig0
, zSig1
, zSig2
, - zExp
, &zSig0
, &zSig1
, &zSig2
);
1201 if (isTiny
&& zSig2
) {
1202 float_raise(float_flag_underflow
, status
);
1204 switch (roundingMode
) {
1205 case float_round_nearest_even
:
1206 case float_round_ties_away
:
1207 increment
= ((int64_t)zSig2
< 0);
1209 case float_round_to_zero
:
1212 case float_round_up
:
1213 increment
= !zSign
&& zSig2
;
1215 case float_round_down
:
1216 increment
= zSign
&& zSig2
;
1224 status
->float_exception_flags
|= float_flag_inexact
;
1227 add128( zSig0
, zSig1
, 0, 1, &zSig0
, &zSig1
);
1228 zSig1
&= ~ ( ( zSig2
+ zSig2
== 0 ) & roundNearestEven
);
1231 if ( ( zSig0
| zSig1
) == 0 ) zExp
= 0;
1233 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
1237 /*----------------------------------------------------------------------------
1238 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1239 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1240 | returns the proper quadruple-precision floating-point value corresponding
1241 | to the abstract input. This routine is just like `roundAndPackFloat128'
1242 | except that the input significand has fewer bits and does not have to be
1243 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1245 *----------------------------------------------------------------------------*/
1247 static float128
normalizeRoundAndPackFloat128(flag zSign
, int32_t zExp
,
1248 uint64_t zSig0
, uint64_t zSig1
,
1249 float_status
*status
)
1259 shiftCount
= countLeadingZeros64( zSig0
) - 15;
1260 if ( 0 <= shiftCount
) {
1262 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
1265 shift128ExtraRightJamming(
1266 zSig0
, zSig1
, 0, - shiftCount
, &zSig0
, &zSig1
, &zSig2
);
1269 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
1273 /*----------------------------------------------------------------------------
1274 | Returns the result of converting the 32-bit two's complement integer `a'
1275 | to the single-precision floating-point format. The conversion is performed
1276 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1277 *----------------------------------------------------------------------------*/
1279 float32
int32_to_float32(int32_t a
, float_status
*status
)
1283 if ( a
== 0 ) return float32_zero
;
1284 if ( a
== (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1286 return normalizeRoundAndPackFloat32(zSign
, 0x9C, zSign
? -a
: a
, status
);
1289 /*----------------------------------------------------------------------------
1290 | Returns the result of converting the 32-bit two's complement integer `a'
1291 | to the double-precision floating-point format. The conversion is performed
1292 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1293 *----------------------------------------------------------------------------*/
1295 float64
int32_to_float64(int32_t a
, float_status
*status
)
1302 if ( a
== 0 ) return float64_zero
;
1304 absA
= zSign
? - a
: a
;
1305 shiftCount
= countLeadingZeros32( absA
) + 21;
1307 return packFloat64( zSign
, 0x432 - shiftCount
, zSig
<<shiftCount
);
1311 /*----------------------------------------------------------------------------
1312 | Returns the result of converting the 32-bit two's complement integer `a'
1313 | to the extended double-precision floating-point format. The conversion
1314 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1316 *----------------------------------------------------------------------------*/
1318 floatx80
int32_to_floatx80(int32_t a
, float_status
*status
)
1325 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
1327 absA
= zSign
? - a
: a
;
1328 shiftCount
= countLeadingZeros32( absA
) + 32;
1330 return packFloatx80( zSign
, 0x403E - shiftCount
, zSig
<<shiftCount
);
1334 /*----------------------------------------------------------------------------
1335 | Returns the result of converting the 32-bit two's complement integer `a' to
1336 | the quadruple-precision floating-point format. The conversion is performed
1337 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1338 *----------------------------------------------------------------------------*/
1340 float128
int32_to_float128(int32_t a
, float_status
*status
)
1347 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
1349 absA
= zSign
? - a
: a
;
1350 shiftCount
= countLeadingZeros32( absA
) + 17;
1352 return packFloat128( zSign
, 0x402E - shiftCount
, zSig0
<<shiftCount
, 0 );
1356 /*----------------------------------------------------------------------------
1357 | Returns the result of converting the 64-bit two's complement integer `a'
1358 | to the single-precision floating-point format. The conversion is performed
1359 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1360 *----------------------------------------------------------------------------*/
1362 float32
int64_to_float32(int64_t a
, float_status
*status
)
1368 if ( a
== 0 ) return float32_zero
;
1370 absA
= zSign
? - a
: a
;
1371 shiftCount
= countLeadingZeros64( absA
) - 40;
1372 if ( 0 <= shiftCount
) {
1373 return packFloat32( zSign
, 0x95 - shiftCount
, absA
<<shiftCount
);
1377 if ( shiftCount
< 0 ) {
1378 shift64RightJamming( absA
, - shiftCount
, &absA
);
1381 absA
<<= shiftCount
;
1383 return roundAndPackFloat32(zSign
, 0x9C - shiftCount
, absA
, status
);
1388 /*----------------------------------------------------------------------------
1389 | Returns the result of converting the 64-bit two's complement integer `a'
1390 | to the double-precision floating-point format. The conversion is performed
1391 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1392 *----------------------------------------------------------------------------*/
1394 float64
int64_to_float64(int64_t a
, float_status
*status
)
1398 if ( a
== 0 ) return float64_zero
;
1399 if ( a
== (int64_t) LIT64( 0x8000000000000000 ) ) {
1400 return packFloat64( 1, 0x43E, 0 );
1403 return normalizeRoundAndPackFloat64(zSign
, 0x43C, zSign
? -a
: a
, status
);
1406 /*----------------------------------------------------------------------------
1407 | Returns the result of converting the 64-bit two's complement integer `a'
1408 | to the extended double-precision floating-point format. The conversion
1409 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1411 *----------------------------------------------------------------------------*/
1413 floatx80
int64_to_floatx80(int64_t a
, float_status
*status
)
1419 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
1421 absA
= zSign
? - a
: a
;
1422 shiftCount
= countLeadingZeros64( absA
);
1423 return packFloatx80( zSign
, 0x403E - shiftCount
, absA
<<shiftCount
);
1427 /*----------------------------------------------------------------------------
1428 | Returns the result of converting the 64-bit two's complement integer `a' to
1429 | the quadruple-precision floating-point format. The conversion is performed
1430 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1431 *----------------------------------------------------------------------------*/
1433 float128
int64_to_float128(int64_t a
, float_status
*status
)
1439 uint64_t zSig0
, zSig1
;
1441 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
1443 absA
= zSign
? - a
: a
;
1444 shiftCount
= countLeadingZeros64( absA
) + 49;
1445 zExp
= 0x406E - shiftCount
;
1446 if ( 64 <= shiftCount
) {
1455 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
1456 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
1460 /*----------------------------------------------------------------------------
1461 | Returns the result of converting the 64-bit unsigned integer `a'
1462 | to the single-precision floating-point format. The conversion is performed
1463 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1464 *----------------------------------------------------------------------------*/
1466 float32
uint64_to_float32(uint64_t a
, float_status
*status
)
1471 return float32_zero
;
1474 /* Determine (left) shift needed to put first set bit into bit posn 23
1475 * (since packFloat32() expects the binary point between bits 23 and 22);
1476 * this is the fast case for smallish numbers.
1478 shiftcount
= countLeadingZeros64(a
) - 40;
1479 if (shiftcount
>= 0) {
1480 return packFloat32(0, 0x95 - shiftcount
, a
<< shiftcount
);
1482 /* Otherwise we need to do a round-and-pack. roundAndPackFloat32()
1483 * expects the binary point between bits 30 and 29, hence the + 7.
1486 if (shiftcount
< 0) {
1487 shift64RightJamming(a
, -shiftcount
, &a
);
1492 return roundAndPackFloat32(0, 0x9c - shiftcount
, a
, status
);
1495 /*----------------------------------------------------------------------------
1496 | Returns the result of converting the 64-bit unsigned integer `a'
1497 | to the double-precision floating-point format. The conversion is performed
1498 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1499 *----------------------------------------------------------------------------*/
1501 float64
uint64_to_float64(uint64_t a
, float_status
*status
)
1507 return float64_zero
;
1510 shiftcount
= countLeadingZeros64(a
) - 1;
1511 if (shiftcount
< 0) {
1512 shift64RightJamming(a
, -shiftcount
, &a
);
1516 return roundAndPackFloat64(0, exp
- shiftcount
, a
, status
);
1519 /*----------------------------------------------------------------------------
1520 | Returns the result of converting the 64-bit unsigned integer `a'
1521 | to the quadruple-precision floating-point format. The conversion is performed
1522 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1523 *----------------------------------------------------------------------------*/
1525 float128
uint64_to_float128(uint64_t a
, float_status
*status
)
1528 return float128_zero
;
1530 return normalizeRoundAndPackFloat128(0, 0x406E, a
, 0, status
);
1533 /*----------------------------------------------------------------------------
1534 | Returns the result of converting the single-precision floating-point value
1535 | `a' to the 32-bit two's complement integer format. The conversion is
1536 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1537 | Arithmetic---which means in particular that the conversion is rounded
1538 | according to the current rounding mode. If `a' is a NaN, the largest
1539 | positive integer is returned. Otherwise, if the conversion overflows, the
1540 | largest integer with the same sign as `a' is returned.
1541 *----------------------------------------------------------------------------*/
1543 int32_t float32_to_int32(float32 a
, float_status
*status
)
1551 a
= float32_squash_input_denormal(a
, status
);
1552 aSig
= extractFloat32Frac( a
);
1553 aExp
= extractFloat32Exp( a
);
1554 aSign
= extractFloat32Sign( a
);
1555 if ( ( aExp
== 0xFF ) && aSig
) aSign
= 0;
1556 if ( aExp
) aSig
|= 0x00800000;
1557 shiftCount
= 0xAF - aExp
;
1560 if ( 0 < shiftCount
) shift64RightJamming( aSig64
, shiftCount
, &aSig64
);
1561 return roundAndPackInt32(aSign
, aSig64
, status
);
1565 /*----------------------------------------------------------------------------
1566 | Returns the result of converting the single-precision floating-point value
1567 | `a' to the 32-bit two's complement integer format. The conversion is
1568 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1569 | Arithmetic, except that the conversion is always rounded toward zero.
1570 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1571 | the conversion overflows, the largest integer with the same sign as `a' is
1573 *----------------------------------------------------------------------------*/
1575 int32_t float32_to_int32_round_to_zero(float32 a
, float_status
*status
)
1582 a
= float32_squash_input_denormal(a
, status
);
1584 aSig
= extractFloat32Frac( a
);
1585 aExp
= extractFloat32Exp( a
);
1586 aSign
= extractFloat32Sign( a
);
1587 shiftCount
= aExp
- 0x9E;
1588 if ( 0 <= shiftCount
) {
1589 if ( float32_val(a
) != 0xCF000000 ) {
1590 float_raise(float_flag_invalid
, status
);
1591 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) return 0x7FFFFFFF;
1593 return (int32_t) 0x80000000;
1595 else if ( aExp
<= 0x7E ) {
1597 status
->float_exception_flags
|= float_flag_inexact
;
1601 aSig
= ( aSig
| 0x00800000 )<<8;
1602 z
= aSig
>>( - shiftCount
);
1603 if ( (uint32_t) ( aSig
<<( shiftCount
& 31 ) ) ) {
1604 status
->float_exception_flags
|= float_flag_inexact
;
1606 if ( aSign
) z
= - z
;
1611 /*----------------------------------------------------------------------------
1612 | Returns the result of converting the single-precision floating-point value
1613 | `a' to the 16-bit two's complement integer format. The conversion is
1614 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1615 | Arithmetic, except that the conversion is always rounded toward zero.
1616 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1617 | the conversion overflows, the largest integer with the same sign as `a' is
1619 *----------------------------------------------------------------------------*/
1621 int16_t float32_to_int16_round_to_zero(float32 a
, float_status
*status
)
1629 aSig
= extractFloat32Frac( a
);
1630 aExp
= extractFloat32Exp( a
);
1631 aSign
= extractFloat32Sign( a
);
1632 shiftCount
= aExp
- 0x8E;
1633 if ( 0 <= shiftCount
) {
1634 if ( float32_val(a
) != 0xC7000000 ) {
1635 float_raise(float_flag_invalid
, status
);
1636 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) {
1640 return (int32_t) 0xffff8000;
1642 else if ( aExp
<= 0x7E ) {
1643 if ( aExp
| aSig
) {
1644 status
->float_exception_flags
|= float_flag_inexact
;
1649 aSig
= ( aSig
| 0x00800000 )<<8;
1650 z
= aSig
>>( - shiftCount
);
1651 if ( (uint32_t) ( aSig
<<( shiftCount
& 31 ) ) ) {
1652 status
->float_exception_flags
|= float_flag_inexact
;
1661 /*----------------------------------------------------------------------------
1662 | Returns the result of converting the single-precision floating-point value
1663 | `a' to the 64-bit two's complement integer format. The conversion is
1664 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1665 | Arithmetic---which means in particular that the conversion is rounded
1666 | according to the current rounding mode. If `a' is a NaN, the largest
1667 | positive integer is returned. Otherwise, if the conversion overflows, the
1668 | largest integer with the same sign as `a' is returned.
1669 *----------------------------------------------------------------------------*/
1671 int64_t float32_to_int64(float32 a
, float_status
*status
)
1677 uint64_t aSig64
, aSigExtra
;
1678 a
= float32_squash_input_denormal(a
, status
);
1680 aSig
= extractFloat32Frac( a
);
1681 aExp
= extractFloat32Exp( a
);
1682 aSign
= extractFloat32Sign( a
);
1683 shiftCount
= 0xBE - aExp
;
1684 if ( shiftCount
< 0 ) {
1685 float_raise(float_flag_invalid
, status
);
1686 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) {
1687 return LIT64( 0x7FFFFFFFFFFFFFFF );
1689 return (int64_t) LIT64( 0x8000000000000000 );
1691 if ( aExp
) aSig
|= 0x00800000;
1694 shift64ExtraRightJamming( aSig64
, 0, shiftCount
, &aSig64
, &aSigExtra
);
1695 return roundAndPackInt64(aSign
, aSig64
, aSigExtra
, status
);
1699 /*----------------------------------------------------------------------------
1700 | Returns the result of converting the single-precision floating-point value
1701 | `a' to the 64-bit unsigned integer format. The conversion is
1702 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1703 | Arithmetic---which means in particular that the conversion is rounded
1704 | according to the current rounding mode. If `a' is a NaN, the largest
1705 | unsigned integer is returned. Otherwise, if the conversion overflows, the
1706 | largest unsigned integer is returned. If the 'a' is negative, the result
1707 | is rounded and zero is returned; values that do not round to zero will
1708 | raise the inexact exception flag.
1709 *----------------------------------------------------------------------------*/
1711 uint64_t float32_to_uint64(float32 a
, float_status
*status
)
1717 uint64_t aSig64
, aSigExtra
;
1718 a
= float32_squash_input_denormal(a
, status
);
1720 aSig
= extractFloat32Frac(a
);
1721 aExp
= extractFloat32Exp(a
);
1722 aSign
= extractFloat32Sign(a
);
1723 if ((aSign
) && (aExp
> 126)) {
1724 float_raise(float_flag_invalid
, status
);
1725 if (float32_is_any_nan(a
)) {
1726 return LIT64(0xFFFFFFFFFFFFFFFF);
1731 shiftCount
= 0xBE - aExp
;
1735 if (shiftCount
< 0) {
1736 float_raise(float_flag_invalid
, status
);
1737 return LIT64(0xFFFFFFFFFFFFFFFF);
1742 shift64ExtraRightJamming(aSig64
, 0, shiftCount
, &aSig64
, &aSigExtra
);
1743 return roundAndPackUint64(aSign
, aSig64
, aSigExtra
, status
);
1746 /*----------------------------------------------------------------------------
1747 | Returns the result of converting the single-precision floating-point value
1748 | `a' to the 64-bit unsigned integer format. The conversion is
1749 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1750 | Arithmetic, except that the conversion is always rounded toward zero. If
1751 | `a' is a NaN, the largest unsigned integer is returned. Otherwise, if the
1752 | conversion overflows, the largest unsigned integer is returned. If the
1753 | 'a' is negative, the result is rounded and zero is returned; values that do
1754 | not round to zero will raise the inexact flag.
1755 *----------------------------------------------------------------------------*/
1757 uint64_t float32_to_uint64_round_to_zero(float32 a
, float_status
*status
)
1759 signed char current_rounding_mode
= status
->float_rounding_mode
;
1760 set_float_rounding_mode(float_round_to_zero
, status
);
1761 int64_t v
= float32_to_uint64(a
, status
);
1762 set_float_rounding_mode(current_rounding_mode
, status
);
1766 /*----------------------------------------------------------------------------
1767 | Returns the result of converting the single-precision floating-point value
1768 | `a' to the 64-bit two's complement integer format. The conversion is
1769 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1770 | Arithmetic, except that the conversion is always rounded toward zero. If
1771 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1772 | conversion overflows, the largest integer with the same sign as `a' is
1774 *----------------------------------------------------------------------------*/
1776 int64_t float32_to_int64_round_to_zero(float32 a
, float_status
*status
)
1784 a
= float32_squash_input_denormal(a
, status
);
1786 aSig
= extractFloat32Frac( a
);
1787 aExp
= extractFloat32Exp( a
);
1788 aSign
= extractFloat32Sign( a
);
1789 shiftCount
= aExp
- 0xBE;
1790 if ( 0 <= shiftCount
) {
1791 if ( float32_val(a
) != 0xDF000000 ) {
1792 float_raise(float_flag_invalid
, status
);
1793 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) {
1794 return LIT64( 0x7FFFFFFFFFFFFFFF );
1797 return (int64_t) LIT64( 0x8000000000000000 );
1799 else if ( aExp
<= 0x7E ) {
1801 status
->float_exception_flags
|= float_flag_inexact
;
1805 aSig64
= aSig
| 0x00800000;
1807 z
= aSig64
>>( - shiftCount
);
1808 if ( (uint64_t) ( aSig64
<<( shiftCount
& 63 ) ) ) {
1809 status
->float_exception_flags
|= float_flag_inexact
;
1811 if ( aSign
) z
= - z
;
1816 /*----------------------------------------------------------------------------
1817 | Returns the result of converting the single-precision floating-point value
1818 | `a' to the double-precision floating-point format. The conversion is
1819 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1821 *----------------------------------------------------------------------------*/
1823 float64
float32_to_float64(float32 a
, float_status
*status
)
1828 a
= float32_squash_input_denormal(a
, status
);
1830 aSig
= extractFloat32Frac( a
);
1831 aExp
= extractFloat32Exp( a
);
1832 aSign
= extractFloat32Sign( a
);
1833 if ( aExp
== 0xFF ) {
1835 return commonNaNToFloat64(float32ToCommonNaN(a
, status
), status
);
1837 return packFloat64( aSign
, 0x7FF, 0 );
1840 if ( aSig
== 0 ) return packFloat64( aSign
, 0, 0 );
1841 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1844 return packFloat64( aSign
, aExp
+ 0x380, ( (uint64_t) aSig
)<<29 );
1848 /*----------------------------------------------------------------------------
1849 | Returns the result of converting the single-precision floating-point value
1850 | `a' to the extended double-precision floating-point format. The conversion
1851 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1853 *----------------------------------------------------------------------------*/
1855 floatx80
float32_to_floatx80(float32 a
, float_status
*status
)
1861 a
= float32_squash_input_denormal(a
, status
);
1862 aSig
= extractFloat32Frac( a
);
1863 aExp
= extractFloat32Exp( a
);
1864 aSign
= extractFloat32Sign( a
);
1865 if ( aExp
== 0xFF ) {
1867 return commonNaNToFloatx80(float32ToCommonNaN(a
, status
), status
);
1869 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
1872 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
1873 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1876 return packFloatx80( aSign
, aExp
+ 0x3F80, ( (uint64_t) aSig
)<<40 );
1880 /*----------------------------------------------------------------------------
1881 | Returns the result of converting the single-precision floating-point value
1882 | `a' to the double-precision floating-point format. The conversion is
1883 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1885 *----------------------------------------------------------------------------*/
1887 float128
float32_to_float128(float32 a
, float_status
*status
)
1893 a
= float32_squash_input_denormal(a
, status
);
1894 aSig
= extractFloat32Frac( a
);
1895 aExp
= extractFloat32Exp( a
);
1896 aSign
= extractFloat32Sign( a
);
1897 if ( aExp
== 0xFF ) {
1899 return commonNaNToFloat128(float32ToCommonNaN(a
, status
), status
);
1901 return packFloat128( aSign
, 0x7FFF, 0, 0 );
1904 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
1905 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1908 return packFloat128( aSign
, aExp
+ 0x3F80, ( (uint64_t) aSig
)<<25, 0 );
1912 /*----------------------------------------------------------------------------
1913 | Rounds the single-precision floating-point value `a' to an integer, and
1914 | returns the result as a single-precision floating-point value. The
1915 | operation is performed according to the IEC/IEEE Standard for Binary
1916 | Floating-Point Arithmetic.
1917 *----------------------------------------------------------------------------*/
1919 float32
float32_round_to_int(float32 a
, float_status
*status
)
1923 uint32_t lastBitMask
, roundBitsMask
;
1925 a
= float32_squash_input_denormal(a
, status
);
1927 aExp
= extractFloat32Exp( a
);
1928 if ( 0x96 <= aExp
) {
1929 if ( ( aExp
== 0xFF ) && extractFloat32Frac( a
) ) {
1930 return propagateFloat32NaN(a
, a
, status
);
1934 if ( aExp
<= 0x7E ) {
1935 if ( (uint32_t) ( float32_val(a
)<<1 ) == 0 ) return a
;
1936 status
->float_exception_flags
|= float_flag_inexact
;
1937 aSign
= extractFloat32Sign( a
);
1938 switch (status
->float_rounding_mode
) {
1939 case float_round_nearest_even
:
1940 if ( ( aExp
== 0x7E ) && extractFloat32Frac( a
) ) {
1941 return packFloat32( aSign
, 0x7F, 0 );
1944 case float_round_ties_away
:
1946 return packFloat32(aSign
, 0x7F, 0);
1949 case float_round_down
:
1950 return make_float32(aSign
? 0xBF800000 : 0);
1951 case float_round_up
:
1952 return make_float32(aSign
? 0x80000000 : 0x3F800000);
1954 return packFloat32( aSign
, 0, 0 );
1957 lastBitMask
<<= 0x96 - aExp
;
1958 roundBitsMask
= lastBitMask
- 1;
1960 switch (status
->float_rounding_mode
) {
1961 case float_round_nearest_even
:
1962 z
+= lastBitMask
>>1;
1963 if ((z
& roundBitsMask
) == 0) {
1967 case float_round_ties_away
:
1968 z
+= lastBitMask
>> 1;
1970 case float_round_to_zero
:
1972 case float_round_up
:
1973 if (!extractFloat32Sign(make_float32(z
))) {
1977 case float_round_down
:
1978 if (extractFloat32Sign(make_float32(z
))) {
1985 z
&= ~ roundBitsMask
;
1986 if (z
!= float32_val(a
)) {
1987 status
->float_exception_flags
|= float_flag_inexact
;
1989 return make_float32(z
);
1993 /*----------------------------------------------------------------------------
1994 | Returns the result of adding the absolute values of the single-precision
1995 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1996 | before being returned. `zSign' is ignored if the result is a NaN.
1997 | The addition is performed according to the IEC/IEEE Standard for Binary
1998 | Floating-Point Arithmetic.
1999 *----------------------------------------------------------------------------*/
2001 static float32
addFloat32Sigs(float32 a
, float32 b
, flag zSign
,
2002 float_status
*status
)
2004 int aExp
, bExp
, zExp
;
2005 uint32_t aSig
, bSig
, zSig
;
2008 aSig
= extractFloat32Frac( a
);
2009 aExp
= extractFloat32Exp( a
);
2010 bSig
= extractFloat32Frac( b
);
2011 bExp
= extractFloat32Exp( b
);
2012 expDiff
= aExp
- bExp
;
2015 if ( 0 < expDiff
) {
2016 if ( aExp
== 0xFF ) {
2018 return propagateFloat32NaN(a
, b
, status
);
2028 shift32RightJamming( bSig
, expDiff
, &bSig
);
2031 else if ( expDiff
< 0 ) {
2032 if ( bExp
== 0xFF ) {
2034 return propagateFloat32NaN(a
, b
, status
);
2036 return packFloat32( zSign
, 0xFF, 0 );
2044 shift32RightJamming( aSig
, - expDiff
, &aSig
);
2048 if ( aExp
== 0xFF ) {
2050 return propagateFloat32NaN(a
, b
, status
);
2055 if (status
->flush_to_zero
) {
2057 float_raise(float_flag_output_denormal
, status
);
2059 return packFloat32(zSign
, 0, 0);
2061 return packFloat32( zSign
, 0, ( aSig
+ bSig
)>>6 );
2063 zSig
= 0x40000000 + aSig
+ bSig
;
2068 zSig
= ( aSig
+ bSig
)<<1;
2070 if ( (int32_t) zSig
< 0 ) {
2075 return roundAndPackFloat32(zSign
, zExp
, zSig
, status
);
2079 /*----------------------------------------------------------------------------
2080 | Returns the result of subtracting the absolute values of the single-
2081 | precision floating-point values `a' and `b'. If `zSign' is 1, the
2082 | difference is negated before being returned. `zSign' is ignored if the
2083 | result is a NaN. The subtraction is performed according to the IEC/IEEE
2084 | Standard for Binary Floating-Point Arithmetic.
2085 *----------------------------------------------------------------------------*/
2087 static float32
subFloat32Sigs(float32 a
, float32 b
, flag zSign
,
2088 float_status
*status
)
2090 int aExp
, bExp
, zExp
;
2091 uint32_t aSig
, bSig
, zSig
;
2094 aSig
= extractFloat32Frac( a
);
2095 aExp
= extractFloat32Exp( a
);
2096 bSig
= extractFloat32Frac( b
);
2097 bExp
= extractFloat32Exp( b
);
2098 expDiff
= aExp
- bExp
;
2101 if ( 0 < expDiff
) goto aExpBigger
;
2102 if ( expDiff
< 0 ) goto bExpBigger
;
2103 if ( aExp
== 0xFF ) {
2105 return propagateFloat32NaN(a
, b
, status
);
2107 float_raise(float_flag_invalid
, status
);
2108 return float32_default_nan
;
2114 if ( bSig
< aSig
) goto aBigger
;
2115 if ( aSig
< bSig
) goto bBigger
;
2116 return packFloat32(status
->float_rounding_mode
== float_round_down
, 0, 0);
2118 if ( bExp
== 0xFF ) {
2120 return propagateFloat32NaN(a
, b
, status
);
2122 return packFloat32( zSign
^ 1, 0xFF, 0 );
2130 shift32RightJamming( aSig
, - expDiff
, &aSig
);
2136 goto normalizeRoundAndPack
;
2138 if ( aExp
== 0xFF ) {
2140 return propagateFloat32NaN(a
, b
, status
);
2150 shift32RightJamming( bSig
, expDiff
, &bSig
);
2155 normalizeRoundAndPack
:
2157 return normalizeRoundAndPackFloat32(zSign
, zExp
, zSig
, status
);
2161 /*----------------------------------------------------------------------------
2162 | Returns the result of adding the single-precision floating-point values `a'
2163 | and `b'. The operation is performed according to the IEC/IEEE Standard for
2164 | Binary Floating-Point Arithmetic.
2165 *----------------------------------------------------------------------------*/
2167 float32
float32_add(float32 a
, float32 b
, float_status
*status
)
2170 a
= float32_squash_input_denormal(a
, status
);
2171 b
= float32_squash_input_denormal(b
, status
);
2173 aSign
= extractFloat32Sign( a
);
2174 bSign
= extractFloat32Sign( b
);
2175 if ( aSign
== bSign
) {
2176 return addFloat32Sigs(a
, b
, aSign
, status
);
2179 return subFloat32Sigs(a
, b
, aSign
, status
);
2184 /*----------------------------------------------------------------------------
2185 | Returns the result of subtracting the single-precision floating-point values
2186 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2187 | for Binary Floating-Point Arithmetic.
2188 *----------------------------------------------------------------------------*/
2190 float32
float32_sub(float32 a
, float32 b
, float_status
*status
)
2193 a
= float32_squash_input_denormal(a
, status
);
2194 b
= float32_squash_input_denormal(b
, status
);
2196 aSign
= extractFloat32Sign( a
);
2197 bSign
= extractFloat32Sign( b
);
2198 if ( aSign
== bSign
) {
2199 return subFloat32Sigs(a
, b
, aSign
, status
);
2202 return addFloat32Sigs(a
, b
, aSign
, status
);
2207 /*----------------------------------------------------------------------------
2208 | Returns the result of multiplying the single-precision floating-point values
2209 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2210 | for Binary Floating-Point Arithmetic.
2211 *----------------------------------------------------------------------------*/
2213 float32
float32_mul(float32 a
, float32 b
, float_status
*status
)
2215 flag aSign
, bSign
, zSign
;
2216 int aExp
, bExp
, zExp
;
2217 uint32_t aSig
, bSig
;
2221 a
= float32_squash_input_denormal(a
, status
);
2222 b
= float32_squash_input_denormal(b
, status
);
2224 aSig
= extractFloat32Frac( a
);
2225 aExp
= extractFloat32Exp( a
);
2226 aSign
= extractFloat32Sign( a
);
2227 bSig
= extractFloat32Frac( b
);
2228 bExp
= extractFloat32Exp( b
);
2229 bSign
= extractFloat32Sign( b
);
2230 zSign
= aSign
^ bSign
;
2231 if ( aExp
== 0xFF ) {
2232 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
2233 return propagateFloat32NaN(a
, b
, status
);
2235 if ( ( bExp
| bSig
) == 0 ) {
2236 float_raise(float_flag_invalid
, status
);
2237 return float32_default_nan
;
2239 return packFloat32( zSign
, 0xFF, 0 );
2241 if ( bExp
== 0xFF ) {
2243 return propagateFloat32NaN(a
, b
, status
);
2245 if ( ( aExp
| aSig
) == 0 ) {
2246 float_raise(float_flag_invalid
, status
);
2247 return float32_default_nan
;
2249 return packFloat32( zSign
, 0xFF, 0 );
2252 if ( aSig
== 0 ) return packFloat32( zSign
, 0, 0 );
2253 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
2256 if ( bSig
== 0 ) return packFloat32( zSign
, 0, 0 );
2257 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
2259 zExp
= aExp
+ bExp
- 0x7F;
2260 aSig
= ( aSig
| 0x00800000 )<<7;
2261 bSig
= ( bSig
| 0x00800000 )<<8;
2262 shift64RightJamming( ( (uint64_t) aSig
) * bSig
, 32, &zSig64
);
2264 if ( 0 <= (int32_t) ( zSig
<<1 ) ) {
2268 return roundAndPackFloat32(zSign
, zExp
, zSig
, status
);
2272 /*----------------------------------------------------------------------------
2273 | Returns the result of dividing the single-precision floating-point value `a'
2274 | by the corresponding value `b'. The operation is performed according to the
2275 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2276 *----------------------------------------------------------------------------*/
2278 float32
float32_div(float32 a
, float32 b
, float_status
*status
)
2280 flag aSign
, bSign
, zSign
;
2281 int aExp
, bExp
, zExp
;
2282 uint32_t aSig
, bSig
, zSig
;
2283 a
= float32_squash_input_denormal(a
, status
);
2284 b
= float32_squash_input_denormal(b
, status
);
2286 aSig
= extractFloat32Frac( a
);
2287 aExp
= extractFloat32Exp( a
);
2288 aSign
= extractFloat32Sign( a
);
2289 bSig
= extractFloat32Frac( b
);
2290 bExp
= extractFloat32Exp( b
);
2291 bSign
= extractFloat32Sign( b
);
2292 zSign
= aSign
^ bSign
;
2293 if ( aExp
== 0xFF ) {
2295 return propagateFloat32NaN(a
, b
, status
);
2297 if ( bExp
== 0xFF ) {
2299 return propagateFloat32NaN(a
, b
, status
);
2301 float_raise(float_flag_invalid
, status
);
2302 return float32_default_nan
;
2304 return packFloat32( zSign
, 0xFF, 0 );
2306 if ( bExp
== 0xFF ) {
2308 return propagateFloat32NaN(a
, b
, status
);
2310 return packFloat32( zSign
, 0, 0 );
2314 if ( ( aExp
| aSig
) == 0 ) {
2315 float_raise(float_flag_invalid
, status
);
2316 return float32_default_nan
;
2318 float_raise(float_flag_divbyzero
, status
);
2319 return packFloat32( zSign
, 0xFF, 0 );
2321 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
2324 if ( aSig
== 0 ) return packFloat32( zSign
, 0, 0 );
2325 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
2327 zExp
= aExp
- bExp
+ 0x7D;
2328 aSig
= ( aSig
| 0x00800000 )<<7;
2329 bSig
= ( bSig
| 0x00800000 )<<8;
2330 if ( bSig
<= ( aSig
+ aSig
) ) {
2334 zSig
= ( ( (uint64_t) aSig
)<<32 ) / bSig
;
2335 if ( ( zSig
& 0x3F ) == 0 ) {
2336 zSig
|= ( (uint64_t) bSig
* zSig
!= ( (uint64_t) aSig
)<<32 );
2338 return roundAndPackFloat32(zSign
, zExp
, zSig
, status
);
2342 /*----------------------------------------------------------------------------
2343 | Returns the remainder of the single-precision floating-point value `a'
2344 | with respect to the corresponding value `b'. The operation is performed
2345 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2346 *----------------------------------------------------------------------------*/
2348 float32
float32_rem(float32 a
, float32 b
, float_status
*status
)
2351 int aExp
, bExp
, expDiff
;
2352 uint32_t aSig
, bSig
;
2354 uint64_t aSig64
, bSig64
, q64
;
2355 uint32_t alternateASig
;
2357 a
= float32_squash_input_denormal(a
, status
);
2358 b
= float32_squash_input_denormal(b
, status
);
2360 aSig
= extractFloat32Frac( a
);
2361 aExp
= extractFloat32Exp( a
);
2362 aSign
= extractFloat32Sign( a
);
2363 bSig
= extractFloat32Frac( b
);
2364 bExp
= extractFloat32Exp( b
);
2365 if ( aExp
== 0xFF ) {
2366 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
2367 return propagateFloat32NaN(a
, b
, status
);
2369 float_raise(float_flag_invalid
, status
);
2370 return float32_default_nan
;
2372 if ( bExp
== 0xFF ) {
2374 return propagateFloat32NaN(a
, b
, status
);
2380 float_raise(float_flag_invalid
, status
);
2381 return float32_default_nan
;
2383 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
2386 if ( aSig
== 0 ) return a
;
2387 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
2389 expDiff
= aExp
- bExp
;
2392 if ( expDiff
< 32 ) {
2395 if ( expDiff
< 0 ) {
2396 if ( expDiff
< -1 ) return a
;
2399 q
= ( bSig
<= aSig
);
2400 if ( q
) aSig
-= bSig
;
2401 if ( 0 < expDiff
) {
2402 q
= ( ( (uint64_t) aSig
)<<32 ) / bSig
;
2405 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
2413 if ( bSig
<= aSig
) aSig
-= bSig
;
2414 aSig64
= ( (uint64_t) aSig
)<<40;
2415 bSig64
= ( (uint64_t) bSig
)<<40;
2417 while ( 0 < expDiff
) {
2418 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
2419 q64
= ( 2 < q64
) ? q64
- 2 : 0;
2420 aSig64
= - ( ( bSig
* q64
)<<38 );
2424 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
2425 q64
= ( 2 < q64
) ? q64
- 2 : 0;
2426 q
= q64
>>( 64 - expDiff
);
2428 aSig
= ( ( aSig64
>>33 )<<( expDiff
- 1 ) ) - bSig
* q
;
2431 alternateASig
= aSig
;
2434 } while ( 0 <= (int32_t) aSig
);
2435 sigMean
= aSig
+ alternateASig
;
2436 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
2437 aSig
= alternateASig
;
2439 zSign
= ( (int32_t) aSig
< 0 );
2440 if ( zSign
) aSig
= - aSig
;
2441 return normalizeRoundAndPackFloat32(aSign
^ zSign
, bExp
, aSig
, status
);
2444 /*----------------------------------------------------------------------------
2445 | Returns the result of multiplying the single-precision floating-point values
2446 | `a' and `b' then adding 'c', with no intermediate rounding step after the
2447 | multiplication. The operation is performed according to the IEC/IEEE
2448 | Standard for Binary Floating-Point Arithmetic 754-2008.
2449 | The flags argument allows the caller to select negation of the
2450 | addend, the intermediate product, or the final result. (The difference
2451 | between this and having the caller do a separate negation is that negating
2452 | externally will flip the sign bit on NaNs.)
2453 *----------------------------------------------------------------------------*/
2455 float32
float32_muladd(float32 a
, float32 b
, float32 c
, int flags
,
2456 float_status
*status
)
2458 flag aSign
, bSign
, cSign
, zSign
;
2459 int aExp
, bExp
, cExp
, pExp
, zExp
, expDiff
;
2460 uint32_t aSig
, bSig
, cSig
;
2461 flag pInf
, pZero
, pSign
;
2462 uint64_t pSig64
, cSig64
, zSig64
;
2465 flag signflip
, infzero
;
2467 a
= float32_squash_input_denormal(a
, status
);
2468 b
= float32_squash_input_denormal(b
, status
);
2469 c
= float32_squash_input_denormal(c
, status
);
2470 aSig
= extractFloat32Frac(a
);
2471 aExp
= extractFloat32Exp(a
);
2472 aSign
= extractFloat32Sign(a
);
2473 bSig
= extractFloat32Frac(b
);
2474 bExp
= extractFloat32Exp(b
);
2475 bSign
= extractFloat32Sign(b
);
2476 cSig
= extractFloat32Frac(c
);
2477 cExp
= extractFloat32Exp(c
);
2478 cSign
= extractFloat32Sign(c
);
2480 infzero
= ((aExp
== 0 && aSig
== 0 && bExp
== 0xff && bSig
== 0) ||
2481 (aExp
== 0xff && aSig
== 0 && bExp
== 0 && bSig
== 0));
2483 /* It is implementation-defined whether the cases of (0,inf,qnan)
2484 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
2485 * they return if they do), so we have to hand this information
2486 * off to the target-specific pick-a-NaN routine.
2488 if (((aExp
== 0xff) && aSig
) ||
2489 ((bExp
== 0xff) && bSig
) ||
2490 ((cExp
== 0xff) && cSig
)) {
2491 return propagateFloat32MulAddNaN(a
, b
, c
, infzero
, status
);
2495 float_raise(float_flag_invalid
, status
);
2496 return float32_default_nan
;
2499 if (flags
& float_muladd_negate_c
) {
2503 signflip
= (flags
& float_muladd_negate_result
) ? 1 : 0;
2505 /* Work out the sign and type of the product */
2506 pSign
= aSign
^ bSign
;
2507 if (flags
& float_muladd_negate_product
) {
2510 pInf
= (aExp
== 0xff) || (bExp
== 0xff);
2511 pZero
= ((aExp
| aSig
) == 0) || ((bExp
| bSig
) == 0);
2514 if (pInf
&& (pSign
^ cSign
)) {
2515 /* addition of opposite-signed infinities => InvalidOperation */
2516 float_raise(float_flag_invalid
, status
);
2517 return float32_default_nan
;
2519 /* Otherwise generate an infinity of the same sign */
2520 return packFloat32(cSign
^ signflip
, 0xff, 0);
2524 return packFloat32(pSign
^ signflip
, 0xff, 0);
2530 /* Adding two exact zeroes */
2531 if (pSign
== cSign
) {
2533 } else if (status
->float_rounding_mode
== float_round_down
) {
2538 return packFloat32(zSign
^ signflip
, 0, 0);
2540 /* Exact zero plus a denorm */
2541 if (status
->flush_to_zero
) {
2542 float_raise(float_flag_output_denormal
, status
);
2543 return packFloat32(cSign
^ signflip
, 0, 0);
2546 /* Zero plus something non-zero : just return the something */
2547 if (flags
& float_muladd_halve_result
) {
2549 normalizeFloat32Subnormal(cSig
, &cExp
, &cSig
);
2551 /* Subtract one to halve, and one again because roundAndPackFloat32
2552 * wants one less than the true exponent.
2555 cSig
= (cSig
| 0x00800000) << 7;
2556 return roundAndPackFloat32(cSign
^ signflip
, cExp
, cSig
, status
);
2558 return packFloat32(cSign
^ signflip
, cExp
, cSig
);
2562 normalizeFloat32Subnormal(aSig
, &aExp
, &aSig
);
2565 normalizeFloat32Subnormal(bSig
, &bExp
, &bSig
);
2568 /* Calculate the actual result a * b + c */
2570 /* Multiply first; this is easy. */
2571 /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
2572 * because we want the true exponent, not the "one-less-than"
2573 * flavour that roundAndPackFloat32() takes.
2575 pExp
= aExp
+ bExp
- 0x7e;
2576 aSig
= (aSig
| 0x00800000) << 7;
2577 bSig
= (bSig
| 0x00800000) << 8;
2578 pSig64
= (uint64_t)aSig
* bSig
;
2579 if ((int64_t)(pSig64
<< 1) >= 0) {
2584 zSign
= pSign
^ signflip
;
2586 /* Now pSig64 is the significand of the multiply, with the explicit bit in
2591 /* Throw out the special case of c being an exact zero now */
2592 shift64RightJamming(pSig64
, 32, &pSig64
);
2594 if (flags
& float_muladd_halve_result
) {
2597 return roundAndPackFloat32(zSign
, pExp
- 1,
2600 normalizeFloat32Subnormal(cSig
, &cExp
, &cSig
);
2603 cSig64
= (uint64_t)cSig
<< (62 - 23);
2604 cSig64
|= LIT64(0x4000000000000000);
2605 expDiff
= pExp
- cExp
;
2607 if (pSign
== cSign
) {
2610 /* scale c to match p */
2611 shift64RightJamming(cSig64
, expDiff
, &cSig64
);
2613 } else if (expDiff
< 0) {
2614 /* scale p to match c */
2615 shift64RightJamming(pSig64
, -expDiff
, &pSig64
);
2618 /* no scaling needed */
2621 /* Add significands and make sure explicit bit ends up in posn 62 */
2622 zSig64
= pSig64
+ cSig64
;
2623 if ((int64_t)zSig64
< 0) {
2624 shift64RightJamming(zSig64
, 1, &zSig64
);
2631 shift64RightJamming(cSig64
, expDiff
, &cSig64
);
2632 zSig64
= pSig64
- cSig64
;
2634 } else if (expDiff
< 0) {
2635 shift64RightJamming(pSig64
, -expDiff
, &pSig64
);
2636 zSig64
= cSig64
- pSig64
;
2641 if (cSig64
< pSig64
) {
2642 zSig64
= pSig64
- cSig64
;
2643 } else if (pSig64
< cSig64
) {
2644 zSig64
= cSig64
- pSig64
;
2649 if (status
->float_rounding_mode
== float_round_down
) {
2652 return packFloat32(zSign
, 0, 0);
2656 /* Normalize to put the explicit bit back into bit 62. */
2657 shiftcount
= countLeadingZeros64(zSig64
) - 1;
2658 zSig64
<<= shiftcount
;
2661 if (flags
& float_muladd_halve_result
) {
2665 shift64RightJamming(zSig64
, 32, &zSig64
);
2666 return roundAndPackFloat32(zSign
, zExp
, zSig64
, status
);
2670 /*----------------------------------------------------------------------------
2671 | Returns the square root of the single-precision floating-point value `a'.
2672 | The operation is performed according to the IEC/IEEE Standard for Binary
2673 | Floating-Point Arithmetic.
2674 *----------------------------------------------------------------------------*/
2676 float32
float32_sqrt(float32 a
, float_status
*status
)
2680 uint32_t aSig
, zSig
;
2682 a
= float32_squash_input_denormal(a
, status
);
2684 aSig
= extractFloat32Frac( a
);
2685 aExp
= extractFloat32Exp( a
);
2686 aSign
= extractFloat32Sign( a
);
2687 if ( aExp
== 0xFF ) {
2689 return propagateFloat32NaN(a
, float32_zero
, status
);
2691 if ( ! aSign
) return a
;
2692 float_raise(float_flag_invalid
, status
);
2693 return float32_default_nan
;
2696 if ( ( aExp
| aSig
) == 0 ) return a
;
2697 float_raise(float_flag_invalid
, status
);
2698 return float32_default_nan
;
2701 if ( aSig
== 0 ) return float32_zero
;
2702 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
2704 zExp
= ( ( aExp
- 0x7F )>>1 ) + 0x7E;
2705 aSig
= ( aSig
| 0x00800000 )<<8;
2706 zSig
= estimateSqrt32( aExp
, aSig
) + 2;
2707 if ( ( zSig
& 0x7F ) <= 5 ) {
2713 term
= ( (uint64_t) zSig
) * zSig
;
2714 rem
= ( ( (uint64_t) aSig
)<<32 ) - term
;
2715 while ( (int64_t) rem
< 0 ) {
2717 rem
+= ( ( (uint64_t) zSig
)<<1 ) | 1;
2719 zSig
|= ( rem
!= 0 );
2721 shift32RightJamming( zSig
, 1, &zSig
);
2723 return roundAndPackFloat32(0, zExp
, zSig
, status
);
2727 /*----------------------------------------------------------------------------
2728 | Returns the binary exponential of the single-precision floating-point value
2729 | `a'. The operation is performed according to the IEC/IEEE Standard for
2730 | Binary Floating-Point Arithmetic.
2732 | Uses the following identities:
2734 | 1. -------------------------------------------------------------------------
2738 | 2. -------------------------------------------------------------------------
2741 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2743 *----------------------------------------------------------------------------*/
2745 static const float64 float32_exp2_coefficients
[15] =
2747 const_float64( 0x3ff0000000000000ll
), /* 1 */
2748 const_float64( 0x3fe0000000000000ll
), /* 2 */
2749 const_float64( 0x3fc5555555555555ll
), /* 3 */
2750 const_float64( 0x3fa5555555555555ll
), /* 4 */
2751 const_float64( 0x3f81111111111111ll
), /* 5 */
2752 const_float64( 0x3f56c16c16c16c17ll
), /* 6 */
2753 const_float64( 0x3f2a01a01a01a01all
), /* 7 */
2754 const_float64( 0x3efa01a01a01a01all
), /* 8 */
2755 const_float64( 0x3ec71de3a556c734ll
), /* 9 */
2756 const_float64( 0x3e927e4fb7789f5cll
), /* 10 */
2757 const_float64( 0x3e5ae64567f544e4ll
), /* 11 */
2758 const_float64( 0x3e21eed8eff8d898ll
), /* 12 */
2759 const_float64( 0x3de6124613a86d09ll
), /* 13 */
2760 const_float64( 0x3da93974a8c07c9dll
), /* 14 */
2761 const_float64( 0x3d6ae7f3e733b81fll
), /* 15 */
2764 float32
float32_exp2(float32 a
, float_status
*status
)
2771 a
= float32_squash_input_denormal(a
, status
);
2773 aSig
= extractFloat32Frac( a
);
2774 aExp
= extractFloat32Exp( a
);
2775 aSign
= extractFloat32Sign( a
);
2777 if ( aExp
== 0xFF) {
2779 return propagateFloat32NaN(a
, float32_zero
, status
);
2781 return (aSign
) ? float32_zero
: a
;
2784 if (aSig
== 0) return float32_one
;
2787 float_raise(float_flag_inexact
, status
);
2789 /* ******************************* */
2790 /* using float64 for approximation */
2791 /* ******************************* */
2792 x
= float32_to_float64(a
, status
);
2793 x
= float64_mul(x
, float64_ln2
, status
);
2797 for (i
= 0 ; i
< 15 ; i
++) {
2800 f
= float64_mul(xn
, float32_exp2_coefficients
[i
], status
);
2801 r
= float64_add(r
, f
, status
);
2803 xn
= float64_mul(xn
, x
, status
);
2806 return float64_to_float32(r
, status
);
2809 /*----------------------------------------------------------------------------
2810 | Returns the binary log of the single-precision floating-point value `a'.
2811 | The operation is performed according to the IEC/IEEE Standard for Binary
2812 | Floating-Point Arithmetic.
2813 *----------------------------------------------------------------------------*/
2814 float32
float32_log2(float32 a
, float_status
*status
)
2818 uint32_t aSig
, zSig
, i
;
2820 a
= float32_squash_input_denormal(a
, status
);
2821 aSig
= extractFloat32Frac( a
);
2822 aExp
= extractFloat32Exp( a
);
2823 aSign
= extractFloat32Sign( a
);
2826 if ( aSig
== 0 ) return packFloat32( 1, 0xFF, 0 );
2827 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
2830 float_raise(float_flag_invalid
, status
);
2831 return float32_default_nan
;
2833 if ( aExp
== 0xFF ) {
2835 return propagateFloat32NaN(a
, float32_zero
, status
);
2845 for (i
= 1 << 22; i
> 0; i
>>= 1) {
2846 aSig
= ( (uint64_t)aSig
* aSig
) >> 23;
2847 if ( aSig
& 0x01000000 ) {
2856 return normalizeRoundAndPackFloat32(zSign
, 0x85, zSig
, status
);
2859 /*----------------------------------------------------------------------------
2860 | Returns 1 if the single-precision floating-point value `a' is equal to
2861 | the corresponding value `b', and 0 otherwise. The invalid exception is
2862 | raised if either operand is a NaN. Otherwise, the comparison is performed
2863 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2864 *----------------------------------------------------------------------------*/
2866 int float32_eq(float32 a
, float32 b
, float_status
*status
)
2869 a
= float32_squash_input_denormal(a
, status
);
2870 b
= float32_squash_input_denormal(b
, status
);
2872 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2873 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2875 float_raise(float_flag_invalid
, status
);
2878 av
= float32_val(a
);
2879 bv
= float32_val(b
);
2880 return ( av
== bv
) || ( (uint32_t) ( ( av
| bv
)<<1 ) == 0 );
2883 /*----------------------------------------------------------------------------
2884 | Returns 1 if the single-precision floating-point value `a' is less than
2885 | or equal to the corresponding value `b', and 0 otherwise. The invalid
2886 | exception is raised if either operand is a NaN. The comparison is performed
2887 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2888 *----------------------------------------------------------------------------*/
2890 int float32_le(float32 a
, float32 b
, float_status
*status
)
2894 a
= float32_squash_input_denormal(a
, status
);
2895 b
= float32_squash_input_denormal(b
, status
);
2897 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2898 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2900 float_raise(float_flag_invalid
, status
);
2903 aSign
= extractFloat32Sign( a
);
2904 bSign
= extractFloat32Sign( b
);
2905 av
= float32_val(a
);
2906 bv
= float32_val(b
);
2907 if ( aSign
!= bSign
) return aSign
|| ( (uint32_t) ( ( av
| bv
)<<1 ) == 0 );
2908 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
2912 /*----------------------------------------------------------------------------
2913 | Returns 1 if the single-precision floating-point value `a' is less than
2914 | the corresponding value `b', and 0 otherwise. The invalid exception is
2915 | raised if either operand is a NaN. The comparison is performed according
2916 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2917 *----------------------------------------------------------------------------*/
2919 int float32_lt(float32 a
, float32 b
, float_status
*status
)
2923 a
= float32_squash_input_denormal(a
, status
);
2924 b
= float32_squash_input_denormal(b
, status
);
2926 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2927 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2929 float_raise(float_flag_invalid
, status
);
2932 aSign
= extractFloat32Sign( a
);
2933 bSign
= extractFloat32Sign( b
);
2934 av
= float32_val(a
);
2935 bv
= float32_val(b
);
2936 if ( aSign
!= bSign
) return aSign
&& ( (uint32_t) ( ( av
| bv
)<<1 ) != 0 );
2937 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
2941 /*----------------------------------------------------------------------------
2942 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2943 | be compared, and 0 otherwise. The invalid exception is raised if either
2944 | operand is a NaN. The comparison is performed according to the IEC/IEEE
2945 | Standard for Binary Floating-Point Arithmetic.
2946 *----------------------------------------------------------------------------*/
2948 int float32_unordered(float32 a
, float32 b
, float_status
*status
)
2950 a
= float32_squash_input_denormal(a
, status
);
2951 b
= float32_squash_input_denormal(b
, status
);
2953 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2954 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2956 float_raise(float_flag_invalid
, status
);
2962 /*----------------------------------------------------------------------------
2963 | Returns 1 if the single-precision floating-point value `a' is equal to
2964 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2965 | exception. The comparison is performed according to the IEC/IEEE Standard
2966 | for Binary Floating-Point Arithmetic.
2967 *----------------------------------------------------------------------------*/
2969 int float32_eq_quiet(float32 a
, float32 b
, float_status
*status
)
2971 a
= float32_squash_input_denormal(a
, status
);
2972 b
= float32_squash_input_denormal(b
, status
);
2974 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2975 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2977 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
2978 float_raise(float_flag_invalid
, status
);
2982 return ( float32_val(a
) == float32_val(b
) ) ||
2983 ( (uint32_t) ( ( float32_val(a
) | float32_val(b
) )<<1 ) == 0 );
2986 /*----------------------------------------------------------------------------
2987 | Returns 1 if the single-precision floating-point value `a' is less than or
2988 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2989 | cause an exception. Otherwise, the comparison is performed according to the
2990 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2991 *----------------------------------------------------------------------------*/
2993 int float32_le_quiet(float32 a
, float32 b
, float_status
*status
)
2997 a
= float32_squash_input_denormal(a
, status
);
2998 b
= float32_squash_input_denormal(b
, status
);
3000 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
3001 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
3003 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
3004 float_raise(float_flag_invalid
, status
);
3008 aSign
= extractFloat32Sign( a
);
3009 bSign
= extractFloat32Sign( b
);
3010 av
= float32_val(a
);
3011 bv
= float32_val(b
);
3012 if ( aSign
!= bSign
) return aSign
|| ( (uint32_t) ( ( av
| bv
)<<1 ) == 0 );
3013 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
3017 /*----------------------------------------------------------------------------
3018 | Returns 1 if the single-precision floating-point value `a' is less than
3019 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3020 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
3021 | Standard for Binary Floating-Point Arithmetic.
3022 *----------------------------------------------------------------------------*/
3024 int float32_lt_quiet(float32 a
, float32 b
, float_status
*status
)
3028 a
= float32_squash_input_denormal(a
, status
);
3029 b
= float32_squash_input_denormal(b
, status
);
3031 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
3032 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
3034 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
3035 float_raise(float_flag_invalid
, status
);
3039 aSign
= extractFloat32Sign( a
);
3040 bSign
= extractFloat32Sign( b
);
3041 av
= float32_val(a
);
3042 bv
= float32_val(b
);
3043 if ( aSign
!= bSign
) return aSign
&& ( (uint32_t) ( ( av
| bv
)<<1 ) != 0 );
3044 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
3048 /*----------------------------------------------------------------------------
3049 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3050 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
3051 | comparison is performed according to the IEC/IEEE Standard for Binary
3052 | Floating-Point Arithmetic.
3053 *----------------------------------------------------------------------------*/
3055 int float32_unordered_quiet(float32 a
, float32 b
, float_status
*status
)
3057 a
= float32_squash_input_denormal(a
, status
);
3058 b
= float32_squash_input_denormal(b
, status
);
3060 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
3061 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
3063 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
3064 float_raise(float_flag_invalid
, status
);
3071 /*----------------------------------------------------------------------------
3072 | Returns the result of converting the double-precision floating-point value
3073 | `a' to the 32-bit two's complement integer format. The conversion is
3074 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3075 | Arithmetic---which means in particular that the conversion is rounded
3076 | according to the current rounding mode. If `a' is a NaN, the largest
3077 | positive integer is returned. Otherwise, if the conversion overflows, the
3078 | largest integer with the same sign as `a' is returned.
3079 *----------------------------------------------------------------------------*/
3081 int32_t float64_to_int32(float64 a
, float_status
*status
)
3087 a
= float64_squash_input_denormal(a
, status
);
3089 aSig
= extractFloat64Frac( a
);
3090 aExp
= extractFloat64Exp( a
);
3091 aSign
= extractFloat64Sign( a
);
3092 if ( ( aExp
== 0x7FF ) && aSig
) aSign
= 0;
3093 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
3094 shiftCount
= 0x42C - aExp
;
3095 if ( 0 < shiftCount
) shift64RightJamming( aSig
, shiftCount
, &aSig
);
3096 return roundAndPackInt32(aSign
, aSig
, status
);
3100 /*----------------------------------------------------------------------------
3101 | Returns the result of converting the double-precision floating-point value
3102 | `a' to the 32-bit two's complement integer format. The conversion is
3103 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3104 | Arithmetic, except that the conversion is always rounded toward zero.
3105 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
3106 | the conversion overflows, the largest integer with the same sign as `a' is
3108 *----------------------------------------------------------------------------*/
3110 int32_t float64_to_int32_round_to_zero(float64 a
, float_status
*status
)
3115 uint64_t aSig
, savedASig
;
3117 a
= float64_squash_input_denormal(a
, status
);
3119 aSig
= extractFloat64Frac( a
);
3120 aExp
= extractFloat64Exp( a
);
3121 aSign
= extractFloat64Sign( a
);
3122 if ( 0x41E < aExp
) {
3123 if ( ( aExp
== 0x7FF ) && aSig
) aSign
= 0;
3126 else if ( aExp
< 0x3FF ) {
3128 status
->float_exception_flags
|= float_flag_inexact
;
3132 aSig
|= LIT64( 0x0010000000000000 );
3133 shiftCount
= 0x433 - aExp
;
3135 aSig
>>= shiftCount
;
3137 if ( aSign
) z
= - z
;
3138 if ( ( z
< 0 ) ^ aSign
) {
3140 float_raise(float_flag_invalid
, status
);
3141 return aSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
3143 if ( ( aSig
<<shiftCount
) != savedASig
) {
3144 status
->float_exception_flags
|= float_flag_inexact
;
3150 /*----------------------------------------------------------------------------
3151 | Returns the result of converting the double-precision floating-point value
3152 | `a' to the 16-bit two's complement integer format. The conversion is
3153 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3154 | Arithmetic, except that the conversion is always rounded toward zero.
3155 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
3156 | the conversion overflows, the largest integer with the same sign as `a' is
3158 *----------------------------------------------------------------------------*/
3160 int16_t float64_to_int16_round_to_zero(float64 a
, float_status
*status
)
3165 uint64_t aSig
, savedASig
;
3168 aSig
= extractFloat64Frac( a
);
3169 aExp
= extractFloat64Exp( a
);
3170 aSign
= extractFloat64Sign( a
);
3171 if ( 0x40E < aExp
) {
3172 if ( ( aExp
== 0x7FF ) && aSig
) {
3177 else if ( aExp
< 0x3FF ) {
3178 if ( aExp
|| aSig
) {
3179 status
->float_exception_flags
|= float_flag_inexact
;
3183 aSig
|= LIT64( 0x0010000000000000 );
3184 shiftCount
= 0x433 - aExp
;
3186 aSig
>>= shiftCount
;
3191 if ( ( (int16_t)z
< 0 ) ^ aSign
) {
3193 float_raise(float_flag_invalid
, status
);
3194 return aSign
? (int32_t) 0xffff8000 : 0x7FFF;
3196 if ( ( aSig
<<shiftCount
) != savedASig
) {
3197 status
->float_exception_flags
|= float_flag_inexact
;
3202 /*----------------------------------------------------------------------------
3203 | Returns the result of converting the double-precision floating-point value
3204 | `a' to the 64-bit two's complement integer format. The conversion is
3205 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3206 | Arithmetic---which means in particular that the conversion is rounded
3207 | according to the current rounding mode. If `a' is a NaN, the largest
3208 | positive integer is returned. Otherwise, if the conversion overflows, the
3209 | largest integer with the same sign as `a' is returned.
3210 *----------------------------------------------------------------------------*/
3212 int64_t float64_to_int64(float64 a
, float_status
*status
)
3217 uint64_t aSig
, aSigExtra
;
3218 a
= float64_squash_input_denormal(a
, status
);
3220 aSig
= extractFloat64Frac( a
);
3221 aExp
= extractFloat64Exp( a
);
3222 aSign
= extractFloat64Sign( a
);
3223 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
3224 shiftCount
= 0x433 - aExp
;
3225 if ( shiftCount
<= 0 ) {
3226 if ( 0x43E < aExp
) {
3227 float_raise(float_flag_invalid
, status
);
3229 || ( ( aExp
== 0x7FF )
3230 && ( aSig
!= LIT64( 0x0010000000000000 ) ) )
3232 return LIT64( 0x7FFFFFFFFFFFFFFF );
3234 return (int64_t) LIT64( 0x8000000000000000 );
3237 aSig
<<= - shiftCount
;
3240 shift64ExtraRightJamming( aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
3242 return roundAndPackInt64(aSign
, aSig
, aSigExtra
, status
);
3246 /*----------------------------------------------------------------------------
3247 | Returns the result of converting the double-precision floating-point value
3248 | `a' to the 64-bit two's complement integer format. The conversion is
3249 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3250 | Arithmetic, except that the conversion is always rounded toward zero.
3251 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
3252 | the conversion overflows, the largest integer with the same sign as `a' is
3254 *----------------------------------------------------------------------------*/
3256 int64_t float64_to_int64_round_to_zero(float64 a
, float_status
*status
)
3263 a
= float64_squash_input_denormal(a
, status
);
3265 aSig
= extractFloat64Frac( a
);
3266 aExp
= extractFloat64Exp( a
);
3267 aSign
= extractFloat64Sign( a
);
3268 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
3269 shiftCount
= aExp
- 0x433;
3270 if ( 0 <= shiftCount
) {
3271 if ( 0x43E <= aExp
) {
3272 if ( float64_val(a
) != LIT64( 0xC3E0000000000000 ) ) {
3273 float_raise(float_flag_invalid
, status
);
3275 || ( ( aExp
== 0x7FF )
3276 && ( aSig
!= LIT64( 0x0010000000000000 ) ) )
3278 return LIT64( 0x7FFFFFFFFFFFFFFF );
3281 return (int64_t) LIT64( 0x8000000000000000 );
3283 z
= aSig
<<shiftCount
;
3286 if ( aExp
< 0x3FE ) {
3288 status
->float_exception_flags
|= float_flag_inexact
;
3292 z
= aSig
>>( - shiftCount
);
3293 if ( (uint64_t) ( aSig
<<( shiftCount
& 63 ) ) ) {
3294 status
->float_exception_flags
|= float_flag_inexact
;
3297 if ( aSign
) z
= - z
;
3302 /*----------------------------------------------------------------------------
3303 | Returns the result of converting the double-precision floating-point value
3304 | `a' to the single-precision floating-point format. The conversion is
3305 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3307 *----------------------------------------------------------------------------*/
3309 float32
float64_to_float32(float64 a
, float_status
*status
)
3315 a
= float64_squash_input_denormal(a
, status
);
3317 aSig
= extractFloat64Frac( a
);
3318 aExp
= extractFloat64Exp( a
);
3319 aSign
= extractFloat64Sign( a
);
3320 if ( aExp
== 0x7FF ) {
3322 return commonNaNToFloat32(float64ToCommonNaN(a
, status
), status
);
3324 return packFloat32( aSign
, 0xFF, 0 );
3326 shift64RightJamming( aSig
, 22, &aSig
);
3328 if ( aExp
|| zSig
) {
3332 return roundAndPackFloat32(aSign
, aExp
, zSig
, status
);
3337 /*----------------------------------------------------------------------------
3338 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3339 | half-precision floating-point value, returning the result. After being
3340 | shifted into the proper positions, the three fields are simply added
3341 | together to form the result. This means that any integer portion of `zSig'
3342 | will be added into the exponent. Since a properly normalized significand
3343 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3344 | than the desired result exponent whenever `zSig' is a complete, normalized
3346 *----------------------------------------------------------------------------*/
3347 static float16
packFloat16(flag zSign
, int zExp
, uint16_t zSig
)
3349 return make_float16(
3350 (((uint32_t)zSign
) << 15) + (((uint32_t)zExp
) << 10) + zSig
);
3353 /*----------------------------------------------------------------------------
3354 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3355 | and significand `zSig', and returns the proper half-precision floating-
3356 | point value corresponding to the abstract input. Ordinarily, the abstract
3357 | value is simply rounded and packed into the half-precision format, with
3358 | the inexact exception raised if the abstract input cannot be represented
3359 | exactly. However, if the abstract value is too large, the overflow and
3360 | inexact exceptions are raised and an infinity or maximal finite value is
3361 | returned. If the abstract value is too small, the input value is rounded to
3362 | a subnormal number, and the underflow and inexact exceptions are raised if
3363 | the abstract input cannot be represented exactly as a subnormal half-
3364 | precision floating-point number.
3365 | The `ieee' flag indicates whether to use IEEE standard half precision, or
3366 | ARM-style "alternative representation", which omits the NaN and Inf
3367 | encodings in order to raise the maximum representable exponent by one.
3368 | The input significand `zSig' has its binary point between bits 22
3369 | and 23, which is 13 bits to the left of the usual location. This shifted
3370 | significand must be normalized or smaller. If `zSig' is not normalized,
3371 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3372 | and it must not require rounding. In the usual case that `zSig' is
3373 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3374 | Note the slightly odd position of the binary point in zSig compared with the
3375 | other roundAndPackFloat functions. This should probably be fixed if we
3376 | need to implement more float16 routines than just conversion.
3377 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3378 | Binary Floating-Point Arithmetic.
3379 *----------------------------------------------------------------------------*/
3381 static float16
roundAndPackFloat16(flag zSign
, int zExp
,
3382 uint32_t zSig
, flag ieee
,
3383 float_status
*status
)
3385 int maxexp
= ieee
? 29 : 30;
3388 bool rounding_bumps_exp
;
3389 bool is_tiny
= false;
3391 /* Calculate the mask of bits of the mantissa which are not
3392 * representable in half-precision and will be lost.
3395 /* Will be denormal in halfprec */
3401 /* Normal number in halfprec */
3405 switch (status
->float_rounding_mode
) {
3406 case float_round_nearest_even
:
3407 increment
= (mask
+ 1) >> 1;
3408 if ((zSig
& mask
) == increment
) {
3409 increment
= zSig
& (increment
<< 1);
3412 case float_round_ties_away
:
3413 increment
= (mask
+ 1) >> 1;
3415 case float_round_up
:
3416 increment
= zSign
? 0 : mask
;
3418 case float_round_down
:
3419 increment
= zSign
? mask
: 0;
3421 default: /* round_to_zero */
3426 rounding_bumps_exp
= (zSig
+ increment
>= 0x01000000);
3428 if (zExp
> maxexp
|| (zExp
== maxexp
&& rounding_bumps_exp
)) {
3430 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
3431 return packFloat16(zSign
, 0x1f, 0);
3433 float_raise(float_flag_invalid
, status
);
3434 return packFloat16(zSign
, 0x1f, 0x3ff);
3439 /* Note that flush-to-zero does not affect half-precision results */
3441 (status
->float_detect_tininess
== float_tininess_before_rounding
)
3443 || (!rounding_bumps_exp
);
3446 float_raise(float_flag_inexact
, status
);
3448 float_raise(float_flag_underflow
, status
);
3453 if (rounding_bumps_exp
) {
3459 return packFloat16(zSign
, 0, 0);
3465 return packFloat16(zSign
, zExp
, zSig
>> 13);
3468 static void normalizeFloat16Subnormal(uint32_t aSig
, int *zExpPtr
,
3471 int8_t shiftCount
= countLeadingZeros32(aSig
) - 21;
3472 *zSigPtr
= aSig
<< shiftCount
;
3473 *zExpPtr
= 1 - shiftCount
;
3476 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
3477 The latter gains extra exponent range by omitting the NaN/Inf encodings. */
3479 float32
float16_to_float32(float16 a
, flag ieee
, float_status
*status
)
3485 aSign
= extractFloat16Sign(a
);
3486 aExp
= extractFloat16Exp(a
);
3487 aSig
= extractFloat16Frac(a
);
3489 if (aExp
== 0x1f && ieee
) {
3491 return commonNaNToFloat32(float16ToCommonNaN(a
, status
), status
);
3493 return packFloat32(aSign
, 0xff, 0);
3497 return packFloat32(aSign
, 0, 0);
3500 normalizeFloat16Subnormal(aSig
, &aExp
, &aSig
);
3503 return packFloat32( aSign
, aExp
+ 0x70, aSig
<< 13);
3506 float16
float32_to_float16(float32 a
, flag ieee
, float_status
*status
)
3512 a
= float32_squash_input_denormal(a
, status
);
3514 aSig
= extractFloat32Frac( a
);
3515 aExp
= extractFloat32Exp( a
);
3516 aSign
= extractFloat32Sign( a
);
3517 if ( aExp
== 0xFF ) {
3519 /* Input is a NaN */
3521 float_raise(float_flag_invalid
, status
);
3522 return packFloat16(aSign
, 0, 0);
3524 return commonNaNToFloat16(
3525 float32ToCommonNaN(a
, status
), status
);
3529 float_raise(float_flag_invalid
, status
);
3530 return packFloat16(aSign
, 0x1f, 0x3ff);
3532 return packFloat16(aSign
, 0x1f, 0);
3534 if (aExp
== 0 && aSig
== 0) {
3535 return packFloat16(aSign
, 0, 0);
3537 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3538 * even if the input is denormal; however this is harmless because
3539 * the largest possible single-precision denormal is still smaller
3540 * than the smallest representable half-precision denormal, and so we
3541 * will end up ignoring aSig and returning via the "always return zero"
3547 return roundAndPackFloat16(aSign
, aExp
, aSig
, ieee
, status
);
3550 float64
float16_to_float64(float16 a
, flag ieee
, float_status
*status
)
3556 aSign
= extractFloat16Sign(a
);
3557 aExp
= extractFloat16Exp(a
);
3558 aSig
= extractFloat16Frac(a
);
3560 if (aExp
== 0x1f && ieee
) {
3562 return commonNaNToFloat64(
3563 float16ToCommonNaN(a
, status
), status
);
3565 return packFloat64(aSign
, 0x7ff, 0);
3569 return packFloat64(aSign
, 0, 0);
3572 normalizeFloat16Subnormal(aSig
, &aExp
, &aSig
);
3575 return packFloat64(aSign
, aExp
+ 0x3f0, ((uint64_t)aSig
) << 42);
3578 float16
float64_to_float16(float64 a
, flag ieee
, float_status
*status
)
3585 a
= float64_squash_input_denormal(a
, status
);
3587 aSig
= extractFloat64Frac(a
);
3588 aExp
= extractFloat64Exp(a
);
3589 aSign
= extractFloat64Sign(a
);
3590 if (aExp
== 0x7FF) {
3592 /* Input is a NaN */
3594 float_raise(float_flag_invalid
, status
);
3595 return packFloat16(aSign
, 0, 0);
3597 return commonNaNToFloat16(
3598 float64ToCommonNaN(a
, status
), status
);
3602 float_raise(float_flag_invalid
, status
);
3603 return packFloat16(aSign
, 0x1f, 0x3ff);
3605 return packFloat16(aSign
, 0x1f, 0);
3607 shift64RightJamming(aSig
, 29, &aSig
);
3609 if (aExp
== 0 && zSig
== 0) {
3610 return packFloat16(aSign
, 0, 0);
3612 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3613 * even if the input is denormal; however this is harmless because
3614 * the largest possible single-precision denormal is still smaller
3615 * than the smallest representable half-precision denormal, and so we
3616 * will end up ignoring aSig and returning via the "always return zero"
3622 return roundAndPackFloat16(aSign
, aExp
, zSig
, ieee
, status
);
3625 /*----------------------------------------------------------------------------
3626 | Returns the result of converting the double-precision floating-point value
3627 | `a' to the extended double-precision floating-point format. The conversion
3628 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3630 *----------------------------------------------------------------------------*/
3632 floatx80
float64_to_floatx80(float64 a
, float_status
*status
)
3638 a
= float64_squash_input_denormal(a
, status
);
3639 aSig
= extractFloat64Frac( a
);
3640 aExp
= extractFloat64Exp( a
);
3641 aSign
= extractFloat64Sign( a
);
3642 if ( aExp
== 0x7FF ) {
3644 return commonNaNToFloatx80(float64ToCommonNaN(a
, status
), status
);
3646 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
3649 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
3650 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
3654 aSign
, aExp
+ 0x3C00, ( aSig
| LIT64( 0x0010000000000000 ) )<<11 );
3658 /*----------------------------------------------------------------------------
3659 | Returns the result of converting the double-precision floating-point value
3660 | `a' to the quadruple-precision floating-point format. The conversion is
3661 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3663 *----------------------------------------------------------------------------*/
3665 float128
float64_to_float128(float64 a
, float_status
*status
)
3669 uint64_t aSig
, zSig0
, zSig1
;
3671 a
= float64_squash_input_denormal(a
, status
);
3672 aSig
= extractFloat64Frac( a
);
3673 aExp
= extractFloat64Exp( a
);
3674 aSign
= extractFloat64Sign( a
);
3675 if ( aExp
== 0x7FF ) {
3677 return commonNaNToFloat128(float64ToCommonNaN(a
, status
), status
);
3679 return packFloat128( aSign
, 0x7FFF, 0, 0 );
3682 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
3683 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
3686 shift128Right( aSig
, 0, 4, &zSig0
, &zSig1
);
3687 return packFloat128( aSign
, aExp
+ 0x3C00, zSig0
, zSig1
);
3691 /*----------------------------------------------------------------------------
3692 | Rounds the double-precision floating-point value `a' to an integer, and
3693 | returns the result as a double-precision floating-point value. The
3694 | operation is performed according to the IEC/IEEE Standard for Binary
3695 | Floating-Point Arithmetic.
3696 *----------------------------------------------------------------------------*/
3698 float64
float64_round_to_int(float64 a
, float_status
*status
)
3702 uint64_t lastBitMask
, roundBitsMask
;
3704 a
= float64_squash_input_denormal(a
, status
);
3706 aExp
= extractFloat64Exp( a
);
3707 if ( 0x433 <= aExp
) {
3708 if ( ( aExp
== 0x7FF ) && extractFloat64Frac( a
) ) {
3709 return propagateFloat64NaN(a
, a
, status
);
3713 if ( aExp
< 0x3FF ) {
3714 if ( (uint64_t) ( float64_val(a
)<<1 ) == 0 ) return a
;
3715 status
->float_exception_flags
|= float_flag_inexact
;
3716 aSign
= extractFloat64Sign( a
);
3717 switch (status
->float_rounding_mode
) {
3718 case float_round_nearest_even
:
3719 if ( ( aExp
== 0x3FE ) && extractFloat64Frac( a
) ) {
3720 return packFloat64( aSign
, 0x3FF, 0 );
3723 case float_round_ties_away
:
3724 if (aExp
== 0x3FE) {
3725 return packFloat64(aSign
, 0x3ff, 0);
3728 case float_round_down
:
3729 return make_float64(aSign
? LIT64( 0xBFF0000000000000 ) : 0);
3730 case float_round_up
:
3731 return make_float64(
3732 aSign
? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
3734 return packFloat64( aSign
, 0, 0 );
3737 lastBitMask
<<= 0x433 - aExp
;
3738 roundBitsMask
= lastBitMask
- 1;
3740 switch (status
->float_rounding_mode
) {
3741 case float_round_nearest_even
:
3742 z
+= lastBitMask
>> 1;
3743 if ((z
& roundBitsMask
) == 0) {
3747 case float_round_ties_away
:
3748 z
+= lastBitMask
>> 1;
3750 case float_round_to_zero
:
3752 case float_round_up
:
3753 if (!extractFloat64Sign(make_float64(z
))) {
3757 case float_round_down
:
3758 if (extractFloat64Sign(make_float64(z
))) {
3765 z
&= ~ roundBitsMask
;
3766 if (z
!= float64_val(a
)) {
3767 status
->float_exception_flags
|= float_flag_inexact
;
3769 return make_float64(z
);
3773 float64
float64_trunc_to_int(float64 a
, float_status
*status
)
3777 oldmode
= status
->float_rounding_mode
;
3778 status
->float_rounding_mode
= float_round_to_zero
;
3779 res
= float64_round_to_int(a
, status
);
3780 status
->float_rounding_mode
= oldmode
;
3784 /*----------------------------------------------------------------------------
3785 | Returns the result of adding the absolute values of the double-precision
3786 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
3787 | before being returned. `zSign' is ignored if the result is a NaN.
3788 | The addition is performed according to the IEC/IEEE Standard for Binary
3789 | Floating-Point Arithmetic.
3790 *----------------------------------------------------------------------------*/
3792 static float64
addFloat64Sigs(float64 a
, float64 b
, flag zSign
,
3793 float_status
*status
)
3795 int aExp
, bExp
, zExp
;
3796 uint64_t aSig
, bSig
, zSig
;
3799 aSig
= extractFloat64Frac( a
);
3800 aExp
= extractFloat64Exp( a
);
3801 bSig
= extractFloat64Frac( b
);
3802 bExp
= extractFloat64Exp( b
);
3803 expDiff
= aExp
- bExp
;
3806 if ( 0 < expDiff
) {
3807 if ( aExp
== 0x7FF ) {
3809 return propagateFloat64NaN(a
, b
, status
);
3817 bSig
|= LIT64( 0x2000000000000000 );
3819 shift64RightJamming( bSig
, expDiff
, &bSig
);
3822 else if ( expDiff
< 0 ) {
3823 if ( bExp
== 0x7FF ) {
3825 return propagateFloat64NaN(a
, b
, status
);
3827 return packFloat64( zSign
, 0x7FF, 0 );
3833 aSig
|= LIT64( 0x2000000000000000 );
3835 shift64RightJamming( aSig
, - expDiff
, &aSig
);
3839 if ( aExp
== 0x7FF ) {
3841 return propagateFloat64NaN(a
, b
, status
);
3846 if (status
->flush_to_zero
) {
3848 float_raise(float_flag_output_denormal
, status
);
3850 return packFloat64(zSign
, 0, 0);
3852 return packFloat64( zSign
, 0, ( aSig
+ bSig
)>>9 );
3854 zSig
= LIT64( 0x4000000000000000 ) + aSig
+ bSig
;
3858 aSig
|= LIT64( 0x2000000000000000 );
3859 zSig
= ( aSig
+ bSig
)<<1;
3861 if ( (int64_t) zSig
< 0 ) {
3866 return roundAndPackFloat64(zSign
, zExp
, zSig
, status
);
3870 /*----------------------------------------------------------------------------
3871 | Returns the result of subtracting the absolute values of the double-
3872 | precision floating-point values `a' and `b'. If `zSign' is 1, the
3873 | difference is negated before being returned. `zSign' is ignored if the
3874 | result is a NaN. The subtraction is performed according to the IEC/IEEE
3875 | Standard for Binary Floating-Point Arithmetic.
3876 *----------------------------------------------------------------------------*/
3878 static float64
subFloat64Sigs(float64 a
, float64 b
, flag zSign
,
3879 float_status
*status
)
3881 int aExp
, bExp
, zExp
;
3882 uint64_t aSig
, bSig
, zSig
;
3885 aSig
= extractFloat64Frac( a
);
3886 aExp
= extractFloat64Exp( a
);
3887 bSig
= extractFloat64Frac( b
);
3888 bExp
= extractFloat64Exp( b
);
3889 expDiff
= aExp
- bExp
;
3892 if ( 0 < expDiff
) goto aExpBigger
;
3893 if ( expDiff
< 0 ) goto bExpBigger
;
3894 if ( aExp
== 0x7FF ) {
3896 return propagateFloat64NaN(a
, b
, status
);
3898 float_raise(float_flag_invalid
, status
);
3899 return float64_default_nan
;
3905 if ( bSig
< aSig
) goto aBigger
;
3906 if ( aSig
< bSig
) goto bBigger
;
3907 return packFloat64(status
->float_rounding_mode
== float_round_down
, 0, 0);
3909 if ( bExp
== 0x7FF ) {
3911 return propagateFloat64NaN(a
, b
, status
);
3913 return packFloat64( zSign
^ 1, 0x7FF, 0 );
3919 aSig
|= LIT64( 0x4000000000000000 );
3921 shift64RightJamming( aSig
, - expDiff
, &aSig
);
3922 bSig
|= LIT64( 0x4000000000000000 );
3927 goto normalizeRoundAndPack
;
3929 if ( aExp
== 0x7FF ) {
3931 return propagateFloat64NaN(a
, b
, status
);
3939 bSig
|= LIT64( 0x4000000000000000 );
3941 shift64RightJamming( bSig
, expDiff
, &bSig
);
3942 aSig
|= LIT64( 0x4000000000000000 );
3946 normalizeRoundAndPack
:
3948 return normalizeRoundAndPackFloat64(zSign
, zExp
, zSig
, status
);
3952 /*----------------------------------------------------------------------------
3953 | Returns the result of adding the double-precision floating-point values `a'
3954 | and `b'. The operation is performed according to the IEC/IEEE Standard for
3955 | Binary Floating-Point Arithmetic.
3956 *----------------------------------------------------------------------------*/
3958 float64
float64_add(float64 a
, float64 b
, float_status
*status
)
3961 a
= float64_squash_input_denormal(a
, status
);
3962 b
= float64_squash_input_denormal(b
, status
);
3964 aSign
= extractFloat64Sign( a
);
3965 bSign
= extractFloat64Sign( b
);
3966 if ( aSign
== bSign
) {
3967 return addFloat64Sigs(a
, b
, aSign
, status
);
3970 return subFloat64Sigs(a
, b
, aSign
, status
);
3975 /*----------------------------------------------------------------------------
3976 | Returns the result of subtracting the double-precision floating-point values
3977 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3978 | for Binary Floating-Point Arithmetic.
3979 *----------------------------------------------------------------------------*/
3981 float64
float64_sub(float64 a
, float64 b
, float_status
*status
)
3984 a
= float64_squash_input_denormal(a
, status
);
3985 b
= float64_squash_input_denormal(b
, status
);
3987 aSign
= extractFloat64Sign( a
);
3988 bSign
= extractFloat64Sign( b
);
3989 if ( aSign
== bSign
) {
3990 return subFloat64Sigs(a
, b
, aSign
, status
);
3993 return addFloat64Sigs(a
, b
, aSign
, status
);
3998 /*----------------------------------------------------------------------------
3999 | Returns the result of multiplying the double-precision floating-point values
4000 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
4001 | for Binary Floating-Point Arithmetic.
4002 *----------------------------------------------------------------------------*/
4004 float64
float64_mul(float64 a
, float64 b
, float_status
*status
)
4006 flag aSign
, bSign
, zSign
;
4007 int aExp
, bExp
, zExp
;
4008 uint64_t aSig
, bSig
, zSig0
, zSig1
;
4010 a
= float64_squash_input_denormal(a
, status
);
4011 b
= float64_squash_input_denormal(b
, status
);
4013 aSig
= extractFloat64Frac( a
);
4014 aExp
= extractFloat64Exp( a
);
4015 aSign
= extractFloat64Sign( a
);
4016 bSig
= extractFloat64Frac( b
);
4017 bExp
= extractFloat64Exp( b
);
4018 bSign
= extractFloat64Sign( b
);
4019 zSign
= aSign
^ bSign
;
4020 if ( aExp
== 0x7FF ) {
4021 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
4022 return propagateFloat64NaN(a
, b
, status
);
4024 if ( ( bExp
| bSig
) == 0 ) {
4025 float_raise(float_flag_invalid
, status
);
4026 return float64_default_nan
;
4028 return packFloat64( zSign
, 0x7FF, 0 );
4030 if ( bExp
== 0x7FF ) {
4032 return propagateFloat64NaN(a
, b
, status
);
4034 if ( ( aExp
| aSig
) == 0 ) {
4035 float_raise(float_flag_invalid
, status
);
4036 return float64_default_nan
;
4038 return packFloat64( zSign
, 0x7FF, 0 );
4041 if ( aSig
== 0 ) return packFloat64( zSign
, 0, 0 );
4042 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
4045 if ( bSig
== 0 ) return packFloat64( zSign
, 0, 0 );
4046 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
4048 zExp
= aExp
+ bExp
- 0x3FF;
4049 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<10;
4050 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
4051 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
4052 zSig0
|= ( zSig1
!= 0 );
4053 if ( 0 <= (int64_t) ( zSig0
<<1 ) ) {
4057 return roundAndPackFloat64(zSign
, zExp
, zSig0
, status
);
4061 /*----------------------------------------------------------------------------
4062 | Returns the result of dividing the double-precision floating-point value `a'
4063 | by the corresponding value `b'. The operation is performed according to
4064 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4065 *----------------------------------------------------------------------------*/
4067 float64
float64_div(float64 a
, float64 b
, float_status
*status
)
4069 flag aSign
, bSign
, zSign
;
4070 int aExp
, bExp
, zExp
;
4071 uint64_t aSig
, bSig
, zSig
;
4072 uint64_t rem0
, rem1
;
4073 uint64_t term0
, term1
;
4074 a
= float64_squash_input_denormal(a
, status
);
4075 b
= float64_squash_input_denormal(b
, status
);
4077 aSig
= extractFloat64Frac( a
);
4078 aExp
= extractFloat64Exp( a
);
4079 aSign
= extractFloat64Sign( a
);
4080 bSig
= extractFloat64Frac( b
);
4081 bExp
= extractFloat64Exp( b
);
4082 bSign
= extractFloat64Sign( b
);
4083 zSign
= aSign
^ bSign
;
4084 if ( aExp
== 0x7FF ) {
4086 return propagateFloat64NaN(a
, b
, status
);
4088 if ( bExp
== 0x7FF ) {
4090 return propagateFloat64NaN(a
, b
, status
);
4092 float_raise(float_flag_invalid
, status
);
4093 return float64_default_nan
;
4095 return packFloat64( zSign
, 0x7FF, 0 );
4097 if ( bExp
== 0x7FF ) {
4099 return propagateFloat64NaN(a
, b
, status
);
4101 return packFloat64( zSign
, 0, 0 );
4105 if ( ( aExp
| aSig
) == 0 ) {
4106 float_raise(float_flag_invalid
, status
);
4107 return float64_default_nan
;
4109 float_raise(float_flag_divbyzero
, status
);
4110 return packFloat64( zSign
, 0x7FF, 0 );
4112 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
4115 if ( aSig
== 0 ) return packFloat64( zSign
, 0, 0 );
4116 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
4118 zExp
= aExp
- bExp
+ 0x3FD;
4119 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<10;
4120 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
4121 if ( bSig
<= ( aSig
+ aSig
) ) {
4125 zSig
= estimateDiv128To64( aSig
, 0, bSig
);
4126 if ( ( zSig
& 0x1FF ) <= 2 ) {
4127 mul64To128( bSig
, zSig
, &term0
, &term1
);
4128 sub128( aSig
, 0, term0
, term1
, &rem0
, &rem1
);
4129 while ( (int64_t) rem0
< 0 ) {
4131 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
4133 zSig
|= ( rem1
!= 0 );
4135 return roundAndPackFloat64(zSign
, zExp
, zSig
, status
);
4139 /*----------------------------------------------------------------------------
4140 | Returns the remainder of the double-precision floating-point value `a'
4141 | with respect to the corresponding value `b'. The operation is performed
4142 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4143 *----------------------------------------------------------------------------*/
4145 float64
float64_rem(float64 a
, float64 b
, float_status
*status
)
4148 int aExp
, bExp
, expDiff
;
4149 uint64_t aSig
, bSig
;
4150 uint64_t q
, alternateASig
;
4153 a
= float64_squash_input_denormal(a
, status
);
4154 b
= float64_squash_input_denormal(b
, status
);
4155 aSig
= extractFloat64Frac( a
);
4156 aExp
= extractFloat64Exp( a
);
4157 aSign
= extractFloat64Sign( a
);
4158 bSig
= extractFloat64Frac( b
);
4159 bExp
= extractFloat64Exp( b
);
4160 if ( aExp
== 0x7FF ) {
4161 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
4162 return propagateFloat64NaN(a
, b
, status
);
4164 float_raise(float_flag_invalid
, status
);
4165 return float64_default_nan
;
4167 if ( bExp
== 0x7FF ) {
4169 return propagateFloat64NaN(a
, b
, status
);
4175 float_raise(float_flag_invalid
, status
);
4176 return float64_default_nan
;
4178 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
4181 if ( aSig
== 0 ) return a
;
4182 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
4184 expDiff
= aExp
- bExp
;
4185 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<11;
4186 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
4187 if ( expDiff
< 0 ) {
4188 if ( expDiff
< -1 ) return a
;
4191 q
= ( bSig
<= aSig
);
4192 if ( q
) aSig
-= bSig
;
4194 while ( 0 < expDiff
) {
4195 q
= estimateDiv128To64( aSig
, 0, bSig
);
4196 q
= ( 2 < q
) ? q
- 2 : 0;
4197 aSig
= - ( ( bSig
>>2 ) * q
);
4201 if ( 0 < expDiff
) {
4202 q
= estimateDiv128To64( aSig
, 0, bSig
);
4203 q
= ( 2 < q
) ? q
- 2 : 0;
4206 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
4213 alternateASig
= aSig
;
4216 } while ( 0 <= (int64_t) aSig
);
4217 sigMean
= aSig
+ alternateASig
;
4218 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
4219 aSig
= alternateASig
;
4221 zSign
= ( (int64_t) aSig
< 0 );
4222 if ( zSign
) aSig
= - aSig
;
4223 return normalizeRoundAndPackFloat64(aSign
^ zSign
, bExp
, aSig
, status
);
4227 /*----------------------------------------------------------------------------
4228 | Returns the result of multiplying the double-precision floating-point values
4229 | `a' and `b' then adding 'c', with no intermediate rounding step after the
4230 | multiplication. The operation is performed according to the IEC/IEEE
4231 | Standard for Binary Floating-Point Arithmetic 754-2008.
4232 | The flags argument allows the caller to select negation of the
4233 | addend, the intermediate product, or the final result. (The difference
4234 | between this and having the caller do a separate negation is that negating
4235 | externally will flip the sign bit on NaNs.)
4236 *----------------------------------------------------------------------------*/
4238 float64
float64_muladd(float64 a
, float64 b
, float64 c
, int flags
,
4239 float_status
*status
)
4241 flag aSign
, bSign
, cSign
, zSign
;
4242 int aExp
, bExp
, cExp
, pExp
, zExp
, expDiff
;
4243 uint64_t aSig
, bSig
, cSig
;
4244 flag pInf
, pZero
, pSign
;
4245 uint64_t pSig0
, pSig1
, cSig0
, cSig1
, zSig0
, zSig1
;
4247 flag signflip
, infzero
;
4249 a
= float64_squash_input_denormal(a
, status
);
4250 b
= float64_squash_input_denormal(b
, status
);
4251 c
= float64_squash_input_denormal(c
, status
);
4252 aSig
= extractFloat64Frac(a
);
4253 aExp
= extractFloat64Exp(a
);
4254 aSign
= extractFloat64Sign(a
);
4255 bSig
= extractFloat64Frac(b
);
4256 bExp
= extractFloat64Exp(b
);
4257 bSign
= extractFloat64Sign(b
);
4258 cSig
= extractFloat64Frac(c
);
4259 cExp
= extractFloat64Exp(c
);
4260 cSign
= extractFloat64Sign(c
);
4262 infzero
= ((aExp
== 0 && aSig
== 0 && bExp
== 0x7ff && bSig
== 0) ||
4263 (aExp
== 0x7ff && aSig
== 0 && bExp
== 0 && bSig
== 0));
4265 /* It is implementation-defined whether the cases of (0,inf,qnan)
4266 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
4267 * they return if they do), so we have to hand this information
4268 * off to the target-specific pick-a-NaN routine.
4270 if (((aExp
== 0x7ff) && aSig
) ||
4271 ((bExp
== 0x7ff) && bSig
) ||
4272 ((cExp
== 0x7ff) && cSig
)) {
4273 return propagateFloat64MulAddNaN(a
, b
, c
, infzero
, status
);
4277 float_raise(float_flag_invalid
, status
);
4278 return float64_default_nan
;
4281 if (flags
& float_muladd_negate_c
) {
4285 signflip
= (flags
& float_muladd_negate_result
) ? 1 : 0;
4287 /* Work out the sign and type of the product */
4288 pSign
= aSign
^ bSign
;
4289 if (flags
& float_muladd_negate_product
) {
4292 pInf
= (aExp
== 0x7ff) || (bExp
== 0x7ff);
4293 pZero
= ((aExp
| aSig
) == 0) || ((bExp
| bSig
) == 0);
4295 if (cExp
== 0x7ff) {
4296 if (pInf
&& (pSign
^ cSign
)) {
4297 /* addition of opposite-signed infinities => InvalidOperation */
4298 float_raise(float_flag_invalid
, status
);
4299 return float64_default_nan
;
4301 /* Otherwise generate an infinity of the same sign */
4302 return packFloat64(cSign
^ signflip
, 0x7ff, 0);
4306 return packFloat64(pSign
^ signflip
, 0x7ff, 0);
4312 /* Adding two exact zeroes */
4313 if (pSign
== cSign
) {
4315 } else if (status
->float_rounding_mode
== float_round_down
) {
4320 return packFloat64(zSign
^ signflip
, 0, 0);
4322 /* Exact zero plus a denorm */
4323 if (status
->flush_to_zero
) {
4324 float_raise(float_flag_output_denormal
, status
);
4325 return packFloat64(cSign
^ signflip
, 0, 0);
4328 /* Zero plus something non-zero : just return the something */
4329 if (flags
& float_muladd_halve_result
) {
4331 normalizeFloat64Subnormal(cSig
, &cExp
, &cSig
);
4333 /* Subtract one to halve, and one again because roundAndPackFloat64
4334 * wants one less than the true exponent.
4337 cSig
= (cSig
| 0x0010000000000000ULL
) << 10;
4338 return roundAndPackFloat64(cSign
^ signflip
, cExp
, cSig
, status
);
4340 return packFloat64(cSign
^ signflip
, cExp
, cSig
);
4344 normalizeFloat64Subnormal(aSig
, &aExp
, &aSig
);
4347 normalizeFloat64Subnormal(bSig
, &bExp
, &bSig
);
4350 /* Calculate the actual result a * b + c */
4352 /* Multiply first; this is easy. */
4353 /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff
4354 * because we want the true exponent, not the "one-less-than"
4355 * flavour that roundAndPackFloat64() takes.
4357 pExp
= aExp
+ bExp
- 0x3fe;
4358 aSig
= (aSig
| LIT64(0x0010000000000000))<<10;
4359 bSig
= (bSig
| LIT64(0x0010000000000000))<<11;
4360 mul64To128(aSig
, bSig
, &pSig0
, &pSig1
);
4361 if ((int64_t)(pSig0
<< 1) >= 0) {
4362 shortShift128Left(pSig0
, pSig1
, 1, &pSig0
, &pSig1
);
4366 zSign
= pSign
^ signflip
;
4368 /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit
4369 * bit in position 126.
4373 /* Throw out the special case of c being an exact zero now */
4374 shift128RightJamming(pSig0
, pSig1
, 64, &pSig0
, &pSig1
);
4375 if (flags
& float_muladd_halve_result
) {
4378 return roundAndPackFloat64(zSign
, pExp
- 1,
4381 normalizeFloat64Subnormal(cSig
, &cExp
, &cSig
);
4384 /* Shift cSig and add the explicit bit so [cSig0:cSig1] is the
4385 * significand of the addend, with the explicit bit in position 126.
4387 cSig0
= cSig
<< (126 - 64 - 52);
4389 cSig0
|= LIT64(0x4000000000000000);
4390 expDiff
= pExp
- cExp
;
4392 if (pSign
== cSign
) {
4395 /* scale c to match p */
4396 shift128RightJamming(cSig0
, cSig1
, expDiff
, &cSig0
, &cSig1
);
4398 } else if (expDiff
< 0) {
4399 /* scale p to match c */
4400 shift128RightJamming(pSig0
, pSig1
, -expDiff
, &pSig0
, &pSig1
);
4403 /* no scaling needed */
4406 /* Add significands and make sure explicit bit ends up in posn 126 */
4407 add128(pSig0
, pSig1
, cSig0
, cSig1
, &zSig0
, &zSig1
);
4408 if ((int64_t)zSig0
< 0) {
4409 shift128RightJamming(zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
4413 shift128RightJamming(zSig0
, zSig1
, 64, &zSig0
, &zSig1
);
4414 if (flags
& float_muladd_halve_result
) {
4417 return roundAndPackFloat64(zSign
, zExp
, zSig1
, status
);
4421 shift128RightJamming(cSig0
, cSig1
, expDiff
, &cSig0
, &cSig1
);
4422 sub128(pSig0
, pSig1
, cSig0
, cSig1
, &zSig0
, &zSig1
);
4424 } else if (expDiff
< 0) {
4425 shift128RightJamming(pSig0
, pSig1
, -expDiff
, &pSig0
, &pSig1
);
4426 sub128(cSig0
, cSig1
, pSig0
, pSig1
, &zSig0
, &zSig1
);
4431 if (lt128(cSig0
, cSig1
, pSig0
, pSig1
)) {
4432 sub128(pSig0
, pSig1
, cSig0
, cSig1
, &zSig0
, &zSig1
);
4433 } else if (lt128(pSig0
, pSig1
, cSig0
, cSig1
)) {
4434 sub128(cSig0
, cSig1
, pSig0
, pSig1
, &zSig0
, &zSig1
);
4439 if (status
->float_rounding_mode
== float_round_down
) {
4442 return packFloat64(zSign
, 0, 0);
4446 /* Do the equivalent of normalizeRoundAndPackFloat64() but
4447 * starting with the significand in a pair of uint64_t.
4450 shiftcount
= countLeadingZeros64(zSig0
) - 1;
4451 shortShift128Left(zSig0
, zSig1
, shiftcount
, &zSig0
, &zSig1
);
4457 shiftcount
= countLeadingZeros64(zSig1
);
4458 if (shiftcount
== 0) {
4459 zSig0
= (zSig1
>> 1) | (zSig1
& 1);
4463 zSig0
= zSig1
<< shiftcount
;
4464 zExp
-= (shiftcount
+ 64);
4467 if (flags
& float_muladd_halve_result
) {
4470 return roundAndPackFloat64(zSign
, zExp
, zSig0
, status
);
4474 /*----------------------------------------------------------------------------
4475 | Returns the square root of the double-precision floating-point value `a'.
4476 | The operation is performed according to the IEC/IEEE Standard for Binary
4477 | Floating-Point Arithmetic.
4478 *----------------------------------------------------------------------------*/
4480 float64
float64_sqrt(float64 a
, float_status
*status
)
4484 uint64_t aSig
, zSig
, doubleZSig
;
4485 uint64_t rem0
, rem1
, term0
, term1
;
4486 a
= float64_squash_input_denormal(a
, status
);
4488 aSig
= extractFloat64Frac( a
);
4489 aExp
= extractFloat64Exp( a
);
4490 aSign
= extractFloat64Sign( a
);
4491 if ( aExp
== 0x7FF ) {
4493 return propagateFloat64NaN(a
, a
, status
);
4495 if ( ! aSign
) return a
;
4496 float_raise(float_flag_invalid
, status
);
4497 return float64_default_nan
;
4500 if ( ( aExp
| aSig
) == 0 ) return a
;
4501 float_raise(float_flag_invalid
, status
);
4502 return float64_default_nan
;
4505 if ( aSig
== 0 ) return float64_zero
;
4506 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
4508 zExp
= ( ( aExp
- 0x3FF )>>1 ) + 0x3FE;
4509 aSig
|= LIT64( 0x0010000000000000 );
4510 zSig
= estimateSqrt32( aExp
, aSig
>>21 );
4511 aSig
<<= 9 - ( aExp
& 1 );
4512 zSig
= estimateDiv128To64( aSig
, 0, zSig
<<32 ) + ( zSig
<<30 );
4513 if ( ( zSig
& 0x1FF ) <= 5 ) {
4514 doubleZSig
= zSig
<<1;
4515 mul64To128( zSig
, zSig
, &term0
, &term1
);
4516 sub128( aSig
, 0, term0
, term1
, &rem0
, &rem1
);
4517 while ( (int64_t) rem0
< 0 ) {
4520 add128( rem0
, rem1
, zSig
>>63, doubleZSig
| 1, &rem0
, &rem1
);
4522 zSig
|= ( ( rem0
| rem1
) != 0 );
4524 return roundAndPackFloat64(0, zExp
, zSig
, status
);
4528 /*----------------------------------------------------------------------------
4529 | Returns the binary log of the double-precision floating-point value `a'.
4530 | The operation is performed according to the IEC/IEEE Standard for Binary
4531 | Floating-Point Arithmetic.
4532 *----------------------------------------------------------------------------*/
4533 float64
float64_log2(float64 a
, float_status
*status
)
4537 uint64_t aSig
, aSig0
, aSig1
, zSig
, i
;
4538 a
= float64_squash_input_denormal(a
, status
);
4540 aSig
= extractFloat64Frac( a
);
4541 aExp
= extractFloat64Exp( a
);
4542 aSign
= extractFloat64Sign( a
);
4545 if ( aSig
== 0 ) return packFloat64( 1, 0x7FF, 0 );
4546 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
4549 float_raise(float_flag_invalid
, status
);
4550 return float64_default_nan
;
4552 if ( aExp
== 0x7FF ) {
4554 return propagateFloat64NaN(a
, float64_zero
, status
);
4560 aSig
|= LIT64( 0x0010000000000000 );
4562 zSig
= (uint64_t)aExp
<< 52;
4563 for (i
= 1LL << 51; i
> 0; i
>>= 1) {
4564 mul64To128( aSig
, aSig
, &aSig0
, &aSig1
);
4565 aSig
= ( aSig0
<< 12 ) | ( aSig1
>> 52 );
4566 if ( aSig
& LIT64( 0x0020000000000000 ) ) {
4574 return normalizeRoundAndPackFloat64(zSign
, 0x408, zSig
, status
);
4577 /*----------------------------------------------------------------------------
4578 | Returns 1 if the double-precision floating-point value `a' is equal to the
4579 | corresponding value `b', and 0 otherwise. The invalid exception is raised
4580 | if either operand is a NaN. Otherwise, the comparison is performed
4581 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4582 *----------------------------------------------------------------------------*/
4584 int float64_eq(float64 a
, float64 b
, float_status
*status
)
4587 a
= float64_squash_input_denormal(a
, status
);
4588 b
= float64_squash_input_denormal(b
, status
);
4590 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4591 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4593 float_raise(float_flag_invalid
, status
);
4596 av
= float64_val(a
);
4597 bv
= float64_val(b
);
4598 return ( av
== bv
) || ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
4602 /*----------------------------------------------------------------------------
4603 | Returns 1 if the double-precision floating-point value `a' is less than or
4604 | equal to the corresponding value `b', and 0 otherwise. The invalid
4605 | exception is raised if either operand is a NaN. The comparison is performed
4606 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4607 *----------------------------------------------------------------------------*/
4609 int float64_le(float64 a
, float64 b
, float_status
*status
)
4613 a
= float64_squash_input_denormal(a
, status
);
4614 b
= float64_squash_input_denormal(b
, status
);
4616 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4617 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4619 float_raise(float_flag_invalid
, status
);
4622 aSign
= extractFloat64Sign( a
);
4623 bSign
= extractFloat64Sign( b
);
4624 av
= float64_val(a
);
4625 bv
= float64_val(b
);
4626 if ( aSign
!= bSign
) return aSign
|| ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
4627 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
4631 /*----------------------------------------------------------------------------
4632 | Returns 1 if the double-precision floating-point value `a' is less than
4633 | the corresponding value `b', and 0 otherwise. The invalid exception is
4634 | raised if either operand is a NaN. The comparison is performed according
4635 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4636 *----------------------------------------------------------------------------*/
4638 int float64_lt(float64 a
, float64 b
, float_status
*status
)
4643 a
= float64_squash_input_denormal(a
, status
);
4644 b
= float64_squash_input_denormal(b
, status
);
4645 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4646 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4648 float_raise(float_flag_invalid
, status
);
4651 aSign
= extractFloat64Sign( a
);
4652 bSign
= extractFloat64Sign( b
);
4653 av
= float64_val(a
);
4654 bv
= float64_val(b
);
4655 if ( aSign
!= bSign
) return aSign
&& ( (uint64_t) ( ( av
| bv
)<<1 ) != 0 );
4656 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
4660 /*----------------------------------------------------------------------------
4661 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4662 | be compared, and 0 otherwise. The invalid exception is raised if either
4663 | operand is a NaN. The comparison is performed according to the IEC/IEEE
4664 | Standard for Binary Floating-Point Arithmetic.
4665 *----------------------------------------------------------------------------*/
4667 int float64_unordered(float64 a
, float64 b
, float_status
*status
)
4669 a
= float64_squash_input_denormal(a
, status
);
4670 b
= float64_squash_input_denormal(b
, status
);
4672 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4673 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4675 float_raise(float_flag_invalid
, status
);
4681 /*----------------------------------------------------------------------------
4682 | Returns 1 if the double-precision floating-point value `a' is equal to the
4683 | corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4684 | exception.The comparison is performed according to the IEC/IEEE Standard
4685 | for Binary Floating-Point Arithmetic.
4686 *----------------------------------------------------------------------------*/
4688 int float64_eq_quiet(float64 a
, float64 b
, float_status
*status
)
4691 a
= float64_squash_input_denormal(a
, status
);
4692 b
= float64_squash_input_denormal(b
, status
);
4694 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4695 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4697 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
4698 float_raise(float_flag_invalid
, status
);
4702 av
= float64_val(a
);
4703 bv
= float64_val(b
);
4704 return ( av
== bv
) || ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
4708 /*----------------------------------------------------------------------------
4709 | Returns 1 if the double-precision floating-point value `a' is less than or
4710 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4711 | cause an exception. Otherwise, the comparison is performed according to the
4712 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4713 *----------------------------------------------------------------------------*/
4715 int float64_le_quiet(float64 a
, float64 b
, float_status
*status
)
4719 a
= float64_squash_input_denormal(a
, status
);
4720 b
= float64_squash_input_denormal(b
, status
);
4722 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4723 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4725 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
4726 float_raise(float_flag_invalid
, status
);
4730 aSign
= extractFloat64Sign( a
);
4731 bSign
= extractFloat64Sign( b
);
4732 av
= float64_val(a
);
4733 bv
= float64_val(b
);
4734 if ( aSign
!= bSign
) return aSign
|| ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
4735 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
4739 /*----------------------------------------------------------------------------
4740 | Returns 1 if the double-precision floating-point value `a' is less than
4741 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4742 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
4743 | Standard for Binary Floating-Point Arithmetic.
4744 *----------------------------------------------------------------------------*/
4746 int float64_lt_quiet(float64 a
, float64 b
, float_status
*status
)
4750 a
= float64_squash_input_denormal(a
, status
);
4751 b
= float64_squash_input_denormal(b
, status
);
4753 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4754 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4756 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
4757 float_raise(float_flag_invalid
, status
);
4761 aSign
= extractFloat64Sign( a
);
4762 bSign
= extractFloat64Sign( b
);
4763 av
= float64_val(a
);
4764 bv
= float64_val(b
);
4765 if ( aSign
!= bSign
) return aSign
&& ( (uint64_t) ( ( av
| bv
)<<1 ) != 0 );
4766 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
4770 /*----------------------------------------------------------------------------
4771 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4772 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
4773 | comparison is performed according to the IEC/IEEE Standard for Binary
4774 | Floating-Point Arithmetic.
4775 *----------------------------------------------------------------------------*/
4777 int float64_unordered_quiet(float64 a
, float64 b
, float_status
*status
)
4779 a
= float64_squash_input_denormal(a
, status
);
4780 b
= float64_squash_input_denormal(b
, status
);
4782 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4783 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4785 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
4786 float_raise(float_flag_invalid
, status
);
4793 /*----------------------------------------------------------------------------
4794 | Returns the result of converting the extended double-precision floating-
4795 | point value `a' to the 32-bit two's complement integer format. The
4796 | conversion is performed according to the IEC/IEEE Standard for Binary
4797 | Floating-Point Arithmetic---which means in particular that the conversion
4798 | is rounded according to the current rounding mode. If `a' is a NaN, the
4799 | largest positive integer is returned. Otherwise, if the conversion
4800 | overflows, the largest integer with the same sign as `a' is returned.
4801 *----------------------------------------------------------------------------*/
4803 int32_t floatx80_to_int32(floatx80 a
, float_status
*status
)
4806 int32_t aExp
, shiftCount
;
4809 aSig
= extractFloatx80Frac( a
);
4810 aExp
= extractFloatx80Exp( a
);
4811 aSign
= extractFloatx80Sign( a
);
4812 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) aSign
= 0;
4813 shiftCount
= 0x4037 - aExp
;
4814 if ( shiftCount
<= 0 ) shiftCount
= 1;
4815 shift64RightJamming( aSig
, shiftCount
, &aSig
);
4816 return roundAndPackInt32(aSign
, aSig
, status
);
4820 /*----------------------------------------------------------------------------
4821 | Returns the result of converting the extended double-precision floating-
4822 | point value `a' to the 32-bit two's complement integer format. The
4823 | conversion is performed according to the IEC/IEEE Standard for Binary
4824 | Floating-Point Arithmetic, except that the conversion is always rounded
4825 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4826 | Otherwise, if the conversion overflows, the largest integer with the same
4827 | sign as `a' is returned.
4828 *----------------------------------------------------------------------------*/
4830 int32_t floatx80_to_int32_round_to_zero(floatx80 a
, float_status
*status
)
4833 int32_t aExp
, shiftCount
;
4834 uint64_t aSig
, savedASig
;
4837 aSig
= extractFloatx80Frac( a
);
4838 aExp
= extractFloatx80Exp( a
);
4839 aSign
= extractFloatx80Sign( a
);
4840 if ( 0x401E < aExp
) {
4841 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) aSign
= 0;
4844 else if ( aExp
< 0x3FFF ) {
4846 status
->float_exception_flags
|= float_flag_inexact
;
4850 shiftCount
= 0x403E - aExp
;
4852 aSig
>>= shiftCount
;
4854 if ( aSign
) z
= - z
;
4855 if ( ( z
< 0 ) ^ aSign
) {
4857 float_raise(float_flag_invalid
, status
);
4858 return aSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
4860 if ( ( aSig
<<shiftCount
) != savedASig
) {
4861 status
->float_exception_flags
|= float_flag_inexact
;
4867 /*----------------------------------------------------------------------------
4868 | Returns the result of converting the extended double-precision floating-
4869 | point value `a' to the 64-bit two's complement integer format. The
4870 | conversion is performed according to the IEC/IEEE Standard for Binary
4871 | Floating-Point Arithmetic---which means in particular that the conversion
4872 | is rounded according to the current rounding mode. If `a' is a NaN,
4873 | the largest positive integer is returned. Otherwise, if the conversion
4874 | overflows, the largest integer with the same sign as `a' is returned.
4875 *----------------------------------------------------------------------------*/
4877 int64_t floatx80_to_int64(floatx80 a
, float_status
*status
)
4880 int32_t aExp
, shiftCount
;
4881 uint64_t aSig
, aSigExtra
;
4883 aSig
= extractFloatx80Frac( a
);
4884 aExp
= extractFloatx80Exp( a
);
4885 aSign
= extractFloatx80Sign( a
);
4886 shiftCount
= 0x403E - aExp
;
4887 if ( shiftCount
<= 0 ) {
4889 float_raise(float_flag_invalid
, status
);
4891 || ( ( aExp
== 0x7FFF )
4892 && ( aSig
!= LIT64( 0x8000000000000000 ) ) )
4894 return LIT64( 0x7FFFFFFFFFFFFFFF );
4896 return (int64_t) LIT64( 0x8000000000000000 );
4901 shift64ExtraRightJamming( aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
4903 return roundAndPackInt64(aSign
, aSig
, aSigExtra
, status
);
4907 /*----------------------------------------------------------------------------
4908 | Returns the result of converting the extended double-precision floating-
4909 | point value `a' to the 64-bit two's complement integer format. The
4910 | conversion is performed according to the IEC/IEEE Standard for Binary
4911 | Floating-Point Arithmetic, except that the conversion is always rounded
4912 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4913 | Otherwise, if the conversion overflows, the largest integer with the same
4914 | sign as `a' is returned.
4915 *----------------------------------------------------------------------------*/
4917 int64_t floatx80_to_int64_round_to_zero(floatx80 a
, float_status
*status
)
4920 int32_t aExp
, shiftCount
;
4924 aSig
= extractFloatx80Frac( a
);
4925 aExp
= extractFloatx80Exp( a
);
4926 aSign
= extractFloatx80Sign( a
);
4927 shiftCount
= aExp
- 0x403E;
4928 if ( 0 <= shiftCount
) {
4929 aSig
&= LIT64( 0x7FFFFFFFFFFFFFFF );
4930 if ( ( a
.high
!= 0xC03E ) || aSig
) {
4931 float_raise(float_flag_invalid
, status
);
4932 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && aSig
) ) {
4933 return LIT64( 0x7FFFFFFFFFFFFFFF );
4936 return (int64_t) LIT64( 0x8000000000000000 );
4938 else if ( aExp
< 0x3FFF ) {
4940 status
->float_exception_flags
|= float_flag_inexact
;
4944 z
= aSig
>>( - shiftCount
);
4945 if ( (uint64_t) ( aSig
<<( shiftCount
& 63 ) ) ) {
4946 status
->float_exception_flags
|= float_flag_inexact
;
4948 if ( aSign
) z
= - z
;
4953 /*----------------------------------------------------------------------------
4954 | Returns the result of converting the extended double-precision floating-
4955 | point value `a' to the single-precision floating-point format. The
4956 | conversion is performed according to the IEC/IEEE Standard for Binary
4957 | Floating-Point Arithmetic.
4958 *----------------------------------------------------------------------------*/
4960 float32
floatx80_to_float32(floatx80 a
, float_status
*status
)
4966 aSig
= extractFloatx80Frac( a
);
4967 aExp
= extractFloatx80Exp( a
);
4968 aSign
= extractFloatx80Sign( a
);
4969 if ( aExp
== 0x7FFF ) {
4970 if ( (uint64_t) ( aSig
<<1 ) ) {
4971 return commonNaNToFloat32(floatx80ToCommonNaN(a
, status
), status
);
4973 return packFloat32( aSign
, 0xFF, 0 );
4975 shift64RightJamming( aSig
, 33, &aSig
);
4976 if ( aExp
|| aSig
) aExp
-= 0x3F81;
4977 return roundAndPackFloat32(aSign
, aExp
, aSig
, status
);
4981 /*----------------------------------------------------------------------------
4982 | Returns the result of converting the extended double-precision floating-
4983 | point value `a' to the double-precision floating-point format. The
4984 | conversion is performed according to the IEC/IEEE Standard for Binary
4985 | Floating-Point Arithmetic.
4986 *----------------------------------------------------------------------------*/
4988 float64
floatx80_to_float64(floatx80 a
, float_status
*status
)
4992 uint64_t aSig
, zSig
;
4994 aSig
= extractFloatx80Frac( a
);
4995 aExp
= extractFloatx80Exp( a
);
4996 aSign
= extractFloatx80Sign( a
);
4997 if ( aExp
== 0x7FFF ) {
4998 if ( (uint64_t) ( aSig
<<1 ) ) {
4999 return commonNaNToFloat64(floatx80ToCommonNaN(a
, status
), status
);
5001 return packFloat64( aSign
, 0x7FF, 0 );
5003 shift64RightJamming( aSig
, 1, &zSig
);
5004 if ( aExp
|| aSig
) aExp
-= 0x3C01;
5005 return roundAndPackFloat64(aSign
, aExp
, zSig
, status
);
5009 /*----------------------------------------------------------------------------
5010 | Returns the result of converting the extended double-precision floating-
5011 | point value `a' to the quadruple-precision floating-point format. The
5012 | conversion is performed according to the IEC/IEEE Standard for Binary
5013 | Floating-Point Arithmetic.
5014 *----------------------------------------------------------------------------*/
5016 float128
floatx80_to_float128(floatx80 a
, float_status
*status
)
5020 uint64_t aSig
, zSig0
, zSig1
;
5022 aSig
= extractFloatx80Frac( a
);
5023 aExp
= extractFloatx80Exp( a
);
5024 aSign
= extractFloatx80Sign( a
);
5025 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) {
5026 return commonNaNToFloat128(floatx80ToCommonNaN(a
, status
), status
);
5028 shift128Right( aSig
<<1, 0, 16, &zSig0
, &zSig1
);
5029 return packFloat128( aSign
, aExp
, zSig0
, zSig1
);
5033 /*----------------------------------------------------------------------------
5034 | Rounds the extended double-precision floating-point value `a' to an integer,
5035 | and returns the result as an extended quadruple-precision floating-point
5036 | value. The operation is performed according to the IEC/IEEE Standard for
5037 | Binary Floating-Point Arithmetic.
5038 *----------------------------------------------------------------------------*/
5040 floatx80
floatx80_round_to_int(floatx80 a
, float_status
*status
)
5044 uint64_t lastBitMask
, roundBitsMask
;
5047 aExp
= extractFloatx80Exp( a
);
5048 if ( 0x403E <= aExp
) {
5049 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) ) {
5050 return propagateFloatx80NaN(a
, a
, status
);
5054 if ( aExp
< 0x3FFF ) {
5056 && ( (uint64_t) ( extractFloatx80Frac( a
)<<1 ) == 0 ) ) {
5059 status
->float_exception_flags
|= float_flag_inexact
;
5060 aSign
= extractFloatx80Sign( a
);
5061 switch (status
->float_rounding_mode
) {
5062 case float_round_nearest_even
:
5063 if ( ( aExp
== 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a
)<<1 )
5066 packFloatx80( aSign
, 0x3FFF, LIT64( 0x8000000000000000 ) );
5069 case float_round_ties_away
:
5070 if (aExp
== 0x3FFE) {
5071 return packFloatx80(aSign
, 0x3FFF, LIT64(0x8000000000000000));
5074 case float_round_down
:
5077 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
5078 : packFloatx80( 0, 0, 0 );
5079 case float_round_up
:
5081 aSign
? packFloatx80( 1, 0, 0 )
5082 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
5084 return packFloatx80( aSign
, 0, 0 );
5087 lastBitMask
<<= 0x403E - aExp
;
5088 roundBitsMask
= lastBitMask
- 1;
5090 switch (status
->float_rounding_mode
) {
5091 case float_round_nearest_even
:
5092 z
.low
+= lastBitMask
>>1;
5093 if ((z
.low
& roundBitsMask
) == 0) {
5094 z
.low
&= ~lastBitMask
;
5097 case float_round_ties_away
:
5098 z
.low
+= lastBitMask
>> 1;
5100 case float_round_to_zero
:
5102 case float_round_up
:
5103 if (!extractFloatx80Sign(z
)) {
5104 z
.low
+= roundBitsMask
;
5107 case float_round_down
:
5108 if (extractFloatx80Sign(z
)) {
5109 z
.low
+= roundBitsMask
;
5115 z
.low
&= ~ roundBitsMask
;
5118 z
.low
= LIT64( 0x8000000000000000 );
5120 if (z
.low
!= a
.low
) {
5121 status
->float_exception_flags
|= float_flag_inexact
;
5127 /*----------------------------------------------------------------------------
5128 | Returns the result of adding the absolute values of the extended double-
5129 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
5130 | negated before being returned. `zSign' is ignored if the result is a NaN.
5131 | The addition is performed according to the IEC/IEEE Standard for Binary
5132 | Floating-Point Arithmetic.
5133 *----------------------------------------------------------------------------*/
5135 static floatx80
addFloatx80Sigs(floatx80 a
, floatx80 b
, flag zSign
,
5136 float_status
*status
)
5138 int32_t aExp
, bExp
, zExp
;
5139 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5142 aSig
= extractFloatx80Frac( a
);
5143 aExp
= extractFloatx80Exp( a
);
5144 bSig
= extractFloatx80Frac( b
);
5145 bExp
= extractFloatx80Exp( b
);
5146 expDiff
= aExp
- bExp
;
5147 if ( 0 < expDiff
) {
5148 if ( aExp
== 0x7FFF ) {
5149 if ((uint64_t)(aSig
<< 1)) {
5150 return propagateFloatx80NaN(a
, b
, status
);
5154 if ( bExp
== 0 ) --expDiff
;
5155 shift64ExtraRightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
5158 else if ( expDiff
< 0 ) {
5159 if ( bExp
== 0x7FFF ) {
5160 if ((uint64_t)(bSig
<< 1)) {
5161 return propagateFloatx80NaN(a
, b
, status
);
5163 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
5165 if ( aExp
== 0 ) ++expDiff
;
5166 shift64ExtraRightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
5170 if ( aExp
== 0x7FFF ) {
5171 if ( (uint64_t) ( ( aSig
| bSig
)<<1 ) ) {
5172 return propagateFloatx80NaN(a
, b
, status
);
5177 zSig0
= aSig
+ bSig
;
5179 normalizeFloatx80Subnormal( zSig0
, &zExp
, &zSig0
);
5185 zSig0
= aSig
+ bSig
;
5186 if ( (int64_t) zSig0
< 0 ) goto roundAndPack
;
5188 shift64ExtraRightJamming( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
5189 zSig0
|= LIT64( 0x8000000000000000 );
5192 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5193 zSign
, zExp
, zSig0
, zSig1
, status
);
5196 /*----------------------------------------------------------------------------
5197 | Returns the result of subtracting the absolute values of the extended
5198 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
5199 | difference is negated before being returned. `zSign' is ignored if the
5200 | result is a NaN. The subtraction is performed according to the IEC/IEEE
5201 | Standard for Binary Floating-Point Arithmetic.
5202 *----------------------------------------------------------------------------*/
5204 static floatx80
subFloatx80Sigs(floatx80 a
, floatx80 b
, flag zSign
,
5205 float_status
*status
)
5207 int32_t aExp
, bExp
, zExp
;
5208 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5212 aSig
= extractFloatx80Frac( a
);
5213 aExp
= extractFloatx80Exp( a
);
5214 bSig
= extractFloatx80Frac( b
);
5215 bExp
= extractFloatx80Exp( b
);
5216 expDiff
= aExp
- bExp
;
5217 if ( 0 < expDiff
) goto aExpBigger
;
5218 if ( expDiff
< 0 ) goto bExpBigger
;
5219 if ( aExp
== 0x7FFF ) {
5220 if ( (uint64_t) ( ( aSig
| bSig
)<<1 ) ) {
5221 return propagateFloatx80NaN(a
, b
, status
);
5223 float_raise(float_flag_invalid
, status
);
5224 z
.low
= floatx80_default_nan_low
;
5225 z
.high
= floatx80_default_nan_high
;
5233 if ( bSig
< aSig
) goto aBigger
;
5234 if ( aSig
< bSig
) goto bBigger
;
5235 return packFloatx80(status
->float_rounding_mode
== float_round_down
, 0, 0);
5237 if ( bExp
== 0x7FFF ) {
5238 if ((uint64_t)(bSig
<< 1)) {
5239 return propagateFloatx80NaN(a
, b
, status
);
5241 return packFloatx80( zSign
^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
5243 if ( aExp
== 0 ) ++expDiff
;
5244 shift128RightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
5246 sub128( bSig
, 0, aSig
, zSig1
, &zSig0
, &zSig1
);
5249 goto normalizeRoundAndPack
;
5251 if ( aExp
== 0x7FFF ) {
5252 if ((uint64_t)(aSig
<< 1)) {
5253 return propagateFloatx80NaN(a
, b
, status
);
5257 if ( bExp
== 0 ) --expDiff
;
5258 shift128RightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
5260 sub128( aSig
, 0, bSig
, zSig1
, &zSig0
, &zSig1
);
5262 normalizeRoundAndPack
:
5263 return normalizeRoundAndPackFloatx80(status
->floatx80_rounding_precision
,
5264 zSign
, zExp
, zSig0
, zSig1
, status
);
5267 /*----------------------------------------------------------------------------
5268 | Returns the result of adding the extended double-precision floating-point
5269 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5270 | Standard for Binary Floating-Point Arithmetic.
5271 *----------------------------------------------------------------------------*/
5273 floatx80
floatx80_add(floatx80 a
, floatx80 b
, float_status
*status
)
5277 aSign
= extractFloatx80Sign( a
);
5278 bSign
= extractFloatx80Sign( b
);
5279 if ( aSign
== bSign
) {
5280 return addFloatx80Sigs(a
, b
, aSign
, status
);
5283 return subFloatx80Sigs(a
, b
, aSign
, status
);
5288 /*----------------------------------------------------------------------------
5289 | Returns the result of subtracting the extended double-precision floating-
5290 | point values `a' and `b'. The operation is performed according to the
5291 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5292 *----------------------------------------------------------------------------*/
5294 floatx80
floatx80_sub(floatx80 a
, floatx80 b
, float_status
*status
)
5298 aSign
= extractFloatx80Sign( a
);
5299 bSign
= extractFloatx80Sign( b
);
5300 if ( aSign
== bSign
) {
5301 return subFloatx80Sigs(a
, b
, aSign
, status
);
5304 return addFloatx80Sigs(a
, b
, aSign
, status
);
5309 /*----------------------------------------------------------------------------
5310 | Returns the result of multiplying the extended double-precision floating-
5311 | point values `a' and `b'. The operation is performed according to the
5312 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5313 *----------------------------------------------------------------------------*/
5315 floatx80
floatx80_mul(floatx80 a
, floatx80 b
, float_status
*status
)
5317 flag aSign
, bSign
, zSign
;
5318 int32_t aExp
, bExp
, zExp
;
5319 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5322 aSig
= extractFloatx80Frac( a
);
5323 aExp
= extractFloatx80Exp( a
);
5324 aSign
= extractFloatx80Sign( a
);
5325 bSig
= extractFloatx80Frac( b
);
5326 bExp
= extractFloatx80Exp( b
);
5327 bSign
= extractFloatx80Sign( b
);
5328 zSign
= aSign
^ bSign
;
5329 if ( aExp
== 0x7FFF ) {
5330 if ( (uint64_t) ( aSig
<<1 )
5331 || ( ( bExp
== 0x7FFF ) && (uint64_t) ( bSig
<<1 ) ) ) {
5332 return propagateFloatx80NaN(a
, b
, status
);
5334 if ( ( bExp
| bSig
) == 0 ) goto invalid
;
5335 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
5337 if ( bExp
== 0x7FFF ) {
5338 if ((uint64_t)(bSig
<< 1)) {
5339 return propagateFloatx80NaN(a
, b
, status
);
5341 if ( ( aExp
| aSig
) == 0 ) {
5343 float_raise(float_flag_invalid
, status
);
5344 z
.low
= floatx80_default_nan_low
;
5345 z
.high
= floatx80_default_nan_high
;
5348 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
5351 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
5352 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
5355 if ( bSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
5356 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
5358 zExp
= aExp
+ bExp
- 0x3FFE;
5359 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
5360 if ( 0 < (int64_t) zSig0
) {
5361 shortShift128Left( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
5364 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5365 zSign
, zExp
, zSig0
, zSig1
, status
);
5368 /*----------------------------------------------------------------------------
5369 | Returns the result of dividing the extended double-precision floating-point
5370 | value `a' by the corresponding value `b'. The operation is performed
5371 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5372 *----------------------------------------------------------------------------*/
5374 floatx80
floatx80_div(floatx80 a
, floatx80 b
, float_status
*status
)
5376 flag aSign
, bSign
, zSign
;
5377 int32_t aExp
, bExp
, zExp
;
5378 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5379 uint64_t rem0
, rem1
, rem2
, term0
, term1
, term2
;
5382 aSig
= extractFloatx80Frac( a
);
5383 aExp
= extractFloatx80Exp( a
);
5384 aSign
= extractFloatx80Sign( a
);
5385 bSig
= extractFloatx80Frac( b
);
5386 bExp
= extractFloatx80Exp( b
);
5387 bSign
= extractFloatx80Sign( b
);
5388 zSign
= aSign
^ bSign
;
5389 if ( aExp
== 0x7FFF ) {
5390 if ((uint64_t)(aSig
<< 1)) {
5391 return propagateFloatx80NaN(a
, b
, status
);
5393 if ( bExp
== 0x7FFF ) {
5394 if ((uint64_t)(bSig
<< 1)) {
5395 return propagateFloatx80NaN(a
, b
, status
);
5399 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
5401 if ( bExp
== 0x7FFF ) {
5402 if ((uint64_t)(bSig
<< 1)) {
5403 return propagateFloatx80NaN(a
, b
, status
);
5405 return packFloatx80( zSign
, 0, 0 );
5409 if ( ( aExp
| aSig
) == 0 ) {
5411 float_raise(float_flag_invalid
, status
);
5412 z
.low
= floatx80_default_nan_low
;
5413 z
.high
= floatx80_default_nan_high
;
5416 float_raise(float_flag_divbyzero
, status
);
5417 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
5419 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
5422 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
5423 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
5425 zExp
= aExp
- bExp
+ 0x3FFE;
5427 if ( bSig
<= aSig
) {
5428 shift128Right( aSig
, 0, 1, &aSig
, &rem1
);
5431 zSig0
= estimateDiv128To64( aSig
, rem1
, bSig
);
5432 mul64To128( bSig
, zSig0
, &term0
, &term1
);
5433 sub128( aSig
, rem1
, term0
, term1
, &rem0
, &rem1
);
5434 while ( (int64_t) rem0
< 0 ) {
5436 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
5438 zSig1
= estimateDiv128To64( rem1
, 0, bSig
);
5439 if ( (uint64_t) ( zSig1
<<1 ) <= 8 ) {
5440 mul64To128( bSig
, zSig1
, &term1
, &term2
);
5441 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
5442 while ( (int64_t) rem1
< 0 ) {
5444 add128( rem1
, rem2
, 0, bSig
, &rem1
, &rem2
);
5446 zSig1
|= ( ( rem1
| rem2
) != 0 );
5448 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5449 zSign
, zExp
, zSig0
, zSig1
, status
);
5452 /*----------------------------------------------------------------------------
5453 | Returns the remainder of the extended double-precision floating-point value
5454 | `a' with respect to the corresponding value `b'. The operation is performed
5455 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5456 *----------------------------------------------------------------------------*/
5458 floatx80
floatx80_rem(floatx80 a
, floatx80 b
, float_status
*status
)
5461 int32_t aExp
, bExp
, expDiff
;
5462 uint64_t aSig0
, aSig1
, bSig
;
5463 uint64_t q
, term0
, term1
, alternateASig0
, alternateASig1
;
5466 aSig0
= extractFloatx80Frac( a
);
5467 aExp
= extractFloatx80Exp( a
);
5468 aSign
= extractFloatx80Sign( a
);
5469 bSig
= extractFloatx80Frac( b
);
5470 bExp
= extractFloatx80Exp( b
);
5471 if ( aExp
== 0x7FFF ) {
5472 if ( (uint64_t) ( aSig0
<<1 )
5473 || ( ( bExp
== 0x7FFF ) && (uint64_t) ( bSig
<<1 ) ) ) {
5474 return propagateFloatx80NaN(a
, b
, status
);
5478 if ( bExp
== 0x7FFF ) {
5479 if ((uint64_t)(bSig
<< 1)) {
5480 return propagateFloatx80NaN(a
, b
, status
);
5487 float_raise(float_flag_invalid
, status
);
5488 z
.low
= floatx80_default_nan_low
;
5489 z
.high
= floatx80_default_nan_high
;
5492 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
5495 if ( (uint64_t) ( aSig0
<<1 ) == 0 ) return a
;
5496 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
5498 bSig
|= LIT64( 0x8000000000000000 );
5500 expDiff
= aExp
- bExp
;
5502 if ( expDiff
< 0 ) {
5503 if ( expDiff
< -1 ) return a
;
5504 shift128Right( aSig0
, 0, 1, &aSig0
, &aSig1
);
5507 q
= ( bSig
<= aSig0
);
5508 if ( q
) aSig0
-= bSig
;
5510 while ( 0 < expDiff
) {
5511 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
5512 q
= ( 2 < q
) ? q
- 2 : 0;
5513 mul64To128( bSig
, q
, &term0
, &term1
);
5514 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
5515 shortShift128Left( aSig0
, aSig1
, 62, &aSig0
, &aSig1
);
5519 if ( 0 < expDiff
) {
5520 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
5521 q
= ( 2 < q
) ? q
- 2 : 0;
5523 mul64To128( bSig
, q
<<( 64 - expDiff
), &term0
, &term1
);
5524 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
5525 shortShift128Left( 0, bSig
, 64 - expDiff
, &term0
, &term1
);
5526 while ( le128( term0
, term1
, aSig0
, aSig1
) ) {
5528 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
5535 sub128( term0
, term1
, aSig0
, aSig1
, &alternateASig0
, &alternateASig1
);
5536 if ( lt128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
5537 || ( eq128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
5540 aSig0
= alternateASig0
;
5541 aSig1
= alternateASig1
;
5545 normalizeRoundAndPackFloatx80(
5546 80, zSign
, bExp
+ expDiff
, aSig0
, aSig1
, status
);
5550 /*----------------------------------------------------------------------------
5551 | Returns the square root of the extended double-precision floating-point
5552 | value `a'. The operation is performed according to the IEC/IEEE Standard
5553 | for Binary Floating-Point Arithmetic.
5554 *----------------------------------------------------------------------------*/
5556 floatx80
floatx80_sqrt(floatx80 a
, float_status
*status
)
5560 uint64_t aSig0
, aSig1
, zSig0
, zSig1
, doubleZSig0
;
5561 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
5564 aSig0
= extractFloatx80Frac( a
);
5565 aExp
= extractFloatx80Exp( a
);
5566 aSign
= extractFloatx80Sign( a
);
5567 if ( aExp
== 0x7FFF ) {
5568 if ((uint64_t)(aSig0
<< 1)) {
5569 return propagateFloatx80NaN(a
, a
, status
);
5571 if ( ! aSign
) return a
;
5575 if ( ( aExp
| aSig0
) == 0 ) return a
;
5577 float_raise(float_flag_invalid
, status
);
5578 z
.low
= floatx80_default_nan_low
;
5579 z
.high
= floatx80_default_nan_high
;
5583 if ( aSig0
== 0 ) return packFloatx80( 0, 0, 0 );
5584 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
5586 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFF;
5587 zSig0
= estimateSqrt32( aExp
, aSig0
>>32 );
5588 shift128Right( aSig0
, 0, 2 + ( aExp
& 1 ), &aSig0
, &aSig1
);
5589 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
5590 doubleZSig0
= zSig0
<<1;
5591 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
5592 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
5593 while ( (int64_t) rem0
< 0 ) {
5596 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
5598 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
5599 if ( ( zSig1
& LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5600 if ( zSig1
== 0 ) zSig1
= 1;
5601 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
5602 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
5603 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
5604 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
5605 while ( (int64_t) rem1
< 0 ) {
5607 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
5609 term2
|= doubleZSig0
;
5610 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
5612 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
5614 shortShift128Left( 0, zSig1
, 1, &zSig0
, &zSig1
);
5615 zSig0
|= doubleZSig0
;
5616 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5617 0, zExp
, zSig0
, zSig1
, status
);
5620 /*----------------------------------------------------------------------------
5621 | Returns 1 if the extended double-precision floating-point value `a' is equal
5622 | to the corresponding value `b', and 0 otherwise. The invalid exception is
5623 | raised if either operand is a NaN. Otherwise, the comparison is performed
5624 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5625 *----------------------------------------------------------------------------*/
5627 int floatx80_eq(floatx80 a
, floatx80 b
, float_status
*status
)
5630 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5631 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5632 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5633 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5635 float_raise(float_flag_invalid
, status
);
5640 && ( ( a
.high
== b
.high
)
5642 && ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
5647 /*----------------------------------------------------------------------------
5648 | Returns 1 if the extended double-precision floating-point value `a' is
5649 | less than or equal to the corresponding value `b', and 0 otherwise. The
5650 | invalid exception is raised if either operand is a NaN. The comparison is
5651 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5653 *----------------------------------------------------------------------------*/
5655 int floatx80_le(floatx80 a
, floatx80 b
, float_status
*status
)
5659 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5660 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5661 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5662 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5664 float_raise(float_flag_invalid
, status
);
5667 aSign
= extractFloatx80Sign( a
);
5668 bSign
= extractFloatx80Sign( b
);
5669 if ( aSign
!= bSign
) {
5672 || ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5676 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
5677 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
5681 /*----------------------------------------------------------------------------
5682 | Returns 1 if the extended double-precision floating-point value `a' is
5683 | less than the corresponding value `b', and 0 otherwise. The invalid
5684 | exception is raised if either operand is a NaN. The comparison is performed
5685 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5686 *----------------------------------------------------------------------------*/
5688 int floatx80_lt(floatx80 a
, floatx80 b
, float_status
*status
)
5692 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5693 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5694 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5695 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5697 float_raise(float_flag_invalid
, status
);
5700 aSign
= extractFloatx80Sign( a
);
5701 bSign
= extractFloatx80Sign( b
);
5702 if ( aSign
!= bSign
) {
5705 && ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5709 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
5710 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
5714 /*----------------------------------------------------------------------------
5715 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5716 | cannot be compared, and 0 otherwise. The invalid exception is raised if
5717 | either operand is a NaN. The comparison is performed according to the
5718 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5719 *----------------------------------------------------------------------------*/
5720 int floatx80_unordered(floatx80 a
, floatx80 b
, float_status
*status
)
5722 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5723 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5724 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5725 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5727 float_raise(float_flag_invalid
, status
);
5733 /*----------------------------------------------------------------------------
5734 | Returns 1 if the extended double-precision floating-point value `a' is
5735 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5736 | cause an exception. The comparison is performed according to the IEC/IEEE
5737 | Standard for Binary Floating-Point Arithmetic.
5738 *----------------------------------------------------------------------------*/
5740 int floatx80_eq_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
5743 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5744 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5745 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5746 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5748 if ( floatx80_is_signaling_nan( a
)
5749 || floatx80_is_signaling_nan( b
) ) {
5750 float_raise(float_flag_invalid
, status
);
5756 && ( ( a
.high
== b
.high
)
5758 && ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
5763 /*----------------------------------------------------------------------------
5764 | Returns 1 if the extended double-precision floating-point value `a' is less
5765 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
5766 | do not cause an exception. Otherwise, the comparison is performed according
5767 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5768 *----------------------------------------------------------------------------*/
5770 int floatx80_le_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
5774 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5775 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5776 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5777 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5779 if ( floatx80_is_signaling_nan( a
)
5780 || floatx80_is_signaling_nan( b
) ) {
5781 float_raise(float_flag_invalid
, status
);
5785 aSign
= extractFloatx80Sign( a
);
5786 bSign
= extractFloatx80Sign( b
);
5787 if ( aSign
!= bSign
) {
5790 || ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5794 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
5795 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
5799 /*----------------------------------------------------------------------------
5800 | Returns 1 if the extended double-precision floating-point value `a' is less
5801 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
5802 | an exception. Otherwise, the comparison is performed according to the
5803 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5804 *----------------------------------------------------------------------------*/
5806 int floatx80_lt_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
5810 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5811 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5812 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5813 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5815 if ( floatx80_is_signaling_nan( a
)
5816 || floatx80_is_signaling_nan( b
) ) {
5817 float_raise(float_flag_invalid
, status
);
5821 aSign
= extractFloatx80Sign( a
);
5822 bSign
= extractFloatx80Sign( b
);
5823 if ( aSign
!= bSign
) {
5826 && ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5830 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
5831 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
5835 /*----------------------------------------------------------------------------
5836 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5837 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
5838 | The comparison is performed according to the IEC/IEEE Standard for Binary
5839 | Floating-Point Arithmetic.
5840 *----------------------------------------------------------------------------*/
5841 int floatx80_unordered_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
5843 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5844 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5845 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5846 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5848 if ( floatx80_is_signaling_nan( a
)
5849 || floatx80_is_signaling_nan( b
) ) {
5850 float_raise(float_flag_invalid
, status
);
5857 /*----------------------------------------------------------------------------
5858 | Returns the result of converting the quadruple-precision floating-point
5859 | value `a' to the 32-bit two's complement integer format. The conversion
5860 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5861 | Arithmetic---which means in particular that the conversion is rounded
5862 | according to the current rounding mode. If `a' is a NaN, the largest
5863 | positive integer is returned. Otherwise, if the conversion overflows, the
5864 | largest integer with the same sign as `a' is returned.
5865 *----------------------------------------------------------------------------*/
5867 int32_t float128_to_int32(float128 a
, float_status
*status
)
5870 int32_t aExp
, shiftCount
;
5871 uint64_t aSig0
, aSig1
;
5873 aSig1
= extractFloat128Frac1( a
);
5874 aSig0
= extractFloat128Frac0( a
);
5875 aExp
= extractFloat128Exp( a
);
5876 aSign
= extractFloat128Sign( a
);
5877 if ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) aSign
= 0;
5878 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
5879 aSig0
|= ( aSig1
!= 0 );
5880 shiftCount
= 0x4028 - aExp
;
5881 if ( 0 < shiftCount
) shift64RightJamming( aSig0
, shiftCount
, &aSig0
);
5882 return roundAndPackInt32(aSign
, aSig0
, status
);
5886 /*----------------------------------------------------------------------------
5887 | Returns the result of converting the quadruple-precision floating-point
5888 | value `a' to the 32-bit two's complement integer format. The conversion
5889 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5890 | Arithmetic, except that the conversion is always rounded toward zero. If
5891 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
5892 | conversion overflows, the largest integer with the same sign as `a' is
5894 *----------------------------------------------------------------------------*/
5896 int32_t float128_to_int32_round_to_zero(float128 a
, float_status
*status
)
5899 int32_t aExp
, shiftCount
;
5900 uint64_t aSig0
, aSig1
, savedASig
;
5903 aSig1
= extractFloat128Frac1( a
);
5904 aSig0
= extractFloat128Frac0( a
);
5905 aExp
= extractFloat128Exp( a
);
5906 aSign
= extractFloat128Sign( a
);
5907 aSig0
|= ( aSig1
!= 0 );
5908 if ( 0x401E < aExp
) {
5909 if ( ( aExp
== 0x7FFF ) && aSig0
) aSign
= 0;
5912 else if ( aExp
< 0x3FFF ) {
5913 if (aExp
|| aSig0
) {
5914 status
->float_exception_flags
|= float_flag_inexact
;
5918 aSig0
|= LIT64( 0x0001000000000000 );
5919 shiftCount
= 0x402F - aExp
;
5921 aSig0
>>= shiftCount
;
5923 if ( aSign
) z
= - z
;
5924 if ( ( z
< 0 ) ^ aSign
) {
5926 float_raise(float_flag_invalid
, status
);
5927 return aSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
5929 if ( ( aSig0
<<shiftCount
) != savedASig
) {
5930 status
->float_exception_flags
|= float_flag_inexact
;
5936 /*----------------------------------------------------------------------------
5937 | Returns the result of converting the quadruple-precision floating-point
5938 | value `a' to the 64-bit two's complement integer format. The conversion
5939 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5940 | Arithmetic---which means in particular that the conversion is rounded
5941 | according to the current rounding mode. If `a' is a NaN, the largest
5942 | positive integer is returned. Otherwise, if the conversion overflows, the
5943 | largest integer with the same sign as `a' is returned.
5944 *----------------------------------------------------------------------------*/
5946 int64_t float128_to_int64(float128 a
, float_status
*status
)
5949 int32_t aExp
, shiftCount
;
5950 uint64_t aSig0
, aSig1
;
5952 aSig1
= extractFloat128Frac1( a
);
5953 aSig0
= extractFloat128Frac0( a
);
5954 aExp
= extractFloat128Exp( a
);
5955 aSign
= extractFloat128Sign( a
);
5956 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
5957 shiftCount
= 0x402F - aExp
;
5958 if ( shiftCount
<= 0 ) {
5959 if ( 0x403E < aExp
) {
5960 float_raise(float_flag_invalid
, status
);
5962 || ( ( aExp
== 0x7FFF )
5963 && ( aSig1
|| ( aSig0
!= LIT64( 0x0001000000000000 ) ) )
5966 return LIT64( 0x7FFFFFFFFFFFFFFF );
5968 return (int64_t) LIT64( 0x8000000000000000 );
5970 shortShift128Left( aSig0
, aSig1
, - shiftCount
, &aSig0
, &aSig1
);
5973 shift64ExtraRightJamming( aSig0
, aSig1
, shiftCount
, &aSig0
, &aSig1
);
5975 return roundAndPackInt64(aSign
, aSig0
, aSig1
, status
);
5979 /*----------------------------------------------------------------------------
5980 | Returns the result of converting the quadruple-precision floating-point
5981 | value `a' to the 64-bit two's complement integer format. The conversion
5982 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5983 | Arithmetic, except that the conversion is always rounded toward zero.
5984 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
5985 | the conversion overflows, the largest integer with the same sign as `a' is
5987 *----------------------------------------------------------------------------*/
5989 int64_t float128_to_int64_round_to_zero(float128 a
, float_status
*status
)
5992 int32_t aExp
, shiftCount
;
5993 uint64_t aSig0
, aSig1
;
5996 aSig1
= extractFloat128Frac1( a
);
5997 aSig0
= extractFloat128Frac0( a
);
5998 aExp
= extractFloat128Exp( a
);
5999 aSign
= extractFloat128Sign( a
);
6000 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
6001 shiftCount
= aExp
- 0x402F;
6002 if ( 0 < shiftCount
) {
6003 if ( 0x403E <= aExp
) {
6004 aSig0
&= LIT64( 0x0000FFFFFFFFFFFF );
6005 if ( ( a
.high
== LIT64( 0xC03E000000000000 ) )
6006 && ( aSig1
< LIT64( 0x0002000000000000 ) ) ) {
6008 status
->float_exception_flags
|= float_flag_inexact
;
6012 float_raise(float_flag_invalid
, status
);
6013 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) ) {
6014 return LIT64( 0x7FFFFFFFFFFFFFFF );
6017 return (int64_t) LIT64( 0x8000000000000000 );
6019 z
= ( aSig0
<<shiftCount
) | ( aSig1
>>( ( - shiftCount
) & 63 ) );
6020 if ( (uint64_t) ( aSig1
<<shiftCount
) ) {
6021 status
->float_exception_flags
|= float_flag_inexact
;
6025 if ( aExp
< 0x3FFF ) {
6026 if ( aExp
| aSig0
| aSig1
) {
6027 status
->float_exception_flags
|= float_flag_inexact
;
6031 z
= aSig0
>>( - shiftCount
);
6033 || ( shiftCount
&& (uint64_t) ( aSig0
<<( shiftCount
& 63 ) ) ) ) {
6034 status
->float_exception_flags
|= float_flag_inexact
;
6037 if ( aSign
) z
= - z
;
6042 /*----------------------------------------------------------------------------
6043 | Returns the result of converting the quadruple-precision floating-point
6044 | value `a' to the single-precision floating-point format. The conversion
6045 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6047 *----------------------------------------------------------------------------*/
6049 float32
float128_to_float32(float128 a
, float_status
*status
)
6053 uint64_t aSig0
, aSig1
;
6056 aSig1
= extractFloat128Frac1( a
);
6057 aSig0
= extractFloat128Frac0( a
);
6058 aExp
= extractFloat128Exp( a
);
6059 aSign
= extractFloat128Sign( a
);
6060 if ( aExp
== 0x7FFF ) {
6061 if ( aSig0
| aSig1
) {
6062 return commonNaNToFloat32(float128ToCommonNaN(a
, status
), status
);
6064 return packFloat32( aSign
, 0xFF, 0 );
6066 aSig0
|= ( aSig1
!= 0 );
6067 shift64RightJamming( aSig0
, 18, &aSig0
);
6069 if ( aExp
|| zSig
) {
6073 return roundAndPackFloat32(aSign
, aExp
, zSig
, status
);
6077 /*----------------------------------------------------------------------------
6078 | Returns the result of converting the quadruple-precision floating-point
6079 | value `a' to the double-precision floating-point format. The conversion
6080 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6082 *----------------------------------------------------------------------------*/
6084 float64
float128_to_float64(float128 a
, float_status
*status
)
6088 uint64_t aSig0
, aSig1
;
6090 aSig1
= extractFloat128Frac1( a
);
6091 aSig0
= extractFloat128Frac0( a
);
6092 aExp
= extractFloat128Exp( a
);
6093 aSign
= extractFloat128Sign( a
);
6094 if ( aExp
== 0x7FFF ) {
6095 if ( aSig0
| aSig1
) {
6096 return commonNaNToFloat64(float128ToCommonNaN(a
, status
), status
);
6098 return packFloat64( aSign
, 0x7FF, 0 );
6100 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
6101 aSig0
|= ( aSig1
!= 0 );
6102 if ( aExp
|| aSig0
) {
6103 aSig0
|= LIT64( 0x4000000000000000 );
6106 return roundAndPackFloat64(aSign
, aExp
, aSig0
, status
);
6110 /*----------------------------------------------------------------------------
6111 | Returns the result of converting the quadruple-precision floating-point
6112 | value `a' to the extended double-precision floating-point format. The
6113 | conversion is performed according to the IEC/IEEE Standard for Binary
6114 | Floating-Point Arithmetic.
6115 *----------------------------------------------------------------------------*/
6117 floatx80
float128_to_floatx80(float128 a
, float_status
*status
)
6121 uint64_t aSig0
, aSig1
;
6123 aSig1
= extractFloat128Frac1( a
);
6124 aSig0
= extractFloat128Frac0( a
);
6125 aExp
= extractFloat128Exp( a
);
6126 aSign
= extractFloat128Sign( a
);
6127 if ( aExp
== 0x7FFF ) {
6128 if ( aSig0
| aSig1
) {
6129 return commonNaNToFloatx80(float128ToCommonNaN(a
, status
), status
);
6131 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
6134 if ( ( aSig0
| aSig1
) == 0 ) return packFloatx80( aSign
, 0, 0 );
6135 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6138 aSig0
|= LIT64( 0x0001000000000000 );
6140 shortShift128Left( aSig0
, aSig1
, 15, &aSig0
, &aSig1
);
6141 return roundAndPackFloatx80(80, aSign
, aExp
, aSig0
, aSig1
, status
);
6145 /*----------------------------------------------------------------------------
6146 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6147 | returns the result as a quadruple-precision floating-point value. The
6148 | operation is performed according to the IEC/IEEE Standard for Binary
6149 | Floating-Point Arithmetic.
6150 *----------------------------------------------------------------------------*/
6152 float128
float128_round_to_int(float128 a
, float_status
*status
)
6156 uint64_t lastBitMask
, roundBitsMask
;
6159 aExp
= extractFloat128Exp( a
);
6160 if ( 0x402F <= aExp
) {
6161 if ( 0x406F <= aExp
) {
6162 if ( ( aExp
== 0x7FFF )
6163 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) )
6165 return propagateFloat128NaN(a
, a
, status
);
6170 lastBitMask
= ( lastBitMask
<<( 0x406E - aExp
) )<<1;
6171 roundBitsMask
= lastBitMask
- 1;
6173 switch (status
->float_rounding_mode
) {
6174 case float_round_nearest_even
:
6175 if ( lastBitMask
) {
6176 add128( z
.high
, z
.low
, 0, lastBitMask
>>1, &z
.high
, &z
.low
);
6177 if ( ( z
.low
& roundBitsMask
) == 0 ) z
.low
&= ~ lastBitMask
;
6180 if ( (int64_t) z
.low
< 0 ) {
6182 if ( (uint64_t) ( z
.low
<<1 ) == 0 ) z
.high
&= ~1;
6186 case float_round_ties_away
:
6188 add128(z
.high
, z
.low
, 0, lastBitMask
>> 1, &z
.high
, &z
.low
);
6190 if ((int64_t) z
.low
< 0) {
6195 case float_round_to_zero
:
6197 case float_round_up
:
6198 if (!extractFloat128Sign(z
)) {
6199 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
6202 case float_round_down
:
6203 if (extractFloat128Sign(z
)) {
6204 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
6210 z
.low
&= ~ roundBitsMask
;
6213 if ( aExp
< 0x3FFF ) {
6214 if ( ( ( (uint64_t) ( a
.high
<<1 ) ) | a
.low
) == 0 ) return a
;
6215 status
->float_exception_flags
|= float_flag_inexact
;
6216 aSign
= extractFloat128Sign( a
);
6217 switch (status
->float_rounding_mode
) {
6218 case float_round_nearest_even
:
6219 if ( ( aExp
== 0x3FFE )
6220 && ( extractFloat128Frac0( a
)
6221 | extractFloat128Frac1( a
) )
6223 return packFloat128( aSign
, 0x3FFF, 0, 0 );
6226 case float_round_ties_away
:
6227 if (aExp
== 0x3FFE) {
6228 return packFloat128(aSign
, 0x3FFF, 0, 0);
6231 case float_round_down
:
6233 aSign
? packFloat128( 1, 0x3FFF, 0, 0 )
6234 : packFloat128( 0, 0, 0, 0 );
6235 case float_round_up
:
6237 aSign
? packFloat128( 1, 0, 0, 0 )
6238 : packFloat128( 0, 0x3FFF, 0, 0 );
6240 return packFloat128( aSign
, 0, 0, 0 );
6243 lastBitMask
<<= 0x402F - aExp
;
6244 roundBitsMask
= lastBitMask
- 1;
6247 switch (status
->float_rounding_mode
) {
6248 case float_round_nearest_even
:
6249 z
.high
+= lastBitMask
>>1;
6250 if ( ( ( z
.high
& roundBitsMask
) | a
.low
) == 0 ) {
6251 z
.high
&= ~ lastBitMask
;
6254 case float_round_ties_away
:
6255 z
.high
+= lastBitMask
>>1;
6257 case float_round_to_zero
:
6259 case float_round_up
:
6260 if (!extractFloat128Sign(z
)) {
6261 z
.high
|= ( a
.low
!= 0 );
6262 z
.high
+= roundBitsMask
;
6265 case float_round_down
:
6266 if (extractFloat128Sign(z
)) {
6267 z
.high
|= (a
.low
!= 0);
6268 z
.high
+= roundBitsMask
;
6274 z
.high
&= ~ roundBitsMask
;
6276 if ( ( z
.low
!= a
.low
) || ( z
.high
!= a
.high
) ) {
6277 status
->float_exception_flags
|= float_flag_inexact
;
6283 /*----------------------------------------------------------------------------
6284 | Returns the result of adding the absolute values of the quadruple-precision
6285 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
6286 | before being returned. `zSign' is ignored if the result is a NaN.
6287 | The addition is performed according to the IEC/IEEE Standard for Binary
6288 | Floating-Point Arithmetic.
6289 *----------------------------------------------------------------------------*/
6291 static float128
addFloat128Sigs(float128 a
, float128 b
, flag zSign
,
6292 float_status
*status
)
6294 int32_t aExp
, bExp
, zExp
;
6295 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
6298 aSig1
= extractFloat128Frac1( a
);
6299 aSig0
= extractFloat128Frac0( a
);
6300 aExp
= extractFloat128Exp( a
);
6301 bSig1
= extractFloat128Frac1( b
);
6302 bSig0
= extractFloat128Frac0( b
);
6303 bExp
= extractFloat128Exp( b
);
6304 expDiff
= aExp
- bExp
;
6305 if ( 0 < expDiff
) {
6306 if ( aExp
== 0x7FFF ) {
6307 if (aSig0
| aSig1
) {
6308 return propagateFloat128NaN(a
, b
, status
);
6316 bSig0
|= LIT64( 0x0001000000000000 );
6318 shift128ExtraRightJamming(
6319 bSig0
, bSig1
, 0, expDiff
, &bSig0
, &bSig1
, &zSig2
);
6322 else if ( expDiff
< 0 ) {
6323 if ( bExp
== 0x7FFF ) {
6324 if (bSig0
| bSig1
) {
6325 return propagateFloat128NaN(a
, b
, status
);
6327 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6333 aSig0
|= LIT64( 0x0001000000000000 );
6335 shift128ExtraRightJamming(
6336 aSig0
, aSig1
, 0, - expDiff
, &aSig0
, &aSig1
, &zSig2
);
6340 if ( aExp
== 0x7FFF ) {
6341 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
6342 return propagateFloat128NaN(a
, b
, status
);
6346 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
6348 if (status
->flush_to_zero
) {
6349 if (zSig0
| zSig1
) {
6350 float_raise(float_flag_output_denormal
, status
);
6352 return packFloat128(zSign
, 0, 0, 0);
6354 return packFloat128( zSign
, 0, zSig0
, zSig1
);
6357 zSig0
|= LIT64( 0x0002000000000000 );
6361 aSig0
|= LIT64( 0x0001000000000000 );
6362 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
6364 if ( zSig0
< LIT64( 0x0002000000000000 ) ) goto roundAndPack
;
6367 shift128ExtraRightJamming(
6368 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
6370 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
6374 /*----------------------------------------------------------------------------
6375 | Returns the result of subtracting the absolute values of the quadruple-
6376 | precision floating-point values `a' and `b'. If `zSign' is 1, the
6377 | difference is negated before being returned. `zSign' is ignored if the
6378 | result is a NaN. The subtraction is performed according to the IEC/IEEE
6379 | Standard for Binary Floating-Point Arithmetic.
6380 *----------------------------------------------------------------------------*/
6382 static float128
subFloat128Sigs(float128 a
, float128 b
, flag zSign
,
6383 float_status
*status
)
6385 int32_t aExp
, bExp
, zExp
;
6386 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
;
6390 aSig1
= extractFloat128Frac1( a
);
6391 aSig0
= extractFloat128Frac0( a
);
6392 aExp
= extractFloat128Exp( a
);
6393 bSig1
= extractFloat128Frac1( b
);
6394 bSig0
= extractFloat128Frac0( b
);
6395 bExp
= extractFloat128Exp( b
);
6396 expDiff
= aExp
- bExp
;
6397 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
6398 shortShift128Left( bSig0
, bSig1
, 14, &bSig0
, &bSig1
);
6399 if ( 0 < expDiff
) goto aExpBigger
;
6400 if ( expDiff
< 0 ) goto bExpBigger
;
6401 if ( aExp
== 0x7FFF ) {
6402 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
6403 return propagateFloat128NaN(a
, b
, status
);
6405 float_raise(float_flag_invalid
, status
);
6406 z
.low
= float128_default_nan_low
;
6407 z
.high
= float128_default_nan_high
;
6414 if ( bSig0
< aSig0
) goto aBigger
;
6415 if ( aSig0
< bSig0
) goto bBigger
;
6416 if ( bSig1
< aSig1
) goto aBigger
;
6417 if ( aSig1
< bSig1
) goto bBigger
;
6418 return packFloat128(status
->float_rounding_mode
== float_round_down
,
6421 if ( bExp
== 0x7FFF ) {
6422 if (bSig0
| bSig1
) {
6423 return propagateFloat128NaN(a
, b
, status
);
6425 return packFloat128( zSign
^ 1, 0x7FFF, 0, 0 );
6431 aSig0
|= LIT64( 0x4000000000000000 );
6433 shift128RightJamming( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
6434 bSig0
|= LIT64( 0x4000000000000000 );
6436 sub128( bSig0
, bSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
6439 goto normalizeRoundAndPack
;
6441 if ( aExp
== 0x7FFF ) {
6442 if (aSig0
| aSig1
) {
6443 return propagateFloat128NaN(a
, b
, status
);
6451 bSig0
|= LIT64( 0x4000000000000000 );
6453 shift128RightJamming( bSig0
, bSig1
, expDiff
, &bSig0
, &bSig1
);
6454 aSig0
|= LIT64( 0x4000000000000000 );
6456 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
6458 normalizeRoundAndPack
:
6460 return normalizeRoundAndPackFloat128(zSign
, zExp
- 14, zSig0
, zSig1
,
6465 /*----------------------------------------------------------------------------
6466 | Returns the result of adding the quadruple-precision floating-point values
6467 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
6468 | for Binary Floating-Point Arithmetic.
6469 *----------------------------------------------------------------------------*/
6471 float128
float128_add(float128 a
, float128 b
, float_status
*status
)
6475 aSign
= extractFloat128Sign( a
);
6476 bSign
= extractFloat128Sign( b
);
6477 if ( aSign
== bSign
) {
6478 return addFloat128Sigs(a
, b
, aSign
, status
);
6481 return subFloat128Sigs(a
, b
, aSign
, status
);
6486 /*----------------------------------------------------------------------------
6487 | Returns the result of subtracting the quadruple-precision floating-point
6488 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6489 | Standard for Binary Floating-Point Arithmetic.
6490 *----------------------------------------------------------------------------*/
6492 float128
float128_sub(float128 a
, float128 b
, float_status
*status
)
6496 aSign
= extractFloat128Sign( a
);
6497 bSign
= extractFloat128Sign( b
);
6498 if ( aSign
== bSign
) {
6499 return subFloat128Sigs(a
, b
, aSign
, status
);
6502 return addFloat128Sigs(a
, b
, aSign
, status
);
6507 /*----------------------------------------------------------------------------
6508 | Returns the result of multiplying the quadruple-precision floating-point
6509 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6510 | Standard for Binary Floating-Point Arithmetic.
6511 *----------------------------------------------------------------------------*/
6513 float128
float128_mul(float128 a
, float128 b
, float_status
*status
)
6515 flag aSign
, bSign
, zSign
;
6516 int32_t aExp
, bExp
, zExp
;
6517 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
, zSig3
;
6520 aSig1
= extractFloat128Frac1( a
);
6521 aSig0
= extractFloat128Frac0( a
);
6522 aExp
= extractFloat128Exp( a
);
6523 aSign
= extractFloat128Sign( a
);
6524 bSig1
= extractFloat128Frac1( b
);
6525 bSig0
= extractFloat128Frac0( b
);
6526 bExp
= extractFloat128Exp( b
);
6527 bSign
= extractFloat128Sign( b
);
6528 zSign
= aSign
^ bSign
;
6529 if ( aExp
== 0x7FFF ) {
6530 if ( ( aSig0
| aSig1
)
6531 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
6532 return propagateFloat128NaN(a
, b
, status
);
6534 if ( ( bExp
| bSig0
| bSig1
) == 0 ) goto invalid
;
6535 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6537 if ( bExp
== 0x7FFF ) {
6538 if (bSig0
| bSig1
) {
6539 return propagateFloat128NaN(a
, b
, status
);
6541 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
6543 float_raise(float_flag_invalid
, status
);
6544 z
.low
= float128_default_nan_low
;
6545 z
.high
= float128_default_nan_high
;
6548 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6551 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
6552 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6555 if ( ( bSig0
| bSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
6556 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
6558 zExp
= aExp
+ bExp
- 0x4000;
6559 aSig0
|= LIT64( 0x0001000000000000 );
6560 shortShift128Left( bSig0
, bSig1
, 16, &bSig0
, &bSig1
);
6561 mul128To256( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
, &zSig2
, &zSig3
);
6562 add128( zSig0
, zSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
6563 zSig2
|= ( zSig3
!= 0 );
6564 if ( LIT64( 0x0002000000000000 ) <= zSig0
) {
6565 shift128ExtraRightJamming(
6566 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
6569 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
6573 /*----------------------------------------------------------------------------
6574 | Returns the result of dividing the quadruple-precision floating-point value
6575 | `a' by the corresponding value `b'. The operation is performed according to
6576 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6577 *----------------------------------------------------------------------------*/
6579 float128
float128_div(float128 a
, float128 b
, float_status
*status
)
6581 flag aSign
, bSign
, zSign
;
6582 int32_t aExp
, bExp
, zExp
;
6583 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
6584 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
6587 aSig1
= extractFloat128Frac1( a
);
6588 aSig0
= extractFloat128Frac0( a
);
6589 aExp
= extractFloat128Exp( a
);
6590 aSign
= extractFloat128Sign( a
);
6591 bSig1
= extractFloat128Frac1( b
);
6592 bSig0
= extractFloat128Frac0( b
);
6593 bExp
= extractFloat128Exp( b
);
6594 bSign
= extractFloat128Sign( b
);
6595 zSign
= aSign
^ bSign
;
6596 if ( aExp
== 0x7FFF ) {
6597 if (aSig0
| aSig1
) {
6598 return propagateFloat128NaN(a
, b
, status
);
6600 if ( bExp
== 0x7FFF ) {
6601 if (bSig0
| bSig1
) {
6602 return propagateFloat128NaN(a
, b
, status
);
6606 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6608 if ( bExp
== 0x7FFF ) {
6609 if (bSig0
| bSig1
) {
6610 return propagateFloat128NaN(a
, b
, status
);
6612 return packFloat128( zSign
, 0, 0, 0 );
6615 if ( ( bSig0
| bSig1
) == 0 ) {
6616 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
6618 float_raise(float_flag_invalid
, status
);
6619 z
.low
= float128_default_nan_low
;
6620 z
.high
= float128_default_nan_high
;
6623 float_raise(float_flag_divbyzero
, status
);
6624 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6626 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
6629 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
6630 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6632 zExp
= aExp
- bExp
+ 0x3FFD;
6634 aSig0
| LIT64( 0x0001000000000000 ), aSig1
, 15, &aSig0
, &aSig1
);
6636 bSig0
| LIT64( 0x0001000000000000 ), bSig1
, 15, &bSig0
, &bSig1
);
6637 if ( le128( bSig0
, bSig1
, aSig0
, aSig1
) ) {
6638 shift128Right( aSig0
, aSig1
, 1, &aSig0
, &aSig1
);
6641 zSig0
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
6642 mul128By64To192( bSig0
, bSig1
, zSig0
, &term0
, &term1
, &term2
);
6643 sub192( aSig0
, aSig1
, 0, term0
, term1
, term2
, &rem0
, &rem1
, &rem2
);
6644 while ( (int64_t) rem0
< 0 ) {
6646 add192( rem0
, rem1
, rem2
, 0, bSig0
, bSig1
, &rem0
, &rem1
, &rem2
);
6648 zSig1
= estimateDiv128To64( rem1
, rem2
, bSig0
);
6649 if ( ( zSig1
& 0x3FFF ) <= 4 ) {
6650 mul128By64To192( bSig0
, bSig1
, zSig1
, &term1
, &term2
, &term3
);
6651 sub192( rem1
, rem2
, 0, term1
, term2
, term3
, &rem1
, &rem2
, &rem3
);
6652 while ( (int64_t) rem1
< 0 ) {
6654 add192( rem1
, rem2
, rem3
, 0, bSig0
, bSig1
, &rem1
, &rem2
, &rem3
);
6656 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
6658 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 15, &zSig0
, &zSig1
, &zSig2
);
6659 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
6663 /*----------------------------------------------------------------------------
6664 | Returns the remainder of the quadruple-precision floating-point value `a'
6665 | with respect to the corresponding value `b'. The operation is performed
6666 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6667 *----------------------------------------------------------------------------*/
6669 float128
float128_rem(float128 a
, float128 b
, float_status
*status
)
6672 int32_t aExp
, bExp
, expDiff
;
6673 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, q
, term0
, term1
, term2
;
6674 uint64_t allZero
, alternateASig0
, alternateASig1
, sigMean1
;
6678 aSig1
= extractFloat128Frac1( a
);
6679 aSig0
= extractFloat128Frac0( a
);
6680 aExp
= extractFloat128Exp( a
);
6681 aSign
= extractFloat128Sign( a
);
6682 bSig1
= extractFloat128Frac1( b
);
6683 bSig0
= extractFloat128Frac0( b
);
6684 bExp
= extractFloat128Exp( b
);
6685 if ( aExp
== 0x7FFF ) {
6686 if ( ( aSig0
| aSig1
)
6687 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
6688 return propagateFloat128NaN(a
, b
, status
);
6692 if ( bExp
== 0x7FFF ) {
6693 if (bSig0
| bSig1
) {
6694 return propagateFloat128NaN(a
, b
, status
);
6699 if ( ( bSig0
| bSig1
) == 0 ) {
6701 float_raise(float_flag_invalid
, status
);
6702 z
.low
= float128_default_nan_low
;
6703 z
.high
= float128_default_nan_high
;
6706 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
6709 if ( ( aSig0
| aSig1
) == 0 ) return a
;
6710 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6712 expDiff
= aExp
- bExp
;
6713 if ( expDiff
< -1 ) return a
;
6715 aSig0
| LIT64( 0x0001000000000000 ),
6717 15 - ( expDiff
< 0 ),
6722 bSig0
| LIT64( 0x0001000000000000 ), bSig1
, 15, &bSig0
, &bSig1
);
6723 q
= le128( bSig0
, bSig1
, aSig0
, aSig1
);
6724 if ( q
) sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
6726 while ( 0 < expDiff
) {
6727 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
6728 q
= ( 4 < q
) ? q
- 4 : 0;
6729 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
6730 shortShift192Left( term0
, term1
, term2
, 61, &term1
, &term2
, &allZero
);
6731 shortShift128Left( aSig0
, aSig1
, 61, &aSig0
, &allZero
);
6732 sub128( aSig0
, 0, term1
, term2
, &aSig0
, &aSig1
);
6735 if ( -64 < expDiff
) {
6736 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
6737 q
= ( 4 < q
) ? q
- 4 : 0;
6739 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
6741 if ( expDiff
< 0 ) {
6742 shift128Right( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
6745 shortShift128Left( aSig0
, aSig1
, expDiff
, &aSig0
, &aSig1
);
6747 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
6748 sub128( aSig0
, aSig1
, term1
, term2
, &aSig0
, &aSig1
);
6751 shift128Right( aSig0
, aSig1
, 12, &aSig0
, &aSig1
);
6752 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
6755 alternateASig0
= aSig0
;
6756 alternateASig1
= aSig1
;
6758 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
6759 } while ( 0 <= (int64_t) aSig0
);
6761 aSig0
, aSig1
, alternateASig0
, alternateASig1
, (uint64_t *)&sigMean0
, &sigMean1
);
6762 if ( ( sigMean0
< 0 )
6763 || ( ( ( sigMean0
| sigMean1
) == 0 ) && ( q
& 1 ) ) ) {
6764 aSig0
= alternateASig0
;
6765 aSig1
= alternateASig1
;
6767 zSign
= ( (int64_t) aSig0
< 0 );
6768 if ( zSign
) sub128( 0, 0, aSig0
, aSig1
, &aSig0
, &aSig1
);
6769 return normalizeRoundAndPackFloat128(aSign
^ zSign
, bExp
- 4, aSig0
, aSig1
,
6773 /*----------------------------------------------------------------------------
6774 | Returns the square root of the quadruple-precision floating-point value `a'.
6775 | The operation is performed according to the IEC/IEEE Standard for Binary
6776 | Floating-Point Arithmetic.
6777 *----------------------------------------------------------------------------*/
6779 float128
float128_sqrt(float128 a
, float_status
*status
)
6783 uint64_t aSig0
, aSig1
, zSig0
, zSig1
, zSig2
, doubleZSig0
;
6784 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
6787 aSig1
= extractFloat128Frac1( a
);
6788 aSig0
= extractFloat128Frac0( a
);
6789 aExp
= extractFloat128Exp( a
);
6790 aSign
= extractFloat128Sign( a
);
6791 if ( aExp
== 0x7FFF ) {
6792 if (aSig0
| aSig1
) {
6793 return propagateFloat128NaN(a
, a
, status
);
6795 if ( ! aSign
) return a
;
6799 if ( ( aExp
| aSig0
| aSig1
) == 0 ) return a
;
6801 float_raise(float_flag_invalid
, status
);
6802 z
.low
= float128_default_nan_low
;
6803 z
.high
= float128_default_nan_high
;
6807 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( 0, 0, 0, 0 );
6808 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6810 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFE;
6811 aSig0
|= LIT64( 0x0001000000000000 );
6812 zSig0
= estimateSqrt32( aExp
, aSig0
>>17 );
6813 shortShift128Left( aSig0
, aSig1
, 13 - ( aExp
& 1 ), &aSig0
, &aSig1
);
6814 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
6815 doubleZSig0
= zSig0
<<1;
6816 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
6817 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
6818 while ( (int64_t) rem0
< 0 ) {
6821 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
6823 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
6824 if ( ( zSig1
& 0x1FFF ) <= 5 ) {
6825 if ( zSig1
== 0 ) zSig1
= 1;
6826 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
6827 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
6828 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
6829 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
6830 while ( (int64_t) rem1
< 0 ) {
6832 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
6834 term2
|= doubleZSig0
;
6835 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
6837 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
6839 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 14, &zSig0
, &zSig1
, &zSig2
);
6840 return roundAndPackFloat128(0, zExp
, zSig0
, zSig1
, zSig2
, status
);
6844 /*----------------------------------------------------------------------------
6845 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6846 | the corresponding value `b', and 0 otherwise. The invalid exception is
6847 | raised if either operand is a NaN. Otherwise, the comparison is performed
6848 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6849 *----------------------------------------------------------------------------*/
6851 int float128_eq(float128 a
, float128 b
, float_status
*status
)
6854 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6855 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6856 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6857 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6859 float_raise(float_flag_invalid
, status
);
6864 && ( ( a
.high
== b
.high
)
6866 && ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
6871 /*----------------------------------------------------------------------------
6872 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6873 | or equal to the corresponding value `b', and 0 otherwise. The invalid
6874 | exception is raised if either operand is a NaN. The comparison is performed
6875 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6876 *----------------------------------------------------------------------------*/
6878 int float128_le(float128 a
, float128 b
, float_status
*status
)
6882 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6883 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6884 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6885 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6887 float_raise(float_flag_invalid
, status
);
6890 aSign
= extractFloat128Sign( a
);
6891 bSign
= extractFloat128Sign( b
);
6892 if ( aSign
!= bSign
) {
6895 || ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
6899 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
6900 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
6904 /*----------------------------------------------------------------------------
6905 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6906 | the corresponding value `b', and 0 otherwise. The invalid exception is
6907 | raised if either operand is a NaN. The comparison is performed according
6908 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6909 *----------------------------------------------------------------------------*/
6911 int float128_lt(float128 a
, float128 b
, float_status
*status
)
6915 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6916 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6917 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6918 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6920 float_raise(float_flag_invalid
, status
);
6923 aSign
= extractFloat128Sign( a
);
6924 bSign
= extractFloat128Sign( b
);
6925 if ( aSign
!= bSign
) {
6928 && ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
6932 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
6933 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
6937 /*----------------------------------------------------------------------------
6938 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6939 | be compared, and 0 otherwise. The invalid exception is raised if either
6940 | operand is a NaN. The comparison is performed according to the IEC/IEEE
6941 | Standard for Binary Floating-Point Arithmetic.
6942 *----------------------------------------------------------------------------*/
6944 int float128_unordered(float128 a
, float128 b
, float_status
*status
)
6946 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6947 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6948 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6949 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6951 float_raise(float_flag_invalid
, status
);
6957 /*----------------------------------------------------------------------------
6958 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6959 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6960 | exception. The comparison is performed according to the IEC/IEEE Standard
6961 | for Binary Floating-Point Arithmetic.
6962 *----------------------------------------------------------------------------*/
6964 int float128_eq_quiet(float128 a
, float128 b
, float_status
*status
)
6967 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6968 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6969 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6970 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6972 if ( float128_is_signaling_nan( a
)
6973 || float128_is_signaling_nan( b
) ) {
6974 float_raise(float_flag_invalid
, status
);
6980 && ( ( a
.high
== b
.high
)
6982 && ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
6987 /*----------------------------------------------------------------------------
6988 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6989 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
6990 | cause an exception. Otherwise, the comparison is performed according to the
6991 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6992 *----------------------------------------------------------------------------*/
6994 int float128_le_quiet(float128 a
, float128 b
, float_status
*status
)
6998 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6999 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
7000 || ( ( extractFloat128Exp( b
) == 0x7FFF )
7001 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
7003 if ( float128_is_signaling_nan( a
)
7004 || float128_is_signaling_nan( b
) ) {
7005 float_raise(float_flag_invalid
, status
);
7009 aSign
= extractFloat128Sign( a
);
7010 bSign
= extractFloat128Sign( b
);
7011 if ( aSign
!= bSign
) {
7014 || ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
7018 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
7019 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
7023 /*----------------------------------------------------------------------------
7024 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7025 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
7026 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
7027 | Standard for Binary Floating-Point Arithmetic.
7028 *----------------------------------------------------------------------------*/
7030 int float128_lt_quiet(float128 a
, float128 b
, float_status
*status
)
7034 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
7035 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
7036 || ( ( extractFloat128Exp( b
) == 0x7FFF )
7037 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
7039 if ( float128_is_signaling_nan( a
)
7040 || float128_is_signaling_nan( b
) ) {
7041 float_raise(float_flag_invalid
, status
);
7045 aSign
= extractFloat128Sign( a
);
7046 bSign
= extractFloat128Sign( b
);
7047 if ( aSign
!= bSign
) {
7050 && ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
7054 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
7055 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
7059 /*----------------------------------------------------------------------------
7060 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7061 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
7062 | comparison is performed according to the IEC/IEEE Standard for Binary
7063 | Floating-Point Arithmetic.
7064 *----------------------------------------------------------------------------*/
7066 int float128_unordered_quiet(float128 a
, float128 b
, float_status
*status
)
7068 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
7069 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
7070 || ( ( extractFloat128Exp( b
) == 0x7FFF )
7071 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
7073 if ( float128_is_signaling_nan( a
)
7074 || float128_is_signaling_nan( b
) ) {
7075 float_raise(float_flag_invalid
, status
);
7082 /* misc functions */
7083 float32
uint32_to_float32(uint32_t a
, float_status
*status
)
7085 return int64_to_float32(a
, status
);
7088 float64
uint32_to_float64(uint32_t a
, float_status
*status
)
7090 return int64_to_float64(a
, status
);
7093 uint32_t float32_to_uint32(float32 a
, float_status
*status
)
7097 int old_exc_flags
= get_float_exception_flags(status
);
7099 v
= float32_to_int64(a
, status
);
7102 } else if (v
> 0xffffffff) {
7107 set_float_exception_flags(old_exc_flags
, status
);
7108 float_raise(float_flag_invalid
, status
);
7112 uint32_t float32_to_uint32_round_to_zero(float32 a
, float_status
*status
)
7116 int old_exc_flags
= get_float_exception_flags(status
);
7118 v
= float32_to_int64_round_to_zero(a
, status
);
7121 } else if (v
> 0xffffffff) {
7126 set_float_exception_flags(old_exc_flags
, status
);
7127 float_raise(float_flag_invalid
, status
);
7131 int16_t float32_to_int16(float32 a
, float_status
*status
)
7135 int old_exc_flags
= get_float_exception_flags(status
);
7137 v
= float32_to_int32(a
, status
);
7140 } else if (v
> 0x7fff) {
7146 set_float_exception_flags(old_exc_flags
, status
);
7147 float_raise(float_flag_invalid
, status
);
7151 uint16_t float32_to_uint16(float32 a
, float_status
*status
)
7155 int old_exc_flags
= get_float_exception_flags(status
);
7157 v
= float32_to_int32(a
, status
);
7160 } else if (v
> 0xffff) {
7166 set_float_exception_flags(old_exc_flags
, status
);
7167 float_raise(float_flag_invalid
, status
);
7171 uint16_t float32_to_uint16_round_to_zero(float32 a
, float_status
*status
)
7175 int old_exc_flags
= get_float_exception_flags(status
);
7177 v
= float32_to_int64_round_to_zero(a
, status
);
7180 } else if (v
> 0xffff) {
7185 set_float_exception_flags(old_exc_flags
, status
);
7186 float_raise(float_flag_invalid
, status
);
7190 uint32_t float64_to_uint32(float64 a
, float_status
*status
)
7194 int old_exc_flags
= get_float_exception_flags(status
);
7196 v
= float64_to_uint64(a
, status
);
7197 if (v
> 0xffffffff) {
7202 set_float_exception_flags(old_exc_flags
, status
);
7203 float_raise(float_flag_invalid
, status
);
7207 uint32_t float64_to_uint32_round_to_zero(float64 a
, float_status
*status
)
7211 int old_exc_flags
= get_float_exception_flags(status
);
7213 v
= float64_to_uint64_round_to_zero(a
, status
);
7214 if (v
> 0xffffffff) {
7219 set_float_exception_flags(old_exc_flags
, status
);
7220 float_raise(float_flag_invalid
, status
);
7224 int16_t float64_to_int16(float64 a
, float_status
*status
)
7228 int old_exc_flags
= get_float_exception_flags(status
);
7230 v
= float64_to_int32(a
, status
);
7233 } else if (v
> 0x7fff) {
7239 set_float_exception_flags(old_exc_flags
, status
);
7240 float_raise(float_flag_invalid
, status
);
7244 uint16_t float64_to_uint16(float64 a
, float_status
*status
)
7248 int old_exc_flags
= get_float_exception_flags(status
);
7250 v
= float64_to_int32(a
, status
);
7253 } else if (v
> 0xffff) {
7259 set_float_exception_flags(old_exc_flags
, status
);
7260 float_raise(float_flag_invalid
, status
);
7264 uint16_t float64_to_uint16_round_to_zero(float64 a
, float_status
*status
)
7268 int old_exc_flags
= get_float_exception_flags(status
);
7270 v
= float64_to_int64_round_to_zero(a
, status
);
7273 } else if (v
> 0xffff) {
7278 set_float_exception_flags(old_exc_flags
, status
);
7279 float_raise(float_flag_invalid
, status
);
7283 /*----------------------------------------------------------------------------
7284 | Returns the result of converting the double-precision floating-point value
7285 | `a' to the 64-bit unsigned integer format. The conversion is
7286 | performed according to the IEC/IEEE Standard for Binary Floating-Point
7287 | Arithmetic---which means in particular that the conversion is rounded
7288 | according to the current rounding mode. If `a' is a NaN, the largest
7289 | positive integer is returned. If the conversion overflows, the
7290 | largest unsigned integer is returned. If 'a' is negative, the value is
7291 | rounded and zero is returned; negative values that do not round to zero
7292 | will raise the inexact exception.
7293 *----------------------------------------------------------------------------*/
7295 uint64_t float64_to_uint64(float64 a
, float_status
*status
)
7300 uint64_t aSig
, aSigExtra
;
7301 a
= float64_squash_input_denormal(a
, status
);
7303 aSig
= extractFloat64Frac(a
);
7304 aExp
= extractFloat64Exp(a
);
7305 aSign
= extractFloat64Sign(a
);
7306 if (aSign
&& (aExp
> 1022)) {
7307 float_raise(float_flag_invalid
, status
);
7308 if (float64_is_any_nan(a
)) {
7309 return LIT64(0xFFFFFFFFFFFFFFFF);
7315 aSig
|= LIT64(0x0010000000000000);
7317 shiftCount
= 0x433 - aExp
;
7318 if (shiftCount
<= 0) {
7320 float_raise(float_flag_invalid
, status
);
7321 return LIT64(0xFFFFFFFFFFFFFFFF);
7324 aSig
<<= -shiftCount
;
7326 shift64ExtraRightJamming(aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
7328 return roundAndPackUint64(aSign
, aSig
, aSigExtra
, status
);
7331 uint64_t float64_to_uint64_round_to_zero(float64 a
, float_status
*status
)
7333 signed char current_rounding_mode
= status
->float_rounding_mode
;
7334 set_float_rounding_mode(float_round_to_zero
, status
);
7335 int64_t v
= float64_to_uint64(a
, status
);
7336 set_float_rounding_mode(current_rounding_mode
, status
);
7340 #define COMPARE(s, nan_exp) \
7341 static inline int float ## s ## _compare_internal(float ## s a, float ## s b,\
7342 int is_quiet, float_status *status) \
7344 flag aSign, bSign; \
7345 uint ## s ## _t av, bv; \
7346 a = float ## s ## _squash_input_denormal(a, status); \
7347 b = float ## s ## _squash_input_denormal(b, status); \
7349 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \
7350 extractFloat ## s ## Frac( a ) ) || \
7351 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \
7352 extractFloat ## s ## Frac( b ) )) { \
7354 float ## s ## _is_signaling_nan( a ) || \
7355 float ## s ## _is_signaling_nan( b ) ) { \
7356 float_raise(float_flag_invalid, status); \
7358 return float_relation_unordered; \
7360 aSign = extractFloat ## s ## Sign( a ); \
7361 bSign = extractFloat ## s ## Sign( b ); \
7362 av = float ## s ## _val(a); \
7363 bv = float ## s ## _val(b); \
7364 if ( aSign != bSign ) { \
7365 if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) { \
7367 return float_relation_equal; \
7369 return 1 - (2 * aSign); \
7373 return float_relation_equal; \
7375 return 1 - 2 * (aSign ^ ( av < bv )); \
7380 int float ## s ## _compare(float ## s a, float ## s b, float_status *status) \
7382 return float ## s ## _compare_internal(a, b, 0, status); \
7385 int float ## s ## _compare_quiet(float ## s a, float ## s b, \
7386 float_status *status) \
7388 return float ## s ## _compare_internal(a, b, 1, status); \
7394 static inline int floatx80_compare_internal(floatx80 a
, floatx80 b
,
7395 int is_quiet
, float_status
*status
)
7399 if (( ( extractFloatx80Exp( a
) == 0x7fff ) &&
7400 ( extractFloatx80Frac( a
)<<1 ) ) ||
7401 ( ( extractFloatx80Exp( b
) == 0x7fff ) &&
7402 ( extractFloatx80Frac( b
)<<1 ) )) {
7404 floatx80_is_signaling_nan( a
) ||
7405 floatx80_is_signaling_nan( b
) ) {
7406 float_raise(float_flag_invalid
, status
);
7408 return float_relation_unordered
;
7410 aSign
= extractFloatx80Sign( a
);
7411 bSign
= extractFloatx80Sign( b
);
7412 if ( aSign
!= bSign
) {
7414 if ( ( ( (uint16_t) ( ( a
.high
| b
.high
) << 1 ) ) == 0) &&
7415 ( ( a
.low
| b
.low
) == 0 ) ) {
7417 return float_relation_equal
;
7419 return 1 - (2 * aSign
);
7422 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
7423 return float_relation_equal
;
7425 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
7430 int floatx80_compare(floatx80 a
, floatx80 b
, float_status
*status
)
7432 return floatx80_compare_internal(a
, b
, 0, status
);
7435 int floatx80_compare_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
7437 return floatx80_compare_internal(a
, b
, 1, status
);
7440 static inline int float128_compare_internal(float128 a
, float128 b
,
7441 int is_quiet
, float_status
*status
)
7445 if (( ( extractFloat128Exp( a
) == 0x7fff ) &&
7446 ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) ) ||
7447 ( ( extractFloat128Exp( b
) == 0x7fff ) &&
7448 ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )) {
7450 float128_is_signaling_nan( a
) ||
7451 float128_is_signaling_nan( b
) ) {
7452 float_raise(float_flag_invalid
, status
);
7454 return float_relation_unordered
;
7456 aSign
= extractFloat128Sign( a
);
7457 bSign
= extractFloat128Sign( b
);
7458 if ( aSign
!= bSign
) {
7459 if ( ( ( ( a
.high
| b
.high
)<<1 ) | a
.low
| b
.low
) == 0 ) {
7461 return float_relation_equal
;
7463 return 1 - (2 * aSign
);
7466 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
7467 return float_relation_equal
;
7469 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
7474 int float128_compare(float128 a
, float128 b
, float_status
*status
)
7476 return float128_compare_internal(a
, b
, 0, status
);
7479 int float128_compare_quiet(float128 a
, float128 b
, float_status
*status
)
7481 return float128_compare_internal(a
, b
, 1, status
);
7484 /* min() and max() functions. These can't be implemented as
7485 * 'compare and pick one input' because that would mishandle
7486 * NaNs and +0 vs -0.
7488 * minnum() and maxnum() functions. These are similar to the min()
7489 * and max() functions but if one of the arguments is a QNaN and
7490 * the other is numerical then the numerical argument is returned.
7491 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
7492 * and maxNum() operations. min() and max() are the typical min/max
7493 * semantics provided by many CPUs which predate that specification.
7495 * minnummag() and maxnummag() functions correspond to minNumMag()
7496 * and minNumMag() from the IEEE-754 2008.
7499 static inline float ## s float ## s ## _minmax(float ## s a, float ## s b, \
7500 int ismin, int isieee, \
7502 float_status *status) \
7504 flag aSign, bSign; \
7505 uint ## s ## _t av, bv, aav, abv; \
7506 a = float ## s ## _squash_input_denormal(a, status); \
7507 b = float ## s ## _squash_input_denormal(b, status); \
7508 if (float ## s ## _is_any_nan(a) || \
7509 float ## s ## _is_any_nan(b)) { \
7511 if (float ## s ## _is_quiet_nan(a) && \
7512 !float ## s ##_is_any_nan(b)) { \
7514 } else if (float ## s ## _is_quiet_nan(b) && \
7515 !float ## s ## _is_any_nan(a)) { \
7519 return propagateFloat ## s ## NaN(a, b, status); \
7521 aSign = extractFloat ## s ## Sign(a); \
7522 bSign = extractFloat ## s ## Sign(b); \
7523 av = float ## s ## _val(a); \
7524 bv = float ## s ## _val(b); \
7526 aav = float ## s ## _abs(av); \
7527 abv = float ## s ## _abs(bv); \
7530 return (aav < abv) ? a : b; \
7532 return (aav < abv) ? b : a; \
7536 if (aSign != bSign) { \
7538 return aSign ? a : b; \
7540 return aSign ? b : a; \
7544 return (aSign ^ (av < bv)) ? a : b; \
7546 return (aSign ^ (av < bv)) ? b : a; \
7551 float ## s float ## s ## _min(float ## s a, float ## s b, \
7552 float_status *status) \
7554 return float ## s ## _minmax(a, b, 1, 0, 0, status); \
7557 float ## s float ## s ## _max(float ## s a, float ## s b, \
7558 float_status *status) \
7560 return float ## s ## _minmax(a, b, 0, 0, 0, status); \
7563 float ## s float ## s ## _minnum(float ## s a, float ## s b, \
7564 float_status *status) \
7566 return float ## s ## _minmax(a, b, 1, 1, 0, status); \
7569 float ## s float ## s ## _maxnum(float ## s a, float ## s b, \
7570 float_status *status) \
7572 return float ## s ## _minmax(a, b, 0, 1, 0, status); \
7575 float ## s float ## s ## _minnummag(float ## s a, float ## s b, \
7576 float_status *status) \
7578 return float ## s ## _minmax(a, b, 1, 1, 1, status); \
7581 float ## s float ## s ## _maxnummag(float ## s a, float ## s b, \
7582 float_status *status) \
7584 return float ## s ## _minmax(a, b, 0, 1, 1, status); \
7591 /* Multiply A by 2 raised to the power N. */
7592 float32
float32_scalbn(float32 a
, int n
, float_status
*status
)
7598 a
= float32_squash_input_denormal(a
, status
);
7599 aSig
= extractFloat32Frac( a
);
7600 aExp
= extractFloat32Exp( a
);
7601 aSign
= extractFloat32Sign( a
);
7603 if ( aExp
== 0xFF ) {
7605 return propagateFloat32NaN(a
, a
, status
);
7611 } else if (aSig
== 0) {
7619 } else if (n
< -0x200) {
7625 return normalizeRoundAndPackFloat32(aSign
, aExp
, aSig
, status
);
7628 float64
float64_scalbn(float64 a
, int n
, float_status
*status
)
7634 a
= float64_squash_input_denormal(a
, status
);
7635 aSig
= extractFloat64Frac( a
);
7636 aExp
= extractFloat64Exp( a
);
7637 aSign
= extractFloat64Sign( a
);
7639 if ( aExp
== 0x7FF ) {
7641 return propagateFloat64NaN(a
, a
, status
);
7646 aSig
|= LIT64( 0x0010000000000000 );
7647 } else if (aSig
== 0) {
7655 } else if (n
< -0x1000) {
7661 return normalizeRoundAndPackFloat64(aSign
, aExp
, aSig
, status
);
7664 floatx80
floatx80_scalbn(floatx80 a
, int n
, float_status
*status
)
7670 aSig
= extractFloatx80Frac( a
);
7671 aExp
= extractFloatx80Exp( a
);
7672 aSign
= extractFloatx80Sign( a
);
7674 if ( aExp
== 0x7FFF ) {
7676 return propagateFloatx80NaN(a
, a
, status
);
7690 } else if (n
< -0x10000) {
7695 return normalizeRoundAndPackFloatx80(status
->floatx80_rounding_precision
,
7696 aSign
, aExp
, aSig
, 0, status
);
7699 float128
float128_scalbn(float128 a
, int n
, float_status
*status
)
7703 uint64_t aSig0
, aSig1
;
7705 aSig1
= extractFloat128Frac1( a
);
7706 aSig0
= extractFloat128Frac0( a
);
7707 aExp
= extractFloat128Exp( a
);
7708 aSign
= extractFloat128Sign( a
);
7709 if ( aExp
== 0x7FFF ) {
7710 if ( aSig0
| aSig1
) {
7711 return propagateFloat128NaN(a
, a
, status
);
7716 aSig0
|= LIT64( 0x0001000000000000 );
7717 } else if (aSig0
== 0 && aSig1
== 0) {
7725 } else if (n
< -0x10000) {
7730 return normalizeRoundAndPackFloat128( aSign
, aExp
, aSig0
, aSig1