4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
44 ===============================================================================
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
85 #include "qemu/osdep.h"
87 #include "fpu/softfloat.h"
89 /* We only need stdlib for abort() */
91 /*----------------------------------------------------------------------------
92 | Primitive arithmetic functions, including multi-word arithmetic, and
93 | division and square root approximations. (Can be specialized to target if
95 *----------------------------------------------------------------------------*/
96 #include "softfloat-macros.h"
98 /*----------------------------------------------------------------------------
99 | Functions and definitions to determine: (1) whether tininess for underflow
100 | is detected before or after rounding by default, (2) what (if anything)
101 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
102 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
103 | are propagated from function inputs to output. These details are target-
105 *----------------------------------------------------------------------------*/
106 #include "softfloat-specialize.h"
108 /*----------------------------------------------------------------------------
109 | Returns the fraction bits of the half-precision floating-point value `a'.
110 *----------------------------------------------------------------------------*/
112 static inline uint32_t extractFloat16Frac(float16 a
)
114 return float16_val(a
) & 0x3ff;
117 /*----------------------------------------------------------------------------
118 | Returns the exponent bits of the half-precision floating-point value `a'.
119 *----------------------------------------------------------------------------*/
121 static inline int extractFloat16Exp(float16 a
)
123 return (float16_val(a
) >> 10) & 0x1f;
126 /*----------------------------------------------------------------------------
127 | Returns the sign bit of the single-precision floating-point value `a'.
128 *----------------------------------------------------------------------------*/
130 static inline flag
extractFloat16Sign(float16 a
)
132 return float16_val(a
)>>15;
135 /*----------------------------------------------------------------------------
136 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
137 | and 7, and returns the properly rounded 32-bit integer corresponding to the
138 | input. If `zSign' is 1, the input is negated before being converted to an
139 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
140 | is simply rounded to an integer, with the inexact exception raised if the
141 | input cannot be represented exactly as an integer. However, if the fixed-
142 | point input is too large, the invalid exception is raised and the largest
143 | positive or negative integer is returned.
144 *----------------------------------------------------------------------------*/
146 static int32_t roundAndPackInt32(flag zSign
, uint64_t absZ
, float_status
*status
)
149 flag roundNearestEven
;
150 int8_t roundIncrement
, roundBits
;
153 roundingMode
= status
->float_rounding_mode
;
154 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
155 switch (roundingMode
) {
156 case float_round_nearest_even
:
157 case float_round_ties_away
:
158 roundIncrement
= 0x40;
160 case float_round_to_zero
:
164 roundIncrement
= zSign
? 0 : 0x7f;
166 case float_round_down
:
167 roundIncrement
= zSign
? 0x7f : 0;
172 roundBits
= absZ
& 0x7F;
173 absZ
= ( absZ
+ roundIncrement
)>>7;
174 absZ
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
176 if ( zSign
) z
= - z
;
177 if ( ( absZ
>>32 ) || ( z
&& ( ( z
< 0 ) ^ zSign
) ) ) {
178 float_raise(float_flag_invalid
, status
);
179 return zSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
182 status
->float_exception_flags
|= float_flag_inexact
;
188 /*----------------------------------------------------------------------------
189 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
190 | `absZ1', with binary point between bits 63 and 64 (between the input words),
191 | and returns the properly rounded 64-bit integer corresponding to the input.
192 | If `zSign' is 1, the input is negated before being converted to an integer.
193 | Ordinarily, the fixed-point input is simply rounded to an integer, with
194 | the inexact exception raised if the input cannot be represented exactly as
195 | an integer. However, if the fixed-point input is too large, the invalid
196 | exception is raised and the largest positive or negative integer is
198 *----------------------------------------------------------------------------*/
200 static int64_t roundAndPackInt64(flag zSign
, uint64_t absZ0
, uint64_t absZ1
,
201 float_status
*status
)
204 flag roundNearestEven
, increment
;
207 roundingMode
= status
->float_rounding_mode
;
208 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
209 switch (roundingMode
) {
210 case float_round_nearest_even
:
211 case float_round_ties_away
:
212 increment
= ((int64_t) absZ1
< 0);
214 case float_round_to_zero
:
218 increment
= !zSign
&& absZ1
;
220 case float_round_down
:
221 increment
= zSign
&& absZ1
;
228 if ( absZ0
== 0 ) goto overflow
;
229 absZ0
&= ~ ( ( (uint64_t) ( absZ1
<<1 ) == 0 ) & roundNearestEven
);
232 if ( zSign
) z
= - z
;
233 if ( z
&& ( ( z
< 0 ) ^ zSign
) ) {
235 float_raise(float_flag_invalid
, status
);
237 zSign
? (int64_t) LIT64( 0x8000000000000000 )
238 : LIT64( 0x7FFFFFFFFFFFFFFF );
241 status
->float_exception_flags
|= float_flag_inexact
;
247 /*----------------------------------------------------------------------------
248 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
249 | `absZ1', with binary point between bits 63 and 64 (between the input words),
250 | and returns the properly rounded 64-bit unsigned integer corresponding to the
251 | input. Ordinarily, the fixed-point input is simply rounded to an integer,
252 | with the inexact exception raised if the input cannot be represented exactly
253 | as an integer. However, if the fixed-point input is too large, the invalid
254 | exception is raised and the largest unsigned integer is returned.
255 *----------------------------------------------------------------------------*/
257 static int64_t roundAndPackUint64(flag zSign
, uint64_t absZ0
,
258 uint64_t absZ1
, float_status
*status
)
261 flag roundNearestEven
, increment
;
263 roundingMode
= status
->float_rounding_mode
;
264 roundNearestEven
= (roundingMode
== float_round_nearest_even
);
265 switch (roundingMode
) {
266 case float_round_nearest_even
:
267 case float_round_ties_away
:
268 increment
= ((int64_t)absZ1
< 0);
270 case float_round_to_zero
:
274 increment
= !zSign
&& absZ1
;
276 case float_round_down
:
277 increment
= zSign
&& absZ1
;
285 float_raise(float_flag_invalid
, status
);
286 return LIT64(0xFFFFFFFFFFFFFFFF);
288 absZ0
&= ~(((uint64_t)(absZ1
<<1) == 0) & roundNearestEven
);
291 if (zSign
&& absZ0
) {
292 float_raise(float_flag_invalid
, status
);
297 status
->float_exception_flags
|= float_flag_inexact
;
302 /*----------------------------------------------------------------------------
303 | Returns the fraction bits of the single-precision floating-point value `a'.
304 *----------------------------------------------------------------------------*/
306 static inline uint32_t extractFloat32Frac( float32 a
)
309 return float32_val(a
) & 0x007FFFFF;
313 /*----------------------------------------------------------------------------
314 | Returns the exponent bits of the single-precision floating-point value `a'.
315 *----------------------------------------------------------------------------*/
317 static inline int extractFloat32Exp(float32 a
)
320 return ( float32_val(a
)>>23 ) & 0xFF;
324 /*----------------------------------------------------------------------------
325 | Returns the sign bit of the single-precision floating-point value `a'.
326 *----------------------------------------------------------------------------*/
328 static inline flag
extractFloat32Sign( float32 a
)
331 return float32_val(a
)>>31;
335 /*----------------------------------------------------------------------------
336 | If `a' is denormal and we are in flush-to-zero mode then set the
337 | input-denormal exception and return zero. Otherwise just return the value.
338 *----------------------------------------------------------------------------*/
339 float32
float32_squash_input_denormal(float32 a
, float_status
*status
)
341 if (status
->flush_inputs_to_zero
) {
342 if (extractFloat32Exp(a
) == 0 && extractFloat32Frac(a
) != 0) {
343 float_raise(float_flag_input_denormal
, status
);
344 return make_float32(float32_val(a
) & 0x80000000);
350 /*----------------------------------------------------------------------------
351 | Normalizes the subnormal single-precision floating-point value represented
352 | by the denormalized significand `aSig'. The normalized exponent and
353 | significand are stored at the locations pointed to by `zExpPtr' and
354 | `zSigPtr', respectively.
355 *----------------------------------------------------------------------------*/
358 normalizeFloat32Subnormal(uint32_t aSig
, int *zExpPtr
, uint32_t *zSigPtr
)
362 shiftCount
= countLeadingZeros32( aSig
) - 8;
363 *zSigPtr
= aSig
<<shiftCount
;
364 *zExpPtr
= 1 - shiftCount
;
368 /*----------------------------------------------------------------------------
369 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
370 | single-precision floating-point value, returning the result. After being
371 | shifted into the proper positions, the three fields are simply added
372 | together to form the result. This means that any integer portion of `zSig'
373 | will be added into the exponent. Since a properly normalized significand
374 | will have an integer portion equal to 1, the `zExp' input should be 1 less
375 | than the desired result exponent whenever `zSig' is a complete, normalized
377 *----------------------------------------------------------------------------*/
379 static inline float32
packFloat32(flag zSign
, int zExp
, uint32_t zSig
)
383 ( ( (uint32_t) zSign
)<<31 ) + ( ( (uint32_t) zExp
)<<23 ) + zSig
);
387 /*----------------------------------------------------------------------------
388 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
389 | and significand `zSig', and returns the proper single-precision floating-
390 | point value corresponding to the abstract input. Ordinarily, the abstract
391 | value is simply rounded and packed into the single-precision format, with
392 | the inexact exception raised if the abstract input cannot be represented
393 | exactly. However, if the abstract value is too large, the overflow and
394 | inexact exceptions are raised and an infinity or maximal finite value is
395 | returned. If the abstract value is too small, the input value is rounded to
396 | a subnormal number, and the underflow and inexact exceptions are raised if
397 | the abstract input cannot be represented exactly as a subnormal single-
398 | precision floating-point number.
399 | The input significand `zSig' has its binary point between bits 30
400 | and 29, which is 7 bits to the left of the usual location. This shifted
401 | significand must be normalized or smaller. If `zSig' is not normalized,
402 | `zExp' must be 0; in that case, the result returned is a subnormal number,
403 | and it must not require rounding. In the usual case that `zSig' is
404 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
405 | The handling of underflow and overflow follows the IEC/IEEE Standard for
406 | Binary Floating-Point Arithmetic.
407 *----------------------------------------------------------------------------*/
409 static float32
roundAndPackFloat32(flag zSign
, int zExp
, uint32_t zSig
,
410 float_status
*status
)
413 flag roundNearestEven
;
414 int8_t roundIncrement
, roundBits
;
417 roundingMode
= status
->float_rounding_mode
;
418 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
419 switch (roundingMode
) {
420 case float_round_nearest_even
:
421 case float_round_ties_away
:
422 roundIncrement
= 0x40;
424 case float_round_to_zero
:
428 roundIncrement
= zSign
? 0 : 0x7f;
430 case float_round_down
:
431 roundIncrement
= zSign
? 0x7f : 0;
437 roundBits
= zSig
& 0x7F;
438 if ( 0xFD <= (uint16_t) zExp
) {
440 || ( ( zExp
== 0xFD )
441 && ( (int32_t) ( zSig
+ roundIncrement
) < 0 ) )
443 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
444 return packFloat32( zSign
, 0xFF, - ( roundIncrement
== 0 ));
447 if (status
->flush_to_zero
) {
448 float_raise(float_flag_output_denormal
, status
);
449 return packFloat32(zSign
, 0, 0);
452 (status
->float_detect_tininess
453 == float_tininess_before_rounding
)
455 || ( zSig
+ roundIncrement
< 0x80000000 );
456 shift32RightJamming( zSig
, - zExp
, &zSig
);
458 roundBits
= zSig
& 0x7F;
459 if (isTiny
&& roundBits
) {
460 float_raise(float_flag_underflow
, status
);
465 status
->float_exception_flags
|= float_flag_inexact
;
467 zSig
= ( zSig
+ roundIncrement
)>>7;
468 zSig
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
469 if ( zSig
== 0 ) zExp
= 0;
470 return packFloat32( zSign
, zExp
, zSig
);
474 /*----------------------------------------------------------------------------
475 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
476 | and significand `zSig', and returns the proper single-precision floating-
477 | point value corresponding to the abstract input. This routine is just like
478 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
479 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
480 | floating-point exponent.
481 *----------------------------------------------------------------------------*/
484 normalizeRoundAndPackFloat32(flag zSign
, int zExp
, uint32_t zSig
,
485 float_status
*status
)
489 shiftCount
= countLeadingZeros32( zSig
) - 1;
490 return roundAndPackFloat32(zSign
, zExp
- shiftCount
, zSig
<<shiftCount
,
495 /*----------------------------------------------------------------------------
496 | Returns the fraction bits of the double-precision floating-point value `a'.
497 *----------------------------------------------------------------------------*/
499 static inline uint64_t extractFloat64Frac( float64 a
)
502 return float64_val(a
) & LIT64( 0x000FFFFFFFFFFFFF );
506 /*----------------------------------------------------------------------------
507 | Returns the exponent bits of the double-precision floating-point value `a'.
508 *----------------------------------------------------------------------------*/
510 static inline int extractFloat64Exp(float64 a
)
513 return ( float64_val(a
)>>52 ) & 0x7FF;
517 /*----------------------------------------------------------------------------
518 | Returns the sign bit of the double-precision floating-point value `a'.
519 *----------------------------------------------------------------------------*/
521 static inline flag
extractFloat64Sign( float64 a
)
524 return float64_val(a
)>>63;
528 /*----------------------------------------------------------------------------
529 | If `a' is denormal and we are in flush-to-zero mode then set the
530 | input-denormal exception and return zero. Otherwise just return the value.
531 *----------------------------------------------------------------------------*/
532 float64
float64_squash_input_denormal(float64 a
, float_status
*status
)
534 if (status
->flush_inputs_to_zero
) {
535 if (extractFloat64Exp(a
) == 0 && extractFloat64Frac(a
) != 0) {
536 float_raise(float_flag_input_denormal
, status
);
537 return make_float64(float64_val(a
) & (1ULL << 63));
543 /*----------------------------------------------------------------------------
544 | Normalizes the subnormal double-precision floating-point value represented
545 | by the denormalized significand `aSig'. The normalized exponent and
546 | significand are stored at the locations pointed to by `zExpPtr' and
547 | `zSigPtr', respectively.
548 *----------------------------------------------------------------------------*/
551 normalizeFloat64Subnormal(uint64_t aSig
, int *zExpPtr
, uint64_t *zSigPtr
)
555 shiftCount
= countLeadingZeros64( aSig
) - 11;
556 *zSigPtr
= aSig
<<shiftCount
;
557 *zExpPtr
= 1 - shiftCount
;
561 /*----------------------------------------------------------------------------
562 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
563 | double-precision floating-point value, returning the result. After being
564 | shifted into the proper positions, the three fields are simply added
565 | together to form the result. This means that any integer portion of `zSig'
566 | will be added into the exponent. Since a properly normalized significand
567 | will have an integer portion equal to 1, the `zExp' input should be 1 less
568 | than the desired result exponent whenever `zSig' is a complete, normalized
570 *----------------------------------------------------------------------------*/
572 static inline float64
packFloat64(flag zSign
, int zExp
, uint64_t zSig
)
576 ( ( (uint64_t) zSign
)<<63 ) + ( ( (uint64_t) zExp
)<<52 ) + zSig
);
580 /*----------------------------------------------------------------------------
581 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
582 | and significand `zSig', and returns the proper double-precision floating-
583 | point value corresponding to the abstract input. Ordinarily, the abstract
584 | value is simply rounded and packed into the double-precision format, with
585 | the inexact exception raised if the abstract input cannot be represented
586 | exactly. However, if the abstract value is too large, the overflow and
587 | inexact exceptions are raised and an infinity or maximal finite value is
588 | returned. If the abstract value is too small, the input value is rounded to
589 | a subnormal number, and the underflow and inexact exceptions are raised if
590 | the abstract input cannot be represented exactly as a subnormal double-
591 | precision floating-point number.
592 | The input significand `zSig' has its binary point between bits 62
593 | and 61, which is 10 bits to the left of the usual location. This shifted
594 | significand must be normalized or smaller. If `zSig' is not normalized,
595 | `zExp' must be 0; in that case, the result returned is a subnormal number,
596 | and it must not require rounding. In the usual case that `zSig' is
597 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
598 | The handling of underflow and overflow follows the IEC/IEEE Standard for
599 | Binary Floating-Point Arithmetic.
600 *----------------------------------------------------------------------------*/
602 static float64
roundAndPackFloat64(flag zSign
, int zExp
, uint64_t zSig
,
603 float_status
*status
)
606 flag roundNearestEven
;
607 int roundIncrement
, roundBits
;
610 roundingMode
= status
->float_rounding_mode
;
611 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
612 switch (roundingMode
) {
613 case float_round_nearest_even
:
614 case float_round_ties_away
:
615 roundIncrement
= 0x200;
617 case float_round_to_zero
:
621 roundIncrement
= zSign
? 0 : 0x3ff;
623 case float_round_down
:
624 roundIncrement
= zSign
? 0x3ff : 0;
629 roundBits
= zSig
& 0x3FF;
630 if ( 0x7FD <= (uint16_t) zExp
) {
631 if ( ( 0x7FD < zExp
)
632 || ( ( zExp
== 0x7FD )
633 && ( (int64_t) ( zSig
+ roundIncrement
) < 0 ) )
635 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
636 return packFloat64( zSign
, 0x7FF, - ( roundIncrement
== 0 ));
639 if (status
->flush_to_zero
) {
640 float_raise(float_flag_output_denormal
, status
);
641 return packFloat64(zSign
, 0, 0);
644 (status
->float_detect_tininess
645 == float_tininess_before_rounding
)
647 || ( zSig
+ roundIncrement
< LIT64( 0x8000000000000000 ) );
648 shift64RightJamming( zSig
, - zExp
, &zSig
);
650 roundBits
= zSig
& 0x3FF;
651 if (isTiny
&& roundBits
) {
652 float_raise(float_flag_underflow
, status
);
657 status
->float_exception_flags
|= float_flag_inexact
;
659 zSig
= ( zSig
+ roundIncrement
)>>10;
660 zSig
&= ~ ( ( ( roundBits
^ 0x200 ) == 0 ) & roundNearestEven
);
661 if ( zSig
== 0 ) zExp
= 0;
662 return packFloat64( zSign
, zExp
, zSig
);
666 /*----------------------------------------------------------------------------
667 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
668 | and significand `zSig', and returns the proper double-precision floating-
669 | point value corresponding to the abstract input. This routine is just like
670 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
671 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
672 | floating-point exponent.
673 *----------------------------------------------------------------------------*/
676 normalizeRoundAndPackFloat64(flag zSign
, int zExp
, uint64_t zSig
,
677 float_status
*status
)
681 shiftCount
= countLeadingZeros64( zSig
) - 1;
682 return roundAndPackFloat64(zSign
, zExp
- shiftCount
, zSig
<<shiftCount
,
687 /*----------------------------------------------------------------------------
688 | Returns the fraction bits of the extended double-precision floating-point
690 *----------------------------------------------------------------------------*/
692 static inline uint64_t extractFloatx80Frac( floatx80 a
)
699 /*----------------------------------------------------------------------------
700 | Returns the exponent bits of the extended double-precision floating-point
702 *----------------------------------------------------------------------------*/
704 static inline int32_t extractFloatx80Exp( floatx80 a
)
707 return a
.high
& 0x7FFF;
711 /*----------------------------------------------------------------------------
712 | Returns the sign bit of the extended double-precision floating-point value
714 *----------------------------------------------------------------------------*/
716 static inline flag
extractFloatx80Sign( floatx80 a
)
723 /*----------------------------------------------------------------------------
724 | Normalizes the subnormal extended double-precision floating-point value
725 | represented by the denormalized significand `aSig'. The normalized exponent
726 | and significand are stored at the locations pointed to by `zExpPtr' and
727 | `zSigPtr', respectively.
728 *----------------------------------------------------------------------------*/
731 normalizeFloatx80Subnormal( uint64_t aSig
, int32_t *zExpPtr
, uint64_t *zSigPtr
)
735 shiftCount
= countLeadingZeros64( aSig
);
736 *zSigPtr
= aSig
<<shiftCount
;
737 *zExpPtr
= 1 - shiftCount
;
741 /*----------------------------------------------------------------------------
742 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
743 | extended double-precision floating-point value, returning the result.
744 *----------------------------------------------------------------------------*/
746 static inline floatx80
packFloatx80( flag zSign
, int32_t zExp
, uint64_t zSig
)
751 z
.high
= ( ( (uint16_t) zSign
)<<15 ) + zExp
;
756 /*----------------------------------------------------------------------------
757 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
758 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
759 | and returns the proper extended double-precision floating-point value
760 | corresponding to the abstract input. Ordinarily, the abstract value is
761 | rounded and packed into the extended double-precision format, with the
762 | inexact exception raised if the abstract input cannot be represented
763 | exactly. However, if the abstract value is too large, the overflow and
764 | inexact exceptions are raised and an infinity or maximal finite value is
765 | returned. If the abstract value is too small, the input value is rounded to
766 | a subnormal number, and the underflow and inexact exceptions are raised if
767 | the abstract input cannot be represented exactly as a subnormal extended
768 | double-precision floating-point number.
769 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
770 | number of bits as single or double precision, respectively. Otherwise, the
771 | result is rounded to the full precision of the extended double-precision
773 | The input significand must be normalized or smaller. If the input
774 | significand is not normalized, `zExp' must be 0; in that case, the result
775 | returned is a subnormal number, and it must not require rounding. The
776 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
777 | Floating-Point Arithmetic.
778 *----------------------------------------------------------------------------*/
780 static floatx80
roundAndPackFloatx80(int8_t roundingPrecision
, flag zSign
,
781 int32_t zExp
, uint64_t zSig0
, uint64_t zSig1
,
782 float_status
*status
)
785 flag roundNearestEven
, increment
, isTiny
;
786 int64_t roundIncrement
, roundMask
, roundBits
;
788 roundingMode
= status
->float_rounding_mode
;
789 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
790 if ( roundingPrecision
== 80 ) goto precision80
;
791 if ( roundingPrecision
== 64 ) {
792 roundIncrement
= LIT64( 0x0000000000000400 );
793 roundMask
= LIT64( 0x00000000000007FF );
795 else if ( roundingPrecision
== 32 ) {
796 roundIncrement
= LIT64( 0x0000008000000000 );
797 roundMask
= LIT64( 0x000000FFFFFFFFFF );
802 zSig0
|= ( zSig1
!= 0 );
803 switch (roundingMode
) {
804 case float_round_nearest_even
:
805 case float_round_ties_away
:
807 case float_round_to_zero
:
811 roundIncrement
= zSign
? 0 : roundMask
;
813 case float_round_down
:
814 roundIncrement
= zSign
? roundMask
: 0;
819 roundBits
= zSig0
& roundMask
;
820 if ( 0x7FFD <= (uint32_t) ( zExp
- 1 ) ) {
821 if ( ( 0x7FFE < zExp
)
822 || ( ( zExp
== 0x7FFE ) && ( zSig0
+ roundIncrement
< zSig0
) )
827 if (status
->flush_to_zero
) {
828 float_raise(float_flag_output_denormal
, status
);
829 return packFloatx80(zSign
, 0, 0);
832 (status
->float_detect_tininess
833 == float_tininess_before_rounding
)
835 || ( zSig0
<= zSig0
+ roundIncrement
);
836 shift64RightJamming( zSig0
, 1 - zExp
, &zSig0
);
838 roundBits
= zSig0
& roundMask
;
839 if (isTiny
&& roundBits
) {
840 float_raise(float_flag_underflow
, status
);
843 status
->float_exception_flags
|= float_flag_inexact
;
845 zSig0
+= roundIncrement
;
846 if ( (int64_t) zSig0
< 0 ) zExp
= 1;
847 roundIncrement
= roundMask
+ 1;
848 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
849 roundMask
|= roundIncrement
;
851 zSig0
&= ~ roundMask
;
852 return packFloatx80( zSign
, zExp
, zSig0
);
856 status
->float_exception_flags
|= float_flag_inexact
;
858 zSig0
+= roundIncrement
;
859 if ( zSig0
< roundIncrement
) {
861 zSig0
= LIT64( 0x8000000000000000 );
863 roundIncrement
= roundMask
+ 1;
864 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
865 roundMask
|= roundIncrement
;
867 zSig0
&= ~ roundMask
;
868 if ( zSig0
== 0 ) zExp
= 0;
869 return packFloatx80( zSign
, zExp
, zSig0
);
871 switch (roundingMode
) {
872 case float_round_nearest_even
:
873 case float_round_ties_away
:
874 increment
= ((int64_t)zSig1
< 0);
876 case float_round_to_zero
:
880 increment
= !zSign
&& zSig1
;
882 case float_round_down
:
883 increment
= zSign
&& zSig1
;
888 if ( 0x7FFD <= (uint32_t) ( zExp
- 1 ) ) {
889 if ( ( 0x7FFE < zExp
)
890 || ( ( zExp
== 0x7FFE )
891 && ( zSig0
== LIT64( 0xFFFFFFFFFFFFFFFF ) )
897 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
898 if ( ( roundingMode
== float_round_to_zero
)
899 || ( zSign
&& ( roundingMode
== float_round_up
) )
900 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
902 return packFloatx80( zSign
, 0x7FFE, ~ roundMask
);
904 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
908 (status
->float_detect_tininess
909 == float_tininess_before_rounding
)
912 || ( zSig0
< LIT64( 0xFFFFFFFFFFFFFFFF ) );
913 shift64ExtraRightJamming( zSig0
, zSig1
, 1 - zExp
, &zSig0
, &zSig1
);
915 if (isTiny
&& zSig1
) {
916 float_raise(float_flag_underflow
, status
);
919 status
->float_exception_flags
|= float_flag_inexact
;
921 switch (roundingMode
) {
922 case float_round_nearest_even
:
923 case float_round_ties_away
:
924 increment
= ((int64_t)zSig1
< 0);
926 case float_round_to_zero
:
930 increment
= !zSign
&& zSig1
;
932 case float_round_down
:
933 increment
= zSign
&& zSig1
;
941 ~ ( ( (uint64_t) ( zSig1
<<1 ) == 0 ) & roundNearestEven
);
942 if ( (int64_t) zSig0
< 0 ) zExp
= 1;
944 return packFloatx80( zSign
, zExp
, zSig0
);
948 status
->float_exception_flags
|= float_flag_inexact
;
954 zSig0
= LIT64( 0x8000000000000000 );
957 zSig0
&= ~ ( ( (uint64_t) ( zSig1
<<1 ) == 0 ) & roundNearestEven
);
961 if ( zSig0
== 0 ) zExp
= 0;
963 return packFloatx80( zSign
, zExp
, zSig0
);
967 /*----------------------------------------------------------------------------
968 | Takes an abstract floating-point value having sign `zSign', exponent
969 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
970 | and returns the proper extended double-precision floating-point value
971 | corresponding to the abstract input. This routine is just like
972 | `roundAndPackFloatx80' except that the input significand does not have to be
974 *----------------------------------------------------------------------------*/
976 static floatx80
normalizeRoundAndPackFloatx80(int8_t roundingPrecision
,
977 flag zSign
, int32_t zExp
,
978 uint64_t zSig0
, uint64_t zSig1
,
979 float_status
*status
)
988 shiftCount
= countLeadingZeros64( zSig0
);
989 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
991 return roundAndPackFloatx80(roundingPrecision
, zSign
, zExp
,
992 zSig0
, zSig1
, status
);
996 /*----------------------------------------------------------------------------
997 | Returns the least-significant 64 fraction bits of the quadruple-precision
998 | floating-point value `a'.
999 *----------------------------------------------------------------------------*/
1001 static inline uint64_t extractFloat128Frac1( float128 a
)
1008 /*----------------------------------------------------------------------------
1009 | Returns the most-significant 48 fraction bits of the quadruple-precision
1010 | floating-point value `a'.
1011 *----------------------------------------------------------------------------*/
1013 static inline uint64_t extractFloat128Frac0( float128 a
)
1016 return a
.high
& LIT64( 0x0000FFFFFFFFFFFF );
1020 /*----------------------------------------------------------------------------
1021 | Returns the exponent bits of the quadruple-precision floating-point value
1023 *----------------------------------------------------------------------------*/
1025 static inline int32_t extractFloat128Exp( float128 a
)
1028 return ( a
.high
>>48 ) & 0x7FFF;
1032 /*----------------------------------------------------------------------------
1033 | Returns the sign bit of the quadruple-precision floating-point value `a'.
1034 *----------------------------------------------------------------------------*/
1036 static inline flag
extractFloat128Sign( float128 a
)
1043 /*----------------------------------------------------------------------------
1044 | Normalizes the subnormal quadruple-precision floating-point value
1045 | represented by the denormalized significand formed by the concatenation of
1046 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
1047 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
1048 | significand are stored at the location pointed to by `zSig0Ptr', and the
1049 | least significant 64 bits of the normalized significand are stored at the
1050 | location pointed to by `zSig1Ptr'.
1051 *----------------------------------------------------------------------------*/
1054 normalizeFloat128Subnormal(
1065 shiftCount
= countLeadingZeros64( aSig1
) - 15;
1066 if ( shiftCount
< 0 ) {
1067 *zSig0Ptr
= aSig1
>>( - shiftCount
);
1068 *zSig1Ptr
= aSig1
<<( shiftCount
& 63 );
1071 *zSig0Ptr
= aSig1
<<shiftCount
;
1074 *zExpPtr
= - shiftCount
- 63;
1077 shiftCount
= countLeadingZeros64( aSig0
) - 15;
1078 shortShift128Left( aSig0
, aSig1
, shiftCount
, zSig0Ptr
, zSig1Ptr
);
1079 *zExpPtr
= 1 - shiftCount
;
1084 /*----------------------------------------------------------------------------
1085 | Packs the sign `zSign', the exponent `zExp', and the significand formed
1086 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
1087 | floating-point value, returning the result. After being shifted into the
1088 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
1089 | added together to form the most significant 32 bits of the result. This
1090 | means that any integer portion of `zSig0' will be added into the exponent.
1091 | Since a properly normalized significand will have an integer portion equal
1092 | to 1, the `zExp' input should be 1 less than the desired result exponent
1093 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
1095 *----------------------------------------------------------------------------*/
1097 static inline float128
1098 packFloat128( flag zSign
, int32_t zExp
, uint64_t zSig0
, uint64_t zSig1
)
1103 z
.high
= ( ( (uint64_t) zSign
)<<63 ) + ( ( (uint64_t) zExp
)<<48 ) + zSig0
;
1108 /*----------------------------------------------------------------------------
1109 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1110 | and extended significand formed by the concatenation of `zSig0', `zSig1',
1111 | and `zSig2', and returns the proper quadruple-precision floating-point value
1112 | corresponding to the abstract input. Ordinarily, the abstract value is
1113 | simply rounded and packed into the quadruple-precision format, with the
1114 | inexact exception raised if the abstract input cannot be represented
1115 | exactly. However, if the abstract value is too large, the overflow and
1116 | inexact exceptions are raised and an infinity or maximal finite value is
1117 | returned. If the abstract value is too small, the input value is rounded to
1118 | a subnormal number, and the underflow and inexact exceptions are raised if
1119 | the abstract input cannot be represented exactly as a subnormal quadruple-
1120 | precision floating-point number.
1121 | The input significand must be normalized or smaller. If the input
1122 | significand is not normalized, `zExp' must be 0; in that case, the result
1123 | returned is a subnormal number, and it must not require rounding. In the
1124 | usual case that the input significand is normalized, `zExp' must be 1 less
1125 | than the ``true'' floating-point exponent. The handling of underflow and
1126 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1127 *----------------------------------------------------------------------------*/
1129 static float128
roundAndPackFloat128(flag zSign
, int32_t zExp
,
1130 uint64_t zSig0
, uint64_t zSig1
,
1131 uint64_t zSig2
, float_status
*status
)
1133 int8_t roundingMode
;
1134 flag roundNearestEven
, increment
, isTiny
;
1136 roundingMode
= status
->float_rounding_mode
;
1137 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
1138 switch (roundingMode
) {
1139 case float_round_nearest_even
:
1140 case float_round_ties_away
:
1141 increment
= ((int64_t)zSig2
< 0);
1143 case float_round_to_zero
:
1146 case float_round_up
:
1147 increment
= !zSign
&& zSig2
;
1149 case float_round_down
:
1150 increment
= zSign
&& zSig2
;
1155 if ( 0x7FFD <= (uint32_t) zExp
) {
1156 if ( ( 0x7FFD < zExp
)
1157 || ( ( zExp
== 0x7FFD )
1159 LIT64( 0x0001FFFFFFFFFFFF ),
1160 LIT64( 0xFFFFFFFFFFFFFFFF ),
1167 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
1168 if ( ( roundingMode
== float_round_to_zero
)
1169 || ( zSign
&& ( roundingMode
== float_round_up
) )
1170 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
1176 LIT64( 0x0000FFFFFFFFFFFF ),
1177 LIT64( 0xFFFFFFFFFFFFFFFF )
1180 return packFloat128( zSign
, 0x7FFF, 0, 0 );
1183 if (status
->flush_to_zero
) {
1184 float_raise(float_flag_output_denormal
, status
);
1185 return packFloat128(zSign
, 0, 0, 0);
1188 (status
->float_detect_tininess
1189 == float_tininess_before_rounding
)
1195 LIT64( 0x0001FFFFFFFFFFFF ),
1196 LIT64( 0xFFFFFFFFFFFFFFFF )
1198 shift128ExtraRightJamming(
1199 zSig0
, zSig1
, zSig2
, - zExp
, &zSig0
, &zSig1
, &zSig2
);
1201 if (isTiny
&& zSig2
) {
1202 float_raise(float_flag_underflow
, status
);
1204 switch (roundingMode
) {
1205 case float_round_nearest_even
:
1206 case float_round_ties_away
:
1207 increment
= ((int64_t)zSig2
< 0);
1209 case float_round_to_zero
:
1212 case float_round_up
:
1213 increment
= !zSign
&& zSig2
;
1215 case float_round_down
:
1216 increment
= zSign
&& zSig2
;
1224 status
->float_exception_flags
|= float_flag_inexact
;
1227 add128( zSig0
, zSig1
, 0, 1, &zSig0
, &zSig1
);
1228 zSig1
&= ~ ( ( zSig2
+ zSig2
== 0 ) & roundNearestEven
);
1231 if ( ( zSig0
| zSig1
) == 0 ) zExp
= 0;
1233 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
1237 /*----------------------------------------------------------------------------
1238 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1239 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1240 | returns the proper quadruple-precision floating-point value corresponding
1241 | to the abstract input. This routine is just like `roundAndPackFloat128'
1242 | except that the input significand has fewer bits and does not have to be
1243 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1245 *----------------------------------------------------------------------------*/
1247 static float128
normalizeRoundAndPackFloat128(flag zSign
, int32_t zExp
,
1248 uint64_t zSig0
, uint64_t zSig1
,
1249 float_status
*status
)
1259 shiftCount
= countLeadingZeros64( zSig0
) - 15;
1260 if ( 0 <= shiftCount
) {
1262 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
1265 shift128ExtraRightJamming(
1266 zSig0
, zSig1
, 0, - shiftCount
, &zSig0
, &zSig1
, &zSig2
);
1269 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
1273 /*----------------------------------------------------------------------------
1274 | Returns the result of converting the 32-bit two's complement integer `a'
1275 | to the single-precision floating-point format. The conversion is performed
1276 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1277 *----------------------------------------------------------------------------*/
1279 float32
int32_to_float32(int32_t a
, float_status
*status
)
1283 if ( a
== 0 ) return float32_zero
;
1284 if ( a
== (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1286 return normalizeRoundAndPackFloat32(zSign
, 0x9C, zSign
? -a
: a
, status
);
1289 /*----------------------------------------------------------------------------
1290 | Returns the result of converting the 32-bit two's complement integer `a'
1291 | to the double-precision floating-point format. The conversion is performed
1292 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1293 *----------------------------------------------------------------------------*/
1295 float64
int32_to_float64(int32_t a
, float_status
*status
)
1302 if ( a
== 0 ) return float64_zero
;
1304 absA
= zSign
? - a
: a
;
1305 shiftCount
= countLeadingZeros32( absA
) + 21;
1307 return packFloat64( zSign
, 0x432 - shiftCount
, zSig
<<shiftCount
);
1311 /*----------------------------------------------------------------------------
1312 | Returns the result of converting the 32-bit two's complement integer `a'
1313 | to the extended double-precision floating-point format. The conversion
1314 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1316 *----------------------------------------------------------------------------*/
1318 floatx80
int32_to_floatx80(int32_t a
, float_status
*status
)
1325 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
1327 absA
= zSign
? - a
: a
;
1328 shiftCount
= countLeadingZeros32( absA
) + 32;
1330 return packFloatx80( zSign
, 0x403E - shiftCount
, zSig
<<shiftCount
);
1334 /*----------------------------------------------------------------------------
1335 | Returns the result of converting the 32-bit two's complement integer `a' to
1336 | the quadruple-precision floating-point format. The conversion is performed
1337 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1338 *----------------------------------------------------------------------------*/
1340 float128
int32_to_float128(int32_t a
, float_status
*status
)
1347 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
1349 absA
= zSign
? - a
: a
;
1350 shiftCount
= countLeadingZeros32( absA
) + 17;
1352 return packFloat128( zSign
, 0x402E - shiftCount
, zSig0
<<shiftCount
, 0 );
1356 /*----------------------------------------------------------------------------
1357 | Returns the result of converting the 64-bit two's complement integer `a'
1358 | to the single-precision floating-point format. The conversion is performed
1359 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1360 *----------------------------------------------------------------------------*/
1362 float32
int64_to_float32(int64_t a
, float_status
*status
)
1368 if ( a
== 0 ) return float32_zero
;
1370 absA
= zSign
? - a
: a
;
1371 shiftCount
= countLeadingZeros64( absA
) - 40;
1372 if ( 0 <= shiftCount
) {
1373 return packFloat32( zSign
, 0x95 - shiftCount
, absA
<<shiftCount
);
1377 if ( shiftCount
< 0 ) {
1378 shift64RightJamming( absA
, - shiftCount
, &absA
);
1381 absA
<<= shiftCount
;
1383 return roundAndPackFloat32(zSign
, 0x9C - shiftCount
, absA
, status
);
1388 /*----------------------------------------------------------------------------
1389 | Returns the result of converting the 64-bit two's complement integer `a'
1390 | to the double-precision floating-point format. The conversion is performed
1391 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1392 *----------------------------------------------------------------------------*/
1394 float64
int64_to_float64(int64_t a
, float_status
*status
)
1398 if ( a
== 0 ) return float64_zero
;
1399 if ( a
== (int64_t) LIT64( 0x8000000000000000 ) ) {
1400 return packFloat64( 1, 0x43E, 0 );
1403 return normalizeRoundAndPackFloat64(zSign
, 0x43C, zSign
? -a
: a
, status
);
1406 /*----------------------------------------------------------------------------
1407 | Returns the result of converting the 64-bit two's complement integer `a'
1408 | to the extended double-precision floating-point format. The conversion
1409 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1411 *----------------------------------------------------------------------------*/
1413 floatx80
int64_to_floatx80(int64_t a
, float_status
*status
)
1419 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
1421 absA
= zSign
? - a
: a
;
1422 shiftCount
= countLeadingZeros64( absA
);
1423 return packFloatx80( zSign
, 0x403E - shiftCount
, absA
<<shiftCount
);
1427 /*----------------------------------------------------------------------------
1428 | Returns the result of converting the 64-bit two's complement integer `a' to
1429 | the quadruple-precision floating-point format. The conversion is performed
1430 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1431 *----------------------------------------------------------------------------*/
1433 float128
int64_to_float128(int64_t a
, float_status
*status
)
1439 uint64_t zSig0
, zSig1
;
1441 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
1443 absA
= zSign
? - a
: a
;
1444 shiftCount
= countLeadingZeros64( absA
) + 49;
1445 zExp
= 0x406E - shiftCount
;
1446 if ( 64 <= shiftCount
) {
1455 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
1456 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
1460 /*----------------------------------------------------------------------------
1461 | Returns the result of converting the 64-bit unsigned integer `a'
1462 | to the single-precision floating-point format. The conversion is performed
1463 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1464 *----------------------------------------------------------------------------*/
1466 float32
uint64_to_float32(uint64_t a
, float_status
*status
)
1471 return float32_zero
;
1474 /* Determine (left) shift needed to put first set bit into bit posn 23
1475 * (since packFloat32() expects the binary point between bits 23 and 22);
1476 * this is the fast case for smallish numbers.
1478 shiftcount
= countLeadingZeros64(a
) - 40;
1479 if (shiftcount
>= 0) {
1480 return packFloat32(0, 0x95 - shiftcount
, a
<< shiftcount
);
1482 /* Otherwise we need to do a round-and-pack. roundAndPackFloat32()
1483 * expects the binary point between bits 30 and 29, hence the + 7.
1486 if (shiftcount
< 0) {
1487 shift64RightJamming(a
, -shiftcount
, &a
);
1492 return roundAndPackFloat32(0, 0x9c - shiftcount
, a
, status
);
1495 /*----------------------------------------------------------------------------
1496 | Returns the result of converting the 64-bit unsigned integer `a'
1497 | to the double-precision floating-point format. The conversion is performed
1498 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1499 *----------------------------------------------------------------------------*/
1501 float64
uint64_to_float64(uint64_t a
, float_status
*status
)
1507 return float64_zero
;
1510 shiftcount
= countLeadingZeros64(a
) - 1;
1511 if (shiftcount
< 0) {
1512 shift64RightJamming(a
, -shiftcount
, &a
);
1516 return roundAndPackFloat64(0, exp
- shiftcount
, a
, status
);
1519 /*----------------------------------------------------------------------------
1520 | Returns the result of converting the 64-bit unsigned integer `a'
1521 | to the quadruple-precision floating-point format. The conversion is performed
1522 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1523 *----------------------------------------------------------------------------*/
1525 float128
uint64_to_float128(uint64_t a
, float_status
*status
)
1528 return float128_zero
;
1530 return normalizeRoundAndPackFloat128(0, 0x406E, a
, 0, status
);
1533 /*----------------------------------------------------------------------------
1534 | Returns the result of converting the single-precision floating-point value
1535 | `a' to the 32-bit two's complement integer format. The conversion is
1536 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1537 | Arithmetic---which means in particular that the conversion is rounded
1538 | according to the current rounding mode. If `a' is a NaN, the largest
1539 | positive integer is returned. Otherwise, if the conversion overflows, the
1540 | largest integer with the same sign as `a' is returned.
1541 *----------------------------------------------------------------------------*/
1543 int32_t float32_to_int32(float32 a
, float_status
*status
)
1551 a
= float32_squash_input_denormal(a
, status
);
1552 aSig
= extractFloat32Frac( a
);
1553 aExp
= extractFloat32Exp( a
);
1554 aSign
= extractFloat32Sign( a
);
1555 if ( ( aExp
== 0xFF ) && aSig
) aSign
= 0;
1556 if ( aExp
) aSig
|= 0x00800000;
1557 shiftCount
= 0xAF - aExp
;
1560 if ( 0 < shiftCount
) shift64RightJamming( aSig64
, shiftCount
, &aSig64
);
1561 return roundAndPackInt32(aSign
, aSig64
, status
);
1565 /*----------------------------------------------------------------------------
1566 | Returns the result of converting the single-precision floating-point value
1567 | `a' to the 32-bit two's complement integer format. The conversion is
1568 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1569 | Arithmetic, except that the conversion is always rounded toward zero.
1570 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1571 | the conversion overflows, the largest integer with the same sign as `a' is
1573 *----------------------------------------------------------------------------*/
1575 int32_t float32_to_int32_round_to_zero(float32 a
, float_status
*status
)
1582 a
= float32_squash_input_denormal(a
, status
);
1584 aSig
= extractFloat32Frac( a
);
1585 aExp
= extractFloat32Exp( a
);
1586 aSign
= extractFloat32Sign( a
);
1587 shiftCount
= aExp
- 0x9E;
1588 if ( 0 <= shiftCount
) {
1589 if ( float32_val(a
) != 0xCF000000 ) {
1590 float_raise(float_flag_invalid
, status
);
1591 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) return 0x7FFFFFFF;
1593 return (int32_t) 0x80000000;
1595 else if ( aExp
<= 0x7E ) {
1597 status
->float_exception_flags
|= float_flag_inexact
;
1601 aSig
= ( aSig
| 0x00800000 )<<8;
1602 z
= aSig
>>( - shiftCount
);
1603 if ( (uint32_t) ( aSig
<<( shiftCount
& 31 ) ) ) {
1604 status
->float_exception_flags
|= float_flag_inexact
;
1606 if ( aSign
) z
= - z
;
1611 /*----------------------------------------------------------------------------
1612 | Returns the result of converting the single-precision floating-point value
1613 | `a' to the 16-bit two's complement integer format. The conversion is
1614 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1615 | Arithmetic, except that the conversion is always rounded toward zero.
1616 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1617 | the conversion overflows, the largest integer with the same sign as `a' is
1619 *----------------------------------------------------------------------------*/
1621 int16_t float32_to_int16_round_to_zero(float32 a
, float_status
*status
)
1629 aSig
= extractFloat32Frac( a
);
1630 aExp
= extractFloat32Exp( a
);
1631 aSign
= extractFloat32Sign( a
);
1632 shiftCount
= aExp
- 0x8E;
1633 if ( 0 <= shiftCount
) {
1634 if ( float32_val(a
) != 0xC7000000 ) {
1635 float_raise(float_flag_invalid
, status
);
1636 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) {
1640 return (int32_t) 0xffff8000;
1642 else if ( aExp
<= 0x7E ) {
1643 if ( aExp
| aSig
) {
1644 status
->float_exception_flags
|= float_flag_inexact
;
1649 aSig
= ( aSig
| 0x00800000 )<<8;
1650 z
= aSig
>>( - shiftCount
);
1651 if ( (uint32_t) ( aSig
<<( shiftCount
& 31 ) ) ) {
1652 status
->float_exception_flags
|= float_flag_inexact
;
1661 /*----------------------------------------------------------------------------
1662 | Returns the result of converting the single-precision floating-point value
1663 | `a' to the 64-bit two's complement integer format. The conversion is
1664 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1665 | Arithmetic---which means in particular that the conversion is rounded
1666 | according to the current rounding mode. If `a' is a NaN, the largest
1667 | positive integer is returned. Otherwise, if the conversion overflows, the
1668 | largest integer with the same sign as `a' is returned.
1669 *----------------------------------------------------------------------------*/
1671 int64_t float32_to_int64(float32 a
, float_status
*status
)
1677 uint64_t aSig64
, aSigExtra
;
1678 a
= float32_squash_input_denormal(a
, status
);
1680 aSig
= extractFloat32Frac( a
);
1681 aExp
= extractFloat32Exp( a
);
1682 aSign
= extractFloat32Sign( a
);
1683 shiftCount
= 0xBE - aExp
;
1684 if ( shiftCount
< 0 ) {
1685 float_raise(float_flag_invalid
, status
);
1686 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) {
1687 return LIT64( 0x7FFFFFFFFFFFFFFF );
1689 return (int64_t) LIT64( 0x8000000000000000 );
1691 if ( aExp
) aSig
|= 0x00800000;
1694 shift64ExtraRightJamming( aSig64
, 0, shiftCount
, &aSig64
, &aSigExtra
);
1695 return roundAndPackInt64(aSign
, aSig64
, aSigExtra
, status
);
1699 /*----------------------------------------------------------------------------
1700 | Returns the result of converting the single-precision floating-point value
1701 | `a' to the 64-bit unsigned integer format. The conversion is
1702 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1703 | Arithmetic---which means in particular that the conversion is rounded
1704 | according to the current rounding mode. If `a' is a NaN, the largest
1705 | unsigned integer is returned. Otherwise, if the conversion overflows, the
1706 | largest unsigned integer is returned. If the 'a' is negative, the result
1707 | is rounded and zero is returned; values that do not round to zero will
1708 | raise the inexact exception flag.
1709 *----------------------------------------------------------------------------*/
1711 uint64_t float32_to_uint64(float32 a
, float_status
*status
)
1717 uint64_t aSig64
, aSigExtra
;
1718 a
= float32_squash_input_denormal(a
, status
);
1720 aSig
= extractFloat32Frac(a
);
1721 aExp
= extractFloat32Exp(a
);
1722 aSign
= extractFloat32Sign(a
);
1723 if ((aSign
) && (aExp
> 126)) {
1724 float_raise(float_flag_invalid
, status
);
1725 if (float32_is_any_nan(a
)) {
1726 return LIT64(0xFFFFFFFFFFFFFFFF);
1731 shiftCount
= 0xBE - aExp
;
1735 if (shiftCount
< 0) {
1736 float_raise(float_flag_invalid
, status
);
1737 return LIT64(0xFFFFFFFFFFFFFFFF);
1742 shift64ExtraRightJamming(aSig64
, 0, shiftCount
, &aSig64
, &aSigExtra
);
1743 return roundAndPackUint64(aSign
, aSig64
, aSigExtra
, status
);
1746 /*----------------------------------------------------------------------------
1747 | Returns the result of converting the single-precision floating-point value
1748 | `a' to the 64-bit unsigned integer format. The conversion is
1749 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1750 | Arithmetic, except that the conversion is always rounded toward zero. If
1751 | `a' is a NaN, the largest unsigned integer is returned. Otherwise, if the
1752 | conversion overflows, the largest unsigned integer is returned. If the
1753 | 'a' is negative, the result is rounded and zero is returned; values that do
1754 | not round to zero will raise the inexact flag.
1755 *----------------------------------------------------------------------------*/
1757 uint64_t float32_to_uint64_round_to_zero(float32 a
, float_status
*status
)
1759 signed char current_rounding_mode
= status
->float_rounding_mode
;
1760 set_float_rounding_mode(float_round_to_zero
, status
);
1761 int64_t v
= float32_to_uint64(a
, status
);
1762 set_float_rounding_mode(current_rounding_mode
, status
);
1766 /*----------------------------------------------------------------------------
1767 | Returns the result of converting the single-precision floating-point value
1768 | `a' to the 64-bit two's complement integer format. The conversion is
1769 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1770 | Arithmetic, except that the conversion is always rounded toward zero. If
1771 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1772 | conversion overflows, the largest integer with the same sign as `a' is
1774 *----------------------------------------------------------------------------*/
1776 int64_t float32_to_int64_round_to_zero(float32 a
, float_status
*status
)
1784 a
= float32_squash_input_denormal(a
, status
);
1786 aSig
= extractFloat32Frac( a
);
1787 aExp
= extractFloat32Exp( a
);
1788 aSign
= extractFloat32Sign( a
);
1789 shiftCount
= aExp
- 0xBE;
1790 if ( 0 <= shiftCount
) {
1791 if ( float32_val(a
) != 0xDF000000 ) {
1792 float_raise(float_flag_invalid
, status
);
1793 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) {
1794 return LIT64( 0x7FFFFFFFFFFFFFFF );
1797 return (int64_t) LIT64( 0x8000000000000000 );
1799 else if ( aExp
<= 0x7E ) {
1801 status
->float_exception_flags
|= float_flag_inexact
;
1805 aSig64
= aSig
| 0x00800000;
1807 z
= aSig64
>>( - shiftCount
);
1808 if ( (uint64_t) ( aSig64
<<( shiftCount
& 63 ) ) ) {
1809 status
->float_exception_flags
|= float_flag_inexact
;
1811 if ( aSign
) z
= - z
;
1816 /*----------------------------------------------------------------------------
1817 | Returns the result of converting the single-precision floating-point value
1818 | `a' to the double-precision floating-point format. The conversion is
1819 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1821 *----------------------------------------------------------------------------*/
1823 float64
float32_to_float64(float32 a
, float_status
*status
)
1828 a
= float32_squash_input_denormal(a
, status
);
1830 aSig
= extractFloat32Frac( a
);
1831 aExp
= extractFloat32Exp( a
);
1832 aSign
= extractFloat32Sign( a
);
1833 if ( aExp
== 0xFF ) {
1835 return commonNaNToFloat64(float32ToCommonNaN(a
, status
), status
);
1837 return packFloat64( aSign
, 0x7FF, 0 );
1840 if ( aSig
== 0 ) return packFloat64( aSign
, 0, 0 );
1841 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1844 return packFloat64( aSign
, aExp
+ 0x380, ( (uint64_t) aSig
)<<29 );
1848 /*----------------------------------------------------------------------------
1849 | Returns the result of converting the single-precision floating-point value
1850 | `a' to the extended double-precision floating-point format. The conversion
1851 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1853 *----------------------------------------------------------------------------*/
1855 floatx80
float32_to_floatx80(float32 a
, float_status
*status
)
1861 a
= float32_squash_input_denormal(a
, status
);
1862 aSig
= extractFloat32Frac( a
);
1863 aExp
= extractFloat32Exp( a
);
1864 aSign
= extractFloat32Sign( a
);
1865 if ( aExp
== 0xFF ) {
1867 return commonNaNToFloatx80(float32ToCommonNaN(a
, status
), status
);
1869 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
1872 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
1873 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1876 return packFloatx80( aSign
, aExp
+ 0x3F80, ( (uint64_t) aSig
)<<40 );
1880 /*----------------------------------------------------------------------------
1881 | Returns the result of converting the single-precision floating-point value
1882 | `a' to the double-precision floating-point format. The conversion is
1883 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1885 *----------------------------------------------------------------------------*/
1887 float128
float32_to_float128(float32 a
, float_status
*status
)
1893 a
= float32_squash_input_denormal(a
, status
);
1894 aSig
= extractFloat32Frac( a
);
1895 aExp
= extractFloat32Exp( a
);
1896 aSign
= extractFloat32Sign( a
);
1897 if ( aExp
== 0xFF ) {
1899 return commonNaNToFloat128(float32ToCommonNaN(a
, status
), status
);
1901 return packFloat128( aSign
, 0x7FFF, 0, 0 );
1904 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
1905 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1908 return packFloat128( aSign
, aExp
+ 0x3F80, ( (uint64_t) aSig
)<<25, 0 );
1912 /*----------------------------------------------------------------------------
1913 | Rounds the single-precision floating-point value `a' to an integer, and
1914 | returns the result as a single-precision floating-point value. The
1915 | operation is performed according to the IEC/IEEE Standard for Binary
1916 | Floating-Point Arithmetic.
1917 *----------------------------------------------------------------------------*/
1919 float32
float32_round_to_int(float32 a
, float_status
*status
)
1923 uint32_t lastBitMask
, roundBitsMask
;
1925 a
= float32_squash_input_denormal(a
, status
);
1927 aExp
= extractFloat32Exp( a
);
1928 if ( 0x96 <= aExp
) {
1929 if ( ( aExp
== 0xFF ) && extractFloat32Frac( a
) ) {
1930 return propagateFloat32NaN(a
, a
, status
);
1934 if ( aExp
<= 0x7E ) {
1935 if ( (uint32_t) ( float32_val(a
)<<1 ) == 0 ) return a
;
1936 status
->float_exception_flags
|= float_flag_inexact
;
1937 aSign
= extractFloat32Sign( a
);
1938 switch (status
->float_rounding_mode
) {
1939 case float_round_nearest_even
:
1940 if ( ( aExp
== 0x7E ) && extractFloat32Frac( a
) ) {
1941 return packFloat32( aSign
, 0x7F, 0 );
1944 case float_round_ties_away
:
1946 return packFloat32(aSign
, 0x7F, 0);
1949 case float_round_down
:
1950 return make_float32(aSign
? 0xBF800000 : 0);
1951 case float_round_up
:
1952 return make_float32(aSign
? 0x80000000 : 0x3F800000);
1954 return packFloat32( aSign
, 0, 0 );
1957 lastBitMask
<<= 0x96 - aExp
;
1958 roundBitsMask
= lastBitMask
- 1;
1960 switch (status
->float_rounding_mode
) {
1961 case float_round_nearest_even
:
1962 z
+= lastBitMask
>>1;
1963 if ((z
& roundBitsMask
) == 0) {
1967 case float_round_ties_away
:
1968 z
+= lastBitMask
>> 1;
1970 case float_round_to_zero
:
1972 case float_round_up
:
1973 if (!extractFloat32Sign(make_float32(z
))) {
1977 case float_round_down
:
1978 if (extractFloat32Sign(make_float32(z
))) {
1985 z
&= ~ roundBitsMask
;
1986 if (z
!= float32_val(a
)) {
1987 status
->float_exception_flags
|= float_flag_inexact
;
1989 return make_float32(z
);
1993 /*----------------------------------------------------------------------------
1994 | Returns the result of adding the absolute values of the single-precision
1995 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1996 | before being returned. `zSign' is ignored if the result is a NaN.
1997 | The addition is performed according to the IEC/IEEE Standard for Binary
1998 | Floating-Point Arithmetic.
1999 *----------------------------------------------------------------------------*/
2001 static float32
addFloat32Sigs(float32 a
, float32 b
, flag zSign
,
2002 float_status
*status
)
2004 int aExp
, bExp
, zExp
;
2005 uint32_t aSig
, bSig
, zSig
;
2008 aSig
= extractFloat32Frac( a
);
2009 aExp
= extractFloat32Exp( a
);
2010 bSig
= extractFloat32Frac( b
);
2011 bExp
= extractFloat32Exp( b
);
2012 expDiff
= aExp
- bExp
;
2015 if ( 0 < expDiff
) {
2016 if ( aExp
== 0xFF ) {
2018 return propagateFloat32NaN(a
, b
, status
);
2028 shift32RightJamming( bSig
, expDiff
, &bSig
);
2031 else if ( expDiff
< 0 ) {
2032 if ( bExp
== 0xFF ) {
2034 return propagateFloat32NaN(a
, b
, status
);
2036 return packFloat32( zSign
, 0xFF, 0 );
2044 shift32RightJamming( aSig
, - expDiff
, &aSig
);
2048 if ( aExp
== 0xFF ) {
2050 return propagateFloat32NaN(a
, b
, status
);
2055 if (status
->flush_to_zero
) {
2057 float_raise(float_flag_output_denormal
, status
);
2059 return packFloat32(zSign
, 0, 0);
2061 return packFloat32( zSign
, 0, ( aSig
+ bSig
)>>6 );
2063 zSig
= 0x40000000 + aSig
+ bSig
;
2068 zSig
= ( aSig
+ bSig
)<<1;
2070 if ( (int32_t) zSig
< 0 ) {
2075 return roundAndPackFloat32(zSign
, zExp
, zSig
, status
);
2079 /*----------------------------------------------------------------------------
2080 | Returns the result of subtracting the absolute values of the single-
2081 | precision floating-point values `a' and `b'. If `zSign' is 1, the
2082 | difference is negated before being returned. `zSign' is ignored if the
2083 | result is a NaN. The subtraction is performed according to the IEC/IEEE
2084 | Standard for Binary Floating-Point Arithmetic.
2085 *----------------------------------------------------------------------------*/
2087 static float32
subFloat32Sigs(float32 a
, float32 b
, flag zSign
,
2088 float_status
*status
)
2090 int aExp
, bExp
, zExp
;
2091 uint32_t aSig
, bSig
, zSig
;
2094 aSig
= extractFloat32Frac( a
);
2095 aExp
= extractFloat32Exp( a
);
2096 bSig
= extractFloat32Frac( b
);
2097 bExp
= extractFloat32Exp( b
);
2098 expDiff
= aExp
- bExp
;
2101 if ( 0 < expDiff
) goto aExpBigger
;
2102 if ( expDiff
< 0 ) goto bExpBigger
;
2103 if ( aExp
== 0xFF ) {
2105 return propagateFloat32NaN(a
, b
, status
);
2107 float_raise(float_flag_invalid
, status
);
2108 return float32_default_nan(status
);
2114 if ( bSig
< aSig
) goto aBigger
;
2115 if ( aSig
< bSig
) goto bBigger
;
2116 return packFloat32(status
->float_rounding_mode
== float_round_down
, 0, 0);
2118 if ( bExp
== 0xFF ) {
2120 return propagateFloat32NaN(a
, b
, status
);
2122 return packFloat32( zSign
^ 1, 0xFF, 0 );
2130 shift32RightJamming( aSig
, - expDiff
, &aSig
);
2136 goto normalizeRoundAndPack
;
2138 if ( aExp
== 0xFF ) {
2140 return propagateFloat32NaN(a
, b
, status
);
2150 shift32RightJamming( bSig
, expDiff
, &bSig
);
2155 normalizeRoundAndPack
:
2157 return normalizeRoundAndPackFloat32(zSign
, zExp
, zSig
, status
);
2161 /*----------------------------------------------------------------------------
2162 | Returns the result of adding the single-precision floating-point values `a'
2163 | and `b'. The operation is performed according to the IEC/IEEE Standard for
2164 | Binary Floating-Point Arithmetic.
2165 *----------------------------------------------------------------------------*/
2167 float32
float32_add(float32 a
, float32 b
, float_status
*status
)
2170 a
= float32_squash_input_denormal(a
, status
);
2171 b
= float32_squash_input_denormal(b
, status
);
2173 aSign
= extractFloat32Sign( a
);
2174 bSign
= extractFloat32Sign( b
);
2175 if ( aSign
== bSign
) {
2176 return addFloat32Sigs(a
, b
, aSign
, status
);
2179 return subFloat32Sigs(a
, b
, aSign
, status
);
2184 /*----------------------------------------------------------------------------
2185 | Returns the result of subtracting the single-precision floating-point values
2186 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2187 | for Binary Floating-Point Arithmetic.
2188 *----------------------------------------------------------------------------*/
2190 float32
float32_sub(float32 a
, float32 b
, float_status
*status
)
2193 a
= float32_squash_input_denormal(a
, status
);
2194 b
= float32_squash_input_denormal(b
, status
);
2196 aSign
= extractFloat32Sign( a
);
2197 bSign
= extractFloat32Sign( b
);
2198 if ( aSign
== bSign
) {
2199 return subFloat32Sigs(a
, b
, aSign
, status
);
2202 return addFloat32Sigs(a
, b
, aSign
, status
);
2207 /*----------------------------------------------------------------------------
2208 | Returns the result of multiplying the single-precision floating-point values
2209 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2210 | for Binary Floating-Point Arithmetic.
2211 *----------------------------------------------------------------------------*/
2213 float32
float32_mul(float32 a
, float32 b
, float_status
*status
)
2215 flag aSign
, bSign
, zSign
;
2216 int aExp
, bExp
, zExp
;
2217 uint32_t aSig
, bSig
;
2221 a
= float32_squash_input_denormal(a
, status
);
2222 b
= float32_squash_input_denormal(b
, status
);
2224 aSig
= extractFloat32Frac( a
);
2225 aExp
= extractFloat32Exp( a
);
2226 aSign
= extractFloat32Sign( a
);
2227 bSig
= extractFloat32Frac( b
);
2228 bExp
= extractFloat32Exp( b
);
2229 bSign
= extractFloat32Sign( b
);
2230 zSign
= aSign
^ bSign
;
2231 if ( aExp
== 0xFF ) {
2232 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
2233 return propagateFloat32NaN(a
, b
, status
);
2235 if ( ( bExp
| bSig
) == 0 ) {
2236 float_raise(float_flag_invalid
, status
);
2237 return float32_default_nan(status
);
2239 return packFloat32( zSign
, 0xFF, 0 );
2241 if ( bExp
== 0xFF ) {
2243 return propagateFloat32NaN(a
, b
, status
);
2245 if ( ( aExp
| aSig
) == 0 ) {
2246 float_raise(float_flag_invalid
, status
);
2247 return float32_default_nan(status
);
2249 return packFloat32( zSign
, 0xFF, 0 );
2252 if ( aSig
== 0 ) return packFloat32( zSign
, 0, 0 );
2253 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
2256 if ( bSig
== 0 ) return packFloat32( zSign
, 0, 0 );
2257 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
2259 zExp
= aExp
+ bExp
- 0x7F;
2260 aSig
= ( aSig
| 0x00800000 )<<7;
2261 bSig
= ( bSig
| 0x00800000 )<<8;
2262 shift64RightJamming( ( (uint64_t) aSig
) * bSig
, 32, &zSig64
);
2264 if ( 0 <= (int32_t) ( zSig
<<1 ) ) {
2268 return roundAndPackFloat32(zSign
, zExp
, zSig
, status
);
2272 /*----------------------------------------------------------------------------
2273 | Returns the result of dividing the single-precision floating-point value `a'
2274 | by the corresponding value `b'. The operation is performed according to the
2275 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2276 *----------------------------------------------------------------------------*/
2278 float32
float32_div(float32 a
, float32 b
, float_status
*status
)
2280 flag aSign
, bSign
, zSign
;
2281 int aExp
, bExp
, zExp
;
2282 uint32_t aSig
, bSig
, zSig
;
2283 a
= float32_squash_input_denormal(a
, status
);
2284 b
= float32_squash_input_denormal(b
, status
);
2286 aSig
= extractFloat32Frac( a
);
2287 aExp
= extractFloat32Exp( a
);
2288 aSign
= extractFloat32Sign( a
);
2289 bSig
= extractFloat32Frac( b
);
2290 bExp
= extractFloat32Exp( b
);
2291 bSign
= extractFloat32Sign( b
);
2292 zSign
= aSign
^ bSign
;
2293 if ( aExp
== 0xFF ) {
2295 return propagateFloat32NaN(a
, b
, status
);
2297 if ( bExp
== 0xFF ) {
2299 return propagateFloat32NaN(a
, b
, status
);
2301 float_raise(float_flag_invalid
, status
);
2302 return float32_default_nan(status
);
2304 return packFloat32( zSign
, 0xFF, 0 );
2306 if ( bExp
== 0xFF ) {
2308 return propagateFloat32NaN(a
, b
, status
);
2310 return packFloat32( zSign
, 0, 0 );
2314 if ( ( aExp
| aSig
) == 0 ) {
2315 float_raise(float_flag_invalid
, status
);
2316 return float32_default_nan(status
);
2318 float_raise(float_flag_divbyzero
, status
);
2319 return packFloat32( zSign
, 0xFF, 0 );
2321 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
2324 if ( aSig
== 0 ) return packFloat32( zSign
, 0, 0 );
2325 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
2327 zExp
= aExp
- bExp
+ 0x7D;
2328 aSig
= ( aSig
| 0x00800000 )<<7;
2329 bSig
= ( bSig
| 0x00800000 )<<8;
2330 if ( bSig
<= ( aSig
+ aSig
) ) {
2334 zSig
= ( ( (uint64_t) aSig
)<<32 ) / bSig
;
2335 if ( ( zSig
& 0x3F ) == 0 ) {
2336 zSig
|= ( (uint64_t) bSig
* zSig
!= ( (uint64_t) aSig
)<<32 );
2338 return roundAndPackFloat32(zSign
, zExp
, zSig
, status
);
2342 /*----------------------------------------------------------------------------
2343 | Returns the remainder of the single-precision floating-point value `a'
2344 | with respect to the corresponding value `b'. The operation is performed
2345 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2346 *----------------------------------------------------------------------------*/
2348 float32
float32_rem(float32 a
, float32 b
, float_status
*status
)
2351 int aExp
, bExp
, expDiff
;
2352 uint32_t aSig
, bSig
;
2354 uint64_t aSig64
, bSig64
, q64
;
2355 uint32_t alternateASig
;
2357 a
= float32_squash_input_denormal(a
, status
);
2358 b
= float32_squash_input_denormal(b
, status
);
2360 aSig
= extractFloat32Frac( a
);
2361 aExp
= extractFloat32Exp( a
);
2362 aSign
= extractFloat32Sign( a
);
2363 bSig
= extractFloat32Frac( b
);
2364 bExp
= extractFloat32Exp( b
);
2365 if ( aExp
== 0xFF ) {
2366 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
2367 return propagateFloat32NaN(a
, b
, status
);
2369 float_raise(float_flag_invalid
, status
);
2370 return float32_default_nan(status
);
2372 if ( bExp
== 0xFF ) {
2374 return propagateFloat32NaN(a
, b
, status
);
2380 float_raise(float_flag_invalid
, status
);
2381 return float32_default_nan(status
);
2383 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
2386 if ( aSig
== 0 ) return a
;
2387 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
2389 expDiff
= aExp
- bExp
;
2392 if ( expDiff
< 32 ) {
2395 if ( expDiff
< 0 ) {
2396 if ( expDiff
< -1 ) return a
;
2399 q
= ( bSig
<= aSig
);
2400 if ( q
) aSig
-= bSig
;
2401 if ( 0 < expDiff
) {
2402 q
= ( ( (uint64_t) aSig
)<<32 ) / bSig
;
2405 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
2413 if ( bSig
<= aSig
) aSig
-= bSig
;
2414 aSig64
= ( (uint64_t) aSig
)<<40;
2415 bSig64
= ( (uint64_t) bSig
)<<40;
2417 while ( 0 < expDiff
) {
2418 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
2419 q64
= ( 2 < q64
) ? q64
- 2 : 0;
2420 aSig64
= - ( ( bSig
* q64
)<<38 );
2424 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
2425 q64
= ( 2 < q64
) ? q64
- 2 : 0;
2426 q
= q64
>>( 64 - expDiff
);
2428 aSig
= ( ( aSig64
>>33 )<<( expDiff
- 1 ) ) - bSig
* q
;
2431 alternateASig
= aSig
;
2434 } while ( 0 <= (int32_t) aSig
);
2435 sigMean
= aSig
+ alternateASig
;
2436 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
2437 aSig
= alternateASig
;
2439 zSign
= ( (int32_t) aSig
< 0 );
2440 if ( zSign
) aSig
= - aSig
;
2441 return normalizeRoundAndPackFloat32(aSign
^ zSign
, bExp
, aSig
, status
);
2444 /*----------------------------------------------------------------------------
2445 | Returns the result of multiplying the single-precision floating-point values
2446 | `a' and `b' then adding 'c', with no intermediate rounding step after the
2447 | multiplication. The operation is performed according to the IEC/IEEE
2448 | Standard for Binary Floating-Point Arithmetic 754-2008.
2449 | The flags argument allows the caller to select negation of the
2450 | addend, the intermediate product, or the final result. (The difference
2451 | between this and having the caller do a separate negation is that negating
2452 | externally will flip the sign bit on NaNs.)
2453 *----------------------------------------------------------------------------*/
2455 float32
float32_muladd(float32 a
, float32 b
, float32 c
, int flags
,
2456 float_status
*status
)
2458 flag aSign
, bSign
, cSign
, zSign
;
2459 int aExp
, bExp
, cExp
, pExp
, zExp
, expDiff
;
2460 uint32_t aSig
, bSig
, cSig
;
2461 flag pInf
, pZero
, pSign
;
2462 uint64_t pSig64
, cSig64
, zSig64
;
2465 flag signflip
, infzero
;
2467 a
= float32_squash_input_denormal(a
, status
);
2468 b
= float32_squash_input_denormal(b
, status
);
2469 c
= float32_squash_input_denormal(c
, status
);
2470 aSig
= extractFloat32Frac(a
);
2471 aExp
= extractFloat32Exp(a
);
2472 aSign
= extractFloat32Sign(a
);
2473 bSig
= extractFloat32Frac(b
);
2474 bExp
= extractFloat32Exp(b
);
2475 bSign
= extractFloat32Sign(b
);
2476 cSig
= extractFloat32Frac(c
);
2477 cExp
= extractFloat32Exp(c
);
2478 cSign
= extractFloat32Sign(c
);
2480 infzero
= ((aExp
== 0 && aSig
== 0 && bExp
== 0xff && bSig
== 0) ||
2481 (aExp
== 0xff && aSig
== 0 && bExp
== 0 && bSig
== 0));
2483 /* It is implementation-defined whether the cases of (0,inf,qnan)
2484 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
2485 * they return if they do), so we have to hand this information
2486 * off to the target-specific pick-a-NaN routine.
2488 if (((aExp
== 0xff) && aSig
) ||
2489 ((bExp
== 0xff) && bSig
) ||
2490 ((cExp
== 0xff) && cSig
)) {
2491 return propagateFloat32MulAddNaN(a
, b
, c
, infzero
, status
);
2495 float_raise(float_flag_invalid
, status
);
2496 return float32_default_nan(status
);
2499 if (flags
& float_muladd_negate_c
) {
2503 signflip
= (flags
& float_muladd_negate_result
) ? 1 : 0;
2505 /* Work out the sign and type of the product */
2506 pSign
= aSign
^ bSign
;
2507 if (flags
& float_muladd_negate_product
) {
2510 pInf
= (aExp
== 0xff) || (bExp
== 0xff);
2511 pZero
= ((aExp
| aSig
) == 0) || ((bExp
| bSig
) == 0);
2514 if (pInf
&& (pSign
^ cSign
)) {
2515 /* addition of opposite-signed infinities => InvalidOperation */
2516 float_raise(float_flag_invalid
, status
);
2517 return float32_default_nan(status
);
2519 /* Otherwise generate an infinity of the same sign */
2520 return packFloat32(cSign
^ signflip
, 0xff, 0);
2524 return packFloat32(pSign
^ signflip
, 0xff, 0);
2530 /* Adding two exact zeroes */
2531 if (pSign
== cSign
) {
2533 } else if (status
->float_rounding_mode
== float_round_down
) {
2538 return packFloat32(zSign
^ signflip
, 0, 0);
2540 /* Exact zero plus a denorm */
2541 if (status
->flush_to_zero
) {
2542 float_raise(float_flag_output_denormal
, status
);
2543 return packFloat32(cSign
^ signflip
, 0, 0);
2546 /* Zero plus something non-zero : just return the something */
2547 if (flags
& float_muladd_halve_result
) {
2549 normalizeFloat32Subnormal(cSig
, &cExp
, &cSig
);
2551 /* Subtract one to halve, and one again because roundAndPackFloat32
2552 * wants one less than the true exponent.
2555 cSig
= (cSig
| 0x00800000) << 7;
2556 return roundAndPackFloat32(cSign
^ signflip
, cExp
, cSig
, status
);
2558 return packFloat32(cSign
^ signflip
, cExp
, cSig
);
2562 normalizeFloat32Subnormal(aSig
, &aExp
, &aSig
);
2565 normalizeFloat32Subnormal(bSig
, &bExp
, &bSig
);
2568 /* Calculate the actual result a * b + c */
2570 /* Multiply first; this is easy. */
2571 /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
2572 * because we want the true exponent, not the "one-less-than"
2573 * flavour that roundAndPackFloat32() takes.
2575 pExp
= aExp
+ bExp
- 0x7e;
2576 aSig
= (aSig
| 0x00800000) << 7;
2577 bSig
= (bSig
| 0x00800000) << 8;
2578 pSig64
= (uint64_t)aSig
* bSig
;
2579 if ((int64_t)(pSig64
<< 1) >= 0) {
2584 zSign
= pSign
^ signflip
;
2586 /* Now pSig64 is the significand of the multiply, with the explicit bit in
2591 /* Throw out the special case of c being an exact zero now */
2592 shift64RightJamming(pSig64
, 32, &pSig64
);
2594 if (flags
& float_muladd_halve_result
) {
2597 return roundAndPackFloat32(zSign
, pExp
- 1,
2600 normalizeFloat32Subnormal(cSig
, &cExp
, &cSig
);
2603 cSig64
= (uint64_t)cSig
<< (62 - 23);
2604 cSig64
|= LIT64(0x4000000000000000);
2605 expDiff
= pExp
- cExp
;
2607 if (pSign
== cSign
) {
2610 /* scale c to match p */
2611 shift64RightJamming(cSig64
, expDiff
, &cSig64
);
2613 } else if (expDiff
< 0) {
2614 /* scale p to match c */
2615 shift64RightJamming(pSig64
, -expDiff
, &pSig64
);
2618 /* no scaling needed */
2621 /* Add significands and make sure explicit bit ends up in posn 62 */
2622 zSig64
= pSig64
+ cSig64
;
2623 if ((int64_t)zSig64
< 0) {
2624 shift64RightJamming(zSig64
, 1, &zSig64
);
2631 shift64RightJamming(cSig64
, expDiff
, &cSig64
);
2632 zSig64
= pSig64
- cSig64
;
2634 } else if (expDiff
< 0) {
2635 shift64RightJamming(pSig64
, -expDiff
, &pSig64
);
2636 zSig64
= cSig64
- pSig64
;
2641 if (cSig64
< pSig64
) {
2642 zSig64
= pSig64
- cSig64
;
2643 } else if (pSig64
< cSig64
) {
2644 zSig64
= cSig64
- pSig64
;
2649 if (status
->float_rounding_mode
== float_round_down
) {
2652 return packFloat32(zSign
, 0, 0);
2656 /* Normalize to put the explicit bit back into bit 62. */
2657 shiftcount
= countLeadingZeros64(zSig64
) - 1;
2658 zSig64
<<= shiftcount
;
2661 if (flags
& float_muladd_halve_result
) {
2665 shift64RightJamming(zSig64
, 32, &zSig64
);
2666 return roundAndPackFloat32(zSign
, zExp
, zSig64
, status
);
2670 /*----------------------------------------------------------------------------
2671 | Returns the square root of the single-precision floating-point value `a'.
2672 | The operation is performed according to the IEC/IEEE Standard for Binary
2673 | Floating-Point Arithmetic.
2674 *----------------------------------------------------------------------------*/
2676 float32
float32_sqrt(float32 a
, float_status
*status
)
2680 uint32_t aSig
, zSig
;
2682 a
= float32_squash_input_denormal(a
, status
);
2684 aSig
= extractFloat32Frac( a
);
2685 aExp
= extractFloat32Exp( a
);
2686 aSign
= extractFloat32Sign( a
);
2687 if ( aExp
== 0xFF ) {
2689 return propagateFloat32NaN(a
, float32_zero
, status
);
2691 if ( ! aSign
) return a
;
2692 float_raise(float_flag_invalid
, status
);
2693 return float32_default_nan(status
);
2696 if ( ( aExp
| aSig
) == 0 ) return a
;
2697 float_raise(float_flag_invalid
, status
);
2698 return float32_default_nan(status
);
2701 if ( aSig
== 0 ) return float32_zero
;
2702 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
2704 zExp
= ( ( aExp
- 0x7F )>>1 ) + 0x7E;
2705 aSig
= ( aSig
| 0x00800000 )<<8;
2706 zSig
= estimateSqrt32( aExp
, aSig
) + 2;
2707 if ( ( zSig
& 0x7F ) <= 5 ) {
2713 term
= ( (uint64_t) zSig
) * zSig
;
2714 rem
= ( ( (uint64_t) aSig
)<<32 ) - term
;
2715 while ( (int64_t) rem
< 0 ) {
2717 rem
+= ( ( (uint64_t) zSig
)<<1 ) | 1;
2719 zSig
|= ( rem
!= 0 );
2721 shift32RightJamming( zSig
, 1, &zSig
);
2723 return roundAndPackFloat32(0, zExp
, zSig
, status
);
2727 /*----------------------------------------------------------------------------
2728 | Returns the binary exponential of the single-precision floating-point value
2729 | `a'. The operation is performed according to the IEC/IEEE Standard for
2730 | Binary Floating-Point Arithmetic.
2732 | Uses the following identities:
2734 | 1. -------------------------------------------------------------------------
2738 | 2. -------------------------------------------------------------------------
2741 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2743 *----------------------------------------------------------------------------*/
2745 static const float64 float32_exp2_coefficients
[15] =
2747 const_float64( 0x3ff0000000000000ll
), /* 1 */
2748 const_float64( 0x3fe0000000000000ll
), /* 2 */
2749 const_float64( 0x3fc5555555555555ll
), /* 3 */
2750 const_float64( 0x3fa5555555555555ll
), /* 4 */
2751 const_float64( 0x3f81111111111111ll
), /* 5 */
2752 const_float64( 0x3f56c16c16c16c17ll
), /* 6 */
2753 const_float64( 0x3f2a01a01a01a01all
), /* 7 */
2754 const_float64( 0x3efa01a01a01a01all
), /* 8 */
2755 const_float64( 0x3ec71de3a556c734ll
), /* 9 */
2756 const_float64( 0x3e927e4fb7789f5cll
), /* 10 */
2757 const_float64( 0x3e5ae64567f544e4ll
), /* 11 */
2758 const_float64( 0x3e21eed8eff8d898ll
), /* 12 */
2759 const_float64( 0x3de6124613a86d09ll
), /* 13 */
2760 const_float64( 0x3da93974a8c07c9dll
), /* 14 */
2761 const_float64( 0x3d6ae7f3e733b81fll
), /* 15 */
2764 float32
float32_exp2(float32 a
, float_status
*status
)
2771 a
= float32_squash_input_denormal(a
, status
);
2773 aSig
= extractFloat32Frac( a
);
2774 aExp
= extractFloat32Exp( a
);
2775 aSign
= extractFloat32Sign( a
);
2777 if ( aExp
== 0xFF) {
2779 return propagateFloat32NaN(a
, float32_zero
, status
);
2781 return (aSign
) ? float32_zero
: a
;
2784 if (aSig
== 0) return float32_one
;
2787 float_raise(float_flag_inexact
, status
);
2789 /* ******************************* */
2790 /* using float64 for approximation */
2791 /* ******************************* */
2792 x
= float32_to_float64(a
, status
);
2793 x
= float64_mul(x
, float64_ln2
, status
);
2797 for (i
= 0 ; i
< 15 ; i
++) {
2800 f
= float64_mul(xn
, float32_exp2_coefficients
[i
], status
);
2801 r
= float64_add(r
, f
, status
);
2803 xn
= float64_mul(xn
, x
, status
);
2806 return float64_to_float32(r
, status
);
2809 /*----------------------------------------------------------------------------
2810 | Returns the binary log of the single-precision floating-point value `a'.
2811 | The operation is performed according to the IEC/IEEE Standard for Binary
2812 | Floating-Point Arithmetic.
2813 *----------------------------------------------------------------------------*/
2814 float32
float32_log2(float32 a
, float_status
*status
)
2818 uint32_t aSig
, zSig
, i
;
2820 a
= float32_squash_input_denormal(a
, status
);
2821 aSig
= extractFloat32Frac( a
);
2822 aExp
= extractFloat32Exp( a
);
2823 aSign
= extractFloat32Sign( a
);
2826 if ( aSig
== 0 ) return packFloat32( 1, 0xFF, 0 );
2827 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
2830 float_raise(float_flag_invalid
, status
);
2831 return float32_default_nan(status
);
2833 if ( aExp
== 0xFF ) {
2835 return propagateFloat32NaN(a
, float32_zero
, status
);
2845 for (i
= 1 << 22; i
> 0; i
>>= 1) {
2846 aSig
= ( (uint64_t)aSig
* aSig
) >> 23;
2847 if ( aSig
& 0x01000000 ) {
2856 return normalizeRoundAndPackFloat32(zSign
, 0x85, zSig
, status
);
2859 /*----------------------------------------------------------------------------
2860 | Returns 1 if the single-precision floating-point value `a' is equal to
2861 | the corresponding value `b', and 0 otherwise. The invalid exception is
2862 | raised if either operand is a NaN. Otherwise, the comparison is performed
2863 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2864 *----------------------------------------------------------------------------*/
2866 int float32_eq(float32 a
, float32 b
, float_status
*status
)
2869 a
= float32_squash_input_denormal(a
, status
);
2870 b
= float32_squash_input_denormal(b
, status
);
2872 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2873 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2875 float_raise(float_flag_invalid
, status
);
2878 av
= float32_val(a
);
2879 bv
= float32_val(b
);
2880 return ( av
== bv
) || ( (uint32_t) ( ( av
| bv
)<<1 ) == 0 );
2883 /*----------------------------------------------------------------------------
2884 | Returns 1 if the single-precision floating-point value `a' is less than
2885 | or equal to the corresponding value `b', and 0 otherwise. The invalid
2886 | exception is raised if either operand is a NaN. The comparison is performed
2887 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2888 *----------------------------------------------------------------------------*/
2890 int float32_le(float32 a
, float32 b
, float_status
*status
)
2894 a
= float32_squash_input_denormal(a
, status
);
2895 b
= float32_squash_input_denormal(b
, status
);
2897 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2898 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2900 float_raise(float_flag_invalid
, status
);
2903 aSign
= extractFloat32Sign( a
);
2904 bSign
= extractFloat32Sign( b
);
2905 av
= float32_val(a
);
2906 bv
= float32_val(b
);
2907 if ( aSign
!= bSign
) return aSign
|| ( (uint32_t) ( ( av
| bv
)<<1 ) == 0 );
2908 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
2912 /*----------------------------------------------------------------------------
2913 | Returns 1 if the single-precision floating-point value `a' is less than
2914 | the corresponding value `b', and 0 otherwise. The invalid exception is
2915 | raised if either operand is a NaN. The comparison is performed according
2916 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2917 *----------------------------------------------------------------------------*/
2919 int float32_lt(float32 a
, float32 b
, float_status
*status
)
2923 a
= float32_squash_input_denormal(a
, status
);
2924 b
= float32_squash_input_denormal(b
, status
);
2926 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2927 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2929 float_raise(float_flag_invalid
, status
);
2932 aSign
= extractFloat32Sign( a
);
2933 bSign
= extractFloat32Sign( b
);
2934 av
= float32_val(a
);
2935 bv
= float32_val(b
);
2936 if ( aSign
!= bSign
) return aSign
&& ( (uint32_t) ( ( av
| bv
)<<1 ) != 0 );
2937 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
2941 /*----------------------------------------------------------------------------
2942 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2943 | be compared, and 0 otherwise. The invalid exception is raised if either
2944 | operand is a NaN. The comparison is performed according to the IEC/IEEE
2945 | Standard for Binary Floating-Point Arithmetic.
2946 *----------------------------------------------------------------------------*/
2948 int float32_unordered(float32 a
, float32 b
, float_status
*status
)
2950 a
= float32_squash_input_denormal(a
, status
);
2951 b
= float32_squash_input_denormal(b
, status
);
2953 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2954 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2956 float_raise(float_flag_invalid
, status
);
2962 /*----------------------------------------------------------------------------
2963 | Returns 1 if the single-precision floating-point value `a' is equal to
2964 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2965 | exception. The comparison is performed according to the IEC/IEEE Standard
2966 | for Binary Floating-Point Arithmetic.
2967 *----------------------------------------------------------------------------*/
2969 int float32_eq_quiet(float32 a
, float32 b
, float_status
*status
)
2971 a
= float32_squash_input_denormal(a
, status
);
2972 b
= float32_squash_input_denormal(b
, status
);
2974 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2975 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2977 if (float32_is_signaling_nan(a
, status
)
2978 || float32_is_signaling_nan(b
, status
)) {
2979 float_raise(float_flag_invalid
, status
);
2983 return ( float32_val(a
) == float32_val(b
) ) ||
2984 ( (uint32_t) ( ( float32_val(a
) | float32_val(b
) )<<1 ) == 0 );
2987 /*----------------------------------------------------------------------------
2988 | Returns 1 if the single-precision floating-point value `a' is less than or
2989 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2990 | cause an exception. Otherwise, the comparison is performed according to the
2991 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2992 *----------------------------------------------------------------------------*/
2994 int float32_le_quiet(float32 a
, float32 b
, float_status
*status
)
2998 a
= float32_squash_input_denormal(a
, status
);
2999 b
= float32_squash_input_denormal(b
, status
);
3001 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
3002 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
3004 if (float32_is_signaling_nan(a
, status
)
3005 || float32_is_signaling_nan(b
, status
)) {
3006 float_raise(float_flag_invalid
, status
);
3010 aSign
= extractFloat32Sign( a
);
3011 bSign
= extractFloat32Sign( b
);
3012 av
= float32_val(a
);
3013 bv
= float32_val(b
);
3014 if ( aSign
!= bSign
) return aSign
|| ( (uint32_t) ( ( av
| bv
)<<1 ) == 0 );
3015 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
3019 /*----------------------------------------------------------------------------
3020 | Returns 1 if the single-precision floating-point value `a' is less than
3021 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3022 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
3023 | Standard for Binary Floating-Point Arithmetic.
3024 *----------------------------------------------------------------------------*/
3026 int float32_lt_quiet(float32 a
, float32 b
, float_status
*status
)
3030 a
= float32_squash_input_denormal(a
, status
);
3031 b
= float32_squash_input_denormal(b
, status
);
3033 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
3034 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
3036 if (float32_is_signaling_nan(a
, status
)
3037 || float32_is_signaling_nan(b
, status
)) {
3038 float_raise(float_flag_invalid
, status
);
3042 aSign
= extractFloat32Sign( a
);
3043 bSign
= extractFloat32Sign( b
);
3044 av
= float32_val(a
);
3045 bv
= float32_val(b
);
3046 if ( aSign
!= bSign
) return aSign
&& ( (uint32_t) ( ( av
| bv
)<<1 ) != 0 );
3047 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
3051 /*----------------------------------------------------------------------------
3052 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3053 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
3054 | comparison is performed according to the IEC/IEEE Standard for Binary
3055 | Floating-Point Arithmetic.
3056 *----------------------------------------------------------------------------*/
3058 int float32_unordered_quiet(float32 a
, float32 b
, float_status
*status
)
3060 a
= float32_squash_input_denormal(a
, status
);
3061 b
= float32_squash_input_denormal(b
, status
);
3063 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
3064 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
3066 if (float32_is_signaling_nan(a
, status
)
3067 || float32_is_signaling_nan(b
, status
)) {
3068 float_raise(float_flag_invalid
, status
);
3075 /*----------------------------------------------------------------------------
3076 | Returns the result of converting the double-precision floating-point value
3077 | `a' to the 32-bit two's complement integer format. The conversion is
3078 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3079 | Arithmetic---which means in particular that the conversion is rounded
3080 | according to the current rounding mode. If `a' is a NaN, the largest
3081 | positive integer is returned. Otherwise, if the conversion overflows, the
3082 | largest integer with the same sign as `a' is returned.
3083 *----------------------------------------------------------------------------*/
3085 int32_t float64_to_int32(float64 a
, float_status
*status
)
3091 a
= float64_squash_input_denormal(a
, status
);
3093 aSig
= extractFloat64Frac( a
);
3094 aExp
= extractFloat64Exp( a
);
3095 aSign
= extractFloat64Sign( a
);
3096 if ( ( aExp
== 0x7FF ) && aSig
) aSign
= 0;
3097 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
3098 shiftCount
= 0x42C - aExp
;
3099 if ( 0 < shiftCount
) shift64RightJamming( aSig
, shiftCount
, &aSig
);
3100 return roundAndPackInt32(aSign
, aSig
, status
);
3104 /*----------------------------------------------------------------------------
3105 | Returns the result of converting the double-precision floating-point value
3106 | `a' to the 32-bit two's complement integer format. The conversion is
3107 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3108 | Arithmetic, except that the conversion is always rounded toward zero.
3109 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
3110 | the conversion overflows, the largest integer with the same sign as `a' is
3112 *----------------------------------------------------------------------------*/
3114 int32_t float64_to_int32_round_to_zero(float64 a
, float_status
*status
)
3119 uint64_t aSig
, savedASig
;
3121 a
= float64_squash_input_denormal(a
, status
);
3123 aSig
= extractFloat64Frac( a
);
3124 aExp
= extractFloat64Exp( a
);
3125 aSign
= extractFloat64Sign( a
);
3126 if ( 0x41E < aExp
) {
3127 if ( ( aExp
== 0x7FF ) && aSig
) aSign
= 0;
3130 else if ( aExp
< 0x3FF ) {
3132 status
->float_exception_flags
|= float_flag_inexact
;
3136 aSig
|= LIT64( 0x0010000000000000 );
3137 shiftCount
= 0x433 - aExp
;
3139 aSig
>>= shiftCount
;
3141 if ( aSign
) z
= - z
;
3142 if ( ( z
< 0 ) ^ aSign
) {
3144 float_raise(float_flag_invalid
, status
);
3145 return aSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
3147 if ( ( aSig
<<shiftCount
) != savedASig
) {
3148 status
->float_exception_flags
|= float_flag_inexact
;
3154 /*----------------------------------------------------------------------------
3155 | Returns the result of converting the double-precision floating-point value
3156 | `a' to the 16-bit two's complement integer format. The conversion is
3157 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3158 | Arithmetic, except that the conversion is always rounded toward zero.
3159 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
3160 | the conversion overflows, the largest integer with the same sign as `a' is
3162 *----------------------------------------------------------------------------*/
3164 int16_t float64_to_int16_round_to_zero(float64 a
, float_status
*status
)
3169 uint64_t aSig
, savedASig
;
3172 aSig
= extractFloat64Frac( a
);
3173 aExp
= extractFloat64Exp( a
);
3174 aSign
= extractFloat64Sign( a
);
3175 if ( 0x40E < aExp
) {
3176 if ( ( aExp
== 0x7FF ) && aSig
) {
3181 else if ( aExp
< 0x3FF ) {
3182 if ( aExp
|| aSig
) {
3183 status
->float_exception_flags
|= float_flag_inexact
;
3187 aSig
|= LIT64( 0x0010000000000000 );
3188 shiftCount
= 0x433 - aExp
;
3190 aSig
>>= shiftCount
;
3195 if ( ( (int16_t)z
< 0 ) ^ aSign
) {
3197 float_raise(float_flag_invalid
, status
);
3198 return aSign
? (int32_t) 0xffff8000 : 0x7FFF;
3200 if ( ( aSig
<<shiftCount
) != savedASig
) {
3201 status
->float_exception_flags
|= float_flag_inexact
;
3206 /*----------------------------------------------------------------------------
3207 | Returns the result of converting the double-precision floating-point value
3208 | `a' to the 64-bit two's complement integer format. The conversion is
3209 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3210 | Arithmetic---which means in particular that the conversion is rounded
3211 | according to the current rounding mode. If `a' is a NaN, the largest
3212 | positive integer is returned. Otherwise, if the conversion overflows, the
3213 | largest integer with the same sign as `a' is returned.
3214 *----------------------------------------------------------------------------*/
3216 int64_t float64_to_int64(float64 a
, float_status
*status
)
3221 uint64_t aSig
, aSigExtra
;
3222 a
= float64_squash_input_denormal(a
, status
);
3224 aSig
= extractFloat64Frac( a
);
3225 aExp
= extractFloat64Exp( a
);
3226 aSign
= extractFloat64Sign( a
);
3227 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
3228 shiftCount
= 0x433 - aExp
;
3229 if ( shiftCount
<= 0 ) {
3230 if ( 0x43E < aExp
) {
3231 float_raise(float_flag_invalid
, status
);
3233 || ( ( aExp
== 0x7FF )
3234 && ( aSig
!= LIT64( 0x0010000000000000 ) ) )
3236 return LIT64( 0x7FFFFFFFFFFFFFFF );
3238 return (int64_t) LIT64( 0x8000000000000000 );
3241 aSig
<<= - shiftCount
;
3244 shift64ExtraRightJamming( aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
3246 return roundAndPackInt64(aSign
, aSig
, aSigExtra
, status
);
3250 /*----------------------------------------------------------------------------
3251 | Returns the result of converting the double-precision floating-point value
3252 | `a' to the 64-bit two's complement integer format. The conversion is
3253 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3254 | Arithmetic, except that the conversion is always rounded toward zero.
3255 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
3256 | the conversion overflows, the largest integer with the same sign as `a' is
3258 *----------------------------------------------------------------------------*/
3260 int64_t float64_to_int64_round_to_zero(float64 a
, float_status
*status
)
3267 a
= float64_squash_input_denormal(a
, status
);
3269 aSig
= extractFloat64Frac( a
);
3270 aExp
= extractFloat64Exp( a
);
3271 aSign
= extractFloat64Sign( a
);
3272 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
3273 shiftCount
= aExp
- 0x433;
3274 if ( 0 <= shiftCount
) {
3275 if ( 0x43E <= aExp
) {
3276 if ( float64_val(a
) != LIT64( 0xC3E0000000000000 ) ) {
3277 float_raise(float_flag_invalid
, status
);
3279 || ( ( aExp
== 0x7FF )
3280 && ( aSig
!= LIT64( 0x0010000000000000 ) ) )
3282 return LIT64( 0x7FFFFFFFFFFFFFFF );
3285 return (int64_t) LIT64( 0x8000000000000000 );
3287 z
= aSig
<<shiftCount
;
3290 if ( aExp
< 0x3FE ) {
3292 status
->float_exception_flags
|= float_flag_inexact
;
3296 z
= aSig
>>( - shiftCount
);
3297 if ( (uint64_t) ( aSig
<<( shiftCount
& 63 ) ) ) {
3298 status
->float_exception_flags
|= float_flag_inexact
;
3301 if ( aSign
) z
= - z
;
3306 /*----------------------------------------------------------------------------
3307 | Returns the result of converting the double-precision floating-point value
3308 | `a' to the single-precision floating-point format. The conversion is
3309 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3311 *----------------------------------------------------------------------------*/
3313 float32
float64_to_float32(float64 a
, float_status
*status
)
3319 a
= float64_squash_input_denormal(a
, status
);
3321 aSig
= extractFloat64Frac( a
);
3322 aExp
= extractFloat64Exp( a
);
3323 aSign
= extractFloat64Sign( a
);
3324 if ( aExp
== 0x7FF ) {
3326 return commonNaNToFloat32(float64ToCommonNaN(a
, status
), status
);
3328 return packFloat32( aSign
, 0xFF, 0 );
3330 shift64RightJamming( aSig
, 22, &aSig
);
3332 if ( aExp
|| zSig
) {
3336 return roundAndPackFloat32(aSign
, aExp
, zSig
, status
);
3341 /*----------------------------------------------------------------------------
3342 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3343 | half-precision floating-point value, returning the result. After being
3344 | shifted into the proper positions, the three fields are simply added
3345 | together to form the result. This means that any integer portion of `zSig'
3346 | will be added into the exponent. Since a properly normalized significand
3347 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3348 | than the desired result exponent whenever `zSig' is a complete, normalized
3350 *----------------------------------------------------------------------------*/
3351 static float16
packFloat16(flag zSign
, int zExp
, uint16_t zSig
)
3353 return make_float16(
3354 (((uint32_t)zSign
) << 15) + (((uint32_t)zExp
) << 10) + zSig
);
3357 /*----------------------------------------------------------------------------
3358 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3359 | and significand `zSig', and returns the proper half-precision floating-
3360 | point value corresponding to the abstract input. Ordinarily, the abstract
3361 | value is simply rounded and packed into the half-precision format, with
3362 | the inexact exception raised if the abstract input cannot be represented
3363 | exactly. However, if the abstract value is too large, the overflow and
3364 | inexact exceptions are raised and an infinity or maximal finite value is
3365 | returned. If the abstract value is too small, the input value is rounded to
3366 | a subnormal number, and the underflow and inexact exceptions are raised if
3367 | the abstract input cannot be represented exactly as a subnormal half-
3368 | precision floating-point number.
3369 | The `ieee' flag indicates whether to use IEEE standard half precision, or
3370 | ARM-style "alternative representation", which omits the NaN and Inf
3371 | encodings in order to raise the maximum representable exponent by one.
3372 | The input significand `zSig' has its binary point between bits 22
3373 | and 23, which is 13 bits to the left of the usual location. This shifted
3374 | significand must be normalized or smaller. If `zSig' is not normalized,
3375 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3376 | and it must not require rounding. In the usual case that `zSig' is
3377 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3378 | Note the slightly odd position of the binary point in zSig compared with the
3379 | other roundAndPackFloat functions. This should probably be fixed if we
3380 | need to implement more float16 routines than just conversion.
3381 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3382 | Binary Floating-Point Arithmetic.
3383 *----------------------------------------------------------------------------*/
3385 static float16
roundAndPackFloat16(flag zSign
, int zExp
,
3386 uint32_t zSig
, flag ieee
,
3387 float_status
*status
)
3389 int maxexp
= ieee
? 29 : 30;
3392 bool rounding_bumps_exp
;
3393 bool is_tiny
= false;
3395 /* Calculate the mask of bits of the mantissa which are not
3396 * representable in half-precision and will be lost.
3399 /* Will be denormal in halfprec */
3405 /* Normal number in halfprec */
3409 switch (status
->float_rounding_mode
) {
3410 case float_round_nearest_even
:
3411 increment
= (mask
+ 1) >> 1;
3412 if ((zSig
& mask
) == increment
) {
3413 increment
= zSig
& (increment
<< 1);
3416 case float_round_ties_away
:
3417 increment
= (mask
+ 1) >> 1;
3419 case float_round_up
:
3420 increment
= zSign
? 0 : mask
;
3422 case float_round_down
:
3423 increment
= zSign
? mask
: 0;
3425 default: /* round_to_zero */
3430 rounding_bumps_exp
= (zSig
+ increment
>= 0x01000000);
3432 if (zExp
> maxexp
|| (zExp
== maxexp
&& rounding_bumps_exp
)) {
3434 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
3435 return packFloat16(zSign
, 0x1f, 0);
3437 float_raise(float_flag_invalid
, status
);
3438 return packFloat16(zSign
, 0x1f, 0x3ff);
3443 /* Note that flush-to-zero does not affect half-precision results */
3445 (status
->float_detect_tininess
== float_tininess_before_rounding
)
3447 || (!rounding_bumps_exp
);
3450 float_raise(float_flag_inexact
, status
);
3452 float_raise(float_flag_underflow
, status
);
3457 if (rounding_bumps_exp
) {
3463 return packFloat16(zSign
, 0, 0);
3469 return packFloat16(zSign
, zExp
, zSig
>> 13);
3472 static void normalizeFloat16Subnormal(uint32_t aSig
, int *zExpPtr
,
3475 int8_t shiftCount
= countLeadingZeros32(aSig
) - 21;
3476 *zSigPtr
= aSig
<< shiftCount
;
3477 *zExpPtr
= 1 - shiftCount
;
3480 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
3481 The latter gains extra exponent range by omitting the NaN/Inf encodings. */
3483 float32
float16_to_float32(float16 a
, flag ieee
, float_status
*status
)
3489 aSign
= extractFloat16Sign(a
);
3490 aExp
= extractFloat16Exp(a
);
3491 aSig
= extractFloat16Frac(a
);
3493 if (aExp
== 0x1f && ieee
) {
3495 return commonNaNToFloat32(float16ToCommonNaN(a
, status
), status
);
3497 return packFloat32(aSign
, 0xff, 0);
3501 return packFloat32(aSign
, 0, 0);
3504 normalizeFloat16Subnormal(aSig
, &aExp
, &aSig
);
3507 return packFloat32( aSign
, aExp
+ 0x70, aSig
<< 13);
3510 float16
float32_to_float16(float32 a
, flag ieee
, float_status
*status
)
3516 a
= float32_squash_input_denormal(a
, status
);
3518 aSig
= extractFloat32Frac( a
);
3519 aExp
= extractFloat32Exp( a
);
3520 aSign
= extractFloat32Sign( a
);
3521 if ( aExp
== 0xFF ) {
3523 /* Input is a NaN */
3525 float_raise(float_flag_invalid
, status
);
3526 return packFloat16(aSign
, 0, 0);
3528 return commonNaNToFloat16(
3529 float32ToCommonNaN(a
, status
), status
);
3533 float_raise(float_flag_invalid
, status
);
3534 return packFloat16(aSign
, 0x1f, 0x3ff);
3536 return packFloat16(aSign
, 0x1f, 0);
3538 if (aExp
== 0 && aSig
== 0) {
3539 return packFloat16(aSign
, 0, 0);
3541 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3542 * even if the input is denormal; however this is harmless because
3543 * the largest possible single-precision denormal is still smaller
3544 * than the smallest representable half-precision denormal, and so we
3545 * will end up ignoring aSig and returning via the "always return zero"
3551 return roundAndPackFloat16(aSign
, aExp
, aSig
, ieee
, status
);
3554 float64
float16_to_float64(float16 a
, flag ieee
, float_status
*status
)
3560 aSign
= extractFloat16Sign(a
);
3561 aExp
= extractFloat16Exp(a
);
3562 aSig
= extractFloat16Frac(a
);
3564 if (aExp
== 0x1f && ieee
) {
3566 return commonNaNToFloat64(
3567 float16ToCommonNaN(a
, status
), status
);
3569 return packFloat64(aSign
, 0x7ff, 0);
3573 return packFloat64(aSign
, 0, 0);
3576 normalizeFloat16Subnormal(aSig
, &aExp
, &aSig
);
3579 return packFloat64(aSign
, aExp
+ 0x3f0, ((uint64_t)aSig
) << 42);
3582 float16
float64_to_float16(float64 a
, flag ieee
, float_status
*status
)
3589 a
= float64_squash_input_denormal(a
, status
);
3591 aSig
= extractFloat64Frac(a
);
3592 aExp
= extractFloat64Exp(a
);
3593 aSign
= extractFloat64Sign(a
);
3594 if (aExp
== 0x7FF) {
3596 /* Input is a NaN */
3598 float_raise(float_flag_invalid
, status
);
3599 return packFloat16(aSign
, 0, 0);
3601 return commonNaNToFloat16(
3602 float64ToCommonNaN(a
, status
), status
);
3606 float_raise(float_flag_invalid
, status
);
3607 return packFloat16(aSign
, 0x1f, 0x3ff);
3609 return packFloat16(aSign
, 0x1f, 0);
3611 shift64RightJamming(aSig
, 29, &aSig
);
3613 if (aExp
== 0 && zSig
== 0) {
3614 return packFloat16(aSign
, 0, 0);
3616 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3617 * even if the input is denormal; however this is harmless because
3618 * the largest possible single-precision denormal is still smaller
3619 * than the smallest representable half-precision denormal, and so we
3620 * will end up ignoring aSig and returning via the "always return zero"
3626 return roundAndPackFloat16(aSign
, aExp
, zSig
, ieee
, status
);
3629 /*----------------------------------------------------------------------------
3630 | Returns the result of converting the double-precision floating-point value
3631 | `a' to the extended double-precision floating-point format. The conversion
3632 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3634 *----------------------------------------------------------------------------*/
3636 floatx80
float64_to_floatx80(float64 a
, float_status
*status
)
3642 a
= float64_squash_input_denormal(a
, status
);
3643 aSig
= extractFloat64Frac( a
);
3644 aExp
= extractFloat64Exp( a
);
3645 aSign
= extractFloat64Sign( a
);
3646 if ( aExp
== 0x7FF ) {
3648 return commonNaNToFloatx80(float64ToCommonNaN(a
, status
), status
);
3650 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
3653 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
3654 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
3658 aSign
, aExp
+ 0x3C00, ( aSig
| LIT64( 0x0010000000000000 ) )<<11 );
3662 /*----------------------------------------------------------------------------
3663 | Returns the result of converting the double-precision floating-point value
3664 | `a' to the quadruple-precision floating-point format. The conversion is
3665 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3667 *----------------------------------------------------------------------------*/
3669 float128
float64_to_float128(float64 a
, float_status
*status
)
3673 uint64_t aSig
, zSig0
, zSig1
;
3675 a
= float64_squash_input_denormal(a
, status
);
3676 aSig
= extractFloat64Frac( a
);
3677 aExp
= extractFloat64Exp( a
);
3678 aSign
= extractFloat64Sign( a
);
3679 if ( aExp
== 0x7FF ) {
3681 return commonNaNToFloat128(float64ToCommonNaN(a
, status
), status
);
3683 return packFloat128( aSign
, 0x7FFF, 0, 0 );
3686 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
3687 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
3690 shift128Right( aSig
, 0, 4, &zSig0
, &zSig1
);
3691 return packFloat128( aSign
, aExp
+ 0x3C00, zSig0
, zSig1
);
3695 /*----------------------------------------------------------------------------
3696 | Rounds the double-precision floating-point value `a' to an integer, and
3697 | returns the result as a double-precision floating-point value. The
3698 | operation is performed according to the IEC/IEEE Standard for Binary
3699 | Floating-Point Arithmetic.
3700 *----------------------------------------------------------------------------*/
3702 float64
float64_round_to_int(float64 a
, float_status
*status
)
3706 uint64_t lastBitMask
, roundBitsMask
;
3708 a
= float64_squash_input_denormal(a
, status
);
3710 aExp
= extractFloat64Exp( a
);
3711 if ( 0x433 <= aExp
) {
3712 if ( ( aExp
== 0x7FF ) && extractFloat64Frac( a
) ) {
3713 return propagateFloat64NaN(a
, a
, status
);
3717 if ( aExp
< 0x3FF ) {
3718 if ( (uint64_t) ( float64_val(a
)<<1 ) == 0 ) return a
;
3719 status
->float_exception_flags
|= float_flag_inexact
;
3720 aSign
= extractFloat64Sign( a
);
3721 switch (status
->float_rounding_mode
) {
3722 case float_round_nearest_even
:
3723 if ( ( aExp
== 0x3FE ) && extractFloat64Frac( a
) ) {
3724 return packFloat64( aSign
, 0x3FF, 0 );
3727 case float_round_ties_away
:
3728 if (aExp
== 0x3FE) {
3729 return packFloat64(aSign
, 0x3ff, 0);
3732 case float_round_down
:
3733 return make_float64(aSign
? LIT64( 0xBFF0000000000000 ) : 0);
3734 case float_round_up
:
3735 return make_float64(
3736 aSign
? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
3738 return packFloat64( aSign
, 0, 0 );
3741 lastBitMask
<<= 0x433 - aExp
;
3742 roundBitsMask
= lastBitMask
- 1;
3744 switch (status
->float_rounding_mode
) {
3745 case float_round_nearest_even
:
3746 z
+= lastBitMask
>> 1;
3747 if ((z
& roundBitsMask
) == 0) {
3751 case float_round_ties_away
:
3752 z
+= lastBitMask
>> 1;
3754 case float_round_to_zero
:
3756 case float_round_up
:
3757 if (!extractFloat64Sign(make_float64(z
))) {
3761 case float_round_down
:
3762 if (extractFloat64Sign(make_float64(z
))) {
3769 z
&= ~ roundBitsMask
;
3770 if (z
!= float64_val(a
)) {
3771 status
->float_exception_flags
|= float_flag_inexact
;
3773 return make_float64(z
);
3777 float64
float64_trunc_to_int(float64 a
, float_status
*status
)
3781 oldmode
= status
->float_rounding_mode
;
3782 status
->float_rounding_mode
= float_round_to_zero
;
3783 res
= float64_round_to_int(a
, status
);
3784 status
->float_rounding_mode
= oldmode
;
3788 /*----------------------------------------------------------------------------
3789 | Returns the result of adding the absolute values of the double-precision
3790 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
3791 | before being returned. `zSign' is ignored if the result is a NaN.
3792 | The addition is performed according to the IEC/IEEE Standard for Binary
3793 | Floating-Point Arithmetic.
3794 *----------------------------------------------------------------------------*/
3796 static float64
addFloat64Sigs(float64 a
, float64 b
, flag zSign
,
3797 float_status
*status
)
3799 int aExp
, bExp
, zExp
;
3800 uint64_t aSig
, bSig
, zSig
;
3803 aSig
= extractFloat64Frac( a
);
3804 aExp
= extractFloat64Exp( a
);
3805 bSig
= extractFloat64Frac( b
);
3806 bExp
= extractFloat64Exp( b
);
3807 expDiff
= aExp
- bExp
;
3810 if ( 0 < expDiff
) {
3811 if ( aExp
== 0x7FF ) {
3813 return propagateFloat64NaN(a
, b
, status
);
3821 bSig
|= LIT64( 0x2000000000000000 );
3823 shift64RightJamming( bSig
, expDiff
, &bSig
);
3826 else if ( expDiff
< 0 ) {
3827 if ( bExp
== 0x7FF ) {
3829 return propagateFloat64NaN(a
, b
, status
);
3831 return packFloat64( zSign
, 0x7FF, 0 );
3837 aSig
|= LIT64( 0x2000000000000000 );
3839 shift64RightJamming( aSig
, - expDiff
, &aSig
);
3843 if ( aExp
== 0x7FF ) {
3845 return propagateFloat64NaN(a
, b
, status
);
3850 if (status
->flush_to_zero
) {
3852 float_raise(float_flag_output_denormal
, status
);
3854 return packFloat64(zSign
, 0, 0);
3856 return packFloat64( zSign
, 0, ( aSig
+ bSig
)>>9 );
3858 zSig
= LIT64( 0x4000000000000000 ) + aSig
+ bSig
;
3862 aSig
|= LIT64( 0x2000000000000000 );
3863 zSig
= ( aSig
+ bSig
)<<1;
3865 if ( (int64_t) zSig
< 0 ) {
3870 return roundAndPackFloat64(zSign
, zExp
, zSig
, status
);
3874 /*----------------------------------------------------------------------------
3875 | Returns the result of subtracting the absolute values of the double-
3876 | precision floating-point values `a' and `b'. If `zSign' is 1, the
3877 | difference is negated before being returned. `zSign' is ignored if the
3878 | result is a NaN. The subtraction is performed according to the IEC/IEEE
3879 | Standard for Binary Floating-Point Arithmetic.
3880 *----------------------------------------------------------------------------*/
3882 static float64
subFloat64Sigs(float64 a
, float64 b
, flag zSign
,
3883 float_status
*status
)
3885 int aExp
, bExp
, zExp
;
3886 uint64_t aSig
, bSig
, zSig
;
3889 aSig
= extractFloat64Frac( a
);
3890 aExp
= extractFloat64Exp( a
);
3891 bSig
= extractFloat64Frac( b
);
3892 bExp
= extractFloat64Exp( b
);
3893 expDiff
= aExp
- bExp
;
3896 if ( 0 < expDiff
) goto aExpBigger
;
3897 if ( expDiff
< 0 ) goto bExpBigger
;
3898 if ( aExp
== 0x7FF ) {
3900 return propagateFloat64NaN(a
, b
, status
);
3902 float_raise(float_flag_invalid
, status
);
3903 return float64_default_nan(status
);
3909 if ( bSig
< aSig
) goto aBigger
;
3910 if ( aSig
< bSig
) goto bBigger
;
3911 return packFloat64(status
->float_rounding_mode
== float_round_down
, 0, 0);
3913 if ( bExp
== 0x7FF ) {
3915 return propagateFloat64NaN(a
, b
, status
);
3917 return packFloat64( zSign
^ 1, 0x7FF, 0 );
3923 aSig
|= LIT64( 0x4000000000000000 );
3925 shift64RightJamming( aSig
, - expDiff
, &aSig
);
3926 bSig
|= LIT64( 0x4000000000000000 );
3931 goto normalizeRoundAndPack
;
3933 if ( aExp
== 0x7FF ) {
3935 return propagateFloat64NaN(a
, b
, status
);
3943 bSig
|= LIT64( 0x4000000000000000 );
3945 shift64RightJamming( bSig
, expDiff
, &bSig
);
3946 aSig
|= LIT64( 0x4000000000000000 );
3950 normalizeRoundAndPack
:
3952 return normalizeRoundAndPackFloat64(zSign
, zExp
, zSig
, status
);
3956 /*----------------------------------------------------------------------------
3957 | Returns the result of adding the double-precision floating-point values `a'
3958 | and `b'. The operation is performed according to the IEC/IEEE Standard for
3959 | Binary Floating-Point Arithmetic.
3960 *----------------------------------------------------------------------------*/
3962 float64
float64_add(float64 a
, float64 b
, float_status
*status
)
3965 a
= float64_squash_input_denormal(a
, status
);
3966 b
= float64_squash_input_denormal(b
, status
);
3968 aSign
= extractFloat64Sign( a
);
3969 bSign
= extractFloat64Sign( b
);
3970 if ( aSign
== bSign
) {
3971 return addFloat64Sigs(a
, b
, aSign
, status
);
3974 return subFloat64Sigs(a
, b
, aSign
, status
);
3979 /*----------------------------------------------------------------------------
3980 | Returns the result of subtracting the double-precision floating-point values
3981 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3982 | for Binary Floating-Point Arithmetic.
3983 *----------------------------------------------------------------------------*/
3985 float64
float64_sub(float64 a
, float64 b
, float_status
*status
)
3988 a
= float64_squash_input_denormal(a
, status
);
3989 b
= float64_squash_input_denormal(b
, status
);
3991 aSign
= extractFloat64Sign( a
);
3992 bSign
= extractFloat64Sign( b
);
3993 if ( aSign
== bSign
) {
3994 return subFloat64Sigs(a
, b
, aSign
, status
);
3997 return addFloat64Sigs(a
, b
, aSign
, status
);
4002 /*----------------------------------------------------------------------------
4003 | Returns the result of multiplying the double-precision floating-point values
4004 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
4005 | for Binary Floating-Point Arithmetic.
4006 *----------------------------------------------------------------------------*/
4008 float64
float64_mul(float64 a
, float64 b
, float_status
*status
)
4010 flag aSign
, bSign
, zSign
;
4011 int aExp
, bExp
, zExp
;
4012 uint64_t aSig
, bSig
, zSig0
, zSig1
;
4014 a
= float64_squash_input_denormal(a
, status
);
4015 b
= float64_squash_input_denormal(b
, status
);
4017 aSig
= extractFloat64Frac( a
);
4018 aExp
= extractFloat64Exp( a
);
4019 aSign
= extractFloat64Sign( a
);
4020 bSig
= extractFloat64Frac( b
);
4021 bExp
= extractFloat64Exp( b
);
4022 bSign
= extractFloat64Sign( b
);
4023 zSign
= aSign
^ bSign
;
4024 if ( aExp
== 0x7FF ) {
4025 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
4026 return propagateFloat64NaN(a
, b
, status
);
4028 if ( ( bExp
| bSig
) == 0 ) {
4029 float_raise(float_flag_invalid
, status
);
4030 return float64_default_nan(status
);
4032 return packFloat64( zSign
, 0x7FF, 0 );
4034 if ( bExp
== 0x7FF ) {
4036 return propagateFloat64NaN(a
, b
, status
);
4038 if ( ( aExp
| aSig
) == 0 ) {
4039 float_raise(float_flag_invalid
, status
);
4040 return float64_default_nan(status
);
4042 return packFloat64( zSign
, 0x7FF, 0 );
4045 if ( aSig
== 0 ) return packFloat64( zSign
, 0, 0 );
4046 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
4049 if ( bSig
== 0 ) return packFloat64( zSign
, 0, 0 );
4050 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
4052 zExp
= aExp
+ bExp
- 0x3FF;
4053 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<10;
4054 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
4055 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
4056 zSig0
|= ( zSig1
!= 0 );
4057 if ( 0 <= (int64_t) ( zSig0
<<1 ) ) {
4061 return roundAndPackFloat64(zSign
, zExp
, zSig0
, status
);
4065 /*----------------------------------------------------------------------------
4066 | Returns the result of dividing the double-precision floating-point value `a'
4067 | by the corresponding value `b'. The operation is performed according to
4068 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4069 *----------------------------------------------------------------------------*/
4071 float64
float64_div(float64 a
, float64 b
, float_status
*status
)
4073 flag aSign
, bSign
, zSign
;
4074 int aExp
, bExp
, zExp
;
4075 uint64_t aSig
, bSig
, zSig
;
4076 uint64_t rem0
, rem1
;
4077 uint64_t term0
, term1
;
4078 a
= float64_squash_input_denormal(a
, status
);
4079 b
= float64_squash_input_denormal(b
, status
);
4081 aSig
= extractFloat64Frac( a
);
4082 aExp
= extractFloat64Exp( a
);
4083 aSign
= extractFloat64Sign( a
);
4084 bSig
= extractFloat64Frac( b
);
4085 bExp
= extractFloat64Exp( b
);
4086 bSign
= extractFloat64Sign( b
);
4087 zSign
= aSign
^ bSign
;
4088 if ( aExp
== 0x7FF ) {
4090 return propagateFloat64NaN(a
, b
, status
);
4092 if ( bExp
== 0x7FF ) {
4094 return propagateFloat64NaN(a
, b
, status
);
4096 float_raise(float_flag_invalid
, status
);
4097 return float64_default_nan(status
);
4099 return packFloat64( zSign
, 0x7FF, 0 );
4101 if ( bExp
== 0x7FF ) {
4103 return propagateFloat64NaN(a
, b
, status
);
4105 return packFloat64( zSign
, 0, 0 );
4109 if ( ( aExp
| aSig
) == 0 ) {
4110 float_raise(float_flag_invalid
, status
);
4111 return float64_default_nan(status
);
4113 float_raise(float_flag_divbyzero
, status
);
4114 return packFloat64( zSign
, 0x7FF, 0 );
4116 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
4119 if ( aSig
== 0 ) return packFloat64( zSign
, 0, 0 );
4120 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
4122 zExp
= aExp
- bExp
+ 0x3FD;
4123 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<10;
4124 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
4125 if ( bSig
<= ( aSig
+ aSig
) ) {
4129 zSig
= estimateDiv128To64( aSig
, 0, bSig
);
4130 if ( ( zSig
& 0x1FF ) <= 2 ) {
4131 mul64To128( bSig
, zSig
, &term0
, &term1
);
4132 sub128( aSig
, 0, term0
, term1
, &rem0
, &rem1
);
4133 while ( (int64_t) rem0
< 0 ) {
4135 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
4137 zSig
|= ( rem1
!= 0 );
4139 return roundAndPackFloat64(zSign
, zExp
, zSig
, status
);
4143 /*----------------------------------------------------------------------------
4144 | Returns the remainder of the double-precision floating-point value `a'
4145 | with respect to the corresponding value `b'. The operation is performed
4146 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4147 *----------------------------------------------------------------------------*/
4149 float64
float64_rem(float64 a
, float64 b
, float_status
*status
)
4152 int aExp
, bExp
, expDiff
;
4153 uint64_t aSig
, bSig
;
4154 uint64_t q
, alternateASig
;
4157 a
= float64_squash_input_denormal(a
, status
);
4158 b
= float64_squash_input_denormal(b
, status
);
4159 aSig
= extractFloat64Frac( a
);
4160 aExp
= extractFloat64Exp( a
);
4161 aSign
= extractFloat64Sign( a
);
4162 bSig
= extractFloat64Frac( b
);
4163 bExp
= extractFloat64Exp( b
);
4164 if ( aExp
== 0x7FF ) {
4165 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
4166 return propagateFloat64NaN(a
, b
, status
);
4168 float_raise(float_flag_invalid
, status
);
4169 return float64_default_nan(status
);
4171 if ( bExp
== 0x7FF ) {
4173 return propagateFloat64NaN(a
, b
, status
);
4179 float_raise(float_flag_invalid
, status
);
4180 return float64_default_nan(status
);
4182 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
4185 if ( aSig
== 0 ) return a
;
4186 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
4188 expDiff
= aExp
- bExp
;
4189 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<11;
4190 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
4191 if ( expDiff
< 0 ) {
4192 if ( expDiff
< -1 ) return a
;
4195 q
= ( bSig
<= aSig
);
4196 if ( q
) aSig
-= bSig
;
4198 while ( 0 < expDiff
) {
4199 q
= estimateDiv128To64( aSig
, 0, bSig
);
4200 q
= ( 2 < q
) ? q
- 2 : 0;
4201 aSig
= - ( ( bSig
>>2 ) * q
);
4205 if ( 0 < expDiff
) {
4206 q
= estimateDiv128To64( aSig
, 0, bSig
);
4207 q
= ( 2 < q
) ? q
- 2 : 0;
4210 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
4217 alternateASig
= aSig
;
4220 } while ( 0 <= (int64_t) aSig
);
4221 sigMean
= aSig
+ alternateASig
;
4222 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
4223 aSig
= alternateASig
;
4225 zSign
= ( (int64_t) aSig
< 0 );
4226 if ( zSign
) aSig
= - aSig
;
4227 return normalizeRoundAndPackFloat64(aSign
^ zSign
, bExp
, aSig
, status
);
4231 /*----------------------------------------------------------------------------
4232 | Returns the result of multiplying the double-precision floating-point values
4233 | `a' and `b' then adding 'c', with no intermediate rounding step after the
4234 | multiplication. The operation is performed according to the IEC/IEEE
4235 | Standard for Binary Floating-Point Arithmetic 754-2008.
4236 | The flags argument allows the caller to select negation of the
4237 | addend, the intermediate product, or the final result. (The difference
4238 | between this and having the caller do a separate negation is that negating
4239 | externally will flip the sign bit on NaNs.)
4240 *----------------------------------------------------------------------------*/
4242 float64
float64_muladd(float64 a
, float64 b
, float64 c
, int flags
,
4243 float_status
*status
)
4245 flag aSign
, bSign
, cSign
, zSign
;
4246 int aExp
, bExp
, cExp
, pExp
, zExp
, expDiff
;
4247 uint64_t aSig
, bSig
, cSig
;
4248 flag pInf
, pZero
, pSign
;
4249 uint64_t pSig0
, pSig1
, cSig0
, cSig1
, zSig0
, zSig1
;
4251 flag signflip
, infzero
;
4253 a
= float64_squash_input_denormal(a
, status
);
4254 b
= float64_squash_input_denormal(b
, status
);
4255 c
= float64_squash_input_denormal(c
, status
);
4256 aSig
= extractFloat64Frac(a
);
4257 aExp
= extractFloat64Exp(a
);
4258 aSign
= extractFloat64Sign(a
);
4259 bSig
= extractFloat64Frac(b
);
4260 bExp
= extractFloat64Exp(b
);
4261 bSign
= extractFloat64Sign(b
);
4262 cSig
= extractFloat64Frac(c
);
4263 cExp
= extractFloat64Exp(c
);
4264 cSign
= extractFloat64Sign(c
);
4266 infzero
= ((aExp
== 0 && aSig
== 0 && bExp
== 0x7ff && bSig
== 0) ||
4267 (aExp
== 0x7ff && aSig
== 0 && bExp
== 0 && bSig
== 0));
4269 /* It is implementation-defined whether the cases of (0,inf,qnan)
4270 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
4271 * they return if they do), so we have to hand this information
4272 * off to the target-specific pick-a-NaN routine.
4274 if (((aExp
== 0x7ff) && aSig
) ||
4275 ((bExp
== 0x7ff) && bSig
) ||
4276 ((cExp
== 0x7ff) && cSig
)) {
4277 return propagateFloat64MulAddNaN(a
, b
, c
, infzero
, status
);
4281 float_raise(float_flag_invalid
, status
);
4282 return float64_default_nan(status
);
4285 if (flags
& float_muladd_negate_c
) {
4289 signflip
= (flags
& float_muladd_negate_result
) ? 1 : 0;
4291 /* Work out the sign and type of the product */
4292 pSign
= aSign
^ bSign
;
4293 if (flags
& float_muladd_negate_product
) {
4296 pInf
= (aExp
== 0x7ff) || (bExp
== 0x7ff);
4297 pZero
= ((aExp
| aSig
) == 0) || ((bExp
| bSig
) == 0);
4299 if (cExp
== 0x7ff) {
4300 if (pInf
&& (pSign
^ cSign
)) {
4301 /* addition of opposite-signed infinities => InvalidOperation */
4302 float_raise(float_flag_invalid
, status
);
4303 return float64_default_nan(status
);
4305 /* Otherwise generate an infinity of the same sign */
4306 return packFloat64(cSign
^ signflip
, 0x7ff, 0);
4310 return packFloat64(pSign
^ signflip
, 0x7ff, 0);
4316 /* Adding two exact zeroes */
4317 if (pSign
== cSign
) {
4319 } else if (status
->float_rounding_mode
== float_round_down
) {
4324 return packFloat64(zSign
^ signflip
, 0, 0);
4326 /* Exact zero plus a denorm */
4327 if (status
->flush_to_zero
) {
4328 float_raise(float_flag_output_denormal
, status
);
4329 return packFloat64(cSign
^ signflip
, 0, 0);
4332 /* Zero plus something non-zero : just return the something */
4333 if (flags
& float_muladd_halve_result
) {
4335 normalizeFloat64Subnormal(cSig
, &cExp
, &cSig
);
4337 /* Subtract one to halve, and one again because roundAndPackFloat64
4338 * wants one less than the true exponent.
4341 cSig
= (cSig
| 0x0010000000000000ULL
) << 10;
4342 return roundAndPackFloat64(cSign
^ signflip
, cExp
, cSig
, status
);
4344 return packFloat64(cSign
^ signflip
, cExp
, cSig
);
4348 normalizeFloat64Subnormal(aSig
, &aExp
, &aSig
);
4351 normalizeFloat64Subnormal(bSig
, &bExp
, &bSig
);
4354 /* Calculate the actual result a * b + c */
4356 /* Multiply first; this is easy. */
4357 /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff
4358 * because we want the true exponent, not the "one-less-than"
4359 * flavour that roundAndPackFloat64() takes.
4361 pExp
= aExp
+ bExp
- 0x3fe;
4362 aSig
= (aSig
| LIT64(0x0010000000000000))<<10;
4363 bSig
= (bSig
| LIT64(0x0010000000000000))<<11;
4364 mul64To128(aSig
, bSig
, &pSig0
, &pSig1
);
4365 if ((int64_t)(pSig0
<< 1) >= 0) {
4366 shortShift128Left(pSig0
, pSig1
, 1, &pSig0
, &pSig1
);
4370 zSign
= pSign
^ signflip
;
4372 /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit
4373 * bit in position 126.
4377 /* Throw out the special case of c being an exact zero now */
4378 shift128RightJamming(pSig0
, pSig1
, 64, &pSig0
, &pSig1
);
4379 if (flags
& float_muladd_halve_result
) {
4382 return roundAndPackFloat64(zSign
, pExp
- 1,
4385 normalizeFloat64Subnormal(cSig
, &cExp
, &cSig
);
4388 /* Shift cSig and add the explicit bit so [cSig0:cSig1] is the
4389 * significand of the addend, with the explicit bit in position 126.
4391 cSig0
= cSig
<< (126 - 64 - 52);
4393 cSig0
|= LIT64(0x4000000000000000);
4394 expDiff
= pExp
- cExp
;
4396 if (pSign
== cSign
) {
4399 /* scale c to match p */
4400 shift128RightJamming(cSig0
, cSig1
, expDiff
, &cSig0
, &cSig1
);
4402 } else if (expDiff
< 0) {
4403 /* scale p to match c */
4404 shift128RightJamming(pSig0
, pSig1
, -expDiff
, &pSig0
, &pSig1
);
4407 /* no scaling needed */
4410 /* Add significands and make sure explicit bit ends up in posn 126 */
4411 add128(pSig0
, pSig1
, cSig0
, cSig1
, &zSig0
, &zSig1
);
4412 if ((int64_t)zSig0
< 0) {
4413 shift128RightJamming(zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
4417 shift128RightJamming(zSig0
, zSig1
, 64, &zSig0
, &zSig1
);
4418 if (flags
& float_muladd_halve_result
) {
4421 return roundAndPackFloat64(zSign
, zExp
, zSig1
, status
);
4425 shift128RightJamming(cSig0
, cSig1
, expDiff
, &cSig0
, &cSig1
);
4426 sub128(pSig0
, pSig1
, cSig0
, cSig1
, &zSig0
, &zSig1
);
4428 } else if (expDiff
< 0) {
4429 shift128RightJamming(pSig0
, pSig1
, -expDiff
, &pSig0
, &pSig1
);
4430 sub128(cSig0
, cSig1
, pSig0
, pSig1
, &zSig0
, &zSig1
);
4435 if (lt128(cSig0
, cSig1
, pSig0
, pSig1
)) {
4436 sub128(pSig0
, pSig1
, cSig0
, cSig1
, &zSig0
, &zSig1
);
4437 } else if (lt128(pSig0
, pSig1
, cSig0
, cSig1
)) {
4438 sub128(cSig0
, cSig1
, pSig0
, pSig1
, &zSig0
, &zSig1
);
4443 if (status
->float_rounding_mode
== float_round_down
) {
4446 return packFloat64(zSign
, 0, 0);
4450 /* Do the equivalent of normalizeRoundAndPackFloat64() but
4451 * starting with the significand in a pair of uint64_t.
4454 shiftcount
= countLeadingZeros64(zSig0
) - 1;
4455 shortShift128Left(zSig0
, zSig1
, shiftcount
, &zSig0
, &zSig1
);
4461 shiftcount
= countLeadingZeros64(zSig1
);
4462 if (shiftcount
== 0) {
4463 zSig0
= (zSig1
>> 1) | (zSig1
& 1);
4467 zSig0
= zSig1
<< shiftcount
;
4468 zExp
-= (shiftcount
+ 64);
4471 if (flags
& float_muladd_halve_result
) {
4474 return roundAndPackFloat64(zSign
, zExp
, zSig0
, status
);
4478 /*----------------------------------------------------------------------------
4479 | Returns the square root of the double-precision floating-point value `a'.
4480 | The operation is performed according to the IEC/IEEE Standard for Binary
4481 | Floating-Point Arithmetic.
4482 *----------------------------------------------------------------------------*/
4484 float64
float64_sqrt(float64 a
, float_status
*status
)
4488 uint64_t aSig
, zSig
, doubleZSig
;
4489 uint64_t rem0
, rem1
, term0
, term1
;
4490 a
= float64_squash_input_denormal(a
, status
);
4492 aSig
= extractFloat64Frac( a
);
4493 aExp
= extractFloat64Exp( a
);
4494 aSign
= extractFloat64Sign( a
);
4495 if ( aExp
== 0x7FF ) {
4497 return propagateFloat64NaN(a
, a
, status
);
4499 if ( ! aSign
) return a
;
4500 float_raise(float_flag_invalid
, status
);
4501 return float64_default_nan(status
);
4504 if ( ( aExp
| aSig
) == 0 ) return a
;
4505 float_raise(float_flag_invalid
, status
);
4506 return float64_default_nan(status
);
4509 if ( aSig
== 0 ) return float64_zero
;
4510 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
4512 zExp
= ( ( aExp
- 0x3FF )>>1 ) + 0x3FE;
4513 aSig
|= LIT64( 0x0010000000000000 );
4514 zSig
= estimateSqrt32( aExp
, aSig
>>21 );
4515 aSig
<<= 9 - ( aExp
& 1 );
4516 zSig
= estimateDiv128To64( aSig
, 0, zSig
<<32 ) + ( zSig
<<30 );
4517 if ( ( zSig
& 0x1FF ) <= 5 ) {
4518 doubleZSig
= zSig
<<1;
4519 mul64To128( zSig
, zSig
, &term0
, &term1
);
4520 sub128( aSig
, 0, term0
, term1
, &rem0
, &rem1
);
4521 while ( (int64_t) rem0
< 0 ) {
4524 add128( rem0
, rem1
, zSig
>>63, doubleZSig
| 1, &rem0
, &rem1
);
4526 zSig
|= ( ( rem0
| rem1
) != 0 );
4528 return roundAndPackFloat64(0, zExp
, zSig
, status
);
4532 /*----------------------------------------------------------------------------
4533 | Returns the binary log of the double-precision floating-point value `a'.
4534 | The operation is performed according to the IEC/IEEE Standard for Binary
4535 | Floating-Point Arithmetic.
4536 *----------------------------------------------------------------------------*/
4537 float64
float64_log2(float64 a
, float_status
*status
)
4541 uint64_t aSig
, aSig0
, aSig1
, zSig
, i
;
4542 a
= float64_squash_input_denormal(a
, status
);
4544 aSig
= extractFloat64Frac( a
);
4545 aExp
= extractFloat64Exp( a
);
4546 aSign
= extractFloat64Sign( a
);
4549 if ( aSig
== 0 ) return packFloat64( 1, 0x7FF, 0 );
4550 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
4553 float_raise(float_flag_invalid
, status
);
4554 return float64_default_nan(status
);
4556 if ( aExp
== 0x7FF ) {
4558 return propagateFloat64NaN(a
, float64_zero
, status
);
4564 aSig
|= LIT64( 0x0010000000000000 );
4566 zSig
= (uint64_t)aExp
<< 52;
4567 for (i
= 1LL << 51; i
> 0; i
>>= 1) {
4568 mul64To128( aSig
, aSig
, &aSig0
, &aSig1
);
4569 aSig
= ( aSig0
<< 12 ) | ( aSig1
>> 52 );
4570 if ( aSig
& LIT64( 0x0020000000000000 ) ) {
4578 return normalizeRoundAndPackFloat64(zSign
, 0x408, zSig
, status
);
4581 /*----------------------------------------------------------------------------
4582 | Returns 1 if the double-precision floating-point value `a' is equal to the
4583 | corresponding value `b', and 0 otherwise. The invalid exception is raised
4584 | if either operand is a NaN. Otherwise, the comparison is performed
4585 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4586 *----------------------------------------------------------------------------*/
4588 int float64_eq(float64 a
, float64 b
, float_status
*status
)
4591 a
= float64_squash_input_denormal(a
, status
);
4592 b
= float64_squash_input_denormal(b
, status
);
4594 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4595 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4597 float_raise(float_flag_invalid
, status
);
4600 av
= float64_val(a
);
4601 bv
= float64_val(b
);
4602 return ( av
== bv
) || ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
4606 /*----------------------------------------------------------------------------
4607 | Returns 1 if the double-precision floating-point value `a' is less than or
4608 | equal to the corresponding value `b', and 0 otherwise. The invalid
4609 | exception is raised if either operand is a NaN. The comparison is performed
4610 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4611 *----------------------------------------------------------------------------*/
4613 int float64_le(float64 a
, float64 b
, float_status
*status
)
4617 a
= float64_squash_input_denormal(a
, status
);
4618 b
= float64_squash_input_denormal(b
, status
);
4620 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4621 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4623 float_raise(float_flag_invalid
, status
);
4626 aSign
= extractFloat64Sign( a
);
4627 bSign
= extractFloat64Sign( b
);
4628 av
= float64_val(a
);
4629 bv
= float64_val(b
);
4630 if ( aSign
!= bSign
) return aSign
|| ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
4631 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
4635 /*----------------------------------------------------------------------------
4636 | Returns 1 if the double-precision floating-point value `a' is less than
4637 | the corresponding value `b', and 0 otherwise. The invalid exception is
4638 | raised if either operand is a NaN. The comparison is performed according
4639 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4640 *----------------------------------------------------------------------------*/
4642 int float64_lt(float64 a
, float64 b
, float_status
*status
)
4647 a
= float64_squash_input_denormal(a
, status
);
4648 b
= float64_squash_input_denormal(b
, status
);
4649 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4650 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4652 float_raise(float_flag_invalid
, status
);
4655 aSign
= extractFloat64Sign( a
);
4656 bSign
= extractFloat64Sign( b
);
4657 av
= float64_val(a
);
4658 bv
= float64_val(b
);
4659 if ( aSign
!= bSign
) return aSign
&& ( (uint64_t) ( ( av
| bv
)<<1 ) != 0 );
4660 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
4664 /*----------------------------------------------------------------------------
4665 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4666 | be compared, and 0 otherwise. The invalid exception is raised if either
4667 | operand is a NaN. The comparison is performed according to the IEC/IEEE
4668 | Standard for Binary Floating-Point Arithmetic.
4669 *----------------------------------------------------------------------------*/
4671 int float64_unordered(float64 a
, float64 b
, float_status
*status
)
4673 a
= float64_squash_input_denormal(a
, status
);
4674 b
= float64_squash_input_denormal(b
, status
);
4676 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4677 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4679 float_raise(float_flag_invalid
, status
);
4685 /*----------------------------------------------------------------------------
4686 | Returns 1 if the double-precision floating-point value `a' is equal to the
4687 | corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4688 | exception.The comparison is performed according to the IEC/IEEE Standard
4689 | for Binary Floating-Point Arithmetic.
4690 *----------------------------------------------------------------------------*/
4692 int float64_eq_quiet(float64 a
, float64 b
, float_status
*status
)
4695 a
= float64_squash_input_denormal(a
, status
);
4696 b
= float64_squash_input_denormal(b
, status
);
4698 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4699 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4701 if (float64_is_signaling_nan(a
, status
)
4702 || float64_is_signaling_nan(b
, status
)) {
4703 float_raise(float_flag_invalid
, status
);
4707 av
= float64_val(a
);
4708 bv
= float64_val(b
);
4709 return ( av
== bv
) || ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
4713 /*----------------------------------------------------------------------------
4714 | Returns 1 if the double-precision floating-point value `a' is less than or
4715 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4716 | cause an exception. Otherwise, the comparison is performed according to the
4717 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4718 *----------------------------------------------------------------------------*/
4720 int float64_le_quiet(float64 a
, float64 b
, float_status
*status
)
4724 a
= float64_squash_input_denormal(a
, status
);
4725 b
= float64_squash_input_denormal(b
, status
);
4727 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4728 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4730 if (float64_is_signaling_nan(a
, status
)
4731 || float64_is_signaling_nan(b
, status
)) {
4732 float_raise(float_flag_invalid
, status
);
4736 aSign
= extractFloat64Sign( a
);
4737 bSign
= extractFloat64Sign( b
);
4738 av
= float64_val(a
);
4739 bv
= float64_val(b
);
4740 if ( aSign
!= bSign
) return aSign
|| ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
4741 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
4745 /*----------------------------------------------------------------------------
4746 | Returns 1 if the double-precision floating-point value `a' is less than
4747 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4748 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
4749 | Standard for Binary Floating-Point Arithmetic.
4750 *----------------------------------------------------------------------------*/
4752 int float64_lt_quiet(float64 a
, float64 b
, float_status
*status
)
4756 a
= float64_squash_input_denormal(a
, status
);
4757 b
= float64_squash_input_denormal(b
, status
);
4759 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4760 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4762 if (float64_is_signaling_nan(a
, status
)
4763 || float64_is_signaling_nan(b
, status
)) {
4764 float_raise(float_flag_invalid
, status
);
4768 aSign
= extractFloat64Sign( a
);
4769 bSign
= extractFloat64Sign( b
);
4770 av
= float64_val(a
);
4771 bv
= float64_val(b
);
4772 if ( aSign
!= bSign
) return aSign
&& ( (uint64_t) ( ( av
| bv
)<<1 ) != 0 );
4773 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
4777 /*----------------------------------------------------------------------------
4778 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4779 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
4780 | comparison is performed according to the IEC/IEEE Standard for Binary
4781 | Floating-Point Arithmetic.
4782 *----------------------------------------------------------------------------*/
4784 int float64_unordered_quiet(float64 a
, float64 b
, float_status
*status
)
4786 a
= float64_squash_input_denormal(a
, status
);
4787 b
= float64_squash_input_denormal(b
, status
);
4789 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4790 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4792 if (float64_is_signaling_nan(a
, status
)
4793 || float64_is_signaling_nan(b
, status
)) {
4794 float_raise(float_flag_invalid
, status
);
4801 /*----------------------------------------------------------------------------
4802 | Returns the result of converting the extended double-precision floating-
4803 | point value `a' to the 32-bit two's complement integer format. The
4804 | conversion is performed according to the IEC/IEEE Standard for Binary
4805 | Floating-Point Arithmetic---which means in particular that the conversion
4806 | is rounded according to the current rounding mode. If `a' is a NaN, the
4807 | largest positive integer is returned. Otherwise, if the conversion
4808 | overflows, the largest integer with the same sign as `a' is returned.
4809 *----------------------------------------------------------------------------*/
4811 int32_t floatx80_to_int32(floatx80 a
, float_status
*status
)
4814 int32_t aExp
, shiftCount
;
4817 if (floatx80_invalid_encoding(a
)) {
4818 float_raise(float_flag_invalid
, status
);
4821 aSig
= extractFloatx80Frac( a
);
4822 aExp
= extractFloatx80Exp( a
);
4823 aSign
= extractFloatx80Sign( a
);
4824 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) aSign
= 0;
4825 shiftCount
= 0x4037 - aExp
;
4826 if ( shiftCount
<= 0 ) shiftCount
= 1;
4827 shift64RightJamming( aSig
, shiftCount
, &aSig
);
4828 return roundAndPackInt32(aSign
, aSig
, status
);
4832 /*----------------------------------------------------------------------------
4833 | Returns the result of converting the extended double-precision floating-
4834 | point value `a' to the 32-bit two's complement integer format. The
4835 | conversion is performed according to the IEC/IEEE Standard for Binary
4836 | Floating-Point Arithmetic, except that the conversion is always rounded
4837 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4838 | Otherwise, if the conversion overflows, the largest integer with the same
4839 | sign as `a' is returned.
4840 *----------------------------------------------------------------------------*/
4842 int32_t floatx80_to_int32_round_to_zero(floatx80 a
, float_status
*status
)
4845 int32_t aExp
, shiftCount
;
4846 uint64_t aSig
, savedASig
;
4849 if (floatx80_invalid_encoding(a
)) {
4850 float_raise(float_flag_invalid
, status
);
4853 aSig
= extractFloatx80Frac( a
);
4854 aExp
= extractFloatx80Exp( a
);
4855 aSign
= extractFloatx80Sign( a
);
4856 if ( 0x401E < aExp
) {
4857 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) aSign
= 0;
4860 else if ( aExp
< 0x3FFF ) {
4862 status
->float_exception_flags
|= float_flag_inexact
;
4866 shiftCount
= 0x403E - aExp
;
4868 aSig
>>= shiftCount
;
4870 if ( aSign
) z
= - z
;
4871 if ( ( z
< 0 ) ^ aSign
) {
4873 float_raise(float_flag_invalid
, status
);
4874 return aSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
4876 if ( ( aSig
<<shiftCount
) != savedASig
) {
4877 status
->float_exception_flags
|= float_flag_inexact
;
4883 /*----------------------------------------------------------------------------
4884 | Returns the result of converting the extended double-precision floating-
4885 | point value `a' to the 64-bit two's complement integer format. The
4886 | conversion is performed according to the IEC/IEEE Standard for Binary
4887 | Floating-Point Arithmetic---which means in particular that the conversion
4888 | is rounded according to the current rounding mode. If `a' is a NaN,
4889 | the largest positive integer is returned. Otherwise, if the conversion
4890 | overflows, the largest integer with the same sign as `a' is returned.
4891 *----------------------------------------------------------------------------*/
4893 int64_t floatx80_to_int64(floatx80 a
, float_status
*status
)
4896 int32_t aExp
, shiftCount
;
4897 uint64_t aSig
, aSigExtra
;
4899 if (floatx80_invalid_encoding(a
)) {
4900 float_raise(float_flag_invalid
, status
);
4903 aSig
= extractFloatx80Frac( a
);
4904 aExp
= extractFloatx80Exp( a
);
4905 aSign
= extractFloatx80Sign( a
);
4906 shiftCount
= 0x403E - aExp
;
4907 if ( shiftCount
<= 0 ) {
4909 float_raise(float_flag_invalid
, status
);
4911 || ( ( aExp
== 0x7FFF )
4912 && ( aSig
!= LIT64( 0x8000000000000000 ) ) )
4914 return LIT64( 0x7FFFFFFFFFFFFFFF );
4916 return (int64_t) LIT64( 0x8000000000000000 );
4921 shift64ExtraRightJamming( aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
4923 return roundAndPackInt64(aSign
, aSig
, aSigExtra
, status
);
4927 /*----------------------------------------------------------------------------
4928 | Returns the result of converting the extended double-precision floating-
4929 | point value `a' to the 64-bit two's complement integer format. The
4930 | conversion is performed according to the IEC/IEEE Standard for Binary
4931 | Floating-Point Arithmetic, except that the conversion is always rounded
4932 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4933 | Otherwise, if the conversion overflows, the largest integer with the same
4934 | sign as `a' is returned.
4935 *----------------------------------------------------------------------------*/
4937 int64_t floatx80_to_int64_round_to_zero(floatx80 a
, float_status
*status
)
4940 int32_t aExp
, shiftCount
;
4944 if (floatx80_invalid_encoding(a
)) {
4945 float_raise(float_flag_invalid
, status
);
4948 aSig
= extractFloatx80Frac( a
);
4949 aExp
= extractFloatx80Exp( a
);
4950 aSign
= extractFloatx80Sign( a
);
4951 shiftCount
= aExp
- 0x403E;
4952 if ( 0 <= shiftCount
) {
4953 aSig
&= LIT64( 0x7FFFFFFFFFFFFFFF );
4954 if ( ( a
.high
!= 0xC03E ) || aSig
) {
4955 float_raise(float_flag_invalid
, status
);
4956 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && aSig
) ) {
4957 return LIT64( 0x7FFFFFFFFFFFFFFF );
4960 return (int64_t) LIT64( 0x8000000000000000 );
4962 else if ( aExp
< 0x3FFF ) {
4964 status
->float_exception_flags
|= float_flag_inexact
;
4968 z
= aSig
>>( - shiftCount
);
4969 if ( (uint64_t) ( aSig
<<( shiftCount
& 63 ) ) ) {
4970 status
->float_exception_flags
|= float_flag_inexact
;
4972 if ( aSign
) z
= - z
;
4977 /*----------------------------------------------------------------------------
4978 | Returns the result of converting the extended double-precision floating-
4979 | point value `a' to the single-precision floating-point format. The
4980 | conversion is performed according to the IEC/IEEE Standard for Binary
4981 | Floating-Point Arithmetic.
4982 *----------------------------------------------------------------------------*/
4984 float32
floatx80_to_float32(floatx80 a
, float_status
*status
)
4990 if (floatx80_invalid_encoding(a
)) {
4991 float_raise(float_flag_invalid
, status
);
4992 return float32_default_nan(status
);
4994 aSig
= extractFloatx80Frac( a
);
4995 aExp
= extractFloatx80Exp( a
);
4996 aSign
= extractFloatx80Sign( a
);
4997 if ( aExp
== 0x7FFF ) {
4998 if ( (uint64_t) ( aSig
<<1 ) ) {
4999 return commonNaNToFloat32(floatx80ToCommonNaN(a
, status
), status
);
5001 return packFloat32( aSign
, 0xFF, 0 );
5003 shift64RightJamming( aSig
, 33, &aSig
);
5004 if ( aExp
|| aSig
) aExp
-= 0x3F81;
5005 return roundAndPackFloat32(aSign
, aExp
, aSig
, status
);
5009 /*----------------------------------------------------------------------------
5010 | Returns the result of converting the extended double-precision floating-
5011 | point value `a' to the double-precision floating-point format. The
5012 | conversion is performed according to the IEC/IEEE Standard for Binary
5013 | Floating-Point Arithmetic.
5014 *----------------------------------------------------------------------------*/
5016 float64
floatx80_to_float64(floatx80 a
, float_status
*status
)
5020 uint64_t aSig
, zSig
;
5022 if (floatx80_invalid_encoding(a
)) {
5023 float_raise(float_flag_invalid
, status
);
5024 return float64_default_nan(status
);
5026 aSig
= extractFloatx80Frac( a
);
5027 aExp
= extractFloatx80Exp( a
);
5028 aSign
= extractFloatx80Sign( a
);
5029 if ( aExp
== 0x7FFF ) {
5030 if ( (uint64_t) ( aSig
<<1 ) ) {
5031 return commonNaNToFloat64(floatx80ToCommonNaN(a
, status
), status
);
5033 return packFloat64( aSign
, 0x7FF, 0 );
5035 shift64RightJamming( aSig
, 1, &zSig
);
5036 if ( aExp
|| aSig
) aExp
-= 0x3C01;
5037 return roundAndPackFloat64(aSign
, aExp
, zSig
, status
);
5041 /*----------------------------------------------------------------------------
5042 | Returns the result of converting the extended double-precision floating-
5043 | point value `a' to the quadruple-precision floating-point format. The
5044 | conversion is performed according to the IEC/IEEE Standard for Binary
5045 | Floating-Point Arithmetic.
5046 *----------------------------------------------------------------------------*/
5048 float128
floatx80_to_float128(floatx80 a
, float_status
*status
)
5052 uint64_t aSig
, zSig0
, zSig1
;
5054 if (floatx80_invalid_encoding(a
)) {
5055 float_raise(float_flag_invalid
, status
);
5056 return float128_default_nan(status
);
5058 aSig
= extractFloatx80Frac( a
);
5059 aExp
= extractFloatx80Exp( a
);
5060 aSign
= extractFloatx80Sign( a
);
5061 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) {
5062 return commonNaNToFloat128(floatx80ToCommonNaN(a
, status
), status
);
5064 shift128Right( aSig
<<1, 0, 16, &zSig0
, &zSig1
);
5065 return packFloat128( aSign
, aExp
, zSig0
, zSig1
);
5069 /*----------------------------------------------------------------------------
5070 | Rounds the extended double-precision floating-point value `a' to an integer,
5071 | and returns the result as an extended quadruple-precision floating-point
5072 | value. The operation is performed according to the IEC/IEEE Standard for
5073 | Binary Floating-Point Arithmetic.
5074 *----------------------------------------------------------------------------*/
5076 floatx80
floatx80_round_to_int(floatx80 a
, float_status
*status
)
5080 uint64_t lastBitMask
, roundBitsMask
;
5083 if (floatx80_invalid_encoding(a
)) {
5084 float_raise(float_flag_invalid
, status
);
5085 return floatx80_default_nan(status
);
5087 aExp
= extractFloatx80Exp( a
);
5088 if ( 0x403E <= aExp
) {
5089 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) ) {
5090 return propagateFloatx80NaN(a
, a
, status
);
5094 if ( aExp
< 0x3FFF ) {
5096 && ( (uint64_t) ( extractFloatx80Frac( a
)<<1 ) == 0 ) ) {
5099 status
->float_exception_flags
|= float_flag_inexact
;
5100 aSign
= extractFloatx80Sign( a
);
5101 switch (status
->float_rounding_mode
) {
5102 case float_round_nearest_even
:
5103 if ( ( aExp
== 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a
)<<1 )
5106 packFloatx80( aSign
, 0x3FFF, LIT64( 0x8000000000000000 ) );
5109 case float_round_ties_away
:
5110 if (aExp
== 0x3FFE) {
5111 return packFloatx80(aSign
, 0x3FFF, LIT64(0x8000000000000000));
5114 case float_round_down
:
5117 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
5118 : packFloatx80( 0, 0, 0 );
5119 case float_round_up
:
5121 aSign
? packFloatx80( 1, 0, 0 )
5122 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
5124 return packFloatx80( aSign
, 0, 0 );
5127 lastBitMask
<<= 0x403E - aExp
;
5128 roundBitsMask
= lastBitMask
- 1;
5130 switch (status
->float_rounding_mode
) {
5131 case float_round_nearest_even
:
5132 z
.low
+= lastBitMask
>>1;
5133 if ((z
.low
& roundBitsMask
) == 0) {
5134 z
.low
&= ~lastBitMask
;
5137 case float_round_ties_away
:
5138 z
.low
+= lastBitMask
>> 1;
5140 case float_round_to_zero
:
5142 case float_round_up
:
5143 if (!extractFloatx80Sign(z
)) {
5144 z
.low
+= roundBitsMask
;
5147 case float_round_down
:
5148 if (extractFloatx80Sign(z
)) {
5149 z
.low
+= roundBitsMask
;
5155 z
.low
&= ~ roundBitsMask
;
5158 z
.low
= LIT64( 0x8000000000000000 );
5160 if (z
.low
!= a
.low
) {
5161 status
->float_exception_flags
|= float_flag_inexact
;
5167 /*----------------------------------------------------------------------------
5168 | Returns the result of adding the absolute values of the extended double-
5169 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
5170 | negated before being returned. `zSign' is ignored if the result is a NaN.
5171 | The addition is performed according to the IEC/IEEE Standard for Binary
5172 | Floating-Point Arithmetic.
5173 *----------------------------------------------------------------------------*/
5175 static floatx80
addFloatx80Sigs(floatx80 a
, floatx80 b
, flag zSign
,
5176 float_status
*status
)
5178 int32_t aExp
, bExp
, zExp
;
5179 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5182 aSig
= extractFloatx80Frac( a
);
5183 aExp
= extractFloatx80Exp( a
);
5184 bSig
= extractFloatx80Frac( b
);
5185 bExp
= extractFloatx80Exp( b
);
5186 expDiff
= aExp
- bExp
;
5187 if ( 0 < expDiff
) {
5188 if ( aExp
== 0x7FFF ) {
5189 if ((uint64_t)(aSig
<< 1)) {
5190 return propagateFloatx80NaN(a
, b
, status
);
5194 if ( bExp
== 0 ) --expDiff
;
5195 shift64ExtraRightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
5198 else if ( expDiff
< 0 ) {
5199 if ( bExp
== 0x7FFF ) {
5200 if ((uint64_t)(bSig
<< 1)) {
5201 return propagateFloatx80NaN(a
, b
, status
);
5203 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
5205 if ( aExp
== 0 ) ++expDiff
;
5206 shift64ExtraRightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
5210 if ( aExp
== 0x7FFF ) {
5211 if ( (uint64_t) ( ( aSig
| bSig
)<<1 ) ) {
5212 return propagateFloatx80NaN(a
, b
, status
);
5217 zSig0
= aSig
+ bSig
;
5219 normalizeFloatx80Subnormal( zSig0
, &zExp
, &zSig0
);
5225 zSig0
= aSig
+ bSig
;
5226 if ( (int64_t) zSig0
< 0 ) goto roundAndPack
;
5228 shift64ExtraRightJamming( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
5229 zSig0
|= LIT64( 0x8000000000000000 );
5232 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5233 zSign
, zExp
, zSig0
, zSig1
, status
);
5236 /*----------------------------------------------------------------------------
5237 | Returns the result of subtracting the absolute values of the extended
5238 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
5239 | difference is negated before being returned. `zSign' is ignored if the
5240 | result is a NaN. The subtraction is performed according to the IEC/IEEE
5241 | Standard for Binary Floating-Point Arithmetic.
5242 *----------------------------------------------------------------------------*/
5244 static floatx80
subFloatx80Sigs(floatx80 a
, floatx80 b
, flag zSign
,
5245 float_status
*status
)
5247 int32_t aExp
, bExp
, zExp
;
5248 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5251 aSig
= extractFloatx80Frac( a
);
5252 aExp
= extractFloatx80Exp( a
);
5253 bSig
= extractFloatx80Frac( b
);
5254 bExp
= extractFloatx80Exp( b
);
5255 expDiff
= aExp
- bExp
;
5256 if ( 0 < expDiff
) goto aExpBigger
;
5257 if ( expDiff
< 0 ) goto bExpBigger
;
5258 if ( aExp
== 0x7FFF ) {
5259 if ( (uint64_t) ( ( aSig
| bSig
)<<1 ) ) {
5260 return propagateFloatx80NaN(a
, b
, status
);
5262 float_raise(float_flag_invalid
, status
);
5263 return floatx80_default_nan(status
);
5270 if ( bSig
< aSig
) goto aBigger
;
5271 if ( aSig
< bSig
) goto bBigger
;
5272 return packFloatx80(status
->float_rounding_mode
== float_round_down
, 0, 0);
5274 if ( bExp
== 0x7FFF ) {
5275 if ((uint64_t)(bSig
<< 1)) {
5276 return propagateFloatx80NaN(a
, b
, status
);
5278 return packFloatx80( zSign
^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
5280 if ( aExp
== 0 ) ++expDiff
;
5281 shift128RightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
5283 sub128( bSig
, 0, aSig
, zSig1
, &zSig0
, &zSig1
);
5286 goto normalizeRoundAndPack
;
5288 if ( aExp
== 0x7FFF ) {
5289 if ((uint64_t)(aSig
<< 1)) {
5290 return propagateFloatx80NaN(a
, b
, status
);
5294 if ( bExp
== 0 ) --expDiff
;
5295 shift128RightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
5297 sub128( aSig
, 0, bSig
, zSig1
, &zSig0
, &zSig1
);
5299 normalizeRoundAndPack
:
5300 return normalizeRoundAndPackFloatx80(status
->floatx80_rounding_precision
,
5301 zSign
, zExp
, zSig0
, zSig1
, status
);
5304 /*----------------------------------------------------------------------------
5305 | Returns the result of adding the extended double-precision floating-point
5306 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5307 | Standard for Binary Floating-Point Arithmetic.
5308 *----------------------------------------------------------------------------*/
5310 floatx80
floatx80_add(floatx80 a
, floatx80 b
, float_status
*status
)
5314 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5315 float_raise(float_flag_invalid
, status
);
5316 return floatx80_default_nan(status
);
5318 aSign
= extractFloatx80Sign( a
);
5319 bSign
= extractFloatx80Sign( b
);
5320 if ( aSign
== bSign
) {
5321 return addFloatx80Sigs(a
, b
, aSign
, status
);
5324 return subFloatx80Sigs(a
, b
, aSign
, status
);
5329 /*----------------------------------------------------------------------------
5330 | Returns the result of subtracting the extended double-precision floating-
5331 | point values `a' and `b'. The operation is performed according to the
5332 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5333 *----------------------------------------------------------------------------*/
5335 floatx80
floatx80_sub(floatx80 a
, floatx80 b
, float_status
*status
)
5339 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5340 float_raise(float_flag_invalid
, status
);
5341 return floatx80_default_nan(status
);
5343 aSign
= extractFloatx80Sign( a
);
5344 bSign
= extractFloatx80Sign( b
);
5345 if ( aSign
== bSign
) {
5346 return subFloatx80Sigs(a
, b
, aSign
, status
);
5349 return addFloatx80Sigs(a
, b
, aSign
, status
);
5354 /*----------------------------------------------------------------------------
5355 | Returns the result of multiplying the extended double-precision floating-
5356 | point values `a' and `b'. The operation is performed according to the
5357 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5358 *----------------------------------------------------------------------------*/
5360 floatx80
floatx80_mul(floatx80 a
, floatx80 b
, float_status
*status
)
5362 flag aSign
, bSign
, zSign
;
5363 int32_t aExp
, bExp
, zExp
;
5364 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5366 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5367 float_raise(float_flag_invalid
, status
);
5368 return floatx80_default_nan(status
);
5370 aSig
= extractFloatx80Frac( a
);
5371 aExp
= extractFloatx80Exp( a
);
5372 aSign
= extractFloatx80Sign( a
);
5373 bSig
= extractFloatx80Frac( b
);
5374 bExp
= extractFloatx80Exp( b
);
5375 bSign
= extractFloatx80Sign( b
);
5376 zSign
= aSign
^ bSign
;
5377 if ( aExp
== 0x7FFF ) {
5378 if ( (uint64_t) ( aSig
<<1 )
5379 || ( ( bExp
== 0x7FFF ) && (uint64_t) ( bSig
<<1 ) ) ) {
5380 return propagateFloatx80NaN(a
, b
, status
);
5382 if ( ( bExp
| bSig
) == 0 ) goto invalid
;
5383 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
5385 if ( bExp
== 0x7FFF ) {
5386 if ((uint64_t)(bSig
<< 1)) {
5387 return propagateFloatx80NaN(a
, b
, status
);
5389 if ( ( aExp
| aSig
) == 0 ) {
5391 float_raise(float_flag_invalid
, status
);
5392 return floatx80_default_nan(status
);
5394 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
5397 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
5398 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
5401 if ( bSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
5402 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
5404 zExp
= aExp
+ bExp
- 0x3FFE;
5405 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
5406 if ( 0 < (int64_t) zSig0
) {
5407 shortShift128Left( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
5410 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5411 zSign
, zExp
, zSig0
, zSig1
, status
);
5414 /*----------------------------------------------------------------------------
5415 | Returns the result of dividing the extended double-precision floating-point
5416 | value `a' by the corresponding value `b'. The operation is performed
5417 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5418 *----------------------------------------------------------------------------*/
5420 floatx80
floatx80_div(floatx80 a
, floatx80 b
, float_status
*status
)
5422 flag aSign
, bSign
, zSign
;
5423 int32_t aExp
, bExp
, zExp
;
5424 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5425 uint64_t rem0
, rem1
, rem2
, term0
, term1
, term2
;
5427 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5428 float_raise(float_flag_invalid
, status
);
5429 return floatx80_default_nan(status
);
5431 aSig
= extractFloatx80Frac( a
);
5432 aExp
= extractFloatx80Exp( a
);
5433 aSign
= extractFloatx80Sign( a
);
5434 bSig
= extractFloatx80Frac( b
);
5435 bExp
= extractFloatx80Exp( b
);
5436 bSign
= extractFloatx80Sign( b
);
5437 zSign
= aSign
^ bSign
;
5438 if ( aExp
== 0x7FFF ) {
5439 if ((uint64_t)(aSig
<< 1)) {
5440 return propagateFloatx80NaN(a
, b
, status
);
5442 if ( bExp
== 0x7FFF ) {
5443 if ((uint64_t)(bSig
<< 1)) {
5444 return propagateFloatx80NaN(a
, b
, status
);
5448 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
5450 if ( bExp
== 0x7FFF ) {
5451 if ((uint64_t)(bSig
<< 1)) {
5452 return propagateFloatx80NaN(a
, b
, status
);
5454 return packFloatx80( zSign
, 0, 0 );
5458 if ( ( aExp
| aSig
) == 0 ) {
5460 float_raise(float_flag_invalid
, status
);
5461 return floatx80_default_nan(status
);
5463 float_raise(float_flag_divbyzero
, status
);
5464 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
5466 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
5469 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
5470 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
5472 zExp
= aExp
- bExp
+ 0x3FFE;
5474 if ( bSig
<= aSig
) {
5475 shift128Right( aSig
, 0, 1, &aSig
, &rem1
);
5478 zSig0
= estimateDiv128To64( aSig
, rem1
, bSig
);
5479 mul64To128( bSig
, zSig0
, &term0
, &term1
);
5480 sub128( aSig
, rem1
, term0
, term1
, &rem0
, &rem1
);
5481 while ( (int64_t) rem0
< 0 ) {
5483 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
5485 zSig1
= estimateDiv128To64( rem1
, 0, bSig
);
5486 if ( (uint64_t) ( zSig1
<<1 ) <= 8 ) {
5487 mul64To128( bSig
, zSig1
, &term1
, &term2
);
5488 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
5489 while ( (int64_t) rem1
< 0 ) {
5491 add128( rem1
, rem2
, 0, bSig
, &rem1
, &rem2
);
5493 zSig1
|= ( ( rem1
| rem2
) != 0 );
5495 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5496 zSign
, zExp
, zSig0
, zSig1
, status
);
5499 /*----------------------------------------------------------------------------
5500 | Returns the remainder of the extended double-precision floating-point value
5501 | `a' with respect to the corresponding value `b'. The operation is performed
5502 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5503 *----------------------------------------------------------------------------*/
5505 floatx80
floatx80_rem(floatx80 a
, floatx80 b
, float_status
*status
)
5508 int32_t aExp
, bExp
, expDiff
;
5509 uint64_t aSig0
, aSig1
, bSig
;
5510 uint64_t q
, term0
, term1
, alternateASig0
, alternateASig1
;
5512 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5513 float_raise(float_flag_invalid
, status
);
5514 return floatx80_default_nan(status
);
5516 aSig0
= extractFloatx80Frac( a
);
5517 aExp
= extractFloatx80Exp( a
);
5518 aSign
= extractFloatx80Sign( a
);
5519 bSig
= extractFloatx80Frac( b
);
5520 bExp
= extractFloatx80Exp( b
);
5521 if ( aExp
== 0x7FFF ) {
5522 if ( (uint64_t) ( aSig0
<<1 )
5523 || ( ( bExp
== 0x7FFF ) && (uint64_t) ( bSig
<<1 ) ) ) {
5524 return propagateFloatx80NaN(a
, b
, status
);
5528 if ( bExp
== 0x7FFF ) {
5529 if ((uint64_t)(bSig
<< 1)) {
5530 return propagateFloatx80NaN(a
, b
, status
);
5537 float_raise(float_flag_invalid
, status
);
5538 return floatx80_default_nan(status
);
5540 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
5543 if ( (uint64_t) ( aSig0
<<1 ) == 0 ) return a
;
5544 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
5546 bSig
|= LIT64( 0x8000000000000000 );
5548 expDiff
= aExp
- bExp
;
5550 if ( expDiff
< 0 ) {
5551 if ( expDiff
< -1 ) return a
;
5552 shift128Right( aSig0
, 0, 1, &aSig0
, &aSig1
);
5555 q
= ( bSig
<= aSig0
);
5556 if ( q
) aSig0
-= bSig
;
5558 while ( 0 < expDiff
) {
5559 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
5560 q
= ( 2 < q
) ? q
- 2 : 0;
5561 mul64To128( bSig
, q
, &term0
, &term1
);
5562 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
5563 shortShift128Left( aSig0
, aSig1
, 62, &aSig0
, &aSig1
);
5567 if ( 0 < expDiff
) {
5568 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
5569 q
= ( 2 < q
) ? q
- 2 : 0;
5571 mul64To128( bSig
, q
<<( 64 - expDiff
), &term0
, &term1
);
5572 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
5573 shortShift128Left( 0, bSig
, 64 - expDiff
, &term0
, &term1
);
5574 while ( le128( term0
, term1
, aSig0
, aSig1
) ) {
5576 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
5583 sub128( term0
, term1
, aSig0
, aSig1
, &alternateASig0
, &alternateASig1
);
5584 if ( lt128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
5585 || ( eq128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
5588 aSig0
= alternateASig0
;
5589 aSig1
= alternateASig1
;
5593 normalizeRoundAndPackFloatx80(
5594 80, zSign
, bExp
+ expDiff
, aSig0
, aSig1
, status
);
5598 /*----------------------------------------------------------------------------
5599 | Returns the square root of the extended double-precision floating-point
5600 | value `a'. The operation is performed according to the IEC/IEEE Standard
5601 | for Binary Floating-Point Arithmetic.
5602 *----------------------------------------------------------------------------*/
5604 floatx80
floatx80_sqrt(floatx80 a
, float_status
*status
)
5608 uint64_t aSig0
, aSig1
, zSig0
, zSig1
, doubleZSig0
;
5609 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
5611 if (floatx80_invalid_encoding(a
)) {
5612 float_raise(float_flag_invalid
, status
);
5613 return floatx80_default_nan(status
);
5615 aSig0
= extractFloatx80Frac( a
);
5616 aExp
= extractFloatx80Exp( a
);
5617 aSign
= extractFloatx80Sign( a
);
5618 if ( aExp
== 0x7FFF ) {
5619 if ((uint64_t)(aSig0
<< 1)) {
5620 return propagateFloatx80NaN(a
, a
, status
);
5622 if ( ! aSign
) return a
;
5626 if ( ( aExp
| aSig0
) == 0 ) return a
;
5628 float_raise(float_flag_invalid
, status
);
5629 return floatx80_default_nan(status
);
5632 if ( aSig0
== 0 ) return packFloatx80( 0, 0, 0 );
5633 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
5635 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFF;
5636 zSig0
= estimateSqrt32( aExp
, aSig0
>>32 );
5637 shift128Right( aSig0
, 0, 2 + ( aExp
& 1 ), &aSig0
, &aSig1
);
5638 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
5639 doubleZSig0
= zSig0
<<1;
5640 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
5641 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
5642 while ( (int64_t) rem0
< 0 ) {
5645 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
5647 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
5648 if ( ( zSig1
& LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5649 if ( zSig1
== 0 ) zSig1
= 1;
5650 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
5651 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
5652 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
5653 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
5654 while ( (int64_t) rem1
< 0 ) {
5656 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
5658 term2
|= doubleZSig0
;
5659 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
5661 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
5663 shortShift128Left( 0, zSig1
, 1, &zSig0
, &zSig1
);
5664 zSig0
|= doubleZSig0
;
5665 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5666 0, zExp
, zSig0
, zSig1
, status
);
5669 /*----------------------------------------------------------------------------
5670 | Returns 1 if the extended double-precision floating-point value `a' is equal
5671 | to the corresponding value `b', and 0 otherwise. The invalid exception is
5672 | raised if either operand is a NaN. Otherwise, the comparison is performed
5673 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5674 *----------------------------------------------------------------------------*/
5676 int floatx80_eq(floatx80 a
, floatx80 b
, float_status
*status
)
5679 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)
5680 || (extractFloatx80Exp(a
) == 0x7FFF
5681 && (uint64_t) (extractFloatx80Frac(a
) << 1))
5682 || (extractFloatx80Exp(b
) == 0x7FFF
5683 && (uint64_t) (extractFloatx80Frac(b
) << 1))
5685 float_raise(float_flag_invalid
, status
);
5690 && ( ( a
.high
== b
.high
)
5692 && ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
5697 /*----------------------------------------------------------------------------
5698 | Returns 1 if the extended double-precision floating-point value `a' is
5699 | less than or equal to the corresponding value `b', and 0 otherwise. The
5700 | invalid exception is raised if either operand is a NaN. The comparison is
5701 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5703 *----------------------------------------------------------------------------*/
5705 int floatx80_le(floatx80 a
, floatx80 b
, float_status
*status
)
5709 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)
5710 || (extractFloatx80Exp(a
) == 0x7FFF
5711 && (uint64_t) (extractFloatx80Frac(a
) << 1))
5712 || (extractFloatx80Exp(b
) == 0x7FFF
5713 && (uint64_t) (extractFloatx80Frac(b
) << 1))
5715 float_raise(float_flag_invalid
, status
);
5718 aSign
= extractFloatx80Sign( a
);
5719 bSign
= extractFloatx80Sign( b
);
5720 if ( aSign
!= bSign
) {
5723 || ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5727 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
5728 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
5732 /*----------------------------------------------------------------------------
5733 | Returns 1 if the extended double-precision floating-point value `a' is
5734 | less than the corresponding value `b', and 0 otherwise. The invalid
5735 | exception is raised if either operand is a NaN. The comparison is performed
5736 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5737 *----------------------------------------------------------------------------*/
5739 int floatx80_lt(floatx80 a
, floatx80 b
, float_status
*status
)
5743 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)
5744 || (extractFloatx80Exp(a
) == 0x7FFF
5745 && (uint64_t) (extractFloatx80Frac(a
) << 1))
5746 || (extractFloatx80Exp(b
) == 0x7FFF
5747 && (uint64_t) (extractFloatx80Frac(b
) << 1))
5749 float_raise(float_flag_invalid
, status
);
5752 aSign
= extractFloatx80Sign( a
);
5753 bSign
= extractFloatx80Sign( b
);
5754 if ( aSign
!= bSign
) {
5757 && ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5761 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
5762 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
5766 /*----------------------------------------------------------------------------
5767 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5768 | cannot be compared, and 0 otherwise. The invalid exception is raised if
5769 | either operand is a NaN. The comparison is performed according to the
5770 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5771 *----------------------------------------------------------------------------*/
5772 int floatx80_unordered(floatx80 a
, floatx80 b
, float_status
*status
)
5774 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)
5775 || (extractFloatx80Exp(a
) == 0x7FFF
5776 && (uint64_t) (extractFloatx80Frac(a
) << 1))
5777 || (extractFloatx80Exp(b
) == 0x7FFF
5778 && (uint64_t) (extractFloatx80Frac(b
) << 1))
5780 float_raise(float_flag_invalid
, status
);
5786 /*----------------------------------------------------------------------------
5787 | Returns 1 if the extended double-precision floating-point value `a' is
5788 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5789 | cause an exception. The comparison is performed according to the IEC/IEEE
5790 | Standard for Binary Floating-Point Arithmetic.
5791 *----------------------------------------------------------------------------*/
5793 int floatx80_eq_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
5796 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5797 float_raise(float_flag_invalid
, status
);
5800 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5801 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5802 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5803 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5805 if (floatx80_is_signaling_nan(a
, status
)
5806 || floatx80_is_signaling_nan(b
, status
)) {
5807 float_raise(float_flag_invalid
, status
);
5813 && ( ( a
.high
== b
.high
)
5815 && ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
5820 /*----------------------------------------------------------------------------
5821 | Returns 1 if the extended double-precision floating-point value `a' is less
5822 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
5823 | do not cause an exception. Otherwise, the comparison is performed according
5824 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5825 *----------------------------------------------------------------------------*/
5827 int floatx80_le_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
5831 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5832 float_raise(float_flag_invalid
, status
);
5835 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5836 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5837 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5838 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5840 if (floatx80_is_signaling_nan(a
, status
)
5841 || floatx80_is_signaling_nan(b
, status
)) {
5842 float_raise(float_flag_invalid
, status
);
5846 aSign
= extractFloatx80Sign( a
);
5847 bSign
= extractFloatx80Sign( b
);
5848 if ( aSign
!= bSign
) {
5851 || ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5855 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
5856 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
5860 /*----------------------------------------------------------------------------
5861 | Returns 1 if the extended double-precision floating-point value `a' is less
5862 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
5863 | an exception. Otherwise, the comparison is performed according to the
5864 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5865 *----------------------------------------------------------------------------*/
5867 int floatx80_lt_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
5871 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5872 float_raise(float_flag_invalid
, status
);
5875 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5876 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5877 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5878 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5880 if (floatx80_is_signaling_nan(a
, status
)
5881 || floatx80_is_signaling_nan(b
, status
)) {
5882 float_raise(float_flag_invalid
, status
);
5886 aSign
= extractFloatx80Sign( a
);
5887 bSign
= extractFloatx80Sign( b
);
5888 if ( aSign
!= bSign
) {
5891 && ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5895 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
5896 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
5900 /*----------------------------------------------------------------------------
5901 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5902 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
5903 | The comparison is performed according to the IEC/IEEE Standard for Binary
5904 | Floating-Point Arithmetic.
5905 *----------------------------------------------------------------------------*/
5906 int floatx80_unordered_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
5908 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5909 float_raise(float_flag_invalid
, status
);
5912 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5913 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5914 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5915 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5917 if (floatx80_is_signaling_nan(a
, status
)
5918 || floatx80_is_signaling_nan(b
, status
)) {
5919 float_raise(float_flag_invalid
, status
);
5926 /*----------------------------------------------------------------------------
5927 | Returns the result of converting the quadruple-precision floating-point
5928 | value `a' to the 32-bit two's complement integer format. The conversion
5929 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5930 | Arithmetic---which means in particular that the conversion is rounded
5931 | according to the current rounding mode. If `a' is a NaN, the largest
5932 | positive integer is returned. Otherwise, if the conversion overflows, the
5933 | largest integer with the same sign as `a' is returned.
5934 *----------------------------------------------------------------------------*/
5936 int32_t float128_to_int32(float128 a
, float_status
*status
)
5939 int32_t aExp
, shiftCount
;
5940 uint64_t aSig0
, aSig1
;
5942 aSig1
= extractFloat128Frac1( a
);
5943 aSig0
= extractFloat128Frac0( a
);
5944 aExp
= extractFloat128Exp( a
);
5945 aSign
= extractFloat128Sign( a
);
5946 if ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) aSign
= 0;
5947 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
5948 aSig0
|= ( aSig1
!= 0 );
5949 shiftCount
= 0x4028 - aExp
;
5950 if ( 0 < shiftCount
) shift64RightJamming( aSig0
, shiftCount
, &aSig0
);
5951 return roundAndPackInt32(aSign
, aSig0
, status
);
5955 /*----------------------------------------------------------------------------
5956 | Returns the result of converting the quadruple-precision floating-point
5957 | value `a' to the 32-bit two's complement integer format. The conversion
5958 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5959 | Arithmetic, except that the conversion is always rounded toward zero. If
5960 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
5961 | conversion overflows, the largest integer with the same sign as `a' is
5963 *----------------------------------------------------------------------------*/
5965 int32_t float128_to_int32_round_to_zero(float128 a
, float_status
*status
)
5968 int32_t aExp
, shiftCount
;
5969 uint64_t aSig0
, aSig1
, savedASig
;
5972 aSig1
= extractFloat128Frac1( a
);
5973 aSig0
= extractFloat128Frac0( a
);
5974 aExp
= extractFloat128Exp( a
);
5975 aSign
= extractFloat128Sign( a
);
5976 aSig0
|= ( aSig1
!= 0 );
5977 if ( 0x401E < aExp
) {
5978 if ( ( aExp
== 0x7FFF ) && aSig0
) aSign
= 0;
5981 else if ( aExp
< 0x3FFF ) {
5982 if (aExp
|| aSig0
) {
5983 status
->float_exception_flags
|= float_flag_inexact
;
5987 aSig0
|= LIT64( 0x0001000000000000 );
5988 shiftCount
= 0x402F - aExp
;
5990 aSig0
>>= shiftCount
;
5992 if ( aSign
) z
= - z
;
5993 if ( ( z
< 0 ) ^ aSign
) {
5995 float_raise(float_flag_invalid
, status
);
5996 return aSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
5998 if ( ( aSig0
<<shiftCount
) != savedASig
) {
5999 status
->float_exception_flags
|= float_flag_inexact
;
6005 /*----------------------------------------------------------------------------
6006 | Returns the result of converting the quadruple-precision floating-point
6007 | value `a' to the 64-bit two's complement integer format. The conversion
6008 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6009 | Arithmetic---which means in particular that the conversion is rounded
6010 | according to the current rounding mode. If `a' is a NaN, the largest
6011 | positive integer is returned. Otherwise, if the conversion overflows, the
6012 | largest integer with the same sign as `a' is returned.
6013 *----------------------------------------------------------------------------*/
6015 int64_t float128_to_int64(float128 a
, float_status
*status
)
6018 int32_t aExp
, shiftCount
;
6019 uint64_t aSig0
, aSig1
;
6021 aSig1
= extractFloat128Frac1( a
);
6022 aSig0
= extractFloat128Frac0( a
);
6023 aExp
= extractFloat128Exp( a
);
6024 aSign
= extractFloat128Sign( a
);
6025 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
6026 shiftCount
= 0x402F - aExp
;
6027 if ( shiftCount
<= 0 ) {
6028 if ( 0x403E < aExp
) {
6029 float_raise(float_flag_invalid
, status
);
6031 || ( ( aExp
== 0x7FFF )
6032 && ( aSig1
|| ( aSig0
!= LIT64( 0x0001000000000000 ) ) )
6035 return LIT64( 0x7FFFFFFFFFFFFFFF );
6037 return (int64_t) LIT64( 0x8000000000000000 );
6039 shortShift128Left( aSig0
, aSig1
, - shiftCount
, &aSig0
, &aSig1
);
6042 shift64ExtraRightJamming( aSig0
, aSig1
, shiftCount
, &aSig0
, &aSig1
);
6044 return roundAndPackInt64(aSign
, aSig0
, aSig1
, status
);
6048 /*----------------------------------------------------------------------------
6049 | Returns the result of converting the quadruple-precision floating-point
6050 | value `a' to the 64-bit two's complement integer format. The conversion
6051 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6052 | Arithmetic, except that the conversion is always rounded toward zero.
6053 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
6054 | the conversion overflows, the largest integer with the same sign as `a' is
6056 *----------------------------------------------------------------------------*/
6058 int64_t float128_to_int64_round_to_zero(float128 a
, float_status
*status
)
6061 int32_t aExp
, shiftCount
;
6062 uint64_t aSig0
, aSig1
;
6065 aSig1
= extractFloat128Frac1( a
);
6066 aSig0
= extractFloat128Frac0( a
);
6067 aExp
= extractFloat128Exp( a
);
6068 aSign
= extractFloat128Sign( a
);
6069 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
6070 shiftCount
= aExp
- 0x402F;
6071 if ( 0 < shiftCount
) {
6072 if ( 0x403E <= aExp
) {
6073 aSig0
&= LIT64( 0x0000FFFFFFFFFFFF );
6074 if ( ( a
.high
== LIT64( 0xC03E000000000000 ) )
6075 && ( aSig1
< LIT64( 0x0002000000000000 ) ) ) {
6077 status
->float_exception_flags
|= float_flag_inexact
;
6081 float_raise(float_flag_invalid
, status
);
6082 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) ) {
6083 return LIT64( 0x7FFFFFFFFFFFFFFF );
6086 return (int64_t) LIT64( 0x8000000000000000 );
6088 z
= ( aSig0
<<shiftCount
) | ( aSig1
>>( ( - shiftCount
) & 63 ) );
6089 if ( (uint64_t) ( aSig1
<<shiftCount
) ) {
6090 status
->float_exception_flags
|= float_flag_inexact
;
6094 if ( aExp
< 0x3FFF ) {
6095 if ( aExp
| aSig0
| aSig1
) {
6096 status
->float_exception_flags
|= float_flag_inexact
;
6100 z
= aSig0
>>( - shiftCount
);
6102 || ( shiftCount
&& (uint64_t) ( aSig0
<<( shiftCount
& 63 ) ) ) ) {
6103 status
->float_exception_flags
|= float_flag_inexact
;
6106 if ( aSign
) z
= - z
;
6111 /*----------------------------------------------------------------------------
6112 | Returns the result of converting the quadruple-precision floating-point
6113 | value `a' to the single-precision floating-point format. The conversion
6114 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6116 *----------------------------------------------------------------------------*/
6118 float32
float128_to_float32(float128 a
, float_status
*status
)
6122 uint64_t aSig0
, aSig1
;
6125 aSig1
= extractFloat128Frac1( a
);
6126 aSig0
= extractFloat128Frac0( a
);
6127 aExp
= extractFloat128Exp( a
);
6128 aSign
= extractFloat128Sign( a
);
6129 if ( aExp
== 0x7FFF ) {
6130 if ( aSig0
| aSig1
) {
6131 return commonNaNToFloat32(float128ToCommonNaN(a
, status
), status
);
6133 return packFloat32( aSign
, 0xFF, 0 );
6135 aSig0
|= ( aSig1
!= 0 );
6136 shift64RightJamming( aSig0
, 18, &aSig0
);
6138 if ( aExp
|| zSig
) {
6142 return roundAndPackFloat32(aSign
, aExp
, zSig
, status
);
6146 /*----------------------------------------------------------------------------
6147 | Returns the result of converting the quadruple-precision floating-point
6148 | value `a' to the double-precision floating-point format. The conversion
6149 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6151 *----------------------------------------------------------------------------*/
6153 float64
float128_to_float64(float128 a
, float_status
*status
)
6157 uint64_t aSig0
, aSig1
;
6159 aSig1
= extractFloat128Frac1( a
);
6160 aSig0
= extractFloat128Frac0( a
);
6161 aExp
= extractFloat128Exp( a
);
6162 aSign
= extractFloat128Sign( a
);
6163 if ( aExp
== 0x7FFF ) {
6164 if ( aSig0
| aSig1
) {
6165 return commonNaNToFloat64(float128ToCommonNaN(a
, status
), status
);
6167 return packFloat64( aSign
, 0x7FF, 0 );
6169 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
6170 aSig0
|= ( aSig1
!= 0 );
6171 if ( aExp
|| aSig0
) {
6172 aSig0
|= LIT64( 0x4000000000000000 );
6175 return roundAndPackFloat64(aSign
, aExp
, aSig0
, status
);
6179 /*----------------------------------------------------------------------------
6180 | Returns the result of converting the quadruple-precision floating-point
6181 | value `a' to the extended double-precision floating-point format. The
6182 | conversion is performed according to the IEC/IEEE Standard for Binary
6183 | Floating-Point Arithmetic.
6184 *----------------------------------------------------------------------------*/
6186 floatx80
float128_to_floatx80(float128 a
, float_status
*status
)
6190 uint64_t aSig0
, aSig1
;
6192 aSig1
= extractFloat128Frac1( a
);
6193 aSig0
= extractFloat128Frac0( a
);
6194 aExp
= extractFloat128Exp( a
);
6195 aSign
= extractFloat128Sign( a
);
6196 if ( aExp
== 0x7FFF ) {
6197 if ( aSig0
| aSig1
) {
6198 return commonNaNToFloatx80(float128ToCommonNaN(a
, status
), status
);
6200 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
6203 if ( ( aSig0
| aSig1
) == 0 ) return packFloatx80( aSign
, 0, 0 );
6204 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6207 aSig0
|= LIT64( 0x0001000000000000 );
6209 shortShift128Left( aSig0
, aSig1
, 15, &aSig0
, &aSig1
);
6210 return roundAndPackFloatx80(80, aSign
, aExp
, aSig0
, aSig1
, status
);
6214 /*----------------------------------------------------------------------------
6215 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6216 | returns the result as a quadruple-precision floating-point value. The
6217 | operation is performed according to the IEC/IEEE Standard for Binary
6218 | Floating-Point Arithmetic.
6219 *----------------------------------------------------------------------------*/
6221 float128
float128_round_to_int(float128 a
, float_status
*status
)
6225 uint64_t lastBitMask
, roundBitsMask
;
6228 aExp
= extractFloat128Exp( a
);
6229 if ( 0x402F <= aExp
) {
6230 if ( 0x406F <= aExp
) {
6231 if ( ( aExp
== 0x7FFF )
6232 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) )
6234 return propagateFloat128NaN(a
, a
, status
);
6239 lastBitMask
= ( lastBitMask
<<( 0x406E - aExp
) )<<1;
6240 roundBitsMask
= lastBitMask
- 1;
6242 switch (status
->float_rounding_mode
) {
6243 case float_round_nearest_even
:
6244 if ( lastBitMask
) {
6245 add128( z
.high
, z
.low
, 0, lastBitMask
>>1, &z
.high
, &z
.low
);
6246 if ( ( z
.low
& roundBitsMask
) == 0 ) z
.low
&= ~ lastBitMask
;
6249 if ( (int64_t) z
.low
< 0 ) {
6251 if ( (uint64_t) ( z
.low
<<1 ) == 0 ) z
.high
&= ~1;
6255 case float_round_ties_away
:
6257 add128(z
.high
, z
.low
, 0, lastBitMask
>> 1, &z
.high
, &z
.low
);
6259 if ((int64_t) z
.low
< 0) {
6264 case float_round_to_zero
:
6266 case float_round_up
:
6267 if (!extractFloat128Sign(z
)) {
6268 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
6271 case float_round_down
:
6272 if (extractFloat128Sign(z
)) {
6273 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
6279 z
.low
&= ~ roundBitsMask
;
6282 if ( aExp
< 0x3FFF ) {
6283 if ( ( ( (uint64_t) ( a
.high
<<1 ) ) | a
.low
) == 0 ) return a
;
6284 status
->float_exception_flags
|= float_flag_inexact
;
6285 aSign
= extractFloat128Sign( a
);
6286 switch (status
->float_rounding_mode
) {
6287 case float_round_nearest_even
:
6288 if ( ( aExp
== 0x3FFE )
6289 && ( extractFloat128Frac0( a
)
6290 | extractFloat128Frac1( a
) )
6292 return packFloat128( aSign
, 0x3FFF, 0, 0 );
6295 case float_round_ties_away
:
6296 if (aExp
== 0x3FFE) {
6297 return packFloat128(aSign
, 0x3FFF, 0, 0);
6300 case float_round_down
:
6302 aSign
? packFloat128( 1, 0x3FFF, 0, 0 )
6303 : packFloat128( 0, 0, 0, 0 );
6304 case float_round_up
:
6306 aSign
? packFloat128( 1, 0, 0, 0 )
6307 : packFloat128( 0, 0x3FFF, 0, 0 );
6309 return packFloat128( aSign
, 0, 0, 0 );
6312 lastBitMask
<<= 0x402F - aExp
;
6313 roundBitsMask
= lastBitMask
- 1;
6316 switch (status
->float_rounding_mode
) {
6317 case float_round_nearest_even
:
6318 z
.high
+= lastBitMask
>>1;
6319 if ( ( ( z
.high
& roundBitsMask
) | a
.low
) == 0 ) {
6320 z
.high
&= ~ lastBitMask
;
6323 case float_round_ties_away
:
6324 z
.high
+= lastBitMask
>>1;
6326 case float_round_to_zero
:
6328 case float_round_up
:
6329 if (!extractFloat128Sign(z
)) {
6330 z
.high
|= ( a
.low
!= 0 );
6331 z
.high
+= roundBitsMask
;
6334 case float_round_down
:
6335 if (extractFloat128Sign(z
)) {
6336 z
.high
|= (a
.low
!= 0);
6337 z
.high
+= roundBitsMask
;
6343 z
.high
&= ~ roundBitsMask
;
6345 if ( ( z
.low
!= a
.low
) || ( z
.high
!= a
.high
) ) {
6346 status
->float_exception_flags
|= float_flag_inexact
;
6352 /*----------------------------------------------------------------------------
6353 | Returns the result of adding the absolute values of the quadruple-precision
6354 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
6355 | before being returned. `zSign' is ignored if the result is a NaN.
6356 | The addition is performed according to the IEC/IEEE Standard for Binary
6357 | Floating-Point Arithmetic.
6358 *----------------------------------------------------------------------------*/
6360 static float128
addFloat128Sigs(float128 a
, float128 b
, flag zSign
,
6361 float_status
*status
)
6363 int32_t aExp
, bExp
, zExp
;
6364 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
6367 aSig1
= extractFloat128Frac1( a
);
6368 aSig0
= extractFloat128Frac0( a
);
6369 aExp
= extractFloat128Exp( a
);
6370 bSig1
= extractFloat128Frac1( b
);
6371 bSig0
= extractFloat128Frac0( b
);
6372 bExp
= extractFloat128Exp( b
);
6373 expDiff
= aExp
- bExp
;
6374 if ( 0 < expDiff
) {
6375 if ( aExp
== 0x7FFF ) {
6376 if (aSig0
| aSig1
) {
6377 return propagateFloat128NaN(a
, b
, status
);
6385 bSig0
|= LIT64( 0x0001000000000000 );
6387 shift128ExtraRightJamming(
6388 bSig0
, bSig1
, 0, expDiff
, &bSig0
, &bSig1
, &zSig2
);
6391 else if ( expDiff
< 0 ) {
6392 if ( bExp
== 0x7FFF ) {
6393 if (bSig0
| bSig1
) {
6394 return propagateFloat128NaN(a
, b
, status
);
6396 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6402 aSig0
|= LIT64( 0x0001000000000000 );
6404 shift128ExtraRightJamming(
6405 aSig0
, aSig1
, 0, - expDiff
, &aSig0
, &aSig1
, &zSig2
);
6409 if ( aExp
== 0x7FFF ) {
6410 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
6411 return propagateFloat128NaN(a
, b
, status
);
6415 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
6417 if (status
->flush_to_zero
) {
6418 if (zSig0
| zSig1
) {
6419 float_raise(float_flag_output_denormal
, status
);
6421 return packFloat128(zSign
, 0, 0, 0);
6423 return packFloat128( zSign
, 0, zSig0
, zSig1
);
6426 zSig0
|= LIT64( 0x0002000000000000 );
6430 aSig0
|= LIT64( 0x0001000000000000 );
6431 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
6433 if ( zSig0
< LIT64( 0x0002000000000000 ) ) goto roundAndPack
;
6436 shift128ExtraRightJamming(
6437 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
6439 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
6443 /*----------------------------------------------------------------------------
6444 | Returns the result of subtracting the absolute values of the quadruple-
6445 | precision floating-point values `a' and `b'. If `zSign' is 1, the
6446 | difference is negated before being returned. `zSign' is ignored if the
6447 | result is a NaN. The subtraction is performed according to the IEC/IEEE
6448 | Standard for Binary Floating-Point Arithmetic.
6449 *----------------------------------------------------------------------------*/
6451 static float128
subFloat128Sigs(float128 a
, float128 b
, flag zSign
,
6452 float_status
*status
)
6454 int32_t aExp
, bExp
, zExp
;
6455 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
;
6458 aSig1
= extractFloat128Frac1( a
);
6459 aSig0
= extractFloat128Frac0( a
);
6460 aExp
= extractFloat128Exp( a
);
6461 bSig1
= extractFloat128Frac1( b
);
6462 bSig0
= extractFloat128Frac0( b
);
6463 bExp
= extractFloat128Exp( b
);
6464 expDiff
= aExp
- bExp
;
6465 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
6466 shortShift128Left( bSig0
, bSig1
, 14, &bSig0
, &bSig1
);
6467 if ( 0 < expDiff
) goto aExpBigger
;
6468 if ( expDiff
< 0 ) goto bExpBigger
;
6469 if ( aExp
== 0x7FFF ) {
6470 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
6471 return propagateFloat128NaN(a
, b
, status
);
6473 float_raise(float_flag_invalid
, status
);
6474 return float128_default_nan(status
);
6480 if ( bSig0
< aSig0
) goto aBigger
;
6481 if ( aSig0
< bSig0
) goto bBigger
;
6482 if ( bSig1
< aSig1
) goto aBigger
;
6483 if ( aSig1
< bSig1
) goto bBigger
;
6484 return packFloat128(status
->float_rounding_mode
== float_round_down
,
6487 if ( bExp
== 0x7FFF ) {
6488 if (bSig0
| bSig1
) {
6489 return propagateFloat128NaN(a
, b
, status
);
6491 return packFloat128( zSign
^ 1, 0x7FFF, 0, 0 );
6497 aSig0
|= LIT64( 0x4000000000000000 );
6499 shift128RightJamming( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
6500 bSig0
|= LIT64( 0x4000000000000000 );
6502 sub128( bSig0
, bSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
6505 goto normalizeRoundAndPack
;
6507 if ( aExp
== 0x7FFF ) {
6508 if (aSig0
| aSig1
) {
6509 return propagateFloat128NaN(a
, b
, status
);
6517 bSig0
|= LIT64( 0x4000000000000000 );
6519 shift128RightJamming( bSig0
, bSig1
, expDiff
, &bSig0
, &bSig1
);
6520 aSig0
|= LIT64( 0x4000000000000000 );
6522 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
6524 normalizeRoundAndPack
:
6526 return normalizeRoundAndPackFloat128(zSign
, zExp
- 14, zSig0
, zSig1
,
6531 /*----------------------------------------------------------------------------
6532 | Returns the result of adding the quadruple-precision floating-point values
6533 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
6534 | for Binary Floating-Point Arithmetic.
6535 *----------------------------------------------------------------------------*/
6537 float128
float128_add(float128 a
, float128 b
, float_status
*status
)
6541 aSign
= extractFloat128Sign( a
);
6542 bSign
= extractFloat128Sign( b
);
6543 if ( aSign
== bSign
) {
6544 return addFloat128Sigs(a
, b
, aSign
, status
);
6547 return subFloat128Sigs(a
, b
, aSign
, status
);
6552 /*----------------------------------------------------------------------------
6553 | Returns the result of subtracting the quadruple-precision floating-point
6554 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6555 | Standard for Binary Floating-Point Arithmetic.
6556 *----------------------------------------------------------------------------*/
6558 float128
float128_sub(float128 a
, float128 b
, float_status
*status
)
6562 aSign
= extractFloat128Sign( a
);
6563 bSign
= extractFloat128Sign( b
);
6564 if ( aSign
== bSign
) {
6565 return subFloat128Sigs(a
, b
, aSign
, status
);
6568 return addFloat128Sigs(a
, b
, aSign
, status
);
6573 /*----------------------------------------------------------------------------
6574 | Returns the result of multiplying the quadruple-precision floating-point
6575 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6576 | Standard for Binary Floating-Point Arithmetic.
6577 *----------------------------------------------------------------------------*/
6579 float128
float128_mul(float128 a
, float128 b
, float_status
*status
)
6581 flag aSign
, bSign
, zSign
;
6582 int32_t aExp
, bExp
, zExp
;
6583 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
, zSig3
;
6585 aSig1
= extractFloat128Frac1( a
);
6586 aSig0
= extractFloat128Frac0( a
);
6587 aExp
= extractFloat128Exp( a
);
6588 aSign
= extractFloat128Sign( a
);
6589 bSig1
= extractFloat128Frac1( b
);
6590 bSig0
= extractFloat128Frac0( b
);
6591 bExp
= extractFloat128Exp( b
);
6592 bSign
= extractFloat128Sign( b
);
6593 zSign
= aSign
^ bSign
;
6594 if ( aExp
== 0x7FFF ) {
6595 if ( ( aSig0
| aSig1
)
6596 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
6597 return propagateFloat128NaN(a
, b
, status
);
6599 if ( ( bExp
| bSig0
| bSig1
) == 0 ) goto invalid
;
6600 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6602 if ( bExp
== 0x7FFF ) {
6603 if (bSig0
| bSig1
) {
6604 return propagateFloat128NaN(a
, b
, status
);
6606 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
6608 float_raise(float_flag_invalid
, status
);
6609 return float128_default_nan(status
);
6611 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6614 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
6615 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6618 if ( ( bSig0
| bSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
6619 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
6621 zExp
= aExp
+ bExp
- 0x4000;
6622 aSig0
|= LIT64( 0x0001000000000000 );
6623 shortShift128Left( bSig0
, bSig1
, 16, &bSig0
, &bSig1
);
6624 mul128To256( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
, &zSig2
, &zSig3
);
6625 add128( zSig0
, zSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
6626 zSig2
|= ( zSig3
!= 0 );
6627 if ( LIT64( 0x0002000000000000 ) <= zSig0
) {
6628 shift128ExtraRightJamming(
6629 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
6632 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
6636 /*----------------------------------------------------------------------------
6637 | Returns the result of dividing the quadruple-precision floating-point value
6638 | `a' by the corresponding value `b'. The operation is performed according to
6639 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6640 *----------------------------------------------------------------------------*/
6642 float128
float128_div(float128 a
, float128 b
, float_status
*status
)
6644 flag aSign
, bSign
, zSign
;
6645 int32_t aExp
, bExp
, zExp
;
6646 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
6647 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
6649 aSig1
= extractFloat128Frac1( a
);
6650 aSig0
= extractFloat128Frac0( a
);
6651 aExp
= extractFloat128Exp( a
);
6652 aSign
= extractFloat128Sign( a
);
6653 bSig1
= extractFloat128Frac1( b
);
6654 bSig0
= extractFloat128Frac0( b
);
6655 bExp
= extractFloat128Exp( b
);
6656 bSign
= extractFloat128Sign( b
);
6657 zSign
= aSign
^ bSign
;
6658 if ( aExp
== 0x7FFF ) {
6659 if (aSig0
| aSig1
) {
6660 return propagateFloat128NaN(a
, b
, status
);
6662 if ( bExp
== 0x7FFF ) {
6663 if (bSig0
| bSig1
) {
6664 return propagateFloat128NaN(a
, b
, status
);
6668 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6670 if ( bExp
== 0x7FFF ) {
6671 if (bSig0
| bSig1
) {
6672 return propagateFloat128NaN(a
, b
, status
);
6674 return packFloat128( zSign
, 0, 0, 0 );
6677 if ( ( bSig0
| bSig1
) == 0 ) {
6678 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
6680 float_raise(float_flag_invalid
, status
);
6681 return float128_default_nan(status
);
6683 float_raise(float_flag_divbyzero
, status
);
6684 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6686 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
6689 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
6690 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6692 zExp
= aExp
- bExp
+ 0x3FFD;
6694 aSig0
| LIT64( 0x0001000000000000 ), aSig1
, 15, &aSig0
, &aSig1
);
6696 bSig0
| LIT64( 0x0001000000000000 ), bSig1
, 15, &bSig0
, &bSig1
);
6697 if ( le128( bSig0
, bSig1
, aSig0
, aSig1
) ) {
6698 shift128Right( aSig0
, aSig1
, 1, &aSig0
, &aSig1
);
6701 zSig0
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
6702 mul128By64To192( bSig0
, bSig1
, zSig0
, &term0
, &term1
, &term2
);
6703 sub192( aSig0
, aSig1
, 0, term0
, term1
, term2
, &rem0
, &rem1
, &rem2
);
6704 while ( (int64_t) rem0
< 0 ) {
6706 add192( rem0
, rem1
, rem2
, 0, bSig0
, bSig1
, &rem0
, &rem1
, &rem2
);
6708 zSig1
= estimateDiv128To64( rem1
, rem2
, bSig0
);
6709 if ( ( zSig1
& 0x3FFF ) <= 4 ) {
6710 mul128By64To192( bSig0
, bSig1
, zSig1
, &term1
, &term2
, &term3
);
6711 sub192( rem1
, rem2
, 0, term1
, term2
, term3
, &rem1
, &rem2
, &rem3
);
6712 while ( (int64_t) rem1
< 0 ) {
6714 add192( rem1
, rem2
, rem3
, 0, bSig0
, bSig1
, &rem1
, &rem2
, &rem3
);
6716 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
6718 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 15, &zSig0
, &zSig1
, &zSig2
);
6719 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
6723 /*----------------------------------------------------------------------------
6724 | Returns the remainder of the quadruple-precision floating-point value `a'
6725 | with respect to the corresponding value `b'. The operation is performed
6726 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6727 *----------------------------------------------------------------------------*/
6729 float128
float128_rem(float128 a
, float128 b
, float_status
*status
)
6732 int32_t aExp
, bExp
, expDiff
;
6733 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, q
, term0
, term1
, term2
;
6734 uint64_t allZero
, alternateASig0
, alternateASig1
, sigMean1
;
6737 aSig1
= extractFloat128Frac1( a
);
6738 aSig0
= extractFloat128Frac0( a
);
6739 aExp
= extractFloat128Exp( a
);
6740 aSign
= extractFloat128Sign( a
);
6741 bSig1
= extractFloat128Frac1( b
);
6742 bSig0
= extractFloat128Frac0( b
);
6743 bExp
= extractFloat128Exp( b
);
6744 if ( aExp
== 0x7FFF ) {
6745 if ( ( aSig0
| aSig1
)
6746 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
6747 return propagateFloat128NaN(a
, b
, status
);
6751 if ( bExp
== 0x7FFF ) {
6752 if (bSig0
| bSig1
) {
6753 return propagateFloat128NaN(a
, b
, status
);
6758 if ( ( bSig0
| bSig1
) == 0 ) {
6760 float_raise(float_flag_invalid
, status
);
6761 return float128_default_nan(status
);
6763 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
6766 if ( ( aSig0
| aSig1
) == 0 ) return a
;
6767 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6769 expDiff
= aExp
- bExp
;
6770 if ( expDiff
< -1 ) return a
;
6772 aSig0
| LIT64( 0x0001000000000000 ),
6774 15 - ( expDiff
< 0 ),
6779 bSig0
| LIT64( 0x0001000000000000 ), bSig1
, 15, &bSig0
, &bSig1
);
6780 q
= le128( bSig0
, bSig1
, aSig0
, aSig1
);
6781 if ( q
) sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
6783 while ( 0 < expDiff
) {
6784 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
6785 q
= ( 4 < q
) ? q
- 4 : 0;
6786 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
6787 shortShift192Left( term0
, term1
, term2
, 61, &term1
, &term2
, &allZero
);
6788 shortShift128Left( aSig0
, aSig1
, 61, &aSig0
, &allZero
);
6789 sub128( aSig0
, 0, term1
, term2
, &aSig0
, &aSig1
);
6792 if ( -64 < expDiff
) {
6793 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
6794 q
= ( 4 < q
) ? q
- 4 : 0;
6796 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
6798 if ( expDiff
< 0 ) {
6799 shift128Right( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
6802 shortShift128Left( aSig0
, aSig1
, expDiff
, &aSig0
, &aSig1
);
6804 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
6805 sub128( aSig0
, aSig1
, term1
, term2
, &aSig0
, &aSig1
);
6808 shift128Right( aSig0
, aSig1
, 12, &aSig0
, &aSig1
);
6809 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
6812 alternateASig0
= aSig0
;
6813 alternateASig1
= aSig1
;
6815 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
6816 } while ( 0 <= (int64_t) aSig0
);
6818 aSig0
, aSig1
, alternateASig0
, alternateASig1
, (uint64_t *)&sigMean0
, &sigMean1
);
6819 if ( ( sigMean0
< 0 )
6820 || ( ( ( sigMean0
| sigMean1
) == 0 ) && ( q
& 1 ) ) ) {
6821 aSig0
= alternateASig0
;
6822 aSig1
= alternateASig1
;
6824 zSign
= ( (int64_t) aSig0
< 0 );
6825 if ( zSign
) sub128( 0, 0, aSig0
, aSig1
, &aSig0
, &aSig1
);
6826 return normalizeRoundAndPackFloat128(aSign
^ zSign
, bExp
- 4, aSig0
, aSig1
,
6830 /*----------------------------------------------------------------------------
6831 | Returns the square root of the quadruple-precision floating-point value `a'.
6832 | The operation is performed according to the IEC/IEEE Standard for Binary
6833 | Floating-Point Arithmetic.
6834 *----------------------------------------------------------------------------*/
6836 float128
float128_sqrt(float128 a
, float_status
*status
)
6840 uint64_t aSig0
, aSig1
, zSig0
, zSig1
, zSig2
, doubleZSig0
;
6841 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
6843 aSig1
= extractFloat128Frac1( a
);
6844 aSig0
= extractFloat128Frac0( a
);
6845 aExp
= extractFloat128Exp( a
);
6846 aSign
= extractFloat128Sign( a
);
6847 if ( aExp
== 0x7FFF ) {
6848 if (aSig0
| aSig1
) {
6849 return propagateFloat128NaN(a
, a
, status
);
6851 if ( ! aSign
) return a
;
6855 if ( ( aExp
| aSig0
| aSig1
) == 0 ) return a
;
6857 float_raise(float_flag_invalid
, status
);
6858 return float128_default_nan(status
);
6861 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( 0, 0, 0, 0 );
6862 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6864 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFE;
6865 aSig0
|= LIT64( 0x0001000000000000 );
6866 zSig0
= estimateSqrt32( aExp
, aSig0
>>17 );
6867 shortShift128Left( aSig0
, aSig1
, 13 - ( aExp
& 1 ), &aSig0
, &aSig1
);
6868 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
6869 doubleZSig0
= zSig0
<<1;
6870 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
6871 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
6872 while ( (int64_t) rem0
< 0 ) {
6875 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
6877 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
6878 if ( ( zSig1
& 0x1FFF ) <= 5 ) {
6879 if ( zSig1
== 0 ) zSig1
= 1;
6880 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
6881 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
6882 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
6883 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
6884 while ( (int64_t) rem1
< 0 ) {
6886 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
6888 term2
|= doubleZSig0
;
6889 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
6891 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
6893 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 14, &zSig0
, &zSig1
, &zSig2
);
6894 return roundAndPackFloat128(0, zExp
, zSig0
, zSig1
, zSig2
, status
);
6898 /*----------------------------------------------------------------------------
6899 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6900 | the corresponding value `b', and 0 otherwise. The invalid exception is
6901 | raised if either operand is a NaN. Otherwise, the comparison is performed
6902 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6903 *----------------------------------------------------------------------------*/
6905 int float128_eq(float128 a
, float128 b
, float_status
*status
)
6908 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6909 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6910 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6911 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6913 float_raise(float_flag_invalid
, status
);
6918 && ( ( a
.high
== b
.high
)
6920 && ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
6925 /*----------------------------------------------------------------------------
6926 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6927 | or equal to the corresponding value `b', and 0 otherwise. The invalid
6928 | exception is raised if either operand is a NaN. The comparison is performed
6929 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6930 *----------------------------------------------------------------------------*/
6932 int float128_le(float128 a
, float128 b
, float_status
*status
)
6936 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6937 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6938 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6939 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6941 float_raise(float_flag_invalid
, status
);
6944 aSign
= extractFloat128Sign( a
);
6945 bSign
= extractFloat128Sign( b
);
6946 if ( aSign
!= bSign
) {
6949 || ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
6953 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
6954 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
6958 /*----------------------------------------------------------------------------
6959 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6960 | the corresponding value `b', and 0 otherwise. The invalid exception is
6961 | raised if either operand is a NaN. The comparison is performed according
6962 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6963 *----------------------------------------------------------------------------*/
6965 int float128_lt(float128 a
, float128 b
, float_status
*status
)
6969 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6970 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6971 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6972 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6974 float_raise(float_flag_invalid
, status
);
6977 aSign
= extractFloat128Sign( a
);
6978 bSign
= extractFloat128Sign( b
);
6979 if ( aSign
!= bSign
) {
6982 && ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
6986 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
6987 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
6991 /*----------------------------------------------------------------------------
6992 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6993 | be compared, and 0 otherwise. The invalid exception is raised if either
6994 | operand is a NaN. The comparison is performed according to the IEC/IEEE
6995 | Standard for Binary Floating-Point Arithmetic.
6996 *----------------------------------------------------------------------------*/
6998 int float128_unordered(float128 a
, float128 b
, float_status
*status
)
7000 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
7001 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
7002 || ( ( extractFloat128Exp( b
) == 0x7FFF )
7003 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
7005 float_raise(float_flag_invalid
, status
);
7011 /*----------------------------------------------------------------------------
7012 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
7013 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
7014 | exception. The comparison is performed according to the IEC/IEEE Standard
7015 | for Binary Floating-Point Arithmetic.
7016 *----------------------------------------------------------------------------*/
7018 int float128_eq_quiet(float128 a
, float128 b
, float_status
*status
)
7021 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
7022 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
7023 || ( ( extractFloat128Exp( b
) == 0x7FFF )
7024 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
7026 if (float128_is_signaling_nan(a
, status
)
7027 || float128_is_signaling_nan(b
, status
)) {
7028 float_raise(float_flag_invalid
, status
);
7034 && ( ( a
.high
== b
.high
)
7036 && ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
7041 /*----------------------------------------------------------------------------
7042 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7043 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
7044 | cause an exception. Otherwise, the comparison is performed according to the
7045 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7046 *----------------------------------------------------------------------------*/
7048 int float128_le_quiet(float128 a
, float128 b
, float_status
*status
)
7052 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
7053 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
7054 || ( ( extractFloat128Exp( b
) == 0x7FFF )
7055 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
7057 if (float128_is_signaling_nan(a
, status
)
7058 || float128_is_signaling_nan(b
, status
)) {
7059 float_raise(float_flag_invalid
, status
);
7063 aSign
= extractFloat128Sign( a
);
7064 bSign
= extractFloat128Sign( b
);
7065 if ( aSign
!= bSign
) {
7068 || ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
7072 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
7073 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
7077 /*----------------------------------------------------------------------------
7078 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7079 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
7080 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
7081 | Standard for Binary Floating-Point Arithmetic.
7082 *----------------------------------------------------------------------------*/
7084 int float128_lt_quiet(float128 a
, float128 b
, float_status
*status
)
7088 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
7089 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
7090 || ( ( extractFloat128Exp( b
) == 0x7FFF )
7091 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
7093 if (float128_is_signaling_nan(a
, status
)
7094 || float128_is_signaling_nan(b
, status
)) {
7095 float_raise(float_flag_invalid
, status
);
7099 aSign
= extractFloat128Sign( a
);
7100 bSign
= extractFloat128Sign( b
);
7101 if ( aSign
!= bSign
) {
7104 && ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
7108 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
7109 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
7113 /*----------------------------------------------------------------------------
7114 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7115 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
7116 | comparison is performed according to the IEC/IEEE Standard for Binary
7117 | Floating-Point Arithmetic.
7118 *----------------------------------------------------------------------------*/
7120 int float128_unordered_quiet(float128 a
, float128 b
, float_status
*status
)
7122 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
7123 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
7124 || ( ( extractFloat128Exp( b
) == 0x7FFF )
7125 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
7127 if (float128_is_signaling_nan(a
, status
)
7128 || float128_is_signaling_nan(b
, status
)) {
7129 float_raise(float_flag_invalid
, status
);
7136 /* misc functions */
7137 float32
uint32_to_float32(uint32_t a
, float_status
*status
)
7139 return int64_to_float32(a
, status
);
7142 float64
uint32_to_float64(uint32_t a
, float_status
*status
)
7144 return int64_to_float64(a
, status
);
7147 uint32_t float32_to_uint32(float32 a
, float_status
*status
)
7151 int old_exc_flags
= get_float_exception_flags(status
);
7153 v
= float32_to_int64(a
, status
);
7156 } else if (v
> 0xffffffff) {
7161 set_float_exception_flags(old_exc_flags
, status
);
7162 float_raise(float_flag_invalid
, status
);
7166 uint32_t float32_to_uint32_round_to_zero(float32 a
, float_status
*status
)
7170 int old_exc_flags
= get_float_exception_flags(status
);
7172 v
= float32_to_int64_round_to_zero(a
, status
);
7175 } else if (v
> 0xffffffff) {
7180 set_float_exception_flags(old_exc_flags
, status
);
7181 float_raise(float_flag_invalid
, status
);
7185 int16_t float32_to_int16(float32 a
, float_status
*status
)
7189 int old_exc_flags
= get_float_exception_flags(status
);
7191 v
= float32_to_int32(a
, status
);
7194 } else if (v
> 0x7fff) {
7200 set_float_exception_flags(old_exc_flags
, status
);
7201 float_raise(float_flag_invalid
, status
);
7205 uint16_t float32_to_uint16(float32 a
, float_status
*status
)
7209 int old_exc_flags
= get_float_exception_flags(status
);
7211 v
= float32_to_int32(a
, status
);
7214 } else if (v
> 0xffff) {
7220 set_float_exception_flags(old_exc_flags
, status
);
7221 float_raise(float_flag_invalid
, status
);
7225 uint16_t float32_to_uint16_round_to_zero(float32 a
, float_status
*status
)
7229 int old_exc_flags
= get_float_exception_flags(status
);
7231 v
= float32_to_int64_round_to_zero(a
, status
);
7234 } else if (v
> 0xffff) {
7239 set_float_exception_flags(old_exc_flags
, status
);
7240 float_raise(float_flag_invalid
, status
);
7244 uint32_t float64_to_uint32(float64 a
, float_status
*status
)
7248 int old_exc_flags
= get_float_exception_flags(status
);
7250 v
= float64_to_uint64(a
, status
);
7251 if (v
> 0xffffffff) {
7256 set_float_exception_flags(old_exc_flags
, status
);
7257 float_raise(float_flag_invalid
, status
);
7261 uint32_t float64_to_uint32_round_to_zero(float64 a
, float_status
*status
)
7265 int old_exc_flags
= get_float_exception_flags(status
);
7267 v
= float64_to_uint64_round_to_zero(a
, status
);
7268 if (v
> 0xffffffff) {
7273 set_float_exception_flags(old_exc_flags
, status
);
7274 float_raise(float_flag_invalid
, status
);
7278 int16_t float64_to_int16(float64 a
, float_status
*status
)
7282 int old_exc_flags
= get_float_exception_flags(status
);
7284 v
= float64_to_int32(a
, status
);
7287 } else if (v
> 0x7fff) {
7293 set_float_exception_flags(old_exc_flags
, status
);
7294 float_raise(float_flag_invalid
, status
);
7298 uint16_t float64_to_uint16(float64 a
, float_status
*status
)
7302 int old_exc_flags
= get_float_exception_flags(status
);
7304 v
= float64_to_int32(a
, status
);
7307 } else if (v
> 0xffff) {
7313 set_float_exception_flags(old_exc_flags
, status
);
7314 float_raise(float_flag_invalid
, status
);
7318 uint16_t float64_to_uint16_round_to_zero(float64 a
, float_status
*status
)
7322 int old_exc_flags
= get_float_exception_flags(status
);
7324 v
= float64_to_int64_round_to_zero(a
, status
);
7327 } else if (v
> 0xffff) {
7332 set_float_exception_flags(old_exc_flags
, status
);
7333 float_raise(float_flag_invalid
, status
);
7337 /*----------------------------------------------------------------------------
7338 | Returns the result of converting the double-precision floating-point value
7339 | `a' to the 64-bit unsigned integer format. The conversion is
7340 | performed according to the IEC/IEEE Standard for Binary Floating-Point
7341 | Arithmetic---which means in particular that the conversion is rounded
7342 | according to the current rounding mode. If `a' is a NaN, the largest
7343 | positive integer is returned. If the conversion overflows, the
7344 | largest unsigned integer is returned. If 'a' is negative, the value is
7345 | rounded and zero is returned; negative values that do not round to zero
7346 | will raise the inexact exception.
7347 *----------------------------------------------------------------------------*/
7349 uint64_t float64_to_uint64(float64 a
, float_status
*status
)
7354 uint64_t aSig
, aSigExtra
;
7355 a
= float64_squash_input_denormal(a
, status
);
7357 aSig
= extractFloat64Frac(a
);
7358 aExp
= extractFloat64Exp(a
);
7359 aSign
= extractFloat64Sign(a
);
7360 if (aSign
&& (aExp
> 1022)) {
7361 float_raise(float_flag_invalid
, status
);
7362 if (float64_is_any_nan(a
)) {
7363 return LIT64(0xFFFFFFFFFFFFFFFF);
7369 aSig
|= LIT64(0x0010000000000000);
7371 shiftCount
= 0x433 - aExp
;
7372 if (shiftCount
<= 0) {
7374 float_raise(float_flag_invalid
, status
);
7375 return LIT64(0xFFFFFFFFFFFFFFFF);
7378 aSig
<<= -shiftCount
;
7380 shift64ExtraRightJamming(aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
7382 return roundAndPackUint64(aSign
, aSig
, aSigExtra
, status
);
7385 uint64_t float64_to_uint64_round_to_zero(float64 a
, float_status
*status
)
7387 signed char current_rounding_mode
= status
->float_rounding_mode
;
7388 set_float_rounding_mode(float_round_to_zero
, status
);
7389 int64_t v
= float64_to_uint64(a
, status
);
7390 set_float_rounding_mode(current_rounding_mode
, status
);
7394 #define COMPARE(s, nan_exp) \
7395 static inline int float ## s ## _compare_internal(float ## s a, float ## s b,\
7396 int is_quiet, float_status *status) \
7398 flag aSign, bSign; \
7399 uint ## s ## _t av, bv; \
7400 a = float ## s ## _squash_input_denormal(a, status); \
7401 b = float ## s ## _squash_input_denormal(b, status); \
7403 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \
7404 extractFloat ## s ## Frac( a ) ) || \
7405 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \
7406 extractFloat ## s ## Frac( b ) )) { \
7408 float ## s ## _is_signaling_nan(a, status) || \
7409 float ## s ## _is_signaling_nan(b, status)) { \
7410 float_raise(float_flag_invalid, status); \
7412 return float_relation_unordered; \
7414 aSign = extractFloat ## s ## Sign( a ); \
7415 bSign = extractFloat ## s ## Sign( b ); \
7416 av = float ## s ## _val(a); \
7417 bv = float ## s ## _val(b); \
7418 if ( aSign != bSign ) { \
7419 if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) { \
7421 return float_relation_equal; \
7423 return 1 - (2 * aSign); \
7427 return float_relation_equal; \
7429 return 1 - 2 * (aSign ^ ( av < bv )); \
7434 int float ## s ## _compare(float ## s a, float ## s b, float_status *status) \
7436 return float ## s ## _compare_internal(a, b, 0, status); \
7439 int float ## s ## _compare_quiet(float ## s a, float ## s b, \
7440 float_status *status) \
7442 return float ## s ## _compare_internal(a, b, 1, status); \
7448 static inline int floatx80_compare_internal(floatx80 a
, floatx80 b
,
7449 int is_quiet
, float_status
*status
)
7453 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
7454 float_raise(float_flag_invalid
, status
);
7455 return float_relation_unordered
;
7457 if (( ( extractFloatx80Exp( a
) == 0x7fff ) &&
7458 ( extractFloatx80Frac( a
)<<1 ) ) ||
7459 ( ( extractFloatx80Exp( b
) == 0x7fff ) &&
7460 ( extractFloatx80Frac( b
)<<1 ) )) {
7462 floatx80_is_signaling_nan(a
, status
) ||
7463 floatx80_is_signaling_nan(b
, status
)) {
7464 float_raise(float_flag_invalid
, status
);
7466 return float_relation_unordered
;
7468 aSign
= extractFloatx80Sign( a
);
7469 bSign
= extractFloatx80Sign( b
);
7470 if ( aSign
!= bSign
) {
7472 if ( ( ( (uint16_t) ( ( a
.high
| b
.high
) << 1 ) ) == 0) &&
7473 ( ( a
.low
| b
.low
) == 0 ) ) {
7475 return float_relation_equal
;
7477 return 1 - (2 * aSign
);
7480 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
7481 return float_relation_equal
;
7483 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
7488 int floatx80_compare(floatx80 a
, floatx80 b
, float_status
*status
)
7490 return floatx80_compare_internal(a
, b
, 0, status
);
7493 int floatx80_compare_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
7495 return floatx80_compare_internal(a
, b
, 1, status
);
7498 static inline int float128_compare_internal(float128 a
, float128 b
,
7499 int is_quiet
, float_status
*status
)
7503 if (( ( extractFloat128Exp( a
) == 0x7fff ) &&
7504 ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) ) ||
7505 ( ( extractFloat128Exp( b
) == 0x7fff ) &&
7506 ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )) {
7508 float128_is_signaling_nan(a
, status
) ||
7509 float128_is_signaling_nan(b
, status
)) {
7510 float_raise(float_flag_invalid
, status
);
7512 return float_relation_unordered
;
7514 aSign
= extractFloat128Sign( a
);
7515 bSign
= extractFloat128Sign( b
);
7516 if ( aSign
!= bSign
) {
7517 if ( ( ( ( a
.high
| b
.high
)<<1 ) | a
.low
| b
.low
) == 0 ) {
7519 return float_relation_equal
;
7521 return 1 - (2 * aSign
);
7524 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
7525 return float_relation_equal
;
7527 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
7532 int float128_compare(float128 a
, float128 b
, float_status
*status
)
7534 return float128_compare_internal(a
, b
, 0, status
);
7537 int float128_compare_quiet(float128 a
, float128 b
, float_status
*status
)
7539 return float128_compare_internal(a
, b
, 1, status
);
7542 /* min() and max() functions. These can't be implemented as
7543 * 'compare and pick one input' because that would mishandle
7544 * NaNs and +0 vs -0.
7546 * minnum() and maxnum() functions. These are similar to the min()
7547 * and max() functions but if one of the arguments is a QNaN and
7548 * the other is numerical then the numerical argument is returned.
7549 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
7550 * and maxNum() operations. min() and max() are the typical min/max
7551 * semantics provided by many CPUs which predate that specification.
7553 * minnummag() and maxnummag() functions correspond to minNumMag()
7554 * and minNumMag() from the IEEE-754 2008.
7557 static inline float ## s float ## s ## _minmax(float ## s a, float ## s b, \
7558 int ismin, int isieee, \
7560 float_status *status) \
7562 flag aSign, bSign; \
7563 uint ## s ## _t av, bv, aav, abv; \
7564 a = float ## s ## _squash_input_denormal(a, status); \
7565 b = float ## s ## _squash_input_denormal(b, status); \
7566 if (float ## s ## _is_any_nan(a) || \
7567 float ## s ## _is_any_nan(b)) { \
7569 if (float ## s ## _is_quiet_nan(a, status) && \
7570 !float ## s ##_is_any_nan(b)) { \
7572 } else if (float ## s ## _is_quiet_nan(b, status) && \
7573 !float ## s ## _is_any_nan(a)) { \
7577 return propagateFloat ## s ## NaN(a, b, status); \
7579 aSign = extractFloat ## s ## Sign(a); \
7580 bSign = extractFloat ## s ## Sign(b); \
7581 av = float ## s ## _val(a); \
7582 bv = float ## s ## _val(b); \
7584 aav = float ## s ## _abs(av); \
7585 abv = float ## s ## _abs(bv); \
7588 return (aav < abv) ? a : b; \
7590 return (aav < abv) ? b : a; \
7594 if (aSign != bSign) { \
7596 return aSign ? a : b; \
7598 return aSign ? b : a; \
7602 return (aSign ^ (av < bv)) ? a : b; \
7604 return (aSign ^ (av < bv)) ? b : a; \
7609 float ## s float ## s ## _min(float ## s a, float ## s b, \
7610 float_status *status) \
7612 return float ## s ## _minmax(a, b, 1, 0, 0, status); \
7615 float ## s float ## s ## _max(float ## s a, float ## s b, \
7616 float_status *status) \
7618 return float ## s ## _minmax(a, b, 0, 0, 0, status); \
7621 float ## s float ## s ## _minnum(float ## s a, float ## s b, \
7622 float_status *status) \
7624 return float ## s ## _minmax(a, b, 1, 1, 0, status); \
7627 float ## s float ## s ## _maxnum(float ## s a, float ## s b, \
7628 float_status *status) \
7630 return float ## s ## _minmax(a, b, 0, 1, 0, status); \
7633 float ## s float ## s ## _minnummag(float ## s a, float ## s b, \
7634 float_status *status) \
7636 return float ## s ## _minmax(a, b, 1, 1, 1, status); \
7639 float ## s float ## s ## _maxnummag(float ## s a, float ## s b, \
7640 float_status *status) \
7642 return float ## s ## _minmax(a, b, 0, 1, 1, status); \
7649 /* Multiply A by 2 raised to the power N. */
7650 float32
float32_scalbn(float32 a
, int n
, float_status
*status
)
7656 a
= float32_squash_input_denormal(a
, status
);
7657 aSig
= extractFloat32Frac( a
);
7658 aExp
= extractFloat32Exp( a
);
7659 aSign
= extractFloat32Sign( a
);
7661 if ( aExp
== 0xFF ) {
7663 return propagateFloat32NaN(a
, a
, status
);
7669 } else if (aSig
== 0) {
7677 } else if (n
< -0x200) {
7683 return normalizeRoundAndPackFloat32(aSign
, aExp
, aSig
, status
);
7686 float64
float64_scalbn(float64 a
, int n
, float_status
*status
)
7692 a
= float64_squash_input_denormal(a
, status
);
7693 aSig
= extractFloat64Frac( a
);
7694 aExp
= extractFloat64Exp( a
);
7695 aSign
= extractFloat64Sign( a
);
7697 if ( aExp
== 0x7FF ) {
7699 return propagateFloat64NaN(a
, a
, status
);
7704 aSig
|= LIT64( 0x0010000000000000 );
7705 } else if (aSig
== 0) {
7713 } else if (n
< -0x1000) {
7719 return normalizeRoundAndPackFloat64(aSign
, aExp
, aSig
, status
);
7722 floatx80
floatx80_scalbn(floatx80 a
, int n
, float_status
*status
)
7728 if (floatx80_invalid_encoding(a
)) {
7729 float_raise(float_flag_invalid
, status
);
7730 return floatx80_default_nan(status
);
7732 aSig
= extractFloatx80Frac( a
);
7733 aExp
= extractFloatx80Exp( a
);
7734 aSign
= extractFloatx80Sign( a
);
7736 if ( aExp
== 0x7FFF ) {
7738 return propagateFloatx80NaN(a
, a
, status
);
7752 } else if (n
< -0x10000) {
7757 return normalizeRoundAndPackFloatx80(status
->floatx80_rounding_precision
,
7758 aSign
, aExp
, aSig
, 0, status
);
7761 float128
float128_scalbn(float128 a
, int n
, float_status
*status
)
7765 uint64_t aSig0
, aSig1
;
7767 aSig1
= extractFloat128Frac1( a
);
7768 aSig0
= extractFloat128Frac0( a
);
7769 aExp
= extractFloat128Exp( a
);
7770 aSign
= extractFloat128Sign( a
);
7771 if ( aExp
== 0x7FFF ) {
7772 if ( aSig0
| aSig1
) {
7773 return propagateFloat128NaN(a
, a
, status
);
7778 aSig0
|= LIT64( 0x0001000000000000 );
7779 } else if (aSig0
== 0 && aSig1
== 0) {
7787 } else if (n
< -0x10000) {
7792 return normalizeRoundAndPackFloat128( aSign
, aExp
, aSig0
, aSig1