4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
44 ===============================================================================
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
85 #include "qemu/osdep.h"
87 #include "fpu/softfloat.h"
89 /* We only need stdlib for abort() */
91 /*----------------------------------------------------------------------------
92 | Primitive arithmetic functions, including multi-word arithmetic, and
93 | division and square root approximations. (Can be specialized to target if
95 *----------------------------------------------------------------------------*/
96 #include "softfloat-macros.h"
98 /*----------------------------------------------------------------------------
99 | Functions and definitions to determine: (1) whether tininess for underflow
100 | is detected before or after rounding by default, (2) what (if anything)
101 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
102 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
103 | are propagated from function inputs to output. These details are target-
105 *----------------------------------------------------------------------------*/
106 #include "softfloat-specialize.h"
108 /*----------------------------------------------------------------------------
109 | Returns the fraction bits of the half-precision floating-point value `a'.
110 *----------------------------------------------------------------------------*/
112 static inline uint32_t extractFloat16Frac(float16 a
)
114 return float16_val(a
) & 0x3ff;
117 /*----------------------------------------------------------------------------
118 | Returns the exponent bits of the half-precision floating-point value `a'.
119 *----------------------------------------------------------------------------*/
121 static inline int_fast16_t extractFloat16Exp(float16 a
)
123 return (float16_val(a
) >> 10) & 0x1f;
126 /*----------------------------------------------------------------------------
127 | Returns the sign bit of the single-precision floating-point value `a'.
128 *----------------------------------------------------------------------------*/
130 static inline flag
extractFloat16Sign(float16 a
)
132 return float16_val(a
)>>15;
135 /*----------------------------------------------------------------------------
136 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
137 | and 7, and returns the properly rounded 32-bit integer corresponding to the
138 | input. If `zSign' is 1, the input is negated before being converted to an
139 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
140 | is simply rounded to an integer, with the inexact exception raised if the
141 | input cannot be represented exactly as an integer. However, if the fixed-
142 | point input is too large, the invalid exception is raised and the largest
143 | positive or negative integer is returned.
144 *----------------------------------------------------------------------------*/
146 static int32_t roundAndPackInt32(flag zSign
, uint64_t absZ
, float_status
*status
)
149 flag roundNearestEven
;
150 int8_t roundIncrement
, roundBits
;
153 roundingMode
= status
->float_rounding_mode
;
154 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
155 switch (roundingMode
) {
156 case float_round_nearest_even
:
157 case float_round_ties_away
:
158 roundIncrement
= 0x40;
160 case float_round_to_zero
:
164 roundIncrement
= zSign
? 0 : 0x7f;
166 case float_round_down
:
167 roundIncrement
= zSign
? 0x7f : 0;
172 roundBits
= absZ
& 0x7F;
173 absZ
= ( absZ
+ roundIncrement
)>>7;
174 absZ
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
176 if ( zSign
) z
= - z
;
177 if ( ( absZ
>>32 ) || ( z
&& ( ( z
< 0 ) ^ zSign
) ) ) {
178 float_raise(float_flag_invalid
, status
);
179 return zSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
182 status
->float_exception_flags
|= float_flag_inexact
;
188 /*----------------------------------------------------------------------------
189 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
190 | `absZ1', with binary point between bits 63 and 64 (between the input words),
191 | and returns the properly rounded 64-bit integer corresponding to the input.
192 | If `zSign' is 1, the input is negated before being converted to an integer.
193 | Ordinarily, the fixed-point input is simply rounded to an integer, with
194 | the inexact exception raised if the input cannot be represented exactly as
195 | an integer. However, if the fixed-point input is too large, the invalid
196 | exception is raised and the largest positive or negative integer is
198 *----------------------------------------------------------------------------*/
200 static int64_t roundAndPackInt64(flag zSign
, uint64_t absZ0
, uint64_t absZ1
,
201 float_status
*status
)
204 flag roundNearestEven
, increment
;
207 roundingMode
= status
->float_rounding_mode
;
208 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
209 switch (roundingMode
) {
210 case float_round_nearest_even
:
211 case float_round_ties_away
:
212 increment
= ((int64_t) absZ1
< 0);
214 case float_round_to_zero
:
218 increment
= !zSign
&& absZ1
;
220 case float_round_down
:
221 increment
= zSign
&& absZ1
;
228 if ( absZ0
== 0 ) goto overflow
;
229 absZ0
&= ~ ( ( (uint64_t) ( absZ1
<<1 ) == 0 ) & roundNearestEven
);
232 if ( zSign
) z
= - z
;
233 if ( z
&& ( ( z
< 0 ) ^ zSign
) ) {
235 float_raise(float_flag_invalid
, status
);
237 zSign
? (int64_t) LIT64( 0x8000000000000000 )
238 : LIT64( 0x7FFFFFFFFFFFFFFF );
241 status
->float_exception_flags
|= float_flag_inexact
;
247 /*----------------------------------------------------------------------------
248 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
249 | `absZ1', with binary point between bits 63 and 64 (between the input words),
250 | and returns the properly rounded 64-bit unsigned integer corresponding to the
251 | input. Ordinarily, the fixed-point input is simply rounded to an integer,
252 | with the inexact exception raised if the input cannot be represented exactly
253 | as an integer. However, if the fixed-point input is too large, the invalid
254 | exception is raised and the largest unsigned integer is returned.
255 *----------------------------------------------------------------------------*/
257 static int64_t roundAndPackUint64(flag zSign
, uint64_t absZ0
,
258 uint64_t absZ1
, float_status
*status
)
261 flag roundNearestEven
, increment
;
263 roundingMode
= status
->float_rounding_mode
;
264 roundNearestEven
= (roundingMode
== float_round_nearest_even
);
265 switch (roundingMode
) {
266 case float_round_nearest_even
:
267 case float_round_ties_away
:
268 increment
= ((int64_t)absZ1
< 0);
270 case float_round_to_zero
:
274 increment
= !zSign
&& absZ1
;
276 case float_round_down
:
277 increment
= zSign
&& absZ1
;
285 float_raise(float_flag_invalid
, status
);
286 return LIT64(0xFFFFFFFFFFFFFFFF);
288 absZ0
&= ~(((uint64_t)(absZ1
<<1) == 0) & roundNearestEven
);
291 if (zSign
&& absZ0
) {
292 float_raise(float_flag_invalid
, status
);
297 status
->float_exception_flags
|= float_flag_inexact
;
302 /*----------------------------------------------------------------------------
303 | Returns the fraction bits of the single-precision floating-point value `a'.
304 *----------------------------------------------------------------------------*/
306 static inline uint32_t extractFloat32Frac( float32 a
)
309 return float32_val(a
) & 0x007FFFFF;
313 /*----------------------------------------------------------------------------
314 | Returns the exponent bits of the single-precision floating-point value `a'.
315 *----------------------------------------------------------------------------*/
317 static inline int_fast16_t extractFloat32Exp(float32 a
)
320 return ( float32_val(a
)>>23 ) & 0xFF;
324 /*----------------------------------------------------------------------------
325 | Returns the sign bit of the single-precision floating-point value `a'.
326 *----------------------------------------------------------------------------*/
328 static inline flag
extractFloat32Sign( float32 a
)
331 return float32_val(a
)>>31;
335 /*----------------------------------------------------------------------------
336 | If `a' is denormal and we are in flush-to-zero mode then set the
337 | input-denormal exception and return zero. Otherwise just return the value.
338 *----------------------------------------------------------------------------*/
339 float32
float32_squash_input_denormal(float32 a
, float_status
*status
)
341 if (status
->flush_inputs_to_zero
) {
342 if (extractFloat32Exp(a
) == 0 && extractFloat32Frac(a
) != 0) {
343 float_raise(float_flag_input_denormal
, status
);
344 return make_float32(float32_val(a
) & 0x80000000);
350 /*----------------------------------------------------------------------------
351 | Normalizes the subnormal single-precision floating-point value represented
352 | by the denormalized significand `aSig'. The normalized exponent and
353 | significand are stored at the locations pointed to by `zExpPtr' and
354 | `zSigPtr', respectively.
355 *----------------------------------------------------------------------------*/
358 normalizeFloat32Subnormal(uint32_t aSig
, int_fast16_t *zExpPtr
, uint32_t *zSigPtr
)
362 shiftCount
= countLeadingZeros32( aSig
) - 8;
363 *zSigPtr
= aSig
<<shiftCount
;
364 *zExpPtr
= 1 - shiftCount
;
368 /*----------------------------------------------------------------------------
369 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
370 | single-precision floating-point value, returning the result. After being
371 | shifted into the proper positions, the three fields are simply added
372 | together to form the result. This means that any integer portion of `zSig'
373 | will be added into the exponent. Since a properly normalized significand
374 | will have an integer portion equal to 1, the `zExp' input should be 1 less
375 | than the desired result exponent whenever `zSig' is a complete, normalized
377 *----------------------------------------------------------------------------*/
379 static inline float32
packFloat32(flag zSign
, int_fast16_t zExp
, uint32_t zSig
)
383 ( ( (uint32_t) zSign
)<<31 ) + ( ( (uint32_t) zExp
)<<23 ) + zSig
);
387 /*----------------------------------------------------------------------------
388 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
389 | and significand `zSig', and returns the proper single-precision floating-
390 | point value corresponding to the abstract input. Ordinarily, the abstract
391 | value is simply rounded and packed into the single-precision format, with
392 | the inexact exception raised if the abstract input cannot be represented
393 | exactly. However, if the abstract value is too large, the overflow and
394 | inexact exceptions are raised and an infinity or maximal finite value is
395 | returned. If the abstract value is too small, the input value is rounded to
396 | a subnormal number, and the underflow and inexact exceptions are raised if
397 | the abstract input cannot be represented exactly as a subnormal single-
398 | precision floating-point number.
399 | The input significand `zSig' has its binary point between bits 30
400 | and 29, which is 7 bits to the left of the usual location. This shifted
401 | significand must be normalized or smaller. If `zSig' is not normalized,
402 | `zExp' must be 0; in that case, the result returned is a subnormal number,
403 | and it must not require rounding. In the usual case that `zSig' is
404 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
405 | The handling of underflow and overflow follows the IEC/IEEE Standard for
406 | Binary Floating-Point Arithmetic.
407 *----------------------------------------------------------------------------*/
409 static float32
roundAndPackFloat32(flag zSign
, int_fast16_t zExp
, uint32_t zSig
,
410 float_status
*status
)
413 flag roundNearestEven
;
414 int8_t roundIncrement
, roundBits
;
417 roundingMode
= status
->float_rounding_mode
;
418 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
419 switch (roundingMode
) {
420 case float_round_nearest_even
:
421 case float_round_ties_away
:
422 roundIncrement
= 0x40;
424 case float_round_to_zero
:
428 roundIncrement
= zSign
? 0 : 0x7f;
430 case float_round_down
:
431 roundIncrement
= zSign
? 0x7f : 0;
437 roundBits
= zSig
& 0x7F;
438 if ( 0xFD <= (uint16_t) zExp
) {
440 || ( ( zExp
== 0xFD )
441 && ( (int32_t) ( zSig
+ roundIncrement
) < 0 ) )
443 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
444 return packFloat32( zSign
, 0xFF, - ( roundIncrement
== 0 ));
447 if (status
->flush_to_zero
) {
448 float_raise(float_flag_output_denormal
, status
);
449 return packFloat32(zSign
, 0, 0);
452 (status
->float_detect_tininess
453 == float_tininess_before_rounding
)
455 || ( zSig
+ roundIncrement
< 0x80000000 );
456 shift32RightJamming( zSig
, - zExp
, &zSig
);
458 roundBits
= zSig
& 0x7F;
459 if (isTiny
&& roundBits
) {
460 float_raise(float_flag_underflow
, status
);
465 status
->float_exception_flags
|= float_flag_inexact
;
467 zSig
= ( zSig
+ roundIncrement
)>>7;
468 zSig
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
469 if ( zSig
== 0 ) zExp
= 0;
470 return packFloat32( zSign
, zExp
, zSig
);
474 /*----------------------------------------------------------------------------
475 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
476 | and significand `zSig', and returns the proper single-precision floating-
477 | point value corresponding to the abstract input. This routine is just like
478 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
479 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
480 | floating-point exponent.
481 *----------------------------------------------------------------------------*/
484 normalizeRoundAndPackFloat32(flag zSign
, int_fast16_t zExp
, uint32_t zSig
,
485 float_status
*status
)
489 shiftCount
= countLeadingZeros32( zSig
) - 1;
490 return roundAndPackFloat32(zSign
, zExp
- shiftCount
, zSig
<<shiftCount
,
495 /*----------------------------------------------------------------------------
496 | Returns the fraction bits of the double-precision floating-point value `a'.
497 *----------------------------------------------------------------------------*/
499 static inline uint64_t extractFloat64Frac( float64 a
)
502 return float64_val(a
) & LIT64( 0x000FFFFFFFFFFFFF );
506 /*----------------------------------------------------------------------------
507 | Returns the exponent bits of the double-precision floating-point value `a'.
508 *----------------------------------------------------------------------------*/
510 static inline int_fast16_t extractFloat64Exp(float64 a
)
513 return ( float64_val(a
)>>52 ) & 0x7FF;
517 /*----------------------------------------------------------------------------
518 | Returns the sign bit of the double-precision floating-point value `a'.
519 *----------------------------------------------------------------------------*/
521 static inline flag
extractFloat64Sign( float64 a
)
524 return float64_val(a
)>>63;
528 /*----------------------------------------------------------------------------
529 | If `a' is denormal and we are in flush-to-zero mode then set the
530 | input-denormal exception and return zero. Otherwise just return the value.
531 *----------------------------------------------------------------------------*/
532 float64
float64_squash_input_denormal(float64 a
, float_status
*status
)
534 if (status
->flush_inputs_to_zero
) {
535 if (extractFloat64Exp(a
) == 0 && extractFloat64Frac(a
) != 0) {
536 float_raise(float_flag_input_denormal
, status
);
537 return make_float64(float64_val(a
) & (1ULL << 63));
543 /*----------------------------------------------------------------------------
544 | Normalizes the subnormal double-precision floating-point value represented
545 | by the denormalized significand `aSig'. The normalized exponent and
546 | significand are stored at the locations pointed to by `zExpPtr' and
547 | `zSigPtr', respectively.
548 *----------------------------------------------------------------------------*/
551 normalizeFloat64Subnormal(uint64_t aSig
, int_fast16_t *zExpPtr
, uint64_t *zSigPtr
)
555 shiftCount
= countLeadingZeros64( aSig
) - 11;
556 *zSigPtr
= aSig
<<shiftCount
;
557 *zExpPtr
= 1 - shiftCount
;
561 /*----------------------------------------------------------------------------
562 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
563 | double-precision floating-point value, returning the result. After being
564 | shifted into the proper positions, the three fields are simply added
565 | together to form the result. This means that any integer portion of `zSig'
566 | will be added into the exponent. Since a properly normalized significand
567 | will have an integer portion equal to 1, the `zExp' input should be 1 less
568 | than the desired result exponent whenever `zSig' is a complete, normalized
570 *----------------------------------------------------------------------------*/
572 static inline float64
packFloat64(flag zSign
, int_fast16_t zExp
, uint64_t zSig
)
576 ( ( (uint64_t) zSign
)<<63 ) + ( ( (uint64_t) zExp
)<<52 ) + zSig
);
580 /*----------------------------------------------------------------------------
581 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
582 | and significand `zSig', and returns the proper double-precision floating-
583 | point value corresponding to the abstract input. Ordinarily, the abstract
584 | value is simply rounded and packed into the double-precision format, with
585 | the inexact exception raised if the abstract input cannot be represented
586 | exactly. However, if the abstract value is too large, the overflow and
587 | inexact exceptions are raised and an infinity or maximal finite value is
588 | returned. If the abstract value is too small, the input value is rounded to
589 | a subnormal number, and the underflow and inexact exceptions are raised if
590 | the abstract input cannot be represented exactly as a subnormal double-
591 | precision floating-point number.
592 | The input significand `zSig' has its binary point between bits 62
593 | and 61, which is 10 bits to the left of the usual location. This shifted
594 | significand must be normalized or smaller. If `zSig' is not normalized,
595 | `zExp' must be 0; in that case, the result returned is a subnormal number,
596 | and it must not require rounding. In the usual case that `zSig' is
597 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
598 | The handling of underflow and overflow follows the IEC/IEEE Standard for
599 | Binary Floating-Point Arithmetic.
600 *----------------------------------------------------------------------------*/
602 static float64
roundAndPackFloat64(flag zSign
, int_fast16_t zExp
, uint64_t zSig
,
603 float_status
*status
)
606 flag roundNearestEven
;
607 int_fast16_t roundIncrement
, roundBits
;
610 roundingMode
= status
->float_rounding_mode
;
611 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
612 switch (roundingMode
) {
613 case float_round_nearest_even
:
614 case float_round_ties_away
:
615 roundIncrement
= 0x200;
617 case float_round_to_zero
:
621 roundIncrement
= zSign
? 0 : 0x3ff;
623 case float_round_down
:
624 roundIncrement
= zSign
? 0x3ff : 0;
629 roundBits
= zSig
& 0x3FF;
630 if ( 0x7FD <= (uint16_t) zExp
) {
631 if ( ( 0x7FD < zExp
)
632 || ( ( zExp
== 0x7FD )
633 && ( (int64_t) ( zSig
+ roundIncrement
) < 0 ) )
635 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
636 return packFloat64( zSign
, 0x7FF, - ( roundIncrement
== 0 ));
639 if (status
->flush_to_zero
) {
640 float_raise(float_flag_output_denormal
, status
);
641 return packFloat64(zSign
, 0, 0);
644 (status
->float_detect_tininess
645 == float_tininess_before_rounding
)
647 || ( zSig
+ roundIncrement
< LIT64( 0x8000000000000000 ) );
648 shift64RightJamming( zSig
, - zExp
, &zSig
);
650 roundBits
= zSig
& 0x3FF;
651 if (isTiny
&& roundBits
) {
652 float_raise(float_flag_underflow
, status
);
657 status
->float_exception_flags
|= float_flag_inexact
;
659 zSig
= ( zSig
+ roundIncrement
)>>10;
660 zSig
&= ~ ( ( ( roundBits
^ 0x200 ) == 0 ) & roundNearestEven
);
661 if ( zSig
== 0 ) zExp
= 0;
662 return packFloat64( zSign
, zExp
, zSig
);
666 /*----------------------------------------------------------------------------
667 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
668 | and significand `zSig', and returns the proper double-precision floating-
669 | point value corresponding to the abstract input. This routine is just like
670 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
671 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
672 | floating-point exponent.
673 *----------------------------------------------------------------------------*/
676 normalizeRoundAndPackFloat64(flag zSign
, int_fast16_t zExp
, uint64_t zSig
,
677 float_status
*status
)
681 shiftCount
= countLeadingZeros64( zSig
) - 1;
682 return roundAndPackFloat64(zSign
, zExp
- shiftCount
, zSig
<<shiftCount
,
687 /*----------------------------------------------------------------------------
688 | Returns the fraction bits of the extended double-precision floating-point
690 *----------------------------------------------------------------------------*/
692 static inline uint64_t extractFloatx80Frac( floatx80 a
)
699 /*----------------------------------------------------------------------------
700 | Returns the exponent bits of the extended double-precision floating-point
702 *----------------------------------------------------------------------------*/
704 static inline int32_t extractFloatx80Exp( floatx80 a
)
707 return a
.high
& 0x7FFF;
711 /*----------------------------------------------------------------------------
712 | Returns the sign bit of the extended double-precision floating-point value
714 *----------------------------------------------------------------------------*/
716 static inline flag
extractFloatx80Sign( floatx80 a
)
723 /*----------------------------------------------------------------------------
724 | Normalizes the subnormal extended double-precision floating-point value
725 | represented by the denormalized significand `aSig'. The normalized exponent
726 | and significand are stored at the locations pointed to by `zExpPtr' and
727 | `zSigPtr', respectively.
728 *----------------------------------------------------------------------------*/
731 normalizeFloatx80Subnormal( uint64_t aSig
, int32_t *zExpPtr
, uint64_t *zSigPtr
)
735 shiftCount
= countLeadingZeros64( aSig
);
736 *zSigPtr
= aSig
<<shiftCount
;
737 *zExpPtr
= 1 - shiftCount
;
741 /*----------------------------------------------------------------------------
742 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
743 | extended double-precision floating-point value, returning the result.
744 *----------------------------------------------------------------------------*/
746 static inline floatx80
packFloatx80( flag zSign
, int32_t zExp
, uint64_t zSig
)
751 z
.high
= ( ( (uint16_t) zSign
)<<15 ) + zExp
;
756 /*----------------------------------------------------------------------------
757 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
758 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
759 | and returns the proper extended double-precision floating-point value
760 | corresponding to the abstract input. Ordinarily, the abstract value is
761 | rounded and packed into the extended double-precision format, with the
762 | inexact exception raised if the abstract input cannot be represented
763 | exactly. However, if the abstract value is too large, the overflow and
764 | inexact exceptions are raised and an infinity or maximal finite value is
765 | returned. If the abstract value is too small, the input value is rounded to
766 | a subnormal number, and the underflow and inexact exceptions are raised if
767 | the abstract input cannot be represented exactly as a subnormal extended
768 | double-precision floating-point number.
769 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
770 | number of bits as single or double precision, respectively. Otherwise, the
771 | result is rounded to the full precision of the extended double-precision
773 | The input significand must be normalized or smaller. If the input
774 | significand is not normalized, `zExp' must be 0; in that case, the result
775 | returned is a subnormal number, and it must not require rounding. The
776 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
777 | Floating-Point Arithmetic.
778 *----------------------------------------------------------------------------*/
780 static floatx80
roundAndPackFloatx80(int8_t roundingPrecision
, flag zSign
,
781 int32_t zExp
, uint64_t zSig0
, uint64_t zSig1
,
782 float_status
*status
)
785 flag roundNearestEven
, increment
, isTiny
;
786 int64_t roundIncrement
, roundMask
, roundBits
;
788 roundingMode
= status
->float_rounding_mode
;
789 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
790 if ( roundingPrecision
== 80 ) goto precision80
;
791 if ( roundingPrecision
== 64 ) {
792 roundIncrement
= LIT64( 0x0000000000000400 );
793 roundMask
= LIT64( 0x00000000000007FF );
795 else if ( roundingPrecision
== 32 ) {
796 roundIncrement
= LIT64( 0x0000008000000000 );
797 roundMask
= LIT64( 0x000000FFFFFFFFFF );
802 zSig0
|= ( zSig1
!= 0 );
803 switch (roundingMode
) {
804 case float_round_nearest_even
:
805 case float_round_ties_away
:
807 case float_round_to_zero
:
811 roundIncrement
= zSign
? 0 : roundMask
;
813 case float_round_down
:
814 roundIncrement
= zSign
? roundMask
: 0;
819 roundBits
= zSig0
& roundMask
;
820 if ( 0x7FFD <= (uint32_t) ( zExp
- 1 ) ) {
821 if ( ( 0x7FFE < zExp
)
822 || ( ( zExp
== 0x7FFE ) && ( zSig0
+ roundIncrement
< zSig0
) )
827 if (status
->flush_to_zero
) {
828 float_raise(float_flag_output_denormal
, status
);
829 return packFloatx80(zSign
, 0, 0);
832 (status
->float_detect_tininess
833 == float_tininess_before_rounding
)
835 || ( zSig0
<= zSig0
+ roundIncrement
);
836 shift64RightJamming( zSig0
, 1 - zExp
, &zSig0
);
838 roundBits
= zSig0
& roundMask
;
839 if (isTiny
&& roundBits
) {
840 float_raise(float_flag_underflow
, status
);
843 status
->float_exception_flags
|= float_flag_inexact
;
845 zSig0
+= roundIncrement
;
846 if ( (int64_t) zSig0
< 0 ) zExp
= 1;
847 roundIncrement
= roundMask
+ 1;
848 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
849 roundMask
|= roundIncrement
;
851 zSig0
&= ~ roundMask
;
852 return packFloatx80( zSign
, zExp
, zSig0
);
856 status
->float_exception_flags
|= float_flag_inexact
;
858 zSig0
+= roundIncrement
;
859 if ( zSig0
< roundIncrement
) {
861 zSig0
= LIT64( 0x8000000000000000 );
863 roundIncrement
= roundMask
+ 1;
864 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
865 roundMask
|= roundIncrement
;
867 zSig0
&= ~ roundMask
;
868 if ( zSig0
== 0 ) zExp
= 0;
869 return packFloatx80( zSign
, zExp
, zSig0
);
871 switch (roundingMode
) {
872 case float_round_nearest_even
:
873 case float_round_ties_away
:
874 increment
= ((int64_t)zSig1
< 0);
876 case float_round_to_zero
:
880 increment
= !zSign
&& zSig1
;
882 case float_round_down
:
883 increment
= zSign
&& zSig1
;
888 if ( 0x7FFD <= (uint32_t) ( zExp
- 1 ) ) {
889 if ( ( 0x7FFE < zExp
)
890 || ( ( zExp
== 0x7FFE )
891 && ( zSig0
== LIT64( 0xFFFFFFFFFFFFFFFF ) )
897 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
898 if ( ( roundingMode
== float_round_to_zero
)
899 || ( zSign
&& ( roundingMode
== float_round_up
) )
900 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
902 return packFloatx80( zSign
, 0x7FFE, ~ roundMask
);
904 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
908 (status
->float_detect_tininess
909 == float_tininess_before_rounding
)
912 || ( zSig0
< LIT64( 0xFFFFFFFFFFFFFFFF ) );
913 shift64ExtraRightJamming( zSig0
, zSig1
, 1 - zExp
, &zSig0
, &zSig1
);
915 if (isTiny
&& zSig1
) {
916 float_raise(float_flag_underflow
, status
);
919 status
->float_exception_flags
|= float_flag_inexact
;
921 switch (roundingMode
) {
922 case float_round_nearest_even
:
923 case float_round_ties_away
:
924 increment
= ((int64_t)zSig1
< 0);
926 case float_round_to_zero
:
930 increment
= !zSign
&& zSig1
;
932 case float_round_down
:
933 increment
= zSign
&& zSig1
;
941 ~ ( ( (uint64_t) ( zSig1
<<1 ) == 0 ) & roundNearestEven
);
942 if ( (int64_t) zSig0
< 0 ) zExp
= 1;
944 return packFloatx80( zSign
, zExp
, zSig0
);
948 status
->float_exception_flags
|= float_flag_inexact
;
954 zSig0
= LIT64( 0x8000000000000000 );
957 zSig0
&= ~ ( ( (uint64_t) ( zSig1
<<1 ) == 0 ) & roundNearestEven
);
961 if ( zSig0
== 0 ) zExp
= 0;
963 return packFloatx80( zSign
, zExp
, zSig0
);
967 /*----------------------------------------------------------------------------
968 | Takes an abstract floating-point value having sign `zSign', exponent
969 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
970 | and returns the proper extended double-precision floating-point value
971 | corresponding to the abstract input. This routine is just like
972 | `roundAndPackFloatx80' except that the input significand does not have to be
974 *----------------------------------------------------------------------------*/
976 static floatx80
normalizeRoundAndPackFloatx80(int8_t roundingPrecision
,
977 flag zSign
, int32_t zExp
,
978 uint64_t zSig0
, uint64_t zSig1
,
979 float_status
*status
)
988 shiftCount
= countLeadingZeros64( zSig0
);
989 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
991 return roundAndPackFloatx80(roundingPrecision
, zSign
, zExp
,
992 zSig0
, zSig1
, status
);
996 /*----------------------------------------------------------------------------
997 | Returns the least-significant 64 fraction bits of the quadruple-precision
998 | floating-point value `a'.
999 *----------------------------------------------------------------------------*/
1001 static inline uint64_t extractFloat128Frac1( float128 a
)
1008 /*----------------------------------------------------------------------------
1009 | Returns the most-significant 48 fraction bits of the quadruple-precision
1010 | floating-point value `a'.
1011 *----------------------------------------------------------------------------*/
1013 static inline uint64_t extractFloat128Frac0( float128 a
)
1016 return a
.high
& LIT64( 0x0000FFFFFFFFFFFF );
1020 /*----------------------------------------------------------------------------
1021 | Returns the exponent bits of the quadruple-precision floating-point value
1023 *----------------------------------------------------------------------------*/
1025 static inline int32_t extractFloat128Exp( float128 a
)
1028 return ( a
.high
>>48 ) & 0x7FFF;
1032 /*----------------------------------------------------------------------------
1033 | Returns the sign bit of the quadruple-precision floating-point value `a'.
1034 *----------------------------------------------------------------------------*/
1036 static inline flag
extractFloat128Sign( float128 a
)
1043 /*----------------------------------------------------------------------------
1044 | Normalizes the subnormal quadruple-precision floating-point value
1045 | represented by the denormalized significand formed by the concatenation of
1046 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
1047 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
1048 | significand are stored at the location pointed to by `zSig0Ptr', and the
1049 | least significant 64 bits of the normalized significand are stored at the
1050 | location pointed to by `zSig1Ptr'.
1051 *----------------------------------------------------------------------------*/
1054 normalizeFloat128Subnormal(
1065 shiftCount
= countLeadingZeros64( aSig1
) - 15;
1066 if ( shiftCount
< 0 ) {
1067 *zSig0Ptr
= aSig1
>>( - shiftCount
);
1068 *zSig1Ptr
= aSig1
<<( shiftCount
& 63 );
1071 *zSig0Ptr
= aSig1
<<shiftCount
;
1074 *zExpPtr
= - shiftCount
- 63;
1077 shiftCount
= countLeadingZeros64( aSig0
) - 15;
1078 shortShift128Left( aSig0
, aSig1
, shiftCount
, zSig0Ptr
, zSig1Ptr
);
1079 *zExpPtr
= 1 - shiftCount
;
1084 /*----------------------------------------------------------------------------
1085 | Packs the sign `zSign', the exponent `zExp', and the significand formed
1086 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
1087 | floating-point value, returning the result. After being shifted into the
1088 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
1089 | added together to form the most significant 32 bits of the result. This
1090 | means that any integer portion of `zSig0' will be added into the exponent.
1091 | Since a properly normalized significand will have an integer portion equal
1092 | to 1, the `zExp' input should be 1 less than the desired result exponent
1093 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
1095 *----------------------------------------------------------------------------*/
1097 static inline float128
1098 packFloat128( flag zSign
, int32_t zExp
, uint64_t zSig0
, uint64_t zSig1
)
1103 z
.high
= ( ( (uint64_t) zSign
)<<63 ) + ( ( (uint64_t) zExp
)<<48 ) + zSig0
;
1108 /*----------------------------------------------------------------------------
1109 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1110 | and extended significand formed by the concatenation of `zSig0', `zSig1',
1111 | and `zSig2', and returns the proper quadruple-precision floating-point value
1112 | corresponding to the abstract input. Ordinarily, the abstract value is
1113 | simply rounded and packed into the quadruple-precision format, with the
1114 | inexact exception raised if the abstract input cannot be represented
1115 | exactly. However, if the abstract value is too large, the overflow and
1116 | inexact exceptions are raised and an infinity or maximal finite value is
1117 | returned. If the abstract value is too small, the input value is rounded to
1118 | a subnormal number, and the underflow and inexact exceptions are raised if
1119 | the abstract input cannot be represented exactly as a subnormal quadruple-
1120 | precision floating-point number.
1121 | The input significand must be normalized or smaller. If the input
1122 | significand is not normalized, `zExp' must be 0; in that case, the result
1123 | returned is a subnormal number, and it must not require rounding. In the
1124 | usual case that the input significand is normalized, `zExp' must be 1 less
1125 | than the ``true'' floating-point exponent. The handling of underflow and
1126 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1127 *----------------------------------------------------------------------------*/
1129 static float128
roundAndPackFloat128(flag zSign
, int32_t zExp
,
1130 uint64_t zSig0
, uint64_t zSig1
,
1131 uint64_t zSig2
, float_status
*status
)
1133 int8_t roundingMode
;
1134 flag roundNearestEven
, increment
, isTiny
;
1136 roundingMode
= status
->float_rounding_mode
;
1137 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
1138 switch (roundingMode
) {
1139 case float_round_nearest_even
:
1140 case float_round_ties_away
:
1141 increment
= ((int64_t)zSig2
< 0);
1143 case float_round_to_zero
:
1146 case float_round_up
:
1147 increment
= !zSign
&& zSig2
;
1149 case float_round_down
:
1150 increment
= zSign
&& zSig2
;
1155 if ( 0x7FFD <= (uint32_t) zExp
) {
1156 if ( ( 0x7FFD < zExp
)
1157 || ( ( zExp
== 0x7FFD )
1159 LIT64( 0x0001FFFFFFFFFFFF ),
1160 LIT64( 0xFFFFFFFFFFFFFFFF ),
1167 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
1168 if ( ( roundingMode
== float_round_to_zero
)
1169 || ( zSign
&& ( roundingMode
== float_round_up
) )
1170 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
1176 LIT64( 0x0000FFFFFFFFFFFF ),
1177 LIT64( 0xFFFFFFFFFFFFFFFF )
1180 return packFloat128( zSign
, 0x7FFF, 0, 0 );
1183 if (status
->flush_to_zero
) {
1184 float_raise(float_flag_output_denormal
, status
);
1185 return packFloat128(zSign
, 0, 0, 0);
1188 (status
->float_detect_tininess
1189 == float_tininess_before_rounding
)
1195 LIT64( 0x0001FFFFFFFFFFFF ),
1196 LIT64( 0xFFFFFFFFFFFFFFFF )
1198 shift128ExtraRightJamming(
1199 zSig0
, zSig1
, zSig2
, - zExp
, &zSig0
, &zSig1
, &zSig2
);
1201 if (isTiny
&& zSig2
) {
1202 float_raise(float_flag_underflow
, status
);
1204 switch (roundingMode
) {
1205 case float_round_nearest_even
:
1206 case float_round_ties_away
:
1207 increment
= ((int64_t)zSig2
< 0);
1209 case float_round_to_zero
:
1212 case float_round_up
:
1213 increment
= !zSign
&& zSig2
;
1215 case float_round_down
:
1216 increment
= zSign
&& zSig2
;
1224 status
->float_exception_flags
|= float_flag_inexact
;
1227 add128( zSig0
, zSig1
, 0, 1, &zSig0
, &zSig1
);
1228 zSig1
&= ~ ( ( zSig2
+ zSig2
== 0 ) & roundNearestEven
);
1231 if ( ( zSig0
| zSig1
) == 0 ) zExp
= 0;
1233 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
1237 /*----------------------------------------------------------------------------
1238 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1239 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1240 | returns the proper quadruple-precision floating-point value corresponding
1241 | to the abstract input. This routine is just like `roundAndPackFloat128'
1242 | except that the input significand has fewer bits and does not have to be
1243 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1245 *----------------------------------------------------------------------------*/
1247 static float128
normalizeRoundAndPackFloat128(flag zSign
, int32_t zExp
,
1248 uint64_t zSig0
, uint64_t zSig1
,
1249 float_status
*status
)
1259 shiftCount
= countLeadingZeros64( zSig0
) - 15;
1260 if ( 0 <= shiftCount
) {
1262 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
1265 shift128ExtraRightJamming(
1266 zSig0
, zSig1
, 0, - shiftCount
, &zSig0
, &zSig1
, &zSig2
);
1269 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
1273 /*----------------------------------------------------------------------------
1274 | Returns the result of converting the 32-bit two's complement integer `a'
1275 | to the single-precision floating-point format. The conversion is performed
1276 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1277 *----------------------------------------------------------------------------*/
1279 float32
int32_to_float32(int32_t a
, float_status
*status
)
1283 if ( a
== 0 ) return float32_zero
;
1284 if ( a
== (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1286 return normalizeRoundAndPackFloat32(zSign
, 0x9C, zSign
? -a
: a
, status
);
1289 /*----------------------------------------------------------------------------
1290 | Returns the result of converting the 32-bit two's complement integer `a'
1291 | to the double-precision floating-point format. The conversion is performed
1292 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1293 *----------------------------------------------------------------------------*/
1295 float64
int32_to_float64(int32_t a
, float_status
*status
)
1302 if ( a
== 0 ) return float64_zero
;
1304 absA
= zSign
? - a
: a
;
1305 shiftCount
= countLeadingZeros32( absA
) + 21;
1307 return packFloat64( zSign
, 0x432 - shiftCount
, zSig
<<shiftCount
);
1311 /*----------------------------------------------------------------------------
1312 | Returns the result of converting the 32-bit two's complement integer `a'
1313 | to the extended double-precision floating-point format. The conversion
1314 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1316 *----------------------------------------------------------------------------*/
1318 floatx80
int32_to_floatx80(int32_t a
, float_status
*status
)
1325 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
1327 absA
= zSign
? - a
: a
;
1328 shiftCount
= countLeadingZeros32( absA
) + 32;
1330 return packFloatx80( zSign
, 0x403E - shiftCount
, zSig
<<shiftCount
);
1334 /*----------------------------------------------------------------------------
1335 | Returns the result of converting the 32-bit two's complement integer `a' to
1336 | the quadruple-precision floating-point format. The conversion is performed
1337 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1338 *----------------------------------------------------------------------------*/
1340 float128
int32_to_float128(int32_t a
, float_status
*status
)
1347 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
1349 absA
= zSign
? - a
: a
;
1350 shiftCount
= countLeadingZeros32( absA
) + 17;
1352 return packFloat128( zSign
, 0x402E - shiftCount
, zSig0
<<shiftCount
, 0 );
1356 /*----------------------------------------------------------------------------
1357 | Returns the result of converting the 64-bit two's complement integer `a'
1358 | to the single-precision floating-point format. The conversion is performed
1359 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1360 *----------------------------------------------------------------------------*/
1362 float32
int64_to_float32(int64_t a
, float_status
*status
)
1368 if ( a
== 0 ) return float32_zero
;
1370 absA
= zSign
? - a
: a
;
1371 shiftCount
= countLeadingZeros64( absA
) - 40;
1372 if ( 0 <= shiftCount
) {
1373 return packFloat32( zSign
, 0x95 - shiftCount
, absA
<<shiftCount
);
1377 if ( shiftCount
< 0 ) {
1378 shift64RightJamming( absA
, - shiftCount
, &absA
);
1381 absA
<<= shiftCount
;
1383 return roundAndPackFloat32(zSign
, 0x9C - shiftCount
, absA
, status
);
1388 /*----------------------------------------------------------------------------
1389 | Returns the result of converting the 64-bit two's complement integer `a'
1390 | to the double-precision floating-point format. The conversion is performed
1391 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1392 *----------------------------------------------------------------------------*/
1394 float64
int64_to_float64(int64_t a
, float_status
*status
)
1398 if ( a
== 0 ) return float64_zero
;
1399 if ( a
== (int64_t) LIT64( 0x8000000000000000 ) ) {
1400 return packFloat64( 1, 0x43E, 0 );
1403 return normalizeRoundAndPackFloat64(zSign
, 0x43C, zSign
? -a
: a
, status
);
1406 /*----------------------------------------------------------------------------
1407 | Returns the result of converting the 64-bit two's complement integer `a'
1408 | to the extended double-precision floating-point format. The conversion
1409 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1411 *----------------------------------------------------------------------------*/
1413 floatx80
int64_to_floatx80(int64_t a
, float_status
*status
)
1419 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
1421 absA
= zSign
? - a
: a
;
1422 shiftCount
= countLeadingZeros64( absA
);
1423 return packFloatx80( zSign
, 0x403E - shiftCount
, absA
<<shiftCount
);
1427 /*----------------------------------------------------------------------------
1428 | Returns the result of converting the 64-bit two's complement integer `a' to
1429 | the quadruple-precision floating-point format. The conversion is performed
1430 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1431 *----------------------------------------------------------------------------*/
1433 float128
int64_to_float128(int64_t a
, float_status
*status
)
1439 uint64_t zSig0
, zSig1
;
1441 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
1443 absA
= zSign
? - a
: a
;
1444 shiftCount
= countLeadingZeros64( absA
) + 49;
1445 zExp
= 0x406E - shiftCount
;
1446 if ( 64 <= shiftCount
) {
1455 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
1456 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
1460 /*----------------------------------------------------------------------------
1461 | Returns the result of converting the 64-bit unsigned integer `a'
1462 | to the single-precision floating-point format. The conversion is performed
1463 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1464 *----------------------------------------------------------------------------*/
1466 float32
uint64_to_float32(uint64_t a
, float_status
*status
)
1471 return float32_zero
;
1474 /* Determine (left) shift needed to put first set bit into bit posn 23
1475 * (since packFloat32() expects the binary point between bits 23 and 22);
1476 * this is the fast case for smallish numbers.
1478 shiftcount
= countLeadingZeros64(a
) - 40;
1479 if (shiftcount
>= 0) {
1480 return packFloat32(0, 0x95 - shiftcount
, a
<< shiftcount
);
1482 /* Otherwise we need to do a round-and-pack. roundAndPackFloat32()
1483 * expects the binary point between bits 30 and 29, hence the + 7.
1486 if (shiftcount
< 0) {
1487 shift64RightJamming(a
, -shiftcount
, &a
);
1492 return roundAndPackFloat32(0, 0x9c - shiftcount
, a
, status
);
1495 /*----------------------------------------------------------------------------
1496 | Returns the result of converting the 64-bit unsigned integer `a'
1497 | to the double-precision floating-point format. The conversion is performed
1498 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1499 *----------------------------------------------------------------------------*/
1501 float64
uint64_to_float64(uint64_t a
, float_status
*status
)
1507 return float64_zero
;
1510 shiftcount
= countLeadingZeros64(a
) - 1;
1511 if (shiftcount
< 0) {
1512 shift64RightJamming(a
, -shiftcount
, &a
);
1516 return roundAndPackFloat64(0, exp
- shiftcount
, a
, status
);
1519 /*----------------------------------------------------------------------------
1520 | Returns the result of converting the 64-bit unsigned integer `a'
1521 | to the quadruple-precision floating-point format. The conversion is performed
1522 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1523 *----------------------------------------------------------------------------*/
1525 float128
uint64_to_float128(uint64_t a
, float_status
*status
)
1528 return float128_zero
;
1530 return normalizeRoundAndPackFloat128(0, 0x406E, a
, 0, status
);
1533 /*----------------------------------------------------------------------------
1534 | Returns the result of converting the single-precision floating-point value
1535 | `a' to the 32-bit two's complement integer format. The conversion is
1536 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1537 | Arithmetic---which means in particular that the conversion is rounded
1538 | according to the current rounding mode. If `a' is a NaN, the largest
1539 | positive integer is returned. Otherwise, if the conversion overflows, the
1540 | largest integer with the same sign as `a' is returned.
1541 *----------------------------------------------------------------------------*/
1543 int32_t float32_to_int32(float32 a
, float_status
*status
)
1546 int_fast16_t aExp
, shiftCount
;
1550 a
= float32_squash_input_denormal(a
, status
);
1551 aSig
= extractFloat32Frac( a
);
1552 aExp
= extractFloat32Exp( a
);
1553 aSign
= extractFloat32Sign( a
);
1554 if ( ( aExp
== 0xFF ) && aSig
) aSign
= 0;
1555 if ( aExp
) aSig
|= 0x00800000;
1556 shiftCount
= 0xAF - aExp
;
1559 if ( 0 < shiftCount
) shift64RightJamming( aSig64
, shiftCount
, &aSig64
);
1560 return roundAndPackInt32(aSign
, aSig64
, status
);
1564 /*----------------------------------------------------------------------------
1565 | Returns the result of converting the single-precision floating-point value
1566 | `a' to the 32-bit two's complement integer format. The conversion is
1567 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1568 | Arithmetic, except that the conversion is always rounded toward zero.
1569 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1570 | the conversion overflows, the largest integer with the same sign as `a' is
1572 *----------------------------------------------------------------------------*/
1574 int32_t float32_to_int32_round_to_zero(float32 a
, float_status
*status
)
1577 int_fast16_t aExp
, shiftCount
;
1580 a
= float32_squash_input_denormal(a
, status
);
1582 aSig
= extractFloat32Frac( a
);
1583 aExp
= extractFloat32Exp( a
);
1584 aSign
= extractFloat32Sign( a
);
1585 shiftCount
= aExp
- 0x9E;
1586 if ( 0 <= shiftCount
) {
1587 if ( float32_val(a
) != 0xCF000000 ) {
1588 float_raise(float_flag_invalid
, status
);
1589 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) return 0x7FFFFFFF;
1591 return (int32_t) 0x80000000;
1593 else if ( aExp
<= 0x7E ) {
1595 status
->float_exception_flags
|= float_flag_inexact
;
1599 aSig
= ( aSig
| 0x00800000 )<<8;
1600 z
= aSig
>>( - shiftCount
);
1601 if ( (uint32_t) ( aSig
<<( shiftCount
& 31 ) ) ) {
1602 status
->float_exception_flags
|= float_flag_inexact
;
1604 if ( aSign
) z
= - z
;
1609 /*----------------------------------------------------------------------------
1610 | Returns the result of converting the single-precision floating-point value
1611 | `a' to the 16-bit two's complement integer format. The conversion is
1612 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1613 | Arithmetic, except that the conversion is always rounded toward zero.
1614 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1615 | the conversion overflows, the largest integer with the same sign as `a' is
1617 *----------------------------------------------------------------------------*/
1619 int_fast16_t float32_to_int16_round_to_zero(float32 a
, float_status
*status
)
1622 int_fast16_t aExp
, shiftCount
;
1626 aSig
= extractFloat32Frac( a
);
1627 aExp
= extractFloat32Exp( a
);
1628 aSign
= extractFloat32Sign( a
);
1629 shiftCount
= aExp
- 0x8E;
1630 if ( 0 <= shiftCount
) {
1631 if ( float32_val(a
) != 0xC7000000 ) {
1632 float_raise(float_flag_invalid
, status
);
1633 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) {
1637 return (int32_t) 0xffff8000;
1639 else if ( aExp
<= 0x7E ) {
1640 if ( aExp
| aSig
) {
1641 status
->float_exception_flags
|= float_flag_inexact
;
1646 aSig
= ( aSig
| 0x00800000 )<<8;
1647 z
= aSig
>>( - shiftCount
);
1648 if ( (uint32_t) ( aSig
<<( shiftCount
& 31 ) ) ) {
1649 status
->float_exception_flags
|= float_flag_inexact
;
1658 /*----------------------------------------------------------------------------
1659 | Returns the result of converting the single-precision floating-point value
1660 | `a' to the 64-bit two's complement integer format. The conversion is
1661 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1662 | Arithmetic---which means in particular that the conversion is rounded
1663 | according to the current rounding mode. If `a' is a NaN, the largest
1664 | positive integer is returned. Otherwise, if the conversion overflows, the
1665 | largest integer with the same sign as `a' is returned.
1666 *----------------------------------------------------------------------------*/
1668 int64_t float32_to_int64(float32 a
, float_status
*status
)
1671 int_fast16_t aExp
, shiftCount
;
1673 uint64_t aSig64
, aSigExtra
;
1674 a
= float32_squash_input_denormal(a
, status
);
1676 aSig
= extractFloat32Frac( a
);
1677 aExp
= extractFloat32Exp( a
);
1678 aSign
= extractFloat32Sign( a
);
1679 shiftCount
= 0xBE - aExp
;
1680 if ( shiftCount
< 0 ) {
1681 float_raise(float_flag_invalid
, status
);
1682 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) {
1683 return LIT64( 0x7FFFFFFFFFFFFFFF );
1685 return (int64_t) LIT64( 0x8000000000000000 );
1687 if ( aExp
) aSig
|= 0x00800000;
1690 shift64ExtraRightJamming( aSig64
, 0, shiftCount
, &aSig64
, &aSigExtra
);
1691 return roundAndPackInt64(aSign
, aSig64
, aSigExtra
, status
);
1695 /*----------------------------------------------------------------------------
1696 | Returns the result of converting the single-precision floating-point value
1697 | `a' to the 64-bit unsigned integer format. The conversion is
1698 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1699 | Arithmetic---which means in particular that the conversion is rounded
1700 | according to the current rounding mode. If `a' is a NaN, the largest
1701 | unsigned integer is returned. Otherwise, if the conversion overflows, the
1702 | largest unsigned integer is returned. If the 'a' is negative, the result
1703 | is rounded and zero is returned; values that do not round to zero will
1704 | raise the inexact exception flag.
1705 *----------------------------------------------------------------------------*/
1707 uint64_t float32_to_uint64(float32 a
, float_status
*status
)
1710 int_fast16_t aExp
, shiftCount
;
1712 uint64_t aSig64
, aSigExtra
;
1713 a
= float32_squash_input_denormal(a
, status
);
1715 aSig
= extractFloat32Frac(a
);
1716 aExp
= extractFloat32Exp(a
);
1717 aSign
= extractFloat32Sign(a
);
1718 if ((aSign
) && (aExp
> 126)) {
1719 float_raise(float_flag_invalid
, status
);
1720 if (float32_is_any_nan(a
)) {
1721 return LIT64(0xFFFFFFFFFFFFFFFF);
1726 shiftCount
= 0xBE - aExp
;
1730 if (shiftCount
< 0) {
1731 float_raise(float_flag_invalid
, status
);
1732 return LIT64(0xFFFFFFFFFFFFFFFF);
1737 shift64ExtraRightJamming(aSig64
, 0, shiftCount
, &aSig64
, &aSigExtra
);
1738 return roundAndPackUint64(aSign
, aSig64
, aSigExtra
, status
);
1741 /*----------------------------------------------------------------------------
1742 | Returns the result of converting the single-precision floating-point value
1743 | `a' to the 64-bit unsigned integer format. The conversion is
1744 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1745 | Arithmetic, except that the conversion is always rounded toward zero. If
1746 | `a' is a NaN, the largest unsigned integer is returned. Otherwise, if the
1747 | conversion overflows, the largest unsigned integer is returned. If the
1748 | 'a' is negative, the result is rounded and zero is returned; values that do
1749 | not round to zero will raise the inexact flag.
1750 *----------------------------------------------------------------------------*/
1752 uint64_t float32_to_uint64_round_to_zero(float32 a
, float_status
*status
)
1754 signed char current_rounding_mode
= status
->float_rounding_mode
;
1755 set_float_rounding_mode(float_round_to_zero
, status
);
1756 int64_t v
= float32_to_uint64(a
, status
);
1757 set_float_rounding_mode(current_rounding_mode
, status
);
1761 /*----------------------------------------------------------------------------
1762 | Returns the result of converting the single-precision floating-point value
1763 | `a' to the 64-bit two's complement integer format. The conversion is
1764 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1765 | Arithmetic, except that the conversion is always rounded toward zero. If
1766 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1767 | conversion overflows, the largest integer with the same sign as `a' is
1769 *----------------------------------------------------------------------------*/
1771 int64_t float32_to_int64_round_to_zero(float32 a
, float_status
*status
)
1774 int_fast16_t aExp
, shiftCount
;
1778 a
= float32_squash_input_denormal(a
, status
);
1780 aSig
= extractFloat32Frac( a
);
1781 aExp
= extractFloat32Exp( a
);
1782 aSign
= extractFloat32Sign( a
);
1783 shiftCount
= aExp
- 0xBE;
1784 if ( 0 <= shiftCount
) {
1785 if ( float32_val(a
) != 0xDF000000 ) {
1786 float_raise(float_flag_invalid
, status
);
1787 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) {
1788 return LIT64( 0x7FFFFFFFFFFFFFFF );
1791 return (int64_t) LIT64( 0x8000000000000000 );
1793 else if ( aExp
<= 0x7E ) {
1795 status
->float_exception_flags
|= float_flag_inexact
;
1799 aSig64
= aSig
| 0x00800000;
1801 z
= aSig64
>>( - shiftCount
);
1802 if ( (uint64_t) ( aSig64
<<( shiftCount
& 63 ) ) ) {
1803 status
->float_exception_flags
|= float_flag_inexact
;
1805 if ( aSign
) z
= - z
;
1810 /*----------------------------------------------------------------------------
1811 | Returns the result of converting the single-precision floating-point value
1812 | `a' to the double-precision floating-point format. The conversion is
1813 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1815 *----------------------------------------------------------------------------*/
1817 float64
float32_to_float64(float32 a
, float_status
*status
)
1822 a
= float32_squash_input_denormal(a
, status
);
1824 aSig
= extractFloat32Frac( a
);
1825 aExp
= extractFloat32Exp( a
);
1826 aSign
= extractFloat32Sign( a
);
1827 if ( aExp
== 0xFF ) {
1829 return commonNaNToFloat64(float32ToCommonNaN(a
, status
), status
);
1831 return packFloat64( aSign
, 0x7FF, 0 );
1834 if ( aSig
== 0 ) return packFloat64( aSign
, 0, 0 );
1835 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1838 return packFloat64( aSign
, aExp
+ 0x380, ( (uint64_t) aSig
)<<29 );
1842 /*----------------------------------------------------------------------------
1843 | Returns the result of converting the single-precision floating-point value
1844 | `a' to the extended double-precision floating-point format. The conversion
1845 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1847 *----------------------------------------------------------------------------*/
1849 floatx80
float32_to_floatx80(float32 a
, float_status
*status
)
1855 a
= float32_squash_input_denormal(a
, status
);
1856 aSig
= extractFloat32Frac( a
);
1857 aExp
= extractFloat32Exp( a
);
1858 aSign
= extractFloat32Sign( a
);
1859 if ( aExp
== 0xFF ) {
1861 return commonNaNToFloatx80(float32ToCommonNaN(a
, status
), status
);
1863 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
1866 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
1867 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1870 return packFloatx80( aSign
, aExp
+ 0x3F80, ( (uint64_t) aSig
)<<40 );
1874 /*----------------------------------------------------------------------------
1875 | Returns the result of converting the single-precision floating-point value
1876 | `a' to the double-precision floating-point format. The conversion is
1877 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1879 *----------------------------------------------------------------------------*/
1881 float128
float32_to_float128(float32 a
, float_status
*status
)
1887 a
= float32_squash_input_denormal(a
, status
);
1888 aSig
= extractFloat32Frac( a
);
1889 aExp
= extractFloat32Exp( a
);
1890 aSign
= extractFloat32Sign( a
);
1891 if ( aExp
== 0xFF ) {
1893 return commonNaNToFloat128(float32ToCommonNaN(a
, status
), status
);
1895 return packFloat128( aSign
, 0x7FFF, 0, 0 );
1898 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
1899 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1902 return packFloat128( aSign
, aExp
+ 0x3F80, ( (uint64_t) aSig
)<<25, 0 );
1906 /*----------------------------------------------------------------------------
1907 | Rounds the single-precision floating-point value `a' to an integer, and
1908 | returns the result as a single-precision floating-point value. The
1909 | operation is performed according to the IEC/IEEE Standard for Binary
1910 | Floating-Point Arithmetic.
1911 *----------------------------------------------------------------------------*/
1913 float32
float32_round_to_int(float32 a
, float_status
*status
)
1917 uint32_t lastBitMask
, roundBitsMask
;
1919 a
= float32_squash_input_denormal(a
, status
);
1921 aExp
= extractFloat32Exp( a
);
1922 if ( 0x96 <= aExp
) {
1923 if ( ( aExp
== 0xFF ) && extractFloat32Frac( a
) ) {
1924 return propagateFloat32NaN(a
, a
, status
);
1928 if ( aExp
<= 0x7E ) {
1929 if ( (uint32_t) ( float32_val(a
)<<1 ) == 0 ) return a
;
1930 status
->float_exception_flags
|= float_flag_inexact
;
1931 aSign
= extractFloat32Sign( a
);
1932 switch (status
->float_rounding_mode
) {
1933 case float_round_nearest_even
:
1934 if ( ( aExp
== 0x7E ) && extractFloat32Frac( a
) ) {
1935 return packFloat32( aSign
, 0x7F, 0 );
1938 case float_round_ties_away
:
1940 return packFloat32(aSign
, 0x7F, 0);
1943 case float_round_down
:
1944 return make_float32(aSign
? 0xBF800000 : 0);
1945 case float_round_up
:
1946 return make_float32(aSign
? 0x80000000 : 0x3F800000);
1948 return packFloat32( aSign
, 0, 0 );
1951 lastBitMask
<<= 0x96 - aExp
;
1952 roundBitsMask
= lastBitMask
- 1;
1954 switch (status
->float_rounding_mode
) {
1955 case float_round_nearest_even
:
1956 z
+= lastBitMask
>>1;
1957 if ((z
& roundBitsMask
) == 0) {
1961 case float_round_ties_away
:
1962 z
+= lastBitMask
>> 1;
1964 case float_round_to_zero
:
1966 case float_round_up
:
1967 if (!extractFloat32Sign(make_float32(z
))) {
1971 case float_round_down
:
1972 if (extractFloat32Sign(make_float32(z
))) {
1979 z
&= ~ roundBitsMask
;
1980 if (z
!= float32_val(a
)) {
1981 status
->float_exception_flags
|= float_flag_inexact
;
1983 return make_float32(z
);
1987 /*----------------------------------------------------------------------------
1988 | Returns the result of adding the absolute values of the single-precision
1989 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1990 | before being returned. `zSign' is ignored if the result is a NaN.
1991 | The addition is performed according to the IEC/IEEE Standard for Binary
1992 | Floating-Point Arithmetic.
1993 *----------------------------------------------------------------------------*/
1995 static float32
addFloat32Sigs(float32 a
, float32 b
, flag zSign
,
1996 float_status
*status
)
1998 int_fast16_t aExp
, bExp
, zExp
;
1999 uint32_t aSig
, bSig
, zSig
;
2000 int_fast16_t expDiff
;
2002 aSig
= extractFloat32Frac( a
);
2003 aExp
= extractFloat32Exp( a
);
2004 bSig
= extractFloat32Frac( b
);
2005 bExp
= extractFloat32Exp( b
);
2006 expDiff
= aExp
- bExp
;
2009 if ( 0 < expDiff
) {
2010 if ( aExp
== 0xFF ) {
2012 return propagateFloat32NaN(a
, b
, status
);
2022 shift32RightJamming( bSig
, expDiff
, &bSig
);
2025 else if ( expDiff
< 0 ) {
2026 if ( bExp
== 0xFF ) {
2028 return propagateFloat32NaN(a
, b
, status
);
2030 return packFloat32( zSign
, 0xFF, 0 );
2038 shift32RightJamming( aSig
, - expDiff
, &aSig
);
2042 if ( aExp
== 0xFF ) {
2044 return propagateFloat32NaN(a
, b
, status
);
2049 if (status
->flush_to_zero
) {
2051 float_raise(float_flag_output_denormal
, status
);
2053 return packFloat32(zSign
, 0, 0);
2055 return packFloat32( zSign
, 0, ( aSig
+ bSig
)>>6 );
2057 zSig
= 0x40000000 + aSig
+ bSig
;
2062 zSig
= ( aSig
+ bSig
)<<1;
2064 if ( (int32_t) zSig
< 0 ) {
2069 return roundAndPackFloat32(zSign
, zExp
, zSig
, status
);
2073 /*----------------------------------------------------------------------------
2074 | Returns the result of subtracting the absolute values of the single-
2075 | precision floating-point values `a' and `b'. If `zSign' is 1, the
2076 | difference is negated before being returned. `zSign' is ignored if the
2077 | result is a NaN. The subtraction is performed according to the IEC/IEEE
2078 | Standard for Binary Floating-Point Arithmetic.
2079 *----------------------------------------------------------------------------*/
2081 static float32
subFloat32Sigs(float32 a
, float32 b
, flag zSign
,
2082 float_status
*status
)
2084 int_fast16_t aExp
, bExp
, zExp
;
2085 uint32_t aSig
, bSig
, zSig
;
2086 int_fast16_t expDiff
;
2088 aSig
= extractFloat32Frac( a
);
2089 aExp
= extractFloat32Exp( a
);
2090 bSig
= extractFloat32Frac( b
);
2091 bExp
= extractFloat32Exp( b
);
2092 expDiff
= aExp
- bExp
;
2095 if ( 0 < expDiff
) goto aExpBigger
;
2096 if ( expDiff
< 0 ) goto bExpBigger
;
2097 if ( aExp
== 0xFF ) {
2099 return propagateFloat32NaN(a
, b
, status
);
2101 float_raise(float_flag_invalid
, status
);
2102 return float32_default_nan
;
2108 if ( bSig
< aSig
) goto aBigger
;
2109 if ( aSig
< bSig
) goto bBigger
;
2110 return packFloat32(status
->float_rounding_mode
== float_round_down
, 0, 0);
2112 if ( bExp
== 0xFF ) {
2114 return propagateFloat32NaN(a
, b
, status
);
2116 return packFloat32( zSign
^ 1, 0xFF, 0 );
2124 shift32RightJamming( aSig
, - expDiff
, &aSig
);
2130 goto normalizeRoundAndPack
;
2132 if ( aExp
== 0xFF ) {
2134 return propagateFloat32NaN(a
, b
, status
);
2144 shift32RightJamming( bSig
, expDiff
, &bSig
);
2149 normalizeRoundAndPack
:
2151 return normalizeRoundAndPackFloat32(zSign
, zExp
, zSig
, status
);
2155 /*----------------------------------------------------------------------------
2156 | Returns the result of adding the single-precision floating-point values `a'
2157 | and `b'. The operation is performed according to the IEC/IEEE Standard for
2158 | Binary Floating-Point Arithmetic.
2159 *----------------------------------------------------------------------------*/
2161 float32
float32_add(float32 a
, float32 b
, float_status
*status
)
2164 a
= float32_squash_input_denormal(a
, status
);
2165 b
= float32_squash_input_denormal(b
, status
);
2167 aSign
= extractFloat32Sign( a
);
2168 bSign
= extractFloat32Sign( b
);
2169 if ( aSign
== bSign
) {
2170 return addFloat32Sigs(a
, b
, aSign
, status
);
2173 return subFloat32Sigs(a
, b
, aSign
, status
);
2178 /*----------------------------------------------------------------------------
2179 | Returns the result of subtracting the single-precision floating-point values
2180 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2181 | for Binary Floating-Point Arithmetic.
2182 *----------------------------------------------------------------------------*/
2184 float32
float32_sub(float32 a
, float32 b
, float_status
*status
)
2187 a
= float32_squash_input_denormal(a
, status
);
2188 b
= float32_squash_input_denormal(b
, status
);
2190 aSign
= extractFloat32Sign( a
);
2191 bSign
= extractFloat32Sign( b
);
2192 if ( aSign
== bSign
) {
2193 return subFloat32Sigs(a
, b
, aSign
, status
);
2196 return addFloat32Sigs(a
, b
, aSign
, status
);
2201 /*----------------------------------------------------------------------------
2202 | Returns the result of multiplying the single-precision floating-point values
2203 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2204 | for Binary Floating-Point Arithmetic.
2205 *----------------------------------------------------------------------------*/
2207 float32
float32_mul(float32 a
, float32 b
, float_status
*status
)
2209 flag aSign
, bSign
, zSign
;
2210 int_fast16_t aExp
, bExp
, zExp
;
2211 uint32_t aSig
, bSig
;
2215 a
= float32_squash_input_denormal(a
, status
);
2216 b
= float32_squash_input_denormal(b
, status
);
2218 aSig
= extractFloat32Frac( a
);
2219 aExp
= extractFloat32Exp( a
);
2220 aSign
= extractFloat32Sign( a
);
2221 bSig
= extractFloat32Frac( b
);
2222 bExp
= extractFloat32Exp( b
);
2223 bSign
= extractFloat32Sign( b
);
2224 zSign
= aSign
^ bSign
;
2225 if ( aExp
== 0xFF ) {
2226 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
2227 return propagateFloat32NaN(a
, b
, status
);
2229 if ( ( bExp
| bSig
) == 0 ) {
2230 float_raise(float_flag_invalid
, status
);
2231 return float32_default_nan
;
2233 return packFloat32( zSign
, 0xFF, 0 );
2235 if ( bExp
== 0xFF ) {
2237 return propagateFloat32NaN(a
, b
, status
);
2239 if ( ( aExp
| aSig
) == 0 ) {
2240 float_raise(float_flag_invalid
, status
);
2241 return float32_default_nan
;
2243 return packFloat32( zSign
, 0xFF, 0 );
2246 if ( aSig
== 0 ) return packFloat32( zSign
, 0, 0 );
2247 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
2250 if ( bSig
== 0 ) return packFloat32( zSign
, 0, 0 );
2251 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
2253 zExp
= aExp
+ bExp
- 0x7F;
2254 aSig
= ( aSig
| 0x00800000 )<<7;
2255 bSig
= ( bSig
| 0x00800000 )<<8;
2256 shift64RightJamming( ( (uint64_t) aSig
) * bSig
, 32, &zSig64
);
2258 if ( 0 <= (int32_t) ( zSig
<<1 ) ) {
2262 return roundAndPackFloat32(zSign
, zExp
, zSig
, status
);
2266 /*----------------------------------------------------------------------------
2267 | Returns the result of dividing the single-precision floating-point value `a'
2268 | by the corresponding value `b'. The operation is performed according to the
2269 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2270 *----------------------------------------------------------------------------*/
2272 float32
float32_div(float32 a
, float32 b
, float_status
*status
)
2274 flag aSign
, bSign
, zSign
;
2275 int_fast16_t aExp
, bExp
, zExp
;
2276 uint32_t aSig
, bSig
, zSig
;
2277 a
= float32_squash_input_denormal(a
, status
);
2278 b
= float32_squash_input_denormal(b
, status
);
2280 aSig
= extractFloat32Frac( a
);
2281 aExp
= extractFloat32Exp( a
);
2282 aSign
= extractFloat32Sign( a
);
2283 bSig
= extractFloat32Frac( b
);
2284 bExp
= extractFloat32Exp( b
);
2285 bSign
= extractFloat32Sign( b
);
2286 zSign
= aSign
^ bSign
;
2287 if ( aExp
== 0xFF ) {
2289 return propagateFloat32NaN(a
, b
, status
);
2291 if ( bExp
== 0xFF ) {
2293 return propagateFloat32NaN(a
, b
, status
);
2295 float_raise(float_flag_invalid
, status
);
2296 return float32_default_nan
;
2298 return packFloat32( zSign
, 0xFF, 0 );
2300 if ( bExp
== 0xFF ) {
2302 return propagateFloat32NaN(a
, b
, status
);
2304 return packFloat32( zSign
, 0, 0 );
2308 if ( ( aExp
| aSig
) == 0 ) {
2309 float_raise(float_flag_invalid
, status
);
2310 return float32_default_nan
;
2312 float_raise(float_flag_divbyzero
, status
);
2313 return packFloat32( zSign
, 0xFF, 0 );
2315 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
2318 if ( aSig
== 0 ) return packFloat32( zSign
, 0, 0 );
2319 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
2321 zExp
= aExp
- bExp
+ 0x7D;
2322 aSig
= ( aSig
| 0x00800000 )<<7;
2323 bSig
= ( bSig
| 0x00800000 )<<8;
2324 if ( bSig
<= ( aSig
+ aSig
) ) {
2328 zSig
= ( ( (uint64_t) aSig
)<<32 ) / bSig
;
2329 if ( ( zSig
& 0x3F ) == 0 ) {
2330 zSig
|= ( (uint64_t) bSig
* zSig
!= ( (uint64_t) aSig
)<<32 );
2332 return roundAndPackFloat32(zSign
, zExp
, zSig
, status
);
2336 /*----------------------------------------------------------------------------
2337 | Returns the remainder of the single-precision floating-point value `a'
2338 | with respect to the corresponding value `b'. The operation is performed
2339 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2340 *----------------------------------------------------------------------------*/
2342 float32
float32_rem(float32 a
, float32 b
, float_status
*status
)
2345 int_fast16_t aExp
, bExp
, expDiff
;
2346 uint32_t aSig
, bSig
;
2348 uint64_t aSig64
, bSig64
, q64
;
2349 uint32_t alternateASig
;
2351 a
= float32_squash_input_denormal(a
, status
);
2352 b
= float32_squash_input_denormal(b
, status
);
2354 aSig
= extractFloat32Frac( a
);
2355 aExp
= extractFloat32Exp( a
);
2356 aSign
= extractFloat32Sign( a
);
2357 bSig
= extractFloat32Frac( b
);
2358 bExp
= extractFloat32Exp( b
);
2359 if ( aExp
== 0xFF ) {
2360 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
2361 return propagateFloat32NaN(a
, b
, status
);
2363 float_raise(float_flag_invalid
, status
);
2364 return float32_default_nan
;
2366 if ( bExp
== 0xFF ) {
2368 return propagateFloat32NaN(a
, b
, status
);
2374 float_raise(float_flag_invalid
, status
);
2375 return float32_default_nan
;
2377 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
2380 if ( aSig
== 0 ) return a
;
2381 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
2383 expDiff
= aExp
- bExp
;
2386 if ( expDiff
< 32 ) {
2389 if ( expDiff
< 0 ) {
2390 if ( expDiff
< -1 ) return a
;
2393 q
= ( bSig
<= aSig
);
2394 if ( q
) aSig
-= bSig
;
2395 if ( 0 < expDiff
) {
2396 q
= ( ( (uint64_t) aSig
)<<32 ) / bSig
;
2399 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
2407 if ( bSig
<= aSig
) aSig
-= bSig
;
2408 aSig64
= ( (uint64_t) aSig
)<<40;
2409 bSig64
= ( (uint64_t) bSig
)<<40;
2411 while ( 0 < expDiff
) {
2412 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
2413 q64
= ( 2 < q64
) ? q64
- 2 : 0;
2414 aSig64
= - ( ( bSig
* q64
)<<38 );
2418 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
2419 q64
= ( 2 < q64
) ? q64
- 2 : 0;
2420 q
= q64
>>( 64 - expDiff
);
2422 aSig
= ( ( aSig64
>>33 )<<( expDiff
- 1 ) ) - bSig
* q
;
2425 alternateASig
= aSig
;
2428 } while ( 0 <= (int32_t) aSig
);
2429 sigMean
= aSig
+ alternateASig
;
2430 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
2431 aSig
= alternateASig
;
2433 zSign
= ( (int32_t) aSig
< 0 );
2434 if ( zSign
) aSig
= - aSig
;
2435 return normalizeRoundAndPackFloat32(aSign
^ zSign
, bExp
, aSig
, status
);
2438 /*----------------------------------------------------------------------------
2439 | Returns the result of multiplying the single-precision floating-point values
2440 | `a' and `b' then adding 'c', with no intermediate rounding step after the
2441 | multiplication. The operation is performed according to the IEC/IEEE
2442 | Standard for Binary Floating-Point Arithmetic 754-2008.
2443 | The flags argument allows the caller to select negation of the
2444 | addend, the intermediate product, or the final result. (The difference
2445 | between this and having the caller do a separate negation is that negating
2446 | externally will flip the sign bit on NaNs.)
2447 *----------------------------------------------------------------------------*/
2449 float32
float32_muladd(float32 a
, float32 b
, float32 c
, int flags
,
2450 float_status
*status
)
2452 flag aSign
, bSign
, cSign
, zSign
;
2453 int_fast16_t aExp
, bExp
, cExp
, pExp
, zExp
, expDiff
;
2454 uint32_t aSig
, bSig
, cSig
;
2455 flag pInf
, pZero
, pSign
;
2456 uint64_t pSig64
, cSig64
, zSig64
;
2459 flag signflip
, infzero
;
2461 a
= float32_squash_input_denormal(a
, status
);
2462 b
= float32_squash_input_denormal(b
, status
);
2463 c
= float32_squash_input_denormal(c
, status
);
2464 aSig
= extractFloat32Frac(a
);
2465 aExp
= extractFloat32Exp(a
);
2466 aSign
= extractFloat32Sign(a
);
2467 bSig
= extractFloat32Frac(b
);
2468 bExp
= extractFloat32Exp(b
);
2469 bSign
= extractFloat32Sign(b
);
2470 cSig
= extractFloat32Frac(c
);
2471 cExp
= extractFloat32Exp(c
);
2472 cSign
= extractFloat32Sign(c
);
2474 infzero
= ((aExp
== 0 && aSig
== 0 && bExp
== 0xff && bSig
== 0) ||
2475 (aExp
== 0xff && aSig
== 0 && bExp
== 0 && bSig
== 0));
2477 /* It is implementation-defined whether the cases of (0,inf,qnan)
2478 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
2479 * they return if they do), so we have to hand this information
2480 * off to the target-specific pick-a-NaN routine.
2482 if (((aExp
== 0xff) && aSig
) ||
2483 ((bExp
== 0xff) && bSig
) ||
2484 ((cExp
== 0xff) && cSig
)) {
2485 return propagateFloat32MulAddNaN(a
, b
, c
, infzero
, status
);
2489 float_raise(float_flag_invalid
, status
);
2490 return float32_default_nan
;
2493 if (flags
& float_muladd_negate_c
) {
2497 signflip
= (flags
& float_muladd_negate_result
) ? 1 : 0;
2499 /* Work out the sign and type of the product */
2500 pSign
= aSign
^ bSign
;
2501 if (flags
& float_muladd_negate_product
) {
2504 pInf
= (aExp
== 0xff) || (bExp
== 0xff);
2505 pZero
= ((aExp
| aSig
) == 0) || ((bExp
| bSig
) == 0);
2508 if (pInf
&& (pSign
^ cSign
)) {
2509 /* addition of opposite-signed infinities => InvalidOperation */
2510 float_raise(float_flag_invalid
, status
);
2511 return float32_default_nan
;
2513 /* Otherwise generate an infinity of the same sign */
2514 return packFloat32(cSign
^ signflip
, 0xff, 0);
2518 return packFloat32(pSign
^ signflip
, 0xff, 0);
2524 /* Adding two exact zeroes */
2525 if (pSign
== cSign
) {
2527 } else if (status
->float_rounding_mode
== float_round_down
) {
2532 return packFloat32(zSign
^ signflip
, 0, 0);
2534 /* Exact zero plus a denorm */
2535 if (status
->flush_to_zero
) {
2536 float_raise(float_flag_output_denormal
, status
);
2537 return packFloat32(cSign
^ signflip
, 0, 0);
2540 /* Zero plus something non-zero : just return the something */
2541 if (flags
& float_muladd_halve_result
) {
2543 normalizeFloat32Subnormal(cSig
, &cExp
, &cSig
);
2545 /* Subtract one to halve, and one again because roundAndPackFloat32
2546 * wants one less than the true exponent.
2549 cSig
= (cSig
| 0x00800000) << 7;
2550 return roundAndPackFloat32(cSign
^ signflip
, cExp
, cSig
, status
);
2552 return packFloat32(cSign
^ signflip
, cExp
, cSig
);
2556 normalizeFloat32Subnormal(aSig
, &aExp
, &aSig
);
2559 normalizeFloat32Subnormal(bSig
, &bExp
, &bSig
);
2562 /* Calculate the actual result a * b + c */
2564 /* Multiply first; this is easy. */
2565 /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
2566 * because we want the true exponent, not the "one-less-than"
2567 * flavour that roundAndPackFloat32() takes.
2569 pExp
= aExp
+ bExp
- 0x7e;
2570 aSig
= (aSig
| 0x00800000) << 7;
2571 bSig
= (bSig
| 0x00800000) << 8;
2572 pSig64
= (uint64_t)aSig
* bSig
;
2573 if ((int64_t)(pSig64
<< 1) >= 0) {
2578 zSign
= pSign
^ signflip
;
2580 /* Now pSig64 is the significand of the multiply, with the explicit bit in
2585 /* Throw out the special case of c being an exact zero now */
2586 shift64RightJamming(pSig64
, 32, &pSig64
);
2588 if (flags
& float_muladd_halve_result
) {
2591 return roundAndPackFloat32(zSign
, pExp
- 1,
2594 normalizeFloat32Subnormal(cSig
, &cExp
, &cSig
);
2597 cSig64
= (uint64_t)cSig
<< (62 - 23);
2598 cSig64
|= LIT64(0x4000000000000000);
2599 expDiff
= pExp
- cExp
;
2601 if (pSign
== cSign
) {
2604 /* scale c to match p */
2605 shift64RightJamming(cSig64
, expDiff
, &cSig64
);
2607 } else if (expDiff
< 0) {
2608 /* scale p to match c */
2609 shift64RightJamming(pSig64
, -expDiff
, &pSig64
);
2612 /* no scaling needed */
2615 /* Add significands and make sure explicit bit ends up in posn 62 */
2616 zSig64
= pSig64
+ cSig64
;
2617 if ((int64_t)zSig64
< 0) {
2618 shift64RightJamming(zSig64
, 1, &zSig64
);
2625 shift64RightJamming(cSig64
, expDiff
, &cSig64
);
2626 zSig64
= pSig64
- cSig64
;
2628 } else if (expDiff
< 0) {
2629 shift64RightJamming(pSig64
, -expDiff
, &pSig64
);
2630 zSig64
= cSig64
- pSig64
;
2635 if (cSig64
< pSig64
) {
2636 zSig64
= pSig64
- cSig64
;
2637 } else if (pSig64
< cSig64
) {
2638 zSig64
= cSig64
- pSig64
;
2643 if (status
->float_rounding_mode
== float_round_down
) {
2646 return packFloat32(zSign
, 0, 0);
2650 /* Normalize to put the explicit bit back into bit 62. */
2651 shiftcount
= countLeadingZeros64(zSig64
) - 1;
2652 zSig64
<<= shiftcount
;
2655 if (flags
& float_muladd_halve_result
) {
2659 shift64RightJamming(zSig64
, 32, &zSig64
);
2660 return roundAndPackFloat32(zSign
, zExp
, zSig64
, status
);
2664 /*----------------------------------------------------------------------------
2665 | Returns the square root of the single-precision floating-point value `a'.
2666 | The operation is performed according to the IEC/IEEE Standard for Binary
2667 | Floating-Point Arithmetic.
2668 *----------------------------------------------------------------------------*/
2670 float32
float32_sqrt(float32 a
, float_status
*status
)
2673 int_fast16_t aExp
, zExp
;
2674 uint32_t aSig
, zSig
;
2676 a
= float32_squash_input_denormal(a
, status
);
2678 aSig
= extractFloat32Frac( a
);
2679 aExp
= extractFloat32Exp( a
);
2680 aSign
= extractFloat32Sign( a
);
2681 if ( aExp
== 0xFF ) {
2683 return propagateFloat32NaN(a
, float32_zero
, status
);
2685 if ( ! aSign
) return a
;
2686 float_raise(float_flag_invalid
, status
);
2687 return float32_default_nan
;
2690 if ( ( aExp
| aSig
) == 0 ) return a
;
2691 float_raise(float_flag_invalid
, status
);
2692 return float32_default_nan
;
2695 if ( aSig
== 0 ) return float32_zero
;
2696 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
2698 zExp
= ( ( aExp
- 0x7F )>>1 ) + 0x7E;
2699 aSig
= ( aSig
| 0x00800000 )<<8;
2700 zSig
= estimateSqrt32( aExp
, aSig
) + 2;
2701 if ( ( zSig
& 0x7F ) <= 5 ) {
2707 term
= ( (uint64_t) zSig
) * zSig
;
2708 rem
= ( ( (uint64_t) aSig
)<<32 ) - term
;
2709 while ( (int64_t) rem
< 0 ) {
2711 rem
+= ( ( (uint64_t) zSig
)<<1 ) | 1;
2713 zSig
|= ( rem
!= 0 );
2715 shift32RightJamming( zSig
, 1, &zSig
);
2717 return roundAndPackFloat32(0, zExp
, zSig
, status
);
2721 /*----------------------------------------------------------------------------
2722 | Returns the binary exponential of the single-precision floating-point value
2723 | `a'. The operation is performed according to the IEC/IEEE Standard for
2724 | Binary Floating-Point Arithmetic.
2726 | Uses the following identities:
2728 | 1. -------------------------------------------------------------------------
2732 | 2. -------------------------------------------------------------------------
2735 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2737 *----------------------------------------------------------------------------*/
2739 static const float64 float32_exp2_coefficients
[15] =
2741 const_float64( 0x3ff0000000000000ll
), /* 1 */
2742 const_float64( 0x3fe0000000000000ll
), /* 2 */
2743 const_float64( 0x3fc5555555555555ll
), /* 3 */
2744 const_float64( 0x3fa5555555555555ll
), /* 4 */
2745 const_float64( 0x3f81111111111111ll
), /* 5 */
2746 const_float64( 0x3f56c16c16c16c17ll
), /* 6 */
2747 const_float64( 0x3f2a01a01a01a01all
), /* 7 */
2748 const_float64( 0x3efa01a01a01a01all
), /* 8 */
2749 const_float64( 0x3ec71de3a556c734ll
), /* 9 */
2750 const_float64( 0x3e927e4fb7789f5cll
), /* 10 */
2751 const_float64( 0x3e5ae64567f544e4ll
), /* 11 */
2752 const_float64( 0x3e21eed8eff8d898ll
), /* 12 */
2753 const_float64( 0x3de6124613a86d09ll
), /* 13 */
2754 const_float64( 0x3da93974a8c07c9dll
), /* 14 */
2755 const_float64( 0x3d6ae7f3e733b81fll
), /* 15 */
2758 float32
float32_exp2(float32 a
, float_status
*status
)
2765 a
= float32_squash_input_denormal(a
, status
);
2767 aSig
= extractFloat32Frac( a
);
2768 aExp
= extractFloat32Exp( a
);
2769 aSign
= extractFloat32Sign( a
);
2771 if ( aExp
== 0xFF) {
2773 return propagateFloat32NaN(a
, float32_zero
, status
);
2775 return (aSign
) ? float32_zero
: a
;
2778 if (aSig
== 0) return float32_one
;
2781 float_raise(float_flag_inexact
, status
);
2783 /* ******************************* */
2784 /* using float64 for approximation */
2785 /* ******************************* */
2786 x
= float32_to_float64(a
, status
);
2787 x
= float64_mul(x
, float64_ln2
, status
);
2791 for (i
= 0 ; i
< 15 ; i
++) {
2794 f
= float64_mul(xn
, float32_exp2_coefficients
[i
], status
);
2795 r
= float64_add(r
, f
, status
);
2797 xn
= float64_mul(xn
, x
, status
);
2800 return float64_to_float32(r
, status
);
2803 /*----------------------------------------------------------------------------
2804 | Returns the binary log of the single-precision floating-point value `a'.
2805 | The operation is performed according to the IEC/IEEE Standard for Binary
2806 | Floating-Point Arithmetic.
2807 *----------------------------------------------------------------------------*/
2808 float32
float32_log2(float32 a
, float_status
*status
)
2812 uint32_t aSig
, zSig
, i
;
2814 a
= float32_squash_input_denormal(a
, status
);
2815 aSig
= extractFloat32Frac( a
);
2816 aExp
= extractFloat32Exp( a
);
2817 aSign
= extractFloat32Sign( a
);
2820 if ( aSig
== 0 ) return packFloat32( 1, 0xFF, 0 );
2821 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
2824 float_raise(float_flag_invalid
, status
);
2825 return float32_default_nan
;
2827 if ( aExp
== 0xFF ) {
2829 return propagateFloat32NaN(a
, float32_zero
, status
);
2839 for (i
= 1 << 22; i
> 0; i
>>= 1) {
2840 aSig
= ( (uint64_t)aSig
* aSig
) >> 23;
2841 if ( aSig
& 0x01000000 ) {
2850 return normalizeRoundAndPackFloat32(zSign
, 0x85, zSig
, status
);
2853 /*----------------------------------------------------------------------------
2854 | Returns 1 if the single-precision floating-point value `a' is equal to
2855 | the corresponding value `b', and 0 otherwise. The invalid exception is
2856 | raised if either operand is a NaN. Otherwise, the comparison is performed
2857 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2858 *----------------------------------------------------------------------------*/
2860 int float32_eq(float32 a
, float32 b
, float_status
*status
)
2863 a
= float32_squash_input_denormal(a
, status
);
2864 b
= float32_squash_input_denormal(b
, status
);
2866 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2867 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2869 float_raise(float_flag_invalid
, status
);
2872 av
= float32_val(a
);
2873 bv
= float32_val(b
);
2874 return ( av
== bv
) || ( (uint32_t) ( ( av
| bv
)<<1 ) == 0 );
2877 /*----------------------------------------------------------------------------
2878 | Returns 1 if the single-precision floating-point value `a' is less than
2879 | or equal to the corresponding value `b', and 0 otherwise. The invalid
2880 | exception is raised if either operand is a NaN. The comparison is performed
2881 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2882 *----------------------------------------------------------------------------*/
2884 int float32_le(float32 a
, float32 b
, float_status
*status
)
2888 a
= float32_squash_input_denormal(a
, status
);
2889 b
= float32_squash_input_denormal(b
, status
);
2891 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2892 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2894 float_raise(float_flag_invalid
, status
);
2897 aSign
= extractFloat32Sign( a
);
2898 bSign
= extractFloat32Sign( b
);
2899 av
= float32_val(a
);
2900 bv
= float32_val(b
);
2901 if ( aSign
!= bSign
) return aSign
|| ( (uint32_t) ( ( av
| bv
)<<1 ) == 0 );
2902 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
2906 /*----------------------------------------------------------------------------
2907 | Returns 1 if the single-precision floating-point value `a' is less than
2908 | the corresponding value `b', and 0 otherwise. The invalid exception is
2909 | raised if either operand is a NaN. The comparison is performed according
2910 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2911 *----------------------------------------------------------------------------*/
2913 int float32_lt(float32 a
, float32 b
, float_status
*status
)
2917 a
= float32_squash_input_denormal(a
, status
);
2918 b
= float32_squash_input_denormal(b
, status
);
2920 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2921 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2923 float_raise(float_flag_invalid
, status
);
2926 aSign
= extractFloat32Sign( a
);
2927 bSign
= extractFloat32Sign( b
);
2928 av
= float32_val(a
);
2929 bv
= float32_val(b
);
2930 if ( aSign
!= bSign
) return aSign
&& ( (uint32_t) ( ( av
| bv
)<<1 ) != 0 );
2931 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
2935 /*----------------------------------------------------------------------------
2936 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2937 | be compared, and 0 otherwise. The invalid exception is raised if either
2938 | operand is a NaN. The comparison is performed according to the IEC/IEEE
2939 | Standard for Binary Floating-Point Arithmetic.
2940 *----------------------------------------------------------------------------*/
2942 int float32_unordered(float32 a
, float32 b
, float_status
*status
)
2944 a
= float32_squash_input_denormal(a
, status
);
2945 b
= float32_squash_input_denormal(b
, status
);
2947 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2948 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2950 float_raise(float_flag_invalid
, status
);
2956 /*----------------------------------------------------------------------------
2957 | Returns 1 if the single-precision floating-point value `a' is equal to
2958 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2959 | exception. The comparison is performed according to the IEC/IEEE Standard
2960 | for Binary Floating-Point Arithmetic.
2961 *----------------------------------------------------------------------------*/
2963 int float32_eq_quiet(float32 a
, float32 b
, float_status
*status
)
2965 a
= float32_squash_input_denormal(a
, status
);
2966 b
= float32_squash_input_denormal(b
, status
);
2968 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2969 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2971 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
2972 float_raise(float_flag_invalid
, status
);
2976 return ( float32_val(a
) == float32_val(b
) ) ||
2977 ( (uint32_t) ( ( float32_val(a
) | float32_val(b
) )<<1 ) == 0 );
2980 /*----------------------------------------------------------------------------
2981 | Returns 1 if the single-precision floating-point value `a' is less than or
2982 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2983 | cause an exception. Otherwise, the comparison is performed according to the
2984 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2985 *----------------------------------------------------------------------------*/
2987 int float32_le_quiet(float32 a
, float32 b
, float_status
*status
)
2991 a
= float32_squash_input_denormal(a
, status
);
2992 b
= float32_squash_input_denormal(b
, status
);
2994 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2995 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2997 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
2998 float_raise(float_flag_invalid
, status
);
3002 aSign
= extractFloat32Sign( a
);
3003 bSign
= extractFloat32Sign( b
);
3004 av
= float32_val(a
);
3005 bv
= float32_val(b
);
3006 if ( aSign
!= bSign
) return aSign
|| ( (uint32_t) ( ( av
| bv
)<<1 ) == 0 );
3007 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
3011 /*----------------------------------------------------------------------------
3012 | Returns 1 if the single-precision floating-point value `a' is less than
3013 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3014 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
3015 | Standard for Binary Floating-Point Arithmetic.
3016 *----------------------------------------------------------------------------*/
3018 int float32_lt_quiet(float32 a
, float32 b
, float_status
*status
)
3022 a
= float32_squash_input_denormal(a
, status
);
3023 b
= float32_squash_input_denormal(b
, status
);
3025 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
3026 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
3028 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
3029 float_raise(float_flag_invalid
, status
);
3033 aSign
= extractFloat32Sign( a
);
3034 bSign
= extractFloat32Sign( b
);
3035 av
= float32_val(a
);
3036 bv
= float32_val(b
);
3037 if ( aSign
!= bSign
) return aSign
&& ( (uint32_t) ( ( av
| bv
)<<1 ) != 0 );
3038 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
3042 /*----------------------------------------------------------------------------
3043 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3044 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
3045 | comparison is performed according to the IEC/IEEE Standard for Binary
3046 | Floating-Point Arithmetic.
3047 *----------------------------------------------------------------------------*/
3049 int float32_unordered_quiet(float32 a
, float32 b
, float_status
*status
)
3051 a
= float32_squash_input_denormal(a
, status
);
3052 b
= float32_squash_input_denormal(b
, status
);
3054 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
3055 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
3057 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
3058 float_raise(float_flag_invalid
, status
);
3065 /*----------------------------------------------------------------------------
3066 | Returns the result of converting the double-precision floating-point value
3067 | `a' to the 32-bit two's complement integer format. The conversion is
3068 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3069 | Arithmetic---which means in particular that the conversion is rounded
3070 | according to the current rounding mode. If `a' is a NaN, the largest
3071 | positive integer is returned. Otherwise, if the conversion overflows, the
3072 | largest integer with the same sign as `a' is returned.
3073 *----------------------------------------------------------------------------*/
3075 int32_t float64_to_int32(float64 a
, float_status
*status
)
3078 int_fast16_t aExp
, shiftCount
;
3080 a
= float64_squash_input_denormal(a
, status
);
3082 aSig
= extractFloat64Frac( a
);
3083 aExp
= extractFloat64Exp( a
);
3084 aSign
= extractFloat64Sign( a
);
3085 if ( ( aExp
== 0x7FF ) && aSig
) aSign
= 0;
3086 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
3087 shiftCount
= 0x42C - aExp
;
3088 if ( 0 < shiftCount
) shift64RightJamming( aSig
, shiftCount
, &aSig
);
3089 return roundAndPackInt32(aSign
, aSig
, status
);
3093 /*----------------------------------------------------------------------------
3094 | Returns the result of converting the double-precision floating-point value
3095 | `a' to the 32-bit two's complement integer format. The conversion is
3096 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3097 | Arithmetic, except that the conversion is always rounded toward zero.
3098 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
3099 | the conversion overflows, the largest integer with the same sign as `a' is
3101 *----------------------------------------------------------------------------*/
3103 int32_t float64_to_int32_round_to_zero(float64 a
, float_status
*status
)
3106 int_fast16_t aExp
, shiftCount
;
3107 uint64_t aSig
, savedASig
;
3109 a
= float64_squash_input_denormal(a
, status
);
3111 aSig
= extractFloat64Frac( a
);
3112 aExp
= extractFloat64Exp( a
);
3113 aSign
= extractFloat64Sign( a
);
3114 if ( 0x41E < aExp
) {
3115 if ( ( aExp
== 0x7FF ) && aSig
) aSign
= 0;
3118 else if ( aExp
< 0x3FF ) {
3120 status
->float_exception_flags
|= float_flag_inexact
;
3124 aSig
|= LIT64( 0x0010000000000000 );
3125 shiftCount
= 0x433 - aExp
;
3127 aSig
>>= shiftCount
;
3129 if ( aSign
) z
= - z
;
3130 if ( ( z
< 0 ) ^ aSign
) {
3132 float_raise(float_flag_invalid
, status
);
3133 return aSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
3135 if ( ( aSig
<<shiftCount
) != savedASig
) {
3136 status
->float_exception_flags
|= float_flag_inexact
;
3142 /*----------------------------------------------------------------------------
3143 | Returns the result of converting the double-precision floating-point value
3144 | `a' to the 16-bit two's complement integer format. The conversion is
3145 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3146 | Arithmetic, except that the conversion is always rounded toward zero.
3147 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
3148 | the conversion overflows, the largest integer with the same sign as `a' is
3150 *----------------------------------------------------------------------------*/
3152 int_fast16_t float64_to_int16_round_to_zero(float64 a
, float_status
*status
)
3155 int_fast16_t aExp
, shiftCount
;
3156 uint64_t aSig
, savedASig
;
3159 aSig
= extractFloat64Frac( a
);
3160 aExp
= extractFloat64Exp( a
);
3161 aSign
= extractFloat64Sign( a
);
3162 if ( 0x40E < aExp
) {
3163 if ( ( aExp
== 0x7FF ) && aSig
) {
3168 else if ( aExp
< 0x3FF ) {
3169 if ( aExp
|| aSig
) {
3170 status
->float_exception_flags
|= float_flag_inexact
;
3174 aSig
|= LIT64( 0x0010000000000000 );
3175 shiftCount
= 0x433 - aExp
;
3177 aSig
>>= shiftCount
;
3182 if ( ( (int16_t)z
< 0 ) ^ aSign
) {
3184 float_raise(float_flag_invalid
, status
);
3185 return aSign
? (int32_t) 0xffff8000 : 0x7FFF;
3187 if ( ( aSig
<<shiftCount
) != savedASig
) {
3188 status
->float_exception_flags
|= float_flag_inexact
;
3193 /*----------------------------------------------------------------------------
3194 | Returns the result of converting the double-precision floating-point value
3195 | `a' to the 64-bit two's complement integer format. The conversion is
3196 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3197 | Arithmetic---which means in particular that the conversion is rounded
3198 | according to the current rounding mode. If `a' is a NaN, the largest
3199 | positive integer is returned. Otherwise, if the conversion overflows, the
3200 | largest integer with the same sign as `a' is returned.
3201 *----------------------------------------------------------------------------*/
3203 int64_t float64_to_int64(float64 a
, float_status
*status
)
3206 int_fast16_t aExp
, shiftCount
;
3207 uint64_t aSig
, aSigExtra
;
3208 a
= float64_squash_input_denormal(a
, status
);
3210 aSig
= extractFloat64Frac( a
);
3211 aExp
= extractFloat64Exp( a
);
3212 aSign
= extractFloat64Sign( a
);
3213 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
3214 shiftCount
= 0x433 - aExp
;
3215 if ( shiftCount
<= 0 ) {
3216 if ( 0x43E < aExp
) {
3217 float_raise(float_flag_invalid
, status
);
3219 || ( ( aExp
== 0x7FF )
3220 && ( aSig
!= LIT64( 0x0010000000000000 ) ) )
3222 return LIT64( 0x7FFFFFFFFFFFFFFF );
3224 return (int64_t) LIT64( 0x8000000000000000 );
3227 aSig
<<= - shiftCount
;
3230 shift64ExtraRightJamming( aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
3232 return roundAndPackInt64(aSign
, aSig
, aSigExtra
, status
);
3236 /*----------------------------------------------------------------------------
3237 | Returns the result of converting the double-precision floating-point value
3238 | `a' to the 64-bit two's complement integer format. The conversion is
3239 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3240 | Arithmetic, except that the conversion is always rounded toward zero.
3241 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
3242 | the conversion overflows, the largest integer with the same sign as `a' is
3244 *----------------------------------------------------------------------------*/
3246 int64_t float64_to_int64_round_to_zero(float64 a
, float_status
*status
)
3249 int_fast16_t aExp
, shiftCount
;
3252 a
= float64_squash_input_denormal(a
, status
);
3254 aSig
= extractFloat64Frac( a
);
3255 aExp
= extractFloat64Exp( a
);
3256 aSign
= extractFloat64Sign( a
);
3257 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
3258 shiftCount
= aExp
- 0x433;
3259 if ( 0 <= shiftCount
) {
3260 if ( 0x43E <= aExp
) {
3261 if ( float64_val(a
) != LIT64( 0xC3E0000000000000 ) ) {
3262 float_raise(float_flag_invalid
, status
);
3264 || ( ( aExp
== 0x7FF )
3265 && ( aSig
!= LIT64( 0x0010000000000000 ) ) )
3267 return LIT64( 0x7FFFFFFFFFFFFFFF );
3270 return (int64_t) LIT64( 0x8000000000000000 );
3272 z
= aSig
<<shiftCount
;
3275 if ( aExp
< 0x3FE ) {
3277 status
->float_exception_flags
|= float_flag_inexact
;
3281 z
= aSig
>>( - shiftCount
);
3282 if ( (uint64_t) ( aSig
<<( shiftCount
& 63 ) ) ) {
3283 status
->float_exception_flags
|= float_flag_inexact
;
3286 if ( aSign
) z
= - z
;
3291 /*----------------------------------------------------------------------------
3292 | Returns the result of converting the double-precision floating-point value
3293 | `a' to the single-precision floating-point format. The conversion is
3294 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3296 *----------------------------------------------------------------------------*/
3298 float32
float64_to_float32(float64 a
, float_status
*status
)
3304 a
= float64_squash_input_denormal(a
, status
);
3306 aSig
= extractFloat64Frac( a
);
3307 aExp
= extractFloat64Exp( a
);
3308 aSign
= extractFloat64Sign( a
);
3309 if ( aExp
== 0x7FF ) {
3311 return commonNaNToFloat32(float64ToCommonNaN(a
, status
), status
);
3313 return packFloat32( aSign
, 0xFF, 0 );
3315 shift64RightJamming( aSig
, 22, &aSig
);
3317 if ( aExp
|| zSig
) {
3321 return roundAndPackFloat32(aSign
, aExp
, zSig
, status
);
3326 /*----------------------------------------------------------------------------
3327 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3328 | half-precision floating-point value, returning the result. After being
3329 | shifted into the proper positions, the three fields are simply added
3330 | together to form the result. This means that any integer portion of `zSig'
3331 | will be added into the exponent. Since a properly normalized significand
3332 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3333 | than the desired result exponent whenever `zSig' is a complete, normalized
3335 *----------------------------------------------------------------------------*/
3336 static float16
packFloat16(flag zSign
, int_fast16_t zExp
, uint16_t zSig
)
3338 return make_float16(
3339 (((uint32_t)zSign
) << 15) + (((uint32_t)zExp
) << 10) + zSig
);
3342 /*----------------------------------------------------------------------------
3343 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3344 | and significand `zSig', and returns the proper half-precision floating-
3345 | point value corresponding to the abstract input. Ordinarily, the abstract
3346 | value is simply rounded and packed into the half-precision format, with
3347 | the inexact exception raised if the abstract input cannot be represented
3348 | exactly. However, if the abstract value is too large, the overflow and
3349 | inexact exceptions are raised and an infinity or maximal finite value is
3350 | returned. If the abstract value is too small, the input value is rounded to
3351 | a subnormal number, and the underflow and inexact exceptions are raised if
3352 | the abstract input cannot be represented exactly as a subnormal half-
3353 | precision floating-point number.
3354 | The `ieee' flag indicates whether to use IEEE standard half precision, or
3355 | ARM-style "alternative representation", which omits the NaN and Inf
3356 | encodings in order to raise the maximum representable exponent by one.
3357 | The input significand `zSig' has its binary point between bits 22
3358 | and 23, which is 13 bits to the left of the usual location. This shifted
3359 | significand must be normalized or smaller. If `zSig' is not normalized,
3360 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3361 | and it must not require rounding. In the usual case that `zSig' is
3362 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3363 | Note the slightly odd position of the binary point in zSig compared with the
3364 | other roundAndPackFloat functions. This should probably be fixed if we
3365 | need to implement more float16 routines than just conversion.
3366 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3367 | Binary Floating-Point Arithmetic.
3368 *----------------------------------------------------------------------------*/
3370 static float16
roundAndPackFloat16(flag zSign
, int_fast16_t zExp
,
3371 uint32_t zSig
, flag ieee
,
3372 float_status
*status
)
3374 int maxexp
= ieee
? 29 : 30;
3377 bool rounding_bumps_exp
;
3378 bool is_tiny
= false;
3380 /* Calculate the mask of bits of the mantissa which are not
3381 * representable in half-precision and will be lost.
3384 /* Will be denormal in halfprec */
3390 /* Normal number in halfprec */
3394 switch (status
->float_rounding_mode
) {
3395 case float_round_nearest_even
:
3396 increment
= (mask
+ 1) >> 1;
3397 if ((zSig
& mask
) == increment
) {
3398 increment
= zSig
& (increment
<< 1);
3401 case float_round_ties_away
:
3402 increment
= (mask
+ 1) >> 1;
3404 case float_round_up
:
3405 increment
= zSign
? 0 : mask
;
3407 case float_round_down
:
3408 increment
= zSign
? mask
: 0;
3410 default: /* round_to_zero */
3415 rounding_bumps_exp
= (zSig
+ increment
>= 0x01000000);
3417 if (zExp
> maxexp
|| (zExp
== maxexp
&& rounding_bumps_exp
)) {
3419 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
3420 return packFloat16(zSign
, 0x1f, 0);
3422 float_raise(float_flag_invalid
, status
);
3423 return packFloat16(zSign
, 0x1f, 0x3ff);
3428 /* Note that flush-to-zero does not affect half-precision results */
3430 (status
->float_detect_tininess
== float_tininess_before_rounding
)
3432 || (!rounding_bumps_exp
);
3435 float_raise(float_flag_inexact
, status
);
3437 float_raise(float_flag_underflow
, status
);
3442 if (rounding_bumps_exp
) {
3448 return packFloat16(zSign
, 0, 0);
3454 return packFloat16(zSign
, zExp
, zSig
>> 13);
3457 static void normalizeFloat16Subnormal(uint32_t aSig
, int_fast16_t *zExpPtr
,
3460 int8_t shiftCount
= countLeadingZeros32(aSig
) - 21;
3461 *zSigPtr
= aSig
<< shiftCount
;
3462 *zExpPtr
= 1 - shiftCount
;
3465 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
3466 The latter gains extra exponent range by omitting the NaN/Inf encodings. */
3468 float32
float16_to_float32(float16 a
, flag ieee
, float_status
*status
)
3474 aSign
= extractFloat16Sign(a
);
3475 aExp
= extractFloat16Exp(a
);
3476 aSig
= extractFloat16Frac(a
);
3478 if (aExp
== 0x1f && ieee
) {
3480 return commonNaNToFloat32(float16ToCommonNaN(a
, status
), status
);
3482 return packFloat32(aSign
, 0xff, 0);
3486 return packFloat32(aSign
, 0, 0);
3489 normalizeFloat16Subnormal(aSig
, &aExp
, &aSig
);
3492 return packFloat32( aSign
, aExp
+ 0x70, aSig
<< 13);
3495 float16
float32_to_float16(float32 a
, flag ieee
, float_status
*status
)
3501 a
= float32_squash_input_denormal(a
, status
);
3503 aSig
= extractFloat32Frac( a
);
3504 aExp
= extractFloat32Exp( a
);
3505 aSign
= extractFloat32Sign( a
);
3506 if ( aExp
== 0xFF ) {
3508 /* Input is a NaN */
3510 float_raise(float_flag_invalid
, status
);
3511 return packFloat16(aSign
, 0, 0);
3513 return commonNaNToFloat16(
3514 float32ToCommonNaN(a
, status
), status
);
3518 float_raise(float_flag_invalid
, status
);
3519 return packFloat16(aSign
, 0x1f, 0x3ff);
3521 return packFloat16(aSign
, 0x1f, 0);
3523 if (aExp
== 0 && aSig
== 0) {
3524 return packFloat16(aSign
, 0, 0);
3526 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3527 * even if the input is denormal; however this is harmless because
3528 * the largest possible single-precision denormal is still smaller
3529 * than the smallest representable half-precision denormal, and so we
3530 * will end up ignoring aSig and returning via the "always return zero"
3536 return roundAndPackFloat16(aSign
, aExp
, aSig
, ieee
, status
);
3539 float64
float16_to_float64(float16 a
, flag ieee
, float_status
*status
)
3545 aSign
= extractFloat16Sign(a
);
3546 aExp
= extractFloat16Exp(a
);
3547 aSig
= extractFloat16Frac(a
);
3549 if (aExp
== 0x1f && ieee
) {
3551 return commonNaNToFloat64(
3552 float16ToCommonNaN(a
, status
), status
);
3554 return packFloat64(aSign
, 0x7ff, 0);
3558 return packFloat64(aSign
, 0, 0);
3561 normalizeFloat16Subnormal(aSig
, &aExp
, &aSig
);
3564 return packFloat64(aSign
, aExp
+ 0x3f0, ((uint64_t)aSig
) << 42);
3567 float16
float64_to_float16(float64 a
, flag ieee
, float_status
*status
)
3574 a
= float64_squash_input_denormal(a
, status
);
3576 aSig
= extractFloat64Frac(a
);
3577 aExp
= extractFloat64Exp(a
);
3578 aSign
= extractFloat64Sign(a
);
3579 if (aExp
== 0x7FF) {
3581 /* Input is a NaN */
3583 float_raise(float_flag_invalid
, status
);
3584 return packFloat16(aSign
, 0, 0);
3586 return commonNaNToFloat16(
3587 float64ToCommonNaN(a
, status
), status
);
3591 float_raise(float_flag_invalid
, status
);
3592 return packFloat16(aSign
, 0x1f, 0x3ff);
3594 return packFloat16(aSign
, 0x1f, 0);
3596 shift64RightJamming(aSig
, 29, &aSig
);
3598 if (aExp
== 0 && zSig
== 0) {
3599 return packFloat16(aSign
, 0, 0);
3601 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3602 * even if the input is denormal; however this is harmless because
3603 * the largest possible single-precision denormal is still smaller
3604 * than the smallest representable half-precision denormal, and so we
3605 * will end up ignoring aSig and returning via the "always return zero"
3611 return roundAndPackFloat16(aSign
, aExp
, zSig
, ieee
, status
);
3614 /*----------------------------------------------------------------------------
3615 | Returns the result of converting the double-precision floating-point value
3616 | `a' to the extended double-precision floating-point format. The conversion
3617 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3619 *----------------------------------------------------------------------------*/
3621 floatx80
float64_to_floatx80(float64 a
, float_status
*status
)
3627 a
= float64_squash_input_denormal(a
, status
);
3628 aSig
= extractFloat64Frac( a
);
3629 aExp
= extractFloat64Exp( a
);
3630 aSign
= extractFloat64Sign( a
);
3631 if ( aExp
== 0x7FF ) {
3633 return commonNaNToFloatx80(float64ToCommonNaN(a
, status
), status
);
3635 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
3638 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
3639 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
3643 aSign
, aExp
+ 0x3C00, ( aSig
| LIT64( 0x0010000000000000 ) )<<11 );
3647 /*----------------------------------------------------------------------------
3648 | Returns the result of converting the double-precision floating-point value
3649 | `a' to the quadruple-precision floating-point format. The conversion is
3650 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3652 *----------------------------------------------------------------------------*/
3654 float128
float64_to_float128(float64 a
, float_status
*status
)
3658 uint64_t aSig
, zSig0
, zSig1
;
3660 a
= float64_squash_input_denormal(a
, status
);
3661 aSig
= extractFloat64Frac( a
);
3662 aExp
= extractFloat64Exp( a
);
3663 aSign
= extractFloat64Sign( a
);
3664 if ( aExp
== 0x7FF ) {
3666 return commonNaNToFloat128(float64ToCommonNaN(a
, status
), status
);
3668 return packFloat128( aSign
, 0x7FFF, 0, 0 );
3671 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
3672 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
3675 shift128Right( aSig
, 0, 4, &zSig0
, &zSig1
);
3676 return packFloat128( aSign
, aExp
+ 0x3C00, zSig0
, zSig1
);
3680 /*----------------------------------------------------------------------------
3681 | Rounds the double-precision floating-point value `a' to an integer, and
3682 | returns the result as a double-precision floating-point value. The
3683 | operation is performed according to the IEC/IEEE Standard for Binary
3684 | Floating-Point Arithmetic.
3685 *----------------------------------------------------------------------------*/
3687 float64
float64_round_to_int(float64 a
, float_status
*status
)
3691 uint64_t lastBitMask
, roundBitsMask
;
3693 a
= float64_squash_input_denormal(a
, status
);
3695 aExp
= extractFloat64Exp( a
);
3696 if ( 0x433 <= aExp
) {
3697 if ( ( aExp
== 0x7FF ) && extractFloat64Frac( a
) ) {
3698 return propagateFloat64NaN(a
, a
, status
);
3702 if ( aExp
< 0x3FF ) {
3703 if ( (uint64_t) ( float64_val(a
)<<1 ) == 0 ) return a
;
3704 status
->float_exception_flags
|= float_flag_inexact
;
3705 aSign
= extractFloat64Sign( a
);
3706 switch (status
->float_rounding_mode
) {
3707 case float_round_nearest_even
:
3708 if ( ( aExp
== 0x3FE ) && extractFloat64Frac( a
) ) {
3709 return packFloat64( aSign
, 0x3FF, 0 );
3712 case float_round_ties_away
:
3713 if (aExp
== 0x3FE) {
3714 return packFloat64(aSign
, 0x3ff, 0);
3717 case float_round_down
:
3718 return make_float64(aSign
? LIT64( 0xBFF0000000000000 ) : 0);
3719 case float_round_up
:
3720 return make_float64(
3721 aSign
? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
3723 return packFloat64( aSign
, 0, 0 );
3726 lastBitMask
<<= 0x433 - aExp
;
3727 roundBitsMask
= lastBitMask
- 1;
3729 switch (status
->float_rounding_mode
) {
3730 case float_round_nearest_even
:
3731 z
+= lastBitMask
>> 1;
3732 if ((z
& roundBitsMask
) == 0) {
3736 case float_round_ties_away
:
3737 z
+= lastBitMask
>> 1;
3739 case float_round_to_zero
:
3741 case float_round_up
:
3742 if (!extractFloat64Sign(make_float64(z
))) {
3746 case float_round_down
:
3747 if (extractFloat64Sign(make_float64(z
))) {
3754 z
&= ~ roundBitsMask
;
3755 if (z
!= float64_val(a
)) {
3756 status
->float_exception_flags
|= float_flag_inexact
;
3758 return make_float64(z
);
3762 float64
float64_trunc_to_int(float64 a
, float_status
*status
)
3766 oldmode
= status
->float_rounding_mode
;
3767 status
->float_rounding_mode
= float_round_to_zero
;
3768 res
= float64_round_to_int(a
, status
);
3769 status
->float_rounding_mode
= oldmode
;
3773 /*----------------------------------------------------------------------------
3774 | Returns the result of adding the absolute values of the double-precision
3775 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
3776 | before being returned. `zSign' is ignored if the result is a NaN.
3777 | The addition is performed according to the IEC/IEEE Standard for Binary
3778 | Floating-Point Arithmetic.
3779 *----------------------------------------------------------------------------*/
3781 static float64
addFloat64Sigs(float64 a
, float64 b
, flag zSign
,
3782 float_status
*status
)
3784 int_fast16_t aExp
, bExp
, zExp
;
3785 uint64_t aSig
, bSig
, zSig
;
3786 int_fast16_t expDiff
;
3788 aSig
= extractFloat64Frac( a
);
3789 aExp
= extractFloat64Exp( a
);
3790 bSig
= extractFloat64Frac( b
);
3791 bExp
= extractFloat64Exp( b
);
3792 expDiff
= aExp
- bExp
;
3795 if ( 0 < expDiff
) {
3796 if ( aExp
== 0x7FF ) {
3798 return propagateFloat64NaN(a
, b
, status
);
3806 bSig
|= LIT64( 0x2000000000000000 );
3808 shift64RightJamming( bSig
, expDiff
, &bSig
);
3811 else if ( expDiff
< 0 ) {
3812 if ( bExp
== 0x7FF ) {
3814 return propagateFloat64NaN(a
, b
, status
);
3816 return packFloat64( zSign
, 0x7FF, 0 );
3822 aSig
|= LIT64( 0x2000000000000000 );
3824 shift64RightJamming( aSig
, - expDiff
, &aSig
);
3828 if ( aExp
== 0x7FF ) {
3830 return propagateFloat64NaN(a
, b
, status
);
3835 if (status
->flush_to_zero
) {
3837 float_raise(float_flag_output_denormal
, status
);
3839 return packFloat64(zSign
, 0, 0);
3841 return packFloat64( zSign
, 0, ( aSig
+ bSig
)>>9 );
3843 zSig
= LIT64( 0x4000000000000000 ) + aSig
+ bSig
;
3847 aSig
|= LIT64( 0x2000000000000000 );
3848 zSig
= ( aSig
+ bSig
)<<1;
3850 if ( (int64_t) zSig
< 0 ) {
3855 return roundAndPackFloat64(zSign
, zExp
, zSig
, status
);
3859 /*----------------------------------------------------------------------------
3860 | Returns the result of subtracting the absolute values of the double-
3861 | precision floating-point values `a' and `b'. If `zSign' is 1, the
3862 | difference is negated before being returned. `zSign' is ignored if the
3863 | result is a NaN. The subtraction is performed according to the IEC/IEEE
3864 | Standard for Binary Floating-Point Arithmetic.
3865 *----------------------------------------------------------------------------*/
3867 static float64
subFloat64Sigs(float64 a
, float64 b
, flag zSign
,
3868 float_status
*status
)
3870 int_fast16_t aExp
, bExp
, zExp
;
3871 uint64_t aSig
, bSig
, zSig
;
3872 int_fast16_t expDiff
;
3874 aSig
= extractFloat64Frac( a
);
3875 aExp
= extractFloat64Exp( a
);
3876 bSig
= extractFloat64Frac( b
);
3877 bExp
= extractFloat64Exp( b
);
3878 expDiff
= aExp
- bExp
;
3881 if ( 0 < expDiff
) goto aExpBigger
;
3882 if ( expDiff
< 0 ) goto bExpBigger
;
3883 if ( aExp
== 0x7FF ) {
3885 return propagateFloat64NaN(a
, b
, status
);
3887 float_raise(float_flag_invalid
, status
);
3888 return float64_default_nan
;
3894 if ( bSig
< aSig
) goto aBigger
;
3895 if ( aSig
< bSig
) goto bBigger
;
3896 return packFloat64(status
->float_rounding_mode
== float_round_down
, 0, 0);
3898 if ( bExp
== 0x7FF ) {
3900 return propagateFloat64NaN(a
, b
, status
);
3902 return packFloat64( zSign
^ 1, 0x7FF, 0 );
3908 aSig
|= LIT64( 0x4000000000000000 );
3910 shift64RightJamming( aSig
, - expDiff
, &aSig
);
3911 bSig
|= LIT64( 0x4000000000000000 );
3916 goto normalizeRoundAndPack
;
3918 if ( aExp
== 0x7FF ) {
3920 return propagateFloat64NaN(a
, b
, status
);
3928 bSig
|= LIT64( 0x4000000000000000 );
3930 shift64RightJamming( bSig
, expDiff
, &bSig
);
3931 aSig
|= LIT64( 0x4000000000000000 );
3935 normalizeRoundAndPack
:
3937 return normalizeRoundAndPackFloat64(zSign
, zExp
, zSig
, status
);
3941 /*----------------------------------------------------------------------------
3942 | Returns the result of adding the double-precision floating-point values `a'
3943 | and `b'. The operation is performed according to the IEC/IEEE Standard for
3944 | Binary Floating-Point Arithmetic.
3945 *----------------------------------------------------------------------------*/
3947 float64
float64_add(float64 a
, float64 b
, float_status
*status
)
3950 a
= float64_squash_input_denormal(a
, status
);
3951 b
= float64_squash_input_denormal(b
, status
);
3953 aSign
= extractFloat64Sign( a
);
3954 bSign
= extractFloat64Sign( b
);
3955 if ( aSign
== bSign
) {
3956 return addFloat64Sigs(a
, b
, aSign
, status
);
3959 return subFloat64Sigs(a
, b
, aSign
, status
);
3964 /*----------------------------------------------------------------------------
3965 | Returns the result of subtracting the double-precision floating-point values
3966 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3967 | for Binary Floating-Point Arithmetic.
3968 *----------------------------------------------------------------------------*/
3970 float64
float64_sub(float64 a
, float64 b
, float_status
*status
)
3973 a
= float64_squash_input_denormal(a
, status
);
3974 b
= float64_squash_input_denormal(b
, status
);
3976 aSign
= extractFloat64Sign( a
);
3977 bSign
= extractFloat64Sign( b
);
3978 if ( aSign
== bSign
) {
3979 return subFloat64Sigs(a
, b
, aSign
, status
);
3982 return addFloat64Sigs(a
, b
, aSign
, status
);
3987 /*----------------------------------------------------------------------------
3988 | Returns the result of multiplying the double-precision floating-point values
3989 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3990 | for Binary Floating-Point Arithmetic.
3991 *----------------------------------------------------------------------------*/
3993 float64
float64_mul(float64 a
, float64 b
, float_status
*status
)
3995 flag aSign
, bSign
, zSign
;
3996 int_fast16_t aExp
, bExp
, zExp
;
3997 uint64_t aSig
, bSig
, zSig0
, zSig1
;
3999 a
= float64_squash_input_denormal(a
, status
);
4000 b
= float64_squash_input_denormal(b
, status
);
4002 aSig
= extractFloat64Frac( a
);
4003 aExp
= extractFloat64Exp( a
);
4004 aSign
= extractFloat64Sign( a
);
4005 bSig
= extractFloat64Frac( b
);
4006 bExp
= extractFloat64Exp( b
);
4007 bSign
= extractFloat64Sign( b
);
4008 zSign
= aSign
^ bSign
;
4009 if ( aExp
== 0x7FF ) {
4010 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
4011 return propagateFloat64NaN(a
, b
, status
);
4013 if ( ( bExp
| bSig
) == 0 ) {
4014 float_raise(float_flag_invalid
, status
);
4015 return float64_default_nan
;
4017 return packFloat64( zSign
, 0x7FF, 0 );
4019 if ( bExp
== 0x7FF ) {
4021 return propagateFloat64NaN(a
, b
, status
);
4023 if ( ( aExp
| aSig
) == 0 ) {
4024 float_raise(float_flag_invalid
, status
);
4025 return float64_default_nan
;
4027 return packFloat64( zSign
, 0x7FF, 0 );
4030 if ( aSig
== 0 ) return packFloat64( zSign
, 0, 0 );
4031 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
4034 if ( bSig
== 0 ) return packFloat64( zSign
, 0, 0 );
4035 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
4037 zExp
= aExp
+ bExp
- 0x3FF;
4038 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<10;
4039 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
4040 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
4041 zSig0
|= ( zSig1
!= 0 );
4042 if ( 0 <= (int64_t) ( zSig0
<<1 ) ) {
4046 return roundAndPackFloat64(zSign
, zExp
, zSig0
, status
);
4050 /*----------------------------------------------------------------------------
4051 | Returns the result of dividing the double-precision floating-point value `a'
4052 | by the corresponding value `b'. The operation is performed according to
4053 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4054 *----------------------------------------------------------------------------*/
4056 float64
float64_div(float64 a
, float64 b
, float_status
*status
)
4058 flag aSign
, bSign
, zSign
;
4059 int_fast16_t aExp
, bExp
, zExp
;
4060 uint64_t aSig
, bSig
, zSig
;
4061 uint64_t rem0
, rem1
;
4062 uint64_t term0
, term1
;
4063 a
= float64_squash_input_denormal(a
, status
);
4064 b
= float64_squash_input_denormal(b
, status
);
4066 aSig
= extractFloat64Frac( a
);
4067 aExp
= extractFloat64Exp( a
);
4068 aSign
= extractFloat64Sign( a
);
4069 bSig
= extractFloat64Frac( b
);
4070 bExp
= extractFloat64Exp( b
);
4071 bSign
= extractFloat64Sign( b
);
4072 zSign
= aSign
^ bSign
;
4073 if ( aExp
== 0x7FF ) {
4075 return propagateFloat64NaN(a
, b
, status
);
4077 if ( bExp
== 0x7FF ) {
4079 return propagateFloat64NaN(a
, b
, status
);
4081 float_raise(float_flag_invalid
, status
);
4082 return float64_default_nan
;
4084 return packFloat64( zSign
, 0x7FF, 0 );
4086 if ( bExp
== 0x7FF ) {
4088 return propagateFloat64NaN(a
, b
, status
);
4090 return packFloat64( zSign
, 0, 0 );
4094 if ( ( aExp
| aSig
) == 0 ) {
4095 float_raise(float_flag_invalid
, status
);
4096 return float64_default_nan
;
4098 float_raise(float_flag_divbyzero
, status
);
4099 return packFloat64( zSign
, 0x7FF, 0 );
4101 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
4104 if ( aSig
== 0 ) return packFloat64( zSign
, 0, 0 );
4105 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
4107 zExp
= aExp
- bExp
+ 0x3FD;
4108 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<10;
4109 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
4110 if ( bSig
<= ( aSig
+ aSig
) ) {
4114 zSig
= estimateDiv128To64( aSig
, 0, bSig
);
4115 if ( ( zSig
& 0x1FF ) <= 2 ) {
4116 mul64To128( bSig
, zSig
, &term0
, &term1
);
4117 sub128( aSig
, 0, term0
, term1
, &rem0
, &rem1
);
4118 while ( (int64_t) rem0
< 0 ) {
4120 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
4122 zSig
|= ( rem1
!= 0 );
4124 return roundAndPackFloat64(zSign
, zExp
, zSig
, status
);
4128 /*----------------------------------------------------------------------------
4129 | Returns the remainder of the double-precision floating-point value `a'
4130 | with respect to the corresponding value `b'. The operation is performed
4131 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4132 *----------------------------------------------------------------------------*/
4134 float64
float64_rem(float64 a
, float64 b
, float_status
*status
)
4137 int_fast16_t aExp
, bExp
, expDiff
;
4138 uint64_t aSig
, bSig
;
4139 uint64_t q
, alternateASig
;
4142 a
= float64_squash_input_denormal(a
, status
);
4143 b
= float64_squash_input_denormal(b
, status
);
4144 aSig
= extractFloat64Frac( a
);
4145 aExp
= extractFloat64Exp( a
);
4146 aSign
= extractFloat64Sign( a
);
4147 bSig
= extractFloat64Frac( b
);
4148 bExp
= extractFloat64Exp( b
);
4149 if ( aExp
== 0x7FF ) {
4150 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
4151 return propagateFloat64NaN(a
, b
, status
);
4153 float_raise(float_flag_invalid
, status
);
4154 return float64_default_nan
;
4156 if ( bExp
== 0x7FF ) {
4158 return propagateFloat64NaN(a
, b
, status
);
4164 float_raise(float_flag_invalid
, status
);
4165 return float64_default_nan
;
4167 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
4170 if ( aSig
== 0 ) return a
;
4171 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
4173 expDiff
= aExp
- bExp
;
4174 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<11;
4175 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
4176 if ( expDiff
< 0 ) {
4177 if ( expDiff
< -1 ) return a
;
4180 q
= ( bSig
<= aSig
);
4181 if ( q
) aSig
-= bSig
;
4183 while ( 0 < expDiff
) {
4184 q
= estimateDiv128To64( aSig
, 0, bSig
);
4185 q
= ( 2 < q
) ? q
- 2 : 0;
4186 aSig
= - ( ( bSig
>>2 ) * q
);
4190 if ( 0 < expDiff
) {
4191 q
= estimateDiv128To64( aSig
, 0, bSig
);
4192 q
= ( 2 < q
) ? q
- 2 : 0;
4195 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
4202 alternateASig
= aSig
;
4205 } while ( 0 <= (int64_t) aSig
);
4206 sigMean
= aSig
+ alternateASig
;
4207 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
4208 aSig
= alternateASig
;
4210 zSign
= ( (int64_t) aSig
< 0 );
4211 if ( zSign
) aSig
= - aSig
;
4212 return normalizeRoundAndPackFloat64(aSign
^ zSign
, bExp
, aSig
, status
);
4216 /*----------------------------------------------------------------------------
4217 | Returns the result of multiplying the double-precision floating-point values
4218 | `a' and `b' then adding 'c', with no intermediate rounding step after the
4219 | multiplication. The operation is performed according to the IEC/IEEE
4220 | Standard for Binary Floating-Point Arithmetic 754-2008.
4221 | The flags argument allows the caller to select negation of the
4222 | addend, the intermediate product, or the final result. (The difference
4223 | between this and having the caller do a separate negation is that negating
4224 | externally will flip the sign bit on NaNs.)
4225 *----------------------------------------------------------------------------*/
4227 float64
float64_muladd(float64 a
, float64 b
, float64 c
, int flags
,
4228 float_status
*status
)
4230 flag aSign
, bSign
, cSign
, zSign
;
4231 int_fast16_t aExp
, bExp
, cExp
, pExp
, zExp
, expDiff
;
4232 uint64_t aSig
, bSig
, cSig
;
4233 flag pInf
, pZero
, pSign
;
4234 uint64_t pSig0
, pSig1
, cSig0
, cSig1
, zSig0
, zSig1
;
4236 flag signflip
, infzero
;
4238 a
= float64_squash_input_denormal(a
, status
);
4239 b
= float64_squash_input_denormal(b
, status
);
4240 c
= float64_squash_input_denormal(c
, status
);
4241 aSig
= extractFloat64Frac(a
);
4242 aExp
= extractFloat64Exp(a
);
4243 aSign
= extractFloat64Sign(a
);
4244 bSig
= extractFloat64Frac(b
);
4245 bExp
= extractFloat64Exp(b
);
4246 bSign
= extractFloat64Sign(b
);
4247 cSig
= extractFloat64Frac(c
);
4248 cExp
= extractFloat64Exp(c
);
4249 cSign
= extractFloat64Sign(c
);
4251 infzero
= ((aExp
== 0 && aSig
== 0 && bExp
== 0x7ff && bSig
== 0) ||
4252 (aExp
== 0x7ff && aSig
== 0 && bExp
== 0 && bSig
== 0));
4254 /* It is implementation-defined whether the cases of (0,inf,qnan)
4255 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
4256 * they return if they do), so we have to hand this information
4257 * off to the target-specific pick-a-NaN routine.
4259 if (((aExp
== 0x7ff) && aSig
) ||
4260 ((bExp
== 0x7ff) && bSig
) ||
4261 ((cExp
== 0x7ff) && cSig
)) {
4262 return propagateFloat64MulAddNaN(a
, b
, c
, infzero
, status
);
4266 float_raise(float_flag_invalid
, status
);
4267 return float64_default_nan
;
4270 if (flags
& float_muladd_negate_c
) {
4274 signflip
= (flags
& float_muladd_negate_result
) ? 1 : 0;
4276 /* Work out the sign and type of the product */
4277 pSign
= aSign
^ bSign
;
4278 if (flags
& float_muladd_negate_product
) {
4281 pInf
= (aExp
== 0x7ff) || (bExp
== 0x7ff);
4282 pZero
= ((aExp
| aSig
) == 0) || ((bExp
| bSig
) == 0);
4284 if (cExp
== 0x7ff) {
4285 if (pInf
&& (pSign
^ cSign
)) {
4286 /* addition of opposite-signed infinities => InvalidOperation */
4287 float_raise(float_flag_invalid
, status
);
4288 return float64_default_nan
;
4290 /* Otherwise generate an infinity of the same sign */
4291 return packFloat64(cSign
^ signflip
, 0x7ff, 0);
4295 return packFloat64(pSign
^ signflip
, 0x7ff, 0);
4301 /* Adding two exact zeroes */
4302 if (pSign
== cSign
) {
4304 } else if (status
->float_rounding_mode
== float_round_down
) {
4309 return packFloat64(zSign
^ signflip
, 0, 0);
4311 /* Exact zero plus a denorm */
4312 if (status
->flush_to_zero
) {
4313 float_raise(float_flag_output_denormal
, status
);
4314 return packFloat64(cSign
^ signflip
, 0, 0);
4317 /* Zero plus something non-zero : just return the something */
4318 if (flags
& float_muladd_halve_result
) {
4320 normalizeFloat64Subnormal(cSig
, &cExp
, &cSig
);
4322 /* Subtract one to halve, and one again because roundAndPackFloat64
4323 * wants one less than the true exponent.
4326 cSig
= (cSig
| 0x0010000000000000ULL
) << 10;
4327 return roundAndPackFloat64(cSign
^ signflip
, cExp
, cSig
, status
);
4329 return packFloat64(cSign
^ signflip
, cExp
, cSig
);
4333 normalizeFloat64Subnormal(aSig
, &aExp
, &aSig
);
4336 normalizeFloat64Subnormal(bSig
, &bExp
, &bSig
);
4339 /* Calculate the actual result a * b + c */
4341 /* Multiply first; this is easy. */
4342 /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff
4343 * because we want the true exponent, not the "one-less-than"
4344 * flavour that roundAndPackFloat64() takes.
4346 pExp
= aExp
+ bExp
- 0x3fe;
4347 aSig
= (aSig
| LIT64(0x0010000000000000))<<10;
4348 bSig
= (bSig
| LIT64(0x0010000000000000))<<11;
4349 mul64To128(aSig
, bSig
, &pSig0
, &pSig1
);
4350 if ((int64_t)(pSig0
<< 1) >= 0) {
4351 shortShift128Left(pSig0
, pSig1
, 1, &pSig0
, &pSig1
);
4355 zSign
= pSign
^ signflip
;
4357 /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit
4358 * bit in position 126.
4362 /* Throw out the special case of c being an exact zero now */
4363 shift128RightJamming(pSig0
, pSig1
, 64, &pSig0
, &pSig1
);
4364 if (flags
& float_muladd_halve_result
) {
4367 return roundAndPackFloat64(zSign
, pExp
- 1,
4370 normalizeFloat64Subnormal(cSig
, &cExp
, &cSig
);
4373 /* Shift cSig and add the explicit bit so [cSig0:cSig1] is the
4374 * significand of the addend, with the explicit bit in position 126.
4376 cSig0
= cSig
<< (126 - 64 - 52);
4378 cSig0
|= LIT64(0x4000000000000000);
4379 expDiff
= pExp
- cExp
;
4381 if (pSign
== cSign
) {
4384 /* scale c to match p */
4385 shift128RightJamming(cSig0
, cSig1
, expDiff
, &cSig0
, &cSig1
);
4387 } else if (expDiff
< 0) {
4388 /* scale p to match c */
4389 shift128RightJamming(pSig0
, pSig1
, -expDiff
, &pSig0
, &pSig1
);
4392 /* no scaling needed */
4395 /* Add significands and make sure explicit bit ends up in posn 126 */
4396 add128(pSig0
, pSig1
, cSig0
, cSig1
, &zSig0
, &zSig1
);
4397 if ((int64_t)zSig0
< 0) {
4398 shift128RightJamming(zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
4402 shift128RightJamming(zSig0
, zSig1
, 64, &zSig0
, &zSig1
);
4403 if (flags
& float_muladd_halve_result
) {
4406 return roundAndPackFloat64(zSign
, zExp
, zSig1
, status
);
4410 shift128RightJamming(cSig0
, cSig1
, expDiff
, &cSig0
, &cSig1
);
4411 sub128(pSig0
, pSig1
, cSig0
, cSig1
, &zSig0
, &zSig1
);
4413 } else if (expDiff
< 0) {
4414 shift128RightJamming(pSig0
, pSig1
, -expDiff
, &pSig0
, &pSig1
);
4415 sub128(cSig0
, cSig1
, pSig0
, pSig1
, &zSig0
, &zSig1
);
4420 if (lt128(cSig0
, cSig1
, pSig0
, pSig1
)) {
4421 sub128(pSig0
, pSig1
, cSig0
, cSig1
, &zSig0
, &zSig1
);
4422 } else if (lt128(pSig0
, pSig1
, cSig0
, cSig1
)) {
4423 sub128(cSig0
, cSig1
, pSig0
, pSig1
, &zSig0
, &zSig1
);
4428 if (status
->float_rounding_mode
== float_round_down
) {
4431 return packFloat64(zSign
, 0, 0);
4435 /* Do the equivalent of normalizeRoundAndPackFloat64() but
4436 * starting with the significand in a pair of uint64_t.
4439 shiftcount
= countLeadingZeros64(zSig0
) - 1;
4440 shortShift128Left(zSig0
, zSig1
, shiftcount
, &zSig0
, &zSig1
);
4446 shiftcount
= countLeadingZeros64(zSig1
);
4447 if (shiftcount
== 0) {
4448 zSig0
= (zSig1
>> 1) | (zSig1
& 1);
4452 zSig0
= zSig1
<< shiftcount
;
4453 zExp
-= (shiftcount
+ 64);
4456 if (flags
& float_muladd_halve_result
) {
4459 return roundAndPackFloat64(zSign
, zExp
, zSig0
, status
);
4463 /*----------------------------------------------------------------------------
4464 | Returns the square root of the double-precision floating-point value `a'.
4465 | The operation is performed according to the IEC/IEEE Standard for Binary
4466 | Floating-Point Arithmetic.
4467 *----------------------------------------------------------------------------*/
4469 float64
float64_sqrt(float64 a
, float_status
*status
)
4472 int_fast16_t aExp
, zExp
;
4473 uint64_t aSig
, zSig
, doubleZSig
;
4474 uint64_t rem0
, rem1
, term0
, term1
;
4475 a
= float64_squash_input_denormal(a
, status
);
4477 aSig
= extractFloat64Frac( a
);
4478 aExp
= extractFloat64Exp( a
);
4479 aSign
= extractFloat64Sign( a
);
4480 if ( aExp
== 0x7FF ) {
4482 return propagateFloat64NaN(a
, a
, status
);
4484 if ( ! aSign
) return a
;
4485 float_raise(float_flag_invalid
, status
);
4486 return float64_default_nan
;
4489 if ( ( aExp
| aSig
) == 0 ) return a
;
4490 float_raise(float_flag_invalid
, status
);
4491 return float64_default_nan
;
4494 if ( aSig
== 0 ) return float64_zero
;
4495 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
4497 zExp
= ( ( aExp
- 0x3FF )>>1 ) + 0x3FE;
4498 aSig
|= LIT64( 0x0010000000000000 );
4499 zSig
= estimateSqrt32( aExp
, aSig
>>21 );
4500 aSig
<<= 9 - ( aExp
& 1 );
4501 zSig
= estimateDiv128To64( aSig
, 0, zSig
<<32 ) + ( zSig
<<30 );
4502 if ( ( zSig
& 0x1FF ) <= 5 ) {
4503 doubleZSig
= zSig
<<1;
4504 mul64To128( zSig
, zSig
, &term0
, &term1
);
4505 sub128( aSig
, 0, term0
, term1
, &rem0
, &rem1
);
4506 while ( (int64_t) rem0
< 0 ) {
4509 add128( rem0
, rem1
, zSig
>>63, doubleZSig
| 1, &rem0
, &rem1
);
4511 zSig
|= ( ( rem0
| rem1
) != 0 );
4513 return roundAndPackFloat64(0, zExp
, zSig
, status
);
4517 /*----------------------------------------------------------------------------
4518 | Returns the binary log of the double-precision floating-point value `a'.
4519 | The operation is performed according to the IEC/IEEE Standard for Binary
4520 | Floating-Point Arithmetic.
4521 *----------------------------------------------------------------------------*/
4522 float64
float64_log2(float64 a
, float_status
*status
)
4526 uint64_t aSig
, aSig0
, aSig1
, zSig
, i
;
4527 a
= float64_squash_input_denormal(a
, status
);
4529 aSig
= extractFloat64Frac( a
);
4530 aExp
= extractFloat64Exp( a
);
4531 aSign
= extractFloat64Sign( a
);
4534 if ( aSig
== 0 ) return packFloat64( 1, 0x7FF, 0 );
4535 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
4538 float_raise(float_flag_invalid
, status
);
4539 return float64_default_nan
;
4541 if ( aExp
== 0x7FF ) {
4543 return propagateFloat64NaN(a
, float64_zero
, status
);
4549 aSig
|= LIT64( 0x0010000000000000 );
4551 zSig
= (uint64_t)aExp
<< 52;
4552 for (i
= 1LL << 51; i
> 0; i
>>= 1) {
4553 mul64To128( aSig
, aSig
, &aSig0
, &aSig1
);
4554 aSig
= ( aSig0
<< 12 ) | ( aSig1
>> 52 );
4555 if ( aSig
& LIT64( 0x0020000000000000 ) ) {
4563 return normalizeRoundAndPackFloat64(zSign
, 0x408, zSig
, status
);
4566 /*----------------------------------------------------------------------------
4567 | Returns 1 if the double-precision floating-point value `a' is equal to the
4568 | corresponding value `b', and 0 otherwise. The invalid exception is raised
4569 | if either operand is a NaN. Otherwise, the comparison is performed
4570 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4571 *----------------------------------------------------------------------------*/
4573 int float64_eq(float64 a
, float64 b
, float_status
*status
)
4576 a
= float64_squash_input_denormal(a
, status
);
4577 b
= float64_squash_input_denormal(b
, status
);
4579 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4580 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4582 float_raise(float_flag_invalid
, status
);
4585 av
= float64_val(a
);
4586 bv
= float64_val(b
);
4587 return ( av
== bv
) || ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
4591 /*----------------------------------------------------------------------------
4592 | Returns 1 if the double-precision floating-point value `a' is less than or
4593 | equal to the corresponding value `b', and 0 otherwise. The invalid
4594 | exception is raised if either operand is a NaN. The comparison is performed
4595 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4596 *----------------------------------------------------------------------------*/
4598 int float64_le(float64 a
, float64 b
, float_status
*status
)
4602 a
= float64_squash_input_denormal(a
, status
);
4603 b
= float64_squash_input_denormal(b
, status
);
4605 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4606 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4608 float_raise(float_flag_invalid
, status
);
4611 aSign
= extractFloat64Sign( a
);
4612 bSign
= extractFloat64Sign( b
);
4613 av
= float64_val(a
);
4614 bv
= float64_val(b
);
4615 if ( aSign
!= bSign
) return aSign
|| ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
4616 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
4620 /*----------------------------------------------------------------------------
4621 | Returns 1 if the double-precision floating-point value `a' is less than
4622 | the corresponding value `b', and 0 otherwise. The invalid exception is
4623 | raised if either operand is a NaN. The comparison is performed according
4624 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4625 *----------------------------------------------------------------------------*/
4627 int float64_lt(float64 a
, float64 b
, float_status
*status
)
4632 a
= float64_squash_input_denormal(a
, status
);
4633 b
= float64_squash_input_denormal(b
, status
);
4634 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4635 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4637 float_raise(float_flag_invalid
, status
);
4640 aSign
= extractFloat64Sign( a
);
4641 bSign
= extractFloat64Sign( b
);
4642 av
= float64_val(a
);
4643 bv
= float64_val(b
);
4644 if ( aSign
!= bSign
) return aSign
&& ( (uint64_t) ( ( av
| bv
)<<1 ) != 0 );
4645 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
4649 /*----------------------------------------------------------------------------
4650 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4651 | be compared, and 0 otherwise. The invalid exception is raised if either
4652 | operand is a NaN. The comparison is performed according to the IEC/IEEE
4653 | Standard for Binary Floating-Point Arithmetic.
4654 *----------------------------------------------------------------------------*/
4656 int float64_unordered(float64 a
, float64 b
, float_status
*status
)
4658 a
= float64_squash_input_denormal(a
, status
);
4659 b
= float64_squash_input_denormal(b
, status
);
4661 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4662 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4664 float_raise(float_flag_invalid
, status
);
4670 /*----------------------------------------------------------------------------
4671 | Returns 1 if the double-precision floating-point value `a' is equal to the
4672 | corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4673 | exception.The comparison is performed according to the IEC/IEEE Standard
4674 | for Binary Floating-Point Arithmetic.
4675 *----------------------------------------------------------------------------*/
4677 int float64_eq_quiet(float64 a
, float64 b
, float_status
*status
)
4680 a
= float64_squash_input_denormal(a
, status
);
4681 b
= float64_squash_input_denormal(b
, status
);
4683 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4684 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4686 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
4687 float_raise(float_flag_invalid
, status
);
4691 av
= float64_val(a
);
4692 bv
= float64_val(b
);
4693 return ( av
== bv
) || ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
4697 /*----------------------------------------------------------------------------
4698 | Returns 1 if the double-precision floating-point value `a' is less than or
4699 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4700 | cause an exception. Otherwise, the comparison is performed according to the
4701 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4702 *----------------------------------------------------------------------------*/
4704 int float64_le_quiet(float64 a
, float64 b
, float_status
*status
)
4708 a
= float64_squash_input_denormal(a
, status
);
4709 b
= float64_squash_input_denormal(b
, status
);
4711 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4712 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4714 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
4715 float_raise(float_flag_invalid
, status
);
4719 aSign
= extractFloat64Sign( a
);
4720 bSign
= extractFloat64Sign( b
);
4721 av
= float64_val(a
);
4722 bv
= float64_val(b
);
4723 if ( aSign
!= bSign
) return aSign
|| ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
4724 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
4728 /*----------------------------------------------------------------------------
4729 | Returns 1 if the double-precision floating-point value `a' is less than
4730 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4731 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
4732 | Standard for Binary Floating-Point Arithmetic.
4733 *----------------------------------------------------------------------------*/
4735 int float64_lt_quiet(float64 a
, float64 b
, float_status
*status
)
4739 a
= float64_squash_input_denormal(a
, status
);
4740 b
= float64_squash_input_denormal(b
, status
);
4742 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4743 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4745 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
4746 float_raise(float_flag_invalid
, status
);
4750 aSign
= extractFloat64Sign( a
);
4751 bSign
= extractFloat64Sign( b
);
4752 av
= float64_val(a
);
4753 bv
= float64_val(b
);
4754 if ( aSign
!= bSign
) return aSign
&& ( (uint64_t) ( ( av
| bv
)<<1 ) != 0 );
4755 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
4759 /*----------------------------------------------------------------------------
4760 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4761 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
4762 | comparison is performed according to the IEC/IEEE Standard for Binary
4763 | Floating-Point Arithmetic.
4764 *----------------------------------------------------------------------------*/
4766 int float64_unordered_quiet(float64 a
, float64 b
, float_status
*status
)
4768 a
= float64_squash_input_denormal(a
, status
);
4769 b
= float64_squash_input_denormal(b
, status
);
4771 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4772 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4774 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
4775 float_raise(float_flag_invalid
, status
);
4782 /*----------------------------------------------------------------------------
4783 | Returns the result of converting the extended double-precision floating-
4784 | point value `a' to the 32-bit two's complement integer format. The
4785 | conversion is performed according to the IEC/IEEE Standard for Binary
4786 | Floating-Point Arithmetic---which means in particular that the conversion
4787 | is rounded according to the current rounding mode. If `a' is a NaN, the
4788 | largest positive integer is returned. Otherwise, if the conversion
4789 | overflows, the largest integer with the same sign as `a' is returned.
4790 *----------------------------------------------------------------------------*/
4792 int32_t floatx80_to_int32(floatx80 a
, float_status
*status
)
4795 int32_t aExp
, shiftCount
;
4798 aSig
= extractFloatx80Frac( a
);
4799 aExp
= extractFloatx80Exp( a
);
4800 aSign
= extractFloatx80Sign( a
);
4801 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) aSign
= 0;
4802 shiftCount
= 0x4037 - aExp
;
4803 if ( shiftCount
<= 0 ) shiftCount
= 1;
4804 shift64RightJamming( aSig
, shiftCount
, &aSig
);
4805 return roundAndPackInt32(aSign
, aSig
, status
);
4809 /*----------------------------------------------------------------------------
4810 | Returns the result of converting the extended double-precision floating-
4811 | point value `a' to the 32-bit two's complement integer format. The
4812 | conversion is performed according to the IEC/IEEE Standard for Binary
4813 | Floating-Point Arithmetic, except that the conversion is always rounded
4814 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4815 | Otherwise, if the conversion overflows, the largest integer with the same
4816 | sign as `a' is returned.
4817 *----------------------------------------------------------------------------*/
4819 int32_t floatx80_to_int32_round_to_zero(floatx80 a
, float_status
*status
)
4822 int32_t aExp
, shiftCount
;
4823 uint64_t aSig
, savedASig
;
4826 aSig
= extractFloatx80Frac( a
);
4827 aExp
= extractFloatx80Exp( a
);
4828 aSign
= extractFloatx80Sign( a
);
4829 if ( 0x401E < aExp
) {
4830 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) aSign
= 0;
4833 else if ( aExp
< 0x3FFF ) {
4835 status
->float_exception_flags
|= float_flag_inexact
;
4839 shiftCount
= 0x403E - aExp
;
4841 aSig
>>= shiftCount
;
4843 if ( aSign
) z
= - z
;
4844 if ( ( z
< 0 ) ^ aSign
) {
4846 float_raise(float_flag_invalid
, status
);
4847 return aSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
4849 if ( ( aSig
<<shiftCount
) != savedASig
) {
4850 status
->float_exception_flags
|= float_flag_inexact
;
4856 /*----------------------------------------------------------------------------
4857 | Returns the result of converting the extended double-precision floating-
4858 | point value `a' to the 64-bit two's complement integer format. The
4859 | conversion is performed according to the IEC/IEEE Standard for Binary
4860 | Floating-Point Arithmetic---which means in particular that the conversion
4861 | is rounded according to the current rounding mode. If `a' is a NaN,
4862 | the largest positive integer is returned. Otherwise, if the conversion
4863 | overflows, the largest integer with the same sign as `a' is returned.
4864 *----------------------------------------------------------------------------*/
4866 int64_t floatx80_to_int64(floatx80 a
, float_status
*status
)
4869 int32_t aExp
, shiftCount
;
4870 uint64_t aSig
, aSigExtra
;
4872 aSig
= extractFloatx80Frac( a
);
4873 aExp
= extractFloatx80Exp( a
);
4874 aSign
= extractFloatx80Sign( a
);
4875 shiftCount
= 0x403E - aExp
;
4876 if ( shiftCount
<= 0 ) {
4878 float_raise(float_flag_invalid
, status
);
4880 || ( ( aExp
== 0x7FFF )
4881 && ( aSig
!= LIT64( 0x8000000000000000 ) ) )
4883 return LIT64( 0x7FFFFFFFFFFFFFFF );
4885 return (int64_t) LIT64( 0x8000000000000000 );
4890 shift64ExtraRightJamming( aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
4892 return roundAndPackInt64(aSign
, aSig
, aSigExtra
, status
);
4896 /*----------------------------------------------------------------------------
4897 | Returns the result of converting the extended double-precision floating-
4898 | point value `a' to the 64-bit two's complement integer format. The
4899 | conversion is performed according to the IEC/IEEE Standard for Binary
4900 | Floating-Point Arithmetic, except that the conversion is always rounded
4901 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4902 | Otherwise, if the conversion overflows, the largest integer with the same
4903 | sign as `a' is returned.
4904 *----------------------------------------------------------------------------*/
4906 int64_t floatx80_to_int64_round_to_zero(floatx80 a
, float_status
*status
)
4909 int32_t aExp
, shiftCount
;
4913 aSig
= extractFloatx80Frac( a
);
4914 aExp
= extractFloatx80Exp( a
);
4915 aSign
= extractFloatx80Sign( a
);
4916 shiftCount
= aExp
- 0x403E;
4917 if ( 0 <= shiftCount
) {
4918 aSig
&= LIT64( 0x7FFFFFFFFFFFFFFF );
4919 if ( ( a
.high
!= 0xC03E ) || aSig
) {
4920 float_raise(float_flag_invalid
, status
);
4921 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && aSig
) ) {
4922 return LIT64( 0x7FFFFFFFFFFFFFFF );
4925 return (int64_t) LIT64( 0x8000000000000000 );
4927 else if ( aExp
< 0x3FFF ) {
4929 status
->float_exception_flags
|= float_flag_inexact
;
4933 z
= aSig
>>( - shiftCount
);
4934 if ( (uint64_t) ( aSig
<<( shiftCount
& 63 ) ) ) {
4935 status
->float_exception_flags
|= float_flag_inexact
;
4937 if ( aSign
) z
= - z
;
4942 /*----------------------------------------------------------------------------
4943 | Returns the result of converting the extended double-precision floating-
4944 | point value `a' to the single-precision floating-point format. The
4945 | conversion is performed according to the IEC/IEEE Standard for Binary
4946 | Floating-Point Arithmetic.
4947 *----------------------------------------------------------------------------*/
4949 float32
floatx80_to_float32(floatx80 a
, float_status
*status
)
4955 aSig
= extractFloatx80Frac( a
);
4956 aExp
= extractFloatx80Exp( a
);
4957 aSign
= extractFloatx80Sign( a
);
4958 if ( aExp
== 0x7FFF ) {
4959 if ( (uint64_t) ( aSig
<<1 ) ) {
4960 return commonNaNToFloat32(floatx80ToCommonNaN(a
, status
), status
);
4962 return packFloat32( aSign
, 0xFF, 0 );
4964 shift64RightJamming( aSig
, 33, &aSig
);
4965 if ( aExp
|| aSig
) aExp
-= 0x3F81;
4966 return roundAndPackFloat32(aSign
, aExp
, aSig
, status
);
4970 /*----------------------------------------------------------------------------
4971 | Returns the result of converting the extended double-precision floating-
4972 | point value `a' to the double-precision floating-point format. The
4973 | conversion is performed according to the IEC/IEEE Standard for Binary
4974 | Floating-Point Arithmetic.
4975 *----------------------------------------------------------------------------*/
4977 float64
floatx80_to_float64(floatx80 a
, float_status
*status
)
4981 uint64_t aSig
, zSig
;
4983 aSig
= extractFloatx80Frac( a
);
4984 aExp
= extractFloatx80Exp( a
);
4985 aSign
= extractFloatx80Sign( a
);
4986 if ( aExp
== 0x7FFF ) {
4987 if ( (uint64_t) ( aSig
<<1 ) ) {
4988 return commonNaNToFloat64(floatx80ToCommonNaN(a
, status
), status
);
4990 return packFloat64( aSign
, 0x7FF, 0 );
4992 shift64RightJamming( aSig
, 1, &zSig
);
4993 if ( aExp
|| aSig
) aExp
-= 0x3C01;
4994 return roundAndPackFloat64(aSign
, aExp
, zSig
, status
);
4998 /*----------------------------------------------------------------------------
4999 | Returns the result of converting the extended double-precision floating-
5000 | point value `a' to the quadruple-precision floating-point format. The
5001 | conversion is performed according to the IEC/IEEE Standard for Binary
5002 | Floating-Point Arithmetic.
5003 *----------------------------------------------------------------------------*/
5005 float128
floatx80_to_float128(floatx80 a
, float_status
*status
)
5009 uint64_t aSig
, zSig0
, zSig1
;
5011 aSig
= extractFloatx80Frac( a
);
5012 aExp
= extractFloatx80Exp( a
);
5013 aSign
= extractFloatx80Sign( a
);
5014 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) {
5015 return commonNaNToFloat128(floatx80ToCommonNaN(a
, status
), status
);
5017 shift128Right( aSig
<<1, 0, 16, &zSig0
, &zSig1
);
5018 return packFloat128( aSign
, aExp
, zSig0
, zSig1
);
5022 /*----------------------------------------------------------------------------
5023 | Rounds the extended double-precision floating-point value `a' to an integer,
5024 | and returns the result as an extended quadruple-precision floating-point
5025 | value. The operation is performed according to the IEC/IEEE Standard for
5026 | Binary Floating-Point Arithmetic.
5027 *----------------------------------------------------------------------------*/
5029 floatx80
floatx80_round_to_int(floatx80 a
, float_status
*status
)
5033 uint64_t lastBitMask
, roundBitsMask
;
5036 aExp
= extractFloatx80Exp( a
);
5037 if ( 0x403E <= aExp
) {
5038 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) ) {
5039 return propagateFloatx80NaN(a
, a
, status
);
5043 if ( aExp
< 0x3FFF ) {
5045 && ( (uint64_t) ( extractFloatx80Frac( a
)<<1 ) == 0 ) ) {
5048 status
->float_exception_flags
|= float_flag_inexact
;
5049 aSign
= extractFloatx80Sign( a
);
5050 switch (status
->float_rounding_mode
) {
5051 case float_round_nearest_even
:
5052 if ( ( aExp
== 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a
)<<1 )
5055 packFloatx80( aSign
, 0x3FFF, LIT64( 0x8000000000000000 ) );
5058 case float_round_ties_away
:
5059 if (aExp
== 0x3FFE) {
5060 return packFloatx80(aSign
, 0x3FFF, LIT64(0x8000000000000000));
5063 case float_round_down
:
5066 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
5067 : packFloatx80( 0, 0, 0 );
5068 case float_round_up
:
5070 aSign
? packFloatx80( 1, 0, 0 )
5071 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
5073 return packFloatx80( aSign
, 0, 0 );
5076 lastBitMask
<<= 0x403E - aExp
;
5077 roundBitsMask
= lastBitMask
- 1;
5079 switch (status
->float_rounding_mode
) {
5080 case float_round_nearest_even
:
5081 z
.low
+= lastBitMask
>>1;
5082 if ((z
.low
& roundBitsMask
) == 0) {
5083 z
.low
&= ~lastBitMask
;
5086 case float_round_ties_away
:
5087 z
.low
+= lastBitMask
>> 1;
5089 case float_round_to_zero
:
5091 case float_round_up
:
5092 if (!extractFloatx80Sign(z
)) {
5093 z
.low
+= roundBitsMask
;
5096 case float_round_down
:
5097 if (extractFloatx80Sign(z
)) {
5098 z
.low
+= roundBitsMask
;
5104 z
.low
&= ~ roundBitsMask
;
5107 z
.low
= LIT64( 0x8000000000000000 );
5109 if (z
.low
!= a
.low
) {
5110 status
->float_exception_flags
|= float_flag_inexact
;
5116 /*----------------------------------------------------------------------------
5117 | Returns the result of adding the absolute values of the extended double-
5118 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
5119 | negated before being returned. `zSign' is ignored if the result is a NaN.
5120 | The addition is performed according to the IEC/IEEE Standard for Binary
5121 | Floating-Point Arithmetic.
5122 *----------------------------------------------------------------------------*/
5124 static floatx80
addFloatx80Sigs(floatx80 a
, floatx80 b
, flag zSign
,
5125 float_status
*status
)
5127 int32_t aExp
, bExp
, zExp
;
5128 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5131 aSig
= extractFloatx80Frac( a
);
5132 aExp
= extractFloatx80Exp( a
);
5133 bSig
= extractFloatx80Frac( b
);
5134 bExp
= extractFloatx80Exp( b
);
5135 expDiff
= aExp
- bExp
;
5136 if ( 0 < expDiff
) {
5137 if ( aExp
== 0x7FFF ) {
5138 if ((uint64_t)(aSig
<< 1)) {
5139 return propagateFloatx80NaN(a
, b
, status
);
5143 if ( bExp
== 0 ) --expDiff
;
5144 shift64ExtraRightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
5147 else if ( expDiff
< 0 ) {
5148 if ( bExp
== 0x7FFF ) {
5149 if ((uint64_t)(bSig
<< 1)) {
5150 return propagateFloatx80NaN(a
, b
, status
);
5152 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
5154 if ( aExp
== 0 ) ++expDiff
;
5155 shift64ExtraRightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
5159 if ( aExp
== 0x7FFF ) {
5160 if ( (uint64_t) ( ( aSig
| bSig
)<<1 ) ) {
5161 return propagateFloatx80NaN(a
, b
, status
);
5166 zSig0
= aSig
+ bSig
;
5168 normalizeFloatx80Subnormal( zSig0
, &zExp
, &zSig0
);
5174 zSig0
= aSig
+ bSig
;
5175 if ( (int64_t) zSig0
< 0 ) goto roundAndPack
;
5177 shift64ExtraRightJamming( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
5178 zSig0
|= LIT64( 0x8000000000000000 );
5181 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5182 zSign
, zExp
, zSig0
, zSig1
, status
);
5185 /*----------------------------------------------------------------------------
5186 | Returns the result of subtracting the absolute values of the extended
5187 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
5188 | difference is negated before being returned. `zSign' is ignored if the
5189 | result is a NaN. The subtraction is performed according to the IEC/IEEE
5190 | Standard for Binary Floating-Point Arithmetic.
5191 *----------------------------------------------------------------------------*/
5193 static floatx80
subFloatx80Sigs(floatx80 a
, floatx80 b
, flag zSign
,
5194 float_status
*status
)
5196 int32_t aExp
, bExp
, zExp
;
5197 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5201 aSig
= extractFloatx80Frac( a
);
5202 aExp
= extractFloatx80Exp( a
);
5203 bSig
= extractFloatx80Frac( b
);
5204 bExp
= extractFloatx80Exp( b
);
5205 expDiff
= aExp
- bExp
;
5206 if ( 0 < expDiff
) goto aExpBigger
;
5207 if ( expDiff
< 0 ) goto bExpBigger
;
5208 if ( aExp
== 0x7FFF ) {
5209 if ( (uint64_t) ( ( aSig
| bSig
)<<1 ) ) {
5210 return propagateFloatx80NaN(a
, b
, status
);
5212 float_raise(float_flag_invalid
, status
);
5213 z
.low
= floatx80_default_nan_low
;
5214 z
.high
= floatx80_default_nan_high
;
5222 if ( bSig
< aSig
) goto aBigger
;
5223 if ( aSig
< bSig
) goto bBigger
;
5224 return packFloatx80(status
->float_rounding_mode
== float_round_down
, 0, 0);
5226 if ( bExp
== 0x7FFF ) {
5227 if ((uint64_t)(bSig
<< 1)) {
5228 return propagateFloatx80NaN(a
, b
, status
);
5230 return packFloatx80( zSign
^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
5232 if ( aExp
== 0 ) ++expDiff
;
5233 shift128RightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
5235 sub128( bSig
, 0, aSig
, zSig1
, &zSig0
, &zSig1
);
5238 goto normalizeRoundAndPack
;
5240 if ( aExp
== 0x7FFF ) {
5241 if ((uint64_t)(aSig
<< 1)) {
5242 return propagateFloatx80NaN(a
, b
, status
);
5246 if ( bExp
== 0 ) --expDiff
;
5247 shift128RightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
5249 sub128( aSig
, 0, bSig
, zSig1
, &zSig0
, &zSig1
);
5251 normalizeRoundAndPack
:
5252 return normalizeRoundAndPackFloatx80(status
->floatx80_rounding_precision
,
5253 zSign
, zExp
, zSig0
, zSig1
, status
);
5256 /*----------------------------------------------------------------------------
5257 | Returns the result of adding the extended double-precision floating-point
5258 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5259 | Standard for Binary Floating-Point Arithmetic.
5260 *----------------------------------------------------------------------------*/
5262 floatx80
floatx80_add(floatx80 a
, floatx80 b
, float_status
*status
)
5266 aSign
= extractFloatx80Sign( a
);
5267 bSign
= extractFloatx80Sign( b
);
5268 if ( aSign
== bSign
) {
5269 return addFloatx80Sigs(a
, b
, aSign
, status
);
5272 return subFloatx80Sigs(a
, b
, aSign
, status
);
5277 /*----------------------------------------------------------------------------
5278 | Returns the result of subtracting the extended double-precision floating-
5279 | point values `a' and `b'. The operation is performed according to the
5280 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5281 *----------------------------------------------------------------------------*/
5283 floatx80
floatx80_sub(floatx80 a
, floatx80 b
, float_status
*status
)
5287 aSign
= extractFloatx80Sign( a
);
5288 bSign
= extractFloatx80Sign( b
);
5289 if ( aSign
== bSign
) {
5290 return subFloatx80Sigs(a
, b
, aSign
, status
);
5293 return addFloatx80Sigs(a
, b
, aSign
, status
);
5298 /*----------------------------------------------------------------------------
5299 | Returns the result of multiplying the extended double-precision floating-
5300 | point values `a' and `b'. The operation is performed according to the
5301 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5302 *----------------------------------------------------------------------------*/
5304 floatx80
floatx80_mul(floatx80 a
, floatx80 b
, float_status
*status
)
5306 flag aSign
, bSign
, zSign
;
5307 int32_t aExp
, bExp
, zExp
;
5308 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5311 aSig
= extractFloatx80Frac( a
);
5312 aExp
= extractFloatx80Exp( a
);
5313 aSign
= extractFloatx80Sign( a
);
5314 bSig
= extractFloatx80Frac( b
);
5315 bExp
= extractFloatx80Exp( b
);
5316 bSign
= extractFloatx80Sign( b
);
5317 zSign
= aSign
^ bSign
;
5318 if ( aExp
== 0x7FFF ) {
5319 if ( (uint64_t) ( aSig
<<1 )
5320 || ( ( bExp
== 0x7FFF ) && (uint64_t) ( bSig
<<1 ) ) ) {
5321 return propagateFloatx80NaN(a
, b
, status
);
5323 if ( ( bExp
| bSig
) == 0 ) goto invalid
;
5324 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
5326 if ( bExp
== 0x7FFF ) {
5327 if ((uint64_t)(bSig
<< 1)) {
5328 return propagateFloatx80NaN(a
, b
, status
);
5330 if ( ( aExp
| aSig
) == 0 ) {
5332 float_raise(float_flag_invalid
, status
);
5333 z
.low
= floatx80_default_nan_low
;
5334 z
.high
= floatx80_default_nan_high
;
5337 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
5340 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
5341 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
5344 if ( bSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
5345 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
5347 zExp
= aExp
+ bExp
- 0x3FFE;
5348 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
5349 if ( 0 < (int64_t) zSig0
) {
5350 shortShift128Left( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
5353 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5354 zSign
, zExp
, zSig0
, zSig1
, status
);
5357 /*----------------------------------------------------------------------------
5358 | Returns the result of dividing the extended double-precision floating-point
5359 | value `a' by the corresponding value `b'. The operation is performed
5360 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5361 *----------------------------------------------------------------------------*/
5363 floatx80
floatx80_div(floatx80 a
, floatx80 b
, float_status
*status
)
5365 flag aSign
, bSign
, zSign
;
5366 int32_t aExp
, bExp
, zExp
;
5367 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5368 uint64_t rem0
, rem1
, rem2
, term0
, term1
, term2
;
5371 aSig
= extractFloatx80Frac( a
);
5372 aExp
= extractFloatx80Exp( a
);
5373 aSign
= extractFloatx80Sign( a
);
5374 bSig
= extractFloatx80Frac( b
);
5375 bExp
= extractFloatx80Exp( b
);
5376 bSign
= extractFloatx80Sign( b
);
5377 zSign
= aSign
^ bSign
;
5378 if ( aExp
== 0x7FFF ) {
5379 if ((uint64_t)(aSig
<< 1)) {
5380 return propagateFloatx80NaN(a
, b
, status
);
5382 if ( bExp
== 0x7FFF ) {
5383 if ((uint64_t)(bSig
<< 1)) {
5384 return propagateFloatx80NaN(a
, b
, status
);
5388 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
5390 if ( bExp
== 0x7FFF ) {
5391 if ((uint64_t)(bSig
<< 1)) {
5392 return propagateFloatx80NaN(a
, b
, status
);
5394 return packFloatx80( zSign
, 0, 0 );
5398 if ( ( aExp
| aSig
) == 0 ) {
5400 float_raise(float_flag_invalid
, status
);
5401 z
.low
= floatx80_default_nan_low
;
5402 z
.high
= floatx80_default_nan_high
;
5405 float_raise(float_flag_divbyzero
, status
);
5406 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
5408 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
5411 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
5412 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
5414 zExp
= aExp
- bExp
+ 0x3FFE;
5416 if ( bSig
<= aSig
) {
5417 shift128Right( aSig
, 0, 1, &aSig
, &rem1
);
5420 zSig0
= estimateDiv128To64( aSig
, rem1
, bSig
);
5421 mul64To128( bSig
, zSig0
, &term0
, &term1
);
5422 sub128( aSig
, rem1
, term0
, term1
, &rem0
, &rem1
);
5423 while ( (int64_t) rem0
< 0 ) {
5425 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
5427 zSig1
= estimateDiv128To64( rem1
, 0, bSig
);
5428 if ( (uint64_t) ( zSig1
<<1 ) <= 8 ) {
5429 mul64To128( bSig
, zSig1
, &term1
, &term2
);
5430 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
5431 while ( (int64_t) rem1
< 0 ) {
5433 add128( rem1
, rem2
, 0, bSig
, &rem1
, &rem2
);
5435 zSig1
|= ( ( rem1
| rem2
) != 0 );
5437 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5438 zSign
, zExp
, zSig0
, zSig1
, status
);
5441 /*----------------------------------------------------------------------------
5442 | Returns the remainder of the extended double-precision floating-point value
5443 | `a' with respect to the corresponding value `b'. The operation is performed
5444 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5445 *----------------------------------------------------------------------------*/
5447 floatx80
floatx80_rem(floatx80 a
, floatx80 b
, float_status
*status
)
5450 int32_t aExp
, bExp
, expDiff
;
5451 uint64_t aSig0
, aSig1
, bSig
;
5452 uint64_t q
, term0
, term1
, alternateASig0
, alternateASig1
;
5455 aSig0
= extractFloatx80Frac( a
);
5456 aExp
= extractFloatx80Exp( a
);
5457 aSign
= extractFloatx80Sign( a
);
5458 bSig
= extractFloatx80Frac( b
);
5459 bExp
= extractFloatx80Exp( b
);
5460 if ( aExp
== 0x7FFF ) {
5461 if ( (uint64_t) ( aSig0
<<1 )
5462 || ( ( bExp
== 0x7FFF ) && (uint64_t) ( bSig
<<1 ) ) ) {
5463 return propagateFloatx80NaN(a
, b
, status
);
5467 if ( bExp
== 0x7FFF ) {
5468 if ((uint64_t)(bSig
<< 1)) {
5469 return propagateFloatx80NaN(a
, b
, status
);
5476 float_raise(float_flag_invalid
, status
);
5477 z
.low
= floatx80_default_nan_low
;
5478 z
.high
= floatx80_default_nan_high
;
5481 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
5484 if ( (uint64_t) ( aSig0
<<1 ) == 0 ) return a
;
5485 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
5487 bSig
|= LIT64( 0x8000000000000000 );
5489 expDiff
= aExp
- bExp
;
5491 if ( expDiff
< 0 ) {
5492 if ( expDiff
< -1 ) return a
;
5493 shift128Right( aSig0
, 0, 1, &aSig0
, &aSig1
);
5496 q
= ( bSig
<= aSig0
);
5497 if ( q
) aSig0
-= bSig
;
5499 while ( 0 < expDiff
) {
5500 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
5501 q
= ( 2 < q
) ? q
- 2 : 0;
5502 mul64To128( bSig
, q
, &term0
, &term1
);
5503 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
5504 shortShift128Left( aSig0
, aSig1
, 62, &aSig0
, &aSig1
);
5508 if ( 0 < expDiff
) {
5509 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
5510 q
= ( 2 < q
) ? q
- 2 : 0;
5512 mul64To128( bSig
, q
<<( 64 - expDiff
), &term0
, &term1
);
5513 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
5514 shortShift128Left( 0, bSig
, 64 - expDiff
, &term0
, &term1
);
5515 while ( le128( term0
, term1
, aSig0
, aSig1
) ) {
5517 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
5524 sub128( term0
, term1
, aSig0
, aSig1
, &alternateASig0
, &alternateASig1
);
5525 if ( lt128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
5526 || ( eq128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
5529 aSig0
= alternateASig0
;
5530 aSig1
= alternateASig1
;
5534 normalizeRoundAndPackFloatx80(
5535 80, zSign
, bExp
+ expDiff
, aSig0
, aSig1
, status
);
5539 /*----------------------------------------------------------------------------
5540 | Returns the square root of the extended double-precision floating-point
5541 | value `a'. The operation is performed according to the IEC/IEEE Standard
5542 | for Binary Floating-Point Arithmetic.
5543 *----------------------------------------------------------------------------*/
5545 floatx80
floatx80_sqrt(floatx80 a
, float_status
*status
)
5549 uint64_t aSig0
, aSig1
, zSig0
, zSig1
, doubleZSig0
;
5550 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
5553 aSig0
= extractFloatx80Frac( a
);
5554 aExp
= extractFloatx80Exp( a
);
5555 aSign
= extractFloatx80Sign( a
);
5556 if ( aExp
== 0x7FFF ) {
5557 if ((uint64_t)(aSig0
<< 1)) {
5558 return propagateFloatx80NaN(a
, a
, status
);
5560 if ( ! aSign
) return a
;
5564 if ( ( aExp
| aSig0
) == 0 ) return a
;
5566 float_raise(float_flag_invalid
, status
);
5567 z
.low
= floatx80_default_nan_low
;
5568 z
.high
= floatx80_default_nan_high
;
5572 if ( aSig0
== 0 ) return packFloatx80( 0, 0, 0 );
5573 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
5575 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFF;
5576 zSig0
= estimateSqrt32( aExp
, aSig0
>>32 );
5577 shift128Right( aSig0
, 0, 2 + ( aExp
& 1 ), &aSig0
, &aSig1
);
5578 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
5579 doubleZSig0
= zSig0
<<1;
5580 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
5581 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
5582 while ( (int64_t) rem0
< 0 ) {
5585 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
5587 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
5588 if ( ( zSig1
& LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5589 if ( zSig1
== 0 ) zSig1
= 1;
5590 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
5591 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
5592 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
5593 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
5594 while ( (int64_t) rem1
< 0 ) {
5596 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
5598 term2
|= doubleZSig0
;
5599 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
5601 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
5603 shortShift128Left( 0, zSig1
, 1, &zSig0
, &zSig1
);
5604 zSig0
|= doubleZSig0
;
5605 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5606 0, zExp
, zSig0
, zSig1
, status
);
5609 /*----------------------------------------------------------------------------
5610 | Returns 1 if the extended double-precision floating-point value `a' is equal
5611 | to the corresponding value `b', and 0 otherwise. The invalid exception is
5612 | raised if either operand is a NaN. Otherwise, the comparison is performed
5613 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5614 *----------------------------------------------------------------------------*/
5616 int floatx80_eq(floatx80 a
, floatx80 b
, float_status
*status
)
5619 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5620 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5621 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5622 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5624 float_raise(float_flag_invalid
, status
);
5629 && ( ( a
.high
== b
.high
)
5631 && ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
5636 /*----------------------------------------------------------------------------
5637 | Returns 1 if the extended double-precision floating-point value `a' is
5638 | less than or equal to the corresponding value `b', and 0 otherwise. The
5639 | invalid exception is raised if either operand is a NaN. The comparison is
5640 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5642 *----------------------------------------------------------------------------*/
5644 int floatx80_le(floatx80 a
, floatx80 b
, float_status
*status
)
5648 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5649 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5650 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5651 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5653 float_raise(float_flag_invalid
, status
);
5656 aSign
= extractFloatx80Sign( a
);
5657 bSign
= extractFloatx80Sign( b
);
5658 if ( aSign
!= bSign
) {
5661 || ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5665 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
5666 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
5670 /*----------------------------------------------------------------------------
5671 | Returns 1 if the extended double-precision floating-point value `a' is
5672 | less than the corresponding value `b', and 0 otherwise. The invalid
5673 | exception is raised if either operand is a NaN. The comparison is performed
5674 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5675 *----------------------------------------------------------------------------*/
5677 int floatx80_lt(floatx80 a
, floatx80 b
, float_status
*status
)
5681 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5682 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5683 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5684 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5686 float_raise(float_flag_invalid
, status
);
5689 aSign
= extractFloatx80Sign( a
);
5690 bSign
= extractFloatx80Sign( b
);
5691 if ( aSign
!= bSign
) {
5694 && ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5698 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
5699 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
5703 /*----------------------------------------------------------------------------
5704 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5705 | cannot be compared, and 0 otherwise. The invalid exception is raised if
5706 | either operand is a NaN. The comparison is performed according to the
5707 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5708 *----------------------------------------------------------------------------*/
5709 int floatx80_unordered(floatx80 a
, floatx80 b
, float_status
*status
)
5711 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5712 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5713 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5714 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5716 float_raise(float_flag_invalid
, status
);
5722 /*----------------------------------------------------------------------------
5723 | Returns 1 if the extended double-precision floating-point value `a' is
5724 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5725 | cause an exception. The comparison is performed according to the IEC/IEEE
5726 | Standard for Binary Floating-Point Arithmetic.
5727 *----------------------------------------------------------------------------*/
5729 int floatx80_eq_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
5732 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5733 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5734 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5735 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5737 if ( floatx80_is_signaling_nan( a
)
5738 || floatx80_is_signaling_nan( b
) ) {
5739 float_raise(float_flag_invalid
, status
);
5745 && ( ( a
.high
== b
.high
)
5747 && ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
5752 /*----------------------------------------------------------------------------
5753 | Returns 1 if the extended double-precision floating-point value `a' is less
5754 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
5755 | do not cause an exception. Otherwise, the comparison is performed according
5756 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5757 *----------------------------------------------------------------------------*/
5759 int floatx80_le_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
5763 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5764 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5765 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5766 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5768 if ( floatx80_is_signaling_nan( a
)
5769 || floatx80_is_signaling_nan( b
) ) {
5770 float_raise(float_flag_invalid
, status
);
5774 aSign
= extractFloatx80Sign( a
);
5775 bSign
= extractFloatx80Sign( b
);
5776 if ( aSign
!= bSign
) {
5779 || ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5783 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
5784 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
5788 /*----------------------------------------------------------------------------
5789 | Returns 1 if the extended double-precision floating-point value `a' is less
5790 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
5791 | an exception. Otherwise, the comparison is performed according to the
5792 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5793 *----------------------------------------------------------------------------*/
5795 int floatx80_lt_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
5799 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5800 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5801 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5802 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5804 if ( floatx80_is_signaling_nan( a
)
5805 || floatx80_is_signaling_nan( b
) ) {
5806 float_raise(float_flag_invalid
, status
);
5810 aSign
= extractFloatx80Sign( a
);
5811 bSign
= extractFloatx80Sign( b
);
5812 if ( aSign
!= bSign
) {
5815 && ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5819 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
5820 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
5824 /*----------------------------------------------------------------------------
5825 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5826 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
5827 | The comparison is performed according to the IEC/IEEE Standard for Binary
5828 | Floating-Point Arithmetic.
5829 *----------------------------------------------------------------------------*/
5830 int floatx80_unordered_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
5832 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5833 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5834 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5835 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5837 if ( floatx80_is_signaling_nan( a
)
5838 || floatx80_is_signaling_nan( b
) ) {
5839 float_raise(float_flag_invalid
, status
);
5846 /*----------------------------------------------------------------------------
5847 | Returns the result of converting the quadruple-precision floating-point
5848 | value `a' to the 32-bit two's complement integer format. The conversion
5849 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5850 | Arithmetic---which means in particular that the conversion is rounded
5851 | according to the current rounding mode. If `a' is a NaN, the largest
5852 | positive integer is returned. Otherwise, if the conversion overflows, the
5853 | largest integer with the same sign as `a' is returned.
5854 *----------------------------------------------------------------------------*/
5856 int32_t float128_to_int32(float128 a
, float_status
*status
)
5859 int32_t aExp
, shiftCount
;
5860 uint64_t aSig0
, aSig1
;
5862 aSig1
= extractFloat128Frac1( a
);
5863 aSig0
= extractFloat128Frac0( a
);
5864 aExp
= extractFloat128Exp( a
);
5865 aSign
= extractFloat128Sign( a
);
5866 if ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) aSign
= 0;
5867 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
5868 aSig0
|= ( aSig1
!= 0 );
5869 shiftCount
= 0x4028 - aExp
;
5870 if ( 0 < shiftCount
) shift64RightJamming( aSig0
, shiftCount
, &aSig0
);
5871 return roundAndPackInt32(aSign
, aSig0
, status
);
5875 /*----------------------------------------------------------------------------
5876 | Returns the result of converting the quadruple-precision floating-point
5877 | value `a' to the 32-bit two's complement integer format. The conversion
5878 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5879 | Arithmetic, except that the conversion is always rounded toward zero. If
5880 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
5881 | conversion overflows, the largest integer with the same sign as `a' is
5883 *----------------------------------------------------------------------------*/
5885 int32_t float128_to_int32_round_to_zero(float128 a
, float_status
*status
)
5888 int32_t aExp
, shiftCount
;
5889 uint64_t aSig0
, aSig1
, savedASig
;
5892 aSig1
= extractFloat128Frac1( a
);
5893 aSig0
= extractFloat128Frac0( a
);
5894 aExp
= extractFloat128Exp( a
);
5895 aSign
= extractFloat128Sign( a
);
5896 aSig0
|= ( aSig1
!= 0 );
5897 if ( 0x401E < aExp
) {
5898 if ( ( aExp
== 0x7FFF ) && aSig0
) aSign
= 0;
5901 else if ( aExp
< 0x3FFF ) {
5902 if (aExp
|| aSig0
) {
5903 status
->float_exception_flags
|= float_flag_inexact
;
5907 aSig0
|= LIT64( 0x0001000000000000 );
5908 shiftCount
= 0x402F - aExp
;
5910 aSig0
>>= shiftCount
;
5912 if ( aSign
) z
= - z
;
5913 if ( ( z
< 0 ) ^ aSign
) {
5915 float_raise(float_flag_invalid
, status
);
5916 return aSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
5918 if ( ( aSig0
<<shiftCount
) != savedASig
) {
5919 status
->float_exception_flags
|= float_flag_inexact
;
5925 /*----------------------------------------------------------------------------
5926 | Returns the result of converting the quadruple-precision floating-point
5927 | value `a' to the 64-bit two's complement integer format. The conversion
5928 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5929 | Arithmetic---which means in particular that the conversion is rounded
5930 | according to the current rounding mode. If `a' is a NaN, the largest
5931 | positive integer is returned. Otherwise, if the conversion overflows, the
5932 | largest integer with the same sign as `a' is returned.
5933 *----------------------------------------------------------------------------*/
5935 int64_t float128_to_int64(float128 a
, float_status
*status
)
5938 int32_t aExp
, shiftCount
;
5939 uint64_t aSig0
, aSig1
;
5941 aSig1
= extractFloat128Frac1( a
);
5942 aSig0
= extractFloat128Frac0( a
);
5943 aExp
= extractFloat128Exp( a
);
5944 aSign
= extractFloat128Sign( a
);
5945 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
5946 shiftCount
= 0x402F - aExp
;
5947 if ( shiftCount
<= 0 ) {
5948 if ( 0x403E < aExp
) {
5949 float_raise(float_flag_invalid
, status
);
5951 || ( ( aExp
== 0x7FFF )
5952 && ( aSig1
|| ( aSig0
!= LIT64( 0x0001000000000000 ) ) )
5955 return LIT64( 0x7FFFFFFFFFFFFFFF );
5957 return (int64_t) LIT64( 0x8000000000000000 );
5959 shortShift128Left( aSig0
, aSig1
, - shiftCount
, &aSig0
, &aSig1
);
5962 shift64ExtraRightJamming( aSig0
, aSig1
, shiftCount
, &aSig0
, &aSig1
);
5964 return roundAndPackInt64(aSign
, aSig0
, aSig1
, status
);
5968 /*----------------------------------------------------------------------------
5969 | Returns the result of converting the quadruple-precision floating-point
5970 | value `a' to the 64-bit two's complement integer format. The conversion
5971 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5972 | Arithmetic, except that the conversion is always rounded toward zero.
5973 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
5974 | the conversion overflows, the largest integer with the same sign as `a' is
5976 *----------------------------------------------------------------------------*/
5978 int64_t float128_to_int64_round_to_zero(float128 a
, float_status
*status
)
5981 int32_t aExp
, shiftCount
;
5982 uint64_t aSig0
, aSig1
;
5985 aSig1
= extractFloat128Frac1( a
);
5986 aSig0
= extractFloat128Frac0( a
);
5987 aExp
= extractFloat128Exp( a
);
5988 aSign
= extractFloat128Sign( a
);
5989 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
5990 shiftCount
= aExp
- 0x402F;
5991 if ( 0 < shiftCount
) {
5992 if ( 0x403E <= aExp
) {
5993 aSig0
&= LIT64( 0x0000FFFFFFFFFFFF );
5994 if ( ( a
.high
== LIT64( 0xC03E000000000000 ) )
5995 && ( aSig1
< LIT64( 0x0002000000000000 ) ) ) {
5997 status
->float_exception_flags
|= float_flag_inexact
;
6001 float_raise(float_flag_invalid
, status
);
6002 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) ) {
6003 return LIT64( 0x7FFFFFFFFFFFFFFF );
6006 return (int64_t) LIT64( 0x8000000000000000 );
6008 z
= ( aSig0
<<shiftCount
) | ( aSig1
>>( ( - shiftCount
) & 63 ) );
6009 if ( (uint64_t) ( aSig1
<<shiftCount
) ) {
6010 status
->float_exception_flags
|= float_flag_inexact
;
6014 if ( aExp
< 0x3FFF ) {
6015 if ( aExp
| aSig0
| aSig1
) {
6016 status
->float_exception_flags
|= float_flag_inexact
;
6020 z
= aSig0
>>( - shiftCount
);
6022 || ( shiftCount
&& (uint64_t) ( aSig0
<<( shiftCount
& 63 ) ) ) ) {
6023 status
->float_exception_flags
|= float_flag_inexact
;
6026 if ( aSign
) z
= - z
;
6031 /*----------------------------------------------------------------------------
6032 | Returns the result of converting the quadruple-precision floating-point
6033 | value `a' to the single-precision floating-point format. The conversion
6034 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6036 *----------------------------------------------------------------------------*/
6038 float32
float128_to_float32(float128 a
, float_status
*status
)
6042 uint64_t aSig0
, aSig1
;
6045 aSig1
= extractFloat128Frac1( a
);
6046 aSig0
= extractFloat128Frac0( a
);
6047 aExp
= extractFloat128Exp( a
);
6048 aSign
= extractFloat128Sign( a
);
6049 if ( aExp
== 0x7FFF ) {
6050 if ( aSig0
| aSig1
) {
6051 return commonNaNToFloat32(float128ToCommonNaN(a
, status
), status
);
6053 return packFloat32( aSign
, 0xFF, 0 );
6055 aSig0
|= ( aSig1
!= 0 );
6056 shift64RightJamming( aSig0
, 18, &aSig0
);
6058 if ( aExp
|| zSig
) {
6062 return roundAndPackFloat32(aSign
, aExp
, zSig
, status
);
6066 /*----------------------------------------------------------------------------
6067 | Returns the result of converting the quadruple-precision floating-point
6068 | value `a' to the double-precision floating-point format. The conversion
6069 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6071 *----------------------------------------------------------------------------*/
6073 float64
float128_to_float64(float128 a
, float_status
*status
)
6077 uint64_t aSig0
, aSig1
;
6079 aSig1
= extractFloat128Frac1( a
);
6080 aSig0
= extractFloat128Frac0( a
);
6081 aExp
= extractFloat128Exp( a
);
6082 aSign
= extractFloat128Sign( a
);
6083 if ( aExp
== 0x7FFF ) {
6084 if ( aSig0
| aSig1
) {
6085 return commonNaNToFloat64(float128ToCommonNaN(a
, status
), status
);
6087 return packFloat64( aSign
, 0x7FF, 0 );
6089 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
6090 aSig0
|= ( aSig1
!= 0 );
6091 if ( aExp
|| aSig0
) {
6092 aSig0
|= LIT64( 0x4000000000000000 );
6095 return roundAndPackFloat64(aSign
, aExp
, aSig0
, status
);
6099 /*----------------------------------------------------------------------------
6100 | Returns the result of converting the quadruple-precision floating-point
6101 | value `a' to the extended double-precision floating-point format. The
6102 | conversion is performed according to the IEC/IEEE Standard for Binary
6103 | Floating-Point Arithmetic.
6104 *----------------------------------------------------------------------------*/
6106 floatx80
float128_to_floatx80(float128 a
, float_status
*status
)
6110 uint64_t aSig0
, aSig1
;
6112 aSig1
= extractFloat128Frac1( a
);
6113 aSig0
= extractFloat128Frac0( a
);
6114 aExp
= extractFloat128Exp( a
);
6115 aSign
= extractFloat128Sign( a
);
6116 if ( aExp
== 0x7FFF ) {
6117 if ( aSig0
| aSig1
) {
6118 return commonNaNToFloatx80(float128ToCommonNaN(a
, status
), status
);
6120 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
6123 if ( ( aSig0
| aSig1
) == 0 ) return packFloatx80( aSign
, 0, 0 );
6124 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6127 aSig0
|= LIT64( 0x0001000000000000 );
6129 shortShift128Left( aSig0
, aSig1
, 15, &aSig0
, &aSig1
);
6130 return roundAndPackFloatx80(80, aSign
, aExp
, aSig0
, aSig1
, status
);
6134 /*----------------------------------------------------------------------------
6135 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6136 | returns the result as a quadruple-precision floating-point value. The
6137 | operation is performed according to the IEC/IEEE Standard for Binary
6138 | Floating-Point Arithmetic.
6139 *----------------------------------------------------------------------------*/
6141 float128
float128_round_to_int(float128 a
, float_status
*status
)
6145 uint64_t lastBitMask
, roundBitsMask
;
6148 aExp
= extractFloat128Exp( a
);
6149 if ( 0x402F <= aExp
) {
6150 if ( 0x406F <= aExp
) {
6151 if ( ( aExp
== 0x7FFF )
6152 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) )
6154 return propagateFloat128NaN(a
, a
, status
);
6159 lastBitMask
= ( lastBitMask
<<( 0x406E - aExp
) )<<1;
6160 roundBitsMask
= lastBitMask
- 1;
6162 switch (status
->float_rounding_mode
) {
6163 case float_round_nearest_even
:
6164 if ( lastBitMask
) {
6165 add128( z
.high
, z
.low
, 0, lastBitMask
>>1, &z
.high
, &z
.low
);
6166 if ( ( z
.low
& roundBitsMask
) == 0 ) z
.low
&= ~ lastBitMask
;
6169 if ( (int64_t) z
.low
< 0 ) {
6171 if ( (uint64_t) ( z
.low
<<1 ) == 0 ) z
.high
&= ~1;
6175 case float_round_ties_away
:
6177 add128(z
.high
, z
.low
, 0, lastBitMask
>> 1, &z
.high
, &z
.low
);
6179 if ((int64_t) z
.low
< 0) {
6184 case float_round_to_zero
:
6186 case float_round_up
:
6187 if (!extractFloat128Sign(z
)) {
6188 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
6191 case float_round_down
:
6192 if (extractFloat128Sign(z
)) {
6193 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
6199 z
.low
&= ~ roundBitsMask
;
6202 if ( aExp
< 0x3FFF ) {
6203 if ( ( ( (uint64_t) ( a
.high
<<1 ) ) | a
.low
) == 0 ) return a
;
6204 status
->float_exception_flags
|= float_flag_inexact
;
6205 aSign
= extractFloat128Sign( a
);
6206 switch (status
->float_rounding_mode
) {
6207 case float_round_nearest_even
:
6208 if ( ( aExp
== 0x3FFE )
6209 && ( extractFloat128Frac0( a
)
6210 | extractFloat128Frac1( a
) )
6212 return packFloat128( aSign
, 0x3FFF, 0, 0 );
6215 case float_round_ties_away
:
6216 if (aExp
== 0x3FFE) {
6217 return packFloat128(aSign
, 0x3FFF, 0, 0);
6220 case float_round_down
:
6222 aSign
? packFloat128( 1, 0x3FFF, 0, 0 )
6223 : packFloat128( 0, 0, 0, 0 );
6224 case float_round_up
:
6226 aSign
? packFloat128( 1, 0, 0, 0 )
6227 : packFloat128( 0, 0x3FFF, 0, 0 );
6229 return packFloat128( aSign
, 0, 0, 0 );
6232 lastBitMask
<<= 0x402F - aExp
;
6233 roundBitsMask
= lastBitMask
- 1;
6236 switch (status
->float_rounding_mode
) {
6237 case float_round_nearest_even
:
6238 z
.high
+= lastBitMask
>>1;
6239 if ( ( ( z
.high
& roundBitsMask
) | a
.low
) == 0 ) {
6240 z
.high
&= ~ lastBitMask
;
6243 case float_round_ties_away
:
6244 z
.high
+= lastBitMask
>>1;
6246 case float_round_to_zero
:
6248 case float_round_up
:
6249 if (!extractFloat128Sign(z
)) {
6250 z
.high
|= ( a
.low
!= 0 );
6251 z
.high
+= roundBitsMask
;
6254 case float_round_down
:
6255 if (extractFloat128Sign(z
)) {
6256 z
.high
|= (a
.low
!= 0);
6257 z
.high
+= roundBitsMask
;
6263 z
.high
&= ~ roundBitsMask
;
6265 if ( ( z
.low
!= a
.low
) || ( z
.high
!= a
.high
) ) {
6266 status
->float_exception_flags
|= float_flag_inexact
;
6272 /*----------------------------------------------------------------------------
6273 | Returns the result of adding the absolute values of the quadruple-precision
6274 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
6275 | before being returned. `zSign' is ignored if the result is a NaN.
6276 | The addition is performed according to the IEC/IEEE Standard for Binary
6277 | Floating-Point Arithmetic.
6278 *----------------------------------------------------------------------------*/
6280 static float128
addFloat128Sigs(float128 a
, float128 b
, flag zSign
,
6281 float_status
*status
)
6283 int32_t aExp
, bExp
, zExp
;
6284 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
6287 aSig1
= extractFloat128Frac1( a
);
6288 aSig0
= extractFloat128Frac0( a
);
6289 aExp
= extractFloat128Exp( a
);
6290 bSig1
= extractFloat128Frac1( b
);
6291 bSig0
= extractFloat128Frac0( b
);
6292 bExp
= extractFloat128Exp( b
);
6293 expDiff
= aExp
- bExp
;
6294 if ( 0 < expDiff
) {
6295 if ( aExp
== 0x7FFF ) {
6296 if (aSig0
| aSig1
) {
6297 return propagateFloat128NaN(a
, b
, status
);
6305 bSig0
|= LIT64( 0x0001000000000000 );
6307 shift128ExtraRightJamming(
6308 bSig0
, bSig1
, 0, expDiff
, &bSig0
, &bSig1
, &zSig2
);
6311 else if ( expDiff
< 0 ) {
6312 if ( bExp
== 0x7FFF ) {
6313 if (bSig0
| bSig1
) {
6314 return propagateFloat128NaN(a
, b
, status
);
6316 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6322 aSig0
|= LIT64( 0x0001000000000000 );
6324 shift128ExtraRightJamming(
6325 aSig0
, aSig1
, 0, - expDiff
, &aSig0
, &aSig1
, &zSig2
);
6329 if ( aExp
== 0x7FFF ) {
6330 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
6331 return propagateFloat128NaN(a
, b
, status
);
6335 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
6337 if (status
->flush_to_zero
) {
6338 if (zSig0
| zSig1
) {
6339 float_raise(float_flag_output_denormal
, status
);
6341 return packFloat128(zSign
, 0, 0, 0);
6343 return packFloat128( zSign
, 0, zSig0
, zSig1
);
6346 zSig0
|= LIT64( 0x0002000000000000 );
6350 aSig0
|= LIT64( 0x0001000000000000 );
6351 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
6353 if ( zSig0
< LIT64( 0x0002000000000000 ) ) goto roundAndPack
;
6356 shift128ExtraRightJamming(
6357 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
6359 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
6363 /*----------------------------------------------------------------------------
6364 | Returns the result of subtracting the absolute values of the quadruple-
6365 | precision floating-point values `a' and `b'. If `zSign' is 1, the
6366 | difference is negated before being returned. `zSign' is ignored if the
6367 | result is a NaN. The subtraction is performed according to the IEC/IEEE
6368 | Standard for Binary Floating-Point Arithmetic.
6369 *----------------------------------------------------------------------------*/
6371 static float128
subFloat128Sigs(float128 a
, float128 b
, flag zSign
,
6372 float_status
*status
)
6374 int32_t aExp
, bExp
, zExp
;
6375 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
;
6379 aSig1
= extractFloat128Frac1( a
);
6380 aSig0
= extractFloat128Frac0( a
);
6381 aExp
= extractFloat128Exp( a
);
6382 bSig1
= extractFloat128Frac1( b
);
6383 bSig0
= extractFloat128Frac0( b
);
6384 bExp
= extractFloat128Exp( b
);
6385 expDiff
= aExp
- bExp
;
6386 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
6387 shortShift128Left( bSig0
, bSig1
, 14, &bSig0
, &bSig1
);
6388 if ( 0 < expDiff
) goto aExpBigger
;
6389 if ( expDiff
< 0 ) goto bExpBigger
;
6390 if ( aExp
== 0x7FFF ) {
6391 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
6392 return propagateFloat128NaN(a
, b
, status
);
6394 float_raise(float_flag_invalid
, status
);
6395 z
.low
= float128_default_nan_low
;
6396 z
.high
= float128_default_nan_high
;
6403 if ( bSig0
< aSig0
) goto aBigger
;
6404 if ( aSig0
< bSig0
) goto bBigger
;
6405 if ( bSig1
< aSig1
) goto aBigger
;
6406 if ( aSig1
< bSig1
) goto bBigger
;
6407 return packFloat128(status
->float_rounding_mode
== float_round_down
,
6410 if ( bExp
== 0x7FFF ) {
6411 if (bSig0
| bSig1
) {
6412 return propagateFloat128NaN(a
, b
, status
);
6414 return packFloat128( zSign
^ 1, 0x7FFF, 0, 0 );
6420 aSig0
|= LIT64( 0x4000000000000000 );
6422 shift128RightJamming( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
6423 bSig0
|= LIT64( 0x4000000000000000 );
6425 sub128( bSig0
, bSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
6428 goto normalizeRoundAndPack
;
6430 if ( aExp
== 0x7FFF ) {
6431 if (aSig0
| aSig1
) {
6432 return propagateFloat128NaN(a
, b
, status
);
6440 bSig0
|= LIT64( 0x4000000000000000 );
6442 shift128RightJamming( bSig0
, bSig1
, expDiff
, &bSig0
, &bSig1
);
6443 aSig0
|= LIT64( 0x4000000000000000 );
6445 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
6447 normalizeRoundAndPack
:
6449 return normalizeRoundAndPackFloat128(zSign
, zExp
- 14, zSig0
, zSig1
,
6454 /*----------------------------------------------------------------------------
6455 | Returns the result of adding the quadruple-precision floating-point values
6456 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
6457 | for Binary Floating-Point Arithmetic.
6458 *----------------------------------------------------------------------------*/
6460 float128
float128_add(float128 a
, float128 b
, float_status
*status
)
6464 aSign
= extractFloat128Sign( a
);
6465 bSign
= extractFloat128Sign( b
);
6466 if ( aSign
== bSign
) {
6467 return addFloat128Sigs(a
, b
, aSign
, status
);
6470 return subFloat128Sigs(a
, b
, aSign
, status
);
6475 /*----------------------------------------------------------------------------
6476 | Returns the result of subtracting the quadruple-precision floating-point
6477 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6478 | Standard for Binary Floating-Point Arithmetic.
6479 *----------------------------------------------------------------------------*/
6481 float128
float128_sub(float128 a
, float128 b
, float_status
*status
)
6485 aSign
= extractFloat128Sign( a
);
6486 bSign
= extractFloat128Sign( b
);
6487 if ( aSign
== bSign
) {
6488 return subFloat128Sigs(a
, b
, aSign
, status
);
6491 return addFloat128Sigs(a
, b
, aSign
, status
);
6496 /*----------------------------------------------------------------------------
6497 | Returns the result of multiplying the quadruple-precision floating-point
6498 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6499 | Standard for Binary Floating-Point Arithmetic.
6500 *----------------------------------------------------------------------------*/
6502 float128
float128_mul(float128 a
, float128 b
, float_status
*status
)
6504 flag aSign
, bSign
, zSign
;
6505 int32_t aExp
, bExp
, zExp
;
6506 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
, zSig3
;
6509 aSig1
= extractFloat128Frac1( a
);
6510 aSig0
= extractFloat128Frac0( a
);
6511 aExp
= extractFloat128Exp( a
);
6512 aSign
= extractFloat128Sign( a
);
6513 bSig1
= extractFloat128Frac1( b
);
6514 bSig0
= extractFloat128Frac0( b
);
6515 bExp
= extractFloat128Exp( b
);
6516 bSign
= extractFloat128Sign( b
);
6517 zSign
= aSign
^ bSign
;
6518 if ( aExp
== 0x7FFF ) {
6519 if ( ( aSig0
| aSig1
)
6520 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
6521 return propagateFloat128NaN(a
, b
, status
);
6523 if ( ( bExp
| bSig0
| bSig1
) == 0 ) goto invalid
;
6524 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6526 if ( bExp
== 0x7FFF ) {
6527 if (bSig0
| bSig1
) {
6528 return propagateFloat128NaN(a
, b
, status
);
6530 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
6532 float_raise(float_flag_invalid
, status
);
6533 z
.low
= float128_default_nan_low
;
6534 z
.high
= float128_default_nan_high
;
6537 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6540 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
6541 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6544 if ( ( bSig0
| bSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
6545 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
6547 zExp
= aExp
+ bExp
- 0x4000;
6548 aSig0
|= LIT64( 0x0001000000000000 );
6549 shortShift128Left( bSig0
, bSig1
, 16, &bSig0
, &bSig1
);
6550 mul128To256( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
, &zSig2
, &zSig3
);
6551 add128( zSig0
, zSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
6552 zSig2
|= ( zSig3
!= 0 );
6553 if ( LIT64( 0x0002000000000000 ) <= zSig0
) {
6554 shift128ExtraRightJamming(
6555 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
6558 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
6562 /*----------------------------------------------------------------------------
6563 | Returns the result of dividing the quadruple-precision floating-point value
6564 | `a' by the corresponding value `b'. The operation is performed according to
6565 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6566 *----------------------------------------------------------------------------*/
6568 float128
float128_div(float128 a
, float128 b
, float_status
*status
)
6570 flag aSign
, bSign
, zSign
;
6571 int32_t aExp
, bExp
, zExp
;
6572 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
6573 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
6576 aSig1
= extractFloat128Frac1( a
);
6577 aSig0
= extractFloat128Frac0( a
);
6578 aExp
= extractFloat128Exp( a
);
6579 aSign
= extractFloat128Sign( a
);
6580 bSig1
= extractFloat128Frac1( b
);
6581 bSig0
= extractFloat128Frac0( b
);
6582 bExp
= extractFloat128Exp( b
);
6583 bSign
= extractFloat128Sign( b
);
6584 zSign
= aSign
^ bSign
;
6585 if ( aExp
== 0x7FFF ) {
6586 if (aSig0
| aSig1
) {
6587 return propagateFloat128NaN(a
, b
, status
);
6589 if ( bExp
== 0x7FFF ) {
6590 if (bSig0
| bSig1
) {
6591 return propagateFloat128NaN(a
, b
, status
);
6595 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6597 if ( bExp
== 0x7FFF ) {
6598 if (bSig0
| bSig1
) {
6599 return propagateFloat128NaN(a
, b
, status
);
6601 return packFloat128( zSign
, 0, 0, 0 );
6604 if ( ( bSig0
| bSig1
) == 0 ) {
6605 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
6607 float_raise(float_flag_invalid
, status
);
6608 z
.low
= float128_default_nan_low
;
6609 z
.high
= float128_default_nan_high
;
6612 float_raise(float_flag_divbyzero
, status
);
6613 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6615 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
6618 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
6619 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6621 zExp
= aExp
- bExp
+ 0x3FFD;
6623 aSig0
| LIT64( 0x0001000000000000 ), aSig1
, 15, &aSig0
, &aSig1
);
6625 bSig0
| LIT64( 0x0001000000000000 ), bSig1
, 15, &bSig0
, &bSig1
);
6626 if ( le128( bSig0
, bSig1
, aSig0
, aSig1
) ) {
6627 shift128Right( aSig0
, aSig1
, 1, &aSig0
, &aSig1
);
6630 zSig0
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
6631 mul128By64To192( bSig0
, bSig1
, zSig0
, &term0
, &term1
, &term2
);
6632 sub192( aSig0
, aSig1
, 0, term0
, term1
, term2
, &rem0
, &rem1
, &rem2
);
6633 while ( (int64_t) rem0
< 0 ) {
6635 add192( rem0
, rem1
, rem2
, 0, bSig0
, bSig1
, &rem0
, &rem1
, &rem2
);
6637 zSig1
= estimateDiv128To64( rem1
, rem2
, bSig0
);
6638 if ( ( zSig1
& 0x3FFF ) <= 4 ) {
6639 mul128By64To192( bSig0
, bSig1
, zSig1
, &term1
, &term2
, &term3
);
6640 sub192( rem1
, rem2
, 0, term1
, term2
, term3
, &rem1
, &rem2
, &rem3
);
6641 while ( (int64_t) rem1
< 0 ) {
6643 add192( rem1
, rem2
, rem3
, 0, bSig0
, bSig1
, &rem1
, &rem2
, &rem3
);
6645 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
6647 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 15, &zSig0
, &zSig1
, &zSig2
);
6648 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
6652 /*----------------------------------------------------------------------------
6653 | Returns the remainder of the quadruple-precision floating-point value `a'
6654 | with respect to the corresponding value `b'. The operation is performed
6655 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6656 *----------------------------------------------------------------------------*/
6658 float128
float128_rem(float128 a
, float128 b
, float_status
*status
)
6661 int32_t aExp
, bExp
, expDiff
;
6662 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, q
, term0
, term1
, term2
;
6663 uint64_t allZero
, alternateASig0
, alternateASig1
, sigMean1
;
6667 aSig1
= extractFloat128Frac1( a
);
6668 aSig0
= extractFloat128Frac0( a
);
6669 aExp
= extractFloat128Exp( a
);
6670 aSign
= extractFloat128Sign( a
);
6671 bSig1
= extractFloat128Frac1( b
);
6672 bSig0
= extractFloat128Frac0( b
);
6673 bExp
= extractFloat128Exp( b
);
6674 if ( aExp
== 0x7FFF ) {
6675 if ( ( aSig0
| aSig1
)
6676 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
6677 return propagateFloat128NaN(a
, b
, status
);
6681 if ( bExp
== 0x7FFF ) {
6682 if (bSig0
| bSig1
) {
6683 return propagateFloat128NaN(a
, b
, status
);
6688 if ( ( bSig0
| bSig1
) == 0 ) {
6690 float_raise(float_flag_invalid
, status
);
6691 z
.low
= float128_default_nan_low
;
6692 z
.high
= float128_default_nan_high
;
6695 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
6698 if ( ( aSig0
| aSig1
) == 0 ) return a
;
6699 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6701 expDiff
= aExp
- bExp
;
6702 if ( expDiff
< -1 ) return a
;
6704 aSig0
| LIT64( 0x0001000000000000 ),
6706 15 - ( expDiff
< 0 ),
6711 bSig0
| LIT64( 0x0001000000000000 ), bSig1
, 15, &bSig0
, &bSig1
);
6712 q
= le128( bSig0
, bSig1
, aSig0
, aSig1
);
6713 if ( q
) sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
6715 while ( 0 < expDiff
) {
6716 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
6717 q
= ( 4 < q
) ? q
- 4 : 0;
6718 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
6719 shortShift192Left( term0
, term1
, term2
, 61, &term1
, &term2
, &allZero
);
6720 shortShift128Left( aSig0
, aSig1
, 61, &aSig0
, &allZero
);
6721 sub128( aSig0
, 0, term1
, term2
, &aSig0
, &aSig1
);
6724 if ( -64 < expDiff
) {
6725 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
6726 q
= ( 4 < q
) ? q
- 4 : 0;
6728 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
6730 if ( expDiff
< 0 ) {
6731 shift128Right( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
6734 shortShift128Left( aSig0
, aSig1
, expDiff
, &aSig0
, &aSig1
);
6736 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
6737 sub128( aSig0
, aSig1
, term1
, term2
, &aSig0
, &aSig1
);
6740 shift128Right( aSig0
, aSig1
, 12, &aSig0
, &aSig1
);
6741 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
6744 alternateASig0
= aSig0
;
6745 alternateASig1
= aSig1
;
6747 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
6748 } while ( 0 <= (int64_t) aSig0
);
6750 aSig0
, aSig1
, alternateASig0
, alternateASig1
, (uint64_t *)&sigMean0
, &sigMean1
);
6751 if ( ( sigMean0
< 0 )
6752 || ( ( ( sigMean0
| sigMean1
) == 0 ) && ( q
& 1 ) ) ) {
6753 aSig0
= alternateASig0
;
6754 aSig1
= alternateASig1
;
6756 zSign
= ( (int64_t) aSig0
< 0 );
6757 if ( zSign
) sub128( 0, 0, aSig0
, aSig1
, &aSig0
, &aSig1
);
6758 return normalizeRoundAndPackFloat128(aSign
^ zSign
, bExp
- 4, aSig0
, aSig1
,
6762 /*----------------------------------------------------------------------------
6763 | Returns the square root of the quadruple-precision floating-point value `a'.
6764 | The operation is performed according to the IEC/IEEE Standard for Binary
6765 | Floating-Point Arithmetic.
6766 *----------------------------------------------------------------------------*/
6768 float128
float128_sqrt(float128 a
, float_status
*status
)
6772 uint64_t aSig0
, aSig1
, zSig0
, zSig1
, zSig2
, doubleZSig0
;
6773 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
6776 aSig1
= extractFloat128Frac1( a
);
6777 aSig0
= extractFloat128Frac0( a
);
6778 aExp
= extractFloat128Exp( a
);
6779 aSign
= extractFloat128Sign( a
);
6780 if ( aExp
== 0x7FFF ) {
6781 if (aSig0
| aSig1
) {
6782 return propagateFloat128NaN(a
, a
, status
);
6784 if ( ! aSign
) return a
;
6788 if ( ( aExp
| aSig0
| aSig1
) == 0 ) return a
;
6790 float_raise(float_flag_invalid
, status
);
6791 z
.low
= float128_default_nan_low
;
6792 z
.high
= float128_default_nan_high
;
6796 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( 0, 0, 0, 0 );
6797 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6799 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFE;
6800 aSig0
|= LIT64( 0x0001000000000000 );
6801 zSig0
= estimateSqrt32( aExp
, aSig0
>>17 );
6802 shortShift128Left( aSig0
, aSig1
, 13 - ( aExp
& 1 ), &aSig0
, &aSig1
);
6803 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
6804 doubleZSig0
= zSig0
<<1;
6805 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
6806 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
6807 while ( (int64_t) rem0
< 0 ) {
6810 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
6812 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
6813 if ( ( zSig1
& 0x1FFF ) <= 5 ) {
6814 if ( zSig1
== 0 ) zSig1
= 1;
6815 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
6816 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
6817 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
6818 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
6819 while ( (int64_t) rem1
< 0 ) {
6821 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
6823 term2
|= doubleZSig0
;
6824 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
6826 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
6828 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 14, &zSig0
, &zSig1
, &zSig2
);
6829 return roundAndPackFloat128(0, zExp
, zSig0
, zSig1
, zSig2
, status
);
6833 /*----------------------------------------------------------------------------
6834 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6835 | the corresponding value `b', and 0 otherwise. The invalid exception is
6836 | raised if either operand is a NaN. Otherwise, the comparison is performed
6837 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6838 *----------------------------------------------------------------------------*/
6840 int float128_eq(float128 a
, float128 b
, float_status
*status
)
6843 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6844 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6845 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6846 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6848 float_raise(float_flag_invalid
, status
);
6853 && ( ( a
.high
== b
.high
)
6855 && ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
6860 /*----------------------------------------------------------------------------
6861 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6862 | or equal to the corresponding value `b', and 0 otherwise. The invalid
6863 | exception is raised if either operand is a NaN. The comparison is performed
6864 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6865 *----------------------------------------------------------------------------*/
6867 int float128_le(float128 a
, float128 b
, float_status
*status
)
6871 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6872 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6873 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6874 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6876 float_raise(float_flag_invalid
, status
);
6879 aSign
= extractFloat128Sign( a
);
6880 bSign
= extractFloat128Sign( b
);
6881 if ( aSign
!= bSign
) {
6884 || ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
6888 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
6889 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
6893 /*----------------------------------------------------------------------------
6894 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6895 | the corresponding value `b', and 0 otherwise. The invalid exception is
6896 | raised if either operand is a NaN. The comparison is performed according
6897 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6898 *----------------------------------------------------------------------------*/
6900 int float128_lt(float128 a
, float128 b
, float_status
*status
)
6904 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6905 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6906 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6907 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6909 float_raise(float_flag_invalid
, status
);
6912 aSign
= extractFloat128Sign( a
);
6913 bSign
= extractFloat128Sign( b
);
6914 if ( aSign
!= bSign
) {
6917 && ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
6921 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
6922 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
6926 /*----------------------------------------------------------------------------
6927 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6928 | be compared, and 0 otherwise. The invalid exception is raised if either
6929 | operand is a NaN. The comparison is performed according to the IEC/IEEE
6930 | Standard for Binary Floating-Point Arithmetic.
6931 *----------------------------------------------------------------------------*/
6933 int float128_unordered(float128 a
, float128 b
, float_status
*status
)
6935 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6936 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6937 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6938 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6940 float_raise(float_flag_invalid
, status
);
6946 /*----------------------------------------------------------------------------
6947 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6948 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6949 | exception. The comparison is performed according to the IEC/IEEE Standard
6950 | for Binary Floating-Point Arithmetic.
6951 *----------------------------------------------------------------------------*/
6953 int float128_eq_quiet(float128 a
, float128 b
, float_status
*status
)
6956 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6957 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6958 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6959 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6961 if ( float128_is_signaling_nan( a
)
6962 || float128_is_signaling_nan( b
) ) {
6963 float_raise(float_flag_invalid
, status
);
6969 && ( ( a
.high
== b
.high
)
6971 && ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
6976 /*----------------------------------------------------------------------------
6977 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6978 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
6979 | cause an exception. Otherwise, the comparison is performed according to the
6980 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6981 *----------------------------------------------------------------------------*/
6983 int float128_le_quiet(float128 a
, float128 b
, float_status
*status
)
6987 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6988 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6989 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6990 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6992 if ( float128_is_signaling_nan( a
)
6993 || float128_is_signaling_nan( b
) ) {
6994 float_raise(float_flag_invalid
, status
);
6998 aSign
= extractFloat128Sign( a
);
6999 bSign
= extractFloat128Sign( b
);
7000 if ( aSign
!= bSign
) {
7003 || ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
7007 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
7008 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
7012 /*----------------------------------------------------------------------------
7013 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7014 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
7015 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
7016 | Standard for Binary Floating-Point Arithmetic.
7017 *----------------------------------------------------------------------------*/
7019 int float128_lt_quiet(float128 a
, float128 b
, float_status
*status
)
7023 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
7024 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
7025 || ( ( extractFloat128Exp( b
) == 0x7FFF )
7026 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
7028 if ( float128_is_signaling_nan( a
)
7029 || float128_is_signaling_nan( b
) ) {
7030 float_raise(float_flag_invalid
, status
);
7034 aSign
= extractFloat128Sign( a
);
7035 bSign
= extractFloat128Sign( b
);
7036 if ( aSign
!= bSign
) {
7039 && ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
7043 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
7044 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
7048 /*----------------------------------------------------------------------------
7049 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7050 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
7051 | comparison is performed according to the IEC/IEEE Standard for Binary
7052 | Floating-Point Arithmetic.
7053 *----------------------------------------------------------------------------*/
7055 int float128_unordered_quiet(float128 a
, float128 b
, float_status
*status
)
7057 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
7058 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
7059 || ( ( extractFloat128Exp( b
) == 0x7FFF )
7060 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
7062 if ( float128_is_signaling_nan( a
)
7063 || float128_is_signaling_nan( b
) ) {
7064 float_raise(float_flag_invalid
, status
);
7071 /* misc functions */
7072 float32
uint32_to_float32(uint32_t a
, float_status
*status
)
7074 return int64_to_float32(a
, status
);
7077 float64
uint32_to_float64(uint32_t a
, float_status
*status
)
7079 return int64_to_float64(a
, status
);
7082 uint32_t float32_to_uint32(float32 a
, float_status
*status
)
7086 int old_exc_flags
= get_float_exception_flags(status
);
7088 v
= float32_to_int64(a
, status
);
7091 } else if (v
> 0xffffffff) {
7096 set_float_exception_flags(old_exc_flags
, status
);
7097 float_raise(float_flag_invalid
, status
);
7101 uint32_t float32_to_uint32_round_to_zero(float32 a
, float_status
*status
)
7105 int old_exc_flags
= get_float_exception_flags(status
);
7107 v
= float32_to_int64_round_to_zero(a
, status
);
7110 } else if (v
> 0xffffffff) {
7115 set_float_exception_flags(old_exc_flags
, status
);
7116 float_raise(float_flag_invalid
, status
);
7120 int_fast16_t float32_to_int16(float32 a
, float_status
*status
)
7124 int old_exc_flags
= get_float_exception_flags(status
);
7126 v
= float32_to_int32(a
, status
);
7129 } else if (v
> 0x7fff) {
7135 set_float_exception_flags(old_exc_flags
, status
);
7136 float_raise(float_flag_invalid
, status
);
7140 uint_fast16_t float32_to_uint16(float32 a
, float_status
*status
)
7144 int old_exc_flags
= get_float_exception_flags(status
);
7146 v
= float32_to_int32(a
, status
);
7149 } else if (v
> 0xffff) {
7155 set_float_exception_flags(old_exc_flags
, status
);
7156 float_raise(float_flag_invalid
, status
);
7160 uint_fast16_t float32_to_uint16_round_to_zero(float32 a
, float_status
*status
)
7164 int old_exc_flags
= get_float_exception_flags(status
);
7166 v
= float32_to_int64_round_to_zero(a
, status
);
7169 } else if (v
> 0xffff) {
7174 set_float_exception_flags(old_exc_flags
, status
);
7175 float_raise(float_flag_invalid
, status
);
7179 uint32_t float64_to_uint32(float64 a
, float_status
*status
)
7183 int old_exc_flags
= get_float_exception_flags(status
);
7185 v
= float64_to_uint64(a
, status
);
7186 if (v
> 0xffffffff) {
7191 set_float_exception_flags(old_exc_flags
, status
);
7192 float_raise(float_flag_invalid
, status
);
7196 uint32_t float64_to_uint32_round_to_zero(float64 a
, float_status
*status
)
7200 int old_exc_flags
= get_float_exception_flags(status
);
7202 v
= float64_to_uint64_round_to_zero(a
, status
);
7203 if (v
> 0xffffffff) {
7208 set_float_exception_flags(old_exc_flags
, status
);
7209 float_raise(float_flag_invalid
, status
);
7213 int_fast16_t float64_to_int16(float64 a
, float_status
*status
)
7217 int old_exc_flags
= get_float_exception_flags(status
);
7219 v
= float64_to_int32(a
, status
);
7222 } else if (v
> 0x7fff) {
7228 set_float_exception_flags(old_exc_flags
, status
);
7229 float_raise(float_flag_invalid
, status
);
7233 uint_fast16_t float64_to_uint16(float64 a
, float_status
*status
)
7237 int old_exc_flags
= get_float_exception_flags(status
);
7239 v
= float64_to_int32(a
, status
);
7242 } else if (v
> 0xffff) {
7248 set_float_exception_flags(old_exc_flags
, status
);
7249 float_raise(float_flag_invalid
, status
);
7253 uint_fast16_t float64_to_uint16_round_to_zero(float64 a
, float_status
*status
)
7257 int old_exc_flags
= get_float_exception_flags(status
);
7259 v
= float64_to_int64_round_to_zero(a
, status
);
7262 } else if (v
> 0xffff) {
7267 set_float_exception_flags(old_exc_flags
, status
);
7268 float_raise(float_flag_invalid
, status
);
7272 /*----------------------------------------------------------------------------
7273 | Returns the result of converting the double-precision floating-point value
7274 | `a' to the 64-bit unsigned integer format. The conversion is
7275 | performed according to the IEC/IEEE Standard for Binary Floating-Point
7276 | Arithmetic---which means in particular that the conversion is rounded
7277 | according to the current rounding mode. If `a' is a NaN, the largest
7278 | positive integer is returned. If the conversion overflows, the
7279 | largest unsigned integer is returned. If 'a' is negative, the value is
7280 | rounded and zero is returned; negative values that do not round to zero
7281 | will raise the inexact exception.
7282 *----------------------------------------------------------------------------*/
7284 uint64_t float64_to_uint64(float64 a
, float_status
*status
)
7287 int_fast16_t aExp
, shiftCount
;
7288 uint64_t aSig
, aSigExtra
;
7289 a
= float64_squash_input_denormal(a
, status
);
7291 aSig
= extractFloat64Frac(a
);
7292 aExp
= extractFloat64Exp(a
);
7293 aSign
= extractFloat64Sign(a
);
7294 if (aSign
&& (aExp
> 1022)) {
7295 float_raise(float_flag_invalid
, status
);
7296 if (float64_is_any_nan(a
)) {
7297 return LIT64(0xFFFFFFFFFFFFFFFF);
7303 aSig
|= LIT64(0x0010000000000000);
7305 shiftCount
= 0x433 - aExp
;
7306 if (shiftCount
<= 0) {
7308 float_raise(float_flag_invalid
, status
);
7309 return LIT64(0xFFFFFFFFFFFFFFFF);
7312 aSig
<<= -shiftCount
;
7314 shift64ExtraRightJamming(aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
7316 return roundAndPackUint64(aSign
, aSig
, aSigExtra
, status
);
7319 uint64_t float64_to_uint64_round_to_zero(float64 a
, float_status
*status
)
7321 signed char current_rounding_mode
= status
->float_rounding_mode
;
7322 set_float_rounding_mode(float_round_to_zero
, status
);
7323 int64_t v
= float64_to_uint64(a
, status
);
7324 set_float_rounding_mode(current_rounding_mode
, status
);
7328 #define COMPARE(s, nan_exp) \
7329 static inline int float ## s ## _compare_internal(float ## s a, float ## s b,\
7330 int is_quiet, float_status *status) \
7332 flag aSign, bSign; \
7333 uint ## s ## _t av, bv; \
7334 a = float ## s ## _squash_input_denormal(a, status); \
7335 b = float ## s ## _squash_input_denormal(b, status); \
7337 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \
7338 extractFloat ## s ## Frac( a ) ) || \
7339 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \
7340 extractFloat ## s ## Frac( b ) )) { \
7342 float ## s ## _is_signaling_nan( a ) || \
7343 float ## s ## _is_signaling_nan( b ) ) { \
7344 float_raise(float_flag_invalid, status); \
7346 return float_relation_unordered; \
7348 aSign = extractFloat ## s ## Sign( a ); \
7349 bSign = extractFloat ## s ## Sign( b ); \
7350 av = float ## s ## _val(a); \
7351 bv = float ## s ## _val(b); \
7352 if ( aSign != bSign ) { \
7353 if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) { \
7355 return float_relation_equal; \
7357 return 1 - (2 * aSign); \
7361 return float_relation_equal; \
7363 return 1 - 2 * (aSign ^ ( av < bv )); \
7368 int float ## s ## _compare(float ## s a, float ## s b, float_status *status) \
7370 return float ## s ## _compare_internal(a, b, 0, status); \
7373 int float ## s ## _compare_quiet(float ## s a, float ## s b, \
7374 float_status *status) \
7376 return float ## s ## _compare_internal(a, b, 1, status); \
7382 static inline int floatx80_compare_internal(floatx80 a
, floatx80 b
,
7383 int is_quiet
, float_status
*status
)
7387 if (( ( extractFloatx80Exp( a
) == 0x7fff ) &&
7388 ( extractFloatx80Frac( a
)<<1 ) ) ||
7389 ( ( extractFloatx80Exp( b
) == 0x7fff ) &&
7390 ( extractFloatx80Frac( b
)<<1 ) )) {
7392 floatx80_is_signaling_nan( a
) ||
7393 floatx80_is_signaling_nan( b
) ) {
7394 float_raise(float_flag_invalid
, status
);
7396 return float_relation_unordered
;
7398 aSign
= extractFloatx80Sign( a
);
7399 bSign
= extractFloatx80Sign( b
);
7400 if ( aSign
!= bSign
) {
7402 if ( ( ( (uint16_t) ( ( a
.high
| b
.high
) << 1 ) ) == 0) &&
7403 ( ( a
.low
| b
.low
) == 0 ) ) {
7405 return float_relation_equal
;
7407 return 1 - (2 * aSign
);
7410 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
7411 return float_relation_equal
;
7413 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
7418 int floatx80_compare(floatx80 a
, floatx80 b
, float_status
*status
)
7420 return floatx80_compare_internal(a
, b
, 0, status
);
7423 int floatx80_compare_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
7425 return floatx80_compare_internal(a
, b
, 1, status
);
7428 static inline int float128_compare_internal(float128 a
, float128 b
,
7429 int is_quiet
, float_status
*status
)
7433 if (( ( extractFloat128Exp( a
) == 0x7fff ) &&
7434 ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) ) ||
7435 ( ( extractFloat128Exp( b
) == 0x7fff ) &&
7436 ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )) {
7438 float128_is_signaling_nan( a
) ||
7439 float128_is_signaling_nan( b
) ) {
7440 float_raise(float_flag_invalid
, status
);
7442 return float_relation_unordered
;
7444 aSign
= extractFloat128Sign( a
);
7445 bSign
= extractFloat128Sign( b
);
7446 if ( aSign
!= bSign
) {
7447 if ( ( ( ( a
.high
| b
.high
)<<1 ) | a
.low
| b
.low
) == 0 ) {
7449 return float_relation_equal
;
7451 return 1 - (2 * aSign
);
7454 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
7455 return float_relation_equal
;
7457 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
7462 int float128_compare(float128 a
, float128 b
, float_status
*status
)
7464 return float128_compare_internal(a
, b
, 0, status
);
7467 int float128_compare_quiet(float128 a
, float128 b
, float_status
*status
)
7469 return float128_compare_internal(a
, b
, 1, status
);
7472 /* min() and max() functions. These can't be implemented as
7473 * 'compare and pick one input' because that would mishandle
7474 * NaNs and +0 vs -0.
7476 * minnum() and maxnum() functions. These are similar to the min()
7477 * and max() functions but if one of the arguments is a QNaN and
7478 * the other is numerical then the numerical argument is returned.
7479 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
7480 * and maxNum() operations. min() and max() are the typical min/max
7481 * semantics provided by many CPUs which predate that specification.
7483 * minnummag() and maxnummag() functions correspond to minNumMag()
7484 * and minNumMag() from the IEEE-754 2008.
7487 static inline float ## s float ## s ## _minmax(float ## s a, float ## s b, \
7488 int ismin, int isieee, \
7490 float_status *status) \
7492 flag aSign, bSign; \
7493 uint ## s ## _t av, bv, aav, abv; \
7494 a = float ## s ## _squash_input_denormal(a, status); \
7495 b = float ## s ## _squash_input_denormal(b, status); \
7496 if (float ## s ## _is_any_nan(a) || \
7497 float ## s ## _is_any_nan(b)) { \
7499 if (float ## s ## _is_quiet_nan(a) && \
7500 !float ## s ##_is_any_nan(b)) { \
7502 } else if (float ## s ## _is_quiet_nan(b) && \
7503 !float ## s ## _is_any_nan(a)) { \
7507 return propagateFloat ## s ## NaN(a, b, status); \
7509 aSign = extractFloat ## s ## Sign(a); \
7510 bSign = extractFloat ## s ## Sign(b); \
7511 av = float ## s ## _val(a); \
7512 bv = float ## s ## _val(b); \
7514 aav = float ## s ## _abs(av); \
7515 abv = float ## s ## _abs(bv); \
7518 return (aav < abv) ? a : b; \
7520 return (aav < abv) ? b : a; \
7524 if (aSign != bSign) { \
7526 return aSign ? a : b; \
7528 return aSign ? b : a; \
7532 return (aSign ^ (av < bv)) ? a : b; \
7534 return (aSign ^ (av < bv)) ? b : a; \
7539 float ## s float ## s ## _min(float ## s a, float ## s b, \
7540 float_status *status) \
7542 return float ## s ## _minmax(a, b, 1, 0, 0, status); \
7545 float ## s float ## s ## _max(float ## s a, float ## s b, \
7546 float_status *status) \
7548 return float ## s ## _minmax(a, b, 0, 0, 0, status); \
7551 float ## s float ## s ## _minnum(float ## s a, float ## s b, \
7552 float_status *status) \
7554 return float ## s ## _minmax(a, b, 1, 1, 0, status); \
7557 float ## s float ## s ## _maxnum(float ## s a, float ## s b, \
7558 float_status *status) \
7560 return float ## s ## _minmax(a, b, 0, 1, 0, status); \
7563 float ## s float ## s ## _minnummag(float ## s a, float ## s b, \
7564 float_status *status) \
7566 return float ## s ## _minmax(a, b, 1, 1, 1, status); \
7569 float ## s float ## s ## _maxnummag(float ## s a, float ## s b, \
7570 float_status *status) \
7572 return float ## s ## _minmax(a, b, 0, 1, 1, status); \
7579 /* Multiply A by 2 raised to the power N. */
7580 float32
float32_scalbn(float32 a
, int n
, float_status
*status
)
7586 a
= float32_squash_input_denormal(a
, status
);
7587 aSig
= extractFloat32Frac( a
);
7588 aExp
= extractFloat32Exp( a
);
7589 aSign
= extractFloat32Sign( a
);
7591 if ( aExp
== 0xFF ) {
7593 return propagateFloat32NaN(a
, a
, status
);
7599 } else if (aSig
== 0) {
7607 } else if (n
< -0x200) {
7613 return normalizeRoundAndPackFloat32(aSign
, aExp
, aSig
, status
);
7616 float64
float64_scalbn(float64 a
, int n
, float_status
*status
)
7622 a
= float64_squash_input_denormal(a
, status
);
7623 aSig
= extractFloat64Frac( a
);
7624 aExp
= extractFloat64Exp( a
);
7625 aSign
= extractFloat64Sign( a
);
7627 if ( aExp
== 0x7FF ) {
7629 return propagateFloat64NaN(a
, a
, status
);
7634 aSig
|= LIT64( 0x0010000000000000 );
7635 } else if (aSig
== 0) {
7643 } else if (n
< -0x1000) {
7649 return normalizeRoundAndPackFloat64(aSign
, aExp
, aSig
, status
);
7652 floatx80
floatx80_scalbn(floatx80 a
, int n
, float_status
*status
)
7658 aSig
= extractFloatx80Frac( a
);
7659 aExp
= extractFloatx80Exp( a
);
7660 aSign
= extractFloatx80Sign( a
);
7662 if ( aExp
== 0x7FFF ) {
7664 return propagateFloatx80NaN(a
, a
, status
);
7678 } else if (n
< -0x10000) {
7683 return normalizeRoundAndPackFloatx80(status
->floatx80_rounding_precision
,
7684 aSign
, aExp
, aSig
, 0, status
);
7687 float128
float128_scalbn(float128 a
, int n
, float_status
*status
)
7691 uint64_t aSig0
, aSig1
;
7693 aSig1
= extractFloat128Frac1( a
);
7694 aSig0
= extractFloat128Frac0( a
);
7695 aExp
= extractFloat128Exp( a
);
7696 aSign
= extractFloat128Sign( a
);
7697 if ( aExp
== 0x7FFF ) {
7698 if ( aSig0
| aSig1
) {
7699 return propagateFloat128NaN(a
, a
, status
);
7704 aSig0
|= LIT64( 0x0001000000000000 );
7705 } else if (aSig0
== 0 && aSig1
== 0) {
7713 } else if (n
< -0x10000) {
7718 return normalizeRoundAndPackFloat128( aSign
, aExp
, aSig0
, aSig1