4 * Derived from SoftFloat.
7 /*============================================================================
9 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
12 Written by John R. Hauser. This work was made possible in part by the
13 International Computer Science Institute, located at Suite 600, 1947 Center
14 Street, Berkeley, California 94704. Funding was partially provided by the
15 National Science Foundation under grant MIP-9311980. The original version
16 of this code was written as part of a project to build a fixed-point vector
17 processor in collaboration with the University of California at Berkeley,
18 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
19 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
20 arithmetic/SoftFloat.html'.
22 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
23 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
24 RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
25 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
26 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
27 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
28 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
29 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
31 Derivative works are acceptable, even for commercial purposes, so long as
32 (1) the source code for the derivative work includes prominent notice that
33 the work is derivative, and (2) the source code includes prominent notice with
34 these four paragraphs for those parts of this code that are retained.
36 =============================================================================*/
38 /* softfloat (and in particular the code in softfloat-specialize.h) is
39 * target-dependent and needs the TARGET_* macros.
43 #include "fpu/softfloat.h"
45 /* We only need stdlib for abort() */
48 /*----------------------------------------------------------------------------
49 | Primitive arithmetic functions, including multi-word arithmetic, and
50 | division and square root approximations. (Can be specialized to target if
52 *----------------------------------------------------------------------------*/
53 #include "softfloat-macros.h"
55 /*----------------------------------------------------------------------------
56 | Functions and definitions to determine: (1) whether tininess for underflow
57 | is detected before or after rounding by default, (2) what (if anything)
58 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
59 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
60 | are propagated from function inputs to output. These details are target-
62 *----------------------------------------------------------------------------*/
63 #include "softfloat-specialize.h"
65 /*----------------------------------------------------------------------------
66 | Returns the fraction bits of the half-precision floating-point value `a'.
67 *----------------------------------------------------------------------------*/
69 INLINE
uint32_t extractFloat16Frac(float16 a
)
71 return float16_val(a
) & 0x3ff;
74 /*----------------------------------------------------------------------------
75 | Returns the exponent bits of the half-precision floating-point value `a'.
76 *----------------------------------------------------------------------------*/
78 INLINE
int_fast16_t extractFloat16Exp(float16 a
)
80 return (float16_val(a
) >> 10) & 0x1f;
83 /*----------------------------------------------------------------------------
84 | Returns the sign bit of the single-precision floating-point value `a'.
85 *----------------------------------------------------------------------------*/
87 INLINE flag
extractFloat16Sign(float16 a
)
89 return float16_val(a
)>>15;
92 /*----------------------------------------------------------------------------
93 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
94 | and 7, and returns the properly rounded 32-bit integer corresponding to the
95 | input. If `zSign' is 1, the input is negated before being converted to an
96 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
97 | is simply rounded to an integer, with the inexact exception raised if the
98 | input cannot be represented exactly as an integer. However, if the fixed-
99 | point input is too large, the invalid exception is raised and the largest
100 | positive or negative integer is returned.
101 *----------------------------------------------------------------------------*/
103 static int32
roundAndPackInt32( flag zSign
, uint64_t absZ STATUS_PARAM
)
106 flag roundNearestEven
;
107 int8 roundIncrement
, roundBits
;
110 roundingMode
= STATUS(float_rounding_mode
);
111 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
112 switch (roundingMode
) {
113 case float_round_nearest_even
:
114 case float_round_ties_away
:
115 roundIncrement
= 0x40;
117 case float_round_to_zero
:
121 roundIncrement
= zSign
? 0 : 0x7f;
123 case float_round_down
:
124 roundIncrement
= zSign
? 0x7f : 0;
129 roundBits
= absZ
& 0x7F;
130 absZ
= ( absZ
+ roundIncrement
)>>7;
131 absZ
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
133 if ( zSign
) z
= - z
;
134 if ( ( absZ
>>32 ) || ( z
&& ( ( z
< 0 ) ^ zSign
) ) ) {
135 float_raise( float_flag_invalid STATUS_VAR
);
136 return zSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
138 if ( roundBits
) STATUS(float_exception_flags
) |= float_flag_inexact
;
143 /*----------------------------------------------------------------------------
144 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
145 | `absZ1', with binary point between bits 63 and 64 (between the input words),
146 | and returns the properly rounded 64-bit integer corresponding to the input.
147 | If `zSign' is 1, the input is negated before being converted to an integer.
148 | Ordinarily, the fixed-point input is simply rounded to an integer, with
149 | the inexact exception raised if the input cannot be represented exactly as
150 | an integer. However, if the fixed-point input is too large, the invalid
151 | exception is raised and the largest positive or negative integer is
153 *----------------------------------------------------------------------------*/
155 static int64
roundAndPackInt64( flag zSign
, uint64_t absZ0
, uint64_t absZ1 STATUS_PARAM
)
158 flag roundNearestEven
, increment
;
161 roundingMode
= STATUS(float_rounding_mode
);
162 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
163 switch (roundingMode
) {
164 case float_round_nearest_even
:
165 case float_round_ties_away
:
166 increment
= ((int64_t) absZ1
< 0);
168 case float_round_to_zero
:
172 increment
= !zSign
&& absZ1
;
174 case float_round_down
:
175 increment
= zSign
&& absZ1
;
182 if ( absZ0
== 0 ) goto overflow
;
183 absZ0
&= ~ ( ( (uint64_t) ( absZ1
<<1 ) == 0 ) & roundNearestEven
);
186 if ( zSign
) z
= - z
;
187 if ( z
&& ( ( z
< 0 ) ^ zSign
) ) {
189 float_raise( float_flag_invalid STATUS_VAR
);
191 zSign
? (int64_t) LIT64( 0x8000000000000000 )
192 : LIT64( 0x7FFFFFFFFFFFFFFF );
194 if ( absZ1
) STATUS(float_exception_flags
) |= float_flag_inexact
;
199 /*----------------------------------------------------------------------------
200 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
201 | `absZ1', with binary point between bits 63 and 64 (between the input words),
202 | and returns the properly rounded 64-bit unsigned integer corresponding to the
203 | input. Ordinarily, the fixed-point input is simply rounded to an integer,
204 | with the inexact exception raised if the input cannot be represented exactly
205 | as an integer. However, if the fixed-point input is too large, the invalid
206 | exception is raised and the largest unsigned integer is returned.
207 *----------------------------------------------------------------------------*/
209 static int64
roundAndPackUint64(flag zSign
, uint64_t absZ0
,
210 uint64_t absZ1 STATUS_PARAM
)
213 flag roundNearestEven
, increment
;
215 roundingMode
= STATUS(float_rounding_mode
);
216 roundNearestEven
= (roundingMode
== float_round_nearest_even
);
217 switch (roundingMode
) {
218 case float_round_nearest_even
:
219 case float_round_ties_away
:
220 increment
= ((int64_t)absZ1
< 0);
222 case float_round_to_zero
:
226 increment
= !zSign
&& absZ1
;
228 case float_round_down
:
229 increment
= zSign
&& absZ1
;
237 float_raise(float_flag_invalid STATUS_VAR
);
238 return LIT64(0xFFFFFFFFFFFFFFFF);
240 absZ0
&= ~(((uint64_t)(absZ1
<<1) == 0) & roundNearestEven
);
243 if (zSign
&& absZ0
) {
244 float_raise(float_flag_invalid STATUS_VAR
);
249 STATUS(float_exception_flags
) |= float_flag_inexact
;
254 /*----------------------------------------------------------------------------
255 | Returns the fraction bits of the single-precision floating-point value `a'.
256 *----------------------------------------------------------------------------*/
258 INLINE
uint32_t extractFloat32Frac( float32 a
)
261 return float32_val(a
) & 0x007FFFFF;
265 /*----------------------------------------------------------------------------
266 | Returns the exponent bits of the single-precision floating-point value `a'.
267 *----------------------------------------------------------------------------*/
269 INLINE
int_fast16_t extractFloat32Exp(float32 a
)
272 return ( float32_val(a
)>>23 ) & 0xFF;
276 /*----------------------------------------------------------------------------
277 | Returns the sign bit of the single-precision floating-point value `a'.
278 *----------------------------------------------------------------------------*/
280 INLINE flag
extractFloat32Sign( float32 a
)
283 return float32_val(a
)>>31;
287 /*----------------------------------------------------------------------------
288 | If `a' is denormal and we are in flush-to-zero mode then set the
289 | input-denormal exception and return zero. Otherwise just return the value.
290 *----------------------------------------------------------------------------*/
291 float32
float32_squash_input_denormal(float32 a STATUS_PARAM
)
293 if (STATUS(flush_inputs_to_zero
)) {
294 if (extractFloat32Exp(a
) == 0 && extractFloat32Frac(a
) != 0) {
295 float_raise(float_flag_input_denormal STATUS_VAR
);
296 return make_float32(float32_val(a
) & 0x80000000);
302 /*----------------------------------------------------------------------------
303 | Normalizes the subnormal single-precision floating-point value represented
304 | by the denormalized significand `aSig'. The normalized exponent and
305 | significand are stored at the locations pointed to by `zExpPtr' and
306 | `zSigPtr', respectively.
307 *----------------------------------------------------------------------------*/
310 normalizeFloat32Subnormal(uint32_t aSig
, int_fast16_t *zExpPtr
, uint32_t *zSigPtr
)
314 shiftCount
= countLeadingZeros32( aSig
) - 8;
315 *zSigPtr
= aSig
<<shiftCount
;
316 *zExpPtr
= 1 - shiftCount
;
320 /*----------------------------------------------------------------------------
321 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
322 | single-precision floating-point value, returning the result. After being
323 | shifted into the proper positions, the three fields are simply added
324 | together to form the result. This means that any integer portion of `zSig'
325 | will be added into the exponent. Since a properly normalized significand
326 | will have an integer portion equal to 1, the `zExp' input should be 1 less
327 | than the desired result exponent whenever `zSig' is a complete, normalized
329 *----------------------------------------------------------------------------*/
331 INLINE float32
packFloat32(flag zSign
, int_fast16_t zExp
, uint32_t zSig
)
335 ( ( (uint32_t) zSign
)<<31 ) + ( ( (uint32_t) zExp
)<<23 ) + zSig
);
339 /*----------------------------------------------------------------------------
340 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
341 | and significand `zSig', and returns the proper single-precision floating-
342 | point value corresponding to the abstract input. Ordinarily, the abstract
343 | value is simply rounded and packed into the single-precision format, with
344 | the inexact exception raised if the abstract input cannot be represented
345 | exactly. However, if the abstract value is too large, the overflow and
346 | inexact exceptions are raised and an infinity or maximal finite value is
347 | returned. If the abstract value is too small, the input value is rounded to
348 | a subnormal number, and the underflow and inexact exceptions are raised if
349 | the abstract input cannot be represented exactly as a subnormal single-
350 | precision floating-point number.
351 | The input significand `zSig' has its binary point between bits 30
352 | and 29, which is 7 bits to the left of the usual location. This shifted
353 | significand must be normalized or smaller. If `zSig' is not normalized,
354 | `zExp' must be 0; in that case, the result returned is a subnormal number,
355 | and it must not require rounding. In the usual case that `zSig' is
356 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
357 | The handling of underflow and overflow follows the IEC/IEEE Standard for
358 | Binary Floating-Point Arithmetic.
359 *----------------------------------------------------------------------------*/
361 static float32
roundAndPackFloat32(flag zSign
, int_fast16_t zExp
, uint32_t zSig STATUS_PARAM
)
364 flag roundNearestEven
;
365 int8 roundIncrement
, roundBits
;
368 roundingMode
= STATUS(float_rounding_mode
);
369 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
370 switch (roundingMode
) {
371 case float_round_nearest_even
:
372 case float_round_ties_away
:
373 roundIncrement
= 0x40;
375 case float_round_to_zero
:
379 roundIncrement
= zSign
? 0 : 0x7f;
381 case float_round_down
:
382 roundIncrement
= zSign
? 0x7f : 0;
388 roundBits
= zSig
& 0x7F;
389 if ( 0xFD <= (uint16_t) zExp
) {
391 || ( ( zExp
== 0xFD )
392 && ( (int32_t) ( zSig
+ roundIncrement
) < 0 ) )
394 float_raise( float_flag_overflow
| float_flag_inexact STATUS_VAR
);
395 return packFloat32( zSign
, 0xFF, - ( roundIncrement
== 0 ));
398 if (STATUS(flush_to_zero
)) {
399 float_raise(float_flag_output_denormal STATUS_VAR
);
400 return packFloat32(zSign
, 0, 0);
403 ( STATUS(float_detect_tininess
) == float_tininess_before_rounding
)
405 || ( zSig
+ roundIncrement
< 0x80000000 );
406 shift32RightJamming( zSig
, - zExp
, &zSig
);
408 roundBits
= zSig
& 0x7F;
409 if ( isTiny
&& roundBits
) float_raise( float_flag_underflow STATUS_VAR
);
412 if ( roundBits
) STATUS(float_exception_flags
) |= float_flag_inexact
;
413 zSig
= ( zSig
+ roundIncrement
)>>7;
414 zSig
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
415 if ( zSig
== 0 ) zExp
= 0;
416 return packFloat32( zSign
, zExp
, zSig
);
420 /*----------------------------------------------------------------------------
421 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
422 | and significand `zSig', and returns the proper single-precision floating-
423 | point value corresponding to the abstract input. This routine is just like
424 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
425 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
426 | floating-point exponent.
427 *----------------------------------------------------------------------------*/
430 normalizeRoundAndPackFloat32(flag zSign
, int_fast16_t zExp
, uint32_t zSig STATUS_PARAM
)
434 shiftCount
= countLeadingZeros32( zSig
) - 1;
435 return roundAndPackFloat32( zSign
, zExp
- shiftCount
, zSig
<<shiftCount STATUS_VAR
);
439 /*----------------------------------------------------------------------------
440 | Returns the fraction bits of the double-precision floating-point value `a'.
441 *----------------------------------------------------------------------------*/
443 INLINE
uint64_t extractFloat64Frac( float64 a
)
446 return float64_val(a
) & LIT64( 0x000FFFFFFFFFFFFF );
450 /*----------------------------------------------------------------------------
451 | Returns the exponent bits of the double-precision floating-point value `a'.
452 *----------------------------------------------------------------------------*/
454 INLINE
int_fast16_t extractFloat64Exp(float64 a
)
457 return ( float64_val(a
)>>52 ) & 0x7FF;
461 /*----------------------------------------------------------------------------
462 | Returns the sign bit of the double-precision floating-point value `a'.
463 *----------------------------------------------------------------------------*/
465 INLINE flag
extractFloat64Sign( float64 a
)
468 return float64_val(a
)>>63;
472 /*----------------------------------------------------------------------------
473 | If `a' is denormal and we are in flush-to-zero mode then set the
474 | input-denormal exception and return zero. Otherwise just return the value.
475 *----------------------------------------------------------------------------*/
476 float64
float64_squash_input_denormal(float64 a STATUS_PARAM
)
478 if (STATUS(flush_inputs_to_zero
)) {
479 if (extractFloat64Exp(a
) == 0 && extractFloat64Frac(a
) != 0) {
480 float_raise(float_flag_input_denormal STATUS_VAR
);
481 return make_float64(float64_val(a
) & (1ULL << 63));
487 /*----------------------------------------------------------------------------
488 | Normalizes the subnormal double-precision floating-point value represented
489 | by the denormalized significand `aSig'. The normalized exponent and
490 | significand are stored at the locations pointed to by `zExpPtr' and
491 | `zSigPtr', respectively.
492 *----------------------------------------------------------------------------*/
495 normalizeFloat64Subnormal(uint64_t aSig
, int_fast16_t *zExpPtr
, uint64_t *zSigPtr
)
499 shiftCount
= countLeadingZeros64( aSig
) - 11;
500 *zSigPtr
= aSig
<<shiftCount
;
501 *zExpPtr
= 1 - shiftCount
;
505 /*----------------------------------------------------------------------------
506 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
507 | double-precision floating-point value, returning the result. After being
508 | shifted into the proper positions, the three fields are simply added
509 | together to form the result. This means that any integer portion of `zSig'
510 | will be added into the exponent. Since a properly normalized significand
511 | will have an integer portion equal to 1, the `zExp' input should be 1 less
512 | than the desired result exponent whenever `zSig' is a complete, normalized
514 *----------------------------------------------------------------------------*/
516 INLINE float64
packFloat64(flag zSign
, int_fast16_t zExp
, uint64_t zSig
)
520 ( ( (uint64_t) zSign
)<<63 ) + ( ( (uint64_t) zExp
)<<52 ) + zSig
);
524 /*----------------------------------------------------------------------------
525 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
526 | and significand `zSig', and returns the proper double-precision floating-
527 | point value corresponding to the abstract input. Ordinarily, the abstract
528 | value is simply rounded and packed into the double-precision format, with
529 | the inexact exception raised if the abstract input cannot be represented
530 | exactly. However, if the abstract value is too large, the overflow and
531 | inexact exceptions are raised and an infinity or maximal finite value is
532 | returned. If the abstract value is too small, the input value is rounded
533 | to a subnormal number, and the underflow and inexact exceptions are raised
534 | if the abstract input cannot be represented exactly as a subnormal double-
535 | precision floating-point number.
536 | The input significand `zSig' has its binary point between bits 62
537 | and 61, which is 10 bits to the left of the usual location. This shifted
538 | significand must be normalized or smaller. If `zSig' is not normalized,
539 | `zExp' must be 0; in that case, the result returned is a subnormal number,
540 | and it must not require rounding. In the usual case that `zSig' is
541 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
542 | The handling of underflow and overflow follows the IEC/IEEE Standard for
543 | Binary Floating-Point Arithmetic.
544 *----------------------------------------------------------------------------*/
546 static float64
roundAndPackFloat64(flag zSign
, int_fast16_t zExp
, uint64_t zSig STATUS_PARAM
)
549 flag roundNearestEven
;
550 int_fast16_t roundIncrement
, roundBits
;
553 roundingMode
= STATUS(float_rounding_mode
);
554 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
555 switch (roundingMode
) {
556 case float_round_nearest_even
:
557 case float_round_ties_away
:
558 roundIncrement
= 0x200;
560 case float_round_to_zero
:
564 roundIncrement
= zSign
? 0 : 0x3ff;
566 case float_round_down
:
567 roundIncrement
= zSign
? 0x3ff : 0;
572 roundBits
= zSig
& 0x3FF;
573 if ( 0x7FD <= (uint16_t) zExp
) {
574 if ( ( 0x7FD < zExp
)
575 || ( ( zExp
== 0x7FD )
576 && ( (int64_t) ( zSig
+ roundIncrement
) < 0 ) )
578 float_raise( float_flag_overflow
| float_flag_inexact STATUS_VAR
);
579 return packFloat64( zSign
, 0x7FF, - ( roundIncrement
== 0 ));
582 if (STATUS(flush_to_zero
)) {
583 float_raise(float_flag_output_denormal STATUS_VAR
);
584 return packFloat64(zSign
, 0, 0);
587 ( STATUS(float_detect_tininess
) == float_tininess_before_rounding
)
589 || ( zSig
+ roundIncrement
< LIT64( 0x8000000000000000 ) );
590 shift64RightJamming( zSig
, - zExp
, &zSig
);
592 roundBits
= zSig
& 0x3FF;
593 if ( isTiny
&& roundBits
) float_raise( float_flag_underflow STATUS_VAR
);
596 if ( roundBits
) STATUS(float_exception_flags
) |= float_flag_inexact
;
597 zSig
= ( zSig
+ roundIncrement
)>>10;
598 zSig
&= ~ ( ( ( roundBits
^ 0x200 ) == 0 ) & roundNearestEven
);
599 if ( zSig
== 0 ) zExp
= 0;
600 return packFloat64( zSign
, zExp
, zSig
);
604 /*----------------------------------------------------------------------------
605 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
606 | and significand `zSig', and returns the proper double-precision floating-
607 | point value corresponding to the abstract input. This routine is just like
608 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
609 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
610 | floating-point exponent.
611 *----------------------------------------------------------------------------*/
614 normalizeRoundAndPackFloat64(flag zSign
, int_fast16_t zExp
, uint64_t zSig STATUS_PARAM
)
618 shiftCount
= countLeadingZeros64( zSig
) - 1;
619 return roundAndPackFloat64( zSign
, zExp
- shiftCount
, zSig
<<shiftCount STATUS_VAR
);
623 /*----------------------------------------------------------------------------
624 | Returns the fraction bits of the extended double-precision floating-point
626 *----------------------------------------------------------------------------*/
628 INLINE
uint64_t extractFloatx80Frac( floatx80 a
)
635 /*----------------------------------------------------------------------------
636 | Returns the exponent bits of the extended double-precision floating-point
638 *----------------------------------------------------------------------------*/
640 INLINE int32
extractFloatx80Exp( floatx80 a
)
643 return a
.high
& 0x7FFF;
647 /*----------------------------------------------------------------------------
648 | Returns the sign bit of the extended double-precision floating-point value
650 *----------------------------------------------------------------------------*/
652 INLINE flag
extractFloatx80Sign( floatx80 a
)
659 /*----------------------------------------------------------------------------
660 | Normalizes the subnormal extended double-precision floating-point value
661 | represented by the denormalized significand `aSig'. The normalized exponent
662 | and significand are stored at the locations pointed to by `zExpPtr' and
663 | `zSigPtr', respectively.
664 *----------------------------------------------------------------------------*/
667 normalizeFloatx80Subnormal( uint64_t aSig
, int32
*zExpPtr
, uint64_t *zSigPtr
)
671 shiftCount
= countLeadingZeros64( aSig
);
672 *zSigPtr
= aSig
<<shiftCount
;
673 *zExpPtr
= 1 - shiftCount
;
677 /*----------------------------------------------------------------------------
678 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
679 | extended double-precision floating-point value, returning the result.
680 *----------------------------------------------------------------------------*/
682 INLINE floatx80
packFloatx80( flag zSign
, int32 zExp
, uint64_t zSig
)
687 z
.high
= ( ( (uint16_t) zSign
)<<15 ) + zExp
;
692 /*----------------------------------------------------------------------------
693 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
694 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
695 | and returns the proper extended double-precision floating-point value
696 | corresponding to the abstract input. Ordinarily, the abstract value is
697 | rounded and packed into the extended double-precision format, with the
698 | inexact exception raised if the abstract input cannot be represented
699 | exactly. However, if the abstract value is too large, the overflow and
700 | inexact exceptions are raised and an infinity or maximal finite value is
701 | returned. If the abstract value is too small, the input value is rounded to
702 | a subnormal number, and the underflow and inexact exceptions are raised if
703 | the abstract input cannot be represented exactly as a subnormal extended
704 | double-precision floating-point number.
705 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
706 | number of bits as single or double precision, respectively. Otherwise, the
707 | result is rounded to the full precision of the extended double-precision
709 | The input significand must be normalized or smaller. If the input
710 | significand is not normalized, `zExp' must be 0; in that case, the result
711 | returned is a subnormal number, and it must not require rounding. The
712 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
713 | Floating-Point Arithmetic.
714 *----------------------------------------------------------------------------*/
717 roundAndPackFloatx80(
718 int8 roundingPrecision
, flag zSign
, int32 zExp
, uint64_t zSig0
, uint64_t zSig1
722 flag roundNearestEven
, increment
, isTiny
;
723 int64 roundIncrement
, roundMask
, roundBits
;
725 roundingMode
= STATUS(float_rounding_mode
);
726 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
727 if ( roundingPrecision
== 80 ) goto precision80
;
728 if ( roundingPrecision
== 64 ) {
729 roundIncrement
= LIT64( 0x0000000000000400 );
730 roundMask
= LIT64( 0x00000000000007FF );
732 else if ( roundingPrecision
== 32 ) {
733 roundIncrement
= LIT64( 0x0000008000000000 );
734 roundMask
= LIT64( 0x000000FFFFFFFFFF );
739 zSig0
|= ( zSig1
!= 0 );
740 switch (roundingMode
) {
741 case float_round_nearest_even
:
742 case float_round_ties_away
:
744 case float_round_to_zero
:
748 roundIncrement
= zSign
? 0 : roundMask
;
750 case float_round_down
:
751 roundIncrement
= zSign
? roundMask
: 0;
756 roundBits
= zSig0
& roundMask
;
757 if ( 0x7FFD <= (uint32_t) ( zExp
- 1 ) ) {
758 if ( ( 0x7FFE < zExp
)
759 || ( ( zExp
== 0x7FFE ) && ( zSig0
+ roundIncrement
< zSig0
) )
764 if (STATUS(flush_to_zero
)) {
765 float_raise(float_flag_output_denormal STATUS_VAR
);
766 return packFloatx80(zSign
, 0, 0);
769 ( STATUS(float_detect_tininess
) == float_tininess_before_rounding
)
771 || ( zSig0
<= zSig0
+ roundIncrement
);
772 shift64RightJamming( zSig0
, 1 - zExp
, &zSig0
);
774 roundBits
= zSig0
& roundMask
;
775 if ( isTiny
&& roundBits
) float_raise( float_flag_underflow STATUS_VAR
);
776 if ( roundBits
) STATUS(float_exception_flags
) |= float_flag_inexact
;
777 zSig0
+= roundIncrement
;
778 if ( (int64_t) zSig0
< 0 ) zExp
= 1;
779 roundIncrement
= roundMask
+ 1;
780 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
781 roundMask
|= roundIncrement
;
783 zSig0
&= ~ roundMask
;
784 return packFloatx80( zSign
, zExp
, zSig0
);
787 if ( roundBits
) STATUS(float_exception_flags
) |= float_flag_inexact
;
788 zSig0
+= roundIncrement
;
789 if ( zSig0
< roundIncrement
) {
791 zSig0
= LIT64( 0x8000000000000000 );
793 roundIncrement
= roundMask
+ 1;
794 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
795 roundMask
|= roundIncrement
;
797 zSig0
&= ~ roundMask
;
798 if ( zSig0
== 0 ) zExp
= 0;
799 return packFloatx80( zSign
, zExp
, zSig0
);
801 switch (roundingMode
) {
802 case float_round_nearest_even
:
803 case float_round_ties_away
:
804 increment
= ((int64_t)zSig1
< 0);
806 case float_round_to_zero
:
810 increment
= !zSign
&& zSig1
;
812 case float_round_down
:
813 increment
= zSign
&& zSig1
;
818 if ( 0x7FFD <= (uint32_t) ( zExp
- 1 ) ) {
819 if ( ( 0x7FFE < zExp
)
820 || ( ( zExp
== 0x7FFE )
821 && ( zSig0
== LIT64( 0xFFFFFFFFFFFFFFFF ) )
827 float_raise( float_flag_overflow
| float_flag_inexact STATUS_VAR
);
828 if ( ( roundingMode
== float_round_to_zero
)
829 || ( zSign
&& ( roundingMode
== float_round_up
) )
830 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
832 return packFloatx80( zSign
, 0x7FFE, ~ roundMask
);
834 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
838 ( STATUS(float_detect_tininess
) == float_tininess_before_rounding
)
841 || ( zSig0
< LIT64( 0xFFFFFFFFFFFFFFFF ) );
842 shift64ExtraRightJamming( zSig0
, zSig1
, 1 - zExp
, &zSig0
, &zSig1
);
844 if ( isTiny
&& zSig1
) float_raise( float_flag_underflow STATUS_VAR
);
845 if ( zSig1
) STATUS(float_exception_flags
) |= float_flag_inexact
;
846 switch (roundingMode
) {
847 case float_round_nearest_even
:
848 case float_round_ties_away
:
849 increment
= ((int64_t)zSig1
< 0);
851 case float_round_to_zero
:
855 increment
= !zSign
&& zSig1
;
857 case float_round_down
:
858 increment
= zSign
&& zSig1
;
866 ~ ( ( (uint64_t) ( zSig1
<<1 ) == 0 ) & roundNearestEven
);
867 if ( (int64_t) zSig0
< 0 ) zExp
= 1;
869 return packFloatx80( zSign
, zExp
, zSig0
);
872 if ( zSig1
) STATUS(float_exception_flags
) |= float_flag_inexact
;
877 zSig0
= LIT64( 0x8000000000000000 );
880 zSig0
&= ~ ( ( (uint64_t) ( zSig1
<<1 ) == 0 ) & roundNearestEven
);
884 if ( zSig0
== 0 ) zExp
= 0;
886 return packFloatx80( zSign
, zExp
, zSig0
);
890 /*----------------------------------------------------------------------------
891 | Takes an abstract floating-point value having sign `zSign', exponent
892 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
893 | and returns the proper extended double-precision floating-point value
894 | corresponding to the abstract input. This routine is just like
895 | `roundAndPackFloatx80' except that the input significand does not have to be
897 *----------------------------------------------------------------------------*/
900 normalizeRoundAndPackFloatx80(
901 int8 roundingPrecision
, flag zSign
, int32 zExp
, uint64_t zSig0
, uint64_t zSig1
911 shiftCount
= countLeadingZeros64( zSig0
);
912 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
915 roundAndPackFloatx80( roundingPrecision
, zSign
, zExp
, zSig0
, zSig1 STATUS_VAR
);
919 /*----------------------------------------------------------------------------
920 | Returns the least-significant 64 fraction bits of the quadruple-precision
921 | floating-point value `a'.
922 *----------------------------------------------------------------------------*/
924 INLINE
uint64_t extractFloat128Frac1( float128 a
)
931 /*----------------------------------------------------------------------------
932 | Returns the most-significant 48 fraction bits of the quadruple-precision
933 | floating-point value `a'.
934 *----------------------------------------------------------------------------*/
936 INLINE
uint64_t extractFloat128Frac0( float128 a
)
939 return a
.high
& LIT64( 0x0000FFFFFFFFFFFF );
943 /*----------------------------------------------------------------------------
944 | Returns the exponent bits of the quadruple-precision floating-point value
946 *----------------------------------------------------------------------------*/
948 INLINE int32
extractFloat128Exp( float128 a
)
951 return ( a
.high
>>48 ) & 0x7FFF;
955 /*----------------------------------------------------------------------------
956 | Returns the sign bit of the quadruple-precision floating-point value `a'.
957 *----------------------------------------------------------------------------*/
959 INLINE flag
extractFloat128Sign( float128 a
)
966 /*----------------------------------------------------------------------------
967 | Normalizes the subnormal quadruple-precision floating-point value
968 | represented by the denormalized significand formed by the concatenation of
969 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
970 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
971 | significand are stored at the location pointed to by `zSig0Ptr', and the
972 | least significant 64 bits of the normalized significand are stored at the
973 | location pointed to by `zSig1Ptr'.
974 *----------------------------------------------------------------------------*/
977 normalizeFloat128Subnormal(
988 shiftCount
= countLeadingZeros64( aSig1
) - 15;
989 if ( shiftCount
< 0 ) {
990 *zSig0Ptr
= aSig1
>>( - shiftCount
);
991 *zSig1Ptr
= aSig1
<<( shiftCount
& 63 );
994 *zSig0Ptr
= aSig1
<<shiftCount
;
997 *zExpPtr
= - shiftCount
- 63;
1000 shiftCount
= countLeadingZeros64( aSig0
) - 15;
1001 shortShift128Left( aSig0
, aSig1
, shiftCount
, zSig0Ptr
, zSig1Ptr
);
1002 *zExpPtr
= 1 - shiftCount
;
1007 /*----------------------------------------------------------------------------
1008 | Packs the sign `zSign', the exponent `zExp', and the significand formed
1009 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
1010 | floating-point value, returning the result. After being shifted into the
1011 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
1012 | added together to form the most significant 32 bits of the result. This
1013 | means that any integer portion of `zSig0' will be added into the exponent.
1014 | Since a properly normalized significand will have an integer portion equal
1015 | to 1, the `zExp' input should be 1 less than the desired result exponent
1016 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
1018 *----------------------------------------------------------------------------*/
1021 packFloat128( flag zSign
, int32 zExp
, uint64_t zSig0
, uint64_t zSig1
)
1026 z
.high
= ( ( (uint64_t) zSign
)<<63 ) + ( ( (uint64_t) zExp
)<<48 ) + zSig0
;
1031 /*----------------------------------------------------------------------------
1032 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1033 | and extended significand formed by the concatenation of `zSig0', `zSig1',
1034 | and `zSig2', and returns the proper quadruple-precision floating-point value
1035 | corresponding to the abstract input. Ordinarily, the abstract value is
1036 | simply rounded and packed into the quadruple-precision format, with the
1037 | inexact exception raised if the abstract input cannot be represented
1038 | exactly. However, if the abstract value is too large, the overflow and
1039 | inexact exceptions are raised and an infinity or maximal finite value is
1040 | returned. If the abstract value is too small, the input value is rounded to
1041 | a subnormal number, and the underflow and inexact exceptions are raised if
1042 | the abstract input cannot be represented exactly as a subnormal quadruple-
1043 | precision floating-point number.
1044 | The input significand must be normalized or smaller. If the input
1045 | significand is not normalized, `zExp' must be 0; in that case, the result
1046 | returned is a subnormal number, and it must not require rounding. In the
1047 | usual case that the input significand is normalized, `zExp' must be 1 less
1048 | than the ``true'' floating-point exponent. The handling of underflow and
1049 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1050 *----------------------------------------------------------------------------*/
1053 roundAndPackFloat128(
1054 flag zSign
, int32 zExp
, uint64_t zSig0
, uint64_t zSig1
, uint64_t zSig2 STATUS_PARAM
)
1057 flag roundNearestEven
, increment
, isTiny
;
1059 roundingMode
= STATUS(float_rounding_mode
);
1060 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
1061 switch (roundingMode
) {
1062 case float_round_nearest_even
:
1063 case float_round_ties_away
:
1064 increment
= ((int64_t)zSig2
< 0);
1066 case float_round_to_zero
:
1069 case float_round_up
:
1070 increment
= !zSign
&& zSig2
;
1072 case float_round_down
:
1073 increment
= zSign
&& zSig2
;
1078 if ( 0x7FFD <= (uint32_t) zExp
) {
1079 if ( ( 0x7FFD < zExp
)
1080 || ( ( zExp
== 0x7FFD )
1082 LIT64( 0x0001FFFFFFFFFFFF ),
1083 LIT64( 0xFFFFFFFFFFFFFFFF ),
1090 float_raise( float_flag_overflow
| float_flag_inexact STATUS_VAR
);
1091 if ( ( roundingMode
== float_round_to_zero
)
1092 || ( zSign
&& ( roundingMode
== float_round_up
) )
1093 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
1099 LIT64( 0x0000FFFFFFFFFFFF ),
1100 LIT64( 0xFFFFFFFFFFFFFFFF )
1103 return packFloat128( zSign
, 0x7FFF, 0, 0 );
1106 if (STATUS(flush_to_zero
)) {
1107 float_raise(float_flag_output_denormal STATUS_VAR
);
1108 return packFloat128(zSign
, 0, 0, 0);
1111 ( STATUS(float_detect_tininess
) == float_tininess_before_rounding
)
1117 LIT64( 0x0001FFFFFFFFFFFF ),
1118 LIT64( 0xFFFFFFFFFFFFFFFF )
1120 shift128ExtraRightJamming(
1121 zSig0
, zSig1
, zSig2
, - zExp
, &zSig0
, &zSig1
, &zSig2
);
1123 if ( isTiny
&& zSig2
) float_raise( float_flag_underflow STATUS_VAR
);
1124 switch (roundingMode
) {
1125 case float_round_nearest_even
:
1126 case float_round_ties_away
:
1127 increment
= ((int64_t)zSig2
< 0);
1129 case float_round_to_zero
:
1132 case float_round_up
:
1133 increment
= !zSign
&& zSig2
;
1135 case float_round_down
:
1136 increment
= zSign
&& zSig2
;
1143 if ( zSig2
) STATUS(float_exception_flags
) |= float_flag_inexact
;
1145 add128( zSig0
, zSig1
, 0, 1, &zSig0
, &zSig1
);
1146 zSig1
&= ~ ( ( zSig2
+ zSig2
== 0 ) & roundNearestEven
);
1149 if ( ( zSig0
| zSig1
) == 0 ) zExp
= 0;
1151 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
1155 /*----------------------------------------------------------------------------
1156 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1157 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1158 | returns the proper quadruple-precision floating-point value corresponding
1159 | to the abstract input. This routine is just like `roundAndPackFloat128'
1160 | except that the input significand has fewer bits and does not have to be
1161 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1163 *----------------------------------------------------------------------------*/
1166 normalizeRoundAndPackFloat128(
1167 flag zSign
, int32 zExp
, uint64_t zSig0
, uint64_t zSig1 STATUS_PARAM
)
1177 shiftCount
= countLeadingZeros64( zSig0
) - 15;
1178 if ( 0 <= shiftCount
) {
1180 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
1183 shift128ExtraRightJamming(
1184 zSig0
, zSig1
, 0, - shiftCount
, &zSig0
, &zSig1
, &zSig2
);
1187 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2 STATUS_VAR
);
1191 /*----------------------------------------------------------------------------
1192 | Returns the result of converting the 32-bit two's complement integer `a'
1193 | to the single-precision floating-point format. The conversion is performed
1194 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1195 *----------------------------------------------------------------------------*/
1197 float32
int32_to_float32(int32_t a STATUS_PARAM
)
1201 if ( a
== 0 ) return float32_zero
;
1202 if ( a
== (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1204 return normalizeRoundAndPackFloat32( zSign
, 0x9C, zSign
? - a
: a STATUS_VAR
);
1208 /*----------------------------------------------------------------------------
1209 | Returns the result of converting the 32-bit two's complement integer `a'
1210 | to the double-precision floating-point format. The conversion is performed
1211 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1212 *----------------------------------------------------------------------------*/
1214 float64
int32_to_float64(int32_t a STATUS_PARAM
)
1221 if ( a
== 0 ) return float64_zero
;
1223 absA
= zSign
? - a
: a
;
1224 shiftCount
= countLeadingZeros32( absA
) + 21;
1226 return packFloat64( zSign
, 0x432 - shiftCount
, zSig
<<shiftCount
);
1230 /*----------------------------------------------------------------------------
1231 | Returns the result of converting the 32-bit two's complement integer `a'
1232 | to the extended double-precision floating-point format. The conversion
1233 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1235 *----------------------------------------------------------------------------*/
1237 floatx80
int32_to_floatx80(int32_t a STATUS_PARAM
)
1244 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
1246 absA
= zSign
? - a
: a
;
1247 shiftCount
= countLeadingZeros32( absA
) + 32;
1249 return packFloatx80( zSign
, 0x403E - shiftCount
, zSig
<<shiftCount
);
1253 /*----------------------------------------------------------------------------
1254 | Returns the result of converting the 32-bit two's complement integer `a' to
1255 | the quadruple-precision floating-point format. The conversion is performed
1256 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1257 *----------------------------------------------------------------------------*/
1259 float128
int32_to_float128(int32_t a STATUS_PARAM
)
1266 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
1268 absA
= zSign
? - a
: a
;
1269 shiftCount
= countLeadingZeros32( absA
) + 17;
1271 return packFloat128( zSign
, 0x402E - shiftCount
, zSig0
<<shiftCount
, 0 );
1275 /*----------------------------------------------------------------------------
1276 | Returns the result of converting the 64-bit two's complement integer `a'
1277 | to the single-precision floating-point format. The conversion is performed
1278 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1279 *----------------------------------------------------------------------------*/
1281 float32
int64_to_float32(int64_t a STATUS_PARAM
)
1287 if ( a
== 0 ) return float32_zero
;
1289 absA
= zSign
? - a
: a
;
1290 shiftCount
= countLeadingZeros64( absA
) - 40;
1291 if ( 0 <= shiftCount
) {
1292 return packFloat32( zSign
, 0x95 - shiftCount
, absA
<<shiftCount
);
1296 if ( shiftCount
< 0 ) {
1297 shift64RightJamming( absA
, - shiftCount
, &absA
);
1300 absA
<<= shiftCount
;
1302 return roundAndPackFloat32( zSign
, 0x9C - shiftCount
, absA STATUS_VAR
);
1307 float32
uint64_to_float32(uint64_t a STATUS_PARAM
)
1311 if ( a
== 0 ) return float32_zero
;
1312 shiftCount
= countLeadingZeros64( a
) - 40;
1313 if ( 0 <= shiftCount
) {
1314 return packFloat32(0, 0x95 - shiftCount
, a
<<shiftCount
);
1318 if ( shiftCount
< 0 ) {
1319 shift64RightJamming( a
, - shiftCount
, &a
);
1324 return roundAndPackFloat32(0, 0x9C - shiftCount
, a STATUS_VAR
);
1328 /*----------------------------------------------------------------------------
1329 | Returns the result of converting the 64-bit two's complement integer `a'
1330 | to the double-precision floating-point format. The conversion is performed
1331 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1332 *----------------------------------------------------------------------------*/
1334 float64
int64_to_float64(int64_t a STATUS_PARAM
)
1338 if ( a
== 0 ) return float64_zero
;
1339 if ( a
== (int64_t) LIT64( 0x8000000000000000 ) ) {
1340 return packFloat64( 1, 0x43E, 0 );
1343 return normalizeRoundAndPackFloat64( zSign
, 0x43C, zSign
? - a
: a STATUS_VAR
);
1347 float64
uint64_to_float64(uint64_t a STATUS_PARAM
)
1352 return float64_zero
;
1354 if ((int64_t)a
< 0) {
1355 shift64RightJamming(a
, 1, &a
);
1358 return normalizeRoundAndPackFloat64(0, exp
, a STATUS_VAR
);
1361 /*----------------------------------------------------------------------------
1362 | Returns the result of converting the 64-bit two's complement integer `a'
1363 | to the extended double-precision floating-point format. The conversion
1364 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1366 *----------------------------------------------------------------------------*/
1368 floatx80
int64_to_floatx80(int64_t a STATUS_PARAM
)
1374 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
1376 absA
= zSign
? - a
: a
;
1377 shiftCount
= countLeadingZeros64( absA
);
1378 return packFloatx80( zSign
, 0x403E - shiftCount
, absA
<<shiftCount
);
1382 /*----------------------------------------------------------------------------
1383 | Returns the result of converting the 64-bit two's complement integer `a' to
1384 | the quadruple-precision floating-point format. The conversion is performed
1385 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1386 *----------------------------------------------------------------------------*/
1388 float128
int64_to_float128(int64_t a STATUS_PARAM
)
1394 uint64_t zSig0
, zSig1
;
1396 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
1398 absA
= zSign
? - a
: a
;
1399 shiftCount
= countLeadingZeros64( absA
) + 49;
1400 zExp
= 0x406E - shiftCount
;
1401 if ( 64 <= shiftCount
) {
1410 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
1411 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
1415 float128
uint64_to_float128(uint64_t a STATUS_PARAM
)
1418 return float128_zero
;
1420 return normalizeRoundAndPackFloat128(0, 0x406E, a
, 0 STATUS_VAR
);
1423 /*----------------------------------------------------------------------------
1424 | Returns the result of converting the single-precision floating-point value
1425 | `a' to the 32-bit two's complement integer format. The conversion is
1426 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1427 | Arithmetic---which means in particular that the conversion is rounded
1428 | according to the current rounding mode. If `a' is a NaN, the largest
1429 | positive integer is returned. Otherwise, if the conversion overflows, the
1430 | largest integer with the same sign as `a' is returned.
1431 *----------------------------------------------------------------------------*/
1433 int32
float32_to_int32( float32 a STATUS_PARAM
)
1436 int_fast16_t aExp
, shiftCount
;
1440 a
= float32_squash_input_denormal(a STATUS_VAR
);
1441 aSig
= extractFloat32Frac( a
);
1442 aExp
= extractFloat32Exp( a
);
1443 aSign
= extractFloat32Sign( a
);
1444 if ( ( aExp
== 0xFF ) && aSig
) aSign
= 0;
1445 if ( aExp
) aSig
|= 0x00800000;
1446 shiftCount
= 0xAF - aExp
;
1449 if ( 0 < shiftCount
) shift64RightJamming( aSig64
, shiftCount
, &aSig64
);
1450 return roundAndPackInt32( aSign
, aSig64 STATUS_VAR
);
1454 /*----------------------------------------------------------------------------
1455 | Returns the result of converting the single-precision floating-point value
1456 | `a' to the 32-bit two's complement integer format. The conversion is
1457 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1458 | Arithmetic, except that the conversion is always rounded toward zero.
1459 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1460 | the conversion overflows, the largest integer with the same sign as `a' is
1462 *----------------------------------------------------------------------------*/
1464 int32
float32_to_int32_round_to_zero( float32 a STATUS_PARAM
)
1467 int_fast16_t aExp
, shiftCount
;
1470 a
= float32_squash_input_denormal(a STATUS_VAR
);
1472 aSig
= extractFloat32Frac( a
);
1473 aExp
= extractFloat32Exp( a
);
1474 aSign
= extractFloat32Sign( a
);
1475 shiftCount
= aExp
- 0x9E;
1476 if ( 0 <= shiftCount
) {
1477 if ( float32_val(a
) != 0xCF000000 ) {
1478 float_raise( float_flag_invalid STATUS_VAR
);
1479 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) return 0x7FFFFFFF;
1481 return (int32_t) 0x80000000;
1483 else if ( aExp
<= 0x7E ) {
1484 if ( aExp
| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
1487 aSig
= ( aSig
| 0x00800000 )<<8;
1488 z
= aSig
>>( - shiftCount
);
1489 if ( (uint32_t) ( aSig
<<( shiftCount
& 31 ) ) ) {
1490 STATUS(float_exception_flags
) |= float_flag_inexact
;
1492 if ( aSign
) z
= - z
;
1497 /*----------------------------------------------------------------------------
1498 | Returns the result of converting the single-precision floating-point value
1499 | `a' to the 16-bit two's complement integer format. The conversion is
1500 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1501 | Arithmetic, except that the conversion is always rounded toward zero.
1502 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1503 | the conversion overflows, the largest integer with the same sign as `a' is
1505 *----------------------------------------------------------------------------*/
1507 int_fast16_t float32_to_int16_round_to_zero(float32 a STATUS_PARAM
)
1510 int_fast16_t aExp
, shiftCount
;
1514 aSig
= extractFloat32Frac( a
);
1515 aExp
= extractFloat32Exp( a
);
1516 aSign
= extractFloat32Sign( a
);
1517 shiftCount
= aExp
- 0x8E;
1518 if ( 0 <= shiftCount
) {
1519 if ( float32_val(a
) != 0xC7000000 ) {
1520 float_raise( float_flag_invalid STATUS_VAR
);
1521 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) {
1525 return (int32_t) 0xffff8000;
1527 else if ( aExp
<= 0x7E ) {
1528 if ( aExp
| aSig
) {
1529 STATUS(float_exception_flags
) |= float_flag_inexact
;
1534 aSig
= ( aSig
| 0x00800000 )<<8;
1535 z
= aSig
>>( - shiftCount
);
1536 if ( (uint32_t) ( aSig
<<( shiftCount
& 31 ) ) ) {
1537 STATUS(float_exception_flags
) |= float_flag_inexact
;
1546 /*----------------------------------------------------------------------------
1547 | Returns the result of converting the single-precision floating-point value
1548 | `a' to the 64-bit two's complement integer format. The conversion is
1549 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1550 | Arithmetic---which means in particular that the conversion is rounded
1551 | according to the current rounding mode. If `a' is a NaN, the largest
1552 | positive integer is returned. Otherwise, if the conversion overflows, the
1553 | largest integer with the same sign as `a' is returned.
1554 *----------------------------------------------------------------------------*/
1556 int64
float32_to_int64( float32 a STATUS_PARAM
)
1559 int_fast16_t aExp
, shiftCount
;
1561 uint64_t aSig64
, aSigExtra
;
1562 a
= float32_squash_input_denormal(a STATUS_VAR
);
1564 aSig
= extractFloat32Frac( a
);
1565 aExp
= extractFloat32Exp( a
);
1566 aSign
= extractFloat32Sign( a
);
1567 shiftCount
= 0xBE - aExp
;
1568 if ( shiftCount
< 0 ) {
1569 float_raise( float_flag_invalid STATUS_VAR
);
1570 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) {
1571 return LIT64( 0x7FFFFFFFFFFFFFFF );
1573 return (int64_t) LIT64( 0x8000000000000000 );
1575 if ( aExp
) aSig
|= 0x00800000;
1578 shift64ExtraRightJamming( aSig64
, 0, shiftCount
, &aSig64
, &aSigExtra
);
1579 return roundAndPackInt64( aSign
, aSig64
, aSigExtra STATUS_VAR
);
1583 /*----------------------------------------------------------------------------
1584 | Returns the result of converting the single-precision floating-point value
1585 | `a' to the 64-bit unsigned integer format. The conversion is
1586 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1587 | Arithmetic---which means in particular that the conversion is rounded
1588 | according to the current rounding mode. If `a' is a NaN, the largest
1589 | unsigned integer is returned. Otherwise, if the conversion overflows, the
1590 | largest unsigned integer is returned. If the 'a' is negative, the result
1591 | is rounded and zero is returned; values that do not round to zero will
1592 | raise the inexact exception flag.
1593 *----------------------------------------------------------------------------*/
1595 uint64
float32_to_uint64(float32 a STATUS_PARAM
)
1598 int_fast16_t aExp
, shiftCount
;
1600 uint64_t aSig64
, aSigExtra
;
1601 a
= float32_squash_input_denormal(a STATUS_VAR
);
1603 aSig
= extractFloat32Frac(a
);
1604 aExp
= extractFloat32Exp(a
);
1605 aSign
= extractFloat32Sign(a
);
1606 if ((aSign
) && (aExp
> 126)) {
1607 float_raise(float_flag_invalid STATUS_VAR
);
1608 if (float32_is_any_nan(a
)) {
1609 return LIT64(0xFFFFFFFFFFFFFFFF);
1614 shiftCount
= 0xBE - aExp
;
1618 if (shiftCount
< 0) {
1619 float_raise(float_flag_invalid STATUS_VAR
);
1620 return LIT64(0xFFFFFFFFFFFFFFFF);
1625 shift64ExtraRightJamming(aSig64
, 0, shiftCount
, &aSig64
, &aSigExtra
);
1626 return roundAndPackUint64(aSign
, aSig64
, aSigExtra STATUS_VAR
);
1629 /*----------------------------------------------------------------------------
1630 | Returns the result of converting the single-precision floating-point value
1631 | `a' to the 64-bit unsigned integer format. The conversion is
1632 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1633 | Arithmetic, except that the conversion is always rounded toward zero. If
1634 | `a' is a NaN, the largest unsigned integer is returned. Otherwise, if the
1635 | conversion overflows, the largest unsigned integer is returned. If the
1636 | 'a' is negative, the result is rounded and zero is returned; values that do
1637 | not round to zero will raise the inexact flag.
1638 *----------------------------------------------------------------------------*/
1640 uint64
float32_to_uint64_round_to_zero(float32 a STATUS_PARAM
)
1642 signed char current_rounding_mode
= STATUS(float_rounding_mode
);
1643 set_float_rounding_mode(float_round_to_zero STATUS_VAR
);
1644 int64_t v
= float32_to_uint64(a STATUS_VAR
);
1645 set_float_rounding_mode(current_rounding_mode STATUS_VAR
);
1649 /*----------------------------------------------------------------------------
1650 | Returns the result of converting the single-precision floating-point value
1651 | `a' to the 64-bit two's complement integer format. The conversion is
1652 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1653 | Arithmetic, except that the conversion is always rounded toward zero. If
1654 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1655 | conversion overflows, the largest integer with the same sign as `a' is
1657 *----------------------------------------------------------------------------*/
1659 int64
float32_to_int64_round_to_zero( float32 a STATUS_PARAM
)
1662 int_fast16_t aExp
, shiftCount
;
1666 a
= float32_squash_input_denormal(a STATUS_VAR
);
1668 aSig
= extractFloat32Frac( a
);
1669 aExp
= extractFloat32Exp( a
);
1670 aSign
= extractFloat32Sign( a
);
1671 shiftCount
= aExp
- 0xBE;
1672 if ( 0 <= shiftCount
) {
1673 if ( float32_val(a
) != 0xDF000000 ) {
1674 float_raise( float_flag_invalid STATUS_VAR
);
1675 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) {
1676 return LIT64( 0x7FFFFFFFFFFFFFFF );
1679 return (int64_t) LIT64( 0x8000000000000000 );
1681 else if ( aExp
<= 0x7E ) {
1682 if ( aExp
| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
1685 aSig64
= aSig
| 0x00800000;
1687 z
= aSig64
>>( - shiftCount
);
1688 if ( (uint64_t) ( aSig64
<<( shiftCount
& 63 ) ) ) {
1689 STATUS(float_exception_flags
) |= float_flag_inexact
;
1691 if ( aSign
) z
= - z
;
1696 /*----------------------------------------------------------------------------
1697 | Returns the result of converting the single-precision floating-point value
1698 | `a' to the double-precision floating-point format. The conversion is
1699 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1701 *----------------------------------------------------------------------------*/
1703 float64
float32_to_float64( float32 a STATUS_PARAM
)
1708 a
= float32_squash_input_denormal(a STATUS_VAR
);
1710 aSig
= extractFloat32Frac( a
);
1711 aExp
= extractFloat32Exp( a
);
1712 aSign
= extractFloat32Sign( a
);
1713 if ( aExp
== 0xFF ) {
1714 if ( aSig
) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
1715 return packFloat64( aSign
, 0x7FF, 0 );
1718 if ( aSig
== 0 ) return packFloat64( aSign
, 0, 0 );
1719 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1722 return packFloat64( aSign
, aExp
+ 0x380, ( (uint64_t) aSig
)<<29 );
1726 /*----------------------------------------------------------------------------
1727 | Returns the result of converting the single-precision floating-point value
1728 | `a' to the extended double-precision floating-point format. The conversion
1729 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1731 *----------------------------------------------------------------------------*/
1733 floatx80
float32_to_floatx80( float32 a STATUS_PARAM
)
1739 a
= float32_squash_input_denormal(a STATUS_VAR
);
1740 aSig
= extractFloat32Frac( a
);
1741 aExp
= extractFloat32Exp( a
);
1742 aSign
= extractFloat32Sign( a
);
1743 if ( aExp
== 0xFF ) {
1744 if ( aSig
) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
1745 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
1748 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
1749 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1752 return packFloatx80( aSign
, aExp
+ 0x3F80, ( (uint64_t) aSig
)<<40 );
1756 /*----------------------------------------------------------------------------
1757 | Returns the result of converting the single-precision floating-point value
1758 | `a' to the double-precision floating-point format. The conversion is
1759 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1761 *----------------------------------------------------------------------------*/
1763 float128
float32_to_float128( float32 a STATUS_PARAM
)
1769 a
= float32_squash_input_denormal(a STATUS_VAR
);
1770 aSig
= extractFloat32Frac( a
);
1771 aExp
= extractFloat32Exp( a
);
1772 aSign
= extractFloat32Sign( a
);
1773 if ( aExp
== 0xFF ) {
1774 if ( aSig
) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
1775 return packFloat128( aSign
, 0x7FFF, 0, 0 );
1778 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
1779 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1782 return packFloat128( aSign
, aExp
+ 0x3F80, ( (uint64_t) aSig
)<<25, 0 );
1786 /*----------------------------------------------------------------------------
1787 | Rounds the single-precision floating-point value `a' to an integer, and
1788 | returns the result as a single-precision floating-point value. The
1789 | operation is performed according to the IEC/IEEE Standard for Binary
1790 | Floating-Point Arithmetic.
1791 *----------------------------------------------------------------------------*/
1793 float32
float32_round_to_int( float32 a STATUS_PARAM
)
1797 uint32_t lastBitMask
, roundBitsMask
;
1799 a
= float32_squash_input_denormal(a STATUS_VAR
);
1801 aExp
= extractFloat32Exp( a
);
1802 if ( 0x96 <= aExp
) {
1803 if ( ( aExp
== 0xFF ) && extractFloat32Frac( a
) ) {
1804 return propagateFloat32NaN( a
, a STATUS_VAR
);
1808 if ( aExp
<= 0x7E ) {
1809 if ( (uint32_t) ( float32_val(a
)<<1 ) == 0 ) return a
;
1810 STATUS(float_exception_flags
) |= float_flag_inexact
;
1811 aSign
= extractFloat32Sign( a
);
1812 switch ( STATUS(float_rounding_mode
) ) {
1813 case float_round_nearest_even
:
1814 if ( ( aExp
== 0x7E ) && extractFloat32Frac( a
) ) {
1815 return packFloat32( aSign
, 0x7F, 0 );
1818 case float_round_ties_away
:
1820 return packFloat32(aSign
, 0x7F, 0);
1823 case float_round_down
:
1824 return make_float32(aSign
? 0xBF800000 : 0);
1825 case float_round_up
:
1826 return make_float32(aSign
? 0x80000000 : 0x3F800000);
1828 return packFloat32( aSign
, 0, 0 );
1831 lastBitMask
<<= 0x96 - aExp
;
1832 roundBitsMask
= lastBitMask
- 1;
1834 switch (STATUS(float_rounding_mode
)) {
1835 case float_round_nearest_even
:
1836 z
+= lastBitMask
>>1;
1837 if ((z
& roundBitsMask
) == 0) {
1841 case float_round_ties_away
:
1842 z
+= lastBitMask
>> 1;
1844 case float_round_to_zero
:
1846 case float_round_up
:
1847 if (!extractFloat32Sign(make_float32(z
))) {
1851 case float_round_down
:
1852 if (extractFloat32Sign(make_float32(z
))) {
1859 z
&= ~ roundBitsMask
;
1860 if ( z
!= float32_val(a
) ) STATUS(float_exception_flags
) |= float_flag_inexact
;
1861 return make_float32(z
);
1865 /*----------------------------------------------------------------------------
1866 | Returns the result of adding the absolute values of the single-precision
1867 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1868 | before being returned. `zSign' is ignored if the result is a NaN.
1869 | The addition is performed according to the IEC/IEEE Standard for Binary
1870 | Floating-Point Arithmetic.
1871 *----------------------------------------------------------------------------*/
1873 static float32
addFloat32Sigs( float32 a
, float32 b
, flag zSign STATUS_PARAM
)
1875 int_fast16_t aExp
, bExp
, zExp
;
1876 uint32_t aSig
, bSig
, zSig
;
1877 int_fast16_t expDiff
;
1879 aSig
= extractFloat32Frac( a
);
1880 aExp
= extractFloat32Exp( a
);
1881 bSig
= extractFloat32Frac( b
);
1882 bExp
= extractFloat32Exp( b
);
1883 expDiff
= aExp
- bExp
;
1886 if ( 0 < expDiff
) {
1887 if ( aExp
== 0xFF ) {
1888 if ( aSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1897 shift32RightJamming( bSig
, expDiff
, &bSig
);
1900 else if ( expDiff
< 0 ) {
1901 if ( bExp
== 0xFF ) {
1902 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1903 return packFloat32( zSign
, 0xFF, 0 );
1911 shift32RightJamming( aSig
, - expDiff
, &aSig
);
1915 if ( aExp
== 0xFF ) {
1916 if ( aSig
| bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1920 if (STATUS(flush_to_zero
)) {
1922 float_raise(float_flag_output_denormal STATUS_VAR
);
1924 return packFloat32(zSign
, 0, 0);
1926 return packFloat32( zSign
, 0, ( aSig
+ bSig
)>>6 );
1928 zSig
= 0x40000000 + aSig
+ bSig
;
1933 zSig
= ( aSig
+ bSig
)<<1;
1935 if ( (int32_t) zSig
< 0 ) {
1940 return roundAndPackFloat32( zSign
, zExp
, zSig STATUS_VAR
);
1944 /*----------------------------------------------------------------------------
1945 | Returns the result of subtracting the absolute values of the single-
1946 | precision floating-point values `a' and `b'. If `zSign' is 1, the
1947 | difference is negated before being returned. `zSign' is ignored if the
1948 | result is a NaN. The subtraction is performed according to the IEC/IEEE
1949 | Standard for Binary Floating-Point Arithmetic.
1950 *----------------------------------------------------------------------------*/
1952 static float32
subFloat32Sigs( float32 a
, float32 b
, flag zSign STATUS_PARAM
)
1954 int_fast16_t aExp
, bExp
, zExp
;
1955 uint32_t aSig
, bSig
, zSig
;
1956 int_fast16_t expDiff
;
1958 aSig
= extractFloat32Frac( a
);
1959 aExp
= extractFloat32Exp( a
);
1960 bSig
= extractFloat32Frac( b
);
1961 bExp
= extractFloat32Exp( b
);
1962 expDiff
= aExp
- bExp
;
1965 if ( 0 < expDiff
) goto aExpBigger
;
1966 if ( expDiff
< 0 ) goto bExpBigger
;
1967 if ( aExp
== 0xFF ) {
1968 if ( aSig
| bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1969 float_raise( float_flag_invalid STATUS_VAR
);
1970 return float32_default_nan
;
1976 if ( bSig
< aSig
) goto aBigger
;
1977 if ( aSig
< bSig
) goto bBigger
;
1978 return packFloat32( STATUS(float_rounding_mode
) == float_round_down
, 0, 0 );
1980 if ( bExp
== 0xFF ) {
1981 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1982 return packFloat32( zSign
^ 1, 0xFF, 0 );
1990 shift32RightJamming( aSig
, - expDiff
, &aSig
);
1996 goto normalizeRoundAndPack
;
1998 if ( aExp
== 0xFF ) {
1999 if ( aSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
2008 shift32RightJamming( bSig
, expDiff
, &bSig
);
2013 normalizeRoundAndPack
:
2015 return normalizeRoundAndPackFloat32( zSign
, zExp
, zSig STATUS_VAR
);
2019 /*----------------------------------------------------------------------------
2020 | Returns the result of adding the single-precision floating-point values `a'
2021 | and `b'. The operation is performed according to the IEC/IEEE Standard for
2022 | Binary Floating-Point Arithmetic.
2023 *----------------------------------------------------------------------------*/
2025 float32
float32_add( float32 a
, float32 b STATUS_PARAM
)
2028 a
= float32_squash_input_denormal(a STATUS_VAR
);
2029 b
= float32_squash_input_denormal(b STATUS_VAR
);
2031 aSign
= extractFloat32Sign( a
);
2032 bSign
= extractFloat32Sign( b
);
2033 if ( aSign
== bSign
) {
2034 return addFloat32Sigs( a
, b
, aSign STATUS_VAR
);
2037 return subFloat32Sigs( a
, b
, aSign STATUS_VAR
);
2042 /*----------------------------------------------------------------------------
2043 | Returns the result of subtracting the single-precision floating-point values
2044 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2045 | for Binary Floating-Point Arithmetic.
2046 *----------------------------------------------------------------------------*/
2048 float32
float32_sub( float32 a
, float32 b STATUS_PARAM
)
2051 a
= float32_squash_input_denormal(a STATUS_VAR
);
2052 b
= float32_squash_input_denormal(b STATUS_VAR
);
2054 aSign
= extractFloat32Sign( a
);
2055 bSign
= extractFloat32Sign( b
);
2056 if ( aSign
== bSign
) {
2057 return subFloat32Sigs( a
, b
, aSign STATUS_VAR
);
2060 return addFloat32Sigs( a
, b
, aSign STATUS_VAR
);
2065 /*----------------------------------------------------------------------------
2066 | Returns the result of multiplying the single-precision floating-point values
2067 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2068 | for Binary Floating-Point Arithmetic.
2069 *----------------------------------------------------------------------------*/
2071 float32
float32_mul( float32 a
, float32 b STATUS_PARAM
)
2073 flag aSign
, bSign
, zSign
;
2074 int_fast16_t aExp
, bExp
, zExp
;
2075 uint32_t aSig
, bSig
;
2079 a
= float32_squash_input_denormal(a STATUS_VAR
);
2080 b
= float32_squash_input_denormal(b STATUS_VAR
);
2082 aSig
= extractFloat32Frac( a
);
2083 aExp
= extractFloat32Exp( a
);
2084 aSign
= extractFloat32Sign( a
);
2085 bSig
= extractFloat32Frac( b
);
2086 bExp
= extractFloat32Exp( b
);
2087 bSign
= extractFloat32Sign( b
);
2088 zSign
= aSign
^ bSign
;
2089 if ( aExp
== 0xFF ) {
2090 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
2091 return propagateFloat32NaN( a
, b STATUS_VAR
);
2093 if ( ( bExp
| bSig
) == 0 ) {
2094 float_raise( float_flag_invalid STATUS_VAR
);
2095 return float32_default_nan
;
2097 return packFloat32( zSign
, 0xFF, 0 );
2099 if ( bExp
== 0xFF ) {
2100 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
2101 if ( ( aExp
| aSig
) == 0 ) {
2102 float_raise( float_flag_invalid STATUS_VAR
);
2103 return float32_default_nan
;
2105 return packFloat32( zSign
, 0xFF, 0 );
2108 if ( aSig
== 0 ) return packFloat32( zSign
, 0, 0 );
2109 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
2112 if ( bSig
== 0 ) return packFloat32( zSign
, 0, 0 );
2113 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
2115 zExp
= aExp
+ bExp
- 0x7F;
2116 aSig
= ( aSig
| 0x00800000 )<<7;
2117 bSig
= ( bSig
| 0x00800000 )<<8;
2118 shift64RightJamming( ( (uint64_t) aSig
) * bSig
, 32, &zSig64
);
2120 if ( 0 <= (int32_t) ( zSig
<<1 ) ) {
2124 return roundAndPackFloat32( zSign
, zExp
, zSig STATUS_VAR
);
2128 /*----------------------------------------------------------------------------
2129 | Returns the result of dividing the single-precision floating-point value `a'
2130 | by the corresponding value `b'. The operation is performed according to the
2131 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2132 *----------------------------------------------------------------------------*/
2134 float32
float32_div( float32 a
, float32 b STATUS_PARAM
)
2136 flag aSign
, bSign
, zSign
;
2137 int_fast16_t aExp
, bExp
, zExp
;
2138 uint32_t aSig
, bSig
, zSig
;
2139 a
= float32_squash_input_denormal(a STATUS_VAR
);
2140 b
= float32_squash_input_denormal(b STATUS_VAR
);
2142 aSig
= extractFloat32Frac( a
);
2143 aExp
= extractFloat32Exp( a
);
2144 aSign
= extractFloat32Sign( a
);
2145 bSig
= extractFloat32Frac( b
);
2146 bExp
= extractFloat32Exp( b
);
2147 bSign
= extractFloat32Sign( b
);
2148 zSign
= aSign
^ bSign
;
2149 if ( aExp
== 0xFF ) {
2150 if ( aSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
2151 if ( bExp
== 0xFF ) {
2152 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
2153 float_raise( float_flag_invalid STATUS_VAR
);
2154 return float32_default_nan
;
2156 return packFloat32( zSign
, 0xFF, 0 );
2158 if ( bExp
== 0xFF ) {
2159 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
2160 return packFloat32( zSign
, 0, 0 );
2164 if ( ( aExp
| aSig
) == 0 ) {
2165 float_raise( float_flag_invalid STATUS_VAR
);
2166 return float32_default_nan
;
2168 float_raise( float_flag_divbyzero STATUS_VAR
);
2169 return packFloat32( zSign
, 0xFF, 0 );
2171 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
2174 if ( aSig
== 0 ) return packFloat32( zSign
, 0, 0 );
2175 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
2177 zExp
= aExp
- bExp
+ 0x7D;
2178 aSig
= ( aSig
| 0x00800000 )<<7;
2179 bSig
= ( bSig
| 0x00800000 )<<8;
2180 if ( bSig
<= ( aSig
+ aSig
) ) {
2184 zSig
= ( ( (uint64_t) aSig
)<<32 ) / bSig
;
2185 if ( ( zSig
& 0x3F ) == 0 ) {
2186 zSig
|= ( (uint64_t) bSig
* zSig
!= ( (uint64_t) aSig
)<<32 );
2188 return roundAndPackFloat32( zSign
, zExp
, zSig STATUS_VAR
);
2192 /*----------------------------------------------------------------------------
2193 | Returns the remainder of the single-precision floating-point value `a'
2194 | with respect to the corresponding value `b'. The operation is performed
2195 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2196 *----------------------------------------------------------------------------*/
2198 float32
float32_rem( float32 a
, float32 b STATUS_PARAM
)
2201 int_fast16_t aExp
, bExp
, expDiff
;
2202 uint32_t aSig
, bSig
;
2204 uint64_t aSig64
, bSig64
, q64
;
2205 uint32_t alternateASig
;
2207 a
= float32_squash_input_denormal(a STATUS_VAR
);
2208 b
= float32_squash_input_denormal(b STATUS_VAR
);
2210 aSig
= extractFloat32Frac( a
);
2211 aExp
= extractFloat32Exp( a
);
2212 aSign
= extractFloat32Sign( a
);
2213 bSig
= extractFloat32Frac( b
);
2214 bExp
= extractFloat32Exp( b
);
2215 if ( aExp
== 0xFF ) {
2216 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
2217 return propagateFloat32NaN( a
, b STATUS_VAR
);
2219 float_raise( float_flag_invalid STATUS_VAR
);
2220 return float32_default_nan
;
2222 if ( bExp
== 0xFF ) {
2223 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
2228 float_raise( float_flag_invalid STATUS_VAR
);
2229 return float32_default_nan
;
2231 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
2234 if ( aSig
== 0 ) return a
;
2235 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
2237 expDiff
= aExp
- bExp
;
2240 if ( expDiff
< 32 ) {
2243 if ( expDiff
< 0 ) {
2244 if ( expDiff
< -1 ) return a
;
2247 q
= ( bSig
<= aSig
);
2248 if ( q
) aSig
-= bSig
;
2249 if ( 0 < expDiff
) {
2250 q
= ( ( (uint64_t) aSig
)<<32 ) / bSig
;
2253 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
2261 if ( bSig
<= aSig
) aSig
-= bSig
;
2262 aSig64
= ( (uint64_t) aSig
)<<40;
2263 bSig64
= ( (uint64_t) bSig
)<<40;
2265 while ( 0 < expDiff
) {
2266 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
2267 q64
= ( 2 < q64
) ? q64
- 2 : 0;
2268 aSig64
= - ( ( bSig
* q64
)<<38 );
2272 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
2273 q64
= ( 2 < q64
) ? q64
- 2 : 0;
2274 q
= q64
>>( 64 - expDiff
);
2276 aSig
= ( ( aSig64
>>33 )<<( expDiff
- 1 ) ) - bSig
* q
;
2279 alternateASig
= aSig
;
2282 } while ( 0 <= (int32_t) aSig
);
2283 sigMean
= aSig
+ alternateASig
;
2284 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
2285 aSig
= alternateASig
;
2287 zSign
= ( (int32_t) aSig
< 0 );
2288 if ( zSign
) aSig
= - aSig
;
2289 return normalizeRoundAndPackFloat32( aSign
^ zSign
, bExp
, aSig STATUS_VAR
);
2293 /*----------------------------------------------------------------------------
2294 | Returns the result of multiplying the single-precision floating-point values
2295 | `a' and `b' then adding 'c', with no intermediate rounding step after the
2296 | multiplication. The operation is performed according to the IEC/IEEE
2297 | Standard for Binary Floating-Point Arithmetic 754-2008.
2298 | The flags argument allows the caller to select negation of the
2299 | addend, the intermediate product, or the final result. (The difference
2300 | between this and having the caller do a separate negation is that negating
2301 | externally will flip the sign bit on NaNs.)
2302 *----------------------------------------------------------------------------*/
2304 float32
float32_muladd(float32 a
, float32 b
, float32 c
, int flags STATUS_PARAM
)
2306 flag aSign
, bSign
, cSign
, zSign
;
2307 int_fast16_t aExp
, bExp
, cExp
, pExp
, zExp
, expDiff
;
2308 uint32_t aSig
, bSig
, cSig
;
2309 flag pInf
, pZero
, pSign
;
2310 uint64_t pSig64
, cSig64
, zSig64
;
2313 flag signflip
, infzero
;
2315 a
= float32_squash_input_denormal(a STATUS_VAR
);
2316 b
= float32_squash_input_denormal(b STATUS_VAR
);
2317 c
= float32_squash_input_denormal(c STATUS_VAR
);
2318 aSig
= extractFloat32Frac(a
);
2319 aExp
= extractFloat32Exp(a
);
2320 aSign
= extractFloat32Sign(a
);
2321 bSig
= extractFloat32Frac(b
);
2322 bExp
= extractFloat32Exp(b
);
2323 bSign
= extractFloat32Sign(b
);
2324 cSig
= extractFloat32Frac(c
);
2325 cExp
= extractFloat32Exp(c
);
2326 cSign
= extractFloat32Sign(c
);
2328 infzero
= ((aExp
== 0 && aSig
== 0 && bExp
== 0xff && bSig
== 0) ||
2329 (aExp
== 0xff && aSig
== 0 && bExp
== 0 && bSig
== 0));
2331 /* It is implementation-defined whether the cases of (0,inf,qnan)
2332 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
2333 * they return if they do), so we have to hand this information
2334 * off to the target-specific pick-a-NaN routine.
2336 if (((aExp
== 0xff) && aSig
) ||
2337 ((bExp
== 0xff) && bSig
) ||
2338 ((cExp
== 0xff) && cSig
)) {
2339 return propagateFloat32MulAddNaN(a
, b
, c
, infzero STATUS_VAR
);
2343 float_raise(float_flag_invalid STATUS_VAR
);
2344 return float32_default_nan
;
2347 if (flags
& float_muladd_negate_c
) {
2351 signflip
= (flags
& float_muladd_negate_result
) ? 1 : 0;
2353 /* Work out the sign and type of the product */
2354 pSign
= aSign
^ bSign
;
2355 if (flags
& float_muladd_negate_product
) {
2358 pInf
= (aExp
== 0xff) || (bExp
== 0xff);
2359 pZero
= ((aExp
| aSig
) == 0) || ((bExp
| bSig
) == 0);
2362 if (pInf
&& (pSign
^ cSign
)) {
2363 /* addition of opposite-signed infinities => InvalidOperation */
2364 float_raise(float_flag_invalid STATUS_VAR
);
2365 return float32_default_nan
;
2367 /* Otherwise generate an infinity of the same sign */
2368 return packFloat32(cSign
^ signflip
, 0xff, 0);
2372 return packFloat32(pSign
^ signflip
, 0xff, 0);
2378 /* Adding two exact zeroes */
2379 if (pSign
== cSign
) {
2381 } else if (STATUS(float_rounding_mode
) == float_round_down
) {
2386 return packFloat32(zSign
^ signflip
, 0, 0);
2388 /* Exact zero plus a denorm */
2389 if (STATUS(flush_to_zero
)) {
2390 float_raise(float_flag_output_denormal STATUS_VAR
);
2391 return packFloat32(cSign
^ signflip
, 0, 0);
2394 /* Zero plus something non-zero : just return the something */
2395 if (flags
& float_muladd_halve_result
) {
2397 normalizeFloat32Subnormal(cSig
, &cExp
, &cSig
);
2399 /* Subtract one to halve, and one again because roundAndPackFloat32
2400 * wants one less than the true exponent.
2403 cSig
= (cSig
| 0x00800000) << 7;
2404 return roundAndPackFloat32(cSign
^ signflip
, cExp
, cSig STATUS_VAR
);
2406 return packFloat32(cSign
^ signflip
, cExp
, cSig
);
2410 normalizeFloat32Subnormal(aSig
, &aExp
, &aSig
);
2413 normalizeFloat32Subnormal(bSig
, &bExp
, &bSig
);
2416 /* Calculate the actual result a * b + c */
2418 /* Multiply first; this is easy. */
2419 /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
2420 * because we want the true exponent, not the "one-less-than"
2421 * flavour that roundAndPackFloat32() takes.
2423 pExp
= aExp
+ bExp
- 0x7e;
2424 aSig
= (aSig
| 0x00800000) << 7;
2425 bSig
= (bSig
| 0x00800000) << 8;
2426 pSig64
= (uint64_t)aSig
* bSig
;
2427 if ((int64_t)(pSig64
<< 1) >= 0) {
2432 zSign
= pSign
^ signflip
;
2434 /* Now pSig64 is the significand of the multiply, with the explicit bit in
2439 /* Throw out the special case of c being an exact zero now */
2440 shift64RightJamming(pSig64
, 32, &pSig64
);
2442 if (flags
& float_muladd_halve_result
) {
2445 return roundAndPackFloat32(zSign
, pExp
- 1,
2448 normalizeFloat32Subnormal(cSig
, &cExp
, &cSig
);
2451 cSig64
= (uint64_t)cSig
<< (62 - 23);
2452 cSig64
|= LIT64(0x4000000000000000);
2453 expDiff
= pExp
- cExp
;
2455 if (pSign
== cSign
) {
2458 /* scale c to match p */
2459 shift64RightJamming(cSig64
, expDiff
, &cSig64
);
2461 } else if (expDiff
< 0) {
2462 /* scale p to match c */
2463 shift64RightJamming(pSig64
, -expDiff
, &pSig64
);
2466 /* no scaling needed */
2469 /* Add significands and make sure explicit bit ends up in posn 62 */
2470 zSig64
= pSig64
+ cSig64
;
2471 if ((int64_t)zSig64
< 0) {
2472 shift64RightJamming(zSig64
, 1, &zSig64
);
2479 shift64RightJamming(cSig64
, expDiff
, &cSig64
);
2480 zSig64
= pSig64
- cSig64
;
2482 } else if (expDiff
< 0) {
2483 shift64RightJamming(pSig64
, -expDiff
, &pSig64
);
2484 zSig64
= cSig64
- pSig64
;
2489 if (cSig64
< pSig64
) {
2490 zSig64
= pSig64
- cSig64
;
2491 } else if (pSig64
< cSig64
) {
2492 zSig64
= cSig64
- pSig64
;
2497 if (STATUS(float_rounding_mode
) == float_round_down
) {
2500 return packFloat32(zSign
, 0, 0);
2504 /* Normalize to put the explicit bit back into bit 62. */
2505 shiftcount
= countLeadingZeros64(zSig64
) - 1;
2506 zSig64
<<= shiftcount
;
2509 if (flags
& float_muladd_halve_result
) {
2513 shift64RightJamming(zSig64
, 32, &zSig64
);
2514 return roundAndPackFloat32(zSign
, zExp
, zSig64 STATUS_VAR
);
2518 /*----------------------------------------------------------------------------
2519 | Returns the square root of the single-precision floating-point value `a'.
2520 | The operation is performed according to the IEC/IEEE Standard for Binary
2521 | Floating-Point Arithmetic.
2522 *----------------------------------------------------------------------------*/
2524 float32
float32_sqrt( float32 a STATUS_PARAM
)
2527 int_fast16_t aExp
, zExp
;
2528 uint32_t aSig
, zSig
;
2530 a
= float32_squash_input_denormal(a STATUS_VAR
);
2532 aSig
= extractFloat32Frac( a
);
2533 aExp
= extractFloat32Exp( a
);
2534 aSign
= extractFloat32Sign( a
);
2535 if ( aExp
== 0xFF ) {
2536 if ( aSig
) return propagateFloat32NaN( a
, float32_zero STATUS_VAR
);
2537 if ( ! aSign
) return a
;
2538 float_raise( float_flag_invalid STATUS_VAR
);
2539 return float32_default_nan
;
2542 if ( ( aExp
| aSig
) == 0 ) return a
;
2543 float_raise( float_flag_invalid STATUS_VAR
);
2544 return float32_default_nan
;
2547 if ( aSig
== 0 ) return float32_zero
;
2548 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
2550 zExp
= ( ( aExp
- 0x7F )>>1 ) + 0x7E;
2551 aSig
= ( aSig
| 0x00800000 )<<8;
2552 zSig
= estimateSqrt32( aExp
, aSig
) + 2;
2553 if ( ( zSig
& 0x7F ) <= 5 ) {
2559 term
= ( (uint64_t) zSig
) * zSig
;
2560 rem
= ( ( (uint64_t) aSig
)<<32 ) - term
;
2561 while ( (int64_t) rem
< 0 ) {
2563 rem
+= ( ( (uint64_t) zSig
)<<1 ) | 1;
2565 zSig
|= ( rem
!= 0 );
2567 shift32RightJamming( zSig
, 1, &zSig
);
2569 return roundAndPackFloat32( 0, zExp
, zSig STATUS_VAR
);
2573 /*----------------------------------------------------------------------------
2574 | Returns the binary exponential of the single-precision floating-point value
2575 | `a'. The operation is performed according to the IEC/IEEE Standard for
2576 | Binary Floating-Point Arithmetic.
2578 | Uses the following identities:
2580 | 1. -------------------------------------------------------------------------
2584 | 2. -------------------------------------------------------------------------
2587 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2589 *----------------------------------------------------------------------------*/
2591 static const float64 float32_exp2_coefficients
[15] =
2593 const_float64( 0x3ff0000000000000ll
), /* 1 */
2594 const_float64( 0x3fe0000000000000ll
), /* 2 */
2595 const_float64( 0x3fc5555555555555ll
), /* 3 */
2596 const_float64( 0x3fa5555555555555ll
), /* 4 */
2597 const_float64( 0x3f81111111111111ll
), /* 5 */
2598 const_float64( 0x3f56c16c16c16c17ll
), /* 6 */
2599 const_float64( 0x3f2a01a01a01a01all
), /* 7 */
2600 const_float64( 0x3efa01a01a01a01all
), /* 8 */
2601 const_float64( 0x3ec71de3a556c734ll
), /* 9 */
2602 const_float64( 0x3e927e4fb7789f5cll
), /* 10 */
2603 const_float64( 0x3e5ae64567f544e4ll
), /* 11 */
2604 const_float64( 0x3e21eed8eff8d898ll
), /* 12 */
2605 const_float64( 0x3de6124613a86d09ll
), /* 13 */
2606 const_float64( 0x3da93974a8c07c9dll
), /* 14 */
2607 const_float64( 0x3d6ae7f3e733b81fll
), /* 15 */
2610 float32
float32_exp2( float32 a STATUS_PARAM
)
2617 a
= float32_squash_input_denormal(a STATUS_VAR
);
2619 aSig
= extractFloat32Frac( a
);
2620 aExp
= extractFloat32Exp( a
);
2621 aSign
= extractFloat32Sign( a
);
2623 if ( aExp
== 0xFF) {
2624 if ( aSig
) return propagateFloat32NaN( a
, float32_zero STATUS_VAR
);
2625 return (aSign
) ? float32_zero
: a
;
2628 if (aSig
== 0) return float32_one
;
2631 float_raise( float_flag_inexact STATUS_VAR
);
2633 /* ******************************* */
2634 /* using float64 for approximation */
2635 /* ******************************* */
2636 x
= float32_to_float64(a STATUS_VAR
);
2637 x
= float64_mul(x
, float64_ln2 STATUS_VAR
);
2641 for (i
= 0 ; i
< 15 ; i
++) {
2644 f
= float64_mul(xn
, float32_exp2_coefficients
[i
] STATUS_VAR
);
2645 r
= float64_add(r
, f STATUS_VAR
);
2647 xn
= float64_mul(xn
, x STATUS_VAR
);
2650 return float64_to_float32(r
, status
);
2653 /*----------------------------------------------------------------------------
2654 | Returns the binary log of the single-precision floating-point value `a'.
2655 | The operation is performed according to the IEC/IEEE Standard for Binary
2656 | Floating-Point Arithmetic.
2657 *----------------------------------------------------------------------------*/
2658 float32
float32_log2( float32 a STATUS_PARAM
)
2662 uint32_t aSig
, zSig
, i
;
2664 a
= float32_squash_input_denormal(a STATUS_VAR
);
2665 aSig
= extractFloat32Frac( a
);
2666 aExp
= extractFloat32Exp( a
);
2667 aSign
= extractFloat32Sign( a
);
2670 if ( aSig
== 0 ) return packFloat32( 1, 0xFF, 0 );
2671 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
2674 float_raise( float_flag_invalid STATUS_VAR
);
2675 return float32_default_nan
;
2677 if ( aExp
== 0xFF ) {
2678 if ( aSig
) return propagateFloat32NaN( a
, float32_zero STATUS_VAR
);
2687 for (i
= 1 << 22; i
> 0; i
>>= 1) {
2688 aSig
= ( (uint64_t)aSig
* aSig
) >> 23;
2689 if ( aSig
& 0x01000000 ) {
2698 return normalizeRoundAndPackFloat32( zSign
, 0x85, zSig STATUS_VAR
);
2701 /*----------------------------------------------------------------------------
2702 | Returns 1 if the single-precision floating-point value `a' is equal to
2703 | the corresponding value `b', and 0 otherwise. The invalid exception is
2704 | raised if either operand is a NaN. Otherwise, the comparison is performed
2705 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2706 *----------------------------------------------------------------------------*/
2708 int float32_eq( float32 a
, float32 b STATUS_PARAM
)
2711 a
= float32_squash_input_denormal(a STATUS_VAR
);
2712 b
= float32_squash_input_denormal(b STATUS_VAR
);
2714 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2715 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2717 float_raise( float_flag_invalid STATUS_VAR
);
2720 av
= float32_val(a
);
2721 bv
= float32_val(b
);
2722 return ( av
== bv
) || ( (uint32_t) ( ( av
| bv
)<<1 ) == 0 );
2725 /*----------------------------------------------------------------------------
2726 | Returns 1 if the single-precision floating-point value `a' is less than
2727 | or equal to the corresponding value `b', and 0 otherwise. The invalid
2728 | exception is raised if either operand is a NaN. The comparison is performed
2729 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2730 *----------------------------------------------------------------------------*/
2732 int float32_le( float32 a
, float32 b STATUS_PARAM
)
2736 a
= float32_squash_input_denormal(a STATUS_VAR
);
2737 b
= float32_squash_input_denormal(b STATUS_VAR
);
2739 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2740 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2742 float_raise( float_flag_invalid STATUS_VAR
);
2745 aSign
= extractFloat32Sign( a
);
2746 bSign
= extractFloat32Sign( b
);
2747 av
= float32_val(a
);
2748 bv
= float32_val(b
);
2749 if ( aSign
!= bSign
) return aSign
|| ( (uint32_t) ( ( av
| bv
)<<1 ) == 0 );
2750 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
2754 /*----------------------------------------------------------------------------
2755 | Returns 1 if the single-precision floating-point value `a' is less than
2756 | the corresponding value `b', and 0 otherwise. The invalid exception is
2757 | raised if either operand is a NaN. The comparison is performed according
2758 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2759 *----------------------------------------------------------------------------*/
2761 int float32_lt( float32 a
, float32 b STATUS_PARAM
)
2765 a
= float32_squash_input_denormal(a STATUS_VAR
);
2766 b
= float32_squash_input_denormal(b STATUS_VAR
);
2768 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2769 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2771 float_raise( float_flag_invalid STATUS_VAR
);
2774 aSign
= extractFloat32Sign( a
);
2775 bSign
= extractFloat32Sign( b
);
2776 av
= float32_val(a
);
2777 bv
= float32_val(b
);
2778 if ( aSign
!= bSign
) return aSign
&& ( (uint32_t) ( ( av
| bv
)<<1 ) != 0 );
2779 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
2783 /*----------------------------------------------------------------------------
2784 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2785 | be compared, and 0 otherwise. The invalid exception is raised if either
2786 | operand is a NaN. The comparison is performed according to the IEC/IEEE
2787 | Standard for Binary Floating-Point Arithmetic.
2788 *----------------------------------------------------------------------------*/
2790 int float32_unordered( float32 a
, float32 b STATUS_PARAM
)
2792 a
= float32_squash_input_denormal(a STATUS_VAR
);
2793 b
= float32_squash_input_denormal(b STATUS_VAR
);
2795 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2796 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2798 float_raise( float_flag_invalid STATUS_VAR
);
2804 /*----------------------------------------------------------------------------
2805 | Returns 1 if the single-precision floating-point value `a' is equal to
2806 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2807 | exception. The comparison is performed according to the IEC/IEEE Standard
2808 | for Binary Floating-Point Arithmetic.
2809 *----------------------------------------------------------------------------*/
2811 int float32_eq_quiet( float32 a
, float32 b STATUS_PARAM
)
2813 a
= float32_squash_input_denormal(a STATUS_VAR
);
2814 b
= float32_squash_input_denormal(b STATUS_VAR
);
2816 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2817 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2819 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
2820 float_raise( float_flag_invalid STATUS_VAR
);
2824 return ( float32_val(a
) == float32_val(b
) ) ||
2825 ( (uint32_t) ( ( float32_val(a
) | float32_val(b
) )<<1 ) == 0 );
2828 /*----------------------------------------------------------------------------
2829 | Returns 1 if the single-precision floating-point value `a' is less than or
2830 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2831 | cause an exception. Otherwise, the comparison is performed according to the
2832 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2833 *----------------------------------------------------------------------------*/
2835 int float32_le_quiet( float32 a
, float32 b STATUS_PARAM
)
2839 a
= float32_squash_input_denormal(a STATUS_VAR
);
2840 b
= float32_squash_input_denormal(b STATUS_VAR
);
2842 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2843 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2845 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
2846 float_raise( float_flag_invalid STATUS_VAR
);
2850 aSign
= extractFloat32Sign( a
);
2851 bSign
= extractFloat32Sign( b
);
2852 av
= float32_val(a
);
2853 bv
= float32_val(b
);
2854 if ( aSign
!= bSign
) return aSign
|| ( (uint32_t) ( ( av
| bv
)<<1 ) == 0 );
2855 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
2859 /*----------------------------------------------------------------------------
2860 | Returns 1 if the single-precision floating-point value `a' is less than
2861 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2862 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
2863 | Standard for Binary Floating-Point Arithmetic.
2864 *----------------------------------------------------------------------------*/
2866 int float32_lt_quiet( float32 a
, float32 b STATUS_PARAM
)
2870 a
= float32_squash_input_denormal(a STATUS_VAR
);
2871 b
= float32_squash_input_denormal(b STATUS_VAR
);
2873 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2874 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2876 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
2877 float_raise( float_flag_invalid STATUS_VAR
);
2881 aSign
= extractFloat32Sign( a
);
2882 bSign
= extractFloat32Sign( b
);
2883 av
= float32_val(a
);
2884 bv
= float32_val(b
);
2885 if ( aSign
!= bSign
) return aSign
&& ( (uint32_t) ( ( av
| bv
)<<1 ) != 0 );
2886 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
2890 /*----------------------------------------------------------------------------
2891 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2892 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
2893 | comparison is performed according to the IEC/IEEE Standard for Binary
2894 | Floating-Point Arithmetic.
2895 *----------------------------------------------------------------------------*/
2897 int float32_unordered_quiet( float32 a
, float32 b STATUS_PARAM
)
2899 a
= float32_squash_input_denormal(a STATUS_VAR
);
2900 b
= float32_squash_input_denormal(b STATUS_VAR
);
2902 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2903 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2905 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
2906 float_raise( float_flag_invalid STATUS_VAR
);
2913 /*----------------------------------------------------------------------------
2914 | Returns the result of converting the double-precision floating-point value
2915 | `a' to the 32-bit two's complement integer format. The conversion is
2916 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2917 | Arithmetic---which means in particular that the conversion is rounded
2918 | according to the current rounding mode. If `a' is a NaN, the largest
2919 | positive integer is returned. Otherwise, if the conversion overflows, the
2920 | largest integer with the same sign as `a' is returned.
2921 *----------------------------------------------------------------------------*/
2923 int32
float64_to_int32( float64 a STATUS_PARAM
)
2926 int_fast16_t aExp
, shiftCount
;
2928 a
= float64_squash_input_denormal(a STATUS_VAR
);
2930 aSig
= extractFloat64Frac( a
);
2931 aExp
= extractFloat64Exp( a
);
2932 aSign
= extractFloat64Sign( a
);
2933 if ( ( aExp
== 0x7FF ) && aSig
) aSign
= 0;
2934 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
2935 shiftCount
= 0x42C - aExp
;
2936 if ( 0 < shiftCount
) shift64RightJamming( aSig
, shiftCount
, &aSig
);
2937 return roundAndPackInt32( aSign
, aSig STATUS_VAR
);
2941 /*----------------------------------------------------------------------------
2942 | Returns the result of converting the double-precision floating-point value
2943 | `a' to the 32-bit two's complement integer format. The conversion is
2944 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2945 | Arithmetic, except that the conversion is always rounded toward zero.
2946 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2947 | the conversion overflows, the largest integer with the same sign as `a' is
2949 *----------------------------------------------------------------------------*/
2951 int32
float64_to_int32_round_to_zero( float64 a STATUS_PARAM
)
2954 int_fast16_t aExp
, shiftCount
;
2955 uint64_t aSig
, savedASig
;
2957 a
= float64_squash_input_denormal(a STATUS_VAR
);
2959 aSig
= extractFloat64Frac( a
);
2960 aExp
= extractFloat64Exp( a
);
2961 aSign
= extractFloat64Sign( a
);
2962 if ( 0x41E < aExp
) {
2963 if ( ( aExp
== 0x7FF ) && aSig
) aSign
= 0;
2966 else if ( aExp
< 0x3FF ) {
2967 if ( aExp
|| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
2970 aSig
|= LIT64( 0x0010000000000000 );
2971 shiftCount
= 0x433 - aExp
;
2973 aSig
>>= shiftCount
;
2975 if ( aSign
) z
= - z
;
2976 if ( ( z
< 0 ) ^ aSign
) {
2978 float_raise( float_flag_invalid STATUS_VAR
);
2979 return aSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
2981 if ( ( aSig
<<shiftCount
) != savedASig
) {
2982 STATUS(float_exception_flags
) |= float_flag_inexact
;
2988 /*----------------------------------------------------------------------------
2989 | Returns the result of converting the double-precision floating-point value
2990 | `a' to the 16-bit two's complement integer format. The conversion is
2991 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2992 | Arithmetic, except that the conversion is always rounded toward zero.
2993 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2994 | the conversion overflows, the largest integer with the same sign as `a' is
2996 *----------------------------------------------------------------------------*/
2998 int_fast16_t float64_to_int16_round_to_zero(float64 a STATUS_PARAM
)
3001 int_fast16_t aExp
, shiftCount
;
3002 uint64_t aSig
, savedASig
;
3005 aSig
= extractFloat64Frac( a
);
3006 aExp
= extractFloat64Exp( a
);
3007 aSign
= extractFloat64Sign( a
);
3008 if ( 0x40E < aExp
) {
3009 if ( ( aExp
== 0x7FF ) && aSig
) {
3014 else if ( aExp
< 0x3FF ) {
3015 if ( aExp
|| aSig
) {
3016 STATUS(float_exception_flags
) |= float_flag_inexact
;
3020 aSig
|= LIT64( 0x0010000000000000 );
3021 shiftCount
= 0x433 - aExp
;
3023 aSig
>>= shiftCount
;
3028 if ( ( (int16_t)z
< 0 ) ^ aSign
) {
3030 float_raise( float_flag_invalid STATUS_VAR
);
3031 return aSign
? (int32_t) 0xffff8000 : 0x7FFF;
3033 if ( ( aSig
<<shiftCount
) != savedASig
) {
3034 STATUS(float_exception_flags
) |= float_flag_inexact
;
3039 /*----------------------------------------------------------------------------
3040 | Returns the result of converting the double-precision floating-point value
3041 | `a' to the 64-bit two's complement integer format. The conversion is
3042 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3043 | Arithmetic---which means in particular that the conversion is rounded
3044 | according to the current rounding mode. If `a' is a NaN, the largest
3045 | positive integer is returned. Otherwise, if the conversion overflows, the
3046 | largest integer with the same sign as `a' is returned.
3047 *----------------------------------------------------------------------------*/
3049 int64
float64_to_int64( float64 a STATUS_PARAM
)
3052 int_fast16_t aExp
, shiftCount
;
3053 uint64_t aSig
, aSigExtra
;
3054 a
= float64_squash_input_denormal(a STATUS_VAR
);
3056 aSig
= extractFloat64Frac( a
);
3057 aExp
= extractFloat64Exp( a
);
3058 aSign
= extractFloat64Sign( a
);
3059 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
3060 shiftCount
= 0x433 - aExp
;
3061 if ( shiftCount
<= 0 ) {
3062 if ( 0x43E < aExp
) {
3063 float_raise( float_flag_invalid STATUS_VAR
);
3065 || ( ( aExp
== 0x7FF )
3066 && ( aSig
!= LIT64( 0x0010000000000000 ) ) )
3068 return LIT64( 0x7FFFFFFFFFFFFFFF );
3070 return (int64_t) LIT64( 0x8000000000000000 );
3073 aSig
<<= - shiftCount
;
3076 shift64ExtraRightJamming( aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
3078 return roundAndPackInt64( aSign
, aSig
, aSigExtra STATUS_VAR
);
3082 /*----------------------------------------------------------------------------
3083 | Returns the result of converting the double-precision floating-point value
3084 | `a' to the 64-bit two's complement integer format. The conversion is
3085 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3086 | Arithmetic, except that the conversion is always rounded toward zero.
3087 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
3088 | the conversion overflows, the largest integer with the same sign as `a' is
3090 *----------------------------------------------------------------------------*/
3092 int64
float64_to_int64_round_to_zero( float64 a STATUS_PARAM
)
3095 int_fast16_t aExp
, shiftCount
;
3098 a
= float64_squash_input_denormal(a STATUS_VAR
);
3100 aSig
= extractFloat64Frac( a
);
3101 aExp
= extractFloat64Exp( a
);
3102 aSign
= extractFloat64Sign( a
);
3103 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
3104 shiftCount
= aExp
- 0x433;
3105 if ( 0 <= shiftCount
) {
3106 if ( 0x43E <= aExp
) {
3107 if ( float64_val(a
) != LIT64( 0xC3E0000000000000 ) ) {
3108 float_raise( float_flag_invalid STATUS_VAR
);
3110 || ( ( aExp
== 0x7FF )
3111 && ( aSig
!= LIT64( 0x0010000000000000 ) ) )
3113 return LIT64( 0x7FFFFFFFFFFFFFFF );
3116 return (int64_t) LIT64( 0x8000000000000000 );
3118 z
= aSig
<<shiftCount
;
3121 if ( aExp
< 0x3FE ) {
3122 if ( aExp
| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
3125 z
= aSig
>>( - shiftCount
);
3126 if ( (uint64_t) ( aSig
<<( shiftCount
& 63 ) ) ) {
3127 STATUS(float_exception_flags
) |= float_flag_inexact
;
3130 if ( aSign
) z
= - z
;
3135 /*----------------------------------------------------------------------------
3136 | Returns the result of converting the double-precision floating-point value
3137 | `a' to the single-precision floating-point format. The conversion is
3138 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3140 *----------------------------------------------------------------------------*/
3142 float32
float64_to_float32( float64 a STATUS_PARAM
)
3148 a
= float64_squash_input_denormal(a STATUS_VAR
);
3150 aSig
= extractFloat64Frac( a
);
3151 aExp
= extractFloat64Exp( a
);
3152 aSign
= extractFloat64Sign( a
);
3153 if ( aExp
== 0x7FF ) {
3154 if ( aSig
) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
3155 return packFloat32( aSign
, 0xFF, 0 );
3157 shift64RightJamming( aSig
, 22, &aSig
);
3159 if ( aExp
|| zSig
) {
3163 return roundAndPackFloat32( aSign
, aExp
, zSig STATUS_VAR
);
3168 /*----------------------------------------------------------------------------
3169 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3170 | half-precision floating-point value, returning the result. After being
3171 | shifted into the proper positions, the three fields are simply added
3172 | together to form the result. This means that any integer portion of `zSig'
3173 | will be added into the exponent. Since a properly normalized significand
3174 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3175 | than the desired result exponent whenever `zSig' is a complete, normalized
3177 *----------------------------------------------------------------------------*/
3178 static float16
packFloat16(flag zSign
, int_fast16_t zExp
, uint16_t zSig
)
3180 return make_float16(
3181 (((uint32_t)zSign
) << 15) + (((uint32_t)zExp
) << 10) + zSig
);
3184 /*----------------------------------------------------------------------------
3185 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3186 | and significand `zSig', and returns the proper half-precision floating-
3187 | point value corresponding to the abstract input. Ordinarily, the abstract
3188 | value is simply rounded and packed into the half-precision format, with
3189 | the inexact exception raised if the abstract input cannot be represented
3190 | exactly. However, if the abstract value is too large, the overflow and
3191 | inexact exceptions are raised and an infinity or maximal finite value is
3192 | returned. If the abstract value is too small, the input value is rounded to
3193 | a subnormal number, and the underflow and inexact exceptions are raised if
3194 | the abstract input cannot be represented exactly as a subnormal half-
3195 | precision floating-point number.
3196 | The `ieee' flag indicates whether to use IEEE standard half precision, or
3197 | ARM-style "alternative representation", which omits the NaN and Inf
3198 | encodings in order to raise the maximum representable exponent by one.
3199 | The input significand `zSig' has its binary point between bits 22
3200 | and 23, which is 13 bits to the left of the usual location. This shifted
3201 | significand must be normalized or smaller. If `zSig' is not normalized,
3202 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3203 | and it must not require rounding. In the usual case that `zSig' is
3204 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3205 | Note the slightly odd position of the binary point in zSig compared with the
3206 | other roundAndPackFloat functions. This should probably be fixed if we
3207 | need to implement more float16 routines than just conversion.
3208 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3209 | Binary Floating-Point Arithmetic.
3210 *----------------------------------------------------------------------------*/
3212 static float32
roundAndPackFloat16(flag zSign
, int_fast16_t zExp
,
3213 uint32_t zSig
, flag ieee STATUS_PARAM
)
3215 int maxexp
= ieee
? 29 : 30;
3218 bool rounding_bumps_exp
;
3219 bool is_tiny
= false;
3221 /* Calculate the mask of bits of the mantissa which are not
3222 * representable in half-precision and will be lost.
3225 /* Will be denormal in halfprec */
3231 /* Normal number in halfprec */
3235 switch (STATUS(float_rounding_mode
)) {
3236 case float_round_nearest_even
:
3237 increment
= (mask
+ 1) >> 1;
3238 if ((zSig
& mask
) == increment
) {
3239 increment
= zSig
& (increment
<< 1);
3242 case float_round_ties_away
:
3243 increment
= (mask
+ 1) >> 1;
3245 case float_round_up
:
3246 increment
= zSign
? 0 : mask
;
3248 case float_round_down
:
3249 increment
= zSign
? mask
: 0;
3251 default: /* round_to_zero */
3256 rounding_bumps_exp
= (zSig
+ increment
>= 0x01000000);
3258 if (zExp
> maxexp
|| (zExp
== maxexp
&& rounding_bumps_exp
)) {
3260 float_raise(float_flag_overflow
| float_flag_inexact STATUS_VAR
);
3261 return packFloat16(zSign
, 0x1f, 0);
3263 float_raise(float_flag_invalid STATUS_VAR
);
3264 return packFloat16(zSign
, 0x1f, 0x3ff);
3269 /* Note that flush-to-zero does not affect half-precision results */
3271 (STATUS(float_detect_tininess
) == float_tininess_before_rounding
)
3273 || (!rounding_bumps_exp
);
3276 float_raise(float_flag_inexact STATUS_VAR
);
3278 float_raise(float_flag_underflow STATUS_VAR
);
3283 if (rounding_bumps_exp
) {
3289 return packFloat16(zSign
, 0, 0);
3295 return packFloat16(zSign
, zExp
, zSig
>> 13);
3298 static void normalizeFloat16Subnormal(uint32_t aSig
, int_fast16_t *zExpPtr
,
3301 int8_t shiftCount
= countLeadingZeros32(aSig
) - 21;
3302 *zSigPtr
= aSig
<< shiftCount
;
3303 *zExpPtr
= 1 - shiftCount
;
3306 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
3307 The latter gains extra exponent range by omitting the NaN/Inf encodings. */
3309 float32
float16_to_float32(float16 a
, flag ieee STATUS_PARAM
)
3315 aSign
= extractFloat16Sign(a
);
3316 aExp
= extractFloat16Exp(a
);
3317 aSig
= extractFloat16Frac(a
);
3319 if (aExp
== 0x1f && ieee
) {
3321 return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR
) STATUS_VAR
);
3323 return packFloat32(aSign
, 0xff, 0);
3327 return packFloat32(aSign
, 0, 0);
3330 normalizeFloat16Subnormal(aSig
, &aExp
, &aSig
);
3333 return packFloat32( aSign
, aExp
+ 0x70, aSig
<< 13);
3336 float16
float32_to_float16(float32 a
, flag ieee STATUS_PARAM
)
3342 a
= float32_squash_input_denormal(a STATUS_VAR
);
3344 aSig
= extractFloat32Frac( a
);
3345 aExp
= extractFloat32Exp( a
);
3346 aSign
= extractFloat32Sign( a
);
3347 if ( aExp
== 0xFF ) {
3349 /* Input is a NaN */
3351 float_raise(float_flag_invalid STATUS_VAR
);
3352 return packFloat16(aSign
, 0, 0);
3354 return commonNaNToFloat16(
3355 float32ToCommonNaN(a STATUS_VAR
) STATUS_VAR
);
3359 float_raise(float_flag_invalid STATUS_VAR
);
3360 return packFloat16(aSign
, 0x1f, 0x3ff);
3362 return packFloat16(aSign
, 0x1f, 0);
3364 if (aExp
== 0 && aSig
== 0) {
3365 return packFloat16(aSign
, 0, 0);
3367 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3368 * even if the input is denormal; however this is harmless because
3369 * the largest possible single-precision denormal is still smaller
3370 * than the smallest representable half-precision denormal, and so we
3371 * will end up ignoring aSig and returning via the "always return zero"
3377 return roundAndPackFloat16(aSign
, aExp
, aSig
, ieee STATUS_VAR
);
3380 float64
float16_to_float64(float16 a
, flag ieee STATUS_PARAM
)
3386 aSign
= extractFloat16Sign(a
);
3387 aExp
= extractFloat16Exp(a
);
3388 aSig
= extractFloat16Frac(a
);
3390 if (aExp
== 0x1f && ieee
) {
3392 return commonNaNToFloat64(
3393 float16ToCommonNaN(a STATUS_VAR
) STATUS_VAR
);
3395 return packFloat64(aSign
, 0x7ff, 0);
3399 return packFloat64(aSign
, 0, 0);
3402 normalizeFloat16Subnormal(aSig
, &aExp
, &aSig
);
3405 return packFloat64(aSign
, aExp
+ 0x3f0, ((uint64_t)aSig
) << 42);
3408 float16
float64_to_float16(float64 a
, flag ieee STATUS_PARAM
)
3415 a
= float64_squash_input_denormal(a STATUS_VAR
);
3417 aSig
= extractFloat64Frac(a
);
3418 aExp
= extractFloat64Exp(a
);
3419 aSign
= extractFloat64Sign(a
);
3420 if (aExp
== 0x7FF) {
3422 /* Input is a NaN */
3424 float_raise(float_flag_invalid STATUS_VAR
);
3425 return packFloat16(aSign
, 0, 0);
3427 return commonNaNToFloat16(
3428 float64ToCommonNaN(a STATUS_VAR
) STATUS_VAR
);
3432 float_raise(float_flag_invalid STATUS_VAR
);
3433 return packFloat16(aSign
, 0x1f, 0x3ff);
3435 return packFloat16(aSign
, 0x1f, 0);
3437 shift64RightJamming(aSig
, 29, &aSig
);
3439 if (aExp
== 0 && zSig
== 0) {
3440 return packFloat16(aSign
, 0, 0);
3442 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3443 * even if the input is denormal; however this is harmless because
3444 * the largest possible single-precision denormal is still smaller
3445 * than the smallest representable half-precision denormal, and so we
3446 * will end up ignoring aSig and returning via the "always return zero"
3452 return roundAndPackFloat16(aSign
, aExp
, zSig
, ieee STATUS_VAR
);
3455 /*----------------------------------------------------------------------------
3456 | Returns the result of converting the double-precision floating-point value
3457 | `a' to the extended double-precision floating-point format. The conversion
3458 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3460 *----------------------------------------------------------------------------*/
3462 floatx80
float64_to_floatx80( float64 a STATUS_PARAM
)
3468 a
= float64_squash_input_denormal(a STATUS_VAR
);
3469 aSig
= extractFloat64Frac( a
);
3470 aExp
= extractFloat64Exp( a
);
3471 aSign
= extractFloat64Sign( a
);
3472 if ( aExp
== 0x7FF ) {
3473 if ( aSig
) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
3474 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
3477 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
3478 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
3482 aSign
, aExp
+ 0x3C00, ( aSig
| LIT64( 0x0010000000000000 ) )<<11 );
3486 /*----------------------------------------------------------------------------
3487 | Returns the result of converting the double-precision floating-point value
3488 | `a' to the quadruple-precision floating-point format. The conversion is
3489 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3491 *----------------------------------------------------------------------------*/
3493 float128
float64_to_float128( float64 a STATUS_PARAM
)
3497 uint64_t aSig
, zSig0
, zSig1
;
3499 a
= float64_squash_input_denormal(a STATUS_VAR
);
3500 aSig
= extractFloat64Frac( a
);
3501 aExp
= extractFloat64Exp( a
);
3502 aSign
= extractFloat64Sign( a
);
3503 if ( aExp
== 0x7FF ) {
3504 if ( aSig
) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
3505 return packFloat128( aSign
, 0x7FFF, 0, 0 );
3508 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
3509 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
3512 shift128Right( aSig
, 0, 4, &zSig0
, &zSig1
);
3513 return packFloat128( aSign
, aExp
+ 0x3C00, zSig0
, zSig1
);
3517 /*----------------------------------------------------------------------------
3518 | Rounds the double-precision floating-point value `a' to an integer, and
3519 | returns the result as a double-precision floating-point value. The
3520 | operation is performed according to the IEC/IEEE Standard for Binary
3521 | Floating-Point Arithmetic.
3522 *----------------------------------------------------------------------------*/
3524 float64
float64_round_to_int( float64 a STATUS_PARAM
)
3528 uint64_t lastBitMask
, roundBitsMask
;
3530 a
= float64_squash_input_denormal(a STATUS_VAR
);
3532 aExp
= extractFloat64Exp( a
);
3533 if ( 0x433 <= aExp
) {
3534 if ( ( aExp
== 0x7FF ) && extractFloat64Frac( a
) ) {
3535 return propagateFloat64NaN( a
, a STATUS_VAR
);
3539 if ( aExp
< 0x3FF ) {
3540 if ( (uint64_t) ( float64_val(a
)<<1 ) == 0 ) return a
;
3541 STATUS(float_exception_flags
) |= float_flag_inexact
;
3542 aSign
= extractFloat64Sign( a
);
3543 switch ( STATUS(float_rounding_mode
) ) {
3544 case float_round_nearest_even
:
3545 if ( ( aExp
== 0x3FE ) && extractFloat64Frac( a
) ) {
3546 return packFloat64( aSign
, 0x3FF, 0 );
3549 case float_round_ties_away
:
3550 if (aExp
== 0x3FE) {
3551 return packFloat64(aSign
, 0x3ff, 0);
3554 case float_round_down
:
3555 return make_float64(aSign
? LIT64( 0xBFF0000000000000 ) : 0);
3556 case float_round_up
:
3557 return make_float64(
3558 aSign
? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
3560 return packFloat64( aSign
, 0, 0 );
3563 lastBitMask
<<= 0x433 - aExp
;
3564 roundBitsMask
= lastBitMask
- 1;
3566 switch (STATUS(float_rounding_mode
)) {
3567 case float_round_nearest_even
:
3568 z
+= lastBitMask
>> 1;
3569 if ((z
& roundBitsMask
) == 0) {
3573 case float_round_ties_away
:
3574 z
+= lastBitMask
>> 1;
3576 case float_round_to_zero
:
3578 case float_round_up
:
3579 if (!extractFloat64Sign(make_float64(z
))) {
3583 case float_round_down
:
3584 if (extractFloat64Sign(make_float64(z
))) {
3591 z
&= ~ roundBitsMask
;
3592 if ( z
!= float64_val(a
) )
3593 STATUS(float_exception_flags
) |= float_flag_inexact
;
3594 return make_float64(z
);
3598 float64
float64_trunc_to_int( float64 a STATUS_PARAM
)
3602 oldmode
= STATUS(float_rounding_mode
);
3603 STATUS(float_rounding_mode
) = float_round_to_zero
;
3604 res
= float64_round_to_int(a STATUS_VAR
);
3605 STATUS(float_rounding_mode
) = oldmode
;
3609 /*----------------------------------------------------------------------------
3610 | Returns the result of adding the absolute values of the double-precision
3611 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
3612 | before being returned. `zSign' is ignored if the result is a NaN.
3613 | The addition is performed according to the IEC/IEEE Standard for Binary
3614 | Floating-Point Arithmetic.
3615 *----------------------------------------------------------------------------*/
3617 static float64
addFloat64Sigs( float64 a
, float64 b
, flag zSign STATUS_PARAM
)
3619 int_fast16_t aExp
, bExp
, zExp
;
3620 uint64_t aSig
, bSig
, zSig
;
3621 int_fast16_t expDiff
;
3623 aSig
= extractFloat64Frac( a
);
3624 aExp
= extractFloat64Exp( a
);
3625 bSig
= extractFloat64Frac( b
);
3626 bExp
= extractFloat64Exp( b
);
3627 expDiff
= aExp
- bExp
;
3630 if ( 0 < expDiff
) {
3631 if ( aExp
== 0x7FF ) {
3632 if ( aSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3639 bSig
|= LIT64( 0x2000000000000000 );
3641 shift64RightJamming( bSig
, expDiff
, &bSig
);
3644 else if ( expDiff
< 0 ) {
3645 if ( bExp
== 0x7FF ) {
3646 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3647 return packFloat64( zSign
, 0x7FF, 0 );
3653 aSig
|= LIT64( 0x2000000000000000 );
3655 shift64RightJamming( aSig
, - expDiff
, &aSig
);
3659 if ( aExp
== 0x7FF ) {
3660 if ( aSig
| bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3664 if (STATUS(flush_to_zero
)) {
3666 float_raise(float_flag_output_denormal STATUS_VAR
);
3668 return packFloat64(zSign
, 0, 0);
3670 return packFloat64( zSign
, 0, ( aSig
+ bSig
)>>9 );
3672 zSig
= LIT64( 0x4000000000000000 ) + aSig
+ bSig
;
3676 aSig
|= LIT64( 0x2000000000000000 );
3677 zSig
= ( aSig
+ bSig
)<<1;
3679 if ( (int64_t) zSig
< 0 ) {
3684 return roundAndPackFloat64( zSign
, zExp
, zSig STATUS_VAR
);
3688 /*----------------------------------------------------------------------------
3689 | Returns the result of subtracting the absolute values of the double-
3690 | precision floating-point values `a' and `b'. If `zSign' is 1, the
3691 | difference is negated before being returned. `zSign' is ignored if the
3692 | result is a NaN. The subtraction is performed according to the IEC/IEEE
3693 | Standard for Binary Floating-Point Arithmetic.
3694 *----------------------------------------------------------------------------*/
3696 static float64
subFloat64Sigs( float64 a
, float64 b
, flag zSign STATUS_PARAM
)
3698 int_fast16_t aExp
, bExp
, zExp
;
3699 uint64_t aSig
, bSig
, zSig
;
3700 int_fast16_t expDiff
;
3702 aSig
= extractFloat64Frac( a
);
3703 aExp
= extractFloat64Exp( a
);
3704 bSig
= extractFloat64Frac( b
);
3705 bExp
= extractFloat64Exp( b
);
3706 expDiff
= aExp
- bExp
;
3709 if ( 0 < expDiff
) goto aExpBigger
;
3710 if ( expDiff
< 0 ) goto bExpBigger
;
3711 if ( aExp
== 0x7FF ) {
3712 if ( aSig
| bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3713 float_raise( float_flag_invalid STATUS_VAR
);
3714 return float64_default_nan
;
3720 if ( bSig
< aSig
) goto aBigger
;
3721 if ( aSig
< bSig
) goto bBigger
;
3722 return packFloat64( STATUS(float_rounding_mode
) == float_round_down
, 0, 0 );
3724 if ( bExp
== 0x7FF ) {
3725 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3726 return packFloat64( zSign
^ 1, 0x7FF, 0 );
3732 aSig
|= LIT64( 0x4000000000000000 );
3734 shift64RightJamming( aSig
, - expDiff
, &aSig
);
3735 bSig
|= LIT64( 0x4000000000000000 );
3740 goto normalizeRoundAndPack
;
3742 if ( aExp
== 0x7FF ) {
3743 if ( aSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3750 bSig
|= LIT64( 0x4000000000000000 );
3752 shift64RightJamming( bSig
, expDiff
, &bSig
);
3753 aSig
|= LIT64( 0x4000000000000000 );
3757 normalizeRoundAndPack
:
3759 return normalizeRoundAndPackFloat64( zSign
, zExp
, zSig STATUS_VAR
);
3763 /*----------------------------------------------------------------------------
3764 | Returns the result of adding the double-precision floating-point values `a'
3765 | and `b'. The operation is performed according to the IEC/IEEE Standard for
3766 | Binary Floating-Point Arithmetic.
3767 *----------------------------------------------------------------------------*/
3769 float64
float64_add( float64 a
, float64 b STATUS_PARAM
)
3772 a
= float64_squash_input_denormal(a STATUS_VAR
);
3773 b
= float64_squash_input_denormal(b STATUS_VAR
);
3775 aSign
= extractFloat64Sign( a
);
3776 bSign
= extractFloat64Sign( b
);
3777 if ( aSign
== bSign
) {
3778 return addFloat64Sigs( a
, b
, aSign STATUS_VAR
);
3781 return subFloat64Sigs( a
, b
, aSign STATUS_VAR
);
3786 /*----------------------------------------------------------------------------
3787 | Returns the result of subtracting the double-precision floating-point values
3788 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3789 | for Binary Floating-Point Arithmetic.
3790 *----------------------------------------------------------------------------*/
3792 float64
float64_sub( float64 a
, float64 b STATUS_PARAM
)
3795 a
= float64_squash_input_denormal(a STATUS_VAR
);
3796 b
= float64_squash_input_denormal(b STATUS_VAR
);
3798 aSign
= extractFloat64Sign( a
);
3799 bSign
= extractFloat64Sign( b
);
3800 if ( aSign
== bSign
) {
3801 return subFloat64Sigs( a
, b
, aSign STATUS_VAR
);
3804 return addFloat64Sigs( a
, b
, aSign STATUS_VAR
);
3809 /*----------------------------------------------------------------------------
3810 | Returns the result of multiplying the double-precision floating-point values
3811 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3812 | for Binary Floating-Point Arithmetic.
3813 *----------------------------------------------------------------------------*/
3815 float64
float64_mul( float64 a
, float64 b STATUS_PARAM
)
3817 flag aSign
, bSign
, zSign
;
3818 int_fast16_t aExp
, bExp
, zExp
;
3819 uint64_t aSig
, bSig
, zSig0
, zSig1
;
3821 a
= float64_squash_input_denormal(a STATUS_VAR
);
3822 b
= float64_squash_input_denormal(b STATUS_VAR
);
3824 aSig
= extractFloat64Frac( a
);
3825 aExp
= extractFloat64Exp( a
);
3826 aSign
= extractFloat64Sign( a
);
3827 bSig
= extractFloat64Frac( b
);
3828 bExp
= extractFloat64Exp( b
);
3829 bSign
= extractFloat64Sign( b
);
3830 zSign
= aSign
^ bSign
;
3831 if ( aExp
== 0x7FF ) {
3832 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
3833 return propagateFloat64NaN( a
, b STATUS_VAR
);
3835 if ( ( bExp
| bSig
) == 0 ) {
3836 float_raise( float_flag_invalid STATUS_VAR
);
3837 return float64_default_nan
;
3839 return packFloat64( zSign
, 0x7FF, 0 );
3841 if ( bExp
== 0x7FF ) {
3842 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3843 if ( ( aExp
| aSig
) == 0 ) {
3844 float_raise( float_flag_invalid STATUS_VAR
);
3845 return float64_default_nan
;
3847 return packFloat64( zSign
, 0x7FF, 0 );
3850 if ( aSig
== 0 ) return packFloat64( zSign
, 0, 0 );
3851 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
3854 if ( bSig
== 0 ) return packFloat64( zSign
, 0, 0 );
3855 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
3857 zExp
= aExp
+ bExp
- 0x3FF;
3858 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<10;
3859 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
3860 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
3861 zSig0
|= ( zSig1
!= 0 );
3862 if ( 0 <= (int64_t) ( zSig0
<<1 ) ) {
3866 return roundAndPackFloat64( zSign
, zExp
, zSig0 STATUS_VAR
);
3870 /*----------------------------------------------------------------------------
3871 | Returns the result of dividing the double-precision floating-point value `a'
3872 | by the corresponding value `b'. The operation is performed according to
3873 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3874 *----------------------------------------------------------------------------*/
3876 float64
float64_div( float64 a
, float64 b STATUS_PARAM
)
3878 flag aSign
, bSign
, zSign
;
3879 int_fast16_t aExp
, bExp
, zExp
;
3880 uint64_t aSig
, bSig
, zSig
;
3881 uint64_t rem0
, rem1
;
3882 uint64_t term0
, term1
;
3883 a
= float64_squash_input_denormal(a STATUS_VAR
);
3884 b
= float64_squash_input_denormal(b STATUS_VAR
);
3886 aSig
= extractFloat64Frac( a
);
3887 aExp
= extractFloat64Exp( a
);
3888 aSign
= extractFloat64Sign( a
);
3889 bSig
= extractFloat64Frac( b
);
3890 bExp
= extractFloat64Exp( b
);
3891 bSign
= extractFloat64Sign( b
);
3892 zSign
= aSign
^ bSign
;
3893 if ( aExp
== 0x7FF ) {
3894 if ( aSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3895 if ( bExp
== 0x7FF ) {
3896 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3897 float_raise( float_flag_invalid STATUS_VAR
);
3898 return float64_default_nan
;
3900 return packFloat64( zSign
, 0x7FF, 0 );
3902 if ( bExp
== 0x7FF ) {
3903 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3904 return packFloat64( zSign
, 0, 0 );
3908 if ( ( aExp
| aSig
) == 0 ) {
3909 float_raise( float_flag_invalid STATUS_VAR
);
3910 return float64_default_nan
;
3912 float_raise( float_flag_divbyzero STATUS_VAR
);
3913 return packFloat64( zSign
, 0x7FF, 0 );
3915 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
3918 if ( aSig
== 0 ) return packFloat64( zSign
, 0, 0 );
3919 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
3921 zExp
= aExp
- bExp
+ 0x3FD;
3922 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<10;
3923 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
3924 if ( bSig
<= ( aSig
+ aSig
) ) {
3928 zSig
= estimateDiv128To64( aSig
, 0, bSig
);
3929 if ( ( zSig
& 0x1FF ) <= 2 ) {
3930 mul64To128( bSig
, zSig
, &term0
, &term1
);
3931 sub128( aSig
, 0, term0
, term1
, &rem0
, &rem1
);
3932 while ( (int64_t) rem0
< 0 ) {
3934 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
3936 zSig
|= ( rem1
!= 0 );
3938 return roundAndPackFloat64( zSign
, zExp
, zSig STATUS_VAR
);
3942 /*----------------------------------------------------------------------------
3943 | Returns the remainder of the double-precision floating-point value `a'
3944 | with respect to the corresponding value `b'. The operation is performed
3945 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3946 *----------------------------------------------------------------------------*/
3948 float64
float64_rem( float64 a
, float64 b STATUS_PARAM
)
3951 int_fast16_t aExp
, bExp
, expDiff
;
3952 uint64_t aSig
, bSig
;
3953 uint64_t q
, alternateASig
;
3956 a
= float64_squash_input_denormal(a STATUS_VAR
);
3957 b
= float64_squash_input_denormal(b STATUS_VAR
);
3958 aSig
= extractFloat64Frac( a
);
3959 aExp
= extractFloat64Exp( a
);
3960 aSign
= extractFloat64Sign( a
);
3961 bSig
= extractFloat64Frac( b
);
3962 bExp
= extractFloat64Exp( b
);
3963 if ( aExp
== 0x7FF ) {
3964 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
3965 return propagateFloat64NaN( a
, b STATUS_VAR
);
3967 float_raise( float_flag_invalid STATUS_VAR
);
3968 return float64_default_nan
;
3970 if ( bExp
== 0x7FF ) {
3971 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3976 float_raise( float_flag_invalid STATUS_VAR
);
3977 return float64_default_nan
;
3979 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
3982 if ( aSig
== 0 ) return a
;
3983 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
3985 expDiff
= aExp
- bExp
;
3986 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<11;
3987 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
3988 if ( expDiff
< 0 ) {
3989 if ( expDiff
< -1 ) return a
;
3992 q
= ( bSig
<= aSig
);
3993 if ( q
) aSig
-= bSig
;
3995 while ( 0 < expDiff
) {
3996 q
= estimateDiv128To64( aSig
, 0, bSig
);
3997 q
= ( 2 < q
) ? q
- 2 : 0;
3998 aSig
= - ( ( bSig
>>2 ) * q
);
4002 if ( 0 < expDiff
) {
4003 q
= estimateDiv128To64( aSig
, 0, bSig
);
4004 q
= ( 2 < q
) ? q
- 2 : 0;
4007 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
4014 alternateASig
= aSig
;
4017 } while ( 0 <= (int64_t) aSig
);
4018 sigMean
= aSig
+ alternateASig
;
4019 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
4020 aSig
= alternateASig
;
4022 zSign
= ( (int64_t) aSig
< 0 );
4023 if ( zSign
) aSig
= - aSig
;
4024 return normalizeRoundAndPackFloat64( aSign
^ zSign
, bExp
, aSig STATUS_VAR
);
4028 /*----------------------------------------------------------------------------
4029 | Returns the result of multiplying the double-precision floating-point values
4030 | `a' and `b' then adding 'c', with no intermediate rounding step after the
4031 | multiplication. The operation is performed according to the IEC/IEEE
4032 | Standard for Binary Floating-Point Arithmetic 754-2008.
4033 | The flags argument allows the caller to select negation of the
4034 | addend, the intermediate product, or the final result. (The difference
4035 | between this and having the caller do a separate negation is that negating
4036 | externally will flip the sign bit on NaNs.)
4037 *----------------------------------------------------------------------------*/
4039 float64
float64_muladd(float64 a
, float64 b
, float64 c
, int flags STATUS_PARAM
)
4041 flag aSign
, bSign
, cSign
, zSign
;
4042 int_fast16_t aExp
, bExp
, cExp
, pExp
, zExp
, expDiff
;
4043 uint64_t aSig
, bSig
, cSig
;
4044 flag pInf
, pZero
, pSign
;
4045 uint64_t pSig0
, pSig1
, cSig0
, cSig1
, zSig0
, zSig1
;
4047 flag signflip
, infzero
;
4049 a
= float64_squash_input_denormal(a STATUS_VAR
);
4050 b
= float64_squash_input_denormal(b STATUS_VAR
);
4051 c
= float64_squash_input_denormal(c STATUS_VAR
);
4052 aSig
= extractFloat64Frac(a
);
4053 aExp
= extractFloat64Exp(a
);
4054 aSign
= extractFloat64Sign(a
);
4055 bSig
= extractFloat64Frac(b
);
4056 bExp
= extractFloat64Exp(b
);
4057 bSign
= extractFloat64Sign(b
);
4058 cSig
= extractFloat64Frac(c
);
4059 cExp
= extractFloat64Exp(c
);
4060 cSign
= extractFloat64Sign(c
);
4062 infzero
= ((aExp
== 0 && aSig
== 0 && bExp
== 0x7ff && bSig
== 0) ||
4063 (aExp
== 0x7ff && aSig
== 0 && bExp
== 0 && bSig
== 0));
4065 /* It is implementation-defined whether the cases of (0,inf,qnan)
4066 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
4067 * they return if they do), so we have to hand this information
4068 * off to the target-specific pick-a-NaN routine.
4070 if (((aExp
== 0x7ff) && aSig
) ||
4071 ((bExp
== 0x7ff) && bSig
) ||
4072 ((cExp
== 0x7ff) && cSig
)) {
4073 return propagateFloat64MulAddNaN(a
, b
, c
, infzero STATUS_VAR
);
4077 float_raise(float_flag_invalid STATUS_VAR
);
4078 return float64_default_nan
;
4081 if (flags
& float_muladd_negate_c
) {
4085 signflip
= (flags
& float_muladd_negate_result
) ? 1 : 0;
4087 /* Work out the sign and type of the product */
4088 pSign
= aSign
^ bSign
;
4089 if (flags
& float_muladd_negate_product
) {
4092 pInf
= (aExp
== 0x7ff) || (bExp
== 0x7ff);
4093 pZero
= ((aExp
| aSig
) == 0) || ((bExp
| bSig
) == 0);
4095 if (cExp
== 0x7ff) {
4096 if (pInf
&& (pSign
^ cSign
)) {
4097 /* addition of opposite-signed infinities => InvalidOperation */
4098 float_raise(float_flag_invalid STATUS_VAR
);
4099 return float64_default_nan
;
4101 /* Otherwise generate an infinity of the same sign */
4102 return packFloat64(cSign
^ signflip
, 0x7ff, 0);
4106 return packFloat64(pSign
^ signflip
, 0x7ff, 0);
4112 /* Adding two exact zeroes */
4113 if (pSign
== cSign
) {
4115 } else if (STATUS(float_rounding_mode
) == float_round_down
) {
4120 return packFloat64(zSign
^ signflip
, 0, 0);
4122 /* Exact zero plus a denorm */
4123 if (STATUS(flush_to_zero
)) {
4124 float_raise(float_flag_output_denormal STATUS_VAR
);
4125 return packFloat64(cSign
^ signflip
, 0, 0);
4128 /* Zero plus something non-zero : just return the something */
4129 if (flags
& float_muladd_halve_result
) {
4131 normalizeFloat64Subnormal(cSig
, &cExp
, &cSig
);
4133 /* Subtract one to halve, and one again because roundAndPackFloat64
4134 * wants one less than the true exponent.
4137 cSig
= (cSig
| 0x0010000000000000ULL
) << 10;
4138 return roundAndPackFloat64(cSign
^ signflip
, cExp
, cSig STATUS_VAR
);
4140 return packFloat64(cSign
^ signflip
, cExp
, cSig
);
4144 normalizeFloat64Subnormal(aSig
, &aExp
, &aSig
);
4147 normalizeFloat64Subnormal(bSig
, &bExp
, &bSig
);
4150 /* Calculate the actual result a * b + c */
4152 /* Multiply first; this is easy. */
4153 /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff
4154 * because we want the true exponent, not the "one-less-than"
4155 * flavour that roundAndPackFloat64() takes.
4157 pExp
= aExp
+ bExp
- 0x3fe;
4158 aSig
= (aSig
| LIT64(0x0010000000000000))<<10;
4159 bSig
= (bSig
| LIT64(0x0010000000000000))<<11;
4160 mul64To128(aSig
, bSig
, &pSig0
, &pSig1
);
4161 if ((int64_t)(pSig0
<< 1) >= 0) {
4162 shortShift128Left(pSig0
, pSig1
, 1, &pSig0
, &pSig1
);
4166 zSign
= pSign
^ signflip
;
4168 /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit
4169 * bit in position 126.
4173 /* Throw out the special case of c being an exact zero now */
4174 shift128RightJamming(pSig0
, pSig1
, 64, &pSig0
, &pSig1
);
4175 if (flags
& float_muladd_halve_result
) {
4178 return roundAndPackFloat64(zSign
, pExp
- 1,
4181 normalizeFloat64Subnormal(cSig
, &cExp
, &cSig
);
4184 /* Shift cSig and add the explicit bit so [cSig0:cSig1] is the
4185 * significand of the addend, with the explicit bit in position 126.
4187 cSig0
= cSig
<< (126 - 64 - 52);
4189 cSig0
|= LIT64(0x4000000000000000);
4190 expDiff
= pExp
- cExp
;
4192 if (pSign
== cSign
) {
4195 /* scale c to match p */
4196 shift128RightJamming(cSig0
, cSig1
, expDiff
, &cSig0
, &cSig1
);
4198 } else if (expDiff
< 0) {
4199 /* scale p to match c */
4200 shift128RightJamming(pSig0
, pSig1
, -expDiff
, &pSig0
, &pSig1
);
4203 /* no scaling needed */
4206 /* Add significands and make sure explicit bit ends up in posn 126 */
4207 add128(pSig0
, pSig1
, cSig0
, cSig1
, &zSig0
, &zSig1
);
4208 if ((int64_t)zSig0
< 0) {
4209 shift128RightJamming(zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
4213 shift128RightJamming(zSig0
, zSig1
, 64, &zSig0
, &zSig1
);
4214 if (flags
& float_muladd_halve_result
) {
4217 return roundAndPackFloat64(zSign
, zExp
, zSig1 STATUS_VAR
);
4221 shift128RightJamming(cSig0
, cSig1
, expDiff
, &cSig0
, &cSig1
);
4222 sub128(pSig0
, pSig1
, cSig0
, cSig1
, &zSig0
, &zSig1
);
4224 } else if (expDiff
< 0) {
4225 shift128RightJamming(pSig0
, pSig1
, -expDiff
, &pSig0
, &pSig1
);
4226 sub128(cSig0
, cSig1
, pSig0
, pSig1
, &zSig0
, &zSig1
);
4231 if (lt128(cSig0
, cSig1
, pSig0
, pSig1
)) {
4232 sub128(pSig0
, pSig1
, cSig0
, cSig1
, &zSig0
, &zSig1
);
4233 } else if (lt128(pSig0
, pSig1
, cSig0
, cSig1
)) {
4234 sub128(cSig0
, cSig1
, pSig0
, pSig1
, &zSig0
, &zSig1
);
4239 if (STATUS(float_rounding_mode
) == float_round_down
) {
4242 return packFloat64(zSign
, 0, 0);
4246 /* Do the equivalent of normalizeRoundAndPackFloat64() but
4247 * starting with the significand in a pair of uint64_t.
4250 shiftcount
= countLeadingZeros64(zSig0
) - 1;
4251 shortShift128Left(zSig0
, zSig1
, shiftcount
, &zSig0
, &zSig1
);
4257 shiftcount
= countLeadingZeros64(zSig1
);
4258 if (shiftcount
== 0) {
4259 zSig0
= (zSig1
>> 1) | (zSig1
& 1);
4263 zSig0
= zSig1
<< shiftcount
;
4264 zExp
-= (shiftcount
+ 64);
4267 if (flags
& float_muladd_halve_result
) {
4270 return roundAndPackFloat64(zSign
, zExp
, zSig0 STATUS_VAR
);
4274 /*----------------------------------------------------------------------------
4275 | Returns the square root of the double-precision floating-point value `a'.
4276 | The operation is performed according to the IEC/IEEE Standard for Binary
4277 | Floating-Point Arithmetic.
4278 *----------------------------------------------------------------------------*/
4280 float64
float64_sqrt( float64 a STATUS_PARAM
)
4283 int_fast16_t aExp
, zExp
;
4284 uint64_t aSig
, zSig
, doubleZSig
;
4285 uint64_t rem0
, rem1
, term0
, term1
;
4286 a
= float64_squash_input_denormal(a STATUS_VAR
);
4288 aSig
= extractFloat64Frac( a
);
4289 aExp
= extractFloat64Exp( a
);
4290 aSign
= extractFloat64Sign( a
);
4291 if ( aExp
== 0x7FF ) {
4292 if ( aSig
) return propagateFloat64NaN( a
, a STATUS_VAR
);
4293 if ( ! aSign
) return a
;
4294 float_raise( float_flag_invalid STATUS_VAR
);
4295 return float64_default_nan
;
4298 if ( ( aExp
| aSig
) == 0 ) return a
;
4299 float_raise( float_flag_invalid STATUS_VAR
);
4300 return float64_default_nan
;
4303 if ( aSig
== 0 ) return float64_zero
;
4304 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
4306 zExp
= ( ( aExp
- 0x3FF )>>1 ) + 0x3FE;
4307 aSig
|= LIT64( 0x0010000000000000 );
4308 zSig
= estimateSqrt32( aExp
, aSig
>>21 );
4309 aSig
<<= 9 - ( aExp
& 1 );
4310 zSig
= estimateDiv128To64( aSig
, 0, zSig
<<32 ) + ( zSig
<<30 );
4311 if ( ( zSig
& 0x1FF ) <= 5 ) {
4312 doubleZSig
= zSig
<<1;
4313 mul64To128( zSig
, zSig
, &term0
, &term1
);
4314 sub128( aSig
, 0, term0
, term1
, &rem0
, &rem1
);
4315 while ( (int64_t) rem0
< 0 ) {
4318 add128( rem0
, rem1
, zSig
>>63, doubleZSig
| 1, &rem0
, &rem1
);
4320 zSig
|= ( ( rem0
| rem1
) != 0 );
4322 return roundAndPackFloat64( 0, zExp
, zSig STATUS_VAR
);
4326 /*----------------------------------------------------------------------------
4327 | Returns the binary log of the double-precision floating-point value `a'.
4328 | The operation is performed according to the IEC/IEEE Standard for Binary
4329 | Floating-Point Arithmetic.
4330 *----------------------------------------------------------------------------*/
4331 float64
float64_log2( float64 a STATUS_PARAM
)
4335 uint64_t aSig
, aSig0
, aSig1
, zSig
, i
;
4336 a
= float64_squash_input_denormal(a STATUS_VAR
);
4338 aSig
= extractFloat64Frac( a
);
4339 aExp
= extractFloat64Exp( a
);
4340 aSign
= extractFloat64Sign( a
);
4343 if ( aSig
== 0 ) return packFloat64( 1, 0x7FF, 0 );
4344 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
4347 float_raise( float_flag_invalid STATUS_VAR
);
4348 return float64_default_nan
;
4350 if ( aExp
== 0x7FF ) {
4351 if ( aSig
) return propagateFloat64NaN( a
, float64_zero STATUS_VAR
);
4356 aSig
|= LIT64( 0x0010000000000000 );
4358 zSig
= (uint64_t)aExp
<< 52;
4359 for (i
= 1LL << 51; i
> 0; i
>>= 1) {
4360 mul64To128( aSig
, aSig
, &aSig0
, &aSig1
);
4361 aSig
= ( aSig0
<< 12 ) | ( aSig1
>> 52 );
4362 if ( aSig
& LIT64( 0x0020000000000000 ) ) {
4370 return normalizeRoundAndPackFloat64( zSign
, 0x408, zSig STATUS_VAR
);
4373 /*----------------------------------------------------------------------------
4374 | Returns 1 if the double-precision floating-point value `a' is equal to the
4375 | corresponding value `b', and 0 otherwise. The invalid exception is raised
4376 | if either operand is a NaN. Otherwise, the comparison is performed
4377 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4378 *----------------------------------------------------------------------------*/
4380 int float64_eq( float64 a
, float64 b STATUS_PARAM
)
4383 a
= float64_squash_input_denormal(a STATUS_VAR
);
4384 b
= float64_squash_input_denormal(b STATUS_VAR
);
4386 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4387 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4389 float_raise( float_flag_invalid STATUS_VAR
);
4392 av
= float64_val(a
);
4393 bv
= float64_val(b
);
4394 return ( av
== bv
) || ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
4398 /*----------------------------------------------------------------------------
4399 | Returns 1 if the double-precision floating-point value `a' is less than or
4400 | equal to the corresponding value `b', and 0 otherwise. The invalid
4401 | exception is raised if either operand is a NaN. The comparison is performed
4402 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4403 *----------------------------------------------------------------------------*/
4405 int float64_le( float64 a
, float64 b STATUS_PARAM
)
4409 a
= float64_squash_input_denormal(a STATUS_VAR
);
4410 b
= float64_squash_input_denormal(b STATUS_VAR
);
4412 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4413 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4415 float_raise( float_flag_invalid STATUS_VAR
);
4418 aSign
= extractFloat64Sign( a
);
4419 bSign
= extractFloat64Sign( b
);
4420 av
= float64_val(a
);
4421 bv
= float64_val(b
);
4422 if ( aSign
!= bSign
) return aSign
|| ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
4423 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
4427 /*----------------------------------------------------------------------------
4428 | Returns 1 if the double-precision floating-point value `a' is less than
4429 | the corresponding value `b', and 0 otherwise. The invalid exception is
4430 | raised if either operand is a NaN. The comparison is performed according
4431 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4432 *----------------------------------------------------------------------------*/
4434 int float64_lt( float64 a
, float64 b STATUS_PARAM
)
4439 a
= float64_squash_input_denormal(a STATUS_VAR
);
4440 b
= float64_squash_input_denormal(b STATUS_VAR
);
4441 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4442 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4444 float_raise( float_flag_invalid STATUS_VAR
);
4447 aSign
= extractFloat64Sign( a
);
4448 bSign
= extractFloat64Sign( b
);
4449 av
= float64_val(a
);
4450 bv
= float64_val(b
);
4451 if ( aSign
!= bSign
) return aSign
&& ( (uint64_t) ( ( av
| bv
)<<1 ) != 0 );
4452 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
4456 /*----------------------------------------------------------------------------
4457 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4458 | be compared, and 0 otherwise. The invalid exception is raised if either
4459 | operand is a NaN. The comparison is performed according to the IEC/IEEE
4460 | Standard for Binary Floating-Point Arithmetic.
4461 *----------------------------------------------------------------------------*/
4463 int float64_unordered( float64 a
, float64 b STATUS_PARAM
)
4465 a
= float64_squash_input_denormal(a STATUS_VAR
);
4466 b
= float64_squash_input_denormal(b STATUS_VAR
);
4468 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4469 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4471 float_raise( float_flag_invalid STATUS_VAR
);
4477 /*----------------------------------------------------------------------------
4478 | Returns 1 if the double-precision floating-point value `a' is equal to the
4479 | corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4480 | exception.The comparison is performed according to the IEC/IEEE Standard
4481 | for Binary Floating-Point Arithmetic.
4482 *----------------------------------------------------------------------------*/
4484 int float64_eq_quiet( float64 a
, float64 b STATUS_PARAM
)
4487 a
= float64_squash_input_denormal(a STATUS_VAR
);
4488 b
= float64_squash_input_denormal(b STATUS_VAR
);
4490 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4491 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4493 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
4494 float_raise( float_flag_invalid STATUS_VAR
);
4498 av
= float64_val(a
);
4499 bv
= float64_val(b
);
4500 return ( av
== bv
) || ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
4504 /*----------------------------------------------------------------------------
4505 | Returns 1 if the double-precision floating-point value `a' is less than or
4506 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4507 | cause an exception. Otherwise, the comparison is performed according to the
4508 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4509 *----------------------------------------------------------------------------*/
4511 int float64_le_quiet( float64 a
, float64 b STATUS_PARAM
)
4515 a
= float64_squash_input_denormal(a STATUS_VAR
);
4516 b
= float64_squash_input_denormal(b STATUS_VAR
);
4518 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4519 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4521 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
4522 float_raise( float_flag_invalid STATUS_VAR
);
4526 aSign
= extractFloat64Sign( a
);
4527 bSign
= extractFloat64Sign( b
);
4528 av
= float64_val(a
);
4529 bv
= float64_val(b
);
4530 if ( aSign
!= bSign
) return aSign
|| ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
4531 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
4535 /*----------------------------------------------------------------------------
4536 | Returns 1 if the double-precision floating-point value `a' is less than
4537 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4538 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
4539 | Standard for Binary Floating-Point Arithmetic.
4540 *----------------------------------------------------------------------------*/
4542 int float64_lt_quiet( float64 a
, float64 b STATUS_PARAM
)
4546 a
= float64_squash_input_denormal(a STATUS_VAR
);
4547 b
= float64_squash_input_denormal(b STATUS_VAR
);
4549 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4550 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4552 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
4553 float_raise( float_flag_invalid STATUS_VAR
);
4557 aSign
= extractFloat64Sign( a
);
4558 bSign
= extractFloat64Sign( b
);
4559 av
= float64_val(a
);
4560 bv
= float64_val(b
);
4561 if ( aSign
!= bSign
) return aSign
&& ( (uint64_t) ( ( av
| bv
)<<1 ) != 0 );
4562 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
4566 /*----------------------------------------------------------------------------
4567 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4568 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
4569 | comparison is performed according to the IEC/IEEE Standard for Binary
4570 | Floating-Point Arithmetic.
4571 *----------------------------------------------------------------------------*/
4573 int float64_unordered_quiet( float64 a
, float64 b STATUS_PARAM
)
4575 a
= float64_squash_input_denormal(a STATUS_VAR
);
4576 b
= float64_squash_input_denormal(b STATUS_VAR
);
4578 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4579 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4581 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
4582 float_raise( float_flag_invalid STATUS_VAR
);
4589 /*----------------------------------------------------------------------------
4590 | Returns the result of converting the extended double-precision floating-
4591 | point value `a' to the 32-bit two's complement integer format. The
4592 | conversion is performed according to the IEC/IEEE Standard for Binary
4593 | Floating-Point Arithmetic---which means in particular that the conversion
4594 | is rounded according to the current rounding mode. If `a' is a NaN, the
4595 | largest positive integer is returned. Otherwise, if the conversion
4596 | overflows, the largest integer with the same sign as `a' is returned.
4597 *----------------------------------------------------------------------------*/
4599 int32
floatx80_to_int32( floatx80 a STATUS_PARAM
)
4602 int32 aExp
, shiftCount
;
4605 aSig
= extractFloatx80Frac( a
);
4606 aExp
= extractFloatx80Exp( a
);
4607 aSign
= extractFloatx80Sign( a
);
4608 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) aSign
= 0;
4609 shiftCount
= 0x4037 - aExp
;
4610 if ( shiftCount
<= 0 ) shiftCount
= 1;
4611 shift64RightJamming( aSig
, shiftCount
, &aSig
);
4612 return roundAndPackInt32( aSign
, aSig STATUS_VAR
);
4616 /*----------------------------------------------------------------------------
4617 | Returns the result of converting the extended double-precision floating-
4618 | point value `a' to the 32-bit two's complement integer format. The
4619 | conversion is performed according to the IEC/IEEE Standard for Binary
4620 | Floating-Point Arithmetic, except that the conversion is always rounded
4621 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4622 | Otherwise, if the conversion overflows, the largest integer with the same
4623 | sign as `a' is returned.
4624 *----------------------------------------------------------------------------*/
4626 int32
floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM
)
4629 int32 aExp
, shiftCount
;
4630 uint64_t aSig
, savedASig
;
4633 aSig
= extractFloatx80Frac( a
);
4634 aExp
= extractFloatx80Exp( a
);
4635 aSign
= extractFloatx80Sign( a
);
4636 if ( 0x401E < aExp
) {
4637 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) aSign
= 0;
4640 else if ( aExp
< 0x3FFF ) {
4641 if ( aExp
|| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
4644 shiftCount
= 0x403E - aExp
;
4646 aSig
>>= shiftCount
;
4648 if ( aSign
) z
= - z
;
4649 if ( ( z
< 0 ) ^ aSign
) {
4651 float_raise( float_flag_invalid STATUS_VAR
);
4652 return aSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
4654 if ( ( aSig
<<shiftCount
) != savedASig
) {
4655 STATUS(float_exception_flags
) |= float_flag_inexact
;
4661 /*----------------------------------------------------------------------------
4662 | Returns the result of converting the extended double-precision floating-
4663 | point value `a' to the 64-bit two's complement integer format. The
4664 | conversion is performed according to the IEC/IEEE Standard for Binary
4665 | Floating-Point Arithmetic---which means in particular that the conversion
4666 | is rounded according to the current rounding mode. If `a' is a NaN,
4667 | the largest positive integer is returned. Otherwise, if the conversion
4668 | overflows, the largest integer with the same sign as `a' is returned.
4669 *----------------------------------------------------------------------------*/
4671 int64
floatx80_to_int64( floatx80 a STATUS_PARAM
)
4674 int32 aExp
, shiftCount
;
4675 uint64_t aSig
, aSigExtra
;
4677 aSig
= extractFloatx80Frac( a
);
4678 aExp
= extractFloatx80Exp( a
);
4679 aSign
= extractFloatx80Sign( a
);
4680 shiftCount
= 0x403E - aExp
;
4681 if ( shiftCount
<= 0 ) {
4683 float_raise( float_flag_invalid STATUS_VAR
);
4685 || ( ( aExp
== 0x7FFF )
4686 && ( aSig
!= LIT64( 0x8000000000000000 ) ) )
4688 return LIT64( 0x7FFFFFFFFFFFFFFF );
4690 return (int64_t) LIT64( 0x8000000000000000 );
4695 shift64ExtraRightJamming( aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
4697 return roundAndPackInt64( aSign
, aSig
, aSigExtra STATUS_VAR
);
4701 /*----------------------------------------------------------------------------
4702 | Returns the result of converting the extended double-precision floating-
4703 | point value `a' to the 64-bit two's complement integer format. The
4704 | conversion is performed according to the IEC/IEEE Standard for Binary
4705 | Floating-Point Arithmetic, except that the conversion is always rounded
4706 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4707 | Otherwise, if the conversion overflows, the largest integer with the same
4708 | sign as `a' is returned.
4709 *----------------------------------------------------------------------------*/
4711 int64
floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM
)
4714 int32 aExp
, shiftCount
;
4718 aSig
= extractFloatx80Frac( a
);
4719 aExp
= extractFloatx80Exp( a
);
4720 aSign
= extractFloatx80Sign( a
);
4721 shiftCount
= aExp
- 0x403E;
4722 if ( 0 <= shiftCount
) {
4723 aSig
&= LIT64( 0x7FFFFFFFFFFFFFFF );
4724 if ( ( a
.high
!= 0xC03E ) || aSig
) {
4725 float_raise( float_flag_invalid STATUS_VAR
);
4726 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && aSig
) ) {
4727 return LIT64( 0x7FFFFFFFFFFFFFFF );
4730 return (int64_t) LIT64( 0x8000000000000000 );
4732 else if ( aExp
< 0x3FFF ) {
4733 if ( aExp
| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
4736 z
= aSig
>>( - shiftCount
);
4737 if ( (uint64_t) ( aSig
<<( shiftCount
& 63 ) ) ) {
4738 STATUS(float_exception_flags
) |= float_flag_inexact
;
4740 if ( aSign
) z
= - z
;
4745 /*----------------------------------------------------------------------------
4746 | Returns the result of converting the extended double-precision floating-
4747 | point value `a' to the single-precision floating-point format. The
4748 | conversion is performed according to the IEC/IEEE Standard for Binary
4749 | Floating-Point Arithmetic.
4750 *----------------------------------------------------------------------------*/
4752 float32
floatx80_to_float32( floatx80 a STATUS_PARAM
)
4758 aSig
= extractFloatx80Frac( a
);
4759 aExp
= extractFloatx80Exp( a
);
4760 aSign
= extractFloatx80Sign( a
);
4761 if ( aExp
== 0x7FFF ) {
4762 if ( (uint64_t) ( aSig
<<1 ) ) {
4763 return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
4765 return packFloat32( aSign
, 0xFF, 0 );
4767 shift64RightJamming( aSig
, 33, &aSig
);
4768 if ( aExp
|| aSig
) aExp
-= 0x3F81;
4769 return roundAndPackFloat32( aSign
, aExp
, aSig STATUS_VAR
);
4773 /*----------------------------------------------------------------------------
4774 | Returns the result of converting the extended double-precision floating-
4775 | point value `a' to the double-precision floating-point format. The
4776 | conversion is performed according to the IEC/IEEE Standard for Binary
4777 | Floating-Point Arithmetic.
4778 *----------------------------------------------------------------------------*/
4780 float64
floatx80_to_float64( floatx80 a STATUS_PARAM
)
4784 uint64_t aSig
, zSig
;
4786 aSig
= extractFloatx80Frac( a
);
4787 aExp
= extractFloatx80Exp( a
);
4788 aSign
= extractFloatx80Sign( a
);
4789 if ( aExp
== 0x7FFF ) {
4790 if ( (uint64_t) ( aSig
<<1 ) ) {
4791 return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
4793 return packFloat64( aSign
, 0x7FF, 0 );
4795 shift64RightJamming( aSig
, 1, &zSig
);
4796 if ( aExp
|| aSig
) aExp
-= 0x3C01;
4797 return roundAndPackFloat64( aSign
, aExp
, zSig STATUS_VAR
);
4801 /*----------------------------------------------------------------------------
4802 | Returns the result of converting the extended double-precision floating-
4803 | point value `a' to the quadruple-precision floating-point format. The
4804 | conversion is performed according to the IEC/IEEE Standard for Binary
4805 | Floating-Point Arithmetic.
4806 *----------------------------------------------------------------------------*/
4808 float128
floatx80_to_float128( floatx80 a STATUS_PARAM
)
4812 uint64_t aSig
, zSig0
, zSig1
;
4814 aSig
= extractFloatx80Frac( a
);
4815 aExp
= extractFloatx80Exp( a
);
4816 aSign
= extractFloatx80Sign( a
);
4817 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) {
4818 return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
4820 shift128Right( aSig
<<1, 0, 16, &zSig0
, &zSig1
);
4821 return packFloat128( aSign
, aExp
, zSig0
, zSig1
);
4825 /*----------------------------------------------------------------------------
4826 | Rounds the extended double-precision floating-point value `a' to an integer,
4827 | and returns the result as an extended quadruple-precision floating-point
4828 | value. The operation is performed according to the IEC/IEEE Standard for
4829 | Binary Floating-Point Arithmetic.
4830 *----------------------------------------------------------------------------*/
4832 floatx80
floatx80_round_to_int( floatx80 a STATUS_PARAM
)
4836 uint64_t lastBitMask
, roundBitsMask
;
4839 aExp
= extractFloatx80Exp( a
);
4840 if ( 0x403E <= aExp
) {
4841 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) ) {
4842 return propagateFloatx80NaN( a
, a STATUS_VAR
);
4846 if ( aExp
< 0x3FFF ) {
4848 && ( (uint64_t) ( extractFloatx80Frac( a
)<<1 ) == 0 ) ) {
4851 STATUS(float_exception_flags
) |= float_flag_inexact
;
4852 aSign
= extractFloatx80Sign( a
);
4853 switch ( STATUS(float_rounding_mode
) ) {
4854 case float_round_nearest_even
:
4855 if ( ( aExp
== 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a
)<<1 )
4858 packFloatx80( aSign
, 0x3FFF, LIT64( 0x8000000000000000 ) );
4861 case float_round_ties_away
:
4862 if (aExp
== 0x3FFE) {
4863 return packFloatx80(aSign
, 0x3FFF, LIT64(0x8000000000000000));
4866 case float_round_down
:
4869 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4870 : packFloatx80( 0, 0, 0 );
4871 case float_round_up
:
4873 aSign
? packFloatx80( 1, 0, 0 )
4874 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4876 return packFloatx80( aSign
, 0, 0 );
4879 lastBitMask
<<= 0x403E - aExp
;
4880 roundBitsMask
= lastBitMask
- 1;
4882 switch (STATUS(float_rounding_mode
)) {
4883 case float_round_nearest_even
:
4884 z
.low
+= lastBitMask
>>1;
4885 if ((z
.low
& roundBitsMask
) == 0) {
4886 z
.low
&= ~lastBitMask
;
4889 case float_round_ties_away
:
4890 z
.low
+= lastBitMask
>> 1;
4892 case float_round_to_zero
:
4894 case float_round_up
:
4895 if (!extractFloatx80Sign(z
)) {
4896 z
.low
+= roundBitsMask
;
4899 case float_round_down
:
4900 if (extractFloatx80Sign(z
)) {
4901 z
.low
+= roundBitsMask
;
4907 z
.low
&= ~ roundBitsMask
;
4910 z
.low
= LIT64( 0x8000000000000000 );
4912 if ( z
.low
!= a
.low
) STATUS(float_exception_flags
) |= float_flag_inexact
;
4917 /*----------------------------------------------------------------------------
4918 | Returns the result of adding the absolute values of the extended double-
4919 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
4920 | negated before being returned. `zSign' is ignored if the result is a NaN.
4921 | The addition is performed according to the IEC/IEEE Standard for Binary
4922 | Floating-Point Arithmetic.
4923 *----------------------------------------------------------------------------*/
4925 static floatx80
addFloatx80Sigs( floatx80 a
, floatx80 b
, flag zSign STATUS_PARAM
)
4927 int32 aExp
, bExp
, zExp
;
4928 uint64_t aSig
, bSig
, zSig0
, zSig1
;
4931 aSig
= extractFloatx80Frac( a
);
4932 aExp
= extractFloatx80Exp( a
);
4933 bSig
= extractFloatx80Frac( b
);
4934 bExp
= extractFloatx80Exp( b
);
4935 expDiff
= aExp
- bExp
;
4936 if ( 0 < expDiff
) {
4937 if ( aExp
== 0x7FFF ) {
4938 if ( (uint64_t) ( aSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
4941 if ( bExp
== 0 ) --expDiff
;
4942 shift64ExtraRightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
4945 else if ( expDiff
< 0 ) {
4946 if ( bExp
== 0x7FFF ) {
4947 if ( (uint64_t) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
4948 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
4950 if ( aExp
== 0 ) ++expDiff
;
4951 shift64ExtraRightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
4955 if ( aExp
== 0x7FFF ) {
4956 if ( (uint64_t) ( ( aSig
| bSig
)<<1 ) ) {
4957 return propagateFloatx80NaN( a
, b STATUS_VAR
);
4962 zSig0
= aSig
+ bSig
;
4964 normalizeFloatx80Subnormal( zSig0
, &zExp
, &zSig0
);
4970 zSig0
= aSig
+ bSig
;
4971 if ( (int64_t) zSig0
< 0 ) goto roundAndPack
;
4973 shift64ExtraRightJamming( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
4974 zSig0
|= LIT64( 0x8000000000000000 );
4978 roundAndPackFloatx80(
4979 STATUS(floatx80_rounding_precision
), zSign
, zExp
, zSig0
, zSig1 STATUS_VAR
);
4983 /*----------------------------------------------------------------------------
4984 | Returns the result of subtracting the absolute values of the extended
4985 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
4986 | difference is negated before being returned. `zSign' is ignored if the
4987 | result is a NaN. The subtraction is performed according to the IEC/IEEE
4988 | Standard for Binary Floating-Point Arithmetic.
4989 *----------------------------------------------------------------------------*/
4991 static floatx80
subFloatx80Sigs( floatx80 a
, floatx80 b
, flag zSign STATUS_PARAM
)
4993 int32 aExp
, bExp
, zExp
;
4994 uint64_t aSig
, bSig
, zSig0
, zSig1
;
4998 aSig
= extractFloatx80Frac( a
);
4999 aExp
= extractFloatx80Exp( a
);
5000 bSig
= extractFloatx80Frac( b
);
5001 bExp
= extractFloatx80Exp( b
);
5002 expDiff
= aExp
- bExp
;
5003 if ( 0 < expDiff
) goto aExpBigger
;
5004 if ( expDiff
< 0 ) goto bExpBigger
;
5005 if ( aExp
== 0x7FFF ) {
5006 if ( (uint64_t) ( ( aSig
| bSig
)<<1 ) ) {
5007 return propagateFloatx80NaN( a
, b STATUS_VAR
);
5009 float_raise( float_flag_invalid STATUS_VAR
);
5010 z
.low
= floatx80_default_nan_low
;
5011 z
.high
= floatx80_default_nan_high
;
5019 if ( bSig
< aSig
) goto aBigger
;
5020 if ( aSig
< bSig
) goto bBigger
;
5021 return packFloatx80( STATUS(float_rounding_mode
) == float_round_down
, 0, 0 );
5023 if ( bExp
== 0x7FFF ) {
5024 if ( (uint64_t) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
5025 return packFloatx80( zSign
^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
5027 if ( aExp
== 0 ) ++expDiff
;
5028 shift128RightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
5030 sub128( bSig
, 0, aSig
, zSig1
, &zSig0
, &zSig1
);
5033 goto normalizeRoundAndPack
;
5035 if ( aExp
== 0x7FFF ) {
5036 if ( (uint64_t) ( aSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
5039 if ( bExp
== 0 ) --expDiff
;
5040 shift128RightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
5042 sub128( aSig
, 0, bSig
, zSig1
, &zSig0
, &zSig1
);
5044 normalizeRoundAndPack
:
5046 normalizeRoundAndPackFloatx80(
5047 STATUS(floatx80_rounding_precision
), zSign
, zExp
, zSig0
, zSig1 STATUS_VAR
);
5051 /*----------------------------------------------------------------------------
5052 | Returns the result of adding the extended double-precision floating-point
5053 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5054 | Standard for Binary Floating-Point Arithmetic.
5055 *----------------------------------------------------------------------------*/
5057 floatx80
floatx80_add( floatx80 a
, floatx80 b STATUS_PARAM
)
5061 aSign
= extractFloatx80Sign( a
);
5062 bSign
= extractFloatx80Sign( b
);
5063 if ( aSign
== bSign
) {
5064 return addFloatx80Sigs( a
, b
, aSign STATUS_VAR
);
5067 return subFloatx80Sigs( a
, b
, aSign STATUS_VAR
);
5072 /*----------------------------------------------------------------------------
5073 | Returns the result of subtracting the extended double-precision floating-
5074 | point values `a' and `b'. The operation is performed according to the
5075 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5076 *----------------------------------------------------------------------------*/
5078 floatx80
floatx80_sub( floatx80 a
, floatx80 b STATUS_PARAM
)
5082 aSign
= extractFloatx80Sign( a
);
5083 bSign
= extractFloatx80Sign( b
);
5084 if ( aSign
== bSign
) {
5085 return subFloatx80Sigs( a
, b
, aSign STATUS_VAR
);
5088 return addFloatx80Sigs( a
, b
, aSign STATUS_VAR
);
5093 /*----------------------------------------------------------------------------
5094 | Returns the result of multiplying the extended double-precision floating-
5095 | point values `a' and `b'. The operation is performed according to the
5096 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5097 *----------------------------------------------------------------------------*/
5099 floatx80
floatx80_mul( floatx80 a
, floatx80 b STATUS_PARAM
)
5101 flag aSign
, bSign
, zSign
;
5102 int32 aExp
, bExp
, zExp
;
5103 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5106 aSig
= extractFloatx80Frac( a
);
5107 aExp
= extractFloatx80Exp( a
);
5108 aSign
= extractFloatx80Sign( a
);
5109 bSig
= extractFloatx80Frac( b
);
5110 bExp
= extractFloatx80Exp( b
);
5111 bSign
= extractFloatx80Sign( b
);
5112 zSign
= aSign
^ bSign
;
5113 if ( aExp
== 0x7FFF ) {
5114 if ( (uint64_t) ( aSig
<<1 )
5115 || ( ( bExp
== 0x7FFF ) && (uint64_t) ( bSig
<<1 ) ) ) {
5116 return propagateFloatx80NaN( a
, b STATUS_VAR
);
5118 if ( ( bExp
| bSig
) == 0 ) goto invalid
;
5119 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
5121 if ( bExp
== 0x7FFF ) {
5122 if ( (uint64_t) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
5123 if ( ( aExp
| aSig
) == 0 ) {
5125 float_raise( float_flag_invalid STATUS_VAR
);
5126 z
.low
= floatx80_default_nan_low
;
5127 z
.high
= floatx80_default_nan_high
;
5130 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
5133 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
5134 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
5137 if ( bSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
5138 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
5140 zExp
= aExp
+ bExp
- 0x3FFE;
5141 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
5142 if ( 0 < (int64_t) zSig0
) {
5143 shortShift128Left( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
5147 roundAndPackFloatx80(
5148 STATUS(floatx80_rounding_precision
), zSign
, zExp
, zSig0
, zSig1 STATUS_VAR
);
5152 /*----------------------------------------------------------------------------
5153 | Returns the result of dividing the extended double-precision floating-point
5154 | value `a' by the corresponding value `b'. The operation is performed
5155 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5156 *----------------------------------------------------------------------------*/
5158 floatx80
floatx80_div( floatx80 a
, floatx80 b STATUS_PARAM
)
5160 flag aSign
, bSign
, zSign
;
5161 int32 aExp
, bExp
, zExp
;
5162 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5163 uint64_t rem0
, rem1
, rem2
, term0
, term1
, term2
;
5166 aSig
= extractFloatx80Frac( a
);
5167 aExp
= extractFloatx80Exp( a
);
5168 aSign
= extractFloatx80Sign( a
);
5169 bSig
= extractFloatx80Frac( b
);
5170 bExp
= extractFloatx80Exp( b
);
5171 bSign
= extractFloatx80Sign( b
);
5172 zSign
= aSign
^ bSign
;
5173 if ( aExp
== 0x7FFF ) {
5174 if ( (uint64_t) ( aSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
5175 if ( bExp
== 0x7FFF ) {
5176 if ( (uint64_t) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
5179 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
5181 if ( bExp
== 0x7FFF ) {
5182 if ( (uint64_t) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
5183 return packFloatx80( zSign
, 0, 0 );
5187 if ( ( aExp
| aSig
) == 0 ) {
5189 float_raise( float_flag_invalid STATUS_VAR
);
5190 z
.low
= floatx80_default_nan_low
;
5191 z
.high
= floatx80_default_nan_high
;
5194 float_raise( float_flag_divbyzero STATUS_VAR
);
5195 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
5197 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
5200 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
5201 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
5203 zExp
= aExp
- bExp
+ 0x3FFE;
5205 if ( bSig
<= aSig
) {
5206 shift128Right( aSig
, 0, 1, &aSig
, &rem1
);
5209 zSig0
= estimateDiv128To64( aSig
, rem1
, bSig
);
5210 mul64To128( bSig
, zSig0
, &term0
, &term1
);
5211 sub128( aSig
, rem1
, term0
, term1
, &rem0
, &rem1
);
5212 while ( (int64_t) rem0
< 0 ) {
5214 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
5216 zSig1
= estimateDiv128To64( rem1
, 0, bSig
);
5217 if ( (uint64_t) ( zSig1
<<1 ) <= 8 ) {
5218 mul64To128( bSig
, zSig1
, &term1
, &term2
);
5219 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
5220 while ( (int64_t) rem1
< 0 ) {
5222 add128( rem1
, rem2
, 0, bSig
, &rem1
, &rem2
);
5224 zSig1
|= ( ( rem1
| rem2
) != 0 );
5227 roundAndPackFloatx80(
5228 STATUS(floatx80_rounding_precision
), zSign
, zExp
, zSig0
, zSig1 STATUS_VAR
);
5232 /*----------------------------------------------------------------------------
5233 | Returns the remainder of the extended double-precision floating-point value
5234 | `a' with respect to the corresponding value `b'. The operation is performed
5235 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5236 *----------------------------------------------------------------------------*/
5238 floatx80
floatx80_rem( floatx80 a
, floatx80 b STATUS_PARAM
)
5241 int32 aExp
, bExp
, expDiff
;
5242 uint64_t aSig0
, aSig1
, bSig
;
5243 uint64_t q
, term0
, term1
, alternateASig0
, alternateASig1
;
5246 aSig0
= extractFloatx80Frac( a
);
5247 aExp
= extractFloatx80Exp( a
);
5248 aSign
= extractFloatx80Sign( a
);
5249 bSig
= extractFloatx80Frac( b
);
5250 bExp
= extractFloatx80Exp( b
);
5251 if ( aExp
== 0x7FFF ) {
5252 if ( (uint64_t) ( aSig0
<<1 )
5253 || ( ( bExp
== 0x7FFF ) && (uint64_t) ( bSig
<<1 ) ) ) {
5254 return propagateFloatx80NaN( a
, b STATUS_VAR
);
5258 if ( bExp
== 0x7FFF ) {
5259 if ( (uint64_t) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
5265 float_raise( float_flag_invalid STATUS_VAR
);
5266 z
.low
= floatx80_default_nan_low
;
5267 z
.high
= floatx80_default_nan_high
;
5270 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
5273 if ( (uint64_t) ( aSig0
<<1 ) == 0 ) return a
;
5274 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
5276 bSig
|= LIT64( 0x8000000000000000 );
5278 expDiff
= aExp
- bExp
;
5280 if ( expDiff
< 0 ) {
5281 if ( expDiff
< -1 ) return a
;
5282 shift128Right( aSig0
, 0, 1, &aSig0
, &aSig1
);
5285 q
= ( bSig
<= aSig0
);
5286 if ( q
) aSig0
-= bSig
;
5288 while ( 0 < expDiff
) {
5289 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
5290 q
= ( 2 < q
) ? q
- 2 : 0;
5291 mul64To128( bSig
, q
, &term0
, &term1
);
5292 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
5293 shortShift128Left( aSig0
, aSig1
, 62, &aSig0
, &aSig1
);
5297 if ( 0 < expDiff
) {
5298 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
5299 q
= ( 2 < q
) ? q
- 2 : 0;
5301 mul64To128( bSig
, q
<<( 64 - expDiff
), &term0
, &term1
);
5302 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
5303 shortShift128Left( 0, bSig
, 64 - expDiff
, &term0
, &term1
);
5304 while ( le128( term0
, term1
, aSig0
, aSig1
) ) {
5306 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
5313 sub128( term0
, term1
, aSig0
, aSig1
, &alternateASig0
, &alternateASig1
);
5314 if ( lt128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
5315 || ( eq128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
5318 aSig0
= alternateASig0
;
5319 aSig1
= alternateASig1
;
5323 normalizeRoundAndPackFloatx80(
5324 80, zSign
, bExp
+ expDiff
, aSig0
, aSig1 STATUS_VAR
);
5328 /*----------------------------------------------------------------------------
5329 | Returns the square root of the extended double-precision floating-point
5330 | value `a'. The operation is performed according to the IEC/IEEE Standard
5331 | for Binary Floating-Point Arithmetic.
5332 *----------------------------------------------------------------------------*/
5334 floatx80
floatx80_sqrt( floatx80 a STATUS_PARAM
)
5338 uint64_t aSig0
, aSig1
, zSig0
, zSig1
, doubleZSig0
;
5339 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
5342 aSig0
= extractFloatx80Frac( a
);
5343 aExp
= extractFloatx80Exp( a
);
5344 aSign
= extractFloatx80Sign( a
);
5345 if ( aExp
== 0x7FFF ) {
5346 if ( (uint64_t) ( aSig0
<<1 ) ) return propagateFloatx80NaN( a
, a STATUS_VAR
);
5347 if ( ! aSign
) return a
;
5351 if ( ( aExp
| aSig0
) == 0 ) return a
;
5353 float_raise( float_flag_invalid STATUS_VAR
);
5354 z
.low
= floatx80_default_nan_low
;
5355 z
.high
= floatx80_default_nan_high
;
5359 if ( aSig0
== 0 ) return packFloatx80( 0, 0, 0 );
5360 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
5362 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFF;
5363 zSig0
= estimateSqrt32( aExp
, aSig0
>>32 );
5364 shift128Right( aSig0
, 0, 2 + ( aExp
& 1 ), &aSig0
, &aSig1
);
5365 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
5366 doubleZSig0
= zSig0
<<1;
5367 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
5368 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
5369 while ( (int64_t) rem0
< 0 ) {
5372 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
5374 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
5375 if ( ( zSig1
& LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5376 if ( zSig1
== 0 ) zSig1
= 1;
5377 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
5378 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
5379 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
5380 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
5381 while ( (int64_t) rem1
< 0 ) {
5383 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
5385 term2
|= doubleZSig0
;
5386 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
5388 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
5390 shortShift128Left( 0, zSig1
, 1, &zSig0
, &zSig1
);
5391 zSig0
|= doubleZSig0
;
5393 roundAndPackFloatx80(
5394 STATUS(floatx80_rounding_precision
), 0, zExp
, zSig0
, zSig1 STATUS_VAR
);
5398 /*----------------------------------------------------------------------------
5399 | Returns 1 if the extended double-precision floating-point value `a' is equal
5400 | to the corresponding value `b', and 0 otherwise. The invalid exception is
5401 | raised if either operand is a NaN. Otherwise, the comparison is performed
5402 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5403 *----------------------------------------------------------------------------*/
5405 int floatx80_eq( floatx80 a
, floatx80 b STATUS_PARAM
)
5408 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5409 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5410 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5411 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5413 float_raise( float_flag_invalid STATUS_VAR
);
5418 && ( ( a
.high
== b
.high
)
5420 && ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
5425 /*----------------------------------------------------------------------------
5426 | Returns 1 if the extended double-precision floating-point value `a' is
5427 | less than or equal to the corresponding value `b', and 0 otherwise. The
5428 | invalid exception is raised if either operand is a NaN. The comparison is
5429 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5431 *----------------------------------------------------------------------------*/
5433 int floatx80_le( floatx80 a
, floatx80 b STATUS_PARAM
)
5437 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5438 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5439 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5440 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5442 float_raise( float_flag_invalid STATUS_VAR
);
5445 aSign
= extractFloatx80Sign( a
);
5446 bSign
= extractFloatx80Sign( b
);
5447 if ( aSign
!= bSign
) {
5450 || ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5454 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
5455 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
5459 /*----------------------------------------------------------------------------
5460 | Returns 1 if the extended double-precision floating-point value `a' is
5461 | less than the corresponding value `b', and 0 otherwise. The invalid
5462 | exception is raised if either operand is a NaN. The comparison is performed
5463 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5464 *----------------------------------------------------------------------------*/
5466 int floatx80_lt( floatx80 a
, floatx80 b STATUS_PARAM
)
5470 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5471 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5472 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5473 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5475 float_raise( float_flag_invalid STATUS_VAR
);
5478 aSign
= extractFloatx80Sign( a
);
5479 bSign
= extractFloatx80Sign( b
);
5480 if ( aSign
!= bSign
) {
5483 && ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5487 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
5488 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
5492 /*----------------------------------------------------------------------------
5493 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5494 | cannot be compared, and 0 otherwise. The invalid exception is raised if
5495 | either operand is a NaN. The comparison is performed according to the
5496 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5497 *----------------------------------------------------------------------------*/
5498 int floatx80_unordered( floatx80 a
, floatx80 b STATUS_PARAM
)
5500 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5501 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5502 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5503 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5505 float_raise( float_flag_invalid STATUS_VAR
);
5511 /*----------------------------------------------------------------------------
5512 | Returns 1 if the extended double-precision floating-point value `a' is
5513 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5514 | cause an exception. The comparison is performed according to the IEC/IEEE
5515 | Standard for Binary Floating-Point Arithmetic.
5516 *----------------------------------------------------------------------------*/
5518 int floatx80_eq_quiet( floatx80 a
, floatx80 b STATUS_PARAM
)
5521 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5522 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5523 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5524 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5526 if ( floatx80_is_signaling_nan( a
)
5527 || floatx80_is_signaling_nan( b
) ) {
5528 float_raise( float_flag_invalid STATUS_VAR
);
5534 && ( ( a
.high
== b
.high
)
5536 && ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
5541 /*----------------------------------------------------------------------------
5542 | Returns 1 if the extended double-precision floating-point value `a' is less
5543 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
5544 | do not cause an exception. Otherwise, the comparison is performed according
5545 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5546 *----------------------------------------------------------------------------*/
5548 int floatx80_le_quiet( floatx80 a
, floatx80 b STATUS_PARAM
)
5552 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5553 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5554 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5555 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5557 if ( floatx80_is_signaling_nan( a
)
5558 || floatx80_is_signaling_nan( b
) ) {
5559 float_raise( float_flag_invalid STATUS_VAR
);
5563 aSign
= extractFloatx80Sign( a
);
5564 bSign
= extractFloatx80Sign( b
);
5565 if ( aSign
!= bSign
) {
5568 || ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5572 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
5573 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
5577 /*----------------------------------------------------------------------------
5578 | Returns 1 if the extended double-precision floating-point value `a' is less
5579 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
5580 | an exception. Otherwise, the comparison is performed according to the
5581 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5582 *----------------------------------------------------------------------------*/
5584 int floatx80_lt_quiet( floatx80 a
, floatx80 b STATUS_PARAM
)
5588 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5589 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5590 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5591 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5593 if ( floatx80_is_signaling_nan( a
)
5594 || floatx80_is_signaling_nan( b
) ) {
5595 float_raise( float_flag_invalid STATUS_VAR
);
5599 aSign
= extractFloatx80Sign( a
);
5600 bSign
= extractFloatx80Sign( b
);
5601 if ( aSign
!= bSign
) {
5604 && ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5608 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
5609 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
5613 /*----------------------------------------------------------------------------
5614 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5615 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
5616 | The comparison is performed according to the IEC/IEEE Standard for Binary
5617 | Floating-Point Arithmetic.
5618 *----------------------------------------------------------------------------*/
5619 int floatx80_unordered_quiet( floatx80 a
, floatx80 b STATUS_PARAM
)
5621 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5622 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5623 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5624 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5626 if ( floatx80_is_signaling_nan( a
)
5627 || floatx80_is_signaling_nan( b
) ) {
5628 float_raise( float_flag_invalid STATUS_VAR
);
5635 /*----------------------------------------------------------------------------
5636 | Returns the result of converting the quadruple-precision floating-point
5637 | value `a' to the 32-bit two's complement integer format. The conversion
5638 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5639 | Arithmetic---which means in particular that the conversion is rounded
5640 | according to the current rounding mode. If `a' is a NaN, the largest
5641 | positive integer is returned. Otherwise, if the conversion overflows, the
5642 | largest integer with the same sign as `a' is returned.
5643 *----------------------------------------------------------------------------*/
5645 int32
float128_to_int32( float128 a STATUS_PARAM
)
5648 int32 aExp
, shiftCount
;
5649 uint64_t aSig0
, aSig1
;
5651 aSig1
= extractFloat128Frac1( a
);
5652 aSig0
= extractFloat128Frac0( a
);
5653 aExp
= extractFloat128Exp( a
);
5654 aSign
= extractFloat128Sign( a
);
5655 if ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) aSign
= 0;
5656 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
5657 aSig0
|= ( aSig1
!= 0 );
5658 shiftCount
= 0x4028 - aExp
;
5659 if ( 0 < shiftCount
) shift64RightJamming( aSig0
, shiftCount
, &aSig0
);
5660 return roundAndPackInt32( aSign
, aSig0 STATUS_VAR
);
5664 /*----------------------------------------------------------------------------
5665 | Returns the result of converting the quadruple-precision floating-point
5666 | value `a' to the 32-bit two's complement integer format. The conversion
5667 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5668 | Arithmetic, except that the conversion is always rounded toward zero. If
5669 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
5670 | conversion overflows, the largest integer with the same sign as `a' is
5672 *----------------------------------------------------------------------------*/
5674 int32
float128_to_int32_round_to_zero( float128 a STATUS_PARAM
)
5677 int32 aExp
, shiftCount
;
5678 uint64_t aSig0
, aSig1
, savedASig
;
5681 aSig1
= extractFloat128Frac1( a
);
5682 aSig0
= extractFloat128Frac0( a
);
5683 aExp
= extractFloat128Exp( a
);
5684 aSign
= extractFloat128Sign( a
);
5685 aSig0
|= ( aSig1
!= 0 );
5686 if ( 0x401E < aExp
) {
5687 if ( ( aExp
== 0x7FFF ) && aSig0
) aSign
= 0;
5690 else if ( aExp
< 0x3FFF ) {
5691 if ( aExp
|| aSig0
) STATUS(float_exception_flags
) |= float_flag_inexact
;
5694 aSig0
|= LIT64( 0x0001000000000000 );
5695 shiftCount
= 0x402F - aExp
;
5697 aSig0
>>= shiftCount
;
5699 if ( aSign
) z
= - z
;
5700 if ( ( z
< 0 ) ^ aSign
) {
5702 float_raise( float_flag_invalid STATUS_VAR
);
5703 return aSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
5705 if ( ( aSig0
<<shiftCount
) != savedASig
) {
5706 STATUS(float_exception_flags
) |= float_flag_inexact
;
5712 /*----------------------------------------------------------------------------
5713 | Returns the result of converting the quadruple-precision floating-point
5714 | value `a' to the 64-bit two's complement integer format. The conversion
5715 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5716 | Arithmetic---which means in particular that the conversion is rounded
5717 | according to the current rounding mode. If `a' is a NaN, the largest
5718 | positive integer is returned. Otherwise, if the conversion overflows, the
5719 | largest integer with the same sign as `a' is returned.
5720 *----------------------------------------------------------------------------*/
5722 int64
float128_to_int64( float128 a STATUS_PARAM
)
5725 int32 aExp
, shiftCount
;
5726 uint64_t aSig0
, aSig1
;
5728 aSig1
= extractFloat128Frac1( a
);
5729 aSig0
= extractFloat128Frac0( a
);
5730 aExp
= extractFloat128Exp( a
);
5731 aSign
= extractFloat128Sign( a
);
5732 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
5733 shiftCount
= 0x402F - aExp
;
5734 if ( shiftCount
<= 0 ) {
5735 if ( 0x403E < aExp
) {
5736 float_raise( float_flag_invalid STATUS_VAR
);
5738 || ( ( aExp
== 0x7FFF )
5739 && ( aSig1
|| ( aSig0
!= LIT64( 0x0001000000000000 ) ) )
5742 return LIT64( 0x7FFFFFFFFFFFFFFF );
5744 return (int64_t) LIT64( 0x8000000000000000 );
5746 shortShift128Left( aSig0
, aSig1
, - shiftCount
, &aSig0
, &aSig1
);
5749 shift64ExtraRightJamming( aSig0
, aSig1
, shiftCount
, &aSig0
, &aSig1
);
5751 return roundAndPackInt64( aSign
, aSig0
, aSig1 STATUS_VAR
);
5755 /*----------------------------------------------------------------------------
5756 | Returns the result of converting the quadruple-precision floating-point
5757 | value `a' to the 64-bit two's complement integer format. The conversion
5758 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5759 | Arithmetic, except that the conversion is always rounded toward zero.
5760 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
5761 | the conversion overflows, the largest integer with the same sign as `a' is
5763 *----------------------------------------------------------------------------*/
5765 int64
float128_to_int64_round_to_zero( float128 a STATUS_PARAM
)
5768 int32 aExp
, shiftCount
;
5769 uint64_t aSig0
, aSig1
;
5772 aSig1
= extractFloat128Frac1( a
);
5773 aSig0
= extractFloat128Frac0( a
);
5774 aExp
= extractFloat128Exp( a
);
5775 aSign
= extractFloat128Sign( a
);
5776 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
5777 shiftCount
= aExp
- 0x402F;
5778 if ( 0 < shiftCount
) {
5779 if ( 0x403E <= aExp
) {
5780 aSig0
&= LIT64( 0x0000FFFFFFFFFFFF );
5781 if ( ( a
.high
== LIT64( 0xC03E000000000000 ) )
5782 && ( aSig1
< LIT64( 0x0002000000000000 ) ) ) {
5783 if ( aSig1
) STATUS(float_exception_flags
) |= float_flag_inexact
;
5786 float_raise( float_flag_invalid STATUS_VAR
);
5787 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) ) {
5788 return LIT64( 0x7FFFFFFFFFFFFFFF );
5791 return (int64_t) LIT64( 0x8000000000000000 );
5793 z
= ( aSig0
<<shiftCount
) | ( aSig1
>>( ( - shiftCount
) & 63 ) );
5794 if ( (uint64_t) ( aSig1
<<shiftCount
) ) {
5795 STATUS(float_exception_flags
) |= float_flag_inexact
;
5799 if ( aExp
< 0x3FFF ) {
5800 if ( aExp
| aSig0
| aSig1
) {
5801 STATUS(float_exception_flags
) |= float_flag_inexact
;
5805 z
= aSig0
>>( - shiftCount
);
5807 || ( shiftCount
&& (uint64_t) ( aSig0
<<( shiftCount
& 63 ) ) ) ) {
5808 STATUS(float_exception_flags
) |= float_flag_inexact
;
5811 if ( aSign
) z
= - z
;
5816 /*----------------------------------------------------------------------------
5817 | Returns the result of converting the quadruple-precision floating-point
5818 | value `a' to the single-precision floating-point format. The conversion
5819 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5821 *----------------------------------------------------------------------------*/
5823 float32
float128_to_float32( float128 a STATUS_PARAM
)
5827 uint64_t aSig0
, aSig1
;
5830 aSig1
= extractFloat128Frac1( a
);
5831 aSig0
= extractFloat128Frac0( a
);
5832 aExp
= extractFloat128Exp( a
);
5833 aSign
= extractFloat128Sign( a
);
5834 if ( aExp
== 0x7FFF ) {
5835 if ( aSig0
| aSig1
) {
5836 return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
5838 return packFloat32( aSign
, 0xFF, 0 );
5840 aSig0
|= ( aSig1
!= 0 );
5841 shift64RightJamming( aSig0
, 18, &aSig0
);
5843 if ( aExp
|| zSig
) {
5847 return roundAndPackFloat32( aSign
, aExp
, zSig STATUS_VAR
);
5851 /*----------------------------------------------------------------------------
5852 | Returns the result of converting the quadruple-precision floating-point
5853 | value `a' to the double-precision floating-point format. The conversion
5854 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5856 *----------------------------------------------------------------------------*/
5858 float64
float128_to_float64( float128 a STATUS_PARAM
)
5862 uint64_t aSig0
, aSig1
;
5864 aSig1
= extractFloat128Frac1( a
);
5865 aSig0
= extractFloat128Frac0( a
);
5866 aExp
= extractFloat128Exp( a
);
5867 aSign
= extractFloat128Sign( a
);
5868 if ( aExp
== 0x7FFF ) {
5869 if ( aSig0
| aSig1
) {
5870 return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
5872 return packFloat64( aSign
, 0x7FF, 0 );
5874 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
5875 aSig0
|= ( aSig1
!= 0 );
5876 if ( aExp
|| aSig0
) {
5877 aSig0
|= LIT64( 0x4000000000000000 );
5880 return roundAndPackFloat64( aSign
, aExp
, aSig0 STATUS_VAR
);
5884 /*----------------------------------------------------------------------------
5885 | Returns the result of converting the quadruple-precision floating-point
5886 | value `a' to the extended double-precision floating-point format. The
5887 | conversion is performed according to the IEC/IEEE Standard for Binary
5888 | Floating-Point Arithmetic.
5889 *----------------------------------------------------------------------------*/
5891 floatx80
float128_to_floatx80( float128 a STATUS_PARAM
)
5895 uint64_t aSig0
, aSig1
;
5897 aSig1
= extractFloat128Frac1( a
);
5898 aSig0
= extractFloat128Frac0( a
);
5899 aExp
= extractFloat128Exp( a
);
5900 aSign
= extractFloat128Sign( a
);
5901 if ( aExp
== 0x7FFF ) {
5902 if ( aSig0
| aSig1
) {
5903 return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
5905 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
5908 if ( ( aSig0
| aSig1
) == 0 ) return packFloatx80( aSign
, 0, 0 );
5909 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
5912 aSig0
|= LIT64( 0x0001000000000000 );
5914 shortShift128Left( aSig0
, aSig1
, 15, &aSig0
, &aSig1
);
5915 return roundAndPackFloatx80( 80, aSign
, aExp
, aSig0
, aSig1 STATUS_VAR
);
5919 /*----------------------------------------------------------------------------
5920 | Rounds the quadruple-precision floating-point value `a' to an integer, and
5921 | returns the result as a quadruple-precision floating-point value. The
5922 | operation is performed according to the IEC/IEEE Standard for Binary
5923 | Floating-Point Arithmetic.
5924 *----------------------------------------------------------------------------*/
5926 float128
float128_round_to_int( float128 a STATUS_PARAM
)
5930 uint64_t lastBitMask
, roundBitsMask
;
5933 aExp
= extractFloat128Exp( a
);
5934 if ( 0x402F <= aExp
) {
5935 if ( 0x406F <= aExp
) {
5936 if ( ( aExp
== 0x7FFF )
5937 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) )
5939 return propagateFloat128NaN( a
, a STATUS_VAR
);
5944 lastBitMask
= ( lastBitMask
<<( 0x406E - aExp
) )<<1;
5945 roundBitsMask
= lastBitMask
- 1;
5947 switch (STATUS(float_rounding_mode
)) {
5948 case float_round_nearest_even
:
5949 if ( lastBitMask
) {
5950 add128( z
.high
, z
.low
, 0, lastBitMask
>>1, &z
.high
, &z
.low
);
5951 if ( ( z
.low
& roundBitsMask
) == 0 ) z
.low
&= ~ lastBitMask
;
5954 if ( (int64_t) z
.low
< 0 ) {
5956 if ( (uint64_t) ( z
.low
<<1 ) == 0 ) z
.high
&= ~1;
5960 case float_round_ties_away
:
5962 add128(z
.high
, z
.low
, 0, lastBitMask
>> 1, &z
.high
, &z
.low
);
5964 if ((int64_t) z
.low
< 0) {
5969 case float_round_to_zero
:
5971 case float_round_up
:
5972 if (!extractFloat128Sign(z
)) {
5973 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
5976 case float_round_down
:
5977 if (extractFloat128Sign(z
)) {
5978 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
5984 z
.low
&= ~ roundBitsMask
;
5987 if ( aExp
< 0x3FFF ) {
5988 if ( ( ( (uint64_t) ( a
.high
<<1 ) ) | a
.low
) == 0 ) return a
;
5989 STATUS(float_exception_flags
) |= float_flag_inexact
;
5990 aSign
= extractFloat128Sign( a
);
5991 switch ( STATUS(float_rounding_mode
) ) {
5992 case float_round_nearest_even
:
5993 if ( ( aExp
== 0x3FFE )
5994 && ( extractFloat128Frac0( a
)
5995 | extractFloat128Frac1( a
) )
5997 return packFloat128( aSign
, 0x3FFF, 0, 0 );
6000 case float_round_ties_away
:
6001 if (aExp
== 0x3FFE) {
6002 return packFloat128(aSign
, 0x3FFF, 0, 0);
6005 case float_round_down
:
6007 aSign
? packFloat128( 1, 0x3FFF, 0, 0 )
6008 : packFloat128( 0, 0, 0, 0 );
6009 case float_round_up
:
6011 aSign
? packFloat128( 1, 0, 0, 0 )
6012 : packFloat128( 0, 0x3FFF, 0, 0 );
6014 return packFloat128( aSign
, 0, 0, 0 );
6017 lastBitMask
<<= 0x402F - aExp
;
6018 roundBitsMask
= lastBitMask
- 1;
6021 switch (STATUS(float_rounding_mode
)) {
6022 case float_round_nearest_even
:
6023 z
.high
+= lastBitMask
>>1;
6024 if ( ( ( z
.high
& roundBitsMask
) | a
.low
) == 0 ) {
6025 z
.high
&= ~ lastBitMask
;
6028 case float_round_ties_away
:
6029 z
.high
+= lastBitMask
>>1;
6031 case float_round_to_zero
:
6033 case float_round_up
:
6034 if (!extractFloat128Sign(z
)) {
6035 z
.high
|= ( a
.low
!= 0 );
6036 z
.high
+= roundBitsMask
;
6039 case float_round_down
:
6040 if (extractFloat128Sign(z
)) {
6041 z
.high
|= (a
.low
!= 0);
6042 z
.high
+= roundBitsMask
;
6048 z
.high
&= ~ roundBitsMask
;
6050 if ( ( z
.low
!= a
.low
) || ( z
.high
!= a
.high
) ) {
6051 STATUS(float_exception_flags
) |= float_flag_inexact
;
6057 /*----------------------------------------------------------------------------
6058 | Returns the result of adding the absolute values of the quadruple-precision
6059 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
6060 | before being returned. `zSign' is ignored if the result is a NaN.
6061 | The addition is performed according to the IEC/IEEE Standard for Binary
6062 | Floating-Point Arithmetic.
6063 *----------------------------------------------------------------------------*/
6065 static float128
addFloat128Sigs( float128 a
, float128 b
, flag zSign STATUS_PARAM
)
6067 int32 aExp
, bExp
, zExp
;
6068 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
6071 aSig1
= extractFloat128Frac1( a
);
6072 aSig0
= extractFloat128Frac0( a
);
6073 aExp
= extractFloat128Exp( a
);
6074 bSig1
= extractFloat128Frac1( b
);
6075 bSig0
= extractFloat128Frac0( b
);
6076 bExp
= extractFloat128Exp( b
);
6077 expDiff
= aExp
- bExp
;
6078 if ( 0 < expDiff
) {
6079 if ( aExp
== 0x7FFF ) {
6080 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
6087 bSig0
|= LIT64( 0x0001000000000000 );
6089 shift128ExtraRightJamming(
6090 bSig0
, bSig1
, 0, expDiff
, &bSig0
, &bSig1
, &zSig2
);
6093 else if ( expDiff
< 0 ) {
6094 if ( bExp
== 0x7FFF ) {
6095 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
6096 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6102 aSig0
|= LIT64( 0x0001000000000000 );
6104 shift128ExtraRightJamming(
6105 aSig0
, aSig1
, 0, - expDiff
, &aSig0
, &aSig1
, &zSig2
);
6109 if ( aExp
== 0x7FFF ) {
6110 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
6111 return propagateFloat128NaN( a
, b STATUS_VAR
);
6115 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
6117 if (STATUS(flush_to_zero
)) {
6118 if (zSig0
| zSig1
) {
6119 float_raise(float_flag_output_denormal STATUS_VAR
);
6121 return packFloat128(zSign
, 0, 0, 0);
6123 return packFloat128( zSign
, 0, zSig0
, zSig1
);
6126 zSig0
|= LIT64( 0x0002000000000000 );
6130 aSig0
|= LIT64( 0x0001000000000000 );
6131 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
6133 if ( zSig0
< LIT64( 0x0002000000000000 ) ) goto roundAndPack
;
6136 shift128ExtraRightJamming(
6137 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
6139 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2 STATUS_VAR
);
6143 /*----------------------------------------------------------------------------
6144 | Returns the result of subtracting the absolute values of the quadruple-
6145 | precision floating-point values `a' and `b'. If `zSign' is 1, the
6146 | difference is negated before being returned. `zSign' is ignored if the
6147 | result is a NaN. The subtraction is performed according to the IEC/IEEE
6148 | Standard for Binary Floating-Point Arithmetic.
6149 *----------------------------------------------------------------------------*/
6151 static float128
subFloat128Sigs( float128 a
, float128 b
, flag zSign STATUS_PARAM
)
6153 int32 aExp
, bExp
, zExp
;
6154 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
;
6158 aSig1
= extractFloat128Frac1( a
);
6159 aSig0
= extractFloat128Frac0( a
);
6160 aExp
= extractFloat128Exp( a
);
6161 bSig1
= extractFloat128Frac1( b
);
6162 bSig0
= extractFloat128Frac0( b
);
6163 bExp
= extractFloat128Exp( b
);
6164 expDiff
= aExp
- bExp
;
6165 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
6166 shortShift128Left( bSig0
, bSig1
, 14, &bSig0
, &bSig1
);
6167 if ( 0 < expDiff
) goto aExpBigger
;
6168 if ( expDiff
< 0 ) goto bExpBigger
;
6169 if ( aExp
== 0x7FFF ) {
6170 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
6171 return propagateFloat128NaN( a
, b STATUS_VAR
);
6173 float_raise( float_flag_invalid STATUS_VAR
);
6174 z
.low
= float128_default_nan_low
;
6175 z
.high
= float128_default_nan_high
;
6182 if ( bSig0
< aSig0
) goto aBigger
;
6183 if ( aSig0
< bSig0
) goto bBigger
;
6184 if ( bSig1
< aSig1
) goto aBigger
;
6185 if ( aSig1
< bSig1
) goto bBigger
;
6186 return packFloat128( STATUS(float_rounding_mode
) == float_round_down
, 0, 0, 0 );
6188 if ( bExp
== 0x7FFF ) {
6189 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
6190 return packFloat128( zSign
^ 1, 0x7FFF, 0, 0 );
6196 aSig0
|= LIT64( 0x4000000000000000 );
6198 shift128RightJamming( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
6199 bSig0
|= LIT64( 0x4000000000000000 );
6201 sub128( bSig0
, bSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
6204 goto normalizeRoundAndPack
;
6206 if ( aExp
== 0x7FFF ) {
6207 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
6214 bSig0
|= LIT64( 0x4000000000000000 );
6216 shift128RightJamming( bSig0
, bSig1
, expDiff
, &bSig0
, &bSig1
);
6217 aSig0
|= LIT64( 0x4000000000000000 );
6219 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
6221 normalizeRoundAndPack
:
6223 return normalizeRoundAndPackFloat128( zSign
, zExp
- 14, zSig0
, zSig1 STATUS_VAR
);
6227 /*----------------------------------------------------------------------------
6228 | Returns the result of adding the quadruple-precision floating-point values
6229 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
6230 | for Binary Floating-Point Arithmetic.
6231 *----------------------------------------------------------------------------*/
6233 float128
float128_add( float128 a
, float128 b STATUS_PARAM
)
6237 aSign
= extractFloat128Sign( a
);
6238 bSign
= extractFloat128Sign( b
);
6239 if ( aSign
== bSign
) {
6240 return addFloat128Sigs( a
, b
, aSign STATUS_VAR
);
6243 return subFloat128Sigs( a
, b
, aSign STATUS_VAR
);
6248 /*----------------------------------------------------------------------------
6249 | Returns the result of subtracting the quadruple-precision floating-point
6250 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6251 | Standard for Binary Floating-Point Arithmetic.
6252 *----------------------------------------------------------------------------*/
6254 float128
float128_sub( float128 a
, float128 b STATUS_PARAM
)
6258 aSign
= extractFloat128Sign( a
);
6259 bSign
= extractFloat128Sign( b
);
6260 if ( aSign
== bSign
) {
6261 return subFloat128Sigs( a
, b
, aSign STATUS_VAR
);
6264 return addFloat128Sigs( a
, b
, aSign STATUS_VAR
);
6269 /*----------------------------------------------------------------------------
6270 | Returns the result of multiplying the quadruple-precision floating-point
6271 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6272 | Standard for Binary Floating-Point Arithmetic.
6273 *----------------------------------------------------------------------------*/
6275 float128
float128_mul( float128 a
, float128 b STATUS_PARAM
)
6277 flag aSign
, bSign
, zSign
;
6278 int32 aExp
, bExp
, zExp
;
6279 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
, zSig3
;
6282 aSig1
= extractFloat128Frac1( a
);
6283 aSig0
= extractFloat128Frac0( a
);
6284 aExp
= extractFloat128Exp( a
);
6285 aSign
= extractFloat128Sign( a
);
6286 bSig1
= extractFloat128Frac1( b
);
6287 bSig0
= extractFloat128Frac0( b
);
6288 bExp
= extractFloat128Exp( b
);
6289 bSign
= extractFloat128Sign( b
);
6290 zSign
= aSign
^ bSign
;
6291 if ( aExp
== 0x7FFF ) {
6292 if ( ( aSig0
| aSig1
)
6293 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
6294 return propagateFloat128NaN( a
, b STATUS_VAR
);
6296 if ( ( bExp
| bSig0
| bSig1
) == 0 ) goto invalid
;
6297 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6299 if ( bExp
== 0x7FFF ) {
6300 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
6301 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
6303 float_raise( float_flag_invalid STATUS_VAR
);
6304 z
.low
= float128_default_nan_low
;
6305 z
.high
= float128_default_nan_high
;
6308 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6311 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
6312 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6315 if ( ( bSig0
| bSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
6316 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
6318 zExp
= aExp
+ bExp
- 0x4000;
6319 aSig0
|= LIT64( 0x0001000000000000 );
6320 shortShift128Left( bSig0
, bSig1
, 16, &bSig0
, &bSig1
);
6321 mul128To256( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
, &zSig2
, &zSig3
);
6322 add128( zSig0
, zSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
6323 zSig2
|= ( zSig3
!= 0 );
6324 if ( LIT64( 0x0002000000000000 ) <= zSig0
) {
6325 shift128ExtraRightJamming(
6326 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
6329 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2 STATUS_VAR
);
6333 /*----------------------------------------------------------------------------
6334 | Returns the result of dividing the quadruple-precision floating-point value
6335 | `a' by the corresponding value `b'. The operation is performed according to
6336 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6337 *----------------------------------------------------------------------------*/
6339 float128
float128_div( float128 a
, float128 b STATUS_PARAM
)
6341 flag aSign
, bSign
, zSign
;
6342 int32 aExp
, bExp
, zExp
;
6343 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
6344 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
6347 aSig1
= extractFloat128Frac1( a
);
6348 aSig0
= extractFloat128Frac0( a
);
6349 aExp
= extractFloat128Exp( a
);
6350 aSign
= extractFloat128Sign( a
);
6351 bSig1
= extractFloat128Frac1( b
);
6352 bSig0
= extractFloat128Frac0( b
);
6353 bExp
= extractFloat128Exp( b
);
6354 bSign
= extractFloat128Sign( b
);
6355 zSign
= aSign
^ bSign
;
6356 if ( aExp
== 0x7FFF ) {
6357 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
6358 if ( bExp
== 0x7FFF ) {
6359 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
6362 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6364 if ( bExp
== 0x7FFF ) {
6365 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
6366 return packFloat128( zSign
, 0, 0, 0 );
6369 if ( ( bSig0
| bSig1
) == 0 ) {
6370 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
6372 float_raise( float_flag_invalid STATUS_VAR
);
6373 z
.low
= float128_default_nan_low
;
6374 z
.high
= float128_default_nan_high
;
6377 float_raise( float_flag_divbyzero STATUS_VAR
);
6378 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6380 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
6383 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
6384 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6386 zExp
= aExp
- bExp
+ 0x3FFD;
6388 aSig0
| LIT64( 0x0001000000000000 ), aSig1
, 15, &aSig0
, &aSig1
);
6390 bSig0
| LIT64( 0x0001000000000000 ), bSig1
, 15, &bSig0
, &bSig1
);
6391 if ( le128( bSig0
, bSig1
, aSig0
, aSig1
) ) {
6392 shift128Right( aSig0
, aSig1
, 1, &aSig0
, &aSig1
);
6395 zSig0
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
6396 mul128By64To192( bSig0
, bSig1
, zSig0
, &term0
, &term1
, &term2
);
6397 sub192( aSig0
, aSig1
, 0, term0
, term1
, term2
, &rem0
, &rem1
, &rem2
);
6398 while ( (int64_t) rem0
< 0 ) {
6400 add192( rem0
, rem1
, rem2
, 0, bSig0
, bSig1
, &rem0
, &rem1
, &rem2
);
6402 zSig1
= estimateDiv128To64( rem1
, rem2
, bSig0
);
6403 if ( ( zSig1
& 0x3FFF ) <= 4 ) {
6404 mul128By64To192( bSig0
, bSig1
, zSig1
, &term1
, &term2
, &term3
);
6405 sub192( rem1
, rem2
, 0, term1
, term2
, term3
, &rem1
, &rem2
, &rem3
);
6406 while ( (int64_t) rem1
< 0 ) {
6408 add192( rem1
, rem2
, rem3
, 0, bSig0
, bSig1
, &rem1
, &rem2
, &rem3
);
6410 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
6412 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 15, &zSig0
, &zSig1
, &zSig2
);
6413 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2 STATUS_VAR
);
6417 /*----------------------------------------------------------------------------
6418 | Returns the remainder of the quadruple-precision floating-point value `a'
6419 | with respect to the corresponding value `b'. The operation is performed
6420 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6421 *----------------------------------------------------------------------------*/
6423 float128
float128_rem( float128 a
, float128 b STATUS_PARAM
)
6426 int32 aExp
, bExp
, expDiff
;
6427 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, q
, term0
, term1
, term2
;
6428 uint64_t allZero
, alternateASig0
, alternateASig1
, sigMean1
;
6432 aSig1
= extractFloat128Frac1( a
);
6433 aSig0
= extractFloat128Frac0( a
);
6434 aExp
= extractFloat128Exp( a
);
6435 aSign
= extractFloat128Sign( a
);
6436 bSig1
= extractFloat128Frac1( b
);
6437 bSig0
= extractFloat128Frac0( b
);
6438 bExp
= extractFloat128Exp( b
);
6439 if ( aExp
== 0x7FFF ) {
6440 if ( ( aSig0
| aSig1
)
6441 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
6442 return propagateFloat128NaN( a
, b STATUS_VAR
);
6446 if ( bExp
== 0x7FFF ) {
6447 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
6451 if ( ( bSig0
| bSig1
) == 0 ) {
6453 float_raise( float_flag_invalid STATUS_VAR
);
6454 z
.low
= float128_default_nan_low
;
6455 z
.high
= float128_default_nan_high
;
6458 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
6461 if ( ( aSig0
| aSig1
) == 0 ) return a
;
6462 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6464 expDiff
= aExp
- bExp
;
6465 if ( expDiff
< -1 ) return a
;
6467 aSig0
| LIT64( 0x0001000000000000 ),
6469 15 - ( expDiff
< 0 ),
6474 bSig0
| LIT64( 0x0001000000000000 ), bSig1
, 15, &bSig0
, &bSig1
);
6475 q
= le128( bSig0
, bSig1
, aSig0
, aSig1
);
6476 if ( q
) sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
6478 while ( 0 < expDiff
) {
6479 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
6480 q
= ( 4 < q
) ? q
- 4 : 0;
6481 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
6482 shortShift192Left( term0
, term1
, term2
, 61, &term1
, &term2
, &allZero
);
6483 shortShift128Left( aSig0
, aSig1
, 61, &aSig0
, &allZero
);
6484 sub128( aSig0
, 0, term1
, term2
, &aSig0
, &aSig1
);
6487 if ( -64 < expDiff
) {
6488 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
6489 q
= ( 4 < q
) ? q
- 4 : 0;
6491 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
6493 if ( expDiff
< 0 ) {
6494 shift128Right( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
6497 shortShift128Left( aSig0
, aSig1
, expDiff
, &aSig0
, &aSig1
);
6499 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
6500 sub128( aSig0
, aSig1
, term1
, term2
, &aSig0
, &aSig1
);
6503 shift128Right( aSig0
, aSig1
, 12, &aSig0
, &aSig1
);
6504 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
6507 alternateASig0
= aSig0
;
6508 alternateASig1
= aSig1
;
6510 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
6511 } while ( 0 <= (int64_t) aSig0
);
6513 aSig0
, aSig1
, alternateASig0
, alternateASig1
, (uint64_t *)&sigMean0
, &sigMean1
);
6514 if ( ( sigMean0
< 0 )
6515 || ( ( ( sigMean0
| sigMean1
) == 0 ) && ( q
& 1 ) ) ) {
6516 aSig0
= alternateASig0
;
6517 aSig1
= alternateASig1
;
6519 zSign
= ( (int64_t) aSig0
< 0 );
6520 if ( zSign
) sub128( 0, 0, aSig0
, aSig1
, &aSig0
, &aSig1
);
6522 normalizeRoundAndPackFloat128( aSign
^ zSign
, bExp
- 4, aSig0
, aSig1 STATUS_VAR
);
6526 /*----------------------------------------------------------------------------
6527 | Returns the square root of the quadruple-precision floating-point value `a'.
6528 | The operation is performed according to the IEC/IEEE Standard for Binary
6529 | Floating-Point Arithmetic.
6530 *----------------------------------------------------------------------------*/
6532 float128
float128_sqrt( float128 a STATUS_PARAM
)
6536 uint64_t aSig0
, aSig1
, zSig0
, zSig1
, zSig2
, doubleZSig0
;
6537 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
6540 aSig1
= extractFloat128Frac1( a
);
6541 aSig0
= extractFloat128Frac0( a
);
6542 aExp
= extractFloat128Exp( a
);
6543 aSign
= extractFloat128Sign( a
);
6544 if ( aExp
== 0x7FFF ) {
6545 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, a STATUS_VAR
);
6546 if ( ! aSign
) return a
;
6550 if ( ( aExp
| aSig0
| aSig1
) == 0 ) return a
;
6552 float_raise( float_flag_invalid STATUS_VAR
);
6553 z
.low
= float128_default_nan_low
;
6554 z
.high
= float128_default_nan_high
;
6558 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( 0, 0, 0, 0 );
6559 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6561 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFE;
6562 aSig0
|= LIT64( 0x0001000000000000 );
6563 zSig0
= estimateSqrt32( aExp
, aSig0
>>17 );
6564 shortShift128Left( aSig0
, aSig1
, 13 - ( aExp
& 1 ), &aSig0
, &aSig1
);
6565 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
6566 doubleZSig0
= zSig0
<<1;
6567 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
6568 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
6569 while ( (int64_t) rem0
< 0 ) {
6572 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
6574 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
6575 if ( ( zSig1
& 0x1FFF ) <= 5 ) {
6576 if ( zSig1
== 0 ) zSig1
= 1;
6577 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
6578 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
6579 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
6580 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
6581 while ( (int64_t) rem1
< 0 ) {
6583 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
6585 term2
|= doubleZSig0
;
6586 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
6588 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
6590 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 14, &zSig0
, &zSig1
, &zSig2
);
6591 return roundAndPackFloat128( 0, zExp
, zSig0
, zSig1
, zSig2 STATUS_VAR
);
6595 /*----------------------------------------------------------------------------
6596 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6597 | the corresponding value `b', and 0 otherwise. The invalid exception is
6598 | raised if either operand is a NaN. Otherwise, the comparison is performed
6599 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6600 *----------------------------------------------------------------------------*/
6602 int float128_eq( float128 a
, float128 b STATUS_PARAM
)
6605 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6606 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6607 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6608 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6610 float_raise( float_flag_invalid STATUS_VAR
);
6615 && ( ( a
.high
== b
.high
)
6617 && ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
6622 /*----------------------------------------------------------------------------
6623 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6624 | or equal to the corresponding value `b', and 0 otherwise. The invalid
6625 | exception is raised if either operand is a NaN. The comparison is performed
6626 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6627 *----------------------------------------------------------------------------*/
6629 int float128_le( float128 a
, float128 b STATUS_PARAM
)
6633 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6634 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6635 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6636 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6638 float_raise( float_flag_invalid STATUS_VAR
);
6641 aSign
= extractFloat128Sign( a
);
6642 bSign
= extractFloat128Sign( b
);
6643 if ( aSign
!= bSign
) {
6646 || ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
6650 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
6651 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
6655 /*----------------------------------------------------------------------------
6656 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6657 | the corresponding value `b', and 0 otherwise. The invalid exception is
6658 | raised if either operand is a NaN. The comparison is performed according
6659 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6660 *----------------------------------------------------------------------------*/
6662 int float128_lt( float128 a
, float128 b STATUS_PARAM
)
6666 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6667 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6668 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6669 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6671 float_raise( float_flag_invalid STATUS_VAR
);
6674 aSign
= extractFloat128Sign( a
);
6675 bSign
= extractFloat128Sign( b
);
6676 if ( aSign
!= bSign
) {
6679 && ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
6683 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
6684 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
6688 /*----------------------------------------------------------------------------
6689 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6690 | be compared, and 0 otherwise. The invalid exception is raised if either
6691 | operand is a NaN. The comparison is performed according to the IEC/IEEE
6692 | Standard for Binary Floating-Point Arithmetic.
6693 *----------------------------------------------------------------------------*/
6695 int float128_unordered( float128 a
, float128 b STATUS_PARAM
)
6697 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6698 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6699 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6700 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6702 float_raise( float_flag_invalid STATUS_VAR
);
6708 /*----------------------------------------------------------------------------
6709 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6710 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6711 | exception. The comparison is performed according to the IEC/IEEE Standard
6712 | for Binary Floating-Point Arithmetic.
6713 *----------------------------------------------------------------------------*/
6715 int float128_eq_quiet( float128 a
, float128 b STATUS_PARAM
)
6718 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6719 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6720 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6721 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6723 if ( float128_is_signaling_nan( a
)
6724 || float128_is_signaling_nan( b
) ) {
6725 float_raise( float_flag_invalid STATUS_VAR
);
6731 && ( ( a
.high
== b
.high
)
6733 && ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
6738 /*----------------------------------------------------------------------------
6739 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6740 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
6741 | cause an exception. Otherwise, the comparison is performed according to the
6742 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6743 *----------------------------------------------------------------------------*/
6745 int float128_le_quiet( float128 a
, float128 b STATUS_PARAM
)
6749 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6750 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6751 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6752 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6754 if ( float128_is_signaling_nan( a
)
6755 || float128_is_signaling_nan( b
) ) {
6756 float_raise( float_flag_invalid STATUS_VAR
);
6760 aSign
= extractFloat128Sign( a
);
6761 bSign
= extractFloat128Sign( b
);
6762 if ( aSign
!= bSign
) {
6765 || ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
6769 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
6770 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
6774 /*----------------------------------------------------------------------------
6775 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6776 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6777 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
6778 | Standard for Binary Floating-Point Arithmetic.
6779 *----------------------------------------------------------------------------*/
6781 int float128_lt_quiet( float128 a
, float128 b STATUS_PARAM
)
6785 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6786 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6787 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6788 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6790 if ( float128_is_signaling_nan( a
)
6791 || float128_is_signaling_nan( b
) ) {
6792 float_raise( float_flag_invalid STATUS_VAR
);
6796 aSign
= extractFloat128Sign( a
);
6797 bSign
= extractFloat128Sign( b
);
6798 if ( aSign
!= bSign
) {
6801 && ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
6805 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
6806 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
6810 /*----------------------------------------------------------------------------
6811 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6812 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
6813 | comparison is performed according to the IEC/IEEE Standard for Binary
6814 | Floating-Point Arithmetic.
6815 *----------------------------------------------------------------------------*/
6817 int float128_unordered_quiet( float128 a
, float128 b STATUS_PARAM
)
6819 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6820 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6821 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6822 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6824 if ( float128_is_signaling_nan( a
)
6825 || float128_is_signaling_nan( b
) ) {
6826 float_raise( float_flag_invalid STATUS_VAR
);
6833 /* misc functions */
6834 float32
uint32_to_float32(uint32_t a STATUS_PARAM
)
6836 return int64_to_float32(a STATUS_VAR
);
6839 float64
uint32_to_float64(uint32_t a STATUS_PARAM
)
6841 return int64_to_float64(a STATUS_VAR
);
6844 uint32
float32_to_uint32( float32 a STATUS_PARAM
)
6848 int old_exc_flags
= get_float_exception_flags(status
);
6850 v
= float32_to_int64(a STATUS_VAR
);
6853 } else if (v
> 0xffffffff) {
6858 set_float_exception_flags(old_exc_flags
, status
);
6859 float_raise(float_flag_invalid STATUS_VAR
);
6863 uint32
float32_to_uint32_round_to_zero( float32 a STATUS_PARAM
)
6867 int old_exc_flags
= get_float_exception_flags(status
);
6869 v
= float32_to_int64_round_to_zero(a STATUS_VAR
);
6872 } else if (v
> 0xffffffff) {
6877 set_float_exception_flags(old_exc_flags
, status
);
6878 float_raise(float_flag_invalid STATUS_VAR
);
6882 int_fast16_t float32_to_int16(float32 a STATUS_PARAM
)
6886 int old_exc_flags
= get_float_exception_flags(status
);
6888 v
= float32_to_int32(a STATUS_VAR
);
6891 } else if (v
> 0x7fff) {
6897 set_float_exception_flags(old_exc_flags
, status
);
6898 float_raise(float_flag_invalid STATUS_VAR
);
6902 uint_fast16_t float32_to_uint16(float32 a STATUS_PARAM
)
6906 int old_exc_flags
= get_float_exception_flags(status
);
6908 v
= float32_to_int32(a STATUS_VAR
);
6911 } else if (v
> 0xffff) {
6917 set_float_exception_flags(old_exc_flags
, status
);
6918 float_raise(float_flag_invalid STATUS_VAR
);
6922 uint_fast16_t float32_to_uint16_round_to_zero(float32 a STATUS_PARAM
)
6926 int old_exc_flags
= get_float_exception_flags(status
);
6928 v
= float32_to_int64_round_to_zero(a STATUS_VAR
);
6931 } else if (v
> 0xffff) {
6936 set_float_exception_flags(old_exc_flags
, status
);
6937 float_raise(float_flag_invalid STATUS_VAR
);
6941 uint32
float64_to_uint32( float64 a STATUS_PARAM
)
6945 int old_exc_flags
= get_float_exception_flags(status
);
6947 v
= float64_to_uint64(a STATUS_VAR
);
6948 if (v
> 0xffffffff) {
6953 set_float_exception_flags(old_exc_flags
, status
);
6954 float_raise(float_flag_invalid STATUS_VAR
);
6958 uint32
float64_to_uint32_round_to_zero( float64 a STATUS_PARAM
)
6962 int old_exc_flags
= get_float_exception_flags(status
);
6964 v
= float64_to_uint64_round_to_zero(a STATUS_VAR
);
6965 if (v
> 0xffffffff) {
6970 set_float_exception_flags(old_exc_flags
, status
);
6971 float_raise(float_flag_invalid STATUS_VAR
);
6975 int_fast16_t float64_to_int16(float64 a STATUS_PARAM
)
6979 int old_exc_flags
= get_float_exception_flags(status
);
6981 v
= float64_to_int32(a STATUS_VAR
);
6984 } else if (v
> 0x7fff) {
6990 set_float_exception_flags(old_exc_flags
, status
);
6991 float_raise(float_flag_invalid STATUS_VAR
);
6995 uint_fast16_t float64_to_uint16(float64 a STATUS_PARAM
)
6999 int old_exc_flags
= get_float_exception_flags(status
);
7001 v
= float64_to_int32(a STATUS_VAR
);
7004 } else if (v
> 0xffff) {
7010 set_float_exception_flags(old_exc_flags
, status
);
7011 float_raise(float_flag_invalid STATUS_VAR
);
7015 uint_fast16_t float64_to_uint16_round_to_zero(float64 a STATUS_PARAM
)
7019 int old_exc_flags
= get_float_exception_flags(status
);
7021 v
= float64_to_int64_round_to_zero(a STATUS_VAR
);
7024 } else if (v
> 0xffff) {
7029 set_float_exception_flags(old_exc_flags
, status
);
7030 float_raise(float_flag_invalid STATUS_VAR
);
7034 /*----------------------------------------------------------------------------
7035 | Returns the result of converting the double-precision floating-point value
7036 | `a' to the 64-bit unsigned integer format. The conversion is
7037 | performed according to the IEC/IEEE Standard for Binary Floating-Point
7038 | Arithmetic---which means in particular that the conversion is rounded
7039 | according to the current rounding mode. If `a' is a NaN, the largest
7040 | positive integer is returned. If the conversion overflows, the
7041 | largest unsigned integer is returned. If 'a' is negative, the value is
7042 | rounded and zero is returned; negative values that do not round to zero
7043 | will raise the inexact exception.
7044 *----------------------------------------------------------------------------*/
7046 uint64_t float64_to_uint64(float64 a STATUS_PARAM
)
7049 int_fast16_t aExp
, shiftCount
;
7050 uint64_t aSig
, aSigExtra
;
7051 a
= float64_squash_input_denormal(a STATUS_VAR
);
7053 aSig
= extractFloat64Frac(a
);
7054 aExp
= extractFloat64Exp(a
);
7055 aSign
= extractFloat64Sign(a
);
7056 if (aSign
&& (aExp
> 1022)) {
7057 float_raise(float_flag_invalid STATUS_VAR
);
7058 if (float64_is_any_nan(a
)) {
7059 return LIT64(0xFFFFFFFFFFFFFFFF);
7065 aSig
|= LIT64(0x0010000000000000);
7067 shiftCount
= 0x433 - aExp
;
7068 if (shiftCount
<= 0) {
7070 float_raise(float_flag_invalid STATUS_VAR
);
7071 return LIT64(0xFFFFFFFFFFFFFFFF);
7074 aSig
<<= -shiftCount
;
7076 shift64ExtraRightJamming(aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
7078 return roundAndPackUint64(aSign
, aSig
, aSigExtra STATUS_VAR
);
7081 uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM
)
7083 signed char current_rounding_mode
= STATUS(float_rounding_mode
);
7084 set_float_rounding_mode(float_round_to_zero STATUS_VAR
);
7085 int64_t v
= float64_to_uint64(a STATUS_VAR
);
7086 set_float_rounding_mode(current_rounding_mode STATUS_VAR
);
7090 #define COMPARE(s, nan_exp) \
7091 INLINE int float ## s ## _compare_internal( float ## s a, float ## s b, \
7092 int is_quiet STATUS_PARAM ) \
7094 flag aSign, bSign; \
7095 uint ## s ## _t av, bv; \
7096 a = float ## s ## _squash_input_denormal(a STATUS_VAR); \
7097 b = float ## s ## _squash_input_denormal(b STATUS_VAR); \
7099 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \
7100 extractFloat ## s ## Frac( a ) ) || \
7101 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \
7102 extractFloat ## s ## Frac( b ) )) { \
7104 float ## s ## _is_signaling_nan( a ) || \
7105 float ## s ## _is_signaling_nan( b ) ) { \
7106 float_raise( float_flag_invalid STATUS_VAR); \
7108 return float_relation_unordered; \
7110 aSign = extractFloat ## s ## Sign( a ); \
7111 bSign = extractFloat ## s ## Sign( b ); \
7112 av = float ## s ## _val(a); \
7113 bv = float ## s ## _val(b); \
7114 if ( aSign != bSign ) { \
7115 if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) { \
7117 return float_relation_equal; \
7119 return 1 - (2 * aSign); \
7123 return float_relation_equal; \
7125 return 1 - 2 * (aSign ^ ( av < bv )); \
7130 int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM ) \
7132 return float ## s ## _compare_internal(a, b, 0 STATUS_VAR); \
7135 int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM ) \
7137 return float ## s ## _compare_internal(a, b, 1 STATUS_VAR); \
7143 INLINE
int floatx80_compare_internal( floatx80 a
, floatx80 b
,
7144 int is_quiet STATUS_PARAM
)
7148 if (( ( extractFloatx80Exp( a
) == 0x7fff ) &&
7149 ( extractFloatx80Frac( a
)<<1 ) ) ||
7150 ( ( extractFloatx80Exp( b
) == 0x7fff ) &&
7151 ( extractFloatx80Frac( b
)<<1 ) )) {
7153 floatx80_is_signaling_nan( a
) ||
7154 floatx80_is_signaling_nan( b
) ) {
7155 float_raise( float_flag_invalid STATUS_VAR
);
7157 return float_relation_unordered
;
7159 aSign
= extractFloatx80Sign( a
);
7160 bSign
= extractFloatx80Sign( b
);
7161 if ( aSign
!= bSign
) {
7163 if ( ( ( (uint16_t) ( ( a
.high
| b
.high
) << 1 ) ) == 0) &&
7164 ( ( a
.low
| b
.low
) == 0 ) ) {
7166 return float_relation_equal
;
7168 return 1 - (2 * aSign
);
7171 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
7172 return float_relation_equal
;
7174 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
7179 int floatx80_compare( floatx80 a
, floatx80 b STATUS_PARAM
)
7181 return floatx80_compare_internal(a
, b
, 0 STATUS_VAR
);
7184 int floatx80_compare_quiet( floatx80 a
, floatx80 b STATUS_PARAM
)
7186 return floatx80_compare_internal(a
, b
, 1 STATUS_VAR
);
7189 INLINE
int float128_compare_internal( float128 a
, float128 b
,
7190 int is_quiet STATUS_PARAM
)
7194 if (( ( extractFloat128Exp( a
) == 0x7fff ) &&
7195 ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) ) ||
7196 ( ( extractFloat128Exp( b
) == 0x7fff ) &&
7197 ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )) {
7199 float128_is_signaling_nan( a
) ||
7200 float128_is_signaling_nan( b
) ) {
7201 float_raise( float_flag_invalid STATUS_VAR
);
7203 return float_relation_unordered
;
7205 aSign
= extractFloat128Sign( a
);
7206 bSign
= extractFloat128Sign( b
);
7207 if ( aSign
!= bSign
) {
7208 if ( ( ( ( a
.high
| b
.high
)<<1 ) | a
.low
| b
.low
) == 0 ) {
7210 return float_relation_equal
;
7212 return 1 - (2 * aSign
);
7215 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
7216 return float_relation_equal
;
7218 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
7223 int float128_compare( float128 a
, float128 b STATUS_PARAM
)
7225 return float128_compare_internal(a
, b
, 0 STATUS_VAR
);
7228 int float128_compare_quiet( float128 a
, float128 b STATUS_PARAM
)
7230 return float128_compare_internal(a
, b
, 1 STATUS_VAR
);
7233 /* min() and max() functions. These can't be implemented as
7234 * 'compare and pick one input' because that would mishandle
7235 * NaNs and +0 vs -0.
7237 * minnum() and maxnum() functions. These are similar to the min()
7238 * and max() functions but if one of the arguments is a QNaN and
7239 * the other is numerical then the numerical argument is returned.
7240 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
7241 * and maxNum() operations. min() and max() are the typical min/max
7242 * semantics provided by many CPUs which predate that specification.
7245 INLINE float ## s float ## s ## _minmax(float ## s a, float ## s b, \
7246 int ismin, int isieee STATUS_PARAM) \
7248 flag aSign, bSign; \
7249 uint ## s ## _t av, bv; \
7250 a = float ## s ## _squash_input_denormal(a STATUS_VAR); \
7251 b = float ## s ## _squash_input_denormal(b STATUS_VAR); \
7252 if (float ## s ## _is_any_nan(a) || \
7253 float ## s ## _is_any_nan(b)) { \
7255 if (float ## s ## _is_quiet_nan(a) && \
7256 !float ## s ##_is_any_nan(b)) { \
7258 } else if (float ## s ## _is_quiet_nan(b) && \
7259 !float ## s ## _is_any_nan(a)) { \
7263 return propagateFloat ## s ## NaN(a, b STATUS_VAR); \
7265 aSign = extractFloat ## s ## Sign(a); \
7266 bSign = extractFloat ## s ## Sign(b); \
7267 av = float ## s ## _val(a); \
7268 bv = float ## s ## _val(b); \
7269 if (aSign != bSign) { \
7271 return aSign ? a : b; \
7273 return aSign ? b : a; \
7277 return (aSign ^ (av < bv)) ? a : b; \
7279 return (aSign ^ (av < bv)) ? b : a; \
7284 float ## s float ## s ## _min(float ## s a, float ## s b STATUS_PARAM) \
7286 return float ## s ## _minmax(a, b, 1, 0 STATUS_VAR); \
7289 float ## s float ## s ## _max(float ## s a, float ## s b STATUS_PARAM) \
7291 return float ## s ## _minmax(a, b, 0, 0 STATUS_VAR); \
7294 float ## s float ## s ## _minnum(float ## s a, float ## s b STATUS_PARAM) \
7296 return float ## s ## _minmax(a, b, 1, 1 STATUS_VAR); \
7299 float ## s float ## s ## _maxnum(float ## s a, float ## s b STATUS_PARAM) \
7301 return float ## s ## _minmax(a, b, 0, 1 STATUS_VAR); \
7308 /* Multiply A by 2 raised to the power N. */
7309 float32
float32_scalbn( float32 a
, int n STATUS_PARAM
)
7315 a
= float32_squash_input_denormal(a STATUS_VAR
);
7316 aSig
= extractFloat32Frac( a
);
7317 aExp
= extractFloat32Exp( a
);
7318 aSign
= extractFloat32Sign( a
);
7320 if ( aExp
== 0xFF ) {
7322 return propagateFloat32NaN( a
, a STATUS_VAR
);
7328 } else if (aSig
== 0) {
7336 } else if (n
< -0x200) {
7342 return normalizeRoundAndPackFloat32( aSign
, aExp
, aSig STATUS_VAR
);
7345 float64
float64_scalbn( float64 a
, int n STATUS_PARAM
)
7351 a
= float64_squash_input_denormal(a STATUS_VAR
);
7352 aSig
= extractFloat64Frac( a
);
7353 aExp
= extractFloat64Exp( a
);
7354 aSign
= extractFloat64Sign( a
);
7356 if ( aExp
== 0x7FF ) {
7358 return propagateFloat64NaN( a
, a STATUS_VAR
);
7363 aSig
|= LIT64( 0x0010000000000000 );
7364 } else if (aSig
== 0) {
7372 } else if (n
< -0x1000) {
7378 return normalizeRoundAndPackFloat64( aSign
, aExp
, aSig STATUS_VAR
);
7381 floatx80
floatx80_scalbn( floatx80 a
, int n STATUS_PARAM
)
7387 aSig
= extractFloatx80Frac( a
);
7388 aExp
= extractFloatx80Exp( a
);
7389 aSign
= extractFloatx80Sign( a
);
7391 if ( aExp
== 0x7FFF ) {
7393 return propagateFloatx80NaN( a
, a STATUS_VAR
);
7407 } else if (n
< -0x10000) {
7412 return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision
),
7413 aSign
, aExp
, aSig
, 0 STATUS_VAR
);
7416 float128
float128_scalbn( float128 a
, int n STATUS_PARAM
)
7420 uint64_t aSig0
, aSig1
;
7422 aSig1
= extractFloat128Frac1( a
);
7423 aSig0
= extractFloat128Frac0( a
);
7424 aExp
= extractFloat128Exp( a
);
7425 aSign
= extractFloat128Sign( a
);
7426 if ( aExp
== 0x7FFF ) {
7427 if ( aSig0
| aSig1
) {
7428 return propagateFloat128NaN( a
, a STATUS_VAR
);
7433 aSig0
|= LIT64( 0x0001000000000000 );
7434 } else if (aSig0
== 0 && aSig1
== 0) {
7442 } else if (n
< -0x10000) {
7447 return normalizeRoundAndPackFloat128( aSign
, aExp
, aSig0
, aSig1