4 * Derived from SoftFloat.
7 /*============================================================================
9 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
12 Written by John R. Hauser. This work was made possible in part by the
13 International Computer Science Institute, located at Suite 600, 1947 Center
14 Street, Berkeley, California 94704. Funding was partially provided by the
15 National Science Foundation under grant MIP-9311980. The original version
16 of this code was written as part of a project to build a fixed-point vector
17 processor in collaboration with the University of California at Berkeley,
18 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
19 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
20 arithmetic/SoftFloat.html'.
22 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
23 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
24 RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
25 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
26 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
27 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
28 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
29 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
31 Derivative works are acceptable, even for commercial purposes, so long as
32 (1) the source code for the derivative work includes prominent notice that
33 the work is derivative, and (2) the source code includes prominent notice with
34 these four paragraphs for those parts of this code that are retained.
36 =============================================================================*/
38 #include "softfloat.h"
40 /*----------------------------------------------------------------------------
41 | Primitive arithmetic functions, including multi-word arithmetic, and
42 | division and square root approximations. (Can be specialized to target if
44 *----------------------------------------------------------------------------*/
45 #include "softfloat-macros.h"
47 /*----------------------------------------------------------------------------
48 | Functions and definitions to determine: (1) whether tininess for underflow
49 | is detected before or after rounding by default, (2) what (if anything)
50 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
51 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
52 | are propagated from function inputs to output. These details are target-
54 *----------------------------------------------------------------------------*/
55 #include "softfloat-specialize.h"
57 void set_float_rounding_mode(int val STATUS_PARAM
)
59 STATUS(float_rounding_mode
) = val
;
62 void set_float_exception_flags(int val STATUS_PARAM
)
64 STATUS(float_exception_flags
) = val
;
67 void set_floatx80_rounding_precision(int val STATUS_PARAM
)
69 STATUS(floatx80_rounding_precision
) = val
;
72 /*----------------------------------------------------------------------------
73 | Returns the fraction bits of the half-precision floating-point value `a'.
74 *----------------------------------------------------------------------------*/
76 INLINE
uint32_t extractFloat16Frac(float16 a
)
78 return float16_val(a
) & 0x3ff;
81 /*----------------------------------------------------------------------------
82 | Returns the exponent bits of the half-precision floating-point value `a'.
83 *----------------------------------------------------------------------------*/
85 INLINE int16
extractFloat16Exp(float16 a
)
87 return (float16_val(a
) >> 10) & 0x1f;
90 /*----------------------------------------------------------------------------
91 | Returns the sign bit of the single-precision floating-point value `a'.
92 *----------------------------------------------------------------------------*/
94 INLINE flag
extractFloat16Sign(float16 a
)
96 return float16_val(a
)>>15;
99 /*----------------------------------------------------------------------------
100 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
101 | and 7, and returns the properly rounded 32-bit integer corresponding to the
102 | input. If `zSign' is 1, the input is negated before being converted to an
103 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
104 | is simply rounded to an integer, with the inexact exception raised if the
105 | input cannot be represented exactly as an integer. However, if the fixed-
106 | point input is too large, the invalid exception is raised and the largest
107 | positive or negative integer is returned.
108 *----------------------------------------------------------------------------*/
110 static int32
roundAndPackInt32( flag zSign
, uint64_t absZ STATUS_PARAM
)
113 flag roundNearestEven
;
114 int8 roundIncrement
, roundBits
;
117 roundingMode
= STATUS(float_rounding_mode
);
118 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
119 roundIncrement
= 0x40;
120 if ( ! roundNearestEven
) {
121 if ( roundingMode
== float_round_to_zero
) {
125 roundIncrement
= 0x7F;
127 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
130 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
134 roundBits
= absZ
& 0x7F;
135 absZ
= ( absZ
+ roundIncrement
)>>7;
136 absZ
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
138 if ( zSign
) z
= - z
;
139 if ( ( absZ
>>32 ) || ( z
&& ( ( z
< 0 ) ^ zSign
) ) ) {
140 float_raise( float_flag_invalid STATUS_VAR
);
141 return zSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
143 if ( roundBits
) STATUS(float_exception_flags
) |= float_flag_inexact
;
148 /*----------------------------------------------------------------------------
149 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
150 | `absZ1', with binary point between bits 63 and 64 (between the input words),
151 | and returns the properly rounded 64-bit integer corresponding to the input.
152 | If `zSign' is 1, the input is negated before being converted to an integer.
153 | Ordinarily, the fixed-point input is simply rounded to an integer, with
154 | the inexact exception raised if the input cannot be represented exactly as
155 | an integer. However, if the fixed-point input is too large, the invalid
156 | exception is raised and the largest positive or negative integer is
158 *----------------------------------------------------------------------------*/
160 static int64
roundAndPackInt64( flag zSign
, uint64_t absZ0
, uint64_t absZ1 STATUS_PARAM
)
163 flag roundNearestEven
, increment
;
166 roundingMode
= STATUS(float_rounding_mode
);
167 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
168 increment
= ( (int64_t) absZ1
< 0 );
169 if ( ! roundNearestEven
) {
170 if ( roundingMode
== float_round_to_zero
) {
175 increment
= ( roundingMode
== float_round_down
) && absZ1
;
178 increment
= ( roundingMode
== float_round_up
) && absZ1
;
184 if ( absZ0
== 0 ) goto overflow
;
185 absZ0
&= ~ ( ( (uint64_t) ( absZ1
<<1 ) == 0 ) & roundNearestEven
);
188 if ( zSign
) z
= - z
;
189 if ( z
&& ( ( z
< 0 ) ^ zSign
) ) {
191 float_raise( float_flag_invalid STATUS_VAR
);
193 zSign
? (int64_t) LIT64( 0x8000000000000000 )
194 : LIT64( 0x7FFFFFFFFFFFFFFF );
196 if ( absZ1
) STATUS(float_exception_flags
) |= float_flag_inexact
;
201 /*----------------------------------------------------------------------------
202 | Returns the fraction bits of the single-precision floating-point value `a'.
203 *----------------------------------------------------------------------------*/
205 INLINE
uint32_t extractFloat32Frac( float32 a
)
208 return float32_val(a
) & 0x007FFFFF;
212 /*----------------------------------------------------------------------------
213 | Returns the exponent bits of the single-precision floating-point value `a'.
214 *----------------------------------------------------------------------------*/
216 INLINE int16
extractFloat32Exp( float32 a
)
219 return ( float32_val(a
)>>23 ) & 0xFF;
223 /*----------------------------------------------------------------------------
224 | Returns the sign bit of the single-precision floating-point value `a'.
225 *----------------------------------------------------------------------------*/
227 INLINE flag
extractFloat32Sign( float32 a
)
230 return float32_val(a
)>>31;
234 /*----------------------------------------------------------------------------
235 | If `a' is denormal and we are in flush-to-zero mode then set the
236 | input-denormal exception and return zero. Otherwise just return the value.
237 *----------------------------------------------------------------------------*/
238 static float32
float32_squash_input_denormal(float32 a STATUS_PARAM
)
240 if (STATUS(flush_inputs_to_zero
)) {
241 if (extractFloat32Exp(a
) == 0 && extractFloat32Frac(a
) != 0) {
242 float_raise(float_flag_input_denormal STATUS_VAR
);
243 return make_float32(float32_val(a
) & 0x80000000);
249 /*----------------------------------------------------------------------------
250 | Normalizes the subnormal single-precision floating-point value represented
251 | by the denormalized significand `aSig'. The normalized exponent and
252 | significand are stored at the locations pointed to by `zExpPtr' and
253 | `zSigPtr', respectively.
254 *----------------------------------------------------------------------------*/
257 normalizeFloat32Subnormal( uint32_t aSig
, int16
*zExpPtr
, uint32_t *zSigPtr
)
261 shiftCount
= countLeadingZeros32( aSig
) - 8;
262 *zSigPtr
= aSig
<<shiftCount
;
263 *zExpPtr
= 1 - shiftCount
;
267 /*----------------------------------------------------------------------------
268 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
269 | single-precision floating-point value, returning the result. After being
270 | shifted into the proper positions, the three fields are simply added
271 | together to form the result. This means that any integer portion of `zSig'
272 | will be added into the exponent. Since a properly normalized significand
273 | will have an integer portion equal to 1, the `zExp' input should be 1 less
274 | than the desired result exponent whenever `zSig' is a complete, normalized
276 *----------------------------------------------------------------------------*/
278 INLINE float32
packFloat32( flag zSign
, int16 zExp
, uint32_t zSig
)
282 ( ( (uint32_t) zSign
)<<31 ) + ( ( (uint32_t) zExp
)<<23 ) + zSig
);
286 /*----------------------------------------------------------------------------
287 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
288 | and significand `zSig', and returns the proper single-precision floating-
289 | point value corresponding to the abstract input. Ordinarily, the abstract
290 | value is simply rounded and packed into the single-precision format, with
291 | the inexact exception raised if the abstract input cannot be represented
292 | exactly. However, if the abstract value is too large, the overflow and
293 | inexact exceptions are raised and an infinity or maximal finite value is
294 | returned. If the abstract value is too small, the input value is rounded to
295 | a subnormal number, and the underflow and inexact exceptions are raised if
296 | the abstract input cannot be represented exactly as a subnormal single-
297 | precision floating-point number.
298 | The input significand `zSig' has its binary point between bits 30
299 | and 29, which is 7 bits to the left of the usual location. This shifted
300 | significand must be normalized or smaller. If `zSig' is not normalized,
301 | `zExp' must be 0; in that case, the result returned is a subnormal number,
302 | and it must not require rounding. In the usual case that `zSig' is
303 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
304 | The handling of underflow and overflow follows the IEC/IEEE Standard for
305 | Binary Floating-Point Arithmetic.
306 *----------------------------------------------------------------------------*/
308 static float32
roundAndPackFloat32( flag zSign
, int16 zExp
, uint32_t zSig STATUS_PARAM
)
311 flag roundNearestEven
;
312 int8 roundIncrement
, roundBits
;
315 roundingMode
= STATUS(float_rounding_mode
);
316 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
317 roundIncrement
= 0x40;
318 if ( ! roundNearestEven
) {
319 if ( roundingMode
== float_round_to_zero
) {
323 roundIncrement
= 0x7F;
325 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
328 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
332 roundBits
= zSig
& 0x7F;
333 if ( 0xFD <= (uint16_t) zExp
) {
335 || ( ( zExp
== 0xFD )
336 && ( (int32_t) ( zSig
+ roundIncrement
) < 0 ) )
338 float_raise( float_flag_overflow
| float_flag_inexact STATUS_VAR
);
339 return packFloat32( zSign
, 0xFF, - ( roundIncrement
== 0 ));
342 if (STATUS(flush_to_zero
)) {
343 float_raise(float_flag_output_denormal STATUS_VAR
);
344 return packFloat32(zSign
, 0, 0);
347 ( STATUS(float_detect_tininess
) == float_tininess_before_rounding
)
349 || ( zSig
+ roundIncrement
< 0x80000000 );
350 shift32RightJamming( zSig
, - zExp
, &zSig
);
352 roundBits
= zSig
& 0x7F;
353 if ( isTiny
&& roundBits
) float_raise( float_flag_underflow STATUS_VAR
);
356 if ( roundBits
) STATUS(float_exception_flags
) |= float_flag_inexact
;
357 zSig
= ( zSig
+ roundIncrement
)>>7;
358 zSig
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
359 if ( zSig
== 0 ) zExp
= 0;
360 return packFloat32( zSign
, zExp
, zSig
);
364 /*----------------------------------------------------------------------------
365 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
366 | and significand `zSig', and returns the proper single-precision floating-
367 | point value corresponding to the abstract input. This routine is just like
368 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
369 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
370 | floating-point exponent.
371 *----------------------------------------------------------------------------*/
374 normalizeRoundAndPackFloat32( flag zSign
, int16 zExp
, uint32_t zSig STATUS_PARAM
)
378 shiftCount
= countLeadingZeros32( zSig
) - 1;
379 return roundAndPackFloat32( zSign
, zExp
- shiftCount
, zSig
<<shiftCount STATUS_VAR
);
383 /*----------------------------------------------------------------------------
384 | Returns the fraction bits of the double-precision floating-point value `a'.
385 *----------------------------------------------------------------------------*/
387 INLINE
uint64_t extractFloat64Frac( float64 a
)
390 return float64_val(a
) & LIT64( 0x000FFFFFFFFFFFFF );
394 /*----------------------------------------------------------------------------
395 | Returns the exponent bits of the double-precision floating-point value `a'.
396 *----------------------------------------------------------------------------*/
398 INLINE int16
extractFloat64Exp( float64 a
)
401 return ( float64_val(a
)>>52 ) & 0x7FF;
405 /*----------------------------------------------------------------------------
406 | Returns the sign bit of the double-precision floating-point value `a'.
407 *----------------------------------------------------------------------------*/
409 INLINE flag
extractFloat64Sign( float64 a
)
412 return float64_val(a
)>>63;
416 /*----------------------------------------------------------------------------
417 | If `a' is denormal and we are in flush-to-zero mode then set the
418 | input-denormal exception and return zero. Otherwise just return the value.
419 *----------------------------------------------------------------------------*/
420 static float64
float64_squash_input_denormal(float64 a STATUS_PARAM
)
422 if (STATUS(flush_inputs_to_zero
)) {
423 if (extractFloat64Exp(a
) == 0 && extractFloat64Frac(a
) != 0) {
424 float_raise(float_flag_input_denormal STATUS_VAR
);
425 return make_float64(float64_val(a
) & (1ULL << 63));
431 /*----------------------------------------------------------------------------
432 | Normalizes the subnormal double-precision floating-point value represented
433 | by the denormalized significand `aSig'. The normalized exponent and
434 | significand are stored at the locations pointed to by `zExpPtr' and
435 | `zSigPtr', respectively.
436 *----------------------------------------------------------------------------*/
439 normalizeFloat64Subnormal( uint64_t aSig
, int16
*zExpPtr
, uint64_t *zSigPtr
)
443 shiftCount
= countLeadingZeros64( aSig
) - 11;
444 *zSigPtr
= aSig
<<shiftCount
;
445 *zExpPtr
= 1 - shiftCount
;
449 /*----------------------------------------------------------------------------
450 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
451 | double-precision floating-point value, returning the result. After being
452 | shifted into the proper positions, the three fields are simply added
453 | together to form the result. This means that any integer portion of `zSig'
454 | will be added into the exponent. Since a properly normalized significand
455 | will have an integer portion equal to 1, the `zExp' input should be 1 less
456 | than the desired result exponent whenever `zSig' is a complete, normalized
458 *----------------------------------------------------------------------------*/
460 INLINE float64
packFloat64( flag zSign
, int16 zExp
, uint64_t zSig
)
464 ( ( (uint64_t) zSign
)<<63 ) + ( ( (uint64_t) zExp
)<<52 ) + zSig
);
468 /*----------------------------------------------------------------------------
469 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
470 | and significand `zSig', and returns the proper double-precision floating-
471 | point value corresponding to the abstract input. Ordinarily, the abstract
472 | value is simply rounded and packed into the double-precision format, with
473 | the inexact exception raised if the abstract input cannot be represented
474 | exactly. However, if the abstract value is too large, the overflow and
475 | inexact exceptions are raised and an infinity or maximal finite value is
476 | returned. If the abstract value is too small, the input value is rounded
477 | to a subnormal number, and the underflow and inexact exceptions are raised
478 | if the abstract input cannot be represented exactly as a subnormal double-
479 | precision floating-point number.
480 | The input significand `zSig' has its binary point between bits 62
481 | and 61, which is 10 bits to the left of the usual location. This shifted
482 | significand must be normalized or smaller. If `zSig' is not normalized,
483 | `zExp' must be 0; in that case, the result returned is a subnormal number,
484 | and it must not require rounding. In the usual case that `zSig' is
485 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
486 | The handling of underflow and overflow follows the IEC/IEEE Standard for
487 | Binary Floating-Point Arithmetic.
488 *----------------------------------------------------------------------------*/
490 static float64
roundAndPackFloat64( flag zSign
, int16 zExp
, uint64_t zSig STATUS_PARAM
)
493 flag roundNearestEven
;
494 int16 roundIncrement
, roundBits
;
497 roundingMode
= STATUS(float_rounding_mode
);
498 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
499 roundIncrement
= 0x200;
500 if ( ! roundNearestEven
) {
501 if ( roundingMode
== float_round_to_zero
) {
505 roundIncrement
= 0x3FF;
507 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
510 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
514 roundBits
= zSig
& 0x3FF;
515 if ( 0x7FD <= (uint16_t) zExp
) {
516 if ( ( 0x7FD < zExp
)
517 || ( ( zExp
== 0x7FD )
518 && ( (int64_t) ( zSig
+ roundIncrement
) < 0 ) )
520 float_raise( float_flag_overflow
| float_flag_inexact STATUS_VAR
);
521 return packFloat64( zSign
, 0x7FF, - ( roundIncrement
== 0 ));
524 if (STATUS(flush_to_zero
)) {
525 float_raise(float_flag_output_denormal STATUS_VAR
);
526 return packFloat64(zSign
, 0, 0);
529 ( STATUS(float_detect_tininess
) == float_tininess_before_rounding
)
531 || ( zSig
+ roundIncrement
< LIT64( 0x8000000000000000 ) );
532 shift64RightJamming( zSig
, - zExp
, &zSig
);
534 roundBits
= zSig
& 0x3FF;
535 if ( isTiny
&& roundBits
) float_raise( float_flag_underflow STATUS_VAR
);
538 if ( roundBits
) STATUS(float_exception_flags
) |= float_flag_inexact
;
539 zSig
= ( zSig
+ roundIncrement
)>>10;
540 zSig
&= ~ ( ( ( roundBits
^ 0x200 ) == 0 ) & roundNearestEven
);
541 if ( zSig
== 0 ) zExp
= 0;
542 return packFloat64( zSign
, zExp
, zSig
);
546 /*----------------------------------------------------------------------------
547 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
548 | and significand `zSig', and returns the proper double-precision floating-
549 | point value corresponding to the abstract input. This routine is just like
550 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
551 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
552 | floating-point exponent.
553 *----------------------------------------------------------------------------*/
556 normalizeRoundAndPackFloat64( flag zSign
, int16 zExp
, uint64_t zSig STATUS_PARAM
)
560 shiftCount
= countLeadingZeros64( zSig
) - 1;
561 return roundAndPackFloat64( zSign
, zExp
- shiftCount
, zSig
<<shiftCount STATUS_VAR
);
565 /*----------------------------------------------------------------------------
566 | Returns the fraction bits of the extended double-precision floating-point
568 *----------------------------------------------------------------------------*/
570 INLINE
uint64_t extractFloatx80Frac( floatx80 a
)
577 /*----------------------------------------------------------------------------
578 | Returns the exponent bits of the extended double-precision floating-point
580 *----------------------------------------------------------------------------*/
582 INLINE int32
extractFloatx80Exp( floatx80 a
)
585 return a
.high
& 0x7FFF;
589 /*----------------------------------------------------------------------------
590 | Returns the sign bit of the extended double-precision floating-point value
592 *----------------------------------------------------------------------------*/
594 INLINE flag
extractFloatx80Sign( floatx80 a
)
601 /*----------------------------------------------------------------------------
602 | Normalizes the subnormal extended double-precision floating-point value
603 | represented by the denormalized significand `aSig'. The normalized exponent
604 | and significand are stored at the locations pointed to by `zExpPtr' and
605 | `zSigPtr', respectively.
606 *----------------------------------------------------------------------------*/
609 normalizeFloatx80Subnormal( uint64_t aSig
, int32
*zExpPtr
, uint64_t *zSigPtr
)
613 shiftCount
= countLeadingZeros64( aSig
);
614 *zSigPtr
= aSig
<<shiftCount
;
615 *zExpPtr
= 1 - shiftCount
;
619 /*----------------------------------------------------------------------------
620 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
621 | extended double-precision floating-point value, returning the result.
622 *----------------------------------------------------------------------------*/
624 INLINE floatx80
packFloatx80( flag zSign
, int32 zExp
, uint64_t zSig
)
629 z
.high
= ( ( (uint16_t) zSign
)<<15 ) + zExp
;
634 /*----------------------------------------------------------------------------
635 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
636 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
637 | and returns the proper extended double-precision floating-point value
638 | corresponding to the abstract input. Ordinarily, the abstract value is
639 | rounded and packed into the extended double-precision format, with the
640 | inexact exception raised if the abstract input cannot be represented
641 | exactly. However, if the abstract value is too large, the overflow and
642 | inexact exceptions are raised and an infinity or maximal finite value is
643 | returned. If the abstract value is too small, the input value is rounded to
644 | a subnormal number, and the underflow and inexact exceptions are raised if
645 | the abstract input cannot be represented exactly as a subnormal extended
646 | double-precision floating-point number.
647 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
648 | number of bits as single or double precision, respectively. Otherwise, the
649 | result is rounded to the full precision of the extended double-precision
651 | The input significand must be normalized or smaller. If the input
652 | significand is not normalized, `zExp' must be 0; in that case, the result
653 | returned is a subnormal number, and it must not require rounding. The
654 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
655 | Floating-Point Arithmetic.
656 *----------------------------------------------------------------------------*/
659 roundAndPackFloatx80(
660 int8 roundingPrecision
, flag zSign
, int32 zExp
, uint64_t zSig0
, uint64_t zSig1
664 flag roundNearestEven
, increment
, isTiny
;
665 int64 roundIncrement
, roundMask
, roundBits
;
667 roundingMode
= STATUS(float_rounding_mode
);
668 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
669 if ( roundingPrecision
== 80 ) goto precision80
;
670 if ( roundingPrecision
== 64 ) {
671 roundIncrement
= LIT64( 0x0000000000000400 );
672 roundMask
= LIT64( 0x00000000000007FF );
674 else if ( roundingPrecision
== 32 ) {
675 roundIncrement
= LIT64( 0x0000008000000000 );
676 roundMask
= LIT64( 0x000000FFFFFFFFFF );
681 zSig0
|= ( zSig1
!= 0 );
682 if ( ! roundNearestEven
) {
683 if ( roundingMode
== float_round_to_zero
) {
687 roundIncrement
= roundMask
;
689 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
692 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
696 roundBits
= zSig0
& roundMask
;
697 if ( 0x7FFD <= (uint32_t) ( zExp
- 1 ) ) {
698 if ( ( 0x7FFE < zExp
)
699 || ( ( zExp
== 0x7FFE ) && ( zSig0
+ roundIncrement
< zSig0
) )
704 if (STATUS(flush_to_zero
)) {
705 float_raise(float_flag_output_denormal STATUS_VAR
);
706 return packFloatx80(zSign
, 0, 0);
709 ( STATUS(float_detect_tininess
) == float_tininess_before_rounding
)
711 || ( zSig0
<= zSig0
+ roundIncrement
);
712 shift64RightJamming( zSig0
, 1 - zExp
, &zSig0
);
714 roundBits
= zSig0
& roundMask
;
715 if ( isTiny
&& roundBits
) float_raise( float_flag_underflow STATUS_VAR
);
716 if ( roundBits
) STATUS(float_exception_flags
) |= float_flag_inexact
;
717 zSig0
+= roundIncrement
;
718 if ( (int64_t) zSig0
< 0 ) zExp
= 1;
719 roundIncrement
= roundMask
+ 1;
720 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
721 roundMask
|= roundIncrement
;
723 zSig0
&= ~ roundMask
;
724 return packFloatx80( zSign
, zExp
, zSig0
);
727 if ( roundBits
) STATUS(float_exception_flags
) |= float_flag_inexact
;
728 zSig0
+= roundIncrement
;
729 if ( zSig0
< roundIncrement
) {
731 zSig0
= LIT64( 0x8000000000000000 );
733 roundIncrement
= roundMask
+ 1;
734 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
735 roundMask
|= roundIncrement
;
737 zSig0
&= ~ roundMask
;
738 if ( zSig0
== 0 ) zExp
= 0;
739 return packFloatx80( zSign
, zExp
, zSig0
);
741 increment
= ( (int64_t) zSig1
< 0 );
742 if ( ! roundNearestEven
) {
743 if ( roundingMode
== float_round_to_zero
) {
748 increment
= ( roundingMode
== float_round_down
) && zSig1
;
751 increment
= ( roundingMode
== float_round_up
) && zSig1
;
755 if ( 0x7FFD <= (uint32_t) ( zExp
- 1 ) ) {
756 if ( ( 0x7FFE < zExp
)
757 || ( ( zExp
== 0x7FFE )
758 && ( zSig0
== LIT64( 0xFFFFFFFFFFFFFFFF ) )
764 float_raise( float_flag_overflow
| float_flag_inexact STATUS_VAR
);
765 if ( ( roundingMode
== float_round_to_zero
)
766 || ( zSign
&& ( roundingMode
== float_round_up
) )
767 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
769 return packFloatx80( zSign
, 0x7FFE, ~ roundMask
);
771 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
775 ( STATUS(float_detect_tininess
) == float_tininess_before_rounding
)
778 || ( zSig0
< LIT64( 0xFFFFFFFFFFFFFFFF ) );
779 shift64ExtraRightJamming( zSig0
, zSig1
, 1 - zExp
, &zSig0
, &zSig1
);
781 if ( isTiny
&& zSig1
) float_raise( float_flag_underflow STATUS_VAR
);
782 if ( zSig1
) STATUS(float_exception_flags
) |= float_flag_inexact
;
783 if ( roundNearestEven
) {
784 increment
= ( (int64_t) zSig1
< 0 );
788 increment
= ( roundingMode
== float_round_down
) && zSig1
;
791 increment
= ( roundingMode
== float_round_up
) && zSig1
;
797 ~ ( ( (uint64_t) ( zSig1
<<1 ) == 0 ) & roundNearestEven
);
798 if ( (int64_t) zSig0
< 0 ) zExp
= 1;
800 return packFloatx80( zSign
, zExp
, zSig0
);
803 if ( zSig1
) STATUS(float_exception_flags
) |= float_flag_inexact
;
808 zSig0
= LIT64( 0x8000000000000000 );
811 zSig0
&= ~ ( ( (uint64_t) ( zSig1
<<1 ) == 0 ) & roundNearestEven
);
815 if ( zSig0
== 0 ) zExp
= 0;
817 return packFloatx80( zSign
, zExp
, zSig0
);
821 /*----------------------------------------------------------------------------
822 | Takes an abstract floating-point value having sign `zSign', exponent
823 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
824 | and returns the proper extended double-precision floating-point value
825 | corresponding to the abstract input. This routine is just like
826 | `roundAndPackFloatx80' except that the input significand does not have to be
828 *----------------------------------------------------------------------------*/
831 normalizeRoundAndPackFloatx80(
832 int8 roundingPrecision
, flag zSign
, int32 zExp
, uint64_t zSig0
, uint64_t zSig1
842 shiftCount
= countLeadingZeros64( zSig0
);
843 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
846 roundAndPackFloatx80( roundingPrecision
, zSign
, zExp
, zSig0
, zSig1 STATUS_VAR
);
850 /*----------------------------------------------------------------------------
851 | Returns the least-significant 64 fraction bits of the quadruple-precision
852 | floating-point value `a'.
853 *----------------------------------------------------------------------------*/
855 INLINE
uint64_t extractFloat128Frac1( float128 a
)
862 /*----------------------------------------------------------------------------
863 | Returns the most-significant 48 fraction bits of the quadruple-precision
864 | floating-point value `a'.
865 *----------------------------------------------------------------------------*/
867 INLINE
uint64_t extractFloat128Frac0( float128 a
)
870 return a
.high
& LIT64( 0x0000FFFFFFFFFFFF );
874 /*----------------------------------------------------------------------------
875 | Returns the exponent bits of the quadruple-precision floating-point value
877 *----------------------------------------------------------------------------*/
879 INLINE int32
extractFloat128Exp( float128 a
)
882 return ( a
.high
>>48 ) & 0x7FFF;
886 /*----------------------------------------------------------------------------
887 | Returns the sign bit of the quadruple-precision floating-point value `a'.
888 *----------------------------------------------------------------------------*/
890 INLINE flag
extractFloat128Sign( float128 a
)
897 /*----------------------------------------------------------------------------
898 | Normalizes the subnormal quadruple-precision floating-point value
899 | represented by the denormalized significand formed by the concatenation of
900 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
901 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
902 | significand are stored at the location pointed to by `zSig0Ptr', and the
903 | least significant 64 bits of the normalized significand are stored at the
904 | location pointed to by `zSig1Ptr'.
905 *----------------------------------------------------------------------------*/
908 normalizeFloat128Subnormal(
919 shiftCount
= countLeadingZeros64( aSig1
) - 15;
920 if ( shiftCount
< 0 ) {
921 *zSig0Ptr
= aSig1
>>( - shiftCount
);
922 *zSig1Ptr
= aSig1
<<( shiftCount
& 63 );
925 *zSig0Ptr
= aSig1
<<shiftCount
;
928 *zExpPtr
= - shiftCount
- 63;
931 shiftCount
= countLeadingZeros64( aSig0
) - 15;
932 shortShift128Left( aSig0
, aSig1
, shiftCount
, zSig0Ptr
, zSig1Ptr
);
933 *zExpPtr
= 1 - shiftCount
;
938 /*----------------------------------------------------------------------------
939 | Packs the sign `zSign', the exponent `zExp', and the significand formed
940 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
941 | floating-point value, returning the result. After being shifted into the
942 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
943 | added together to form the most significant 32 bits of the result. This
944 | means that any integer portion of `zSig0' will be added into the exponent.
945 | Since a properly normalized significand will have an integer portion equal
946 | to 1, the `zExp' input should be 1 less than the desired result exponent
947 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
949 *----------------------------------------------------------------------------*/
952 packFloat128( flag zSign
, int32 zExp
, uint64_t zSig0
, uint64_t zSig1
)
957 z
.high
= ( ( (uint64_t) zSign
)<<63 ) + ( ( (uint64_t) zExp
)<<48 ) + zSig0
;
962 /*----------------------------------------------------------------------------
963 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
964 | and extended significand formed by the concatenation of `zSig0', `zSig1',
965 | and `zSig2', and returns the proper quadruple-precision floating-point value
966 | corresponding to the abstract input. Ordinarily, the abstract value is
967 | simply rounded and packed into the quadruple-precision format, with the
968 | inexact exception raised if the abstract input cannot be represented
969 | exactly. However, if the abstract value is too large, the overflow and
970 | inexact exceptions are raised and an infinity or maximal finite value is
971 | returned. If the abstract value is too small, the input value is rounded to
972 | a subnormal number, and the underflow and inexact exceptions are raised if
973 | the abstract input cannot be represented exactly as a subnormal quadruple-
974 | precision floating-point number.
975 | The input significand must be normalized or smaller. If the input
976 | significand is not normalized, `zExp' must be 0; in that case, the result
977 | returned is a subnormal number, and it must not require rounding. In the
978 | usual case that the input significand is normalized, `zExp' must be 1 less
979 | than the ``true'' floating-point exponent. The handling of underflow and
980 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
981 *----------------------------------------------------------------------------*/
984 roundAndPackFloat128(
985 flag zSign
, int32 zExp
, uint64_t zSig0
, uint64_t zSig1
, uint64_t zSig2 STATUS_PARAM
)
988 flag roundNearestEven
, increment
, isTiny
;
990 roundingMode
= STATUS(float_rounding_mode
);
991 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
992 increment
= ( (int64_t) zSig2
< 0 );
993 if ( ! roundNearestEven
) {
994 if ( roundingMode
== float_round_to_zero
) {
999 increment
= ( roundingMode
== float_round_down
) && zSig2
;
1002 increment
= ( roundingMode
== float_round_up
) && zSig2
;
1006 if ( 0x7FFD <= (uint32_t) zExp
) {
1007 if ( ( 0x7FFD < zExp
)
1008 || ( ( zExp
== 0x7FFD )
1010 LIT64( 0x0001FFFFFFFFFFFF ),
1011 LIT64( 0xFFFFFFFFFFFFFFFF ),
1018 float_raise( float_flag_overflow
| float_flag_inexact STATUS_VAR
);
1019 if ( ( roundingMode
== float_round_to_zero
)
1020 || ( zSign
&& ( roundingMode
== float_round_up
) )
1021 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
1027 LIT64( 0x0000FFFFFFFFFFFF ),
1028 LIT64( 0xFFFFFFFFFFFFFFFF )
1031 return packFloat128( zSign
, 0x7FFF, 0, 0 );
1034 if (STATUS(flush_to_zero
)) {
1035 float_raise(float_flag_output_denormal STATUS_VAR
);
1036 return packFloat128(zSign
, 0, 0, 0);
1039 ( STATUS(float_detect_tininess
) == float_tininess_before_rounding
)
1045 LIT64( 0x0001FFFFFFFFFFFF ),
1046 LIT64( 0xFFFFFFFFFFFFFFFF )
1048 shift128ExtraRightJamming(
1049 zSig0
, zSig1
, zSig2
, - zExp
, &zSig0
, &zSig1
, &zSig2
);
1051 if ( isTiny
&& zSig2
) float_raise( float_flag_underflow STATUS_VAR
);
1052 if ( roundNearestEven
) {
1053 increment
= ( (int64_t) zSig2
< 0 );
1057 increment
= ( roundingMode
== float_round_down
) && zSig2
;
1060 increment
= ( roundingMode
== float_round_up
) && zSig2
;
1065 if ( zSig2
) STATUS(float_exception_flags
) |= float_flag_inexact
;
1067 add128( zSig0
, zSig1
, 0, 1, &zSig0
, &zSig1
);
1068 zSig1
&= ~ ( ( zSig2
+ zSig2
== 0 ) & roundNearestEven
);
1071 if ( ( zSig0
| zSig1
) == 0 ) zExp
= 0;
1073 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
1077 /*----------------------------------------------------------------------------
1078 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1079 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1080 | returns the proper quadruple-precision floating-point value corresponding
1081 | to the abstract input. This routine is just like `roundAndPackFloat128'
1082 | except that the input significand has fewer bits and does not have to be
1083 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1085 *----------------------------------------------------------------------------*/
1088 normalizeRoundAndPackFloat128(
1089 flag zSign
, int32 zExp
, uint64_t zSig0
, uint64_t zSig1 STATUS_PARAM
)
1099 shiftCount
= countLeadingZeros64( zSig0
) - 15;
1100 if ( 0 <= shiftCount
) {
1102 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
1105 shift128ExtraRightJamming(
1106 zSig0
, zSig1
, 0, - shiftCount
, &zSig0
, &zSig1
, &zSig2
);
1109 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2 STATUS_VAR
);
1113 /*----------------------------------------------------------------------------
1114 | Returns the result of converting the 32-bit two's complement integer `a'
1115 | to the single-precision floating-point format. The conversion is performed
1116 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1117 *----------------------------------------------------------------------------*/
1119 float32
int32_to_float32( int32 a STATUS_PARAM
)
1123 if ( a
== 0 ) return float32_zero
;
1124 if ( a
== (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1126 return normalizeRoundAndPackFloat32( zSign
, 0x9C, zSign
? - a
: a STATUS_VAR
);
1130 /*----------------------------------------------------------------------------
1131 | Returns the result of converting the 32-bit two's complement integer `a'
1132 | to the double-precision floating-point format. The conversion is performed
1133 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1134 *----------------------------------------------------------------------------*/
1136 float64
int32_to_float64( int32 a STATUS_PARAM
)
1143 if ( a
== 0 ) return float64_zero
;
1145 absA
= zSign
? - a
: a
;
1146 shiftCount
= countLeadingZeros32( absA
) + 21;
1148 return packFloat64( zSign
, 0x432 - shiftCount
, zSig
<<shiftCount
);
1152 /*----------------------------------------------------------------------------
1153 | Returns the result of converting the 32-bit two's complement integer `a'
1154 | to the extended double-precision floating-point format. The conversion
1155 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1157 *----------------------------------------------------------------------------*/
1159 floatx80
int32_to_floatx80( int32 a STATUS_PARAM
)
1166 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
1168 absA
= zSign
? - a
: a
;
1169 shiftCount
= countLeadingZeros32( absA
) + 32;
1171 return packFloatx80( zSign
, 0x403E - shiftCount
, zSig
<<shiftCount
);
1175 /*----------------------------------------------------------------------------
1176 | Returns the result of converting the 32-bit two's complement integer `a' to
1177 | the quadruple-precision floating-point format. The conversion is performed
1178 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1179 *----------------------------------------------------------------------------*/
1181 float128
int32_to_float128( int32 a STATUS_PARAM
)
1188 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
1190 absA
= zSign
? - a
: a
;
1191 shiftCount
= countLeadingZeros32( absA
) + 17;
1193 return packFloat128( zSign
, 0x402E - shiftCount
, zSig0
<<shiftCount
, 0 );
1197 /*----------------------------------------------------------------------------
1198 | Returns the result of converting the 64-bit two's complement integer `a'
1199 | to the single-precision floating-point format. The conversion is performed
1200 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1201 *----------------------------------------------------------------------------*/
1203 float32
int64_to_float32( int64 a STATUS_PARAM
)
1209 if ( a
== 0 ) return float32_zero
;
1211 absA
= zSign
? - a
: a
;
1212 shiftCount
= countLeadingZeros64( absA
) - 40;
1213 if ( 0 <= shiftCount
) {
1214 return packFloat32( zSign
, 0x95 - shiftCount
, absA
<<shiftCount
);
1218 if ( shiftCount
< 0 ) {
1219 shift64RightJamming( absA
, - shiftCount
, &absA
);
1222 absA
<<= shiftCount
;
1224 return roundAndPackFloat32( zSign
, 0x9C - shiftCount
, absA STATUS_VAR
);
1229 float32
uint64_to_float32( uint64 a STATUS_PARAM
)
1233 if ( a
== 0 ) return float32_zero
;
1234 shiftCount
= countLeadingZeros64( a
) - 40;
1235 if ( 0 <= shiftCount
) {
1236 return packFloat32( 1 > 0, 0x95 - shiftCount
, a
<<shiftCount
);
1240 if ( shiftCount
< 0 ) {
1241 shift64RightJamming( a
, - shiftCount
, &a
);
1246 return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount
, a STATUS_VAR
);
1250 /*----------------------------------------------------------------------------
1251 | Returns the result of converting the 64-bit two's complement integer `a'
1252 | to the double-precision floating-point format. The conversion is performed
1253 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1254 *----------------------------------------------------------------------------*/
1256 float64
int64_to_float64( int64 a STATUS_PARAM
)
1260 if ( a
== 0 ) return float64_zero
;
1261 if ( a
== (int64_t) LIT64( 0x8000000000000000 ) ) {
1262 return packFloat64( 1, 0x43E, 0 );
1265 return normalizeRoundAndPackFloat64( zSign
, 0x43C, zSign
? - a
: a STATUS_VAR
);
1269 float64
uint64_to_float64( uint64 a STATUS_PARAM
)
1271 if ( a
== 0 ) return float64_zero
;
1272 return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR
);
1276 /*----------------------------------------------------------------------------
1277 | Returns the result of converting the 64-bit two's complement integer `a'
1278 | to the extended double-precision floating-point format. The conversion
1279 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1281 *----------------------------------------------------------------------------*/
1283 floatx80
int64_to_floatx80( int64 a STATUS_PARAM
)
1289 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
1291 absA
= zSign
? - a
: a
;
1292 shiftCount
= countLeadingZeros64( absA
);
1293 return packFloatx80( zSign
, 0x403E - shiftCount
, absA
<<shiftCount
);
1297 /*----------------------------------------------------------------------------
1298 | Returns the result of converting the 64-bit two's complement integer `a' to
1299 | the quadruple-precision floating-point format. The conversion is performed
1300 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1301 *----------------------------------------------------------------------------*/
1303 float128
int64_to_float128( int64 a STATUS_PARAM
)
1309 uint64_t zSig0
, zSig1
;
1311 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
1313 absA
= zSign
? - a
: a
;
1314 shiftCount
= countLeadingZeros64( absA
) + 49;
1315 zExp
= 0x406E - shiftCount
;
1316 if ( 64 <= shiftCount
) {
1325 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
1326 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
1330 /*----------------------------------------------------------------------------
1331 | Returns the result of converting the single-precision floating-point value
1332 | `a' to the 32-bit two's complement integer format. The conversion is
1333 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1334 | Arithmetic---which means in particular that the conversion is rounded
1335 | according to the current rounding mode. If `a' is a NaN, the largest
1336 | positive integer is returned. Otherwise, if the conversion overflows, the
1337 | largest integer with the same sign as `a' is returned.
1338 *----------------------------------------------------------------------------*/
1340 int32
float32_to_int32( float32 a STATUS_PARAM
)
1343 int16 aExp
, shiftCount
;
1347 a
= float32_squash_input_denormal(a STATUS_VAR
);
1348 aSig
= extractFloat32Frac( a
);
1349 aExp
= extractFloat32Exp( a
);
1350 aSign
= extractFloat32Sign( a
);
1351 if ( ( aExp
== 0xFF ) && aSig
) aSign
= 0;
1352 if ( aExp
) aSig
|= 0x00800000;
1353 shiftCount
= 0xAF - aExp
;
1356 if ( 0 < shiftCount
) shift64RightJamming( aSig64
, shiftCount
, &aSig64
);
1357 return roundAndPackInt32( aSign
, aSig64 STATUS_VAR
);
1361 /*----------------------------------------------------------------------------
1362 | Returns the result of converting the single-precision floating-point value
1363 | `a' to the 32-bit two's complement integer format. The conversion is
1364 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1365 | Arithmetic, except that the conversion is always rounded toward zero.
1366 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1367 | the conversion overflows, the largest integer with the same sign as `a' is
1369 *----------------------------------------------------------------------------*/
1371 int32
float32_to_int32_round_to_zero( float32 a STATUS_PARAM
)
1374 int16 aExp
, shiftCount
;
1377 a
= float32_squash_input_denormal(a STATUS_VAR
);
1379 aSig
= extractFloat32Frac( a
);
1380 aExp
= extractFloat32Exp( a
);
1381 aSign
= extractFloat32Sign( a
);
1382 shiftCount
= aExp
- 0x9E;
1383 if ( 0 <= shiftCount
) {
1384 if ( float32_val(a
) != 0xCF000000 ) {
1385 float_raise( float_flag_invalid STATUS_VAR
);
1386 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) return 0x7FFFFFFF;
1388 return (int32_t) 0x80000000;
1390 else if ( aExp
<= 0x7E ) {
1391 if ( aExp
| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
1394 aSig
= ( aSig
| 0x00800000 )<<8;
1395 z
= aSig
>>( - shiftCount
);
1396 if ( (uint32_t) ( aSig
<<( shiftCount
& 31 ) ) ) {
1397 STATUS(float_exception_flags
) |= float_flag_inexact
;
1399 if ( aSign
) z
= - z
;
1404 /*----------------------------------------------------------------------------
1405 | Returns the result of converting the single-precision floating-point value
1406 | `a' to the 16-bit two's complement integer format. The conversion is
1407 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1408 | Arithmetic, except that the conversion is always rounded toward zero.
1409 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1410 | the conversion overflows, the largest integer with the same sign as `a' is
1412 *----------------------------------------------------------------------------*/
1414 int16
float32_to_int16_round_to_zero( float32 a STATUS_PARAM
)
1417 int16 aExp
, shiftCount
;
1421 aSig
= extractFloat32Frac( a
);
1422 aExp
= extractFloat32Exp( a
);
1423 aSign
= extractFloat32Sign( a
);
1424 shiftCount
= aExp
- 0x8E;
1425 if ( 0 <= shiftCount
) {
1426 if ( float32_val(a
) != 0xC7000000 ) {
1427 float_raise( float_flag_invalid STATUS_VAR
);
1428 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) {
1432 return (int32_t) 0xffff8000;
1434 else if ( aExp
<= 0x7E ) {
1435 if ( aExp
| aSig
) {
1436 STATUS(float_exception_flags
) |= float_flag_inexact
;
1441 aSig
= ( aSig
| 0x00800000 )<<8;
1442 z
= aSig
>>( - shiftCount
);
1443 if ( (uint32_t) ( aSig
<<( shiftCount
& 31 ) ) ) {
1444 STATUS(float_exception_flags
) |= float_flag_inexact
;
1453 /*----------------------------------------------------------------------------
1454 | Returns the result of converting the single-precision floating-point value
1455 | `a' to the 64-bit two's complement integer format. The conversion is
1456 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1457 | Arithmetic---which means in particular that the conversion is rounded
1458 | according to the current rounding mode. If `a' is a NaN, the largest
1459 | positive integer is returned. Otherwise, if the conversion overflows, the
1460 | largest integer with the same sign as `a' is returned.
1461 *----------------------------------------------------------------------------*/
1463 int64
float32_to_int64( float32 a STATUS_PARAM
)
1466 int16 aExp
, shiftCount
;
1468 uint64_t aSig64
, aSigExtra
;
1469 a
= float32_squash_input_denormal(a STATUS_VAR
);
1471 aSig
= extractFloat32Frac( a
);
1472 aExp
= extractFloat32Exp( a
);
1473 aSign
= extractFloat32Sign( a
);
1474 shiftCount
= 0xBE - aExp
;
1475 if ( shiftCount
< 0 ) {
1476 float_raise( float_flag_invalid STATUS_VAR
);
1477 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) {
1478 return LIT64( 0x7FFFFFFFFFFFFFFF );
1480 return (int64_t) LIT64( 0x8000000000000000 );
1482 if ( aExp
) aSig
|= 0x00800000;
1485 shift64ExtraRightJamming( aSig64
, 0, shiftCount
, &aSig64
, &aSigExtra
);
1486 return roundAndPackInt64( aSign
, aSig64
, aSigExtra STATUS_VAR
);
1490 /*----------------------------------------------------------------------------
1491 | Returns the result of converting the single-precision floating-point value
1492 | `a' to the 64-bit two's complement integer format. The conversion is
1493 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1494 | Arithmetic, except that the conversion is always rounded toward zero. If
1495 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1496 | conversion overflows, the largest integer with the same sign as `a' is
1498 *----------------------------------------------------------------------------*/
1500 int64
float32_to_int64_round_to_zero( float32 a STATUS_PARAM
)
1503 int16 aExp
, shiftCount
;
1507 a
= float32_squash_input_denormal(a STATUS_VAR
);
1509 aSig
= extractFloat32Frac( a
);
1510 aExp
= extractFloat32Exp( a
);
1511 aSign
= extractFloat32Sign( a
);
1512 shiftCount
= aExp
- 0xBE;
1513 if ( 0 <= shiftCount
) {
1514 if ( float32_val(a
) != 0xDF000000 ) {
1515 float_raise( float_flag_invalid STATUS_VAR
);
1516 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) {
1517 return LIT64( 0x7FFFFFFFFFFFFFFF );
1520 return (int64_t) LIT64( 0x8000000000000000 );
1522 else if ( aExp
<= 0x7E ) {
1523 if ( aExp
| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
1526 aSig64
= aSig
| 0x00800000;
1528 z
= aSig64
>>( - shiftCount
);
1529 if ( (uint64_t) ( aSig64
<<( shiftCount
& 63 ) ) ) {
1530 STATUS(float_exception_flags
) |= float_flag_inexact
;
1532 if ( aSign
) z
= - z
;
1537 /*----------------------------------------------------------------------------
1538 | Returns the result of converting the single-precision floating-point value
1539 | `a' to the double-precision floating-point format. The conversion is
1540 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1542 *----------------------------------------------------------------------------*/
1544 float64
float32_to_float64( float32 a STATUS_PARAM
)
1549 a
= float32_squash_input_denormal(a STATUS_VAR
);
1551 aSig
= extractFloat32Frac( a
);
1552 aExp
= extractFloat32Exp( a
);
1553 aSign
= extractFloat32Sign( a
);
1554 if ( aExp
== 0xFF ) {
1555 if ( aSig
) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
1556 return packFloat64( aSign
, 0x7FF, 0 );
1559 if ( aSig
== 0 ) return packFloat64( aSign
, 0, 0 );
1560 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1563 return packFloat64( aSign
, aExp
+ 0x380, ( (uint64_t) aSig
)<<29 );
1567 /*----------------------------------------------------------------------------
1568 | Returns the result of converting the single-precision floating-point value
1569 | `a' to the extended double-precision floating-point format. The conversion
1570 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1572 *----------------------------------------------------------------------------*/
1574 floatx80
float32_to_floatx80( float32 a STATUS_PARAM
)
1580 a
= float32_squash_input_denormal(a STATUS_VAR
);
1581 aSig
= extractFloat32Frac( a
);
1582 aExp
= extractFloat32Exp( a
);
1583 aSign
= extractFloat32Sign( a
);
1584 if ( aExp
== 0xFF ) {
1585 if ( aSig
) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
1586 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
1589 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
1590 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1593 return packFloatx80( aSign
, aExp
+ 0x3F80, ( (uint64_t) aSig
)<<40 );
1597 /*----------------------------------------------------------------------------
1598 | Returns the result of converting the single-precision floating-point value
1599 | `a' to the double-precision floating-point format. The conversion is
1600 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1602 *----------------------------------------------------------------------------*/
1604 float128
float32_to_float128( float32 a STATUS_PARAM
)
1610 a
= float32_squash_input_denormal(a STATUS_VAR
);
1611 aSig
= extractFloat32Frac( a
);
1612 aExp
= extractFloat32Exp( a
);
1613 aSign
= extractFloat32Sign( a
);
1614 if ( aExp
== 0xFF ) {
1615 if ( aSig
) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
1616 return packFloat128( aSign
, 0x7FFF, 0, 0 );
1619 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
1620 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1623 return packFloat128( aSign
, aExp
+ 0x3F80, ( (uint64_t) aSig
)<<25, 0 );
1627 /*----------------------------------------------------------------------------
1628 | Rounds the single-precision floating-point value `a' to an integer, and
1629 | returns the result as a single-precision floating-point value. The
1630 | operation is performed according to the IEC/IEEE Standard for Binary
1631 | Floating-Point Arithmetic.
1632 *----------------------------------------------------------------------------*/
1634 float32
float32_round_to_int( float32 a STATUS_PARAM
)
1638 uint32_t lastBitMask
, roundBitsMask
;
1641 a
= float32_squash_input_denormal(a STATUS_VAR
);
1643 aExp
= extractFloat32Exp( a
);
1644 if ( 0x96 <= aExp
) {
1645 if ( ( aExp
== 0xFF ) && extractFloat32Frac( a
) ) {
1646 return propagateFloat32NaN( a
, a STATUS_VAR
);
1650 if ( aExp
<= 0x7E ) {
1651 if ( (uint32_t) ( float32_val(a
)<<1 ) == 0 ) return a
;
1652 STATUS(float_exception_flags
) |= float_flag_inexact
;
1653 aSign
= extractFloat32Sign( a
);
1654 switch ( STATUS(float_rounding_mode
) ) {
1655 case float_round_nearest_even
:
1656 if ( ( aExp
== 0x7E ) && extractFloat32Frac( a
) ) {
1657 return packFloat32( aSign
, 0x7F, 0 );
1660 case float_round_down
:
1661 return make_float32(aSign
? 0xBF800000 : 0);
1662 case float_round_up
:
1663 return make_float32(aSign
? 0x80000000 : 0x3F800000);
1665 return packFloat32( aSign
, 0, 0 );
1668 lastBitMask
<<= 0x96 - aExp
;
1669 roundBitsMask
= lastBitMask
- 1;
1671 roundingMode
= STATUS(float_rounding_mode
);
1672 if ( roundingMode
== float_round_nearest_even
) {
1673 z
+= lastBitMask
>>1;
1674 if ( ( z
& roundBitsMask
) == 0 ) z
&= ~ lastBitMask
;
1676 else if ( roundingMode
!= float_round_to_zero
) {
1677 if ( extractFloat32Sign( make_float32(z
) ) ^ ( roundingMode
== float_round_up
) ) {
1681 z
&= ~ roundBitsMask
;
1682 if ( z
!= float32_val(a
) ) STATUS(float_exception_flags
) |= float_flag_inexact
;
1683 return make_float32(z
);
1687 /*----------------------------------------------------------------------------
1688 | Returns the result of adding the absolute values of the single-precision
1689 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1690 | before being returned. `zSign' is ignored if the result is a NaN.
1691 | The addition is performed according to the IEC/IEEE Standard for Binary
1692 | Floating-Point Arithmetic.
1693 *----------------------------------------------------------------------------*/
1695 static float32
addFloat32Sigs( float32 a
, float32 b
, flag zSign STATUS_PARAM
)
1697 int16 aExp
, bExp
, zExp
;
1698 uint32_t aSig
, bSig
, zSig
;
1701 aSig
= extractFloat32Frac( a
);
1702 aExp
= extractFloat32Exp( a
);
1703 bSig
= extractFloat32Frac( b
);
1704 bExp
= extractFloat32Exp( b
);
1705 expDiff
= aExp
- bExp
;
1708 if ( 0 < expDiff
) {
1709 if ( aExp
== 0xFF ) {
1710 if ( aSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1719 shift32RightJamming( bSig
, expDiff
, &bSig
);
1722 else if ( expDiff
< 0 ) {
1723 if ( bExp
== 0xFF ) {
1724 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1725 return packFloat32( zSign
, 0xFF, 0 );
1733 shift32RightJamming( aSig
, - expDiff
, &aSig
);
1737 if ( aExp
== 0xFF ) {
1738 if ( aSig
| bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1742 if (STATUS(flush_to_zero
)) {
1744 float_raise(float_flag_output_denormal STATUS_VAR
);
1746 return packFloat32(zSign
, 0, 0);
1748 return packFloat32( zSign
, 0, ( aSig
+ bSig
)>>6 );
1750 zSig
= 0x40000000 + aSig
+ bSig
;
1755 zSig
= ( aSig
+ bSig
)<<1;
1757 if ( (int32_t) zSig
< 0 ) {
1762 return roundAndPackFloat32( zSign
, zExp
, zSig STATUS_VAR
);
1766 /*----------------------------------------------------------------------------
1767 | Returns the result of subtracting the absolute values of the single-
1768 | precision floating-point values `a' and `b'. If `zSign' is 1, the
1769 | difference is negated before being returned. `zSign' is ignored if the
1770 | result is a NaN. The subtraction is performed according to the IEC/IEEE
1771 | Standard for Binary Floating-Point Arithmetic.
1772 *----------------------------------------------------------------------------*/
1774 static float32
subFloat32Sigs( float32 a
, float32 b
, flag zSign STATUS_PARAM
)
1776 int16 aExp
, bExp
, zExp
;
1777 uint32_t aSig
, bSig
, zSig
;
1780 aSig
= extractFloat32Frac( a
);
1781 aExp
= extractFloat32Exp( a
);
1782 bSig
= extractFloat32Frac( b
);
1783 bExp
= extractFloat32Exp( b
);
1784 expDiff
= aExp
- bExp
;
1787 if ( 0 < expDiff
) goto aExpBigger
;
1788 if ( expDiff
< 0 ) goto bExpBigger
;
1789 if ( aExp
== 0xFF ) {
1790 if ( aSig
| bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1791 float_raise( float_flag_invalid STATUS_VAR
);
1792 return float32_default_nan
;
1798 if ( bSig
< aSig
) goto aBigger
;
1799 if ( aSig
< bSig
) goto bBigger
;
1800 return packFloat32( STATUS(float_rounding_mode
) == float_round_down
, 0, 0 );
1802 if ( bExp
== 0xFF ) {
1803 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1804 return packFloat32( zSign
^ 1, 0xFF, 0 );
1812 shift32RightJamming( aSig
, - expDiff
, &aSig
);
1818 goto normalizeRoundAndPack
;
1820 if ( aExp
== 0xFF ) {
1821 if ( aSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1830 shift32RightJamming( bSig
, expDiff
, &bSig
);
1835 normalizeRoundAndPack
:
1837 return normalizeRoundAndPackFloat32( zSign
, zExp
, zSig STATUS_VAR
);
1841 /*----------------------------------------------------------------------------
1842 | Returns the result of adding the single-precision floating-point values `a'
1843 | and `b'. The operation is performed according to the IEC/IEEE Standard for
1844 | Binary Floating-Point Arithmetic.
1845 *----------------------------------------------------------------------------*/
1847 float32
float32_add( float32 a
, float32 b STATUS_PARAM
)
1850 a
= float32_squash_input_denormal(a STATUS_VAR
);
1851 b
= float32_squash_input_denormal(b STATUS_VAR
);
1853 aSign
= extractFloat32Sign( a
);
1854 bSign
= extractFloat32Sign( b
);
1855 if ( aSign
== bSign
) {
1856 return addFloat32Sigs( a
, b
, aSign STATUS_VAR
);
1859 return subFloat32Sigs( a
, b
, aSign STATUS_VAR
);
1864 /*----------------------------------------------------------------------------
1865 | Returns the result of subtracting the single-precision floating-point values
1866 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1867 | for Binary Floating-Point Arithmetic.
1868 *----------------------------------------------------------------------------*/
1870 float32
float32_sub( float32 a
, float32 b STATUS_PARAM
)
1873 a
= float32_squash_input_denormal(a STATUS_VAR
);
1874 b
= float32_squash_input_denormal(b STATUS_VAR
);
1876 aSign
= extractFloat32Sign( a
);
1877 bSign
= extractFloat32Sign( b
);
1878 if ( aSign
== bSign
) {
1879 return subFloat32Sigs( a
, b
, aSign STATUS_VAR
);
1882 return addFloat32Sigs( a
, b
, aSign STATUS_VAR
);
1887 /*----------------------------------------------------------------------------
1888 | Returns the result of multiplying the single-precision floating-point values
1889 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1890 | for Binary Floating-Point Arithmetic.
1891 *----------------------------------------------------------------------------*/
1893 float32
float32_mul( float32 a
, float32 b STATUS_PARAM
)
1895 flag aSign
, bSign
, zSign
;
1896 int16 aExp
, bExp
, zExp
;
1897 uint32_t aSig
, bSig
;
1901 a
= float32_squash_input_denormal(a STATUS_VAR
);
1902 b
= float32_squash_input_denormal(b STATUS_VAR
);
1904 aSig
= extractFloat32Frac( a
);
1905 aExp
= extractFloat32Exp( a
);
1906 aSign
= extractFloat32Sign( a
);
1907 bSig
= extractFloat32Frac( b
);
1908 bExp
= extractFloat32Exp( b
);
1909 bSign
= extractFloat32Sign( b
);
1910 zSign
= aSign
^ bSign
;
1911 if ( aExp
== 0xFF ) {
1912 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
1913 return propagateFloat32NaN( a
, b STATUS_VAR
);
1915 if ( ( bExp
| bSig
) == 0 ) {
1916 float_raise( float_flag_invalid STATUS_VAR
);
1917 return float32_default_nan
;
1919 return packFloat32( zSign
, 0xFF, 0 );
1921 if ( bExp
== 0xFF ) {
1922 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1923 if ( ( aExp
| aSig
) == 0 ) {
1924 float_raise( float_flag_invalid STATUS_VAR
);
1925 return float32_default_nan
;
1927 return packFloat32( zSign
, 0xFF, 0 );
1930 if ( aSig
== 0 ) return packFloat32( zSign
, 0, 0 );
1931 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1934 if ( bSig
== 0 ) return packFloat32( zSign
, 0, 0 );
1935 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
1937 zExp
= aExp
+ bExp
- 0x7F;
1938 aSig
= ( aSig
| 0x00800000 )<<7;
1939 bSig
= ( bSig
| 0x00800000 )<<8;
1940 shift64RightJamming( ( (uint64_t) aSig
) * bSig
, 32, &zSig64
);
1942 if ( 0 <= (int32_t) ( zSig
<<1 ) ) {
1946 return roundAndPackFloat32( zSign
, zExp
, zSig STATUS_VAR
);
1950 /*----------------------------------------------------------------------------
1951 | Returns the result of dividing the single-precision floating-point value `a'
1952 | by the corresponding value `b'. The operation is performed according to the
1953 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1954 *----------------------------------------------------------------------------*/
1956 float32
float32_div( float32 a
, float32 b STATUS_PARAM
)
1958 flag aSign
, bSign
, zSign
;
1959 int16 aExp
, bExp
, zExp
;
1960 uint32_t aSig
, bSig
, zSig
;
1961 a
= float32_squash_input_denormal(a STATUS_VAR
);
1962 b
= float32_squash_input_denormal(b STATUS_VAR
);
1964 aSig
= extractFloat32Frac( a
);
1965 aExp
= extractFloat32Exp( a
);
1966 aSign
= extractFloat32Sign( a
);
1967 bSig
= extractFloat32Frac( b
);
1968 bExp
= extractFloat32Exp( b
);
1969 bSign
= extractFloat32Sign( b
);
1970 zSign
= aSign
^ bSign
;
1971 if ( aExp
== 0xFF ) {
1972 if ( aSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1973 if ( bExp
== 0xFF ) {
1974 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1975 float_raise( float_flag_invalid STATUS_VAR
);
1976 return float32_default_nan
;
1978 return packFloat32( zSign
, 0xFF, 0 );
1980 if ( bExp
== 0xFF ) {
1981 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1982 return packFloat32( zSign
, 0, 0 );
1986 if ( ( aExp
| aSig
) == 0 ) {
1987 float_raise( float_flag_invalid STATUS_VAR
);
1988 return float32_default_nan
;
1990 float_raise( float_flag_divbyzero STATUS_VAR
);
1991 return packFloat32( zSign
, 0xFF, 0 );
1993 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
1996 if ( aSig
== 0 ) return packFloat32( zSign
, 0, 0 );
1997 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1999 zExp
= aExp
- bExp
+ 0x7D;
2000 aSig
= ( aSig
| 0x00800000 )<<7;
2001 bSig
= ( bSig
| 0x00800000 )<<8;
2002 if ( bSig
<= ( aSig
+ aSig
) ) {
2006 zSig
= ( ( (uint64_t) aSig
)<<32 ) / bSig
;
2007 if ( ( zSig
& 0x3F ) == 0 ) {
2008 zSig
|= ( (uint64_t) bSig
* zSig
!= ( (uint64_t) aSig
)<<32 );
2010 return roundAndPackFloat32( zSign
, zExp
, zSig STATUS_VAR
);
2014 /*----------------------------------------------------------------------------
2015 | Returns the remainder of the single-precision floating-point value `a'
2016 | with respect to the corresponding value `b'. The operation is performed
2017 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2018 *----------------------------------------------------------------------------*/
2020 float32
float32_rem( float32 a
, float32 b STATUS_PARAM
)
2023 int16 aExp
, bExp
, expDiff
;
2024 uint32_t aSig
, bSig
;
2026 uint64_t aSig64
, bSig64
, q64
;
2027 uint32_t alternateASig
;
2029 a
= float32_squash_input_denormal(a STATUS_VAR
);
2030 b
= float32_squash_input_denormal(b STATUS_VAR
);
2032 aSig
= extractFloat32Frac( a
);
2033 aExp
= extractFloat32Exp( a
);
2034 aSign
= extractFloat32Sign( a
);
2035 bSig
= extractFloat32Frac( b
);
2036 bExp
= extractFloat32Exp( b
);
2037 if ( aExp
== 0xFF ) {
2038 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
2039 return propagateFloat32NaN( a
, b STATUS_VAR
);
2041 float_raise( float_flag_invalid STATUS_VAR
);
2042 return float32_default_nan
;
2044 if ( bExp
== 0xFF ) {
2045 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
2050 float_raise( float_flag_invalid STATUS_VAR
);
2051 return float32_default_nan
;
2053 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
2056 if ( aSig
== 0 ) return a
;
2057 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
2059 expDiff
= aExp
- bExp
;
2062 if ( expDiff
< 32 ) {
2065 if ( expDiff
< 0 ) {
2066 if ( expDiff
< -1 ) return a
;
2069 q
= ( bSig
<= aSig
);
2070 if ( q
) aSig
-= bSig
;
2071 if ( 0 < expDiff
) {
2072 q
= ( ( (uint64_t) aSig
)<<32 ) / bSig
;
2075 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
2083 if ( bSig
<= aSig
) aSig
-= bSig
;
2084 aSig64
= ( (uint64_t) aSig
)<<40;
2085 bSig64
= ( (uint64_t) bSig
)<<40;
2087 while ( 0 < expDiff
) {
2088 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
2089 q64
= ( 2 < q64
) ? q64
- 2 : 0;
2090 aSig64
= - ( ( bSig
* q64
)<<38 );
2094 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
2095 q64
= ( 2 < q64
) ? q64
- 2 : 0;
2096 q
= q64
>>( 64 - expDiff
);
2098 aSig
= ( ( aSig64
>>33 )<<( expDiff
- 1 ) ) - bSig
* q
;
2101 alternateASig
= aSig
;
2104 } while ( 0 <= (int32_t) aSig
);
2105 sigMean
= aSig
+ alternateASig
;
2106 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
2107 aSig
= alternateASig
;
2109 zSign
= ( (int32_t) aSig
< 0 );
2110 if ( zSign
) aSig
= - aSig
;
2111 return normalizeRoundAndPackFloat32( aSign
^ zSign
, bExp
, aSig STATUS_VAR
);
2115 /*----------------------------------------------------------------------------
2116 | Returns the square root of the single-precision floating-point value `a'.
2117 | The operation is performed according to the IEC/IEEE Standard for Binary
2118 | Floating-Point Arithmetic.
2119 *----------------------------------------------------------------------------*/
2121 float32
float32_sqrt( float32 a STATUS_PARAM
)
2125 uint32_t aSig
, zSig
;
2127 a
= float32_squash_input_denormal(a STATUS_VAR
);
2129 aSig
= extractFloat32Frac( a
);
2130 aExp
= extractFloat32Exp( a
);
2131 aSign
= extractFloat32Sign( a
);
2132 if ( aExp
== 0xFF ) {
2133 if ( aSig
) return propagateFloat32NaN( a
, float32_zero STATUS_VAR
);
2134 if ( ! aSign
) return a
;
2135 float_raise( float_flag_invalid STATUS_VAR
);
2136 return float32_default_nan
;
2139 if ( ( aExp
| aSig
) == 0 ) return a
;
2140 float_raise( float_flag_invalid STATUS_VAR
);
2141 return float32_default_nan
;
2144 if ( aSig
== 0 ) return float32_zero
;
2145 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
2147 zExp
= ( ( aExp
- 0x7F )>>1 ) + 0x7E;
2148 aSig
= ( aSig
| 0x00800000 )<<8;
2149 zSig
= estimateSqrt32( aExp
, aSig
) + 2;
2150 if ( ( zSig
& 0x7F ) <= 5 ) {
2156 term
= ( (uint64_t) zSig
) * zSig
;
2157 rem
= ( ( (uint64_t) aSig
)<<32 ) - term
;
2158 while ( (int64_t) rem
< 0 ) {
2160 rem
+= ( ( (uint64_t) zSig
)<<1 ) | 1;
2162 zSig
|= ( rem
!= 0 );
2164 shift32RightJamming( zSig
, 1, &zSig
);
2166 return roundAndPackFloat32( 0, zExp
, zSig STATUS_VAR
);
2170 /*----------------------------------------------------------------------------
2171 | Returns the binary exponential of the single-precision floating-point value
2172 | `a'. The operation is performed according to the IEC/IEEE Standard for
2173 | Binary Floating-Point Arithmetic.
2175 | Uses the following identities:
2177 | 1. -------------------------------------------------------------------------
2181 | 2. -------------------------------------------------------------------------
2184 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2186 *----------------------------------------------------------------------------*/
2188 static const float64 float32_exp2_coefficients
[15] =
2190 const_float64( 0x3ff0000000000000ll
), /* 1 */
2191 const_float64( 0x3fe0000000000000ll
), /* 2 */
2192 const_float64( 0x3fc5555555555555ll
), /* 3 */
2193 const_float64( 0x3fa5555555555555ll
), /* 4 */
2194 const_float64( 0x3f81111111111111ll
), /* 5 */
2195 const_float64( 0x3f56c16c16c16c17ll
), /* 6 */
2196 const_float64( 0x3f2a01a01a01a01all
), /* 7 */
2197 const_float64( 0x3efa01a01a01a01all
), /* 8 */
2198 const_float64( 0x3ec71de3a556c734ll
), /* 9 */
2199 const_float64( 0x3e927e4fb7789f5cll
), /* 10 */
2200 const_float64( 0x3e5ae64567f544e4ll
), /* 11 */
2201 const_float64( 0x3e21eed8eff8d898ll
), /* 12 */
2202 const_float64( 0x3de6124613a86d09ll
), /* 13 */
2203 const_float64( 0x3da93974a8c07c9dll
), /* 14 */
2204 const_float64( 0x3d6ae7f3e733b81fll
), /* 15 */
2207 float32
float32_exp2( float32 a STATUS_PARAM
)
2214 a
= float32_squash_input_denormal(a STATUS_VAR
);
2216 aSig
= extractFloat32Frac( a
);
2217 aExp
= extractFloat32Exp( a
);
2218 aSign
= extractFloat32Sign( a
);
2220 if ( aExp
== 0xFF) {
2221 if ( aSig
) return propagateFloat32NaN( a
, float32_zero STATUS_VAR
);
2222 return (aSign
) ? float32_zero
: a
;
2225 if (aSig
== 0) return float32_one
;
2228 float_raise( float_flag_inexact STATUS_VAR
);
2230 /* ******************************* */
2231 /* using float64 for approximation */
2232 /* ******************************* */
2233 x
= float32_to_float64(a STATUS_VAR
);
2234 x
= float64_mul(x
, float64_ln2 STATUS_VAR
);
2238 for (i
= 0 ; i
< 15 ; i
++) {
2241 f
= float64_mul(xn
, float32_exp2_coefficients
[i
] STATUS_VAR
);
2242 r
= float64_add(r
, f STATUS_VAR
);
2244 xn
= float64_mul(xn
, x STATUS_VAR
);
2247 return float64_to_float32(r
, status
);
2250 /*----------------------------------------------------------------------------
2251 | Returns the binary log of the single-precision floating-point value `a'.
2252 | The operation is performed according to the IEC/IEEE Standard for Binary
2253 | Floating-Point Arithmetic.
2254 *----------------------------------------------------------------------------*/
2255 float32
float32_log2( float32 a STATUS_PARAM
)
2259 uint32_t aSig
, zSig
, i
;
2261 a
= float32_squash_input_denormal(a STATUS_VAR
);
2262 aSig
= extractFloat32Frac( a
);
2263 aExp
= extractFloat32Exp( a
);
2264 aSign
= extractFloat32Sign( a
);
2267 if ( aSig
== 0 ) return packFloat32( 1, 0xFF, 0 );
2268 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
2271 float_raise( float_flag_invalid STATUS_VAR
);
2272 return float32_default_nan
;
2274 if ( aExp
== 0xFF ) {
2275 if ( aSig
) return propagateFloat32NaN( a
, float32_zero STATUS_VAR
);
2284 for (i
= 1 << 22; i
> 0; i
>>= 1) {
2285 aSig
= ( (uint64_t)aSig
* aSig
) >> 23;
2286 if ( aSig
& 0x01000000 ) {
2295 return normalizeRoundAndPackFloat32( zSign
, 0x85, zSig STATUS_VAR
);
2298 /*----------------------------------------------------------------------------
2299 | Returns 1 if the single-precision floating-point value `a' is equal to
2300 | the corresponding value `b', and 0 otherwise. The invalid exception is
2301 | raised if either operand is a NaN. Otherwise, the comparison is performed
2302 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2303 *----------------------------------------------------------------------------*/
2305 int float32_eq( float32 a
, float32 b STATUS_PARAM
)
2308 a
= float32_squash_input_denormal(a STATUS_VAR
);
2309 b
= float32_squash_input_denormal(b STATUS_VAR
);
2311 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2312 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2314 float_raise( float_flag_invalid STATUS_VAR
);
2317 av
= float32_val(a
);
2318 bv
= float32_val(b
);
2319 return ( av
== bv
) || ( (uint32_t) ( ( av
| bv
)<<1 ) == 0 );
2322 /*----------------------------------------------------------------------------
2323 | Returns 1 if the single-precision floating-point value `a' is less than
2324 | or equal to the corresponding value `b', and 0 otherwise. The invalid
2325 | exception is raised if either operand is a NaN. The comparison is performed
2326 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2327 *----------------------------------------------------------------------------*/
2329 int float32_le( float32 a
, float32 b STATUS_PARAM
)
2333 a
= float32_squash_input_denormal(a STATUS_VAR
);
2334 b
= float32_squash_input_denormal(b STATUS_VAR
);
2336 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2337 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2339 float_raise( float_flag_invalid STATUS_VAR
);
2342 aSign
= extractFloat32Sign( a
);
2343 bSign
= extractFloat32Sign( b
);
2344 av
= float32_val(a
);
2345 bv
= float32_val(b
);
2346 if ( aSign
!= bSign
) return aSign
|| ( (uint32_t) ( ( av
| bv
)<<1 ) == 0 );
2347 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
2351 /*----------------------------------------------------------------------------
2352 | Returns 1 if the single-precision floating-point value `a' is less than
2353 | the corresponding value `b', and 0 otherwise. The invalid exception is
2354 | raised if either operand is a NaN. The comparison is performed according
2355 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2356 *----------------------------------------------------------------------------*/
2358 int float32_lt( float32 a
, float32 b STATUS_PARAM
)
2362 a
= float32_squash_input_denormal(a STATUS_VAR
);
2363 b
= float32_squash_input_denormal(b STATUS_VAR
);
2365 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2366 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2368 float_raise( float_flag_invalid STATUS_VAR
);
2371 aSign
= extractFloat32Sign( a
);
2372 bSign
= extractFloat32Sign( b
);
2373 av
= float32_val(a
);
2374 bv
= float32_val(b
);
2375 if ( aSign
!= bSign
) return aSign
&& ( (uint32_t) ( ( av
| bv
)<<1 ) != 0 );
2376 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
2380 /*----------------------------------------------------------------------------
2381 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2382 | be compared, and 0 otherwise. The invalid exception is raised if either
2383 | operand is a NaN. The comparison is performed according to the IEC/IEEE
2384 | Standard for Binary Floating-Point Arithmetic.
2385 *----------------------------------------------------------------------------*/
2387 int float32_unordered( float32 a
, float32 b STATUS_PARAM
)
2389 a
= float32_squash_input_denormal(a STATUS_VAR
);
2390 b
= float32_squash_input_denormal(b STATUS_VAR
);
2392 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2393 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2395 float_raise( float_flag_invalid STATUS_VAR
);
2401 /*----------------------------------------------------------------------------
2402 | Returns 1 if the single-precision floating-point value `a' is equal to
2403 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2404 | exception. The comparison is performed according to the IEC/IEEE Standard
2405 | for Binary Floating-Point Arithmetic.
2406 *----------------------------------------------------------------------------*/
2408 int float32_eq_quiet( float32 a
, float32 b STATUS_PARAM
)
2410 a
= float32_squash_input_denormal(a STATUS_VAR
);
2411 b
= float32_squash_input_denormal(b STATUS_VAR
);
2413 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2414 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2416 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
2417 float_raise( float_flag_invalid STATUS_VAR
);
2421 return ( float32_val(a
) == float32_val(b
) ) ||
2422 ( (uint32_t) ( ( float32_val(a
) | float32_val(b
) )<<1 ) == 0 );
2425 /*----------------------------------------------------------------------------
2426 | Returns 1 if the single-precision floating-point value `a' is less than or
2427 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2428 | cause an exception. Otherwise, the comparison is performed according to the
2429 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2430 *----------------------------------------------------------------------------*/
2432 int float32_le_quiet( float32 a
, float32 b STATUS_PARAM
)
2436 a
= float32_squash_input_denormal(a STATUS_VAR
);
2437 b
= float32_squash_input_denormal(b STATUS_VAR
);
2439 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2440 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2442 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
2443 float_raise( float_flag_invalid STATUS_VAR
);
2447 aSign
= extractFloat32Sign( a
);
2448 bSign
= extractFloat32Sign( b
);
2449 av
= float32_val(a
);
2450 bv
= float32_val(b
);
2451 if ( aSign
!= bSign
) return aSign
|| ( (uint32_t) ( ( av
| bv
)<<1 ) == 0 );
2452 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
2456 /*----------------------------------------------------------------------------
2457 | Returns 1 if the single-precision floating-point value `a' is less than
2458 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2459 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
2460 | Standard for Binary Floating-Point Arithmetic.
2461 *----------------------------------------------------------------------------*/
2463 int float32_lt_quiet( float32 a
, float32 b STATUS_PARAM
)
2467 a
= float32_squash_input_denormal(a STATUS_VAR
);
2468 b
= float32_squash_input_denormal(b STATUS_VAR
);
2470 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2471 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2473 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
2474 float_raise( float_flag_invalid STATUS_VAR
);
2478 aSign
= extractFloat32Sign( a
);
2479 bSign
= extractFloat32Sign( b
);
2480 av
= float32_val(a
);
2481 bv
= float32_val(b
);
2482 if ( aSign
!= bSign
) return aSign
&& ( (uint32_t) ( ( av
| bv
)<<1 ) != 0 );
2483 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
2487 /*----------------------------------------------------------------------------
2488 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2489 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
2490 | comparison is performed according to the IEC/IEEE Standard for Binary
2491 | Floating-Point Arithmetic.
2492 *----------------------------------------------------------------------------*/
2494 int float32_unordered_quiet( float32 a
, float32 b STATUS_PARAM
)
2496 a
= float32_squash_input_denormal(a STATUS_VAR
);
2497 b
= float32_squash_input_denormal(b STATUS_VAR
);
2499 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2500 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2502 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
2503 float_raise( float_flag_invalid STATUS_VAR
);
2510 /*----------------------------------------------------------------------------
2511 | Returns the result of converting the double-precision floating-point value
2512 | `a' to the 32-bit two's complement integer format. The conversion is
2513 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2514 | Arithmetic---which means in particular that the conversion is rounded
2515 | according to the current rounding mode. If `a' is a NaN, the largest
2516 | positive integer is returned. Otherwise, if the conversion overflows, the
2517 | largest integer with the same sign as `a' is returned.
2518 *----------------------------------------------------------------------------*/
2520 int32
float64_to_int32( float64 a STATUS_PARAM
)
2523 int16 aExp
, shiftCount
;
2525 a
= float64_squash_input_denormal(a STATUS_VAR
);
2527 aSig
= extractFloat64Frac( a
);
2528 aExp
= extractFloat64Exp( a
);
2529 aSign
= extractFloat64Sign( a
);
2530 if ( ( aExp
== 0x7FF ) && aSig
) aSign
= 0;
2531 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
2532 shiftCount
= 0x42C - aExp
;
2533 if ( 0 < shiftCount
) shift64RightJamming( aSig
, shiftCount
, &aSig
);
2534 return roundAndPackInt32( aSign
, aSig STATUS_VAR
);
2538 /*----------------------------------------------------------------------------
2539 | Returns the result of converting the double-precision floating-point value
2540 | `a' to the 32-bit two's complement integer format. The conversion is
2541 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2542 | Arithmetic, except that the conversion is always rounded toward zero.
2543 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2544 | the conversion overflows, the largest integer with the same sign as `a' is
2546 *----------------------------------------------------------------------------*/
2548 int32
float64_to_int32_round_to_zero( float64 a STATUS_PARAM
)
2551 int16 aExp
, shiftCount
;
2552 uint64_t aSig
, savedASig
;
2554 a
= float64_squash_input_denormal(a STATUS_VAR
);
2556 aSig
= extractFloat64Frac( a
);
2557 aExp
= extractFloat64Exp( a
);
2558 aSign
= extractFloat64Sign( a
);
2559 if ( 0x41E < aExp
) {
2560 if ( ( aExp
== 0x7FF ) && aSig
) aSign
= 0;
2563 else if ( aExp
< 0x3FF ) {
2564 if ( aExp
|| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
2567 aSig
|= LIT64( 0x0010000000000000 );
2568 shiftCount
= 0x433 - aExp
;
2570 aSig
>>= shiftCount
;
2572 if ( aSign
) z
= - z
;
2573 if ( ( z
< 0 ) ^ aSign
) {
2575 float_raise( float_flag_invalid STATUS_VAR
);
2576 return aSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
2578 if ( ( aSig
<<shiftCount
) != savedASig
) {
2579 STATUS(float_exception_flags
) |= float_flag_inexact
;
2585 /*----------------------------------------------------------------------------
2586 | Returns the result of converting the double-precision floating-point value
2587 | `a' to the 16-bit two's complement integer format. The conversion is
2588 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2589 | Arithmetic, except that the conversion is always rounded toward zero.
2590 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2591 | the conversion overflows, the largest integer with the same sign as `a' is
2593 *----------------------------------------------------------------------------*/
2595 int16
float64_to_int16_round_to_zero( float64 a STATUS_PARAM
)
2598 int16 aExp
, shiftCount
;
2599 uint64_t aSig
, savedASig
;
2602 aSig
= extractFloat64Frac( a
);
2603 aExp
= extractFloat64Exp( a
);
2604 aSign
= extractFloat64Sign( a
);
2605 if ( 0x40E < aExp
) {
2606 if ( ( aExp
== 0x7FF ) && aSig
) {
2611 else if ( aExp
< 0x3FF ) {
2612 if ( aExp
|| aSig
) {
2613 STATUS(float_exception_flags
) |= float_flag_inexact
;
2617 aSig
|= LIT64( 0x0010000000000000 );
2618 shiftCount
= 0x433 - aExp
;
2620 aSig
>>= shiftCount
;
2625 if ( ( (int16_t)z
< 0 ) ^ aSign
) {
2627 float_raise( float_flag_invalid STATUS_VAR
);
2628 return aSign
? (int32_t) 0xffff8000 : 0x7FFF;
2630 if ( ( aSig
<<shiftCount
) != savedASig
) {
2631 STATUS(float_exception_flags
) |= float_flag_inexact
;
2636 /*----------------------------------------------------------------------------
2637 | Returns the result of converting the double-precision floating-point value
2638 | `a' to the 64-bit two's complement integer format. The conversion is
2639 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2640 | Arithmetic---which means in particular that the conversion is rounded
2641 | according to the current rounding mode. If `a' is a NaN, the largest
2642 | positive integer is returned. Otherwise, if the conversion overflows, the
2643 | largest integer with the same sign as `a' is returned.
2644 *----------------------------------------------------------------------------*/
2646 int64
float64_to_int64( float64 a STATUS_PARAM
)
2649 int16 aExp
, shiftCount
;
2650 uint64_t aSig
, aSigExtra
;
2651 a
= float64_squash_input_denormal(a STATUS_VAR
);
2653 aSig
= extractFloat64Frac( a
);
2654 aExp
= extractFloat64Exp( a
);
2655 aSign
= extractFloat64Sign( a
);
2656 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
2657 shiftCount
= 0x433 - aExp
;
2658 if ( shiftCount
<= 0 ) {
2659 if ( 0x43E < aExp
) {
2660 float_raise( float_flag_invalid STATUS_VAR
);
2662 || ( ( aExp
== 0x7FF )
2663 && ( aSig
!= LIT64( 0x0010000000000000 ) ) )
2665 return LIT64( 0x7FFFFFFFFFFFFFFF );
2667 return (int64_t) LIT64( 0x8000000000000000 );
2670 aSig
<<= - shiftCount
;
2673 shift64ExtraRightJamming( aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
2675 return roundAndPackInt64( aSign
, aSig
, aSigExtra STATUS_VAR
);
2679 /*----------------------------------------------------------------------------
2680 | Returns the result of converting the double-precision floating-point value
2681 | `a' to the 64-bit two's complement integer format. The conversion is
2682 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2683 | Arithmetic, except that the conversion is always rounded toward zero.
2684 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2685 | the conversion overflows, the largest integer with the same sign as `a' is
2687 *----------------------------------------------------------------------------*/
2689 int64
float64_to_int64_round_to_zero( float64 a STATUS_PARAM
)
2692 int16 aExp
, shiftCount
;
2695 a
= float64_squash_input_denormal(a STATUS_VAR
);
2697 aSig
= extractFloat64Frac( a
);
2698 aExp
= extractFloat64Exp( a
);
2699 aSign
= extractFloat64Sign( a
);
2700 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
2701 shiftCount
= aExp
- 0x433;
2702 if ( 0 <= shiftCount
) {
2703 if ( 0x43E <= aExp
) {
2704 if ( float64_val(a
) != LIT64( 0xC3E0000000000000 ) ) {
2705 float_raise( float_flag_invalid STATUS_VAR
);
2707 || ( ( aExp
== 0x7FF )
2708 && ( aSig
!= LIT64( 0x0010000000000000 ) ) )
2710 return LIT64( 0x7FFFFFFFFFFFFFFF );
2713 return (int64_t) LIT64( 0x8000000000000000 );
2715 z
= aSig
<<shiftCount
;
2718 if ( aExp
< 0x3FE ) {
2719 if ( aExp
| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
2722 z
= aSig
>>( - shiftCount
);
2723 if ( (uint64_t) ( aSig
<<( shiftCount
& 63 ) ) ) {
2724 STATUS(float_exception_flags
) |= float_flag_inexact
;
2727 if ( aSign
) z
= - z
;
2732 /*----------------------------------------------------------------------------
2733 | Returns the result of converting the double-precision floating-point value
2734 | `a' to the single-precision floating-point format. The conversion is
2735 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2737 *----------------------------------------------------------------------------*/
2739 float32
float64_to_float32( float64 a STATUS_PARAM
)
2745 a
= float64_squash_input_denormal(a STATUS_VAR
);
2747 aSig
= extractFloat64Frac( a
);
2748 aExp
= extractFloat64Exp( a
);
2749 aSign
= extractFloat64Sign( a
);
2750 if ( aExp
== 0x7FF ) {
2751 if ( aSig
) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
2752 return packFloat32( aSign
, 0xFF, 0 );
2754 shift64RightJamming( aSig
, 22, &aSig
);
2756 if ( aExp
|| zSig
) {
2760 return roundAndPackFloat32( aSign
, aExp
, zSig STATUS_VAR
);
2765 /*----------------------------------------------------------------------------
2766 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2767 | half-precision floating-point value, returning the result. After being
2768 | shifted into the proper positions, the three fields are simply added
2769 | together to form the result. This means that any integer portion of `zSig'
2770 | will be added into the exponent. Since a properly normalized significand
2771 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2772 | than the desired result exponent whenever `zSig' is a complete, normalized
2774 *----------------------------------------------------------------------------*/
2775 static float16
packFloat16(flag zSign
, int16 zExp
, uint16_t zSig
)
2777 return make_float16(
2778 (((uint32_t)zSign
) << 15) + (((uint32_t)zExp
) << 10) + zSig
);
2781 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
2782 The latter gains extra exponent range by omitting the NaN/Inf encodings. */
2784 float32
float16_to_float32(float16 a
, flag ieee STATUS_PARAM
)
2790 aSign
= extractFloat16Sign(a
);
2791 aExp
= extractFloat16Exp(a
);
2792 aSig
= extractFloat16Frac(a
);
2794 if (aExp
== 0x1f && ieee
) {
2796 return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR
) STATUS_VAR
);
2798 return packFloat32(aSign
, 0xff, aSig
<< 13);
2804 return packFloat32(aSign
, 0, 0);
2807 shiftCount
= countLeadingZeros32( aSig
) - 21;
2808 aSig
= aSig
<< shiftCount
;
2811 return packFloat32( aSign
, aExp
+ 0x70, aSig
<< 13);
2814 float16
float32_to_float16(float32 a
, flag ieee STATUS_PARAM
)
2822 a
= float32_squash_input_denormal(a STATUS_VAR
);
2824 aSig
= extractFloat32Frac( a
);
2825 aExp
= extractFloat32Exp( a
);
2826 aSign
= extractFloat32Sign( a
);
2827 if ( aExp
== 0xFF ) {
2829 /* Input is a NaN */
2830 float16 r
= commonNaNToFloat16( float32ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
2832 return packFloat16(aSign
, 0, 0);
2838 float_raise(float_flag_invalid STATUS_VAR
);
2839 return packFloat16(aSign
, 0x1f, 0x3ff);
2841 return packFloat16(aSign
, 0x1f, 0);
2843 if (aExp
== 0 && aSig
== 0) {
2844 return packFloat16(aSign
, 0, 0);
2846 /* Decimal point between bits 22 and 23. */
2858 float_raise( float_flag_underflow STATUS_VAR
);
2859 roundingMode
= STATUS(float_rounding_mode
);
2860 switch (roundingMode
) {
2861 case float_round_nearest_even
:
2862 increment
= (mask
+ 1) >> 1;
2863 if ((aSig
& mask
) == increment
) {
2864 increment
= aSig
& (increment
<< 1);
2867 case float_round_up
:
2868 increment
= aSign
? 0 : mask
;
2870 case float_round_down
:
2871 increment
= aSign
? mask
: 0;
2873 default: /* round_to_zero */
2878 if (aSig
>= 0x01000000) {
2882 } else if (aExp
< -14
2883 && STATUS(float_detect_tininess
) == float_tininess_before_rounding
) {
2884 float_raise( float_flag_underflow STATUS_VAR
);
2889 float_raise( float_flag_overflow
| float_flag_inexact STATUS_VAR
);
2890 return packFloat16(aSign
, 0x1f, 0);
2894 float_raise(float_flag_invalid
| float_flag_inexact STATUS_VAR
);
2895 return packFloat16(aSign
, 0x1f, 0x3ff);
2899 return packFloat16(aSign
, 0, 0);
2902 aSig
>>= -14 - aExp
;
2905 return packFloat16(aSign
, aExp
+ 14, aSig
>> 13);
2908 /*----------------------------------------------------------------------------
2909 | Returns the result of converting the double-precision floating-point value
2910 | `a' to the extended double-precision floating-point format. The conversion
2911 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2913 *----------------------------------------------------------------------------*/
2915 floatx80
float64_to_floatx80( float64 a STATUS_PARAM
)
2921 a
= float64_squash_input_denormal(a STATUS_VAR
);
2922 aSig
= extractFloat64Frac( a
);
2923 aExp
= extractFloat64Exp( a
);
2924 aSign
= extractFloat64Sign( a
);
2925 if ( aExp
== 0x7FF ) {
2926 if ( aSig
) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
2927 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
2930 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
2931 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2935 aSign
, aExp
+ 0x3C00, ( aSig
| LIT64( 0x0010000000000000 ) )<<11 );
2939 /*----------------------------------------------------------------------------
2940 | Returns the result of converting the double-precision floating-point value
2941 | `a' to the quadruple-precision floating-point format. The conversion is
2942 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2944 *----------------------------------------------------------------------------*/
2946 float128
float64_to_float128( float64 a STATUS_PARAM
)
2950 uint64_t aSig
, zSig0
, zSig1
;
2952 a
= float64_squash_input_denormal(a STATUS_VAR
);
2953 aSig
= extractFloat64Frac( a
);
2954 aExp
= extractFloat64Exp( a
);
2955 aSign
= extractFloat64Sign( a
);
2956 if ( aExp
== 0x7FF ) {
2957 if ( aSig
) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
2958 return packFloat128( aSign
, 0x7FFF, 0, 0 );
2961 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
2962 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2965 shift128Right( aSig
, 0, 4, &zSig0
, &zSig1
);
2966 return packFloat128( aSign
, aExp
+ 0x3C00, zSig0
, zSig1
);
2970 /*----------------------------------------------------------------------------
2971 | Rounds the double-precision floating-point value `a' to an integer, and
2972 | returns the result as a double-precision floating-point value. The
2973 | operation is performed according to the IEC/IEEE Standard for Binary
2974 | Floating-Point Arithmetic.
2975 *----------------------------------------------------------------------------*/
2977 float64
float64_round_to_int( float64 a STATUS_PARAM
)
2981 uint64_t lastBitMask
, roundBitsMask
;
2984 a
= float64_squash_input_denormal(a STATUS_VAR
);
2986 aExp
= extractFloat64Exp( a
);
2987 if ( 0x433 <= aExp
) {
2988 if ( ( aExp
== 0x7FF ) && extractFloat64Frac( a
) ) {
2989 return propagateFloat64NaN( a
, a STATUS_VAR
);
2993 if ( aExp
< 0x3FF ) {
2994 if ( (uint64_t) ( float64_val(a
)<<1 ) == 0 ) return a
;
2995 STATUS(float_exception_flags
) |= float_flag_inexact
;
2996 aSign
= extractFloat64Sign( a
);
2997 switch ( STATUS(float_rounding_mode
) ) {
2998 case float_round_nearest_even
:
2999 if ( ( aExp
== 0x3FE ) && extractFloat64Frac( a
) ) {
3000 return packFloat64( aSign
, 0x3FF, 0 );
3003 case float_round_down
:
3004 return make_float64(aSign
? LIT64( 0xBFF0000000000000 ) : 0);
3005 case float_round_up
:
3006 return make_float64(
3007 aSign
? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
3009 return packFloat64( aSign
, 0, 0 );
3012 lastBitMask
<<= 0x433 - aExp
;
3013 roundBitsMask
= lastBitMask
- 1;
3015 roundingMode
= STATUS(float_rounding_mode
);
3016 if ( roundingMode
== float_round_nearest_even
) {
3017 z
+= lastBitMask
>>1;
3018 if ( ( z
& roundBitsMask
) == 0 ) z
&= ~ lastBitMask
;
3020 else if ( roundingMode
!= float_round_to_zero
) {
3021 if ( extractFloat64Sign( make_float64(z
) ) ^ ( roundingMode
== float_round_up
) ) {
3025 z
&= ~ roundBitsMask
;
3026 if ( z
!= float64_val(a
) )
3027 STATUS(float_exception_flags
) |= float_flag_inexact
;
3028 return make_float64(z
);
3032 float64
float64_trunc_to_int( float64 a STATUS_PARAM
)
3036 oldmode
= STATUS(float_rounding_mode
);
3037 STATUS(float_rounding_mode
) = float_round_to_zero
;
3038 res
= float64_round_to_int(a STATUS_VAR
);
3039 STATUS(float_rounding_mode
) = oldmode
;
3043 /*----------------------------------------------------------------------------
3044 | Returns the result of adding the absolute values of the double-precision
3045 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
3046 | before being returned. `zSign' is ignored if the result is a NaN.
3047 | The addition is performed according to the IEC/IEEE Standard for Binary
3048 | Floating-Point Arithmetic.
3049 *----------------------------------------------------------------------------*/
3051 static float64
addFloat64Sigs( float64 a
, float64 b
, flag zSign STATUS_PARAM
)
3053 int16 aExp
, bExp
, zExp
;
3054 uint64_t aSig
, bSig
, zSig
;
3057 aSig
= extractFloat64Frac( a
);
3058 aExp
= extractFloat64Exp( a
);
3059 bSig
= extractFloat64Frac( b
);
3060 bExp
= extractFloat64Exp( b
);
3061 expDiff
= aExp
- bExp
;
3064 if ( 0 < expDiff
) {
3065 if ( aExp
== 0x7FF ) {
3066 if ( aSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3073 bSig
|= LIT64( 0x2000000000000000 );
3075 shift64RightJamming( bSig
, expDiff
, &bSig
);
3078 else if ( expDiff
< 0 ) {
3079 if ( bExp
== 0x7FF ) {
3080 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3081 return packFloat64( zSign
, 0x7FF, 0 );
3087 aSig
|= LIT64( 0x2000000000000000 );
3089 shift64RightJamming( aSig
, - expDiff
, &aSig
);
3093 if ( aExp
== 0x7FF ) {
3094 if ( aSig
| bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3098 if (STATUS(flush_to_zero
)) {
3100 float_raise(float_flag_output_denormal STATUS_VAR
);
3102 return packFloat64(zSign
, 0, 0);
3104 return packFloat64( zSign
, 0, ( aSig
+ bSig
)>>9 );
3106 zSig
= LIT64( 0x4000000000000000 ) + aSig
+ bSig
;
3110 aSig
|= LIT64( 0x2000000000000000 );
3111 zSig
= ( aSig
+ bSig
)<<1;
3113 if ( (int64_t) zSig
< 0 ) {
3118 return roundAndPackFloat64( zSign
, zExp
, zSig STATUS_VAR
);
3122 /*----------------------------------------------------------------------------
3123 | Returns the result of subtracting the absolute values of the double-
3124 | precision floating-point values `a' and `b'. If `zSign' is 1, the
3125 | difference is negated before being returned. `zSign' is ignored if the
3126 | result is a NaN. The subtraction is performed according to the IEC/IEEE
3127 | Standard for Binary Floating-Point Arithmetic.
3128 *----------------------------------------------------------------------------*/
3130 static float64
subFloat64Sigs( float64 a
, float64 b
, flag zSign STATUS_PARAM
)
3132 int16 aExp
, bExp
, zExp
;
3133 uint64_t aSig
, bSig
, zSig
;
3136 aSig
= extractFloat64Frac( a
);
3137 aExp
= extractFloat64Exp( a
);
3138 bSig
= extractFloat64Frac( b
);
3139 bExp
= extractFloat64Exp( b
);
3140 expDiff
= aExp
- bExp
;
3143 if ( 0 < expDiff
) goto aExpBigger
;
3144 if ( expDiff
< 0 ) goto bExpBigger
;
3145 if ( aExp
== 0x7FF ) {
3146 if ( aSig
| bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3147 float_raise( float_flag_invalid STATUS_VAR
);
3148 return float64_default_nan
;
3154 if ( bSig
< aSig
) goto aBigger
;
3155 if ( aSig
< bSig
) goto bBigger
;
3156 return packFloat64( STATUS(float_rounding_mode
) == float_round_down
, 0, 0 );
3158 if ( bExp
== 0x7FF ) {
3159 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3160 return packFloat64( zSign
^ 1, 0x7FF, 0 );
3166 aSig
|= LIT64( 0x4000000000000000 );
3168 shift64RightJamming( aSig
, - expDiff
, &aSig
);
3169 bSig
|= LIT64( 0x4000000000000000 );
3174 goto normalizeRoundAndPack
;
3176 if ( aExp
== 0x7FF ) {
3177 if ( aSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3184 bSig
|= LIT64( 0x4000000000000000 );
3186 shift64RightJamming( bSig
, expDiff
, &bSig
);
3187 aSig
|= LIT64( 0x4000000000000000 );
3191 normalizeRoundAndPack
:
3193 return normalizeRoundAndPackFloat64( zSign
, zExp
, zSig STATUS_VAR
);
3197 /*----------------------------------------------------------------------------
3198 | Returns the result of adding the double-precision floating-point values `a'
3199 | and `b'. The operation is performed according to the IEC/IEEE Standard for
3200 | Binary Floating-Point Arithmetic.
3201 *----------------------------------------------------------------------------*/
3203 float64
float64_add( float64 a
, float64 b STATUS_PARAM
)
3206 a
= float64_squash_input_denormal(a STATUS_VAR
);
3207 b
= float64_squash_input_denormal(b STATUS_VAR
);
3209 aSign
= extractFloat64Sign( a
);
3210 bSign
= extractFloat64Sign( b
);
3211 if ( aSign
== bSign
) {
3212 return addFloat64Sigs( a
, b
, aSign STATUS_VAR
);
3215 return subFloat64Sigs( a
, b
, aSign STATUS_VAR
);
3220 /*----------------------------------------------------------------------------
3221 | Returns the result of subtracting the double-precision floating-point values
3222 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3223 | for Binary Floating-Point Arithmetic.
3224 *----------------------------------------------------------------------------*/
3226 float64
float64_sub( float64 a
, float64 b STATUS_PARAM
)
3229 a
= float64_squash_input_denormal(a STATUS_VAR
);
3230 b
= float64_squash_input_denormal(b STATUS_VAR
);
3232 aSign
= extractFloat64Sign( a
);
3233 bSign
= extractFloat64Sign( b
);
3234 if ( aSign
== bSign
) {
3235 return subFloat64Sigs( a
, b
, aSign STATUS_VAR
);
3238 return addFloat64Sigs( a
, b
, aSign STATUS_VAR
);
3243 /*----------------------------------------------------------------------------
3244 | Returns the result of multiplying the double-precision floating-point values
3245 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3246 | for Binary Floating-Point Arithmetic.
3247 *----------------------------------------------------------------------------*/
3249 float64
float64_mul( float64 a
, float64 b STATUS_PARAM
)
3251 flag aSign
, bSign
, zSign
;
3252 int16 aExp
, bExp
, zExp
;
3253 uint64_t aSig
, bSig
, zSig0
, zSig1
;
3255 a
= float64_squash_input_denormal(a STATUS_VAR
);
3256 b
= float64_squash_input_denormal(b STATUS_VAR
);
3258 aSig
= extractFloat64Frac( a
);
3259 aExp
= extractFloat64Exp( a
);
3260 aSign
= extractFloat64Sign( a
);
3261 bSig
= extractFloat64Frac( b
);
3262 bExp
= extractFloat64Exp( b
);
3263 bSign
= extractFloat64Sign( b
);
3264 zSign
= aSign
^ bSign
;
3265 if ( aExp
== 0x7FF ) {
3266 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
3267 return propagateFloat64NaN( a
, b STATUS_VAR
);
3269 if ( ( bExp
| bSig
) == 0 ) {
3270 float_raise( float_flag_invalid STATUS_VAR
);
3271 return float64_default_nan
;
3273 return packFloat64( zSign
, 0x7FF, 0 );
3275 if ( bExp
== 0x7FF ) {
3276 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3277 if ( ( aExp
| aSig
) == 0 ) {
3278 float_raise( float_flag_invalid STATUS_VAR
);
3279 return float64_default_nan
;
3281 return packFloat64( zSign
, 0x7FF, 0 );
3284 if ( aSig
== 0 ) return packFloat64( zSign
, 0, 0 );
3285 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
3288 if ( bSig
== 0 ) return packFloat64( zSign
, 0, 0 );
3289 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
3291 zExp
= aExp
+ bExp
- 0x3FF;
3292 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<10;
3293 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
3294 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
3295 zSig0
|= ( zSig1
!= 0 );
3296 if ( 0 <= (int64_t) ( zSig0
<<1 ) ) {
3300 return roundAndPackFloat64( zSign
, zExp
, zSig0 STATUS_VAR
);
3304 /*----------------------------------------------------------------------------
3305 | Returns the result of dividing the double-precision floating-point value `a'
3306 | by the corresponding value `b'. The operation is performed according to
3307 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3308 *----------------------------------------------------------------------------*/
3310 float64
float64_div( float64 a
, float64 b STATUS_PARAM
)
3312 flag aSign
, bSign
, zSign
;
3313 int16 aExp
, bExp
, zExp
;
3314 uint64_t aSig
, bSig
, zSig
;
3315 uint64_t rem0
, rem1
;
3316 uint64_t term0
, term1
;
3317 a
= float64_squash_input_denormal(a STATUS_VAR
);
3318 b
= float64_squash_input_denormal(b STATUS_VAR
);
3320 aSig
= extractFloat64Frac( a
);
3321 aExp
= extractFloat64Exp( a
);
3322 aSign
= extractFloat64Sign( a
);
3323 bSig
= extractFloat64Frac( b
);
3324 bExp
= extractFloat64Exp( b
);
3325 bSign
= extractFloat64Sign( b
);
3326 zSign
= aSign
^ bSign
;
3327 if ( aExp
== 0x7FF ) {
3328 if ( aSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3329 if ( bExp
== 0x7FF ) {
3330 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3331 float_raise( float_flag_invalid STATUS_VAR
);
3332 return float64_default_nan
;
3334 return packFloat64( zSign
, 0x7FF, 0 );
3336 if ( bExp
== 0x7FF ) {
3337 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3338 return packFloat64( zSign
, 0, 0 );
3342 if ( ( aExp
| aSig
) == 0 ) {
3343 float_raise( float_flag_invalid STATUS_VAR
);
3344 return float64_default_nan
;
3346 float_raise( float_flag_divbyzero STATUS_VAR
);
3347 return packFloat64( zSign
, 0x7FF, 0 );
3349 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
3352 if ( aSig
== 0 ) return packFloat64( zSign
, 0, 0 );
3353 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
3355 zExp
= aExp
- bExp
+ 0x3FD;
3356 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<10;
3357 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
3358 if ( bSig
<= ( aSig
+ aSig
) ) {
3362 zSig
= estimateDiv128To64( aSig
, 0, bSig
);
3363 if ( ( zSig
& 0x1FF ) <= 2 ) {
3364 mul64To128( bSig
, zSig
, &term0
, &term1
);
3365 sub128( aSig
, 0, term0
, term1
, &rem0
, &rem1
);
3366 while ( (int64_t) rem0
< 0 ) {
3368 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
3370 zSig
|= ( rem1
!= 0 );
3372 return roundAndPackFloat64( zSign
, zExp
, zSig STATUS_VAR
);
3376 /*----------------------------------------------------------------------------
3377 | Returns the remainder of the double-precision floating-point value `a'
3378 | with respect to the corresponding value `b'. The operation is performed
3379 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3380 *----------------------------------------------------------------------------*/
3382 float64
float64_rem( float64 a
, float64 b STATUS_PARAM
)
3385 int16 aExp
, bExp
, expDiff
;
3386 uint64_t aSig
, bSig
;
3387 uint64_t q
, alternateASig
;
3390 a
= float64_squash_input_denormal(a STATUS_VAR
);
3391 b
= float64_squash_input_denormal(b STATUS_VAR
);
3392 aSig
= extractFloat64Frac( a
);
3393 aExp
= extractFloat64Exp( a
);
3394 aSign
= extractFloat64Sign( a
);
3395 bSig
= extractFloat64Frac( b
);
3396 bExp
= extractFloat64Exp( b
);
3397 if ( aExp
== 0x7FF ) {
3398 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
3399 return propagateFloat64NaN( a
, b STATUS_VAR
);
3401 float_raise( float_flag_invalid STATUS_VAR
);
3402 return float64_default_nan
;
3404 if ( bExp
== 0x7FF ) {
3405 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3410 float_raise( float_flag_invalid STATUS_VAR
);
3411 return float64_default_nan
;
3413 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
3416 if ( aSig
== 0 ) return a
;
3417 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
3419 expDiff
= aExp
- bExp
;
3420 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<11;
3421 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
3422 if ( expDiff
< 0 ) {
3423 if ( expDiff
< -1 ) return a
;
3426 q
= ( bSig
<= aSig
);
3427 if ( q
) aSig
-= bSig
;
3429 while ( 0 < expDiff
) {
3430 q
= estimateDiv128To64( aSig
, 0, bSig
);
3431 q
= ( 2 < q
) ? q
- 2 : 0;
3432 aSig
= - ( ( bSig
>>2 ) * q
);
3436 if ( 0 < expDiff
) {
3437 q
= estimateDiv128To64( aSig
, 0, bSig
);
3438 q
= ( 2 < q
) ? q
- 2 : 0;
3441 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
3448 alternateASig
= aSig
;
3451 } while ( 0 <= (int64_t) aSig
);
3452 sigMean
= aSig
+ alternateASig
;
3453 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
3454 aSig
= alternateASig
;
3456 zSign
= ( (int64_t) aSig
< 0 );
3457 if ( zSign
) aSig
= - aSig
;
3458 return normalizeRoundAndPackFloat64( aSign
^ zSign
, bExp
, aSig STATUS_VAR
);
3462 /*----------------------------------------------------------------------------
3463 | Returns the square root of the double-precision floating-point value `a'.
3464 | The operation is performed according to the IEC/IEEE Standard for Binary
3465 | Floating-Point Arithmetic.
3466 *----------------------------------------------------------------------------*/
3468 float64
float64_sqrt( float64 a STATUS_PARAM
)
3472 uint64_t aSig
, zSig
, doubleZSig
;
3473 uint64_t rem0
, rem1
, term0
, term1
;
3474 a
= float64_squash_input_denormal(a STATUS_VAR
);
3476 aSig
= extractFloat64Frac( a
);
3477 aExp
= extractFloat64Exp( a
);
3478 aSign
= extractFloat64Sign( a
);
3479 if ( aExp
== 0x7FF ) {
3480 if ( aSig
) return propagateFloat64NaN( a
, a STATUS_VAR
);
3481 if ( ! aSign
) return a
;
3482 float_raise( float_flag_invalid STATUS_VAR
);
3483 return float64_default_nan
;
3486 if ( ( aExp
| aSig
) == 0 ) return a
;
3487 float_raise( float_flag_invalid STATUS_VAR
);
3488 return float64_default_nan
;
3491 if ( aSig
== 0 ) return float64_zero
;
3492 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
3494 zExp
= ( ( aExp
- 0x3FF )>>1 ) + 0x3FE;
3495 aSig
|= LIT64( 0x0010000000000000 );
3496 zSig
= estimateSqrt32( aExp
, aSig
>>21 );
3497 aSig
<<= 9 - ( aExp
& 1 );
3498 zSig
= estimateDiv128To64( aSig
, 0, zSig
<<32 ) + ( zSig
<<30 );
3499 if ( ( zSig
& 0x1FF ) <= 5 ) {
3500 doubleZSig
= zSig
<<1;
3501 mul64To128( zSig
, zSig
, &term0
, &term1
);
3502 sub128( aSig
, 0, term0
, term1
, &rem0
, &rem1
);
3503 while ( (int64_t) rem0
< 0 ) {
3506 add128( rem0
, rem1
, zSig
>>63, doubleZSig
| 1, &rem0
, &rem1
);
3508 zSig
|= ( ( rem0
| rem1
) != 0 );
3510 return roundAndPackFloat64( 0, zExp
, zSig STATUS_VAR
);
3514 /*----------------------------------------------------------------------------
3515 | Returns the binary log of the double-precision floating-point value `a'.
3516 | The operation is performed according to the IEC/IEEE Standard for Binary
3517 | Floating-Point Arithmetic.
3518 *----------------------------------------------------------------------------*/
3519 float64
float64_log2( float64 a STATUS_PARAM
)
3523 uint64_t aSig
, aSig0
, aSig1
, zSig
, i
;
3524 a
= float64_squash_input_denormal(a STATUS_VAR
);
3526 aSig
= extractFloat64Frac( a
);
3527 aExp
= extractFloat64Exp( a
);
3528 aSign
= extractFloat64Sign( a
);
3531 if ( aSig
== 0 ) return packFloat64( 1, 0x7FF, 0 );
3532 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
3535 float_raise( float_flag_invalid STATUS_VAR
);
3536 return float64_default_nan
;
3538 if ( aExp
== 0x7FF ) {
3539 if ( aSig
) return propagateFloat64NaN( a
, float64_zero STATUS_VAR
);
3544 aSig
|= LIT64( 0x0010000000000000 );
3546 zSig
= (uint64_t)aExp
<< 52;
3547 for (i
= 1LL << 51; i
> 0; i
>>= 1) {
3548 mul64To128( aSig
, aSig
, &aSig0
, &aSig1
);
3549 aSig
= ( aSig0
<< 12 ) | ( aSig1
>> 52 );
3550 if ( aSig
& LIT64( 0x0020000000000000 ) ) {
3558 return normalizeRoundAndPackFloat64( zSign
, 0x408, zSig STATUS_VAR
);
3561 /*----------------------------------------------------------------------------
3562 | Returns 1 if the double-precision floating-point value `a' is equal to the
3563 | corresponding value `b', and 0 otherwise. The invalid exception is raised
3564 | if either operand is a NaN. Otherwise, the comparison is performed
3565 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3566 *----------------------------------------------------------------------------*/
3568 int float64_eq( float64 a
, float64 b STATUS_PARAM
)
3571 a
= float64_squash_input_denormal(a STATUS_VAR
);
3572 b
= float64_squash_input_denormal(b STATUS_VAR
);
3574 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3575 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3577 float_raise( float_flag_invalid STATUS_VAR
);
3580 av
= float64_val(a
);
3581 bv
= float64_val(b
);
3582 return ( av
== bv
) || ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
3586 /*----------------------------------------------------------------------------
3587 | Returns 1 if the double-precision floating-point value `a' is less than or
3588 | equal to the corresponding value `b', and 0 otherwise. The invalid
3589 | exception is raised if either operand is a NaN. The comparison is performed
3590 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3591 *----------------------------------------------------------------------------*/
3593 int float64_le( float64 a
, float64 b STATUS_PARAM
)
3597 a
= float64_squash_input_denormal(a STATUS_VAR
);
3598 b
= float64_squash_input_denormal(b STATUS_VAR
);
3600 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3601 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3603 float_raise( float_flag_invalid STATUS_VAR
);
3606 aSign
= extractFloat64Sign( a
);
3607 bSign
= extractFloat64Sign( b
);
3608 av
= float64_val(a
);
3609 bv
= float64_val(b
);
3610 if ( aSign
!= bSign
) return aSign
|| ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
3611 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
3615 /*----------------------------------------------------------------------------
3616 | Returns 1 if the double-precision floating-point value `a' is less than
3617 | the corresponding value `b', and 0 otherwise. The invalid exception is
3618 | raised if either operand is a NaN. The comparison is performed according
3619 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3620 *----------------------------------------------------------------------------*/
3622 int float64_lt( float64 a
, float64 b STATUS_PARAM
)
3627 a
= float64_squash_input_denormal(a STATUS_VAR
);
3628 b
= float64_squash_input_denormal(b STATUS_VAR
);
3629 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3630 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3632 float_raise( float_flag_invalid STATUS_VAR
);
3635 aSign
= extractFloat64Sign( a
);
3636 bSign
= extractFloat64Sign( b
);
3637 av
= float64_val(a
);
3638 bv
= float64_val(b
);
3639 if ( aSign
!= bSign
) return aSign
&& ( (uint64_t) ( ( av
| bv
)<<1 ) != 0 );
3640 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
3644 /*----------------------------------------------------------------------------
3645 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
3646 | be compared, and 0 otherwise. The invalid exception is raised if either
3647 | operand is a NaN. The comparison is performed according to the IEC/IEEE
3648 | Standard for Binary Floating-Point Arithmetic.
3649 *----------------------------------------------------------------------------*/
3651 int float64_unordered( float64 a
, float64 b STATUS_PARAM
)
3653 a
= float64_squash_input_denormal(a STATUS_VAR
);
3654 b
= float64_squash_input_denormal(b STATUS_VAR
);
3656 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3657 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3659 float_raise( float_flag_invalid STATUS_VAR
);
3665 /*----------------------------------------------------------------------------
3666 | Returns 1 if the double-precision floating-point value `a' is equal to the
3667 | corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3668 | exception.The comparison is performed according to the IEC/IEEE Standard
3669 | for Binary Floating-Point Arithmetic.
3670 *----------------------------------------------------------------------------*/
3672 int float64_eq_quiet( float64 a
, float64 b STATUS_PARAM
)
3675 a
= float64_squash_input_denormal(a STATUS_VAR
);
3676 b
= float64_squash_input_denormal(b STATUS_VAR
);
3678 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3679 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3681 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
3682 float_raise( float_flag_invalid STATUS_VAR
);
3686 av
= float64_val(a
);
3687 bv
= float64_val(b
);
3688 return ( av
== bv
) || ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
3692 /*----------------------------------------------------------------------------
3693 | Returns 1 if the double-precision floating-point value `a' is less than or
3694 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3695 | cause an exception. Otherwise, the comparison is performed according to the
3696 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3697 *----------------------------------------------------------------------------*/
3699 int float64_le_quiet( float64 a
, float64 b STATUS_PARAM
)
3703 a
= float64_squash_input_denormal(a STATUS_VAR
);
3704 b
= float64_squash_input_denormal(b STATUS_VAR
);
3706 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3707 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3709 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
3710 float_raise( float_flag_invalid STATUS_VAR
);
3714 aSign
= extractFloat64Sign( a
);
3715 bSign
= extractFloat64Sign( b
);
3716 av
= float64_val(a
);
3717 bv
= float64_val(b
);
3718 if ( aSign
!= bSign
) return aSign
|| ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
3719 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
3723 /*----------------------------------------------------------------------------
3724 | Returns 1 if the double-precision floating-point value `a' is less than
3725 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3726 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
3727 | Standard for Binary Floating-Point Arithmetic.
3728 *----------------------------------------------------------------------------*/
3730 int float64_lt_quiet( float64 a
, float64 b STATUS_PARAM
)
3734 a
= float64_squash_input_denormal(a STATUS_VAR
);
3735 b
= float64_squash_input_denormal(b STATUS_VAR
);
3737 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3738 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3740 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
3741 float_raise( float_flag_invalid STATUS_VAR
);
3745 aSign
= extractFloat64Sign( a
);
3746 bSign
= extractFloat64Sign( b
);
3747 av
= float64_val(a
);
3748 bv
= float64_val(b
);
3749 if ( aSign
!= bSign
) return aSign
&& ( (uint64_t) ( ( av
| bv
)<<1 ) != 0 );
3750 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
3754 /*----------------------------------------------------------------------------
3755 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
3756 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
3757 | comparison is performed according to the IEC/IEEE Standard for Binary
3758 | Floating-Point Arithmetic.
3759 *----------------------------------------------------------------------------*/
3761 int float64_unordered_quiet( float64 a
, float64 b STATUS_PARAM
)
3763 a
= float64_squash_input_denormal(a STATUS_VAR
);
3764 b
= float64_squash_input_denormal(b STATUS_VAR
);
3766 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3767 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3769 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
3770 float_raise( float_flag_invalid STATUS_VAR
);
3777 /*----------------------------------------------------------------------------
3778 | Returns the result of converting the extended double-precision floating-
3779 | point value `a' to the 32-bit two's complement integer format. The
3780 | conversion is performed according to the IEC/IEEE Standard for Binary
3781 | Floating-Point Arithmetic---which means in particular that the conversion
3782 | is rounded according to the current rounding mode. If `a' is a NaN, the
3783 | largest positive integer is returned. Otherwise, if the conversion
3784 | overflows, the largest integer with the same sign as `a' is returned.
3785 *----------------------------------------------------------------------------*/
3787 int32
floatx80_to_int32( floatx80 a STATUS_PARAM
)
3790 int32 aExp
, shiftCount
;
3793 aSig
= extractFloatx80Frac( a
);
3794 aExp
= extractFloatx80Exp( a
);
3795 aSign
= extractFloatx80Sign( a
);
3796 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) aSign
= 0;
3797 shiftCount
= 0x4037 - aExp
;
3798 if ( shiftCount
<= 0 ) shiftCount
= 1;
3799 shift64RightJamming( aSig
, shiftCount
, &aSig
);
3800 return roundAndPackInt32( aSign
, aSig STATUS_VAR
);
3804 /*----------------------------------------------------------------------------
3805 | Returns the result of converting the extended double-precision floating-
3806 | point value `a' to the 32-bit two's complement integer format. The
3807 | conversion is performed according to the IEC/IEEE Standard for Binary
3808 | Floating-Point Arithmetic, except that the conversion is always rounded
3809 | toward zero. If `a' is a NaN, the largest positive integer is returned.
3810 | Otherwise, if the conversion overflows, the largest integer with the same
3811 | sign as `a' is returned.
3812 *----------------------------------------------------------------------------*/
3814 int32
floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM
)
3817 int32 aExp
, shiftCount
;
3818 uint64_t aSig
, savedASig
;
3821 aSig
= extractFloatx80Frac( a
);
3822 aExp
= extractFloatx80Exp( a
);
3823 aSign
= extractFloatx80Sign( a
);
3824 if ( 0x401E < aExp
) {
3825 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) aSign
= 0;
3828 else if ( aExp
< 0x3FFF ) {
3829 if ( aExp
|| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
3832 shiftCount
= 0x403E - aExp
;
3834 aSig
>>= shiftCount
;
3836 if ( aSign
) z
= - z
;
3837 if ( ( z
< 0 ) ^ aSign
) {
3839 float_raise( float_flag_invalid STATUS_VAR
);
3840 return aSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
3842 if ( ( aSig
<<shiftCount
) != savedASig
) {
3843 STATUS(float_exception_flags
) |= float_flag_inexact
;
3849 /*----------------------------------------------------------------------------
3850 | Returns the result of converting the extended double-precision floating-
3851 | point value `a' to the 64-bit two's complement integer format. The
3852 | conversion is performed according to the IEC/IEEE Standard for Binary
3853 | Floating-Point Arithmetic---which means in particular that the conversion
3854 | is rounded according to the current rounding mode. If `a' is a NaN,
3855 | the largest positive integer is returned. Otherwise, if the conversion
3856 | overflows, the largest integer with the same sign as `a' is returned.
3857 *----------------------------------------------------------------------------*/
3859 int64
floatx80_to_int64( floatx80 a STATUS_PARAM
)
3862 int32 aExp
, shiftCount
;
3863 uint64_t aSig
, aSigExtra
;
3865 aSig
= extractFloatx80Frac( a
);
3866 aExp
= extractFloatx80Exp( a
);
3867 aSign
= extractFloatx80Sign( a
);
3868 shiftCount
= 0x403E - aExp
;
3869 if ( shiftCount
<= 0 ) {
3871 float_raise( float_flag_invalid STATUS_VAR
);
3873 || ( ( aExp
== 0x7FFF )
3874 && ( aSig
!= LIT64( 0x8000000000000000 ) ) )
3876 return LIT64( 0x7FFFFFFFFFFFFFFF );
3878 return (int64_t) LIT64( 0x8000000000000000 );
3883 shift64ExtraRightJamming( aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
3885 return roundAndPackInt64( aSign
, aSig
, aSigExtra STATUS_VAR
);
3889 /*----------------------------------------------------------------------------
3890 | Returns the result of converting the extended double-precision floating-
3891 | point value `a' to the 64-bit two's complement integer format. The
3892 | conversion is performed according to the IEC/IEEE Standard for Binary
3893 | Floating-Point Arithmetic, except that the conversion is always rounded
3894 | toward zero. If `a' is a NaN, the largest positive integer is returned.
3895 | Otherwise, if the conversion overflows, the largest integer with the same
3896 | sign as `a' is returned.
3897 *----------------------------------------------------------------------------*/
3899 int64
floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM
)
3902 int32 aExp
, shiftCount
;
3906 aSig
= extractFloatx80Frac( a
);
3907 aExp
= extractFloatx80Exp( a
);
3908 aSign
= extractFloatx80Sign( a
);
3909 shiftCount
= aExp
- 0x403E;
3910 if ( 0 <= shiftCount
) {
3911 aSig
&= LIT64( 0x7FFFFFFFFFFFFFFF );
3912 if ( ( a
.high
!= 0xC03E ) || aSig
) {
3913 float_raise( float_flag_invalid STATUS_VAR
);
3914 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && aSig
) ) {
3915 return LIT64( 0x7FFFFFFFFFFFFFFF );
3918 return (int64_t) LIT64( 0x8000000000000000 );
3920 else if ( aExp
< 0x3FFF ) {
3921 if ( aExp
| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
3924 z
= aSig
>>( - shiftCount
);
3925 if ( (uint64_t) ( aSig
<<( shiftCount
& 63 ) ) ) {
3926 STATUS(float_exception_flags
) |= float_flag_inexact
;
3928 if ( aSign
) z
= - z
;
3933 /*----------------------------------------------------------------------------
3934 | Returns the result of converting the extended double-precision floating-
3935 | point value `a' to the single-precision floating-point format. The
3936 | conversion is performed according to the IEC/IEEE Standard for Binary
3937 | Floating-Point Arithmetic.
3938 *----------------------------------------------------------------------------*/
3940 float32
floatx80_to_float32( floatx80 a STATUS_PARAM
)
3946 aSig
= extractFloatx80Frac( a
);
3947 aExp
= extractFloatx80Exp( a
);
3948 aSign
= extractFloatx80Sign( a
);
3949 if ( aExp
== 0x7FFF ) {
3950 if ( (uint64_t) ( aSig
<<1 ) ) {
3951 return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
3953 return packFloat32( aSign
, 0xFF, 0 );
3955 shift64RightJamming( aSig
, 33, &aSig
);
3956 if ( aExp
|| aSig
) aExp
-= 0x3F81;
3957 return roundAndPackFloat32( aSign
, aExp
, aSig STATUS_VAR
);
3961 /*----------------------------------------------------------------------------
3962 | Returns the result of converting the extended double-precision floating-
3963 | point value `a' to the double-precision floating-point format. The
3964 | conversion is performed according to the IEC/IEEE Standard for Binary
3965 | Floating-Point Arithmetic.
3966 *----------------------------------------------------------------------------*/
3968 float64
floatx80_to_float64( floatx80 a STATUS_PARAM
)
3972 uint64_t aSig
, zSig
;
3974 aSig
= extractFloatx80Frac( a
);
3975 aExp
= extractFloatx80Exp( a
);
3976 aSign
= extractFloatx80Sign( a
);
3977 if ( aExp
== 0x7FFF ) {
3978 if ( (uint64_t) ( aSig
<<1 ) ) {
3979 return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
3981 return packFloat64( aSign
, 0x7FF, 0 );
3983 shift64RightJamming( aSig
, 1, &zSig
);
3984 if ( aExp
|| aSig
) aExp
-= 0x3C01;
3985 return roundAndPackFloat64( aSign
, aExp
, zSig STATUS_VAR
);
3989 /*----------------------------------------------------------------------------
3990 | Returns the result of converting the extended double-precision floating-
3991 | point value `a' to the quadruple-precision floating-point format. The
3992 | conversion is performed according to the IEC/IEEE Standard for Binary
3993 | Floating-Point Arithmetic.
3994 *----------------------------------------------------------------------------*/
3996 float128
floatx80_to_float128( floatx80 a STATUS_PARAM
)
4000 uint64_t aSig
, zSig0
, zSig1
;
4002 aSig
= extractFloatx80Frac( a
);
4003 aExp
= extractFloatx80Exp( a
);
4004 aSign
= extractFloatx80Sign( a
);
4005 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) {
4006 return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
4008 shift128Right( aSig
<<1, 0, 16, &zSig0
, &zSig1
);
4009 return packFloat128( aSign
, aExp
, zSig0
, zSig1
);
4013 /*----------------------------------------------------------------------------
4014 | Rounds the extended double-precision floating-point value `a' to an integer,
4015 | and returns the result as an extended quadruple-precision floating-point
4016 | value. The operation is performed according to the IEC/IEEE Standard for
4017 | Binary Floating-Point Arithmetic.
4018 *----------------------------------------------------------------------------*/
4020 floatx80
floatx80_round_to_int( floatx80 a STATUS_PARAM
)
4024 uint64_t lastBitMask
, roundBitsMask
;
4028 aExp
= extractFloatx80Exp( a
);
4029 if ( 0x403E <= aExp
) {
4030 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) ) {
4031 return propagateFloatx80NaN( a
, a STATUS_VAR
);
4035 if ( aExp
< 0x3FFF ) {
4037 && ( (uint64_t) ( extractFloatx80Frac( a
)<<1 ) == 0 ) ) {
4040 STATUS(float_exception_flags
) |= float_flag_inexact
;
4041 aSign
= extractFloatx80Sign( a
);
4042 switch ( STATUS(float_rounding_mode
) ) {
4043 case float_round_nearest_even
:
4044 if ( ( aExp
== 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a
)<<1 )
4047 packFloatx80( aSign
, 0x3FFF, LIT64( 0x8000000000000000 ) );
4050 case float_round_down
:
4053 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4054 : packFloatx80( 0, 0, 0 );
4055 case float_round_up
:
4057 aSign
? packFloatx80( 1, 0, 0 )
4058 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4060 return packFloatx80( aSign
, 0, 0 );
4063 lastBitMask
<<= 0x403E - aExp
;
4064 roundBitsMask
= lastBitMask
- 1;
4066 roundingMode
= STATUS(float_rounding_mode
);
4067 if ( roundingMode
== float_round_nearest_even
) {
4068 z
.low
+= lastBitMask
>>1;
4069 if ( ( z
.low
& roundBitsMask
) == 0 ) z
.low
&= ~ lastBitMask
;
4071 else if ( roundingMode
!= float_round_to_zero
) {
4072 if ( extractFloatx80Sign( z
) ^ ( roundingMode
== float_round_up
) ) {
4073 z
.low
+= roundBitsMask
;
4076 z
.low
&= ~ roundBitsMask
;
4079 z
.low
= LIT64( 0x8000000000000000 );
4081 if ( z
.low
!= a
.low
) STATUS(float_exception_flags
) |= float_flag_inexact
;
4086 /*----------------------------------------------------------------------------
4087 | Returns the result of adding the absolute values of the extended double-
4088 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
4089 | negated before being returned. `zSign' is ignored if the result is a NaN.
4090 | The addition is performed according to the IEC/IEEE Standard for Binary
4091 | Floating-Point Arithmetic.
4092 *----------------------------------------------------------------------------*/
4094 static floatx80
addFloatx80Sigs( floatx80 a
, floatx80 b
, flag zSign STATUS_PARAM
)
4096 int32 aExp
, bExp
, zExp
;
4097 uint64_t aSig
, bSig
, zSig0
, zSig1
;
4100 aSig
= extractFloatx80Frac( a
);
4101 aExp
= extractFloatx80Exp( a
);
4102 bSig
= extractFloatx80Frac( b
);
4103 bExp
= extractFloatx80Exp( b
);
4104 expDiff
= aExp
- bExp
;
4105 if ( 0 < expDiff
) {
4106 if ( aExp
== 0x7FFF ) {
4107 if ( (uint64_t) ( aSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
4110 if ( bExp
== 0 ) --expDiff
;
4111 shift64ExtraRightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
4114 else if ( expDiff
< 0 ) {
4115 if ( bExp
== 0x7FFF ) {
4116 if ( (uint64_t) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
4117 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
4119 if ( aExp
== 0 ) ++expDiff
;
4120 shift64ExtraRightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
4124 if ( aExp
== 0x7FFF ) {
4125 if ( (uint64_t) ( ( aSig
| bSig
)<<1 ) ) {
4126 return propagateFloatx80NaN( a
, b STATUS_VAR
);
4131 zSig0
= aSig
+ bSig
;
4133 normalizeFloatx80Subnormal( zSig0
, &zExp
, &zSig0
);
4139 zSig0
= aSig
+ bSig
;
4140 if ( (int64_t) zSig0
< 0 ) goto roundAndPack
;
4142 shift64ExtraRightJamming( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
4143 zSig0
|= LIT64( 0x8000000000000000 );
4147 roundAndPackFloatx80(
4148 STATUS(floatx80_rounding_precision
), zSign
, zExp
, zSig0
, zSig1 STATUS_VAR
);
4152 /*----------------------------------------------------------------------------
4153 | Returns the result of subtracting the absolute values of the extended
4154 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
4155 | difference is negated before being returned. `zSign' is ignored if the
4156 | result is a NaN. The subtraction is performed according to the IEC/IEEE
4157 | Standard for Binary Floating-Point Arithmetic.
4158 *----------------------------------------------------------------------------*/
4160 static floatx80
subFloatx80Sigs( floatx80 a
, floatx80 b
, flag zSign STATUS_PARAM
)
4162 int32 aExp
, bExp
, zExp
;
4163 uint64_t aSig
, bSig
, zSig0
, zSig1
;
4167 aSig
= extractFloatx80Frac( a
);
4168 aExp
= extractFloatx80Exp( a
);
4169 bSig
= extractFloatx80Frac( b
);
4170 bExp
= extractFloatx80Exp( b
);
4171 expDiff
= aExp
- bExp
;
4172 if ( 0 < expDiff
) goto aExpBigger
;
4173 if ( expDiff
< 0 ) goto bExpBigger
;
4174 if ( aExp
== 0x7FFF ) {
4175 if ( (uint64_t) ( ( aSig
| bSig
)<<1 ) ) {
4176 return propagateFloatx80NaN( a
, b STATUS_VAR
);
4178 float_raise( float_flag_invalid STATUS_VAR
);
4179 z
.low
= floatx80_default_nan_low
;
4180 z
.high
= floatx80_default_nan_high
;
4188 if ( bSig
< aSig
) goto aBigger
;
4189 if ( aSig
< bSig
) goto bBigger
;
4190 return packFloatx80( STATUS(float_rounding_mode
) == float_round_down
, 0, 0 );
4192 if ( bExp
== 0x7FFF ) {
4193 if ( (uint64_t) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
4194 return packFloatx80( zSign
^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
4196 if ( aExp
== 0 ) ++expDiff
;
4197 shift128RightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
4199 sub128( bSig
, 0, aSig
, zSig1
, &zSig0
, &zSig1
);
4202 goto normalizeRoundAndPack
;
4204 if ( aExp
== 0x7FFF ) {
4205 if ( (uint64_t) ( aSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
4208 if ( bExp
== 0 ) --expDiff
;
4209 shift128RightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
4211 sub128( aSig
, 0, bSig
, zSig1
, &zSig0
, &zSig1
);
4213 normalizeRoundAndPack
:
4215 normalizeRoundAndPackFloatx80(
4216 STATUS(floatx80_rounding_precision
), zSign
, zExp
, zSig0
, zSig1 STATUS_VAR
);
4220 /*----------------------------------------------------------------------------
4221 | Returns the result of adding the extended double-precision floating-point
4222 | values `a' and `b'. The operation is performed according to the IEC/IEEE
4223 | Standard for Binary Floating-Point Arithmetic.
4224 *----------------------------------------------------------------------------*/
4226 floatx80
floatx80_add( floatx80 a
, floatx80 b STATUS_PARAM
)
4230 aSign
= extractFloatx80Sign( a
);
4231 bSign
= extractFloatx80Sign( b
);
4232 if ( aSign
== bSign
) {
4233 return addFloatx80Sigs( a
, b
, aSign STATUS_VAR
);
4236 return subFloatx80Sigs( a
, b
, aSign STATUS_VAR
);
4241 /*----------------------------------------------------------------------------
4242 | Returns the result of subtracting the extended double-precision floating-
4243 | point values `a' and `b'. The operation is performed according to the
4244 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4245 *----------------------------------------------------------------------------*/
4247 floatx80
floatx80_sub( floatx80 a
, floatx80 b STATUS_PARAM
)
4251 aSign
= extractFloatx80Sign( a
);
4252 bSign
= extractFloatx80Sign( b
);
4253 if ( aSign
== bSign
) {
4254 return subFloatx80Sigs( a
, b
, aSign STATUS_VAR
);
4257 return addFloatx80Sigs( a
, b
, aSign STATUS_VAR
);
4262 /*----------------------------------------------------------------------------
4263 | Returns the result of multiplying the extended double-precision floating-
4264 | point values `a' and `b'. The operation is performed according to the
4265 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4266 *----------------------------------------------------------------------------*/
4268 floatx80
floatx80_mul( floatx80 a
, floatx80 b STATUS_PARAM
)
4270 flag aSign
, bSign
, zSign
;
4271 int32 aExp
, bExp
, zExp
;
4272 uint64_t aSig
, bSig
, zSig0
, zSig1
;
4275 aSig
= extractFloatx80Frac( a
);
4276 aExp
= extractFloatx80Exp( a
);
4277 aSign
= extractFloatx80Sign( a
);
4278 bSig
= extractFloatx80Frac( b
);
4279 bExp
= extractFloatx80Exp( b
);
4280 bSign
= extractFloatx80Sign( b
);
4281 zSign
= aSign
^ bSign
;
4282 if ( aExp
== 0x7FFF ) {
4283 if ( (uint64_t) ( aSig
<<1 )
4284 || ( ( bExp
== 0x7FFF ) && (uint64_t) ( bSig
<<1 ) ) ) {
4285 return propagateFloatx80NaN( a
, b STATUS_VAR
);
4287 if ( ( bExp
| bSig
) == 0 ) goto invalid
;
4288 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
4290 if ( bExp
== 0x7FFF ) {
4291 if ( (uint64_t) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
4292 if ( ( aExp
| aSig
) == 0 ) {
4294 float_raise( float_flag_invalid STATUS_VAR
);
4295 z
.low
= floatx80_default_nan_low
;
4296 z
.high
= floatx80_default_nan_high
;
4299 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
4302 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
4303 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
4306 if ( bSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
4307 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
4309 zExp
= aExp
+ bExp
- 0x3FFE;
4310 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
4311 if ( 0 < (int64_t) zSig0
) {
4312 shortShift128Left( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
4316 roundAndPackFloatx80(
4317 STATUS(floatx80_rounding_precision
), zSign
, zExp
, zSig0
, zSig1 STATUS_VAR
);
4321 /*----------------------------------------------------------------------------
4322 | Returns the result of dividing the extended double-precision floating-point
4323 | value `a' by the corresponding value `b'. The operation is performed
4324 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4325 *----------------------------------------------------------------------------*/
4327 floatx80
floatx80_div( floatx80 a
, floatx80 b STATUS_PARAM
)
4329 flag aSign
, bSign
, zSign
;
4330 int32 aExp
, bExp
, zExp
;
4331 uint64_t aSig
, bSig
, zSig0
, zSig1
;
4332 uint64_t rem0
, rem1
, rem2
, term0
, term1
, term2
;
4335 aSig
= extractFloatx80Frac( a
);
4336 aExp
= extractFloatx80Exp( a
);
4337 aSign
= extractFloatx80Sign( a
);
4338 bSig
= extractFloatx80Frac( b
);
4339 bExp
= extractFloatx80Exp( b
);
4340 bSign
= extractFloatx80Sign( b
);
4341 zSign
= aSign
^ bSign
;
4342 if ( aExp
== 0x7FFF ) {
4343 if ( (uint64_t) ( aSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
4344 if ( bExp
== 0x7FFF ) {
4345 if ( (uint64_t) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
4348 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
4350 if ( bExp
== 0x7FFF ) {
4351 if ( (uint64_t) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
4352 return packFloatx80( zSign
, 0, 0 );
4356 if ( ( aExp
| aSig
) == 0 ) {
4358 float_raise( float_flag_invalid STATUS_VAR
);
4359 z
.low
= floatx80_default_nan_low
;
4360 z
.high
= floatx80_default_nan_high
;
4363 float_raise( float_flag_divbyzero STATUS_VAR
);
4364 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
4366 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
4369 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
4370 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
4372 zExp
= aExp
- bExp
+ 0x3FFE;
4374 if ( bSig
<= aSig
) {
4375 shift128Right( aSig
, 0, 1, &aSig
, &rem1
);
4378 zSig0
= estimateDiv128To64( aSig
, rem1
, bSig
);
4379 mul64To128( bSig
, zSig0
, &term0
, &term1
);
4380 sub128( aSig
, rem1
, term0
, term1
, &rem0
, &rem1
);
4381 while ( (int64_t) rem0
< 0 ) {
4383 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
4385 zSig1
= estimateDiv128To64( rem1
, 0, bSig
);
4386 if ( (uint64_t) ( zSig1
<<1 ) <= 8 ) {
4387 mul64To128( bSig
, zSig1
, &term1
, &term2
);
4388 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
4389 while ( (int64_t) rem1
< 0 ) {
4391 add128( rem1
, rem2
, 0, bSig
, &rem1
, &rem2
);
4393 zSig1
|= ( ( rem1
| rem2
) != 0 );
4396 roundAndPackFloatx80(
4397 STATUS(floatx80_rounding_precision
), zSign
, zExp
, zSig0
, zSig1 STATUS_VAR
);
4401 /*----------------------------------------------------------------------------
4402 | Returns the remainder of the extended double-precision floating-point value
4403 | `a' with respect to the corresponding value `b'. The operation is performed
4404 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4405 *----------------------------------------------------------------------------*/
4407 floatx80
floatx80_rem( floatx80 a
, floatx80 b STATUS_PARAM
)
4410 int32 aExp
, bExp
, expDiff
;
4411 uint64_t aSig0
, aSig1
, bSig
;
4412 uint64_t q
, term0
, term1
, alternateASig0
, alternateASig1
;
4415 aSig0
= extractFloatx80Frac( a
);
4416 aExp
= extractFloatx80Exp( a
);
4417 aSign
= extractFloatx80Sign( a
);
4418 bSig
= extractFloatx80Frac( b
);
4419 bExp
= extractFloatx80Exp( b
);
4420 if ( aExp
== 0x7FFF ) {
4421 if ( (uint64_t) ( aSig0
<<1 )
4422 || ( ( bExp
== 0x7FFF ) && (uint64_t) ( bSig
<<1 ) ) ) {
4423 return propagateFloatx80NaN( a
, b STATUS_VAR
);
4427 if ( bExp
== 0x7FFF ) {
4428 if ( (uint64_t) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
4434 float_raise( float_flag_invalid STATUS_VAR
);
4435 z
.low
= floatx80_default_nan_low
;
4436 z
.high
= floatx80_default_nan_high
;
4439 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
4442 if ( (uint64_t) ( aSig0
<<1 ) == 0 ) return a
;
4443 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
4445 bSig
|= LIT64( 0x8000000000000000 );
4447 expDiff
= aExp
- bExp
;
4449 if ( expDiff
< 0 ) {
4450 if ( expDiff
< -1 ) return a
;
4451 shift128Right( aSig0
, 0, 1, &aSig0
, &aSig1
);
4454 q
= ( bSig
<= aSig0
);
4455 if ( q
) aSig0
-= bSig
;
4457 while ( 0 < expDiff
) {
4458 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
4459 q
= ( 2 < q
) ? q
- 2 : 0;
4460 mul64To128( bSig
, q
, &term0
, &term1
);
4461 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
4462 shortShift128Left( aSig0
, aSig1
, 62, &aSig0
, &aSig1
);
4466 if ( 0 < expDiff
) {
4467 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
4468 q
= ( 2 < q
) ? q
- 2 : 0;
4470 mul64To128( bSig
, q
<<( 64 - expDiff
), &term0
, &term1
);
4471 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
4472 shortShift128Left( 0, bSig
, 64 - expDiff
, &term0
, &term1
);
4473 while ( le128( term0
, term1
, aSig0
, aSig1
) ) {
4475 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
4482 sub128( term0
, term1
, aSig0
, aSig1
, &alternateASig0
, &alternateASig1
);
4483 if ( lt128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
4484 || ( eq128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
4487 aSig0
= alternateASig0
;
4488 aSig1
= alternateASig1
;
4492 normalizeRoundAndPackFloatx80(
4493 80, zSign
, bExp
+ expDiff
, aSig0
, aSig1 STATUS_VAR
);
4497 /*----------------------------------------------------------------------------
4498 | Returns the square root of the extended double-precision floating-point
4499 | value `a'. The operation is performed according to the IEC/IEEE Standard
4500 | for Binary Floating-Point Arithmetic.
4501 *----------------------------------------------------------------------------*/
4503 floatx80
floatx80_sqrt( floatx80 a STATUS_PARAM
)
4507 uint64_t aSig0
, aSig1
, zSig0
, zSig1
, doubleZSig0
;
4508 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
4511 aSig0
= extractFloatx80Frac( a
);
4512 aExp
= extractFloatx80Exp( a
);
4513 aSign
= extractFloatx80Sign( a
);
4514 if ( aExp
== 0x7FFF ) {
4515 if ( (uint64_t) ( aSig0
<<1 ) ) return propagateFloatx80NaN( a
, a STATUS_VAR
);
4516 if ( ! aSign
) return a
;
4520 if ( ( aExp
| aSig0
) == 0 ) return a
;
4522 float_raise( float_flag_invalid STATUS_VAR
);
4523 z
.low
= floatx80_default_nan_low
;
4524 z
.high
= floatx80_default_nan_high
;
4528 if ( aSig0
== 0 ) return packFloatx80( 0, 0, 0 );
4529 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
4531 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFF;
4532 zSig0
= estimateSqrt32( aExp
, aSig0
>>32 );
4533 shift128Right( aSig0
, 0, 2 + ( aExp
& 1 ), &aSig0
, &aSig1
);
4534 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
4535 doubleZSig0
= zSig0
<<1;
4536 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
4537 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
4538 while ( (int64_t) rem0
< 0 ) {
4541 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
4543 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
4544 if ( ( zSig1
& LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4545 if ( zSig1
== 0 ) zSig1
= 1;
4546 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
4547 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
4548 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
4549 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
4550 while ( (int64_t) rem1
< 0 ) {
4552 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
4554 term2
|= doubleZSig0
;
4555 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
4557 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
4559 shortShift128Left( 0, zSig1
, 1, &zSig0
, &zSig1
);
4560 zSig0
|= doubleZSig0
;
4562 roundAndPackFloatx80(
4563 STATUS(floatx80_rounding_precision
), 0, zExp
, zSig0
, zSig1 STATUS_VAR
);
4567 /*----------------------------------------------------------------------------
4568 | Returns 1 if the extended double-precision floating-point value `a' is equal
4569 | to the corresponding value `b', and 0 otherwise. The invalid exception is
4570 | raised if either operand is a NaN. Otherwise, the comparison is performed
4571 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4572 *----------------------------------------------------------------------------*/
4574 int floatx80_eq( floatx80 a
, floatx80 b STATUS_PARAM
)
4577 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4578 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
4579 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4580 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
4582 float_raise( float_flag_invalid STATUS_VAR
);
4587 && ( ( a
.high
== b
.high
)
4589 && ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
4594 /*----------------------------------------------------------------------------
4595 | Returns 1 if the extended double-precision floating-point value `a' is
4596 | less than or equal to the corresponding value `b', and 0 otherwise. The
4597 | invalid exception is raised if either operand is a NaN. The comparison is
4598 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4600 *----------------------------------------------------------------------------*/
4602 int floatx80_le( floatx80 a
, floatx80 b STATUS_PARAM
)
4606 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4607 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
4608 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4609 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
4611 float_raise( float_flag_invalid STATUS_VAR
);
4614 aSign
= extractFloatx80Sign( a
);
4615 bSign
= extractFloatx80Sign( b
);
4616 if ( aSign
!= bSign
) {
4619 || ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
4623 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
4624 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
4628 /*----------------------------------------------------------------------------
4629 | Returns 1 if the extended double-precision floating-point value `a' is
4630 | less than the corresponding value `b', and 0 otherwise. The invalid
4631 | exception is raised if either operand is a NaN. The comparison is performed
4632 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4633 *----------------------------------------------------------------------------*/
4635 int floatx80_lt( floatx80 a
, floatx80 b STATUS_PARAM
)
4639 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4640 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
4641 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4642 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
4644 float_raise( float_flag_invalid STATUS_VAR
);
4647 aSign
= extractFloatx80Sign( a
);
4648 bSign
= extractFloatx80Sign( b
);
4649 if ( aSign
!= bSign
) {
4652 && ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
4656 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
4657 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
4661 /*----------------------------------------------------------------------------
4662 | Returns 1 if the extended double-precision floating-point values `a' and `b'
4663 | cannot be compared, and 0 otherwise. The invalid exception is raised if
4664 | either operand is a NaN. The comparison is performed according to the
4665 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4666 *----------------------------------------------------------------------------*/
4667 int floatx80_unordered( floatx80 a
, floatx80 b STATUS_PARAM
)
4669 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4670 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
4671 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4672 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
4674 float_raise( float_flag_invalid STATUS_VAR
);
4680 /*----------------------------------------------------------------------------
4681 | Returns 1 if the extended double-precision floating-point value `a' is
4682 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4683 | cause an exception. The comparison is performed according to the IEC/IEEE
4684 | Standard for Binary Floating-Point Arithmetic.
4685 *----------------------------------------------------------------------------*/
4687 int floatx80_eq_quiet( floatx80 a
, floatx80 b STATUS_PARAM
)
4690 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4691 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
4692 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4693 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
4695 if ( floatx80_is_signaling_nan( a
)
4696 || floatx80_is_signaling_nan( b
) ) {
4697 float_raise( float_flag_invalid STATUS_VAR
);
4703 && ( ( a
.high
== b
.high
)
4705 && ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
4710 /*----------------------------------------------------------------------------
4711 | Returns 1 if the extended double-precision floating-point value `a' is less
4712 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
4713 | do not cause an exception. Otherwise, the comparison is performed according
4714 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4715 *----------------------------------------------------------------------------*/
4717 int floatx80_le_quiet( floatx80 a
, floatx80 b STATUS_PARAM
)
4721 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4722 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
4723 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4724 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
4726 if ( floatx80_is_signaling_nan( a
)
4727 || floatx80_is_signaling_nan( b
) ) {
4728 float_raise( float_flag_invalid STATUS_VAR
);
4732 aSign
= extractFloatx80Sign( a
);
4733 bSign
= extractFloatx80Sign( b
);
4734 if ( aSign
!= bSign
) {
4737 || ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
4741 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
4742 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
4746 /*----------------------------------------------------------------------------
4747 | Returns 1 if the extended double-precision floating-point value `a' is less
4748 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
4749 | an exception. Otherwise, the comparison is performed according to the
4750 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4751 *----------------------------------------------------------------------------*/
4753 int floatx80_lt_quiet( floatx80 a
, floatx80 b STATUS_PARAM
)
4757 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4758 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
4759 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4760 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
4762 if ( floatx80_is_signaling_nan( a
)
4763 || floatx80_is_signaling_nan( b
) ) {
4764 float_raise( float_flag_invalid STATUS_VAR
);
4768 aSign
= extractFloatx80Sign( a
);
4769 bSign
= extractFloatx80Sign( b
);
4770 if ( aSign
!= bSign
) {
4773 && ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
4777 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
4778 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
4782 /*----------------------------------------------------------------------------
4783 | Returns 1 if the extended double-precision floating-point values `a' and `b'
4784 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
4785 | The comparison is performed according to the IEC/IEEE Standard for Binary
4786 | Floating-Point Arithmetic.
4787 *----------------------------------------------------------------------------*/
4788 int floatx80_unordered_quiet( floatx80 a
, floatx80 b STATUS_PARAM
)
4790 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4791 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
4792 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4793 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
4795 if ( floatx80_is_signaling_nan( a
)
4796 || floatx80_is_signaling_nan( b
) ) {
4797 float_raise( float_flag_invalid STATUS_VAR
);
4804 /*----------------------------------------------------------------------------
4805 | Returns the result of converting the quadruple-precision floating-point
4806 | value `a' to the 32-bit two's complement integer format. The conversion
4807 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4808 | Arithmetic---which means in particular that the conversion is rounded
4809 | according to the current rounding mode. If `a' is a NaN, the largest
4810 | positive integer is returned. Otherwise, if the conversion overflows, the
4811 | largest integer with the same sign as `a' is returned.
4812 *----------------------------------------------------------------------------*/
4814 int32
float128_to_int32( float128 a STATUS_PARAM
)
4817 int32 aExp
, shiftCount
;
4818 uint64_t aSig0
, aSig1
;
4820 aSig1
= extractFloat128Frac1( a
);
4821 aSig0
= extractFloat128Frac0( a
);
4822 aExp
= extractFloat128Exp( a
);
4823 aSign
= extractFloat128Sign( a
);
4824 if ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) aSign
= 0;
4825 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
4826 aSig0
|= ( aSig1
!= 0 );
4827 shiftCount
= 0x4028 - aExp
;
4828 if ( 0 < shiftCount
) shift64RightJamming( aSig0
, shiftCount
, &aSig0
);
4829 return roundAndPackInt32( aSign
, aSig0 STATUS_VAR
);
4833 /*----------------------------------------------------------------------------
4834 | Returns the result of converting the quadruple-precision floating-point
4835 | value `a' to the 32-bit two's complement integer format. The conversion
4836 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4837 | Arithmetic, except that the conversion is always rounded toward zero. If
4838 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
4839 | conversion overflows, the largest integer with the same sign as `a' is
4841 *----------------------------------------------------------------------------*/
4843 int32
float128_to_int32_round_to_zero( float128 a STATUS_PARAM
)
4846 int32 aExp
, shiftCount
;
4847 uint64_t aSig0
, aSig1
, savedASig
;
4850 aSig1
= extractFloat128Frac1( a
);
4851 aSig0
= extractFloat128Frac0( a
);
4852 aExp
= extractFloat128Exp( a
);
4853 aSign
= extractFloat128Sign( a
);
4854 aSig0
|= ( aSig1
!= 0 );
4855 if ( 0x401E < aExp
) {
4856 if ( ( aExp
== 0x7FFF ) && aSig0
) aSign
= 0;
4859 else if ( aExp
< 0x3FFF ) {
4860 if ( aExp
|| aSig0
) STATUS(float_exception_flags
) |= float_flag_inexact
;
4863 aSig0
|= LIT64( 0x0001000000000000 );
4864 shiftCount
= 0x402F - aExp
;
4866 aSig0
>>= shiftCount
;
4868 if ( aSign
) z
= - z
;
4869 if ( ( z
< 0 ) ^ aSign
) {
4871 float_raise( float_flag_invalid STATUS_VAR
);
4872 return aSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
4874 if ( ( aSig0
<<shiftCount
) != savedASig
) {
4875 STATUS(float_exception_flags
) |= float_flag_inexact
;
4881 /*----------------------------------------------------------------------------
4882 | Returns the result of converting the quadruple-precision floating-point
4883 | value `a' to the 64-bit two's complement integer format. The conversion
4884 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4885 | Arithmetic---which means in particular that the conversion is rounded
4886 | according to the current rounding mode. If `a' is a NaN, the largest
4887 | positive integer is returned. Otherwise, if the conversion overflows, the
4888 | largest integer with the same sign as `a' is returned.
4889 *----------------------------------------------------------------------------*/
4891 int64
float128_to_int64( float128 a STATUS_PARAM
)
4894 int32 aExp
, shiftCount
;
4895 uint64_t aSig0
, aSig1
;
4897 aSig1
= extractFloat128Frac1( a
);
4898 aSig0
= extractFloat128Frac0( a
);
4899 aExp
= extractFloat128Exp( a
);
4900 aSign
= extractFloat128Sign( a
);
4901 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
4902 shiftCount
= 0x402F - aExp
;
4903 if ( shiftCount
<= 0 ) {
4904 if ( 0x403E < aExp
) {
4905 float_raise( float_flag_invalid STATUS_VAR
);
4907 || ( ( aExp
== 0x7FFF )
4908 && ( aSig1
|| ( aSig0
!= LIT64( 0x0001000000000000 ) ) )
4911 return LIT64( 0x7FFFFFFFFFFFFFFF );
4913 return (int64_t) LIT64( 0x8000000000000000 );
4915 shortShift128Left( aSig0
, aSig1
, - shiftCount
, &aSig0
, &aSig1
);
4918 shift64ExtraRightJamming( aSig0
, aSig1
, shiftCount
, &aSig0
, &aSig1
);
4920 return roundAndPackInt64( aSign
, aSig0
, aSig1 STATUS_VAR
);
4924 /*----------------------------------------------------------------------------
4925 | Returns the result of converting the quadruple-precision floating-point
4926 | value `a' to the 64-bit two's complement integer format. The conversion
4927 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4928 | Arithmetic, except that the conversion is always rounded toward zero.
4929 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
4930 | the conversion overflows, the largest integer with the same sign as `a' is
4932 *----------------------------------------------------------------------------*/
4934 int64
float128_to_int64_round_to_zero( float128 a STATUS_PARAM
)
4937 int32 aExp
, shiftCount
;
4938 uint64_t aSig0
, aSig1
;
4941 aSig1
= extractFloat128Frac1( a
);
4942 aSig0
= extractFloat128Frac0( a
);
4943 aExp
= extractFloat128Exp( a
);
4944 aSign
= extractFloat128Sign( a
);
4945 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
4946 shiftCount
= aExp
- 0x402F;
4947 if ( 0 < shiftCount
) {
4948 if ( 0x403E <= aExp
) {
4949 aSig0
&= LIT64( 0x0000FFFFFFFFFFFF );
4950 if ( ( a
.high
== LIT64( 0xC03E000000000000 ) )
4951 && ( aSig1
< LIT64( 0x0002000000000000 ) ) ) {
4952 if ( aSig1
) STATUS(float_exception_flags
) |= float_flag_inexact
;
4955 float_raise( float_flag_invalid STATUS_VAR
);
4956 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) ) {
4957 return LIT64( 0x7FFFFFFFFFFFFFFF );
4960 return (int64_t) LIT64( 0x8000000000000000 );
4962 z
= ( aSig0
<<shiftCount
) | ( aSig1
>>( ( - shiftCount
) & 63 ) );
4963 if ( (uint64_t) ( aSig1
<<shiftCount
) ) {
4964 STATUS(float_exception_flags
) |= float_flag_inexact
;
4968 if ( aExp
< 0x3FFF ) {
4969 if ( aExp
| aSig0
| aSig1
) {
4970 STATUS(float_exception_flags
) |= float_flag_inexact
;
4974 z
= aSig0
>>( - shiftCount
);
4976 || ( shiftCount
&& (uint64_t) ( aSig0
<<( shiftCount
& 63 ) ) ) ) {
4977 STATUS(float_exception_flags
) |= float_flag_inexact
;
4980 if ( aSign
) z
= - z
;
4985 /*----------------------------------------------------------------------------
4986 | Returns the result of converting the quadruple-precision floating-point
4987 | value `a' to the single-precision floating-point format. The conversion
4988 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4990 *----------------------------------------------------------------------------*/
4992 float32
float128_to_float32( float128 a STATUS_PARAM
)
4996 uint64_t aSig0
, aSig1
;
4999 aSig1
= extractFloat128Frac1( a
);
5000 aSig0
= extractFloat128Frac0( a
);
5001 aExp
= extractFloat128Exp( a
);
5002 aSign
= extractFloat128Sign( a
);
5003 if ( aExp
== 0x7FFF ) {
5004 if ( aSig0
| aSig1
) {
5005 return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
5007 return packFloat32( aSign
, 0xFF, 0 );
5009 aSig0
|= ( aSig1
!= 0 );
5010 shift64RightJamming( aSig0
, 18, &aSig0
);
5012 if ( aExp
|| zSig
) {
5016 return roundAndPackFloat32( aSign
, aExp
, zSig STATUS_VAR
);
5020 /*----------------------------------------------------------------------------
5021 | Returns the result of converting the quadruple-precision floating-point
5022 | value `a' to the double-precision floating-point format. The conversion
5023 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5025 *----------------------------------------------------------------------------*/
5027 float64
float128_to_float64( float128 a STATUS_PARAM
)
5031 uint64_t aSig0
, aSig1
;
5033 aSig1
= extractFloat128Frac1( a
);
5034 aSig0
= extractFloat128Frac0( a
);
5035 aExp
= extractFloat128Exp( a
);
5036 aSign
= extractFloat128Sign( a
);
5037 if ( aExp
== 0x7FFF ) {
5038 if ( aSig0
| aSig1
) {
5039 return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
5041 return packFloat64( aSign
, 0x7FF, 0 );
5043 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
5044 aSig0
|= ( aSig1
!= 0 );
5045 if ( aExp
|| aSig0
) {
5046 aSig0
|= LIT64( 0x4000000000000000 );
5049 return roundAndPackFloat64( aSign
, aExp
, aSig0 STATUS_VAR
);
5053 /*----------------------------------------------------------------------------
5054 | Returns the result of converting the quadruple-precision floating-point
5055 | value `a' to the extended double-precision floating-point format. The
5056 | conversion is performed according to the IEC/IEEE Standard for Binary
5057 | Floating-Point Arithmetic.
5058 *----------------------------------------------------------------------------*/
5060 floatx80
float128_to_floatx80( float128 a STATUS_PARAM
)
5064 uint64_t aSig0
, aSig1
;
5066 aSig1
= extractFloat128Frac1( a
);
5067 aSig0
= extractFloat128Frac0( a
);
5068 aExp
= extractFloat128Exp( a
);
5069 aSign
= extractFloat128Sign( a
);
5070 if ( aExp
== 0x7FFF ) {
5071 if ( aSig0
| aSig1
) {
5072 return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
5074 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
5077 if ( ( aSig0
| aSig1
) == 0 ) return packFloatx80( aSign
, 0, 0 );
5078 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
5081 aSig0
|= LIT64( 0x0001000000000000 );
5083 shortShift128Left( aSig0
, aSig1
, 15, &aSig0
, &aSig1
);
5084 return roundAndPackFloatx80( 80, aSign
, aExp
, aSig0
, aSig1 STATUS_VAR
);
5088 /*----------------------------------------------------------------------------
5089 | Rounds the quadruple-precision floating-point value `a' to an integer, and
5090 | returns the result as a quadruple-precision floating-point value. The
5091 | operation is performed according to the IEC/IEEE Standard for Binary
5092 | Floating-Point Arithmetic.
5093 *----------------------------------------------------------------------------*/
5095 float128
float128_round_to_int( float128 a STATUS_PARAM
)
5099 uint64_t lastBitMask
, roundBitsMask
;
5103 aExp
= extractFloat128Exp( a
);
5104 if ( 0x402F <= aExp
) {
5105 if ( 0x406F <= aExp
) {
5106 if ( ( aExp
== 0x7FFF )
5107 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) )
5109 return propagateFloat128NaN( a
, a STATUS_VAR
);
5114 lastBitMask
= ( lastBitMask
<<( 0x406E - aExp
) )<<1;
5115 roundBitsMask
= lastBitMask
- 1;
5117 roundingMode
= STATUS(float_rounding_mode
);
5118 if ( roundingMode
== float_round_nearest_even
) {
5119 if ( lastBitMask
) {
5120 add128( z
.high
, z
.low
, 0, lastBitMask
>>1, &z
.high
, &z
.low
);
5121 if ( ( z
.low
& roundBitsMask
) == 0 ) z
.low
&= ~ lastBitMask
;
5124 if ( (int64_t) z
.low
< 0 ) {
5126 if ( (uint64_t) ( z
.low
<<1 ) == 0 ) z
.high
&= ~1;
5130 else if ( roundingMode
!= float_round_to_zero
) {
5131 if ( extractFloat128Sign( z
)
5132 ^ ( roundingMode
== float_round_up
) ) {
5133 add128( z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
5136 z
.low
&= ~ roundBitsMask
;
5139 if ( aExp
< 0x3FFF ) {
5140 if ( ( ( (uint64_t) ( a
.high
<<1 ) ) | a
.low
) == 0 ) return a
;
5141 STATUS(float_exception_flags
) |= float_flag_inexact
;
5142 aSign
= extractFloat128Sign( a
);
5143 switch ( STATUS(float_rounding_mode
) ) {
5144 case float_round_nearest_even
:
5145 if ( ( aExp
== 0x3FFE )
5146 && ( extractFloat128Frac0( a
)
5147 | extractFloat128Frac1( a
) )
5149 return packFloat128( aSign
, 0x3FFF, 0, 0 );
5152 case float_round_down
:
5154 aSign
? packFloat128( 1, 0x3FFF, 0, 0 )
5155 : packFloat128( 0, 0, 0, 0 );
5156 case float_round_up
:
5158 aSign
? packFloat128( 1, 0, 0, 0 )
5159 : packFloat128( 0, 0x3FFF, 0, 0 );
5161 return packFloat128( aSign
, 0, 0, 0 );
5164 lastBitMask
<<= 0x402F - aExp
;
5165 roundBitsMask
= lastBitMask
- 1;
5168 roundingMode
= STATUS(float_rounding_mode
);
5169 if ( roundingMode
== float_round_nearest_even
) {
5170 z
.high
+= lastBitMask
>>1;
5171 if ( ( ( z
.high
& roundBitsMask
) | a
.low
) == 0 ) {
5172 z
.high
&= ~ lastBitMask
;
5175 else if ( roundingMode
!= float_round_to_zero
) {
5176 if ( extractFloat128Sign( z
)
5177 ^ ( roundingMode
== float_round_up
) ) {
5178 z
.high
|= ( a
.low
!= 0 );
5179 z
.high
+= roundBitsMask
;
5182 z
.high
&= ~ roundBitsMask
;
5184 if ( ( z
.low
!= a
.low
) || ( z
.high
!= a
.high
) ) {
5185 STATUS(float_exception_flags
) |= float_flag_inexact
;
5191 /*----------------------------------------------------------------------------
5192 | Returns the result of adding the absolute values of the quadruple-precision
5193 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
5194 | before being returned. `zSign' is ignored if the result is a NaN.
5195 | The addition is performed according to the IEC/IEEE Standard for Binary
5196 | Floating-Point Arithmetic.
5197 *----------------------------------------------------------------------------*/
5199 static float128
addFloat128Sigs( float128 a
, float128 b
, flag zSign STATUS_PARAM
)
5201 int32 aExp
, bExp
, zExp
;
5202 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
5205 aSig1
= extractFloat128Frac1( a
);
5206 aSig0
= extractFloat128Frac0( a
);
5207 aExp
= extractFloat128Exp( a
);
5208 bSig1
= extractFloat128Frac1( b
);
5209 bSig0
= extractFloat128Frac0( b
);
5210 bExp
= extractFloat128Exp( b
);
5211 expDiff
= aExp
- bExp
;
5212 if ( 0 < expDiff
) {
5213 if ( aExp
== 0x7FFF ) {
5214 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
5221 bSig0
|= LIT64( 0x0001000000000000 );
5223 shift128ExtraRightJamming(
5224 bSig0
, bSig1
, 0, expDiff
, &bSig0
, &bSig1
, &zSig2
);
5227 else if ( expDiff
< 0 ) {
5228 if ( bExp
== 0x7FFF ) {
5229 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
5230 return packFloat128( zSign
, 0x7FFF, 0, 0 );
5236 aSig0
|= LIT64( 0x0001000000000000 );
5238 shift128ExtraRightJamming(
5239 aSig0
, aSig1
, 0, - expDiff
, &aSig0
, &aSig1
, &zSig2
);
5243 if ( aExp
== 0x7FFF ) {
5244 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
5245 return propagateFloat128NaN( a
, b STATUS_VAR
);
5249 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
5251 if (STATUS(flush_to_zero
)) {
5252 if (zSig0
| zSig1
) {
5253 float_raise(float_flag_output_denormal STATUS_VAR
);
5255 return packFloat128(zSign
, 0, 0, 0);
5257 return packFloat128( zSign
, 0, zSig0
, zSig1
);
5260 zSig0
|= LIT64( 0x0002000000000000 );
5264 aSig0
|= LIT64( 0x0001000000000000 );
5265 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
5267 if ( zSig0
< LIT64( 0x0002000000000000 ) ) goto roundAndPack
;
5270 shift128ExtraRightJamming(
5271 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
5273 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2 STATUS_VAR
);
5277 /*----------------------------------------------------------------------------
5278 | Returns the result of subtracting the absolute values of the quadruple-
5279 | precision floating-point values `a' and `b'. If `zSign' is 1, the
5280 | difference is negated before being returned. `zSign' is ignored if the
5281 | result is a NaN. The subtraction is performed according to the IEC/IEEE
5282 | Standard for Binary Floating-Point Arithmetic.
5283 *----------------------------------------------------------------------------*/
5285 static float128
subFloat128Sigs( float128 a
, float128 b
, flag zSign STATUS_PARAM
)
5287 int32 aExp
, bExp
, zExp
;
5288 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
;
5292 aSig1
= extractFloat128Frac1( a
);
5293 aSig0
= extractFloat128Frac0( a
);
5294 aExp
= extractFloat128Exp( a
);
5295 bSig1
= extractFloat128Frac1( b
);
5296 bSig0
= extractFloat128Frac0( b
);
5297 bExp
= extractFloat128Exp( b
);
5298 expDiff
= aExp
- bExp
;
5299 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
5300 shortShift128Left( bSig0
, bSig1
, 14, &bSig0
, &bSig1
);
5301 if ( 0 < expDiff
) goto aExpBigger
;
5302 if ( expDiff
< 0 ) goto bExpBigger
;
5303 if ( aExp
== 0x7FFF ) {
5304 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
5305 return propagateFloat128NaN( a
, b STATUS_VAR
);
5307 float_raise( float_flag_invalid STATUS_VAR
);
5308 z
.low
= float128_default_nan_low
;
5309 z
.high
= float128_default_nan_high
;
5316 if ( bSig0
< aSig0
) goto aBigger
;
5317 if ( aSig0
< bSig0
) goto bBigger
;
5318 if ( bSig1
< aSig1
) goto aBigger
;
5319 if ( aSig1
< bSig1
) goto bBigger
;
5320 return packFloat128( STATUS(float_rounding_mode
) == float_round_down
, 0, 0, 0 );
5322 if ( bExp
== 0x7FFF ) {
5323 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
5324 return packFloat128( zSign
^ 1, 0x7FFF, 0, 0 );
5330 aSig0
|= LIT64( 0x4000000000000000 );
5332 shift128RightJamming( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
5333 bSig0
|= LIT64( 0x4000000000000000 );
5335 sub128( bSig0
, bSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
5338 goto normalizeRoundAndPack
;
5340 if ( aExp
== 0x7FFF ) {
5341 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
5348 bSig0
|= LIT64( 0x4000000000000000 );
5350 shift128RightJamming( bSig0
, bSig1
, expDiff
, &bSig0
, &bSig1
);
5351 aSig0
|= LIT64( 0x4000000000000000 );
5353 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
5355 normalizeRoundAndPack
:
5357 return normalizeRoundAndPackFloat128( zSign
, zExp
- 14, zSig0
, zSig1 STATUS_VAR
);
5361 /*----------------------------------------------------------------------------
5362 | Returns the result of adding the quadruple-precision floating-point values
5363 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
5364 | for Binary Floating-Point Arithmetic.
5365 *----------------------------------------------------------------------------*/
5367 float128
float128_add( float128 a
, float128 b STATUS_PARAM
)
5371 aSign
= extractFloat128Sign( a
);
5372 bSign
= extractFloat128Sign( b
);
5373 if ( aSign
== bSign
) {
5374 return addFloat128Sigs( a
, b
, aSign STATUS_VAR
);
5377 return subFloat128Sigs( a
, b
, aSign STATUS_VAR
);
5382 /*----------------------------------------------------------------------------
5383 | Returns the result of subtracting the quadruple-precision floating-point
5384 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5385 | Standard for Binary Floating-Point Arithmetic.
5386 *----------------------------------------------------------------------------*/
5388 float128
float128_sub( float128 a
, float128 b STATUS_PARAM
)
5392 aSign
= extractFloat128Sign( a
);
5393 bSign
= extractFloat128Sign( b
);
5394 if ( aSign
== bSign
) {
5395 return subFloat128Sigs( a
, b
, aSign STATUS_VAR
);
5398 return addFloat128Sigs( a
, b
, aSign STATUS_VAR
);
5403 /*----------------------------------------------------------------------------
5404 | Returns the result of multiplying the quadruple-precision floating-point
5405 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5406 | Standard for Binary Floating-Point Arithmetic.
5407 *----------------------------------------------------------------------------*/
5409 float128
float128_mul( float128 a
, float128 b STATUS_PARAM
)
5411 flag aSign
, bSign
, zSign
;
5412 int32 aExp
, bExp
, zExp
;
5413 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
, zSig3
;
5416 aSig1
= extractFloat128Frac1( a
);
5417 aSig0
= extractFloat128Frac0( a
);
5418 aExp
= extractFloat128Exp( a
);
5419 aSign
= extractFloat128Sign( a
);
5420 bSig1
= extractFloat128Frac1( b
);
5421 bSig0
= extractFloat128Frac0( b
);
5422 bExp
= extractFloat128Exp( b
);
5423 bSign
= extractFloat128Sign( b
);
5424 zSign
= aSign
^ bSign
;
5425 if ( aExp
== 0x7FFF ) {
5426 if ( ( aSig0
| aSig1
)
5427 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
5428 return propagateFloat128NaN( a
, b STATUS_VAR
);
5430 if ( ( bExp
| bSig0
| bSig1
) == 0 ) goto invalid
;
5431 return packFloat128( zSign
, 0x7FFF, 0, 0 );
5433 if ( bExp
== 0x7FFF ) {
5434 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
5435 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
5437 float_raise( float_flag_invalid STATUS_VAR
);
5438 z
.low
= float128_default_nan_low
;
5439 z
.high
= float128_default_nan_high
;
5442 return packFloat128( zSign
, 0x7FFF, 0, 0 );
5445 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
5446 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
5449 if ( ( bSig0
| bSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
5450 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
5452 zExp
= aExp
+ bExp
- 0x4000;
5453 aSig0
|= LIT64( 0x0001000000000000 );
5454 shortShift128Left( bSig0
, bSig1
, 16, &bSig0
, &bSig1
);
5455 mul128To256( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
, &zSig2
, &zSig3
);
5456 add128( zSig0
, zSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
5457 zSig2
|= ( zSig3
!= 0 );
5458 if ( LIT64( 0x0002000000000000 ) <= zSig0
) {
5459 shift128ExtraRightJamming(
5460 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
5463 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2 STATUS_VAR
);
5467 /*----------------------------------------------------------------------------
5468 | Returns the result of dividing the quadruple-precision floating-point value
5469 | `a' by the corresponding value `b'. The operation is performed according to
5470 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5471 *----------------------------------------------------------------------------*/
5473 float128
float128_div( float128 a
, float128 b STATUS_PARAM
)
5475 flag aSign
, bSign
, zSign
;
5476 int32 aExp
, bExp
, zExp
;
5477 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
5478 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
5481 aSig1
= extractFloat128Frac1( a
);
5482 aSig0
= extractFloat128Frac0( a
);
5483 aExp
= extractFloat128Exp( a
);
5484 aSign
= extractFloat128Sign( a
);
5485 bSig1
= extractFloat128Frac1( b
);
5486 bSig0
= extractFloat128Frac0( b
);
5487 bExp
= extractFloat128Exp( b
);
5488 bSign
= extractFloat128Sign( b
);
5489 zSign
= aSign
^ bSign
;
5490 if ( aExp
== 0x7FFF ) {
5491 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
5492 if ( bExp
== 0x7FFF ) {
5493 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
5496 return packFloat128( zSign
, 0x7FFF, 0, 0 );
5498 if ( bExp
== 0x7FFF ) {
5499 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
5500 return packFloat128( zSign
, 0, 0, 0 );
5503 if ( ( bSig0
| bSig1
) == 0 ) {
5504 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
5506 float_raise( float_flag_invalid STATUS_VAR
);
5507 z
.low
= float128_default_nan_low
;
5508 z
.high
= float128_default_nan_high
;
5511 float_raise( float_flag_divbyzero STATUS_VAR
);
5512 return packFloat128( zSign
, 0x7FFF, 0, 0 );
5514 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
5517 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
5518 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
5520 zExp
= aExp
- bExp
+ 0x3FFD;
5522 aSig0
| LIT64( 0x0001000000000000 ), aSig1
, 15, &aSig0
, &aSig1
);
5524 bSig0
| LIT64( 0x0001000000000000 ), bSig1
, 15, &bSig0
, &bSig1
);
5525 if ( le128( bSig0
, bSig1
, aSig0
, aSig1
) ) {
5526 shift128Right( aSig0
, aSig1
, 1, &aSig0
, &aSig1
);
5529 zSig0
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
5530 mul128By64To192( bSig0
, bSig1
, zSig0
, &term0
, &term1
, &term2
);
5531 sub192( aSig0
, aSig1
, 0, term0
, term1
, term2
, &rem0
, &rem1
, &rem2
);
5532 while ( (int64_t) rem0
< 0 ) {
5534 add192( rem0
, rem1
, rem2
, 0, bSig0
, bSig1
, &rem0
, &rem1
, &rem2
);
5536 zSig1
= estimateDiv128To64( rem1
, rem2
, bSig0
);
5537 if ( ( zSig1
& 0x3FFF ) <= 4 ) {
5538 mul128By64To192( bSig0
, bSig1
, zSig1
, &term1
, &term2
, &term3
);
5539 sub192( rem1
, rem2
, 0, term1
, term2
, term3
, &rem1
, &rem2
, &rem3
);
5540 while ( (int64_t) rem1
< 0 ) {
5542 add192( rem1
, rem2
, rem3
, 0, bSig0
, bSig1
, &rem1
, &rem2
, &rem3
);
5544 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
5546 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 15, &zSig0
, &zSig1
, &zSig2
);
5547 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2 STATUS_VAR
);
5551 /*----------------------------------------------------------------------------
5552 | Returns the remainder of the quadruple-precision floating-point value `a'
5553 | with respect to the corresponding value `b'. The operation is performed
5554 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5555 *----------------------------------------------------------------------------*/
5557 float128
float128_rem( float128 a
, float128 b STATUS_PARAM
)
5560 int32 aExp
, bExp
, expDiff
;
5561 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, q
, term0
, term1
, term2
;
5562 uint64_t allZero
, alternateASig0
, alternateASig1
, sigMean1
;
5566 aSig1
= extractFloat128Frac1( a
);
5567 aSig0
= extractFloat128Frac0( a
);
5568 aExp
= extractFloat128Exp( a
);
5569 aSign
= extractFloat128Sign( a
);
5570 bSig1
= extractFloat128Frac1( b
);
5571 bSig0
= extractFloat128Frac0( b
);
5572 bExp
= extractFloat128Exp( b
);
5573 if ( aExp
== 0x7FFF ) {
5574 if ( ( aSig0
| aSig1
)
5575 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
5576 return propagateFloat128NaN( a
, b STATUS_VAR
);
5580 if ( bExp
== 0x7FFF ) {
5581 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
5585 if ( ( bSig0
| bSig1
) == 0 ) {
5587 float_raise( float_flag_invalid STATUS_VAR
);
5588 z
.low
= float128_default_nan_low
;
5589 z
.high
= float128_default_nan_high
;
5592 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
5595 if ( ( aSig0
| aSig1
) == 0 ) return a
;
5596 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
5598 expDiff
= aExp
- bExp
;
5599 if ( expDiff
< -1 ) return a
;
5601 aSig0
| LIT64( 0x0001000000000000 ),
5603 15 - ( expDiff
< 0 ),
5608 bSig0
| LIT64( 0x0001000000000000 ), bSig1
, 15, &bSig0
, &bSig1
);
5609 q
= le128( bSig0
, bSig1
, aSig0
, aSig1
);
5610 if ( q
) sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
5612 while ( 0 < expDiff
) {
5613 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
5614 q
= ( 4 < q
) ? q
- 4 : 0;
5615 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
5616 shortShift192Left( term0
, term1
, term2
, 61, &term1
, &term2
, &allZero
);
5617 shortShift128Left( aSig0
, aSig1
, 61, &aSig0
, &allZero
);
5618 sub128( aSig0
, 0, term1
, term2
, &aSig0
, &aSig1
);
5621 if ( -64 < expDiff
) {
5622 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
5623 q
= ( 4 < q
) ? q
- 4 : 0;
5625 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
5627 if ( expDiff
< 0 ) {
5628 shift128Right( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
5631 shortShift128Left( aSig0
, aSig1
, expDiff
, &aSig0
, &aSig1
);
5633 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
5634 sub128( aSig0
, aSig1
, term1
, term2
, &aSig0
, &aSig1
);
5637 shift128Right( aSig0
, aSig1
, 12, &aSig0
, &aSig1
);
5638 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
5641 alternateASig0
= aSig0
;
5642 alternateASig1
= aSig1
;
5644 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
5645 } while ( 0 <= (int64_t) aSig0
);
5647 aSig0
, aSig1
, alternateASig0
, alternateASig1
, (uint64_t *)&sigMean0
, &sigMean1
);
5648 if ( ( sigMean0
< 0 )
5649 || ( ( ( sigMean0
| sigMean1
) == 0 ) && ( q
& 1 ) ) ) {
5650 aSig0
= alternateASig0
;
5651 aSig1
= alternateASig1
;
5653 zSign
= ( (int64_t) aSig0
< 0 );
5654 if ( zSign
) sub128( 0, 0, aSig0
, aSig1
, &aSig0
, &aSig1
);
5656 normalizeRoundAndPackFloat128( aSign
^ zSign
, bExp
- 4, aSig0
, aSig1 STATUS_VAR
);
5660 /*----------------------------------------------------------------------------
5661 | Returns the square root of the quadruple-precision floating-point value `a'.
5662 | The operation is performed according to the IEC/IEEE Standard for Binary
5663 | Floating-Point Arithmetic.
5664 *----------------------------------------------------------------------------*/
5666 float128
float128_sqrt( float128 a STATUS_PARAM
)
5670 uint64_t aSig0
, aSig1
, zSig0
, zSig1
, zSig2
, doubleZSig0
;
5671 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
5674 aSig1
= extractFloat128Frac1( a
);
5675 aSig0
= extractFloat128Frac0( a
);
5676 aExp
= extractFloat128Exp( a
);
5677 aSign
= extractFloat128Sign( a
);
5678 if ( aExp
== 0x7FFF ) {
5679 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, a STATUS_VAR
);
5680 if ( ! aSign
) return a
;
5684 if ( ( aExp
| aSig0
| aSig1
) == 0 ) return a
;
5686 float_raise( float_flag_invalid STATUS_VAR
);
5687 z
.low
= float128_default_nan_low
;
5688 z
.high
= float128_default_nan_high
;
5692 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( 0, 0, 0, 0 );
5693 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
5695 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFE;
5696 aSig0
|= LIT64( 0x0001000000000000 );
5697 zSig0
= estimateSqrt32( aExp
, aSig0
>>17 );
5698 shortShift128Left( aSig0
, aSig1
, 13 - ( aExp
& 1 ), &aSig0
, &aSig1
);
5699 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
5700 doubleZSig0
= zSig0
<<1;
5701 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
5702 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
5703 while ( (int64_t) rem0
< 0 ) {
5706 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
5708 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
5709 if ( ( zSig1
& 0x1FFF ) <= 5 ) {
5710 if ( zSig1
== 0 ) zSig1
= 1;
5711 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
5712 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
5713 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
5714 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
5715 while ( (int64_t) rem1
< 0 ) {
5717 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
5719 term2
|= doubleZSig0
;
5720 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
5722 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
5724 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 14, &zSig0
, &zSig1
, &zSig2
);
5725 return roundAndPackFloat128( 0, zExp
, zSig0
, zSig1
, zSig2 STATUS_VAR
);
5729 /*----------------------------------------------------------------------------
5730 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5731 | the corresponding value `b', and 0 otherwise. The invalid exception is
5732 | raised if either operand is a NaN. Otherwise, the comparison is performed
5733 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5734 *----------------------------------------------------------------------------*/
5736 int float128_eq( float128 a
, float128 b STATUS_PARAM
)
5739 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5740 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5741 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5742 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5744 float_raise( float_flag_invalid STATUS_VAR
);
5749 && ( ( a
.high
== b
.high
)
5751 && ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
5756 /*----------------------------------------------------------------------------
5757 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5758 | or equal to the corresponding value `b', and 0 otherwise. The invalid
5759 | exception is raised if either operand is a NaN. The comparison is performed
5760 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5761 *----------------------------------------------------------------------------*/
5763 int float128_le( float128 a
, float128 b STATUS_PARAM
)
5767 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5768 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5769 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5770 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5772 float_raise( float_flag_invalid STATUS_VAR
);
5775 aSign
= extractFloat128Sign( a
);
5776 bSign
= extractFloat128Sign( b
);
5777 if ( aSign
!= bSign
) {
5780 || ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5784 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
5785 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
5789 /*----------------------------------------------------------------------------
5790 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5791 | the corresponding value `b', and 0 otherwise. The invalid exception is
5792 | raised if either operand is a NaN. The comparison is performed according
5793 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5794 *----------------------------------------------------------------------------*/
5796 int float128_lt( float128 a
, float128 b STATUS_PARAM
)
5800 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5801 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5802 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5803 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5805 float_raise( float_flag_invalid STATUS_VAR
);
5808 aSign
= extractFloat128Sign( a
);
5809 bSign
= extractFloat128Sign( b
);
5810 if ( aSign
!= bSign
) {
5813 && ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5817 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
5818 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
5822 /*----------------------------------------------------------------------------
5823 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
5824 | be compared, and 0 otherwise. The invalid exception is raised if either
5825 | operand is a NaN. The comparison is performed according to the IEC/IEEE
5826 | Standard for Binary Floating-Point Arithmetic.
5827 *----------------------------------------------------------------------------*/
5829 int float128_unordered( float128 a
, float128 b STATUS_PARAM
)
5831 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5832 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5833 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5834 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5836 float_raise( float_flag_invalid STATUS_VAR
);
5842 /*----------------------------------------------------------------------------
5843 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5844 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5845 | exception. The comparison is performed according to the IEC/IEEE Standard
5846 | for Binary Floating-Point Arithmetic.
5847 *----------------------------------------------------------------------------*/
5849 int float128_eq_quiet( float128 a
, float128 b STATUS_PARAM
)
5852 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5853 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5854 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5855 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5857 if ( float128_is_signaling_nan( a
)
5858 || float128_is_signaling_nan( b
) ) {
5859 float_raise( float_flag_invalid STATUS_VAR
);
5865 && ( ( a
.high
== b
.high
)
5867 && ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
5872 /*----------------------------------------------------------------------------
5873 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5874 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5875 | cause an exception. Otherwise, the comparison is performed according to the
5876 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5877 *----------------------------------------------------------------------------*/
5879 int float128_le_quiet( float128 a
, float128 b STATUS_PARAM
)
5883 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5884 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5885 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5886 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5888 if ( float128_is_signaling_nan( a
)
5889 || float128_is_signaling_nan( b
) ) {
5890 float_raise( float_flag_invalid STATUS_VAR
);
5894 aSign
= extractFloat128Sign( a
);
5895 bSign
= extractFloat128Sign( b
);
5896 if ( aSign
!= bSign
) {
5899 || ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5903 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
5904 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
5908 /*----------------------------------------------------------------------------
5909 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5910 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5911 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
5912 | Standard for Binary Floating-Point Arithmetic.
5913 *----------------------------------------------------------------------------*/
5915 int float128_lt_quiet( float128 a
, float128 b STATUS_PARAM
)
5919 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5920 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5921 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5922 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5924 if ( float128_is_signaling_nan( a
)
5925 || float128_is_signaling_nan( b
) ) {
5926 float_raise( float_flag_invalid STATUS_VAR
);
5930 aSign
= extractFloat128Sign( a
);
5931 bSign
= extractFloat128Sign( b
);
5932 if ( aSign
!= bSign
) {
5935 && ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5939 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
5940 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
5944 /*----------------------------------------------------------------------------
5945 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
5946 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
5947 | comparison is performed according to the IEC/IEEE Standard for Binary
5948 | Floating-Point Arithmetic.
5949 *----------------------------------------------------------------------------*/
5951 int float128_unordered_quiet( float128 a
, float128 b STATUS_PARAM
)
5953 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5954 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5955 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5956 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5958 if ( float128_is_signaling_nan( a
)
5959 || float128_is_signaling_nan( b
) ) {
5960 float_raise( float_flag_invalid STATUS_VAR
);
5967 /* misc functions */
5968 float32
uint32_to_float32( uint32 a STATUS_PARAM
)
5970 return int64_to_float32(a STATUS_VAR
);
5973 float64
uint32_to_float64( uint32 a STATUS_PARAM
)
5975 return int64_to_float64(a STATUS_VAR
);
5978 uint32
float32_to_uint32( float32 a STATUS_PARAM
)
5983 v
= float32_to_int64(a STATUS_VAR
);
5986 float_raise( float_flag_invalid STATUS_VAR
);
5987 } else if (v
> 0xffffffff) {
5989 float_raise( float_flag_invalid STATUS_VAR
);
5996 uint32
float32_to_uint32_round_to_zero( float32 a STATUS_PARAM
)
6001 v
= float32_to_int64_round_to_zero(a STATUS_VAR
);
6004 float_raise( float_flag_invalid STATUS_VAR
);
6005 } else if (v
> 0xffffffff) {
6007 float_raise( float_flag_invalid STATUS_VAR
);
6014 uint16
float32_to_uint16_round_to_zero( float32 a STATUS_PARAM
)
6019 v
= float32_to_int64_round_to_zero(a STATUS_VAR
);
6022 float_raise( float_flag_invalid STATUS_VAR
);
6023 } else if (v
> 0xffff) {
6025 float_raise( float_flag_invalid STATUS_VAR
);
6032 uint32
float64_to_uint32( float64 a STATUS_PARAM
)
6037 v
= float64_to_int64(a STATUS_VAR
);
6040 float_raise( float_flag_invalid STATUS_VAR
);
6041 } else if (v
> 0xffffffff) {
6043 float_raise( float_flag_invalid STATUS_VAR
);
6050 uint32
float64_to_uint32_round_to_zero( float64 a STATUS_PARAM
)
6055 v
= float64_to_int64_round_to_zero(a STATUS_VAR
);
6058 float_raise( float_flag_invalid STATUS_VAR
);
6059 } else if (v
> 0xffffffff) {
6061 float_raise( float_flag_invalid STATUS_VAR
);
6068 uint16
float64_to_uint16_round_to_zero( float64 a STATUS_PARAM
)
6073 v
= float64_to_int64_round_to_zero(a STATUS_VAR
);
6076 float_raise( float_flag_invalid STATUS_VAR
);
6077 } else if (v
> 0xffff) {
6079 float_raise( float_flag_invalid STATUS_VAR
);
6086 /* FIXME: This looks broken. */
6087 uint64_t float64_to_uint64 (float64 a STATUS_PARAM
)
6091 v
= float64_val(int64_to_float64(INT64_MIN STATUS_VAR
));
6092 v
+= float64_val(a
);
6093 v
= float64_to_int64(make_float64(v
) STATUS_VAR
);
6095 return v
- INT64_MIN
;
6098 uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM
)
6102 v
= float64_val(int64_to_float64(INT64_MIN STATUS_VAR
));
6103 v
+= float64_val(a
);
6104 v
= float64_to_int64_round_to_zero(make_float64(v
) STATUS_VAR
);
6106 return v
- INT64_MIN
;
6109 #define COMPARE(s, nan_exp) \
6110 INLINE int float ## s ## _compare_internal( float ## s a, float ## s b, \
6111 int is_quiet STATUS_PARAM ) \
6113 flag aSign, bSign; \
6114 uint ## s ## _t av, bv; \
6115 a = float ## s ## _squash_input_denormal(a STATUS_VAR); \
6116 b = float ## s ## _squash_input_denormal(b STATUS_VAR); \
6118 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \
6119 extractFloat ## s ## Frac( a ) ) || \
6120 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \
6121 extractFloat ## s ## Frac( b ) )) { \
6123 float ## s ## _is_signaling_nan( a ) || \
6124 float ## s ## _is_signaling_nan( b ) ) { \
6125 float_raise( float_flag_invalid STATUS_VAR); \
6127 return float_relation_unordered; \
6129 aSign = extractFloat ## s ## Sign( a ); \
6130 bSign = extractFloat ## s ## Sign( b ); \
6131 av = float ## s ## _val(a); \
6132 bv = float ## s ## _val(b); \
6133 if ( aSign != bSign ) { \
6134 if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) { \
6136 return float_relation_equal; \
6138 return 1 - (2 * aSign); \
6142 return float_relation_equal; \
6144 return 1 - 2 * (aSign ^ ( av < bv )); \
6149 int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM ) \
6151 return float ## s ## _compare_internal(a, b, 0 STATUS_VAR); \
6154 int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM ) \
6156 return float ## s ## _compare_internal(a, b, 1 STATUS_VAR); \
6162 INLINE
int floatx80_compare_internal( floatx80 a
, floatx80 b
,
6163 int is_quiet STATUS_PARAM
)
6167 if (( ( extractFloatx80Exp( a
) == 0x7fff ) &&
6168 ( extractFloatx80Frac( a
)<<1 ) ) ||
6169 ( ( extractFloatx80Exp( b
) == 0x7fff ) &&
6170 ( extractFloatx80Frac( b
)<<1 ) )) {
6172 floatx80_is_signaling_nan( a
) ||
6173 floatx80_is_signaling_nan( b
) ) {
6174 float_raise( float_flag_invalid STATUS_VAR
);
6176 return float_relation_unordered
;
6178 aSign
= extractFloatx80Sign( a
);
6179 bSign
= extractFloatx80Sign( b
);
6180 if ( aSign
!= bSign
) {
6182 if ( ( ( (uint16_t) ( ( a
.high
| b
.high
) << 1 ) ) == 0) &&
6183 ( ( a
.low
| b
.low
) == 0 ) ) {
6185 return float_relation_equal
;
6187 return 1 - (2 * aSign
);
6190 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
6191 return float_relation_equal
;
6193 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
6198 int floatx80_compare( floatx80 a
, floatx80 b STATUS_PARAM
)
6200 return floatx80_compare_internal(a
, b
, 0 STATUS_VAR
);
6203 int floatx80_compare_quiet( floatx80 a
, floatx80 b STATUS_PARAM
)
6205 return floatx80_compare_internal(a
, b
, 1 STATUS_VAR
);
6208 INLINE
int float128_compare_internal( float128 a
, float128 b
,
6209 int is_quiet STATUS_PARAM
)
6213 if (( ( extractFloat128Exp( a
) == 0x7fff ) &&
6214 ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) ) ||
6215 ( ( extractFloat128Exp( b
) == 0x7fff ) &&
6216 ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )) {
6218 float128_is_signaling_nan( a
) ||
6219 float128_is_signaling_nan( b
) ) {
6220 float_raise( float_flag_invalid STATUS_VAR
);
6222 return float_relation_unordered
;
6224 aSign
= extractFloat128Sign( a
);
6225 bSign
= extractFloat128Sign( b
);
6226 if ( aSign
!= bSign
) {
6227 if ( ( ( ( a
.high
| b
.high
)<<1 ) | a
.low
| b
.low
) == 0 ) {
6229 return float_relation_equal
;
6231 return 1 - (2 * aSign
);
6234 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
6235 return float_relation_equal
;
6237 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
6242 int float128_compare( float128 a
, float128 b STATUS_PARAM
)
6244 return float128_compare_internal(a
, b
, 0 STATUS_VAR
);
6247 int float128_compare_quiet( float128 a
, float128 b STATUS_PARAM
)
6249 return float128_compare_internal(a
, b
, 1 STATUS_VAR
);
6252 /* min() and max() functions. These can't be implemented as
6253 * 'compare and pick one input' because that would mishandle
6254 * NaNs and +0 vs -0.
6256 #define MINMAX(s, nan_exp) \
6257 INLINE float ## s float ## s ## _minmax(float ## s a, float ## s b, \
6258 int ismin STATUS_PARAM ) \
6260 flag aSign, bSign; \
6261 uint ## s ## _t av, bv; \
6262 a = float ## s ## _squash_input_denormal(a STATUS_VAR); \
6263 b = float ## s ## _squash_input_denormal(b STATUS_VAR); \
6264 if (float ## s ## _is_any_nan(a) || \
6265 float ## s ## _is_any_nan(b)) { \
6266 return propagateFloat ## s ## NaN(a, b STATUS_VAR); \
6268 aSign = extractFloat ## s ## Sign(a); \
6269 bSign = extractFloat ## s ## Sign(b); \
6270 av = float ## s ## _val(a); \
6271 bv = float ## s ## _val(b); \
6272 if (aSign != bSign) { \
6274 return aSign ? a : b; \
6276 return aSign ? b : a; \
6280 return (aSign ^ (av < bv)) ? a : b; \
6282 return (aSign ^ (av < bv)) ? b : a; \
6287 float ## s float ## s ## _min(float ## s a, float ## s b STATUS_PARAM) \
6289 return float ## s ## _minmax(a, b, 1 STATUS_VAR); \
6292 float ## s float ## s ## _max(float ## s a, float ## s b STATUS_PARAM) \
6294 return float ## s ## _minmax(a, b, 0 STATUS_VAR); \
6301 /* Multiply A by 2 raised to the power N. */
6302 float32
float32_scalbn( float32 a
, int n STATUS_PARAM
)
6308 a
= float32_squash_input_denormal(a STATUS_VAR
);
6309 aSig
= extractFloat32Frac( a
);
6310 aExp
= extractFloat32Exp( a
);
6311 aSign
= extractFloat32Sign( a
);
6313 if ( aExp
== 0xFF ) {
6315 return propagateFloat32NaN( a
, a STATUS_VAR
);
6321 else if ( aSig
== 0 )
6326 } else if (n
< -0x200) {
6332 return normalizeRoundAndPackFloat32( aSign
, aExp
, aSig STATUS_VAR
);
6335 float64
float64_scalbn( float64 a
, int n STATUS_PARAM
)
6341 a
= float64_squash_input_denormal(a STATUS_VAR
);
6342 aSig
= extractFloat64Frac( a
);
6343 aExp
= extractFloat64Exp( a
);
6344 aSign
= extractFloat64Sign( a
);
6346 if ( aExp
== 0x7FF ) {
6348 return propagateFloat64NaN( a
, a STATUS_VAR
);
6353 aSig
|= LIT64( 0x0010000000000000 );
6354 else if ( aSig
== 0 )
6359 } else if (n
< -0x1000) {
6365 return normalizeRoundAndPackFloat64( aSign
, aExp
, aSig STATUS_VAR
);
6368 floatx80
floatx80_scalbn( floatx80 a
, int n STATUS_PARAM
)
6374 aSig
= extractFloatx80Frac( a
);
6375 aExp
= extractFloatx80Exp( a
);
6376 aSign
= extractFloatx80Sign( a
);
6378 if ( aExp
== 0x7FFF ) {
6380 return propagateFloatx80NaN( a
, a STATUS_VAR
);
6385 if (aExp
== 0 && aSig
== 0)
6390 } else if (n
< -0x10000) {
6395 return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision
),
6396 aSign
, aExp
, aSig
, 0 STATUS_VAR
);
6399 float128
float128_scalbn( float128 a
, int n STATUS_PARAM
)
6403 uint64_t aSig0
, aSig1
;
6405 aSig1
= extractFloat128Frac1( a
);
6406 aSig0
= extractFloat128Frac0( a
);
6407 aExp
= extractFloat128Exp( a
);
6408 aSign
= extractFloat128Sign( a
);
6409 if ( aExp
== 0x7FFF ) {
6410 if ( aSig0
| aSig1
) {
6411 return propagateFloat128NaN( a
, a STATUS_VAR
);
6416 aSig0
|= LIT64( 0x0001000000000000 );
6417 else if ( aSig0
== 0 && aSig1
== 0 )
6422 } else if (n
< -0x10000) {
6427 return normalizeRoundAndPackFloat128( aSign
, aExp
, aSig0
, aSig1