4 * Derived from SoftFloat.
7 /*============================================================================
9 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
12 Written by John R. Hauser. This work was made possible in part by the
13 International Computer Science Institute, located at Suite 600, 1947 Center
14 Street, Berkeley, California 94704. Funding was partially provided by the
15 National Science Foundation under grant MIP-9311980. The original version
16 of this code was written as part of a project to build a fixed-point vector
17 processor in collaboration with the University of California at Berkeley,
18 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
19 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
20 arithmetic/SoftFloat.html'.
22 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
23 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
24 RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
25 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
26 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
27 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
28 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
29 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
31 Derivative works are acceptable, even for commercial purposes, so long as
32 (1) the source code for the derivative work includes prominent notice that
33 the work is derivative, and (2) the source code includes prominent notice with
34 these four paragraphs for those parts of this code that are retained.
36 =============================================================================*/
38 /* softfloat (and in particular the code in softfloat-specialize.h) is
39 * target-dependent and needs the TARGET_* macros.
43 #include "softfloat.h"
45 /*----------------------------------------------------------------------------
46 | Primitive arithmetic functions, including multi-word arithmetic, and
47 | division and square root approximations. (Can be specialized to target if
49 *----------------------------------------------------------------------------*/
50 #include "softfloat-macros.h"
52 /*----------------------------------------------------------------------------
53 | Functions and definitions to determine: (1) whether tininess for underflow
54 | is detected before or after rounding by default, (2) what (if anything)
55 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
56 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
57 | are propagated from function inputs to output. These details are target-
59 *----------------------------------------------------------------------------*/
60 #include "softfloat-specialize.h"
62 void set_float_rounding_mode(int val STATUS_PARAM
)
64 STATUS(float_rounding_mode
) = val
;
67 void set_float_exception_flags(int val STATUS_PARAM
)
69 STATUS(float_exception_flags
) = val
;
72 void set_floatx80_rounding_precision(int val STATUS_PARAM
)
74 STATUS(floatx80_rounding_precision
) = val
;
77 /*----------------------------------------------------------------------------
78 | Returns the fraction bits of the half-precision floating-point value `a'.
79 *----------------------------------------------------------------------------*/
81 INLINE
uint32_t extractFloat16Frac(float16 a
)
83 return float16_val(a
) & 0x3ff;
86 /*----------------------------------------------------------------------------
87 | Returns the exponent bits of the half-precision floating-point value `a'.
88 *----------------------------------------------------------------------------*/
90 INLINE int16
extractFloat16Exp(float16 a
)
92 return (float16_val(a
) >> 10) & 0x1f;
95 /*----------------------------------------------------------------------------
96 | Returns the sign bit of the single-precision floating-point value `a'.
97 *----------------------------------------------------------------------------*/
99 INLINE flag
extractFloat16Sign(float16 a
)
101 return float16_val(a
)>>15;
104 /*----------------------------------------------------------------------------
105 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
106 | and 7, and returns the properly rounded 32-bit integer corresponding to the
107 | input. If `zSign' is 1, the input is negated before being converted to an
108 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
109 | is simply rounded to an integer, with the inexact exception raised if the
110 | input cannot be represented exactly as an integer. However, if the fixed-
111 | point input is too large, the invalid exception is raised and the largest
112 | positive or negative integer is returned.
113 *----------------------------------------------------------------------------*/
115 static int32
roundAndPackInt32( flag zSign
, uint64_t absZ STATUS_PARAM
)
118 flag roundNearestEven
;
119 int8 roundIncrement
, roundBits
;
122 roundingMode
= STATUS(float_rounding_mode
);
123 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
124 roundIncrement
= 0x40;
125 if ( ! roundNearestEven
) {
126 if ( roundingMode
== float_round_to_zero
) {
130 roundIncrement
= 0x7F;
132 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
135 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
139 roundBits
= absZ
& 0x7F;
140 absZ
= ( absZ
+ roundIncrement
)>>7;
141 absZ
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
143 if ( zSign
) z
= - z
;
144 if ( ( absZ
>>32 ) || ( z
&& ( ( z
< 0 ) ^ zSign
) ) ) {
145 float_raise( float_flag_invalid STATUS_VAR
);
146 return zSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
148 if ( roundBits
) STATUS(float_exception_flags
) |= float_flag_inexact
;
153 /*----------------------------------------------------------------------------
154 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
155 | `absZ1', with binary point between bits 63 and 64 (between the input words),
156 | and returns the properly rounded 64-bit integer corresponding to the input.
157 | If `zSign' is 1, the input is negated before being converted to an integer.
158 | Ordinarily, the fixed-point input is simply rounded to an integer, with
159 | the inexact exception raised if the input cannot be represented exactly as
160 | an integer. However, if the fixed-point input is too large, the invalid
161 | exception is raised and the largest positive or negative integer is
163 *----------------------------------------------------------------------------*/
165 static int64
roundAndPackInt64( flag zSign
, uint64_t absZ0
, uint64_t absZ1 STATUS_PARAM
)
168 flag roundNearestEven
, increment
;
171 roundingMode
= STATUS(float_rounding_mode
);
172 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
173 increment
= ( (int64_t) absZ1
< 0 );
174 if ( ! roundNearestEven
) {
175 if ( roundingMode
== float_round_to_zero
) {
180 increment
= ( roundingMode
== float_round_down
) && absZ1
;
183 increment
= ( roundingMode
== float_round_up
) && absZ1
;
189 if ( absZ0
== 0 ) goto overflow
;
190 absZ0
&= ~ ( ( (uint64_t) ( absZ1
<<1 ) == 0 ) & roundNearestEven
);
193 if ( zSign
) z
= - z
;
194 if ( z
&& ( ( z
< 0 ) ^ zSign
) ) {
196 float_raise( float_flag_invalid STATUS_VAR
);
198 zSign
? (int64_t) LIT64( 0x8000000000000000 )
199 : LIT64( 0x7FFFFFFFFFFFFFFF );
201 if ( absZ1
) STATUS(float_exception_flags
) |= float_flag_inexact
;
206 /*----------------------------------------------------------------------------
207 | Returns the fraction bits of the single-precision floating-point value `a'.
208 *----------------------------------------------------------------------------*/
210 INLINE
uint32_t extractFloat32Frac( float32 a
)
213 return float32_val(a
) & 0x007FFFFF;
217 /*----------------------------------------------------------------------------
218 | Returns the exponent bits of the single-precision floating-point value `a'.
219 *----------------------------------------------------------------------------*/
221 INLINE int16
extractFloat32Exp( float32 a
)
224 return ( float32_val(a
)>>23 ) & 0xFF;
228 /*----------------------------------------------------------------------------
229 | Returns the sign bit of the single-precision floating-point value `a'.
230 *----------------------------------------------------------------------------*/
232 INLINE flag
extractFloat32Sign( float32 a
)
235 return float32_val(a
)>>31;
239 /*----------------------------------------------------------------------------
240 | If `a' is denormal and we are in flush-to-zero mode then set the
241 | input-denormal exception and return zero. Otherwise just return the value.
242 *----------------------------------------------------------------------------*/
243 static float32
float32_squash_input_denormal(float32 a STATUS_PARAM
)
245 if (STATUS(flush_inputs_to_zero
)) {
246 if (extractFloat32Exp(a
) == 0 && extractFloat32Frac(a
) != 0) {
247 float_raise(float_flag_input_denormal STATUS_VAR
);
248 return make_float32(float32_val(a
) & 0x80000000);
254 /*----------------------------------------------------------------------------
255 | Normalizes the subnormal single-precision floating-point value represented
256 | by the denormalized significand `aSig'. The normalized exponent and
257 | significand are stored at the locations pointed to by `zExpPtr' and
258 | `zSigPtr', respectively.
259 *----------------------------------------------------------------------------*/
262 normalizeFloat32Subnormal( uint32_t aSig
, int16
*zExpPtr
, uint32_t *zSigPtr
)
266 shiftCount
= countLeadingZeros32( aSig
) - 8;
267 *zSigPtr
= aSig
<<shiftCount
;
268 *zExpPtr
= 1 - shiftCount
;
272 /*----------------------------------------------------------------------------
273 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
274 | single-precision floating-point value, returning the result. After being
275 | shifted into the proper positions, the three fields are simply added
276 | together to form the result. This means that any integer portion of `zSig'
277 | will be added into the exponent. Since a properly normalized significand
278 | will have an integer portion equal to 1, the `zExp' input should be 1 less
279 | than the desired result exponent whenever `zSig' is a complete, normalized
281 *----------------------------------------------------------------------------*/
283 INLINE float32
packFloat32( flag zSign
, int16 zExp
, uint32_t zSig
)
287 ( ( (uint32_t) zSign
)<<31 ) + ( ( (uint32_t) zExp
)<<23 ) + zSig
);
291 /*----------------------------------------------------------------------------
292 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
293 | and significand `zSig', and returns the proper single-precision floating-
294 | point value corresponding to the abstract input. Ordinarily, the abstract
295 | value is simply rounded and packed into the single-precision format, with
296 | the inexact exception raised if the abstract input cannot be represented
297 | exactly. However, if the abstract value is too large, the overflow and
298 | inexact exceptions are raised and an infinity or maximal finite value is
299 | returned. If the abstract value is too small, the input value is rounded to
300 | a subnormal number, and the underflow and inexact exceptions are raised if
301 | the abstract input cannot be represented exactly as a subnormal single-
302 | precision floating-point number.
303 | The input significand `zSig' has its binary point between bits 30
304 | and 29, which is 7 bits to the left of the usual location. This shifted
305 | significand must be normalized or smaller. If `zSig' is not normalized,
306 | `zExp' must be 0; in that case, the result returned is a subnormal number,
307 | and it must not require rounding. In the usual case that `zSig' is
308 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
309 | The handling of underflow and overflow follows the IEC/IEEE Standard for
310 | Binary Floating-Point Arithmetic.
311 *----------------------------------------------------------------------------*/
313 static float32
roundAndPackFloat32( flag zSign
, int16 zExp
, uint32_t zSig STATUS_PARAM
)
316 flag roundNearestEven
;
317 int8 roundIncrement
, roundBits
;
320 roundingMode
= STATUS(float_rounding_mode
);
321 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
322 roundIncrement
= 0x40;
323 if ( ! roundNearestEven
) {
324 if ( roundingMode
== float_round_to_zero
) {
328 roundIncrement
= 0x7F;
330 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
333 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
337 roundBits
= zSig
& 0x7F;
338 if ( 0xFD <= (uint16_t) zExp
) {
340 || ( ( zExp
== 0xFD )
341 && ( (int32_t) ( zSig
+ roundIncrement
) < 0 ) )
343 float_raise( float_flag_overflow
| float_flag_inexact STATUS_VAR
);
344 return packFloat32( zSign
, 0xFF, - ( roundIncrement
== 0 ));
347 if (STATUS(flush_to_zero
)) {
348 float_raise(float_flag_output_denormal STATUS_VAR
);
349 return packFloat32(zSign
, 0, 0);
352 ( STATUS(float_detect_tininess
) == float_tininess_before_rounding
)
354 || ( zSig
+ roundIncrement
< 0x80000000 );
355 shift32RightJamming( zSig
, - zExp
, &zSig
);
357 roundBits
= zSig
& 0x7F;
358 if ( isTiny
&& roundBits
) float_raise( float_flag_underflow STATUS_VAR
);
361 if ( roundBits
) STATUS(float_exception_flags
) |= float_flag_inexact
;
362 zSig
= ( zSig
+ roundIncrement
)>>7;
363 zSig
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
364 if ( zSig
== 0 ) zExp
= 0;
365 return packFloat32( zSign
, zExp
, zSig
);
369 /*----------------------------------------------------------------------------
370 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
371 | and significand `zSig', and returns the proper single-precision floating-
372 | point value corresponding to the abstract input. This routine is just like
373 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
374 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
375 | floating-point exponent.
376 *----------------------------------------------------------------------------*/
379 normalizeRoundAndPackFloat32( flag zSign
, int16 zExp
, uint32_t zSig STATUS_PARAM
)
383 shiftCount
= countLeadingZeros32( zSig
) - 1;
384 return roundAndPackFloat32( zSign
, zExp
- shiftCount
, zSig
<<shiftCount STATUS_VAR
);
388 /*----------------------------------------------------------------------------
389 | Returns the fraction bits of the double-precision floating-point value `a'.
390 *----------------------------------------------------------------------------*/
392 INLINE
uint64_t extractFloat64Frac( float64 a
)
395 return float64_val(a
) & LIT64( 0x000FFFFFFFFFFFFF );
399 /*----------------------------------------------------------------------------
400 | Returns the exponent bits of the double-precision floating-point value `a'.
401 *----------------------------------------------------------------------------*/
403 INLINE int16
extractFloat64Exp( float64 a
)
406 return ( float64_val(a
)>>52 ) & 0x7FF;
410 /*----------------------------------------------------------------------------
411 | Returns the sign bit of the double-precision floating-point value `a'.
412 *----------------------------------------------------------------------------*/
414 INLINE flag
extractFloat64Sign( float64 a
)
417 return float64_val(a
)>>63;
421 /*----------------------------------------------------------------------------
422 | If `a' is denormal and we are in flush-to-zero mode then set the
423 | input-denormal exception and return zero. Otherwise just return the value.
424 *----------------------------------------------------------------------------*/
425 static float64
float64_squash_input_denormal(float64 a STATUS_PARAM
)
427 if (STATUS(flush_inputs_to_zero
)) {
428 if (extractFloat64Exp(a
) == 0 && extractFloat64Frac(a
) != 0) {
429 float_raise(float_flag_input_denormal STATUS_VAR
);
430 return make_float64(float64_val(a
) & (1ULL << 63));
436 /*----------------------------------------------------------------------------
437 | Normalizes the subnormal double-precision floating-point value represented
438 | by the denormalized significand `aSig'. The normalized exponent and
439 | significand are stored at the locations pointed to by `zExpPtr' and
440 | `zSigPtr', respectively.
441 *----------------------------------------------------------------------------*/
444 normalizeFloat64Subnormal( uint64_t aSig
, int16
*zExpPtr
, uint64_t *zSigPtr
)
448 shiftCount
= countLeadingZeros64( aSig
) - 11;
449 *zSigPtr
= aSig
<<shiftCount
;
450 *zExpPtr
= 1 - shiftCount
;
454 /*----------------------------------------------------------------------------
455 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
456 | double-precision floating-point value, returning the result. After being
457 | shifted into the proper positions, the three fields are simply added
458 | together to form the result. This means that any integer portion of `zSig'
459 | will be added into the exponent. Since a properly normalized significand
460 | will have an integer portion equal to 1, the `zExp' input should be 1 less
461 | than the desired result exponent whenever `zSig' is a complete, normalized
463 *----------------------------------------------------------------------------*/
465 INLINE float64
packFloat64( flag zSign
, int16 zExp
, uint64_t zSig
)
469 ( ( (uint64_t) zSign
)<<63 ) + ( ( (uint64_t) zExp
)<<52 ) + zSig
);
473 /*----------------------------------------------------------------------------
474 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
475 | and significand `zSig', and returns the proper double-precision floating-
476 | point value corresponding to the abstract input. Ordinarily, the abstract
477 | value is simply rounded and packed into the double-precision format, with
478 | the inexact exception raised if the abstract input cannot be represented
479 | exactly. However, if the abstract value is too large, the overflow and
480 | inexact exceptions are raised and an infinity or maximal finite value is
481 | returned. If the abstract value is too small, the input value is rounded
482 | to a subnormal number, and the underflow and inexact exceptions are raised
483 | if the abstract input cannot be represented exactly as a subnormal double-
484 | precision floating-point number.
485 | The input significand `zSig' has its binary point between bits 62
486 | and 61, which is 10 bits to the left of the usual location. This shifted
487 | significand must be normalized or smaller. If `zSig' is not normalized,
488 | `zExp' must be 0; in that case, the result returned is a subnormal number,
489 | and it must not require rounding. In the usual case that `zSig' is
490 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
491 | The handling of underflow and overflow follows the IEC/IEEE Standard for
492 | Binary Floating-Point Arithmetic.
493 *----------------------------------------------------------------------------*/
495 static float64
roundAndPackFloat64( flag zSign
, int16 zExp
, uint64_t zSig STATUS_PARAM
)
498 flag roundNearestEven
;
499 int16 roundIncrement
, roundBits
;
502 roundingMode
= STATUS(float_rounding_mode
);
503 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
504 roundIncrement
= 0x200;
505 if ( ! roundNearestEven
) {
506 if ( roundingMode
== float_round_to_zero
) {
510 roundIncrement
= 0x3FF;
512 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
515 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
519 roundBits
= zSig
& 0x3FF;
520 if ( 0x7FD <= (uint16_t) zExp
) {
521 if ( ( 0x7FD < zExp
)
522 || ( ( zExp
== 0x7FD )
523 && ( (int64_t) ( zSig
+ roundIncrement
) < 0 ) )
525 float_raise( float_flag_overflow
| float_flag_inexact STATUS_VAR
);
526 return packFloat64( zSign
, 0x7FF, - ( roundIncrement
== 0 ));
529 if (STATUS(flush_to_zero
)) {
530 float_raise(float_flag_output_denormal STATUS_VAR
);
531 return packFloat64(zSign
, 0, 0);
534 ( STATUS(float_detect_tininess
) == float_tininess_before_rounding
)
536 || ( zSig
+ roundIncrement
< LIT64( 0x8000000000000000 ) );
537 shift64RightJamming( zSig
, - zExp
, &zSig
);
539 roundBits
= zSig
& 0x3FF;
540 if ( isTiny
&& roundBits
) float_raise( float_flag_underflow STATUS_VAR
);
543 if ( roundBits
) STATUS(float_exception_flags
) |= float_flag_inexact
;
544 zSig
= ( zSig
+ roundIncrement
)>>10;
545 zSig
&= ~ ( ( ( roundBits
^ 0x200 ) == 0 ) & roundNearestEven
);
546 if ( zSig
== 0 ) zExp
= 0;
547 return packFloat64( zSign
, zExp
, zSig
);
551 /*----------------------------------------------------------------------------
552 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
553 | and significand `zSig', and returns the proper double-precision floating-
554 | point value corresponding to the abstract input. This routine is just like
555 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
556 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
557 | floating-point exponent.
558 *----------------------------------------------------------------------------*/
561 normalizeRoundAndPackFloat64( flag zSign
, int16 zExp
, uint64_t zSig STATUS_PARAM
)
565 shiftCount
= countLeadingZeros64( zSig
) - 1;
566 return roundAndPackFloat64( zSign
, zExp
- shiftCount
, zSig
<<shiftCount STATUS_VAR
);
570 /*----------------------------------------------------------------------------
571 | Returns the fraction bits of the extended double-precision floating-point
573 *----------------------------------------------------------------------------*/
575 INLINE
uint64_t extractFloatx80Frac( floatx80 a
)
582 /*----------------------------------------------------------------------------
583 | Returns the exponent bits of the extended double-precision floating-point
585 *----------------------------------------------------------------------------*/
587 INLINE int32
extractFloatx80Exp( floatx80 a
)
590 return a
.high
& 0x7FFF;
594 /*----------------------------------------------------------------------------
595 | Returns the sign bit of the extended double-precision floating-point value
597 *----------------------------------------------------------------------------*/
599 INLINE flag
extractFloatx80Sign( floatx80 a
)
606 /*----------------------------------------------------------------------------
607 | Normalizes the subnormal extended double-precision floating-point value
608 | represented by the denormalized significand `aSig'. The normalized exponent
609 | and significand are stored at the locations pointed to by `zExpPtr' and
610 | `zSigPtr', respectively.
611 *----------------------------------------------------------------------------*/
614 normalizeFloatx80Subnormal( uint64_t aSig
, int32
*zExpPtr
, uint64_t *zSigPtr
)
618 shiftCount
= countLeadingZeros64( aSig
);
619 *zSigPtr
= aSig
<<shiftCount
;
620 *zExpPtr
= 1 - shiftCount
;
624 /*----------------------------------------------------------------------------
625 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
626 | extended double-precision floating-point value, returning the result.
627 *----------------------------------------------------------------------------*/
629 INLINE floatx80
packFloatx80( flag zSign
, int32 zExp
, uint64_t zSig
)
634 z
.high
= ( ( (uint16_t) zSign
)<<15 ) + zExp
;
639 /*----------------------------------------------------------------------------
640 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
641 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
642 | and returns the proper extended double-precision floating-point value
643 | corresponding to the abstract input. Ordinarily, the abstract value is
644 | rounded and packed into the extended double-precision format, with the
645 | inexact exception raised if the abstract input cannot be represented
646 | exactly. However, if the abstract value is too large, the overflow and
647 | inexact exceptions are raised and an infinity or maximal finite value is
648 | returned. If the abstract value is too small, the input value is rounded to
649 | a subnormal number, and the underflow and inexact exceptions are raised if
650 | the abstract input cannot be represented exactly as a subnormal extended
651 | double-precision floating-point number.
652 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
653 | number of bits as single or double precision, respectively. Otherwise, the
654 | result is rounded to the full precision of the extended double-precision
656 | The input significand must be normalized or smaller. If the input
657 | significand is not normalized, `zExp' must be 0; in that case, the result
658 | returned is a subnormal number, and it must not require rounding. The
659 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
660 | Floating-Point Arithmetic.
661 *----------------------------------------------------------------------------*/
664 roundAndPackFloatx80(
665 int8 roundingPrecision
, flag zSign
, int32 zExp
, uint64_t zSig0
, uint64_t zSig1
669 flag roundNearestEven
, increment
, isTiny
;
670 int64 roundIncrement
, roundMask
, roundBits
;
672 roundingMode
= STATUS(float_rounding_mode
);
673 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
674 if ( roundingPrecision
== 80 ) goto precision80
;
675 if ( roundingPrecision
== 64 ) {
676 roundIncrement
= LIT64( 0x0000000000000400 );
677 roundMask
= LIT64( 0x00000000000007FF );
679 else if ( roundingPrecision
== 32 ) {
680 roundIncrement
= LIT64( 0x0000008000000000 );
681 roundMask
= LIT64( 0x000000FFFFFFFFFF );
686 zSig0
|= ( zSig1
!= 0 );
687 if ( ! roundNearestEven
) {
688 if ( roundingMode
== float_round_to_zero
) {
692 roundIncrement
= roundMask
;
694 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
697 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
701 roundBits
= zSig0
& roundMask
;
702 if ( 0x7FFD <= (uint32_t) ( zExp
- 1 ) ) {
703 if ( ( 0x7FFE < zExp
)
704 || ( ( zExp
== 0x7FFE ) && ( zSig0
+ roundIncrement
< zSig0
) )
709 if (STATUS(flush_to_zero
)) {
710 float_raise(float_flag_output_denormal STATUS_VAR
);
711 return packFloatx80(zSign
, 0, 0);
714 ( STATUS(float_detect_tininess
) == float_tininess_before_rounding
)
716 || ( zSig0
<= zSig0
+ roundIncrement
);
717 shift64RightJamming( zSig0
, 1 - zExp
, &zSig0
);
719 roundBits
= zSig0
& roundMask
;
720 if ( isTiny
&& roundBits
) float_raise( float_flag_underflow STATUS_VAR
);
721 if ( roundBits
) STATUS(float_exception_flags
) |= float_flag_inexact
;
722 zSig0
+= roundIncrement
;
723 if ( (int64_t) zSig0
< 0 ) zExp
= 1;
724 roundIncrement
= roundMask
+ 1;
725 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
726 roundMask
|= roundIncrement
;
728 zSig0
&= ~ roundMask
;
729 return packFloatx80( zSign
, zExp
, zSig0
);
732 if ( roundBits
) STATUS(float_exception_flags
) |= float_flag_inexact
;
733 zSig0
+= roundIncrement
;
734 if ( zSig0
< roundIncrement
) {
736 zSig0
= LIT64( 0x8000000000000000 );
738 roundIncrement
= roundMask
+ 1;
739 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
740 roundMask
|= roundIncrement
;
742 zSig0
&= ~ roundMask
;
743 if ( zSig0
== 0 ) zExp
= 0;
744 return packFloatx80( zSign
, zExp
, zSig0
);
746 increment
= ( (int64_t) zSig1
< 0 );
747 if ( ! roundNearestEven
) {
748 if ( roundingMode
== float_round_to_zero
) {
753 increment
= ( roundingMode
== float_round_down
) && zSig1
;
756 increment
= ( roundingMode
== float_round_up
) && zSig1
;
760 if ( 0x7FFD <= (uint32_t) ( zExp
- 1 ) ) {
761 if ( ( 0x7FFE < zExp
)
762 || ( ( zExp
== 0x7FFE )
763 && ( zSig0
== LIT64( 0xFFFFFFFFFFFFFFFF ) )
769 float_raise( float_flag_overflow
| float_flag_inexact STATUS_VAR
);
770 if ( ( roundingMode
== float_round_to_zero
)
771 || ( zSign
&& ( roundingMode
== float_round_up
) )
772 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
774 return packFloatx80( zSign
, 0x7FFE, ~ roundMask
);
776 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
780 ( STATUS(float_detect_tininess
) == float_tininess_before_rounding
)
783 || ( zSig0
< LIT64( 0xFFFFFFFFFFFFFFFF ) );
784 shift64ExtraRightJamming( zSig0
, zSig1
, 1 - zExp
, &zSig0
, &zSig1
);
786 if ( isTiny
&& zSig1
) float_raise( float_flag_underflow STATUS_VAR
);
787 if ( zSig1
) STATUS(float_exception_flags
) |= float_flag_inexact
;
788 if ( roundNearestEven
) {
789 increment
= ( (int64_t) zSig1
< 0 );
793 increment
= ( roundingMode
== float_round_down
) && zSig1
;
796 increment
= ( roundingMode
== float_round_up
) && zSig1
;
802 ~ ( ( (uint64_t) ( zSig1
<<1 ) == 0 ) & roundNearestEven
);
803 if ( (int64_t) zSig0
< 0 ) zExp
= 1;
805 return packFloatx80( zSign
, zExp
, zSig0
);
808 if ( zSig1
) STATUS(float_exception_flags
) |= float_flag_inexact
;
813 zSig0
= LIT64( 0x8000000000000000 );
816 zSig0
&= ~ ( ( (uint64_t) ( zSig1
<<1 ) == 0 ) & roundNearestEven
);
820 if ( zSig0
== 0 ) zExp
= 0;
822 return packFloatx80( zSign
, zExp
, zSig0
);
826 /*----------------------------------------------------------------------------
827 | Takes an abstract floating-point value having sign `zSign', exponent
828 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
829 | and returns the proper extended double-precision floating-point value
830 | corresponding to the abstract input. This routine is just like
831 | `roundAndPackFloatx80' except that the input significand does not have to be
833 *----------------------------------------------------------------------------*/
836 normalizeRoundAndPackFloatx80(
837 int8 roundingPrecision
, flag zSign
, int32 zExp
, uint64_t zSig0
, uint64_t zSig1
847 shiftCount
= countLeadingZeros64( zSig0
);
848 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
851 roundAndPackFloatx80( roundingPrecision
, zSign
, zExp
, zSig0
, zSig1 STATUS_VAR
);
855 /*----------------------------------------------------------------------------
856 | Returns the least-significant 64 fraction bits of the quadruple-precision
857 | floating-point value `a'.
858 *----------------------------------------------------------------------------*/
860 INLINE
uint64_t extractFloat128Frac1( float128 a
)
867 /*----------------------------------------------------------------------------
868 | Returns the most-significant 48 fraction bits of the quadruple-precision
869 | floating-point value `a'.
870 *----------------------------------------------------------------------------*/
872 INLINE
uint64_t extractFloat128Frac0( float128 a
)
875 return a
.high
& LIT64( 0x0000FFFFFFFFFFFF );
879 /*----------------------------------------------------------------------------
880 | Returns the exponent bits of the quadruple-precision floating-point value
882 *----------------------------------------------------------------------------*/
884 INLINE int32
extractFloat128Exp( float128 a
)
887 return ( a
.high
>>48 ) & 0x7FFF;
891 /*----------------------------------------------------------------------------
892 | Returns the sign bit of the quadruple-precision floating-point value `a'.
893 *----------------------------------------------------------------------------*/
895 INLINE flag
extractFloat128Sign( float128 a
)
902 /*----------------------------------------------------------------------------
903 | Normalizes the subnormal quadruple-precision floating-point value
904 | represented by the denormalized significand formed by the concatenation of
905 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
906 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
907 | significand are stored at the location pointed to by `zSig0Ptr', and the
908 | least significant 64 bits of the normalized significand are stored at the
909 | location pointed to by `zSig1Ptr'.
910 *----------------------------------------------------------------------------*/
913 normalizeFloat128Subnormal(
924 shiftCount
= countLeadingZeros64( aSig1
) - 15;
925 if ( shiftCount
< 0 ) {
926 *zSig0Ptr
= aSig1
>>( - shiftCount
);
927 *zSig1Ptr
= aSig1
<<( shiftCount
& 63 );
930 *zSig0Ptr
= aSig1
<<shiftCount
;
933 *zExpPtr
= - shiftCount
- 63;
936 shiftCount
= countLeadingZeros64( aSig0
) - 15;
937 shortShift128Left( aSig0
, aSig1
, shiftCount
, zSig0Ptr
, zSig1Ptr
);
938 *zExpPtr
= 1 - shiftCount
;
943 /*----------------------------------------------------------------------------
944 | Packs the sign `zSign', the exponent `zExp', and the significand formed
945 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
946 | floating-point value, returning the result. After being shifted into the
947 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
948 | added together to form the most significant 32 bits of the result. This
949 | means that any integer portion of `zSig0' will be added into the exponent.
950 | Since a properly normalized significand will have an integer portion equal
951 | to 1, the `zExp' input should be 1 less than the desired result exponent
952 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
954 *----------------------------------------------------------------------------*/
957 packFloat128( flag zSign
, int32 zExp
, uint64_t zSig0
, uint64_t zSig1
)
962 z
.high
= ( ( (uint64_t) zSign
)<<63 ) + ( ( (uint64_t) zExp
)<<48 ) + zSig0
;
967 /*----------------------------------------------------------------------------
968 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
969 | and extended significand formed by the concatenation of `zSig0', `zSig1',
970 | and `zSig2', and returns the proper quadruple-precision floating-point value
971 | corresponding to the abstract input. Ordinarily, the abstract value is
972 | simply rounded and packed into the quadruple-precision format, with the
973 | inexact exception raised if the abstract input cannot be represented
974 | exactly. However, if the abstract value is too large, the overflow and
975 | inexact exceptions are raised and an infinity or maximal finite value is
976 | returned. If the abstract value is too small, the input value is rounded to
977 | a subnormal number, and the underflow and inexact exceptions are raised if
978 | the abstract input cannot be represented exactly as a subnormal quadruple-
979 | precision floating-point number.
980 | The input significand must be normalized or smaller. If the input
981 | significand is not normalized, `zExp' must be 0; in that case, the result
982 | returned is a subnormal number, and it must not require rounding. In the
983 | usual case that the input significand is normalized, `zExp' must be 1 less
984 | than the ``true'' floating-point exponent. The handling of underflow and
985 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
986 *----------------------------------------------------------------------------*/
989 roundAndPackFloat128(
990 flag zSign
, int32 zExp
, uint64_t zSig0
, uint64_t zSig1
, uint64_t zSig2 STATUS_PARAM
)
993 flag roundNearestEven
, increment
, isTiny
;
995 roundingMode
= STATUS(float_rounding_mode
);
996 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
997 increment
= ( (int64_t) zSig2
< 0 );
998 if ( ! roundNearestEven
) {
999 if ( roundingMode
== float_round_to_zero
) {
1004 increment
= ( roundingMode
== float_round_down
) && zSig2
;
1007 increment
= ( roundingMode
== float_round_up
) && zSig2
;
1011 if ( 0x7FFD <= (uint32_t) zExp
) {
1012 if ( ( 0x7FFD < zExp
)
1013 || ( ( zExp
== 0x7FFD )
1015 LIT64( 0x0001FFFFFFFFFFFF ),
1016 LIT64( 0xFFFFFFFFFFFFFFFF ),
1023 float_raise( float_flag_overflow
| float_flag_inexact STATUS_VAR
);
1024 if ( ( roundingMode
== float_round_to_zero
)
1025 || ( zSign
&& ( roundingMode
== float_round_up
) )
1026 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
1032 LIT64( 0x0000FFFFFFFFFFFF ),
1033 LIT64( 0xFFFFFFFFFFFFFFFF )
1036 return packFloat128( zSign
, 0x7FFF, 0, 0 );
1039 if (STATUS(flush_to_zero
)) {
1040 float_raise(float_flag_output_denormal STATUS_VAR
);
1041 return packFloat128(zSign
, 0, 0, 0);
1044 ( STATUS(float_detect_tininess
) == float_tininess_before_rounding
)
1050 LIT64( 0x0001FFFFFFFFFFFF ),
1051 LIT64( 0xFFFFFFFFFFFFFFFF )
1053 shift128ExtraRightJamming(
1054 zSig0
, zSig1
, zSig2
, - zExp
, &zSig0
, &zSig1
, &zSig2
);
1056 if ( isTiny
&& zSig2
) float_raise( float_flag_underflow STATUS_VAR
);
1057 if ( roundNearestEven
) {
1058 increment
= ( (int64_t) zSig2
< 0 );
1062 increment
= ( roundingMode
== float_round_down
) && zSig2
;
1065 increment
= ( roundingMode
== float_round_up
) && zSig2
;
1070 if ( zSig2
) STATUS(float_exception_flags
) |= float_flag_inexact
;
1072 add128( zSig0
, zSig1
, 0, 1, &zSig0
, &zSig1
);
1073 zSig1
&= ~ ( ( zSig2
+ zSig2
== 0 ) & roundNearestEven
);
1076 if ( ( zSig0
| zSig1
) == 0 ) zExp
= 0;
1078 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
1082 /*----------------------------------------------------------------------------
1083 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1084 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1085 | returns the proper quadruple-precision floating-point value corresponding
1086 | to the abstract input. This routine is just like `roundAndPackFloat128'
1087 | except that the input significand has fewer bits and does not have to be
1088 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1090 *----------------------------------------------------------------------------*/
1093 normalizeRoundAndPackFloat128(
1094 flag zSign
, int32 zExp
, uint64_t zSig0
, uint64_t zSig1 STATUS_PARAM
)
1104 shiftCount
= countLeadingZeros64( zSig0
) - 15;
1105 if ( 0 <= shiftCount
) {
1107 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
1110 shift128ExtraRightJamming(
1111 zSig0
, zSig1
, 0, - shiftCount
, &zSig0
, &zSig1
, &zSig2
);
1114 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2 STATUS_VAR
);
1118 /*----------------------------------------------------------------------------
1119 | Returns the result of converting the 32-bit two's complement integer `a'
1120 | to the single-precision floating-point format. The conversion is performed
1121 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1122 *----------------------------------------------------------------------------*/
1124 float32
int32_to_float32( int32 a STATUS_PARAM
)
1128 if ( a
== 0 ) return float32_zero
;
1129 if ( a
== (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1131 return normalizeRoundAndPackFloat32( zSign
, 0x9C, zSign
? - a
: a STATUS_VAR
);
1135 /*----------------------------------------------------------------------------
1136 | Returns the result of converting the 32-bit two's complement integer `a'
1137 | to the double-precision floating-point format. The conversion is performed
1138 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1139 *----------------------------------------------------------------------------*/
1141 float64
int32_to_float64( int32 a STATUS_PARAM
)
1148 if ( a
== 0 ) return float64_zero
;
1150 absA
= zSign
? - a
: a
;
1151 shiftCount
= countLeadingZeros32( absA
) + 21;
1153 return packFloat64( zSign
, 0x432 - shiftCount
, zSig
<<shiftCount
);
1157 /*----------------------------------------------------------------------------
1158 | Returns the result of converting the 32-bit two's complement integer `a'
1159 | to the extended double-precision floating-point format. The conversion
1160 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1162 *----------------------------------------------------------------------------*/
1164 floatx80
int32_to_floatx80( int32 a STATUS_PARAM
)
1171 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
1173 absA
= zSign
? - a
: a
;
1174 shiftCount
= countLeadingZeros32( absA
) + 32;
1176 return packFloatx80( zSign
, 0x403E - shiftCount
, zSig
<<shiftCount
);
1180 /*----------------------------------------------------------------------------
1181 | Returns the result of converting the 32-bit two's complement integer `a' to
1182 | the quadruple-precision floating-point format. The conversion is performed
1183 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1184 *----------------------------------------------------------------------------*/
1186 float128
int32_to_float128( int32 a STATUS_PARAM
)
1193 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
1195 absA
= zSign
? - a
: a
;
1196 shiftCount
= countLeadingZeros32( absA
) + 17;
1198 return packFloat128( zSign
, 0x402E - shiftCount
, zSig0
<<shiftCount
, 0 );
1202 /*----------------------------------------------------------------------------
1203 | Returns the result of converting the 64-bit two's complement integer `a'
1204 | to the single-precision floating-point format. The conversion is performed
1205 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1206 *----------------------------------------------------------------------------*/
1208 float32
int64_to_float32( int64 a STATUS_PARAM
)
1214 if ( a
== 0 ) return float32_zero
;
1216 absA
= zSign
? - a
: a
;
1217 shiftCount
= countLeadingZeros64( absA
) - 40;
1218 if ( 0 <= shiftCount
) {
1219 return packFloat32( zSign
, 0x95 - shiftCount
, absA
<<shiftCount
);
1223 if ( shiftCount
< 0 ) {
1224 shift64RightJamming( absA
, - shiftCount
, &absA
);
1227 absA
<<= shiftCount
;
1229 return roundAndPackFloat32( zSign
, 0x9C - shiftCount
, absA STATUS_VAR
);
1234 float32
uint64_to_float32( uint64 a STATUS_PARAM
)
1238 if ( a
== 0 ) return float32_zero
;
1239 shiftCount
= countLeadingZeros64( a
) - 40;
1240 if ( 0 <= shiftCount
) {
1241 return packFloat32( 1 > 0, 0x95 - shiftCount
, a
<<shiftCount
);
1245 if ( shiftCount
< 0 ) {
1246 shift64RightJamming( a
, - shiftCount
, &a
);
1251 return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount
, a STATUS_VAR
);
1255 /*----------------------------------------------------------------------------
1256 | Returns the result of converting the 64-bit two's complement integer `a'
1257 | to the double-precision floating-point format. The conversion is performed
1258 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1259 *----------------------------------------------------------------------------*/
1261 float64
int64_to_float64( int64 a STATUS_PARAM
)
1265 if ( a
== 0 ) return float64_zero
;
1266 if ( a
== (int64_t) LIT64( 0x8000000000000000 ) ) {
1267 return packFloat64( 1, 0x43E, 0 );
1270 return normalizeRoundAndPackFloat64( zSign
, 0x43C, zSign
? - a
: a STATUS_VAR
);
1274 float64
uint64_to_float64( uint64 a STATUS_PARAM
)
1276 if ( a
== 0 ) return float64_zero
;
1277 return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR
);
1281 /*----------------------------------------------------------------------------
1282 | Returns the result of converting the 64-bit two's complement integer `a'
1283 | to the extended double-precision floating-point format. The conversion
1284 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1286 *----------------------------------------------------------------------------*/
1288 floatx80
int64_to_floatx80( int64 a STATUS_PARAM
)
1294 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
1296 absA
= zSign
? - a
: a
;
1297 shiftCount
= countLeadingZeros64( absA
);
1298 return packFloatx80( zSign
, 0x403E - shiftCount
, absA
<<shiftCount
);
1302 /*----------------------------------------------------------------------------
1303 | Returns the result of converting the 64-bit two's complement integer `a' to
1304 | the quadruple-precision floating-point format. The conversion is performed
1305 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1306 *----------------------------------------------------------------------------*/
1308 float128
int64_to_float128( int64 a STATUS_PARAM
)
1314 uint64_t zSig0
, zSig1
;
1316 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
1318 absA
= zSign
? - a
: a
;
1319 shiftCount
= countLeadingZeros64( absA
) + 49;
1320 zExp
= 0x406E - shiftCount
;
1321 if ( 64 <= shiftCount
) {
1330 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
1331 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
1335 /*----------------------------------------------------------------------------
1336 | Returns the result of converting the single-precision floating-point value
1337 | `a' to the 32-bit two's complement integer format. The conversion is
1338 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1339 | Arithmetic---which means in particular that the conversion is rounded
1340 | according to the current rounding mode. If `a' is a NaN, the largest
1341 | positive integer is returned. Otherwise, if the conversion overflows, the
1342 | largest integer with the same sign as `a' is returned.
1343 *----------------------------------------------------------------------------*/
1345 int32
float32_to_int32( float32 a STATUS_PARAM
)
1348 int16 aExp
, shiftCount
;
1352 a
= float32_squash_input_denormal(a STATUS_VAR
);
1353 aSig
= extractFloat32Frac( a
);
1354 aExp
= extractFloat32Exp( a
);
1355 aSign
= extractFloat32Sign( a
);
1356 if ( ( aExp
== 0xFF ) && aSig
) aSign
= 0;
1357 if ( aExp
) aSig
|= 0x00800000;
1358 shiftCount
= 0xAF - aExp
;
1361 if ( 0 < shiftCount
) shift64RightJamming( aSig64
, shiftCount
, &aSig64
);
1362 return roundAndPackInt32( aSign
, aSig64 STATUS_VAR
);
1366 /*----------------------------------------------------------------------------
1367 | Returns the result of converting the single-precision floating-point value
1368 | `a' to the 32-bit two's complement integer format. The conversion is
1369 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1370 | Arithmetic, except that the conversion is always rounded toward zero.
1371 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1372 | the conversion overflows, the largest integer with the same sign as `a' is
1374 *----------------------------------------------------------------------------*/
1376 int32
float32_to_int32_round_to_zero( float32 a STATUS_PARAM
)
1379 int16 aExp
, shiftCount
;
1382 a
= float32_squash_input_denormal(a STATUS_VAR
);
1384 aSig
= extractFloat32Frac( a
);
1385 aExp
= extractFloat32Exp( a
);
1386 aSign
= extractFloat32Sign( a
);
1387 shiftCount
= aExp
- 0x9E;
1388 if ( 0 <= shiftCount
) {
1389 if ( float32_val(a
) != 0xCF000000 ) {
1390 float_raise( float_flag_invalid STATUS_VAR
);
1391 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) return 0x7FFFFFFF;
1393 return (int32_t) 0x80000000;
1395 else if ( aExp
<= 0x7E ) {
1396 if ( aExp
| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
1399 aSig
= ( aSig
| 0x00800000 )<<8;
1400 z
= aSig
>>( - shiftCount
);
1401 if ( (uint32_t) ( aSig
<<( shiftCount
& 31 ) ) ) {
1402 STATUS(float_exception_flags
) |= float_flag_inexact
;
1404 if ( aSign
) z
= - z
;
1409 /*----------------------------------------------------------------------------
1410 | Returns the result of converting the single-precision floating-point value
1411 | `a' to the 16-bit two's complement integer format. The conversion is
1412 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1413 | Arithmetic, except that the conversion is always rounded toward zero.
1414 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1415 | the conversion overflows, the largest integer with the same sign as `a' is
1417 *----------------------------------------------------------------------------*/
1419 int16
float32_to_int16_round_to_zero( float32 a STATUS_PARAM
)
1422 int16 aExp
, shiftCount
;
1426 aSig
= extractFloat32Frac( a
);
1427 aExp
= extractFloat32Exp( a
);
1428 aSign
= extractFloat32Sign( a
);
1429 shiftCount
= aExp
- 0x8E;
1430 if ( 0 <= shiftCount
) {
1431 if ( float32_val(a
) != 0xC7000000 ) {
1432 float_raise( float_flag_invalid STATUS_VAR
);
1433 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) {
1437 return (int32_t) 0xffff8000;
1439 else if ( aExp
<= 0x7E ) {
1440 if ( aExp
| aSig
) {
1441 STATUS(float_exception_flags
) |= float_flag_inexact
;
1446 aSig
= ( aSig
| 0x00800000 )<<8;
1447 z
= aSig
>>( - shiftCount
);
1448 if ( (uint32_t) ( aSig
<<( shiftCount
& 31 ) ) ) {
1449 STATUS(float_exception_flags
) |= float_flag_inexact
;
1458 /*----------------------------------------------------------------------------
1459 | Returns the result of converting the single-precision floating-point value
1460 | `a' to the 64-bit two's complement integer format. The conversion is
1461 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1462 | Arithmetic---which means in particular that the conversion is rounded
1463 | according to the current rounding mode. If `a' is a NaN, the largest
1464 | positive integer is returned. Otherwise, if the conversion overflows, the
1465 | largest integer with the same sign as `a' is returned.
1466 *----------------------------------------------------------------------------*/
1468 int64
float32_to_int64( float32 a STATUS_PARAM
)
1471 int16 aExp
, shiftCount
;
1473 uint64_t aSig64
, aSigExtra
;
1474 a
= float32_squash_input_denormal(a STATUS_VAR
);
1476 aSig
= extractFloat32Frac( a
);
1477 aExp
= extractFloat32Exp( a
);
1478 aSign
= extractFloat32Sign( a
);
1479 shiftCount
= 0xBE - aExp
;
1480 if ( shiftCount
< 0 ) {
1481 float_raise( float_flag_invalid STATUS_VAR
);
1482 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) {
1483 return LIT64( 0x7FFFFFFFFFFFFFFF );
1485 return (int64_t) LIT64( 0x8000000000000000 );
1487 if ( aExp
) aSig
|= 0x00800000;
1490 shift64ExtraRightJamming( aSig64
, 0, shiftCount
, &aSig64
, &aSigExtra
);
1491 return roundAndPackInt64( aSign
, aSig64
, aSigExtra STATUS_VAR
);
1495 /*----------------------------------------------------------------------------
1496 | Returns the result of converting the single-precision floating-point value
1497 | `a' to the 64-bit two's complement integer format. The conversion is
1498 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1499 | Arithmetic, except that the conversion is always rounded toward zero. If
1500 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1501 | conversion overflows, the largest integer with the same sign as `a' is
1503 *----------------------------------------------------------------------------*/
1505 int64
float32_to_int64_round_to_zero( float32 a STATUS_PARAM
)
1508 int16 aExp
, shiftCount
;
1512 a
= float32_squash_input_denormal(a STATUS_VAR
);
1514 aSig
= extractFloat32Frac( a
);
1515 aExp
= extractFloat32Exp( a
);
1516 aSign
= extractFloat32Sign( a
);
1517 shiftCount
= aExp
- 0xBE;
1518 if ( 0 <= shiftCount
) {
1519 if ( float32_val(a
) != 0xDF000000 ) {
1520 float_raise( float_flag_invalid STATUS_VAR
);
1521 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) {
1522 return LIT64( 0x7FFFFFFFFFFFFFFF );
1525 return (int64_t) LIT64( 0x8000000000000000 );
1527 else if ( aExp
<= 0x7E ) {
1528 if ( aExp
| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
1531 aSig64
= aSig
| 0x00800000;
1533 z
= aSig64
>>( - shiftCount
);
1534 if ( (uint64_t) ( aSig64
<<( shiftCount
& 63 ) ) ) {
1535 STATUS(float_exception_flags
) |= float_flag_inexact
;
1537 if ( aSign
) z
= - z
;
1542 /*----------------------------------------------------------------------------
1543 | Returns the result of converting the single-precision floating-point value
1544 | `a' to the double-precision floating-point format. The conversion is
1545 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1547 *----------------------------------------------------------------------------*/
1549 float64
float32_to_float64( float32 a STATUS_PARAM
)
1554 a
= float32_squash_input_denormal(a STATUS_VAR
);
1556 aSig
= extractFloat32Frac( a
);
1557 aExp
= extractFloat32Exp( a
);
1558 aSign
= extractFloat32Sign( a
);
1559 if ( aExp
== 0xFF ) {
1560 if ( aSig
) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
1561 return packFloat64( aSign
, 0x7FF, 0 );
1564 if ( aSig
== 0 ) return packFloat64( aSign
, 0, 0 );
1565 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1568 return packFloat64( aSign
, aExp
+ 0x380, ( (uint64_t) aSig
)<<29 );
1572 /*----------------------------------------------------------------------------
1573 | Returns the result of converting the single-precision floating-point value
1574 | `a' to the extended double-precision floating-point format. The conversion
1575 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1577 *----------------------------------------------------------------------------*/
1579 floatx80
float32_to_floatx80( float32 a STATUS_PARAM
)
1585 a
= float32_squash_input_denormal(a STATUS_VAR
);
1586 aSig
= extractFloat32Frac( a
);
1587 aExp
= extractFloat32Exp( a
);
1588 aSign
= extractFloat32Sign( a
);
1589 if ( aExp
== 0xFF ) {
1590 if ( aSig
) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
1591 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
1594 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
1595 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1598 return packFloatx80( aSign
, aExp
+ 0x3F80, ( (uint64_t) aSig
)<<40 );
1602 /*----------------------------------------------------------------------------
1603 | Returns the result of converting the single-precision floating-point value
1604 | `a' to the double-precision floating-point format. The conversion is
1605 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1607 *----------------------------------------------------------------------------*/
1609 float128
float32_to_float128( float32 a STATUS_PARAM
)
1615 a
= float32_squash_input_denormal(a STATUS_VAR
);
1616 aSig
= extractFloat32Frac( a
);
1617 aExp
= extractFloat32Exp( a
);
1618 aSign
= extractFloat32Sign( a
);
1619 if ( aExp
== 0xFF ) {
1620 if ( aSig
) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
1621 return packFloat128( aSign
, 0x7FFF, 0, 0 );
1624 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
1625 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1628 return packFloat128( aSign
, aExp
+ 0x3F80, ( (uint64_t) aSig
)<<25, 0 );
1632 /*----------------------------------------------------------------------------
1633 | Rounds the single-precision floating-point value `a' to an integer, and
1634 | returns the result as a single-precision floating-point value. The
1635 | operation is performed according to the IEC/IEEE Standard for Binary
1636 | Floating-Point Arithmetic.
1637 *----------------------------------------------------------------------------*/
1639 float32
float32_round_to_int( float32 a STATUS_PARAM
)
1643 uint32_t lastBitMask
, roundBitsMask
;
1646 a
= float32_squash_input_denormal(a STATUS_VAR
);
1648 aExp
= extractFloat32Exp( a
);
1649 if ( 0x96 <= aExp
) {
1650 if ( ( aExp
== 0xFF ) && extractFloat32Frac( a
) ) {
1651 return propagateFloat32NaN( a
, a STATUS_VAR
);
1655 if ( aExp
<= 0x7E ) {
1656 if ( (uint32_t) ( float32_val(a
)<<1 ) == 0 ) return a
;
1657 STATUS(float_exception_flags
) |= float_flag_inexact
;
1658 aSign
= extractFloat32Sign( a
);
1659 switch ( STATUS(float_rounding_mode
) ) {
1660 case float_round_nearest_even
:
1661 if ( ( aExp
== 0x7E ) && extractFloat32Frac( a
) ) {
1662 return packFloat32( aSign
, 0x7F, 0 );
1665 case float_round_down
:
1666 return make_float32(aSign
? 0xBF800000 : 0);
1667 case float_round_up
:
1668 return make_float32(aSign
? 0x80000000 : 0x3F800000);
1670 return packFloat32( aSign
, 0, 0 );
1673 lastBitMask
<<= 0x96 - aExp
;
1674 roundBitsMask
= lastBitMask
- 1;
1676 roundingMode
= STATUS(float_rounding_mode
);
1677 if ( roundingMode
== float_round_nearest_even
) {
1678 z
+= lastBitMask
>>1;
1679 if ( ( z
& roundBitsMask
) == 0 ) z
&= ~ lastBitMask
;
1681 else if ( roundingMode
!= float_round_to_zero
) {
1682 if ( extractFloat32Sign( make_float32(z
) ) ^ ( roundingMode
== float_round_up
) ) {
1686 z
&= ~ roundBitsMask
;
1687 if ( z
!= float32_val(a
) ) STATUS(float_exception_flags
) |= float_flag_inexact
;
1688 return make_float32(z
);
1692 /*----------------------------------------------------------------------------
1693 | Returns the result of adding the absolute values of the single-precision
1694 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1695 | before being returned. `zSign' is ignored if the result is a NaN.
1696 | The addition is performed according to the IEC/IEEE Standard for Binary
1697 | Floating-Point Arithmetic.
1698 *----------------------------------------------------------------------------*/
1700 static float32
addFloat32Sigs( float32 a
, float32 b
, flag zSign STATUS_PARAM
)
1702 int16 aExp
, bExp
, zExp
;
1703 uint32_t aSig
, bSig
, zSig
;
1706 aSig
= extractFloat32Frac( a
);
1707 aExp
= extractFloat32Exp( a
);
1708 bSig
= extractFloat32Frac( b
);
1709 bExp
= extractFloat32Exp( b
);
1710 expDiff
= aExp
- bExp
;
1713 if ( 0 < expDiff
) {
1714 if ( aExp
== 0xFF ) {
1715 if ( aSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1724 shift32RightJamming( bSig
, expDiff
, &bSig
);
1727 else if ( expDiff
< 0 ) {
1728 if ( bExp
== 0xFF ) {
1729 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1730 return packFloat32( zSign
, 0xFF, 0 );
1738 shift32RightJamming( aSig
, - expDiff
, &aSig
);
1742 if ( aExp
== 0xFF ) {
1743 if ( aSig
| bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1747 if (STATUS(flush_to_zero
)) {
1749 float_raise(float_flag_output_denormal STATUS_VAR
);
1751 return packFloat32(zSign
, 0, 0);
1753 return packFloat32( zSign
, 0, ( aSig
+ bSig
)>>6 );
1755 zSig
= 0x40000000 + aSig
+ bSig
;
1760 zSig
= ( aSig
+ bSig
)<<1;
1762 if ( (int32_t) zSig
< 0 ) {
1767 return roundAndPackFloat32( zSign
, zExp
, zSig STATUS_VAR
);
1771 /*----------------------------------------------------------------------------
1772 | Returns the result of subtracting the absolute values of the single-
1773 | precision floating-point values `a' and `b'. If `zSign' is 1, the
1774 | difference is negated before being returned. `zSign' is ignored if the
1775 | result is a NaN. The subtraction is performed according to the IEC/IEEE
1776 | Standard for Binary Floating-Point Arithmetic.
1777 *----------------------------------------------------------------------------*/
1779 static float32
subFloat32Sigs( float32 a
, float32 b
, flag zSign STATUS_PARAM
)
1781 int16 aExp
, bExp
, zExp
;
1782 uint32_t aSig
, bSig
, zSig
;
1785 aSig
= extractFloat32Frac( a
);
1786 aExp
= extractFloat32Exp( a
);
1787 bSig
= extractFloat32Frac( b
);
1788 bExp
= extractFloat32Exp( b
);
1789 expDiff
= aExp
- bExp
;
1792 if ( 0 < expDiff
) goto aExpBigger
;
1793 if ( expDiff
< 0 ) goto bExpBigger
;
1794 if ( aExp
== 0xFF ) {
1795 if ( aSig
| bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1796 float_raise( float_flag_invalid STATUS_VAR
);
1797 return float32_default_nan
;
1803 if ( bSig
< aSig
) goto aBigger
;
1804 if ( aSig
< bSig
) goto bBigger
;
1805 return packFloat32( STATUS(float_rounding_mode
) == float_round_down
, 0, 0 );
1807 if ( bExp
== 0xFF ) {
1808 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1809 return packFloat32( zSign
^ 1, 0xFF, 0 );
1817 shift32RightJamming( aSig
, - expDiff
, &aSig
);
1823 goto normalizeRoundAndPack
;
1825 if ( aExp
== 0xFF ) {
1826 if ( aSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1835 shift32RightJamming( bSig
, expDiff
, &bSig
);
1840 normalizeRoundAndPack
:
1842 return normalizeRoundAndPackFloat32( zSign
, zExp
, zSig STATUS_VAR
);
1846 /*----------------------------------------------------------------------------
1847 | Returns the result of adding the single-precision floating-point values `a'
1848 | and `b'. The operation is performed according to the IEC/IEEE Standard for
1849 | Binary Floating-Point Arithmetic.
1850 *----------------------------------------------------------------------------*/
1852 float32
float32_add( float32 a
, float32 b STATUS_PARAM
)
1855 a
= float32_squash_input_denormal(a STATUS_VAR
);
1856 b
= float32_squash_input_denormal(b STATUS_VAR
);
1858 aSign
= extractFloat32Sign( a
);
1859 bSign
= extractFloat32Sign( b
);
1860 if ( aSign
== bSign
) {
1861 return addFloat32Sigs( a
, b
, aSign STATUS_VAR
);
1864 return subFloat32Sigs( a
, b
, aSign STATUS_VAR
);
1869 /*----------------------------------------------------------------------------
1870 | Returns the result of subtracting the single-precision floating-point values
1871 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1872 | for Binary Floating-Point Arithmetic.
1873 *----------------------------------------------------------------------------*/
1875 float32
float32_sub( float32 a
, float32 b STATUS_PARAM
)
1878 a
= float32_squash_input_denormal(a STATUS_VAR
);
1879 b
= float32_squash_input_denormal(b STATUS_VAR
);
1881 aSign
= extractFloat32Sign( a
);
1882 bSign
= extractFloat32Sign( b
);
1883 if ( aSign
== bSign
) {
1884 return subFloat32Sigs( a
, b
, aSign STATUS_VAR
);
1887 return addFloat32Sigs( a
, b
, aSign STATUS_VAR
);
1892 /*----------------------------------------------------------------------------
1893 | Returns the result of multiplying the single-precision floating-point values
1894 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1895 | for Binary Floating-Point Arithmetic.
1896 *----------------------------------------------------------------------------*/
1898 float32
float32_mul( float32 a
, float32 b STATUS_PARAM
)
1900 flag aSign
, bSign
, zSign
;
1901 int16 aExp
, bExp
, zExp
;
1902 uint32_t aSig
, bSig
;
1906 a
= float32_squash_input_denormal(a STATUS_VAR
);
1907 b
= float32_squash_input_denormal(b STATUS_VAR
);
1909 aSig
= extractFloat32Frac( a
);
1910 aExp
= extractFloat32Exp( a
);
1911 aSign
= extractFloat32Sign( a
);
1912 bSig
= extractFloat32Frac( b
);
1913 bExp
= extractFloat32Exp( b
);
1914 bSign
= extractFloat32Sign( b
);
1915 zSign
= aSign
^ bSign
;
1916 if ( aExp
== 0xFF ) {
1917 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
1918 return propagateFloat32NaN( a
, b STATUS_VAR
);
1920 if ( ( bExp
| bSig
) == 0 ) {
1921 float_raise( float_flag_invalid STATUS_VAR
);
1922 return float32_default_nan
;
1924 return packFloat32( zSign
, 0xFF, 0 );
1926 if ( bExp
== 0xFF ) {
1927 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1928 if ( ( aExp
| aSig
) == 0 ) {
1929 float_raise( float_flag_invalid STATUS_VAR
);
1930 return float32_default_nan
;
1932 return packFloat32( zSign
, 0xFF, 0 );
1935 if ( aSig
== 0 ) return packFloat32( zSign
, 0, 0 );
1936 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1939 if ( bSig
== 0 ) return packFloat32( zSign
, 0, 0 );
1940 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
1942 zExp
= aExp
+ bExp
- 0x7F;
1943 aSig
= ( aSig
| 0x00800000 )<<7;
1944 bSig
= ( bSig
| 0x00800000 )<<8;
1945 shift64RightJamming( ( (uint64_t) aSig
) * bSig
, 32, &zSig64
);
1947 if ( 0 <= (int32_t) ( zSig
<<1 ) ) {
1951 return roundAndPackFloat32( zSign
, zExp
, zSig STATUS_VAR
);
1955 /*----------------------------------------------------------------------------
1956 | Returns the result of dividing the single-precision floating-point value `a'
1957 | by the corresponding value `b'. The operation is performed according to the
1958 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1959 *----------------------------------------------------------------------------*/
1961 float32
float32_div( float32 a
, float32 b STATUS_PARAM
)
1963 flag aSign
, bSign
, zSign
;
1964 int16 aExp
, bExp
, zExp
;
1965 uint32_t aSig
, bSig
, zSig
;
1966 a
= float32_squash_input_denormal(a STATUS_VAR
);
1967 b
= float32_squash_input_denormal(b STATUS_VAR
);
1969 aSig
= extractFloat32Frac( a
);
1970 aExp
= extractFloat32Exp( a
);
1971 aSign
= extractFloat32Sign( a
);
1972 bSig
= extractFloat32Frac( b
);
1973 bExp
= extractFloat32Exp( b
);
1974 bSign
= extractFloat32Sign( b
);
1975 zSign
= aSign
^ bSign
;
1976 if ( aExp
== 0xFF ) {
1977 if ( aSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1978 if ( bExp
== 0xFF ) {
1979 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1980 float_raise( float_flag_invalid STATUS_VAR
);
1981 return float32_default_nan
;
1983 return packFloat32( zSign
, 0xFF, 0 );
1985 if ( bExp
== 0xFF ) {
1986 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1987 return packFloat32( zSign
, 0, 0 );
1991 if ( ( aExp
| aSig
) == 0 ) {
1992 float_raise( float_flag_invalid STATUS_VAR
);
1993 return float32_default_nan
;
1995 float_raise( float_flag_divbyzero STATUS_VAR
);
1996 return packFloat32( zSign
, 0xFF, 0 );
1998 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
2001 if ( aSig
== 0 ) return packFloat32( zSign
, 0, 0 );
2002 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
2004 zExp
= aExp
- bExp
+ 0x7D;
2005 aSig
= ( aSig
| 0x00800000 )<<7;
2006 bSig
= ( bSig
| 0x00800000 )<<8;
2007 if ( bSig
<= ( aSig
+ aSig
) ) {
2011 zSig
= ( ( (uint64_t) aSig
)<<32 ) / bSig
;
2012 if ( ( zSig
& 0x3F ) == 0 ) {
2013 zSig
|= ( (uint64_t) bSig
* zSig
!= ( (uint64_t) aSig
)<<32 );
2015 return roundAndPackFloat32( zSign
, zExp
, zSig STATUS_VAR
);
2019 /*----------------------------------------------------------------------------
2020 | Returns the remainder of the single-precision floating-point value `a'
2021 | with respect to the corresponding value `b'. The operation is performed
2022 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2023 *----------------------------------------------------------------------------*/
2025 float32
float32_rem( float32 a
, float32 b STATUS_PARAM
)
2028 int16 aExp
, bExp
, expDiff
;
2029 uint32_t aSig
, bSig
;
2031 uint64_t aSig64
, bSig64
, q64
;
2032 uint32_t alternateASig
;
2034 a
= float32_squash_input_denormal(a STATUS_VAR
);
2035 b
= float32_squash_input_denormal(b STATUS_VAR
);
2037 aSig
= extractFloat32Frac( a
);
2038 aExp
= extractFloat32Exp( a
);
2039 aSign
= extractFloat32Sign( a
);
2040 bSig
= extractFloat32Frac( b
);
2041 bExp
= extractFloat32Exp( b
);
2042 if ( aExp
== 0xFF ) {
2043 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
2044 return propagateFloat32NaN( a
, b STATUS_VAR
);
2046 float_raise( float_flag_invalid STATUS_VAR
);
2047 return float32_default_nan
;
2049 if ( bExp
== 0xFF ) {
2050 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
2055 float_raise( float_flag_invalid STATUS_VAR
);
2056 return float32_default_nan
;
2058 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
2061 if ( aSig
== 0 ) return a
;
2062 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
2064 expDiff
= aExp
- bExp
;
2067 if ( expDiff
< 32 ) {
2070 if ( expDiff
< 0 ) {
2071 if ( expDiff
< -1 ) return a
;
2074 q
= ( bSig
<= aSig
);
2075 if ( q
) aSig
-= bSig
;
2076 if ( 0 < expDiff
) {
2077 q
= ( ( (uint64_t) aSig
)<<32 ) / bSig
;
2080 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
2088 if ( bSig
<= aSig
) aSig
-= bSig
;
2089 aSig64
= ( (uint64_t) aSig
)<<40;
2090 bSig64
= ( (uint64_t) bSig
)<<40;
2092 while ( 0 < expDiff
) {
2093 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
2094 q64
= ( 2 < q64
) ? q64
- 2 : 0;
2095 aSig64
= - ( ( bSig
* q64
)<<38 );
2099 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
2100 q64
= ( 2 < q64
) ? q64
- 2 : 0;
2101 q
= q64
>>( 64 - expDiff
);
2103 aSig
= ( ( aSig64
>>33 )<<( expDiff
- 1 ) ) - bSig
* q
;
2106 alternateASig
= aSig
;
2109 } while ( 0 <= (int32_t) aSig
);
2110 sigMean
= aSig
+ alternateASig
;
2111 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
2112 aSig
= alternateASig
;
2114 zSign
= ( (int32_t) aSig
< 0 );
2115 if ( zSign
) aSig
= - aSig
;
2116 return normalizeRoundAndPackFloat32( aSign
^ zSign
, bExp
, aSig STATUS_VAR
);
2120 /*----------------------------------------------------------------------------
2121 | Returns the square root of the single-precision floating-point value `a'.
2122 | The operation is performed according to the IEC/IEEE Standard for Binary
2123 | Floating-Point Arithmetic.
2124 *----------------------------------------------------------------------------*/
2126 float32
float32_sqrt( float32 a STATUS_PARAM
)
2130 uint32_t aSig
, zSig
;
2132 a
= float32_squash_input_denormal(a STATUS_VAR
);
2134 aSig
= extractFloat32Frac( a
);
2135 aExp
= extractFloat32Exp( a
);
2136 aSign
= extractFloat32Sign( a
);
2137 if ( aExp
== 0xFF ) {
2138 if ( aSig
) return propagateFloat32NaN( a
, float32_zero STATUS_VAR
);
2139 if ( ! aSign
) return a
;
2140 float_raise( float_flag_invalid STATUS_VAR
);
2141 return float32_default_nan
;
2144 if ( ( aExp
| aSig
) == 0 ) return a
;
2145 float_raise( float_flag_invalid STATUS_VAR
);
2146 return float32_default_nan
;
2149 if ( aSig
== 0 ) return float32_zero
;
2150 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
2152 zExp
= ( ( aExp
- 0x7F )>>1 ) + 0x7E;
2153 aSig
= ( aSig
| 0x00800000 )<<8;
2154 zSig
= estimateSqrt32( aExp
, aSig
) + 2;
2155 if ( ( zSig
& 0x7F ) <= 5 ) {
2161 term
= ( (uint64_t) zSig
) * zSig
;
2162 rem
= ( ( (uint64_t) aSig
)<<32 ) - term
;
2163 while ( (int64_t) rem
< 0 ) {
2165 rem
+= ( ( (uint64_t) zSig
)<<1 ) | 1;
2167 zSig
|= ( rem
!= 0 );
2169 shift32RightJamming( zSig
, 1, &zSig
);
2171 return roundAndPackFloat32( 0, zExp
, zSig STATUS_VAR
);
2175 /*----------------------------------------------------------------------------
2176 | Returns the binary exponential of the single-precision floating-point value
2177 | `a'. The operation is performed according to the IEC/IEEE Standard for
2178 | Binary Floating-Point Arithmetic.
2180 | Uses the following identities:
2182 | 1. -------------------------------------------------------------------------
2186 | 2. -------------------------------------------------------------------------
2189 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2191 *----------------------------------------------------------------------------*/
2193 static const float64 float32_exp2_coefficients
[15] =
2195 const_float64( 0x3ff0000000000000ll
), /* 1 */
2196 const_float64( 0x3fe0000000000000ll
), /* 2 */
2197 const_float64( 0x3fc5555555555555ll
), /* 3 */
2198 const_float64( 0x3fa5555555555555ll
), /* 4 */
2199 const_float64( 0x3f81111111111111ll
), /* 5 */
2200 const_float64( 0x3f56c16c16c16c17ll
), /* 6 */
2201 const_float64( 0x3f2a01a01a01a01all
), /* 7 */
2202 const_float64( 0x3efa01a01a01a01all
), /* 8 */
2203 const_float64( 0x3ec71de3a556c734ll
), /* 9 */
2204 const_float64( 0x3e927e4fb7789f5cll
), /* 10 */
2205 const_float64( 0x3e5ae64567f544e4ll
), /* 11 */
2206 const_float64( 0x3e21eed8eff8d898ll
), /* 12 */
2207 const_float64( 0x3de6124613a86d09ll
), /* 13 */
2208 const_float64( 0x3da93974a8c07c9dll
), /* 14 */
2209 const_float64( 0x3d6ae7f3e733b81fll
), /* 15 */
2212 float32
float32_exp2( float32 a STATUS_PARAM
)
2219 a
= float32_squash_input_denormal(a STATUS_VAR
);
2221 aSig
= extractFloat32Frac( a
);
2222 aExp
= extractFloat32Exp( a
);
2223 aSign
= extractFloat32Sign( a
);
2225 if ( aExp
== 0xFF) {
2226 if ( aSig
) return propagateFloat32NaN( a
, float32_zero STATUS_VAR
);
2227 return (aSign
) ? float32_zero
: a
;
2230 if (aSig
== 0) return float32_one
;
2233 float_raise( float_flag_inexact STATUS_VAR
);
2235 /* ******************************* */
2236 /* using float64 for approximation */
2237 /* ******************************* */
2238 x
= float32_to_float64(a STATUS_VAR
);
2239 x
= float64_mul(x
, float64_ln2 STATUS_VAR
);
2243 for (i
= 0 ; i
< 15 ; i
++) {
2246 f
= float64_mul(xn
, float32_exp2_coefficients
[i
] STATUS_VAR
);
2247 r
= float64_add(r
, f STATUS_VAR
);
2249 xn
= float64_mul(xn
, x STATUS_VAR
);
2252 return float64_to_float32(r
, status
);
2255 /*----------------------------------------------------------------------------
2256 | Returns the binary log of the single-precision floating-point value `a'.
2257 | The operation is performed according to the IEC/IEEE Standard for Binary
2258 | Floating-Point Arithmetic.
2259 *----------------------------------------------------------------------------*/
2260 float32
float32_log2( float32 a STATUS_PARAM
)
2264 uint32_t aSig
, zSig
, i
;
2266 a
= float32_squash_input_denormal(a STATUS_VAR
);
2267 aSig
= extractFloat32Frac( a
);
2268 aExp
= extractFloat32Exp( a
);
2269 aSign
= extractFloat32Sign( a
);
2272 if ( aSig
== 0 ) return packFloat32( 1, 0xFF, 0 );
2273 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
2276 float_raise( float_flag_invalid STATUS_VAR
);
2277 return float32_default_nan
;
2279 if ( aExp
== 0xFF ) {
2280 if ( aSig
) return propagateFloat32NaN( a
, float32_zero STATUS_VAR
);
2289 for (i
= 1 << 22; i
> 0; i
>>= 1) {
2290 aSig
= ( (uint64_t)aSig
* aSig
) >> 23;
2291 if ( aSig
& 0x01000000 ) {
2300 return normalizeRoundAndPackFloat32( zSign
, 0x85, zSig STATUS_VAR
);
2303 /*----------------------------------------------------------------------------
2304 | Returns 1 if the single-precision floating-point value `a' is equal to
2305 | the corresponding value `b', and 0 otherwise. The invalid exception is
2306 | raised if either operand is a NaN. Otherwise, the comparison is performed
2307 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2308 *----------------------------------------------------------------------------*/
2310 int float32_eq( float32 a
, float32 b STATUS_PARAM
)
2313 a
= float32_squash_input_denormal(a STATUS_VAR
);
2314 b
= float32_squash_input_denormal(b STATUS_VAR
);
2316 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2317 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2319 float_raise( float_flag_invalid STATUS_VAR
);
2322 av
= float32_val(a
);
2323 bv
= float32_val(b
);
2324 return ( av
== bv
) || ( (uint32_t) ( ( av
| bv
)<<1 ) == 0 );
2327 /*----------------------------------------------------------------------------
2328 | Returns 1 if the single-precision floating-point value `a' is less than
2329 | or equal to the corresponding value `b', and 0 otherwise. The invalid
2330 | exception is raised if either operand is a NaN. The comparison is performed
2331 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2332 *----------------------------------------------------------------------------*/
2334 int float32_le( float32 a
, float32 b STATUS_PARAM
)
2338 a
= float32_squash_input_denormal(a STATUS_VAR
);
2339 b
= float32_squash_input_denormal(b STATUS_VAR
);
2341 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2342 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2344 float_raise( float_flag_invalid STATUS_VAR
);
2347 aSign
= extractFloat32Sign( a
);
2348 bSign
= extractFloat32Sign( b
);
2349 av
= float32_val(a
);
2350 bv
= float32_val(b
);
2351 if ( aSign
!= bSign
) return aSign
|| ( (uint32_t) ( ( av
| bv
)<<1 ) == 0 );
2352 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
2356 /*----------------------------------------------------------------------------
2357 | Returns 1 if the single-precision floating-point value `a' is less than
2358 | the corresponding value `b', and 0 otherwise. The invalid exception is
2359 | raised if either operand is a NaN. The comparison is performed according
2360 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2361 *----------------------------------------------------------------------------*/
2363 int float32_lt( float32 a
, float32 b STATUS_PARAM
)
2367 a
= float32_squash_input_denormal(a STATUS_VAR
);
2368 b
= float32_squash_input_denormal(b STATUS_VAR
);
2370 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2371 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2373 float_raise( float_flag_invalid STATUS_VAR
);
2376 aSign
= extractFloat32Sign( a
);
2377 bSign
= extractFloat32Sign( b
);
2378 av
= float32_val(a
);
2379 bv
= float32_val(b
);
2380 if ( aSign
!= bSign
) return aSign
&& ( (uint32_t) ( ( av
| bv
)<<1 ) != 0 );
2381 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
2385 /*----------------------------------------------------------------------------
2386 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2387 | be compared, and 0 otherwise. The invalid exception is raised if either
2388 | operand is a NaN. The comparison is performed according to the IEC/IEEE
2389 | Standard for Binary Floating-Point Arithmetic.
2390 *----------------------------------------------------------------------------*/
2392 int float32_unordered( float32 a
, float32 b STATUS_PARAM
)
2394 a
= float32_squash_input_denormal(a STATUS_VAR
);
2395 b
= float32_squash_input_denormal(b STATUS_VAR
);
2397 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2398 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2400 float_raise( float_flag_invalid STATUS_VAR
);
2406 /*----------------------------------------------------------------------------
2407 | Returns 1 if the single-precision floating-point value `a' is equal to
2408 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2409 | exception. The comparison is performed according to the IEC/IEEE Standard
2410 | for Binary Floating-Point Arithmetic.
2411 *----------------------------------------------------------------------------*/
2413 int float32_eq_quiet( float32 a
, float32 b STATUS_PARAM
)
2415 a
= float32_squash_input_denormal(a STATUS_VAR
);
2416 b
= float32_squash_input_denormal(b STATUS_VAR
);
2418 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2419 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2421 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
2422 float_raise( float_flag_invalid STATUS_VAR
);
2426 return ( float32_val(a
) == float32_val(b
) ) ||
2427 ( (uint32_t) ( ( float32_val(a
) | float32_val(b
) )<<1 ) == 0 );
2430 /*----------------------------------------------------------------------------
2431 | Returns 1 if the single-precision floating-point value `a' is less than or
2432 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2433 | cause an exception. Otherwise, the comparison is performed according to the
2434 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2435 *----------------------------------------------------------------------------*/
2437 int float32_le_quiet( float32 a
, float32 b STATUS_PARAM
)
2441 a
= float32_squash_input_denormal(a STATUS_VAR
);
2442 b
= float32_squash_input_denormal(b STATUS_VAR
);
2444 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2445 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2447 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
2448 float_raise( float_flag_invalid STATUS_VAR
);
2452 aSign
= extractFloat32Sign( a
);
2453 bSign
= extractFloat32Sign( b
);
2454 av
= float32_val(a
);
2455 bv
= float32_val(b
);
2456 if ( aSign
!= bSign
) return aSign
|| ( (uint32_t) ( ( av
| bv
)<<1 ) == 0 );
2457 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
2461 /*----------------------------------------------------------------------------
2462 | Returns 1 if the single-precision floating-point value `a' is less than
2463 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2464 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
2465 | Standard for Binary Floating-Point Arithmetic.
2466 *----------------------------------------------------------------------------*/
2468 int float32_lt_quiet( float32 a
, float32 b STATUS_PARAM
)
2472 a
= float32_squash_input_denormal(a STATUS_VAR
);
2473 b
= float32_squash_input_denormal(b STATUS_VAR
);
2475 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2476 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2478 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
2479 float_raise( float_flag_invalid STATUS_VAR
);
2483 aSign
= extractFloat32Sign( a
);
2484 bSign
= extractFloat32Sign( b
);
2485 av
= float32_val(a
);
2486 bv
= float32_val(b
);
2487 if ( aSign
!= bSign
) return aSign
&& ( (uint32_t) ( ( av
| bv
)<<1 ) != 0 );
2488 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
2492 /*----------------------------------------------------------------------------
2493 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2494 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
2495 | comparison is performed according to the IEC/IEEE Standard for Binary
2496 | Floating-Point Arithmetic.
2497 *----------------------------------------------------------------------------*/
2499 int float32_unordered_quiet( float32 a
, float32 b STATUS_PARAM
)
2501 a
= float32_squash_input_denormal(a STATUS_VAR
);
2502 b
= float32_squash_input_denormal(b STATUS_VAR
);
2504 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2505 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2507 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
2508 float_raise( float_flag_invalid STATUS_VAR
);
2515 /*----------------------------------------------------------------------------
2516 | Returns the result of converting the double-precision floating-point value
2517 | `a' to the 32-bit two's complement integer format. The conversion is
2518 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2519 | Arithmetic---which means in particular that the conversion is rounded
2520 | according to the current rounding mode. If `a' is a NaN, the largest
2521 | positive integer is returned. Otherwise, if the conversion overflows, the
2522 | largest integer with the same sign as `a' is returned.
2523 *----------------------------------------------------------------------------*/
2525 int32
float64_to_int32( float64 a STATUS_PARAM
)
2528 int16 aExp
, shiftCount
;
2530 a
= float64_squash_input_denormal(a STATUS_VAR
);
2532 aSig
= extractFloat64Frac( a
);
2533 aExp
= extractFloat64Exp( a
);
2534 aSign
= extractFloat64Sign( a
);
2535 if ( ( aExp
== 0x7FF ) && aSig
) aSign
= 0;
2536 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
2537 shiftCount
= 0x42C - aExp
;
2538 if ( 0 < shiftCount
) shift64RightJamming( aSig
, shiftCount
, &aSig
);
2539 return roundAndPackInt32( aSign
, aSig STATUS_VAR
);
2543 /*----------------------------------------------------------------------------
2544 | Returns the result of converting the double-precision floating-point value
2545 | `a' to the 32-bit two's complement integer format. The conversion is
2546 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2547 | Arithmetic, except that the conversion is always rounded toward zero.
2548 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2549 | the conversion overflows, the largest integer with the same sign as `a' is
2551 *----------------------------------------------------------------------------*/
2553 int32
float64_to_int32_round_to_zero( float64 a STATUS_PARAM
)
2556 int16 aExp
, shiftCount
;
2557 uint64_t aSig
, savedASig
;
2559 a
= float64_squash_input_denormal(a STATUS_VAR
);
2561 aSig
= extractFloat64Frac( a
);
2562 aExp
= extractFloat64Exp( a
);
2563 aSign
= extractFloat64Sign( a
);
2564 if ( 0x41E < aExp
) {
2565 if ( ( aExp
== 0x7FF ) && aSig
) aSign
= 0;
2568 else if ( aExp
< 0x3FF ) {
2569 if ( aExp
|| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
2572 aSig
|= LIT64( 0x0010000000000000 );
2573 shiftCount
= 0x433 - aExp
;
2575 aSig
>>= shiftCount
;
2577 if ( aSign
) z
= - z
;
2578 if ( ( z
< 0 ) ^ aSign
) {
2580 float_raise( float_flag_invalid STATUS_VAR
);
2581 return aSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
2583 if ( ( aSig
<<shiftCount
) != savedASig
) {
2584 STATUS(float_exception_flags
) |= float_flag_inexact
;
2590 /*----------------------------------------------------------------------------
2591 | Returns the result of converting the double-precision floating-point value
2592 | `a' to the 16-bit two's complement integer format. The conversion is
2593 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2594 | Arithmetic, except that the conversion is always rounded toward zero.
2595 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2596 | the conversion overflows, the largest integer with the same sign as `a' is
2598 *----------------------------------------------------------------------------*/
2600 int16
float64_to_int16_round_to_zero( float64 a STATUS_PARAM
)
2603 int16 aExp
, shiftCount
;
2604 uint64_t aSig
, savedASig
;
2607 aSig
= extractFloat64Frac( a
);
2608 aExp
= extractFloat64Exp( a
);
2609 aSign
= extractFloat64Sign( a
);
2610 if ( 0x40E < aExp
) {
2611 if ( ( aExp
== 0x7FF ) && aSig
) {
2616 else if ( aExp
< 0x3FF ) {
2617 if ( aExp
|| aSig
) {
2618 STATUS(float_exception_flags
) |= float_flag_inexact
;
2622 aSig
|= LIT64( 0x0010000000000000 );
2623 shiftCount
= 0x433 - aExp
;
2625 aSig
>>= shiftCount
;
2630 if ( ( (int16_t)z
< 0 ) ^ aSign
) {
2632 float_raise( float_flag_invalid STATUS_VAR
);
2633 return aSign
? (int32_t) 0xffff8000 : 0x7FFF;
2635 if ( ( aSig
<<shiftCount
) != savedASig
) {
2636 STATUS(float_exception_flags
) |= float_flag_inexact
;
2641 /*----------------------------------------------------------------------------
2642 | Returns the result of converting the double-precision floating-point value
2643 | `a' to the 64-bit two's complement integer format. The conversion is
2644 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2645 | Arithmetic---which means in particular that the conversion is rounded
2646 | according to the current rounding mode. If `a' is a NaN, the largest
2647 | positive integer is returned. Otherwise, if the conversion overflows, the
2648 | largest integer with the same sign as `a' is returned.
2649 *----------------------------------------------------------------------------*/
2651 int64
float64_to_int64( float64 a STATUS_PARAM
)
2654 int16 aExp
, shiftCount
;
2655 uint64_t aSig
, aSigExtra
;
2656 a
= float64_squash_input_denormal(a STATUS_VAR
);
2658 aSig
= extractFloat64Frac( a
);
2659 aExp
= extractFloat64Exp( a
);
2660 aSign
= extractFloat64Sign( a
);
2661 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
2662 shiftCount
= 0x433 - aExp
;
2663 if ( shiftCount
<= 0 ) {
2664 if ( 0x43E < aExp
) {
2665 float_raise( float_flag_invalid STATUS_VAR
);
2667 || ( ( aExp
== 0x7FF )
2668 && ( aSig
!= LIT64( 0x0010000000000000 ) ) )
2670 return LIT64( 0x7FFFFFFFFFFFFFFF );
2672 return (int64_t) LIT64( 0x8000000000000000 );
2675 aSig
<<= - shiftCount
;
2678 shift64ExtraRightJamming( aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
2680 return roundAndPackInt64( aSign
, aSig
, aSigExtra STATUS_VAR
);
2684 /*----------------------------------------------------------------------------
2685 | Returns the result of converting the double-precision floating-point value
2686 | `a' to the 64-bit two's complement integer format. The conversion is
2687 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2688 | Arithmetic, except that the conversion is always rounded toward zero.
2689 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2690 | the conversion overflows, the largest integer with the same sign as `a' is
2692 *----------------------------------------------------------------------------*/
2694 int64
float64_to_int64_round_to_zero( float64 a STATUS_PARAM
)
2697 int16 aExp
, shiftCount
;
2700 a
= float64_squash_input_denormal(a STATUS_VAR
);
2702 aSig
= extractFloat64Frac( a
);
2703 aExp
= extractFloat64Exp( a
);
2704 aSign
= extractFloat64Sign( a
);
2705 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
2706 shiftCount
= aExp
- 0x433;
2707 if ( 0 <= shiftCount
) {
2708 if ( 0x43E <= aExp
) {
2709 if ( float64_val(a
) != LIT64( 0xC3E0000000000000 ) ) {
2710 float_raise( float_flag_invalid STATUS_VAR
);
2712 || ( ( aExp
== 0x7FF )
2713 && ( aSig
!= LIT64( 0x0010000000000000 ) ) )
2715 return LIT64( 0x7FFFFFFFFFFFFFFF );
2718 return (int64_t) LIT64( 0x8000000000000000 );
2720 z
= aSig
<<shiftCount
;
2723 if ( aExp
< 0x3FE ) {
2724 if ( aExp
| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
2727 z
= aSig
>>( - shiftCount
);
2728 if ( (uint64_t) ( aSig
<<( shiftCount
& 63 ) ) ) {
2729 STATUS(float_exception_flags
) |= float_flag_inexact
;
2732 if ( aSign
) z
= - z
;
2737 /*----------------------------------------------------------------------------
2738 | Returns the result of converting the double-precision floating-point value
2739 | `a' to the single-precision floating-point format. The conversion is
2740 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2742 *----------------------------------------------------------------------------*/
2744 float32
float64_to_float32( float64 a STATUS_PARAM
)
2750 a
= float64_squash_input_denormal(a STATUS_VAR
);
2752 aSig
= extractFloat64Frac( a
);
2753 aExp
= extractFloat64Exp( a
);
2754 aSign
= extractFloat64Sign( a
);
2755 if ( aExp
== 0x7FF ) {
2756 if ( aSig
) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
2757 return packFloat32( aSign
, 0xFF, 0 );
2759 shift64RightJamming( aSig
, 22, &aSig
);
2761 if ( aExp
|| zSig
) {
2765 return roundAndPackFloat32( aSign
, aExp
, zSig STATUS_VAR
);
2770 /*----------------------------------------------------------------------------
2771 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2772 | half-precision floating-point value, returning the result. After being
2773 | shifted into the proper positions, the three fields are simply added
2774 | together to form the result. This means that any integer portion of `zSig'
2775 | will be added into the exponent. Since a properly normalized significand
2776 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2777 | than the desired result exponent whenever `zSig' is a complete, normalized
2779 *----------------------------------------------------------------------------*/
2780 static float16
packFloat16(flag zSign
, int16 zExp
, uint16_t zSig
)
2782 return make_float16(
2783 (((uint32_t)zSign
) << 15) + (((uint32_t)zExp
) << 10) + zSig
);
2786 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
2787 The latter gains extra exponent range by omitting the NaN/Inf encodings. */
2789 float32
float16_to_float32(float16 a
, flag ieee STATUS_PARAM
)
2795 aSign
= extractFloat16Sign(a
);
2796 aExp
= extractFloat16Exp(a
);
2797 aSig
= extractFloat16Frac(a
);
2799 if (aExp
== 0x1f && ieee
) {
2801 return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR
) STATUS_VAR
);
2803 return packFloat32(aSign
, 0xff, aSig
<< 13);
2809 return packFloat32(aSign
, 0, 0);
2812 shiftCount
= countLeadingZeros32( aSig
) - 21;
2813 aSig
= aSig
<< shiftCount
;
2816 return packFloat32( aSign
, aExp
+ 0x70, aSig
<< 13);
2819 float16
float32_to_float16(float32 a
, flag ieee STATUS_PARAM
)
2827 a
= float32_squash_input_denormal(a STATUS_VAR
);
2829 aSig
= extractFloat32Frac( a
);
2830 aExp
= extractFloat32Exp( a
);
2831 aSign
= extractFloat32Sign( a
);
2832 if ( aExp
== 0xFF ) {
2834 /* Input is a NaN */
2835 float16 r
= commonNaNToFloat16( float32ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
2837 return packFloat16(aSign
, 0, 0);
2843 float_raise(float_flag_invalid STATUS_VAR
);
2844 return packFloat16(aSign
, 0x1f, 0x3ff);
2846 return packFloat16(aSign
, 0x1f, 0);
2848 if (aExp
== 0 && aSig
== 0) {
2849 return packFloat16(aSign
, 0, 0);
2851 /* Decimal point between bits 22 and 23. */
2863 float_raise( float_flag_underflow STATUS_VAR
);
2864 roundingMode
= STATUS(float_rounding_mode
);
2865 switch (roundingMode
) {
2866 case float_round_nearest_even
:
2867 increment
= (mask
+ 1) >> 1;
2868 if ((aSig
& mask
) == increment
) {
2869 increment
= aSig
& (increment
<< 1);
2872 case float_round_up
:
2873 increment
= aSign
? 0 : mask
;
2875 case float_round_down
:
2876 increment
= aSign
? mask
: 0;
2878 default: /* round_to_zero */
2883 if (aSig
>= 0x01000000) {
2887 } else if (aExp
< -14
2888 && STATUS(float_detect_tininess
) == float_tininess_before_rounding
) {
2889 float_raise( float_flag_underflow STATUS_VAR
);
2894 float_raise( float_flag_overflow
| float_flag_inexact STATUS_VAR
);
2895 return packFloat16(aSign
, 0x1f, 0);
2899 float_raise(float_flag_invalid
| float_flag_inexact STATUS_VAR
);
2900 return packFloat16(aSign
, 0x1f, 0x3ff);
2904 return packFloat16(aSign
, 0, 0);
2907 aSig
>>= -14 - aExp
;
2910 return packFloat16(aSign
, aExp
+ 14, aSig
>> 13);
2913 /*----------------------------------------------------------------------------
2914 | Returns the result of converting the double-precision floating-point value
2915 | `a' to the extended double-precision floating-point format. The conversion
2916 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2918 *----------------------------------------------------------------------------*/
2920 floatx80
float64_to_floatx80( float64 a STATUS_PARAM
)
2926 a
= float64_squash_input_denormal(a STATUS_VAR
);
2927 aSig
= extractFloat64Frac( a
);
2928 aExp
= extractFloat64Exp( a
);
2929 aSign
= extractFloat64Sign( a
);
2930 if ( aExp
== 0x7FF ) {
2931 if ( aSig
) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
2932 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
2935 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
2936 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2940 aSign
, aExp
+ 0x3C00, ( aSig
| LIT64( 0x0010000000000000 ) )<<11 );
2944 /*----------------------------------------------------------------------------
2945 | Returns the result of converting the double-precision floating-point value
2946 | `a' to the quadruple-precision floating-point format. The conversion is
2947 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2949 *----------------------------------------------------------------------------*/
2951 float128
float64_to_float128( float64 a STATUS_PARAM
)
2955 uint64_t aSig
, zSig0
, zSig1
;
2957 a
= float64_squash_input_denormal(a STATUS_VAR
);
2958 aSig
= extractFloat64Frac( a
);
2959 aExp
= extractFloat64Exp( a
);
2960 aSign
= extractFloat64Sign( a
);
2961 if ( aExp
== 0x7FF ) {
2962 if ( aSig
) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
2963 return packFloat128( aSign
, 0x7FFF, 0, 0 );
2966 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
2967 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2970 shift128Right( aSig
, 0, 4, &zSig0
, &zSig1
);
2971 return packFloat128( aSign
, aExp
+ 0x3C00, zSig0
, zSig1
);
2975 /*----------------------------------------------------------------------------
2976 | Rounds the double-precision floating-point value `a' to an integer, and
2977 | returns the result as a double-precision floating-point value. The
2978 | operation is performed according to the IEC/IEEE Standard for Binary
2979 | Floating-Point Arithmetic.
2980 *----------------------------------------------------------------------------*/
2982 float64
float64_round_to_int( float64 a STATUS_PARAM
)
2986 uint64_t lastBitMask
, roundBitsMask
;
2989 a
= float64_squash_input_denormal(a STATUS_VAR
);
2991 aExp
= extractFloat64Exp( a
);
2992 if ( 0x433 <= aExp
) {
2993 if ( ( aExp
== 0x7FF ) && extractFloat64Frac( a
) ) {
2994 return propagateFloat64NaN( a
, a STATUS_VAR
);
2998 if ( aExp
< 0x3FF ) {
2999 if ( (uint64_t) ( float64_val(a
)<<1 ) == 0 ) return a
;
3000 STATUS(float_exception_flags
) |= float_flag_inexact
;
3001 aSign
= extractFloat64Sign( a
);
3002 switch ( STATUS(float_rounding_mode
) ) {
3003 case float_round_nearest_even
:
3004 if ( ( aExp
== 0x3FE ) && extractFloat64Frac( a
) ) {
3005 return packFloat64( aSign
, 0x3FF, 0 );
3008 case float_round_down
:
3009 return make_float64(aSign
? LIT64( 0xBFF0000000000000 ) : 0);
3010 case float_round_up
:
3011 return make_float64(
3012 aSign
? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
3014 return packFloat64( aSign
, 0, 0 );
3017 lastBitMask
<<= 0x433 - aExp
;
3018 roundBitsMask
= lastBitMask
- 1;
3020 roundingMode
= STATUS(float_rounding_mode
);
3021 if ( roundingMode
== float_round_nearest_even
) {
3022 z
+= lastBitMask
>>1;
3023 if ( ( z
& roundBitsMask
) == 0 ) z
&= ~ lastBitMask
;
3025 else if ( roundingMode
!= float_round_to_zero
) {
3026 if ( extractFloat64Sign( make_float64(z
) ) ^ ( roundingMode
== float_round_up
) ) {
3030 z
&= ~ roundBitsMask
;
3031 if ( z
!= float64_val(a
) )
3032 STATUS(float_exception_flags
) |= float_flag_inexact
;
3033 return make_float64(z
);
3037 float64
float64_trunc_to_int( float64 a STATUS_PARAM
)
3041 oldmode
= STATUS(float_rounding_mode
);
3042 STATUS(float_rounding_mode
) = float_round_to_zero
;
3043 res
= float64_round_to_int(a STATUS_VAR
);
3044 STATUS(float_rounding_mode
) = oldmode
;
3048 /*----------------------------------------------------------------------------
3049 | Returns the result of adding the absolute values of the double-precision
3050 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
3051 | before being returned. `zSign' is ignored if the result is a NaN.
3052 | The addition is performed according to the IEC/IEEE Standard for Binary
3053 | Floating-Point Arithmetic.
3054 *----------------------------------------------------------------------------*/
3056 static float64
addFloat64Sigs( float64 a
, float64 b
, flag zSign STATUS_PARAM
)
3058 int16 aExp
, bExp
, zExp
;
3059 uint64_t aSig
, bSig
, zSig
;
3062 aSig
= extractFloat64Frac( a
);
3063 aExp
= extractFloat64Exp( a
);
3064 bSig
= extractFloat64Frac( b
);
3065 bExp
= extractFloat64Exp( b
);
3066 expDiff
= aExp
- bExp
;
3069 if ( 0 < expDiff
) {
3070 if ( aExp
== 0x7FF ) {
3071 if ( aSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3078 bSig
|= LIT64( 0x2000000000000000 );
3080 shift64RightJamming( bSig
, expDiff
, &bSig
);
3083 else if ( expDiff
< 0 ) {
3084 if ( bExp
== 0x7FF ) {
3085 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3086 return packFloat64( zSign
, 0x7FF, 0 );
3092 aSig
|= LIT64( 0x2000000000000000 );
3094 shift64RightJamming( aSig
, - expDiff
, &aSig
);
3098 if ( aExp
== 0x7FF ) {
3099 if ( aSig
| bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3103 if (STATUS(flush_to_zero
)) {
3105 float_raise(float_flag_output_denormal STATUS_VAR
);
3107 return packFloat64(zSign
, 0, 0);
3109 return packFloat64( zSign
, 0, ( aSig
+ bSig
)>>9 );
3111 zSig
= LIT64( 0x4000000000000000 ) + aSig
+ bSig
;
3115 aSig
|= LIT64( 0x2000000000000000 );
3116 zSig
= ( aSig
+ bSig
)<<1;
3118 if ( (int64_t) zSig
< 0 ) {
3123 return roundAndPackFloat64( zSign
, zExp
, zSig STATUS_VAR
);
3127 /*----------------------------------------------------------------------------
3128 | Returns the result of subtracting the absolute values of the double-
3129 | precision floating-point values `a' and `b'. If `zSign' is 1, the
3130 | difference is negated before being returned. `zSign' is ignored if the
3131 | result is a NaN. The subtraction is performed according to the IEC/IEEE
3132 | Standard for Binary Floating-Point Arithmetic.
3133 *----------------------------------------------------------------------------*/
3135 static float64
subFloat64Sigs( float64 a
, float64 b
, flag zSign STATUS_PARAM
)
3137 int16 aExp
, bExp
, zExp
;
3138 uint64_t aSig
, bSig
, zSig
;
3141 aSig
= extractFloat64Frac( a
);
3142 aExp
= extractFloat64Exp( a
);
3143 bSig
= extractFloat64Frac( b
);
3144 bExp
= extractFloat64Exp( b
);
3145 expDiff
= aExp
- bExp
;
3148 if ( 0 < expDiff
) goto aExpBigger
;
3149 if ( expDiff
< 0 ) goto bExpBigger
;
3150 if ( aExp
== 0x7FF ) {
3151 if ( aSig
| bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3152 float_raise( float_flag_invalid STATUS_VAR
);
3153 return float64_default_nan
;
3159 if ( bSig
< aSig
) goto aBigger
;
3160 if ( aSig
< bSig
) goto bBigger
;
3161 return packFloat64( STATUS(float_rounding_mode
) == float_round_down
, 0, 0 );
3163 if ( bExp
== 0x7FF ) {
3164 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3165 return packFloat64( zSign
^ 1, 0x7FF, 0 );
3171 aSig
|= LIT64( 0x4000000000000000 );
3173 shift64RightJamming( aSig
, - expDiff
, &aSig
);
3174 bSig
|= LIT64( 0x4000000000000000 );
3179 goto normalizeRoundAndPack
;
3181 if ( aExp
== 0x7FF ) {
3182 if ( aSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3189 bSig
|= LIT64( 0x4000000000000000 );
3191 shift64RightJamming( bSig
, expDiff
, &bSig
);
3192 aSig
|= LIT64( 0x4000000000000000 );
3196 normalizeRoundAndPack
:
3198 return normalizeRoundAndPackFloat64( zSign
, zExp
, zSig STATUS_VAR
);
3202 /*----------------------------------------------------------------------------
3203 | Returns the result of adding the double-precision floating-point values `a'
3204 | and `b'. The operation is performed according to the IEC/IEEE Standard for
3205 | Binary Floating-Point Arithmetic.
3206 *----------------------------------------------------------------------------*/
3208 float64
float64_add( float64 a
, float64 b STATUS_PARAM
)
3211 a
= float64_squash_input_denormal(a STATUS_VAR
);
3212 b
= float64_squash_input_denormal(b STATUS_VAR
);
3214 aSign
= extractFloat64Sign( a
);
3215 bSign
= extractFloat64Sign( b
);
3216 if ( aSign
== bSign
) {
3217 return addFloat64Sigs( a
, b
, aSign STATUS_VAR
);
3220 return subFloat64Sigs( a
, b
, aSign STATUS_VAR
);
3225 /*----------------------------------------------------------------------------
3226 | Returns the result of subtracting the double-precision floating-point values
3227 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3228 | for Binary Floating-Point Arithmetic.
3229 *----------------------------------------------------------------------------*/
3231 float64
float64_sub( float64 a
, float64 b STATUS_PARAM
)
3234 a
= float64_squash_input_denormal(a STATUS_VAR
);
3235 b
= float64_squash_input_denormal(b STATUS_VAR
);
3237 aSign
= extractFloat64Sign( a
);
3238 bSign
= extractFloat64Sign( b
);
3239 if ( aSign
== bSign
) {
3240 return subFloat64Sigs( a
, b
, aSign STATUS_VAR
);
3243 return addFloat64Sigs( a
, b
, aSign STATUS_VAR
);
3248 /*----------------------------------------------------------------------------
3249 | Returns the result of multiplying the double-precision floating-point values
3250 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3251 | for Binary Floating-Point Arithmetic.
3252 *----------------------------------------------------------------------------*/
3254 float64
float64_mul( float64 a
, float64 b STATUS_PARAM
)
3256 flag aSign
, bSign
, zSign
;
3257 int16 aExp
, bExp
, zExp
;
3258 uint64_t aSig
, bSig
, zSig0
, zSig1
;
3260 a
= float64_squash_input_denormal(a STATUS_VAR
);
3261 b
= float64_squash_input_denormal(b STATUS_VAR
);
3263 aSig
= extractFloat64Frac( a
);
3264 aExp
= extractFloat64Exp( a
);
3265 aSign
= extractFloat64Sign( a
);
3266 bSig
= extractFloat64Frac( b
);
3267 bExp
= extractFloat64Exp( b
);
3268 bSign
= extractFloat64Sign( b
);
3269 zSign
= aSign
^ bSign
;
3270 if ( aExp
== 0x7FF ) {
3271 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
3272 return propagateFloat64NaN( a
, b STATUS_VAR
);
3274 if ( ( bExp
| bSig
) == 0 ) {
3275 float_raise( float_flag_invalid STATUS_VAR
);
3276 return float64_default_nan
;
3278 return packFloat64( zSign
, 0x7FF, 0 );
3280 if ( bExp
== 0x7FF ) {
3281 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3282 if ( ( aExp
| aSig
) == 0 ) {
3283 float_raise( float_flag_invalid STATUS_VAR
);
3284 return float64_default_nan
;
3286 return packFloat64( zSign
, 0x7FF, 0 );
3289 if ( aSig
== 0 ) return packFloat64( zSign
, 0, 0 );
3290 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
3293 if ( bSig
== 0 ) return packFloat64( zSign
, 0, 0 );
3294 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
3296 zExp
= aExp
+ bExp
- 0x3FF;
3297 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<10;
3298 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
3299 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
3300 zSig0
|= ( zSig1
!= 0 );
3301 if ( 0 <= (int64_t) ( zSig0
<<1 ) ) {
3305 return roundAndPackFloat64( zSign
, zExp
, zSig0 STATUS_VAR
);
3309 /*----------------------------------------------------------------------------
3310 | Returns the result of dividing the double-precision floating-point value `a'
3311 | by the corresponding value `b'. The operation is performed according to
3312 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3313 *----------------------------------------------------------------------------*/
3315 float64
float64_div( float64 a
, float64 b STATUS_PARAM
)
3317 flag aSign
, bSign
, zSign
;
3318 int16 aExp
, bExp
, zExp
;
3319 uint64_t aSig
, bSig
, zSig
;
3320 uint64_t rem0
, rem1
;
3321 uint64_t term0
, term1
;
3322 a
= float64_squash_input_denormal(a STATUS_VAR
);
3323 b
= float64_squash_input_denormal(b STATUS_VAR
);
3325 aSig
= extractFloat64Frac( a
);
3326 aExp
= extractFloat64Exp( a
);
3327 aSign
= extractFloat64Sign( a
);
3328 bSig
= extractFloat64Frac( b
);
3329 bExp
= extractFloat64Exp( b
);
3330 bSign
= extractFloat64Sign( b
);
3331 zSign
= aSign
^ bSign
;
3332 if ( aExp
== 0x7FF ) {
3333 if ( aSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3334 if ( bExp
== 0x7FF ) {
3335 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3336 float_raise( float_flag_invalid STATUS_VAR
);
3337 return float64_default_nan
;
3339 return packFloat64( zSign
, 0x7FF, 0 );
3341 if ( bExp
== 0x7FF ) {
3342 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3343 return packFloat64( zSign
, 0, 0 );
3347 if ( ( aExp
| aSig
) == 0 ) {
3348 float_raise( float_flag_invalid STATUS_VAR
);
3349 return float64_default_nan
;
3351 float_raise( float_flag_divbyzero STATUS_VAR
);
3352 return packFloat64( zSign
, 0x7FF, 0 );
3354 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
3357 if ( aSig
== 0 ) return packFloat64( zSign
, 0, 0 );
3358 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
3360 zExp
= aExp
- bExp
+ 0x3FD;
3361 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<10;
3362 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
3363 if ( bSig
<= ( aSig
+ aSig
) ) {
3367 zSig
= estimateDiv128To64( aSig
, 0, bSig
);
3368 if ( ( zSig
& 0x1FF ) <= 2 ) {
3369 mul64To128( bSig
, zSig
, &term0
, &term1
);
3370 sub128( aSig
, 0, term0
, term1
, &rem0
, &rem1
);
3371 while ( (int64_t) rem0
< 0 ) {
3373 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
3375 zSig
|= ( rem1
!= 0 );
3377 return roundAndPackFloat64( zSign
, zExp
, zSig STATUS_VAR
);
3381 /*----------------------------------------------------------------------------
3382 | Returns the remainder of the double-precision floating-point value `a'
3383 | with respect to the corresponding value `b'. The operation is performed
3384 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3385 *----------------------------------------------------------------------------*/
3387 float64
float64_rem( float64 a
, float64 b STATUS_PARAM
)
3390 int16 aExp
, bExp
, expDiff
;
3391 uint64_t aSig
, bSig
;
3392 uint64_t q
, alternateASig
;
3395 a
= float64_squash_input_denormal(a STATUS_VAR
);
3396 b
= float64_squash_input_denormal(b STATUS_VAR
);
3397 aSig
= extractFloat64Frac( a
);
3398 aExp
= extractFloat64Exp( a
);
3399 aSign
= extractFloat64Sign( a
);
3400 bSig
= extractFloat64Frac( b
);
3401 bExp
= extractFloat64Exp( b
);
3402 if ( aExp
== 0x7FF ) {
3403 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
3404 return propagateFloat64NaN( a
, b STATUS_VAR
);
3406 float_raise( float_flag_invalid STATUS_VAR
);
3407 return float64_default_nan
;
3409 if ( bExp
== 0x7FF ) {
3410 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3415 float_raise( float_flag_invalid STATUS_VAR
);
3416 return float64_default_nan
;
3418 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
3421 if ( aSig
== 0 ) return a
;
3422 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
3424 expDiff
= aExp
- bExp
;
3425 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<11;
3426 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
3427 if ( expDiff
< 0 ) {
3428 if ( expDiff
< -1 ) return a
;
3431 q
= ( bSig
<= aSig
);
3432 if ( q
) aSig
-= bSig
;
3434 while ( 0 < expDiff
) {
3435 q
= estimateDiv128To64( aSig
, 0, bSig
);
3436 q
= ( 2 < q
) ? q
- 2 : 0;
3437 aSig
= - ( ( bSig
>>2 ) * q
);
3441 if ( 0 < expDiff
) {
3442 q
= estimateDiv128To64( aSig
, 0, bSig
);
3443 q
= ( 2 < q
) ? q
- 2 : 0;
3446 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
3453 alternateASig
= aSig
;
3456 } while ( 0 <= (int64_t) aSig
);
3457 sigMean
= aSig
+ alternateASig
;
3458 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
3459 aSig
= alternateASig
;
3461 zSign
= ( (int64_t) aSig
< 0 );
3462 if ( zSign
) aSig
= - aSig
;
3463 return normalizeRoundAndPackFloat64( aSign
^ zSign
, bExp
, aSig STATUS_VAR
);
3467 /*----------------------------------------------------------------------------
3468 | Returns the square root of the double-precision floating-point value `a'.
3469 | The operation is performed according to the IEC/IEEE Standard for Binary
3470 | Floating-Point Arithmetic.
3471 *----------------------------------------------------------------------------*/
3473 float64
float64_sqrt( float64 a STATUS_PARAM
)
3477 uint64_t aSig
, zSig
, doubleZSig
;
3478 uint64_t rem0
, rem1
, term0
, term1
;
3479 a
= float64_squash_input_denormal(a STATUS_VAR
);
3481 aSig
= extractFloat64Frac( a
);
3482 aExp
= extractFloat64Exp( a
);
3483 aSign
= extractFloat64Sign( a
);
3484 if ( aExp
== 0x7FF ) {
3485 if ( aSig
) return propagateFloat64NaN( a
, a STATUS_VAR
);
3486 if ( ! aSign
) return a
;
3487 float_raise( float_flag_invalid STATUS_VAR
);
3488 return float64_default_nan
;
3491 if ( ( aExp
| aSig
) == 0 ) return a
;
3492 float_raise( float_flag_invalid STATUS_VAR
);
3493 return float64_default_nan
;
3496 if ( aSig
== 0 ) return float64_zero
;
3497 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
3499 zExp
= ( ( aExp
- 0x3FF )>>1 ) + 0x3FE;
3500 aSig
|= LIT64( 0x0010000000000000 );
3501 zSig
= estimateSqrt32( aExp
, aSig
>>21 );
3502 aSig
<<= 9 - ( aExp
& 1 );
3503 zSig
= estimateDiv128To64( aSig
, 0, zSig
<<32 ) + ( zSig
<<30 );
3504 if ( ( zSig
& 0x1FF ) <= 5 ) {
3505 doubleZSig
= zSig
<<1;
3506 mul64To128( zSig
, zSig
, &term0
, &term1
);
3507 sub128( aSig
, 0, term0
, term1
, &rem0
, &rem1
);
3508 while ( (int64_t) rem0
< 0 ) {
3511 add128( rem0
, rem1
, zSig
>>63, doubleZSig
| 1, &rem0
, &rem1
);
3513 zSig
|= ( ( rem0
| rem1
) != 0 );
3515 return roundAndPackFloat64( 0, zExp
, zSig STATUS_VAR
);
3519 /*----------------------------------------------------------------------------
3520 | Returns the binary log of the double-precision floating-point value `a'.
3521 | The operation is performed according to the IEC/IEEE Standard for Binary
3522 | Floating-Point Arithmetic.
3523 *----------------------------------------------------------------------------*/
3524 float64
float64_log2( float64 a STATUS_PARAM
)
3528 uint64_t aSig
, aSig0
, aSig1
, zSig
, i
;
3529 a
= float64_squash_input_denormal(a STATUS_VAR
);
3531 aSig
= extractFloat64Frac( a
);
3532 aExp
= extractFloat64Exp( a
);
3533 aSign
= extractFloat64Sign( a
);
3536 if ( aSig
== 0 ) return packFloat64( 1, 0x7FF, 0 );
3537 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
3540 float_raise( float_flag_invalid STATUS_VAR
);
3541 return float64_default_nan
;
3543 if ( aExp
== 0x7FF ) {
3544 if ( aSig
) return propagateFloat64NaN( a
, float64_zero STATUS_VAR
);
3549 aSig
|= LIT64( 0x0010000000000000 );
3551 zSig
= (uint64_t)aExp
<< 52;
3552 for (i
= 1LL << 51; i
> 0; i
>>= 1) {
3553 mul64To128( aSig
, aSig
, &aSig0
, &aSig1
);
3554 aSig
= ( aSig0
<< 12 ) | ( aSig1
>> 52 );
3555 if ( aSig
& LIT64( 0x0020000000000000 ) ) {
3563 return normalizeRoundAndPackFloat64( zSign
, 0x408, zSig STATUS_VAR
);
3566 /*----------------------------------------------------------------------------
3567 | Returns 1 if the double-precision floating-point value `a' is equal to the
3568 | corresponding value `b', and 0 otherwise. The invalid exception is raised
3569 | if either operand is a NaN. Otherwise, the comparison is performed
3570 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3571 *----------------------------------------------------------------------------*/
3573 int float64_eq( float64 a
, float64 b STATUS_PARAM
)
3576 a
= float64_squash_input_denormal(a STATUS_VAR
);
3577 b
= float64_squash_input_denormal(b STATUS_VAR
);
3579 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3580 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3582 float_raise( float_flag_invalid STATUS_VAR
);
3585 av
= float64_val(a
);
3586 bv
= float64_val(b
);
3587 return ( av
== bv
) || ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
3591 /*----------------------------------------------------------------------------
3592 | Returns 1 if the double-precision floating-point value `a' is less than or
3593 | equal to the corresponding value `b', and 0 otherwise. The invalid
3594 | exception is raised if either operand is a NaN. The comparison is performed
3595 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3596 *----------------------------------------------------------------------------*/
3598 int float64_le( float64 a
, float64 b STATUS_PARAM
)
3602 a
= float64_squash_input_denormal(a STATUS_VAR
);
3603 b
= float64_squash_input_denormal(b STATUS_VAR
);
3605 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3606 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3608 float_raise( float_flag_invalid STATUS_VAR
);
3611 aSign
= extractFloat64Sign( a
);
3612 bSign
= extractFloat64Sign( b
);
3613 av
= float64_val(a
);
3614 bv
= float64_val(b
);
3615 if ( aSign
!= bSign
) return aSign
|| ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
3616 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
3620 /*----------------------------------------------------------------------------
3621 | Returns 1 if the double-precision floating-point value `a' is less than
3622 | the corresponding value `b', and 0 otherwise. The invalid exception is
3623 | raised if either operand is a NaN. The comparison is performed according
3624 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3625 *----------------------------------------------------------------------------*/
3627 int float64_lt( float64 a
, float64 b STATUS_PARAM
)
3632 a
= float64_squash_input_denormal(a STATUS_VAR
);
3633 b
= float64_squash_input_denormal(b STATUS_VAR
);
3634 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3635 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3637 float_raise( float_flag_invalid STATUS_VAR
);
3640 aSign
= extractFloat64Sign( a
);
3641 bSign
= extractFloat64Sign( b
);
3642 av
= float64_val(a
);
3643 bv
= float64_val(b
);
3644 if ( aSign
!= bSign
) return aSign
&& ( (uint64_t) ( ( av
| bv
)<<1 ) != 0 );
3645 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
3649 /*----------------------------------------------------------------------------
3650 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
3651 | be compared, and 0 otherwise. The invalid exception is raised if either
3652 | operand is a NaN. The comparison is performed according to the IEC/IEEE
3653 | Standard for Binary Floating-Point Arithmetic.
3654 *----------------------------------------------------------------------------*/
3656 int float64_unordered( float64 a
, float64 b STATUS_PARAM
)
3658 a
= float64_squash_input_denormal(a STATUS_VAR
);
3659 b
= float64_squash_input_denormal(b STATUS_VAR
);
3661 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3662 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3664 float_raise( float_flag_invalid STATUS_VAR
);
3670 /*----------------------------------------------------------------------------
3671 | Returns 1 if the double-precision floating-point value `a' is equal to the
3672 | corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3673 | exception.The comparison is performed according to the IEC/IEEE Standard
3674 | for Binary Floating-Point Arithmetic.
3675 *----------------------------------------------------------------------------*/
3677 int float64_eq_quiet( float64 a
, float64 b STATUS_PARAM
)
3680 a
= float64_squash_input_denormal(a STATUS_VAR
);
3681 b
= float64_squash_input_denormal(b STATUS_VAR
);
3683 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3684 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3686 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
3687 float_raise( float_flag_invalid STATUS_VAR
);
3691 av
= float64_val(a
);
3692 bv
= float64_val(b
);
3693 return ( av
== bv
) || ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
3697 /*----------------------------------------------------------------------------
3698 | Returns 1 if the double-precision floating-point value `a' is less than or
3699 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3700 | cause an exception. Otherwise, the comparison is performed according to the
3701 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3702 *----------------------------------------------------------------------------*/
3704 int float64_le_quiet( float64 a
, float64 b STATUS_PARAM
)
3708 a
= float64_squash_input_denormal(a STATUS_VAR
);
3709 b
= float64_squash_input_denormal(b STATUS_VAR
);
3711 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3712 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3714 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
3715 float_raise( float_flag_invalid STATUS_VAR
);
3719 aSign
= extractFloat64Sign( a
);
3720 bSign
= extractFloat64Sign( b
);
3721 av
= float64_val(a
);
3722 bv
= float64_val(b
);
3723 if ( aSign
!= bSign
) return aSign
|| ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
3724 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
3728 /*----------------------------------------------------------------------------
3729 | Returns 1 if the double-precision floating-point value `a' is less than
3730 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3731 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
3732 | Standard for Binary Floating-Point Arithmetic.
3733 *----------------------------------------------------------------------------*/
3735 int float64_lt_quiet( float64 a
, float64 b STATUS_PARAM
)
3739 a
= float64_squash_input_denormal(a STATUS_VAR
);
3740 b
= float64_squash_input_denormal(b STATUS_VAR
);
3742 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3743 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3745 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
3746 float_raise( float_flag_invalid STATUS_VAR
);
3750 aSign
= extractFloat64Sign( a
);
3751 bSign
= extractFloat64Sign( b
);
3752 av
= float64_val(a
);
3753 bv
= float64_val(b
);
3754 if ( aSign
!= bSign
) return aSign
&& ( (uint64_t) ( ( av
| bv
)<<1 ) != 0 );
3755 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
3759 /*----------------------------------------------------------------------------
3760 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
3761 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
3762 | comparison is performed according to the IEC/IEEE Standard for Binary
3763 | Floating-Point Arithmetic.
3764 *----------------------------------------------------------------------------*/
3766 int float64_unordered_quiet( float64 a
, float64 b STATUS_PARAM
)
3768 a
= float64_squash_input_denormal(a STATUS_VAR
);
3769 b
= float64_squash_input_denormal(b STATUS_VAR
);
3771 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3772 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3774 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
3775 float_raise( float_flag_invalid STATUS_VAR
);
3782 /*----------------------------------------------------------------------------
3783 | Returns the result of converting the extended double-precision floating-
3784 | point value `a' to the 32-bit two's complement integer format. The
3785 | conversion is performed according to the IEC/IEEE Standard for Binary
3786 | Floating-Point Arithmetic---which means in particular that the conversion
3787 | is rounded according to the current rounding mode. If `a' is a NaN, the
3788 | largest positive integer is returned. Otherwise, if the conversion
3789 | overflows, the largest integer with the same sign as `a' is returned.
3790 *----------------------------------------------------------------------------*/
3792 int32
floatx80_to_int32( floatx80 a STATUS_PARAM
)
3795 int32 aExp
, shiftCount
;
3798 aSig
= extractFloatx80Frac( a
);
3799 aExp
= extractFloatx80Exp( a
);
3800 aSign
= extractFloatx80Sign( a
);
3801 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) aSign
= 0;
3802 shiftCount
= 0x4037 - aExp
;
3803 if ( shiftCount
<= 0 ) shiftCount
= 1;
3804 shift64RightJamming( aSig
, shiftCount
, &aSig
);
3805 return roundAndPackInt32( aSign
, aSig STATUS_VAR
);
3809 /*----------------------------------------------------------------------------
3810 | Returns the result of converting the extended double-precision floating-
3811 | point value `a' to the 32-bit two's complement integer format. The
3812 | conversion is performed according to the IEC/IEEE Standard for Binary
3813 | Floating-Point Arithmetic, except that the conversion is always rounded
3814 | toward zero. If `a' is a NaN, the largest positive integer is returned.
3815 | Otherwise, if the conversion overflows, the largest integer with the same
3816 | sign as `a' is returned.
3817 *----------------------------------------------------------------------------*/
3819 int32
floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM
)
3822 int32 aExp
, shiftCount
;
3823 uint64_t aSig
, savedASig
;
3826 aSig
= extractFloatx80Frac( a
);
3827 aExp
= extractFloatx80Exp( a
);
3828 aSign
= extractFloatx80Sign( a
);
3829 if ( 0x401E < aExp
) {
3830 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) aSign
= 0;
3833 else if ( aExp
< 0x3FFF ) {
3834 if ( aExp
|| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
3837 shiftCount
= 0x403E - aExp
;
3839 aSig
>>= shiftCount
;
3841 if ( aSign
) z
= - z
;
3842 if ( ( z
< 0 ) ^ aSign
) {
3844 float_raise( float_flag_invalid STATUS_VAR
);
3845 return aSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
3847 if ( ( aSig
<<shiftCount
) != savedASig
) {
3848 STATUS(float_exception_flags
) |= float_flag_inexact
;
3854 /*----------------------------------------------------------------------------
3855 | Returns the result of converting the extended double-precision floating-
3856 | point value `a' to the 64-bit two's complement integer format. The
3857 | conversion is performed according to the IEC/IEEE Standard for Binary
3858 | Floating-Point Arithmetic---which means in particular that the conversion
3859 | is rounded according to the current rounding mode. If `a' is a NaN,
3860 | the largest positive integer is returned. Otherwise, if the conversion
3861 | overflows, the largest integer with the same sign as `a' is returned.
3862 *----------------------------------------------------------------------------*/
3864 int64
floatx80_to_int64( floatx80 a STATUS_PARAM
)
3867 int32 aExp
, shiftCount
;
3868 uint64_t aSig
, aSigExtra
;
3870 aSig
= extractFloatx80Frac( a
);
3871 aExp
= extractFloatx80Exp( a
);
3872 aSign
= extractFloatx80Sign( a
);
3873 shiftCount
= 0x403E - aExp
;
3874 if ( shiftCount
<= 0 ) {
3876 float_raise( float_flag_invalid STATUS_VAR
);
3878 || ( ( aExp
== 0x7FFF )
3879 && ( aSig
!= LIT64( 0x8000000000000000 ) ) )
3881 return LIT64( 0x7FFFFFFFFFFFFFFF );
3883 return (int64_t) LIT64( 0x8000000000000000 );
3888 shift64ExtraRightJamming( aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
3890 return roundAndPackInt64( aSign
, aSig
, aSigExtra STATUS_VAR
);
3894 /*----------------------------------------------------------------------------
3895 | Returns the result of converting the extended double-precision floating-
3896 | point value `a' to the 64-bit two's complement integer format. The
3897 | conversion is performed according to the IEC/IEEE Standard for Binary
3898 | Floating-Point Arithmetic, except that the conversion is always rounded
3899 | toward zero. If `a' is a NaN, the largest positive integer is returned.
3900 | Otherwise, if the conversion overflows, the largest integer with the same
3901 | sign as `a' is returned.
3902 *----------------------------------------------------------------------------*/
3904 int64
floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM
)
3907 int32 aExp
, shiftCount
;
3911 aSig
= extractFloatx80Frac( a
);
3912 aExp
= extractFloatx80Exp( a
);
3913 aSign
= extractFloatx80Sign( a
);
3914 shiftCount
= aExp
- 0x403E;
3915 if ( 0 <= shiftCount
) {
3916 aSig
&= LIT64( 0x7FFFFFFFFFFFFFFF );
3917 if ( ( a
.high
!= 0xC03E ) || aSig
) {
3918 float_raise( float_flag_invalid STATUS_VAR
);
3919 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && aSig
) ) {
3920 return LIT64( 0x7FFFFFFFFFFFFFFF );
3923 return (int64_t) LIT64( 0x8000000000000000 );
3925 else if ( aExp
< 0x3FFF ) {
3926 if ( aExp
| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
3929 z
= aSig
>>( - shiftCount
);
3930 if ( (uint64_t) ( aSig
<<( shiftCount
& 63 ) ) ) {
3931 STATUS(float_exception_flags
) |= float_flag_inexact
;
3933 if ( aSign
) z
= - z
;
3938 /*----------------------------------------------------------------------------
3939 | Returns the result of converting the extended double-precision floating-
3940 | point value `a' to the single-precision floating-point format. The
3941 | conversion is performed according to the IEC/IEEE Standard for Binary
3942 | Floating-Point Arithmetic.
3943 *----------------------------------------------------------------------------*/
3945 float32
floatx80_to_float32( floatx80 a STATUS_PARAM
)
3951 aSig
= extractFloatx80Frac( a
);
3952 aExp
= extractFloatx80Exp( a
);
3953 aSign
= extractFloatx80Sign( a
);
3954 if ( aExp
== 0x7FFF ) {
3955 if ( (uint64_t) ( aSig
<<1 ) ) {
3956 return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
3958 return packFloat32( aSign
, 0xFF, 0 );
3960 shift64RightJamming( aSig
, 33, &aSig
);
3961 if ( aExp
|| aSig
) aExp
-= 0x3F81;
3962 return roundAndPackFloat32( aSign
, aExp
, aSig STATUS_VAR
);
3966 /*----------------------------------------------------------------------------
3967 | Returns the result of converting the extended double-precision floating-
3968 | point value `a' to the double-precision floating-point format. The
3969 | conversion is performed according to the IEC/IEEE Standard for Binary
3970 | Floating-Point Arithmetic.
3971 *----------------------------------------------------------------------------*/
3973 float64
floatx80_to_float64( floatx80 a STATUS_PARAM
)
3977 uint64_t aSig
, zSig
;
3979 aSig
= extractFloatx80Frac( a
);
3980 aExp
= extractFloatx80Exp( a
);
3981 aSign
= extractFloatx80Sign( a
);
3982 if ( aExp
== 0x7FFF ) {
3983 if ( (uint64_t) ( aSig
<<1 ) ) {
3984 return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
3986 return packFloat64( aSign
, 0x7FF, 0 );
3988 shift64RightJamming( aSig
, 1, &zSig
);
3989 if ( aExp
|| aSig
) aExp
-= 0x3C01;
3990 return roundAndPackFloat64( aSign
, aExp
, zSig STATUS_VAR
);
3994 /*----------------------------------------------------------------------------
3995 | Returns the result of converting the extended double-precision floating-
3996 | point value `a' to the quadruple-precision floating-point format. The
3997 | conversion is performed according to the IEC/IEEE Standard for Binary
3998 | Floating-Point Arithmetic.
3999 *----------------------------------------------------------------------------*/
4001 float128
floatx80_to_float128( floatx80 a STATUS_PARAM
)
4005 uint64_t aSig
, zSig0
, zSig1
;
4007 aSig
= extractFloatx80Frac( a
);
4008 aExp
= extractFloatx80Exp( a
);
4009 aSign
= extractFloatx80Sign( a
);
4010 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) {
4011 return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
4013 shift128Right( aSig
<<1, 0, 16, &zSig0
, &zSig1
);
4014 return packFloat128( aSign
, aExp
, zSig0
, zSig1
);
4018 /*----------------------------------------------------------------------------
4019 | Rounds the extended double-precision floating-point value `a' to an integer,
4020 | and returns the result as an extended quadruple-precision floating-point
4021 | value. The operation is performed according to the IEC/IEEE Standard for
4022 | Binary Floating-Point Arithmetic.
4023 *----------------------------------------------------------------------------*/
4025 floatx80
floatx80_round_to_int( floatx80 a STATUS_PARAM
)
4029 uint64_t lastBitMask
, roundBitsMask
;
4033 aExp
= extractFloatx80Exp( a
);
4034 if ( 0x403E <= aExp
) {
4035 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) ) {
4036 return propagateFloatx80NaN( a
, a STATUS_VAR
);
4040 if ( aExp
< 0x3FFF ) {
4042 && ( (uint64_t) ( extractFloatx80Frac( a
)<<1 ) == 0 ) ) {
4045 STATUS(float_exception_flags
) |= float_flag_inexact
;
4046 aSign
= extractFloatx80Sign( a
);
4047 switch ( STATUS(float_rounding_mode
) ) {
4048 case float_round_nearest_even
:
4049 if ( ( aExp
== 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a
)<<1 )
4052 packFloatx80( aSign
, 0x3FFF, LIT64( 0x8000000000000000 ) );
4055 case float_round_down
:
4058 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4059 : packFloatx80( 0, 0, 0 );
4060 case float_round_up
:
4062 aSign
? packFloatx80( 1, 0, 0 )
4063 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4065 return packFloatx80( aSign
, 0, 0 );
4068 lastBitMask
<<= 0x403E - aExp
;
4069 roundBitsMask
= lastBitMask
- 1;
4071 roundingMode
= STATUS(float_rounding_mode
);
4072 if ( roundingMode
== float_round_nearest_even
) {
4073 z
.low
+= lastBitMask
>>1;
4074 if ( ( z
.low
& roundBitsMask
) == 0 ) z
.low
&= ~ lastBitMask
;
4076 else if ( roundingMode
!= float_round_to_zero
) {
4077 if ( extractFloatx80Sign( z
) ^ ( roundingMode
== float_round_up
) ) {
4078 z
.low
+= roundBitsMask
;
4081 z
.low
&= ~ roundBitsMask
;
4084 z
.low
= LIT64( 0x8000000000000000 );
4086 if ( z
.low
!= a
.low
) STATUS(float_exception_flags
) |= float_flag_inexact
;
4091 /*----------------------------------------------------------------------------
4092 | Returns the result of adding the absolute values of the extended double-
4093 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
4094 | negated before being returned. `zSign' is ignored if the result is a NaN.
4095 | The addition is performed according to the IEC/IEEE Standard for Binary
4096 | Floating-Point Arithmetic.
4097 *----------------------------------------------------------------------------*/
4099 static floatx80
addFloatx80Sigs( floatx80 a
, floatx80 b
, flag zSign STATUS_PARAM
)
4101 int32 aExp
, bExp
, zExp
;
4102 uint64_t aSig
, bSig
, zSig0
, zSig1
;
4105 aSig
= extractFloatx80Frac( a
);
4106 aExp
= extractFloatx80Exp( a
);
4107 bSig
= extractFloatx80Frac( b
);
4108 bExp
= extractFloatx80Exp( b
);
4109 expDiff
= aExp
- bExp
;
4110 if ( 0 < expDiff
) {
4111 if ( aExp
== 0x7FFF ) {
4112 if ( (uint64_t) ( aSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
4115 if ( bExp
== 0 ) --expDiff
;
4116 shift64ExtraRightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
4119 else if ( expDiff
< 0 ) {
4120 if ( bExp
== 0x7FFF ) {
4121 if ( (uint64_t) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
4122 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
4124 if ( aExp
== 0 ) ++expDiff
;
4125 shift64ExtraRightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
4129 if ( aExp
== 0x7FFF ) {
4130 if ( (uint64_t) ( ( aSig
| bSig
)<<1 ) ) {
4131 return propagateFloatx80NaN( a
, b STATUS_VAR
);
4136 zSig0
= aSig
+ bSig
;
4138 normalizeFloatx80Subnormal( zSig0
, &zExp
, &zSig0
);
4144 zSig0
= aSig
+ bSig
;
4145 if ( (int64_t) zSig0
< 0 ) goto roundAndPack
;
4147 shift64ExtraRightJamming( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
4148 zSig0
|= LIT64( 0x8000000000000000 );
4152 roundAndPackFloatx80(
4153 STATUS(floatx80_rounding_precision
), zSign
, zExp
, zSig0
, zSig1 STATUS_VAR
);
4157 /*----------------------------------------------------------------------------
4158 | Returns the result of subtracting the absolute values of the extended
4159 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
4160 | difference is negated before being returned. `zSign' is ignored if the
4161 | result is a NaN. The subtraction is performed according to the IEC/IEEE
4162 | Standard for Binary Floating-Point Arithmetic.
4163 *----------------------------------------------------------------------------*/
4165 static floatx80
subFloatx80Sigs( floatx80 a
, floatx80 b
, flag zSign STATUS_PARAM
)
4167 int32 aExp
, bExp
, zExp
;
4168 uint64_t aSig
, bSig
, zSig0
, zSig1
;
4172 aSig
= extractFloatx80Frac( a
);
4173 aExp
= extractFloatx80Exp( a
);
4174 bSig
= extractFloatx80Frac( b
);
4175 bExp
= extractFloatx80Exp( b
);
4176 expDiff
= aExp
- bExp
;
4177 if ( 0 < expDiff
) goto aExpBigger
;
4178 if ( expDiff
< 0 ) goto bExpBigger
;
4179 if ( aExp
== 0x7FFF ) {
4180 if ( (uint64_t) ( ( aSig
| bSig
)<<1 ) ) {
4181 return propagateFloatx80NaN( a
, b STATUS_VAR
);
4183 float_raise( float_flag_invalid STATUS_VAR
);
4184 z
.low
= floatx80_default_nan_low
;
4185 z
.high
= floatx80_default_nan_high
;
4193 if ( bSig
< aSig
) goto aBigger
;
4194 if ( aSig
< bSig
) goto bBigger
;
4195 return packFloatx80( STATUS(float_rounding_mode
) == float_round_down
, 0, 0 );
4197 if ( bExp
== 0x7FFF ) {
4198 if ( (uint64_t) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
4199 return packFloatx80( zSign
^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
4201 if ( aExp
== 0 ) ++expDiff
;
4202 shift128RightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
4204 sub128( bSig
, 0, aSig
, zSig1
, &zSig0
, &zSig1
);
4207 goto normalizeRoundAndPack
;
4209 if ( aExp
== 0x7FFF ) {
4210 if ( (uint64_t) ( aSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
4213 if ( bExp
== 0 ) --expDiff
;
4214 shift128RightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
4216 sub128( aSig
, 0, bSig
, zSig1
, &zSig0
, &zSig1
);
4218 normalizeRoundAndPack
:
4220 normalizeRoundAndPackFloatx80(
4221 STATUS(floatx80_rounding_precision
), zSign
, zExp
, zSig0
, zSig1 STATUS_VAR
);
4225 /*----------------------------------------------------------------------------
4226 | Returns the result of adding the extended double-precision floating-point
4227 | values `a' and `b'. The operation is performed according to the IEC/IEEE
4228 | Standard for Binary Floating-Point Arithmetic.
4229 *----------------------------------------------------------------------------*/
4231 floatx80
floatx80_add( floatx80 a
, floatx80 b STATUS_PARAM
)
4235 aSign
= extractFloatx80Sign( a
);
4236 bSign
= extractFloatx80Sign( b
);
4237 if ( aSign
== bSign
) {
4238 return addFloatx80Sigs( a
, b
, aSign STATUS_VAR
);
4241 return subFloatx80Sigs( a
, b
, aSign STATUS_VAR
);
4246 /*----------------------------------------------------------------------------
4247 | Returns the result of subtracting the extended double-precision floating-
4248 | point values `a' and `b'. The operation is performed according to the
4249 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4250 *----------------------------------------------------------------------------*/
4252 floatx80
floatx80_sub( floatx80 a
, floatx80 b STATUS_PARAM
)
4256 aSign
= extractFloatx80Sign( a
);
4257 bSign
= extractFloatx80Sign( b
);
4258 if ( aSign
== bSign
) {
4259 return subFloatx80Sigs( a
, b
, aSign STATUS_VAR
);
4262 return addFloatx80Sigs( a
, b
, aSign STATUS_VAR
);
4267 /*----------------------------------------------------------------------------
4268 | Returns the result of multiplying the extended double-precision floating-
4269 | point values `a' and `b'. The operation is performed according to the
4270 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4271 *----------------------------------------------------------------------------*/
4273 floatx80
floatx80_mul( floatx80 a
, floatx80 b STATUS_PARAM
)
4275 flag aSign
, bSign
, zSign
;
4276 int32 aExp
, bExp
, zExp
;
4277 uint64_t aSig
, bSig
, zSig0
, zSig1
;
4280 aSig
= extractFloatx80Frac( a
);
4281 aExp
= extractFloatx80Exp( a
);
4282 aSign
= extractFloatx80Sign( a
);
4283 bSig
= extractFloatx80Frac( b
);
4284 bExp
= extractFloatx80Exp( b
);
4285 bSign
= extractFloatx80Sign( b
);
4286 zSign
= aSign
^ bSign
;
4287 if ( aExp
== 0x7FFF ) {
4288 if ( (uint64_t) ( aSig
<<1 )
4289 || ( ( bExp
== 0x7FFF ) && (uint64_t) ( bSig
<<1 ) ) ) {
4290 return propagateFloatx80NaN( a
, b STATUS_VAR
);
4292 if ( ( bExp
| bSig
) == 0 ) goto invalid
;
4293 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
4295 if ( bExp
== 0x7FFF ) {
4296 if ( (uint64_t) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
4297 if ( ( aExp
| aSig
) == 0 ) {
4299 float_raise( float_flag_invalid STATUS_VAR
);
4300 z
.low
= floatx80_default_nan_low
;
4301 z
.high
= floatx80_default_nan_high
;
4304 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
4307 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
4308 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
4311 if ( bSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
4312 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
4314 zExp
= aExp
+ bExp
- 0x3FFE;
4315 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
4316 if ( 0 < (int64_t) zSig0
) {
4317 shortShift128Left( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
4321 roundAndPackFloatx80(
4322 STATUS(floatx80_rounding_precision
), zSign
, zExp
, zSig0
, zSig1 STATUS_VAR
);
4326 /*----------------------------------------------------------------------------
4327 | Returns the result of dividing the extended double-precision floating-point
4328 | value `a' by the corresponding value `b'. The operation is performed
4329 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4330 *----------------------------------------------------------------------------*/
4332 floatx80
floatx80_div( floatx80 a
, floatx80 b STATUS_PARAM
)
4334 flag aSign
, bSign
, zSign
;
4335 int32 aExp
, bExp
, zExp
;
4336 uint64_t aSig
, bSig
, zSig0
, zSig1
;
4337 uint64_t rem0
, rem1
, rem2
, term0
, term1
, term2
;
4340 aSig
= extractFloatx80Frac( a
);
4341 aExp
= extractFloatx80Exp( a
);
4342 aSign
= extractFloatx80Sign( a
);
4343 bSig
= extractFloatx80Frac( b
);
4344 bExp
= extractFloatx80Exp( b
);
4345 bSign
= extractFloatx80Sign( b
);
4346 zSign
= aSign
^ bSign
;
4347 if ( aExp
== 0x7FFF ) {
4348 if ( (uint64_t) ( aSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
4349 if ( bExp
== 0x7FFF ) {
4350 if ( (uint64_t) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
4353 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
4355 if ( bExp
== 0x7FFF ) {
4356 if ( (uint64_t) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
4357 return packFloatx80( zSign
, 0, 0 );
4361 if ( ( aExp
| aSig
) == 0 ) {
4363 float_raise( float_flag_invalid STATUS_VAR
);
4364 z
.low
= floatx80_default_nan_low
;
4365 z
.high
= floatx80_default_nan_high
;
4368 float_raise( float_flag_divbyzero STATUS_VAR
);
4369 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
4371 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
4374 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
4375 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
4377 zExp
= aExp
- bExp
+ 0x3FFE;
4379 if ( bSig
<= aSig
) {
4380 shift128Right( aSig
, 0, 1, &aSig
, &rem1
);
4383 zSig0
= estimateDiv128To64( aSig
, rem1
, bSig
);
4384 mul64To128( bSig
, zSig0
, &term0
, &term1
);
4385 sub128( aSig
, rem1
, term0
, term1
, &rem0
, &rem1
);
4386 while ( (int64_t) rem0
< 0 ) {
4388 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
4390 zSig1
= estimateDiv128To64( rem1
, 0, bSig
);
4391 if ( (uint64_t) ( zSig1
<<1 ) <= 8 ) {
4392 mul64To128( bSig
, zSig1
, &term1
, &term2
);
4393 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
4394 while ( (int64_t) rem1
< 0 ) {
4396 add128( rem1
, rem2
, 0, bSig
, &rem1
, &rem2
);
4398 zSig1
|= ( ( rem1
| rem2
) != 0 );
4401 roundAndPackFloatx80(
4402 STATUS(floatx80_rounding_precision
), zSign
, zExp
, zSig0
, zSig1 STATUS_VAR
);
4406 /*----------------------------------------------------------------------------
4407 | Returns the remainder of the extended double-precision floating-point value
4408 | `a' with respect to the corresponding value `b'. The operation is performed
4409 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4410 *----------------------------------------------------------------------------*/
4412 floatx80
floatx80_rem( floatx80 a
, floatx80 b STATUS_PARAM
)
4415 int32 aExp
, bExp
, expDiff
;
4416 uint64_t aSig0
, aSig1
, bSig
;
4417 uint64_t q
, term0
, term1
, alternateASig0
, alternateASig1
;
4420 aSig0
= extractFloatx80Frac( a
);
4421 aExp
= extractFloatx80Exp( a
);
4422 aSign
= extractFloatx80Sign( a
);
4423 bSig
= extractFloatx80Frac( b
);
4424 bExp
= extractFloatx80Exp( b
);
4425 if ( aExp
== 0x7FFF ) {
4426 if ( (uint64_t) ( aSig0
<<1 )
4427 || ( ( bExp
== 0x7FFF ) && (uint64_t) ( bSig
<<1 ) ) ) {
4428 return propagateFloatx80NaN( a
, b STATUS_VAR
);
4432 if ( bExp
== 0x7FFF ) {
4433 if ( (uint64_t) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
4439 float_raise( float_flag_invalid STATUS_VAR
);
4440 z
.low
= floatx80_default_nan_low
;
4441 z
.high
= floatx80_default_nan_high
;
4444 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
4447 if ( (uint64_t) ( aSig0
<<1 ) == 0 ) return a
;
4448 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
4450 bSig
|= LIT64( 0x8000000000000000 );
4452 expDiff
= aExp
- bExp
;
4454 if ( expDiff
< 0 ) {
4455 if ( expDiff
< -1 ) return a
;
4456 shift128Right( aSig0
, 0, 1, &aSig0
, &aSig1
);
4459 q
= ( bSig
<= aSig0
);
4460 if ( q
) aSig0
-= bSig
;
4462 while ( 0 < expDiff
) {
4463 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
4464 q
= ( 2 < q
) ? q
- 2 : 0;
4465 mul64To128( bSig
, q
, &term0
, &term1
);
4466 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
4467 shortShift128Left( aSig0
, aSig1
, 62, &aSig0
, &aSig1
);
4471 if ( 0 < expDiff
) {
4472 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
4473 q
= ( 2 < q
) ? q
- 2 : 0;
4475 mul64To128( bSig
, q
<<( 64 - expDiff
), &term0
, &term1
);
4476 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
4477 shortShift128Left( 0, bSig
, 64 - expDiff
, &term0
, &term1
);
4478 while ( le128( term0
, term1
, aSig0
, aSig1
) ) {
4480 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
4487 sub128( term0
, term1
, aSig0
, aSig1
, &alternateASig0
, &alternateASig1
);
4488 if ( lt128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
4489 || ( eq128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
4492 aSig0
= alternateASig0
;
4493 aSig1
= alternateASig1
;
4497 normalizeRoundAndPackFloatx80(
4498 80, zSign
, bExp
+ expDiff
, aSig0
, aSig1 STATUS_VAR
);
4502 /*----------------------------------------------------------------------------
4503 | Returns the square root of the extended double-precision floating-point
4504 | value `a'. The operation is performed according to the IEC/IEEE Standard
4505 | for Binary Floating-Point Arithmetic.
4506 *----------------------------------------------------------------------------*/
4508 floatx80
floatx80_sqrt( floatx80 a STATUS_PARAM
)
4512 uint64_t aSig0
, aSig1
, zSig0
, zSig1
, doubleZSig0
;
4513 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
4516 aSig0
= extractFloatx80Frac( a
);
4517 aExp
= extractFloatx80Exp( a
);
4518 aSign
= extractFloatx80Sign( a
);
4519 if ( aExp
== 0x7FFF ) {
4520 if ( (uint64_t) ( aSig0
<<1 ) ) return propagateFloatx80NaN( a
, a STATUS_VAR
);
4521 if ( ! aSign
) return a
;
4525 if ( ( aExp
| aSig0
) == 0 ) return a
;
4527 float_raise( float_flag_invalid STATUS_VAR
);
4528 z
.low
= floatx80_default_nan_low
;
4529 z
.high
= floatx80_default_nan_high
;
4533 if ( aSig0
== 0 ) return packFloatx80( 0, 0, 0 );
4534 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
4536 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFF;
4537 zSig0
= estimateSqrt32( aExp
, aSig0
>>32 );
4538 shift128Right( aSig0
, 0, 2 + ( aExp
& 1 ), &aSig0
, &aSig1
);
4539 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
4540 doubleZSig0
= zSig0
<<1;
4541 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
4542 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
4543 while ( (int64_t) rem0
< 0 ) {
4546 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
4548 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
4549 if ( ( zSig1
& LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4550 if ( zSig1
== 0 ) zSig1
= 1;
4551 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
4552 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
4553 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
4554 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
4555 while ( (int64_t) rem1
< 0 ) {
4557 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
4559 term2
|= doubleZSig0
;
4560 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
4562 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
4564 shortShift128Left( 0, zSig1
, 1, &zSig0
, &zSig1
);
4565 zSig0
|= doubleZSig0
;
4567 roundAndPackFloatx80(
4568 STATUS(floatx80_rounding_precision
), 0, zExp
, zSig0
, zSig1 STATUS_VAR
);
4572 /*----------------------------------------------------------------------------
4573 | Returns 1 if the extended double-precision floating-point value `a' is equal
4574 | to the corresponding value `b', and 0 otherwise. The invalid exception is
4575 | raised if either operand is a NaN. Otherwise, the comparison is performed
4576 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4577 *----------------------------------------------------------------------------*/
4579 int floatx80_eq( floatx80 a
, floatx80 b STATUS_PARAM
)
4582 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4583 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
4584 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4585 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
4587 float_raise( float_flag_invalid STATUS_VAR
);
4592 && ( ( a
.high
== b
.high
)
4594 && ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
4599 /*----------------------------------------------------------------------------
4600 | Returns 1 if the extended double-precision floating-point value `a' is
4601 | less than or equal to the corresponding value `b', and 0 otherwise. The
4602 | invalid exception is raised if either operand is a NaN. The comparison is
4603 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4605 *----------------------------------------------------------------------------*/
4607 int floatx80_le( floatx80 a
, floatx80 b STATUS_PARAM
)
4611 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4612 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
4613 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4614 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
4616 float_raise( float_flag_invalid STATUS_VAR
);
4619 aSign
= extractFloatx80Sign( a
);
4620 bSign
= extractFloatx80Sign( b
);
4621 if ( aSign
!= bSign
) {
4624 || ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
4628 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
4629 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
4633 /*----------------------------------------------------------------------------
4634 | Returns 1 if the extended double-precision floating-point value `a' is
4635 | less than the corresponding value `b', and 0 otherwise. The invalid
4636 | exception is raised if either operand is a NaN. The comparison is performed
4637 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4638 *----------------------------------------------------------------------------*/
4640 int floatx80_lt( floatx80 a
, floatx80 b STATUS_PARAM
)
4644 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4645 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
4646 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4647 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
4649 float_raise( float_flag_invalid STATUS_VAR
);
4652 aSign
= extractFloatx80Sign( a
);
4653 bSign
= extractFloatx80Sign( b
);
4654 if ( aSign
!= bSign
) {
4657 && ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
4661 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
4662 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
4666 /*----------------------------------------------------------------------------
4667 | Returns 1 if the extended double-precision floating-point values `a' and `b'
4668 | cannot be compared, and 0 otherwise. The invalid exception is raised if
4669 | either operand is a NaN. The comparison is performed according to the
4670 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4671 *----------------------------------------------------------------------------*/
4672 int floatx80_unordered( floatx80 a
, floatx80 b STATUS_PARAM
)
4674 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4675 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
4676 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4677 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
4679 float_raise( float_flag_invalid STATUS_VAR
);
4685 /*----------------------------------------------------------------------------
4686 | Returns 1 if the extended double-precision floating-point value `a' is
4687 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4688 | cause an exception. The comparison is performed according to the IEC/IEEE
4689 | Standard for Binary Floating-Point Arithmetic.
4690 *----------------------------------------------------------------------------*/
4692 int floatx80_eq_quiet( floatx80 a
, floatx80 b STATUS_PARAM
)
4695 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4696 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
4697 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4698 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
4700 if ( floatx80_is_signaling_nan( a
)
4701 || floatx80_is_signaling_nan( b
) ) {
4702 float_raise( float_flag_invalid STATUS_VAR
);
4708 && ( ( a
.high
== b
.high
)
4710 && ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
4715 /*----------------------------------------------------------------------------
4716 | Returns 1 if the extended double-precision floating-point value `a' is less
4717 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
4718 | do not cause an exception. Otherwise, the comparison is performed according
4719 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4720 *----------------------------------------------------------------------------*/
4722 int floatx80_le_quiet( floatx80 a
, floatx80 b STATUS_PARAM
)
4726 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4727 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
4728 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4729 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
4731 if ( floatx80_is_signaling_nan( a
)
4732 || floatx80_is_signaling_nan( b
) ) {
4733 float_raise( float_flag_invalid STATUS_VAR
);
4737 aSign
= extractFloatx80Sign( a
);
4738 bSign
= extractFloatx80Sign( b
);
4739 if ( aSign
!= bSign
) {
4742 || ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
4746 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
4747 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
4751 /*----------------------------------------------------------------------------
4752 | Returns 1 if the extended double-precision floating-point value `a' is less
4753 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
4754 | an exception. Otherwise, the comparison is performed according to the
4755 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4756 *----------------------------------------------------------------------------*/
4758 int floatx80_lt_quiet( floatx80 a
, floatx80 b STATUS_PARAM
)
4762 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4763 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
4764 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4765 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
4767 if ( floatx80_is_signaling_nan( a
)
4768 || floatx80_is_signaling_nan( b
) ) {
4769 float_raise( float_flag_invalid STATUS_VAR
);
4773 aSign
= extractFloatx80Sign( a
);
4774 bSign
= extractFloatx80Sign( b
);
4775 if ( aSign
!= bSign
) {
4778 && ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
4782 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
4783 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
4787 /*----------------------------------------------------------------------------
4788 | Returns 1 if the extended double-precision floating-point values `a' and `b'
4789 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
4790 | The comparison is performed according to the IEC/IEEE Standard for Binary
4791 | Floating-Point Arithmetic.
4792 *----------------------------------------------------------------------------*/
4793 int floatx80_unordered_quiet( floatx80 a
, floatx80 b STATUS_PARAM
)
4795 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4796 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
4797 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4798 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
4800 if ( floatx80_is_signaling_nan( a
)
4801 || floatx80_is_signaling_nan( b
) ) {
4802 float_raise( float_flag_invalid STATUS_VAR
);
4809 /*----------------------------------------------------------------------------
4810 | Returns the result of converting the quadruple-precision floating-point
4811 | value `a' to the 32-bit two's complement integer format. The conversion
4812 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4813 | Arithmetic---which means in particular that the conversion is rounded
4814 | according to the current rounding mode. If `a' is a NaN, the largest
4815 | positive integer is returned. Otherwise, if the conversion overflows, the
4816 | largest integer with the same sign as `a' is returned.
4817 *----------------------------------------------------------------------------*/
4819 int32
float128_to_int32( float128 a STATUS_PARAM
)
4822 int32 aExp
, shiftCount
;
4823 uint64_t aSig0
, aSig1
;
4825 aSig1
= extractFloat128Frac1( a
);
4826 aSig0
= extractFloat128Frac0( a
);
4827 aExp
= extractFloat128Exp( a
);
4828 aSign
= extractFloat128Sign( a
);
4829 if ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) aSign
= 0;
4830 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
4831 aSig0
|= ( aSig1
!= 0 );
4832 shiftCount
= 0x4028 - aExp
;
4833 if ( 0 < shiftCount
) shift64RightJamming( aSig0
, shiftCount
, &aSig0
);
4834 return roundAndPackInt32( aSign
, aSig0 STATUS_VAR
);
4838 /*----------------------------------------------------------------------------
4839 | Returns the result of converting the quadruple-precision floating-point
4840 | value `a' to the 32-bit two's complement integer format. The conversion
4841 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4842 | Arithmetic, except that the conversion is always rounded toward zero. If
4843 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
4844 | conversion overflows, the largest integer with the same sign as `a' is
4846 *----------------------------------------------------------------------------*/
4848 int32
float128_to_int32_round_to_zero( float128 a STATUS_PARAM
)
4851 int32 aExp
, shiftCount
;
4852 uint64_t aSig0
, aSig1
, savedASig
;
4855 aSig1
= extractFloat128Frac1( a
);
4856 aSig0
= extractFloat128Frac0( a
);
4857 aExp
= extractFloat128Exp( a
);
4858 aSign
= extractFloat128Sign( a
);
4859 aSig0
|= ( aSig1
!= 0 );
4860 if ( 0x401E < aExp
) {
4861 if ( ( aExp
== 0x7FFF ) && aSig0
) aSign
= 0;
4864 else if ( aExp
< 0x3FFF ) {
4865 if ( aExp
|| aSig0
) STATUS(float_exception_flags
) |= float_flag_inexact
;
4868 aSig0
|= LIT64( 0x0001000000000000 );
4869 shiftCount
= 0x402F - aExp
;
4871 aSig0
>>= shiftCount
;
4873 if ( aSign
) z
= - z
;
4874 if ( ( z
< 0 ) ^ aSign
) {
4876 float_raise( float_flag_invalid STATUS_VAR
);
4877 return aSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
4879 if ( ( aSig0
<<shiftCount
) != savedASig
) {
4880 STATUS(float_exception_flags
) |= float_flag_inexact
;
4886 /*----------------------------------------------------------------------------
4887 | Returns the result of converting the quadruple-precision floating-point
4888 | value `a' to the 64-bit two's complement integer format. The conversion
4889 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4890 | Arithmetic---which means in particular that the conversion is rounded
4891 | according to the current rounding mode. If `a' is a NaN, the largest
4892 | positive integer is returned. Otherwise, if the conversion overflows, the
4893 | largest integer with the same sign as `a' is returned.
4894 *----------------------------------------------------------------------------*/
4896 int64
float128_to_int64( float128 a STATUS_PARAM
)
4899 int32 aExp
, shiftCount
;
4900 uint64_t aSig0
, aSig1
;
4902 aSig1
= extractFloat128Frac1( a
);
4903 aSig0
= extractFloat128Frac0( a
);
4904 aExp
= extractFloat128Exp( a
);
4905 aSign
= extractFloat128Sign( a
);
4906 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
4907 shiftCount
= 0x402F - aExp
;
4908 if ( shiftCount
<= 0 ) {
4909 if ( 0x403E < aExp
) {
4910 float_raise( float_flag_invalid STATUS_VAR
);
4912 || ( ( aExp
== 0x7FFF )
4913 && ( aSig1
|| ( aSig0
!= LIT64( 0x0001000000000000 ) ) )
4916 return LIT64( 0x7FFFFFFFFFFFFFFF );
4918 return (int64_t) LIT64( 0x8000000000000000 );
4920 shortShift128Left( aSig0
, aSig1
, - shiftCount
, &aSig0
, &aSig1
);
4923 shift64ExtraRightJamming( aSig0
, aSig1
, shiftCount
, &aSig0
, &aSig1
);
4925 return roundAndPackInt64( aSign
, aSig0
, aSig1 STATUS_VAR
);
4929 /*----------------------------------------------------------------------------
4930 | Returns the result of converting the quadruple-precision floating-point
4931 | value `a' to the 64-bit two's complement integer format. The conversion
4932 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4933 | Arithmetic, except that the conversion is always rounded toward zero.
4934 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
4935 | the conversion overflows, the largest integer with the same sign as `a' is
4937 *----------------------------------------------------------------------------*/
4939 int64
float128_to_int64_round_to_zero( float128 a STATUS_PARAM
)
4942 int32 aExp
, shiftCount
;
4943 uint64_t aSig0
, aSig1
;
4946 aSig1
= extractFloat128Frac1( a
);
4947 aSig0
= extractFloat128Frac0( a
);
4948 aExp
= extractFloat128Exp( a
);
4949 aSign
= extractFloat128Sign( a
);
4950 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
4951 shiftCount
= aExp
- 0x402F;
4952 if ( 0 < shiftCount
) {
4953 if ( 0x403E <= aExp
) {
4954 aSig0
&= LIT64( 0x0000FFFFFFFFFFFF );
4955 if ( ( a
.high
== LIT64( 0xC03E000000000000 ) )
4956 && ( aSig1
< LIT64( 0x0002000000000000 ) ) ) {
4957 if ( aSig1
) STATUS(float_exception_flags
) |= float_flag_inexact
;
4960 float_raise( float_flag_invalid STATUS_VAR
);
4961 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) ) {
4962 return LIT64( 0x7FFFFFFFFFFFFFFF );
4965 return (int64_t) LIT64( 0x8000000000000000 );
4967 z
= ( aSig0
<<shiftCount
) | ( aSig1
>>( ( - shiftCount
) & 63 ) );
4968 if ( (uint64_t) ( aSig1
<<shiftCount
) ) {
4969 STATUS(float_exception_flags
) |= float_flag_inexact
;
4973 if ( aExp
< 0x3FFF ) {
4974 if ( aExp
| aSig0
| aSig1
) {
4975 STATUS(float_exception_flags
) |= float_flag_inexact
;
4979 z
= aSig0
>>( - shiftCount
);
4981 || ( shiftCount
&& (uint64_t) ( aSig0
<<( shiftCount
& 63 ) ) ) ) {
4982 STATUS(float_exception_flags
) |= float_flag_inexact
;
4985 if ( aSign
) z
= - z
;
4990 /*----------------------------------------------------------------------------
4991 | Returns the result of converting the quadruple-precision floating-point
4992 | value `a' to the single-precision floating-point format. The conversion
4993 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4995 *----------------------------------------------------------------------------*/
4997 float32
float128_to_float32( float128 a STATUS_PARAM
)
5001 uint64_t aSig0
, aSig1
;
5004 aSig1
= extractFloat128Frac1( a
);
5005 aSig0
= extractFloat128Frac0( a
);
5006 aExp
= extractFloat128Exp( a
);
5007 aSign
= extractFloat128Sign( a
);
5008 if ( aExp
== 0x7FFF ) {
5009 if ( aSig0
| aSig1
) {
5010 return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
5012 return packFloat32( aSign
, 0xFF, 0 );
5014 aSig0
|= ( aSig1
!= 0 );
5015 shift64RightJamming( aSig0
, 18, &aSig0
);
5017 if ( aExp
|| zSig
) {
5021 return roundAndPackFloat32( aSign
, aExp
, zSig STATUS_VAR
);
5025 /*----------------------------------------------------------------------------
5026 | Returns the result of converting the quadruple-precision floating-point
5027 | value `a' to the double-precision floating-point format. The conversion
5028 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5030 *----------------------------------------------------------------------------*/
5032 float64
float128_to_float64( float128 a STATUS_PARAM
)
5036 uint64_t aSig0
, aSig1
;
5038 aSig1
= extractFloat128Frac1( a
);
5039 aSig0
= extractFloat128Frac0( a
);
5040 aExp
= extractFloat128Exp( a
);
5041 aSign
= extractFloat128Sign( a
);
5042 if ( aExp
== 0x7FFF ) {
5043 if ( aSig0
| aSig1
) {
5044 return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
5046 return packFloat64( aSign
, 0x7FF, 0 );
5048 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
5049 aSig0
|= ( aSig1
!= 0 );
5050 if ( aExp
|| aSig0
) {
5051 aSig0
|= LIT64( 0x4000000000000000 );
5054 return roundAndPackFloat64( aSign
, aExp
, aSig0 STATUS_VAR
);
5058 /*----------------------------------------------------------------------------
5059 | Returns the result of converting the quadruple-precision floating-point
5060 | value `a' to the extended double-precision floating-point format. The
5061 | conversion is performed according to the IEC/IEEE Standard for Binary
5062 | Floating-Point Arithmetic.
5063 *----------------------------------------------------------------------------*/
5065 floatx80
float128_to_floatx80( float128 a STATUS_PARAM
)
5069 uint64_t aSig0
, aSig1
;
5071 aSig1
= extractFloat128Frac1( a
);
5072 aSig0
= extractFloat128Frac0( a
);
5073 aExp
= extractFloat128Exp( a
);
5074 aSign
= extractFloat128Sign( a
);
5075 if ( aExp
== 0x7FFF ) {
5076 if ( aSig0
| aSig1
) {
5077 return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
5079 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
5082 if ( ( aSig0
| aSig1
) == 0 ) return packFloatx80( aSign
, 0, 0 );
5083 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
5086 aSig0
|= LIT64( 0x0001000000000000 );
5088 shortShift128Left( aSig0
, aSig1
, 15, &aSig0
, &aSig1
);
5089 return roundAndPackFloatx80( 80, aSign
, aExp
, aSig0
, aSig1 STATUS_VAR
);
5093 /*----------------------------------------------------------------------------
5094 | Rounds the quadruple-precision floating-point value `a' to an integer, and
5095 | returns the result as a quadruple-precision floating-point value. The
5096 | operation is performed according to the IEC/IEEE Standard for Binary
5097 | Floating-Point Arithmetic.
5098 *----------------------------------------------------------------------------*/
5100 float128
float128_round_to_int( float128 a STATUS_PARAM
)
5104 uint64_t lastBitMask
, roundBitsMask
;
5108 aExp
= extractFloat128Exp( a
);
5109 if ( 0x402F <= aExp
) {
5110 if ( 0x406F <= aExp
) {
5111 if ( ( aExp
== 0x7FFF )
5112 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) )
5114 return propagateFloat128NaN( a
, a STATUS_VAR
);
5119 lastBitMask
= ( lastBitMask
<<( 0x406E - aExp
) )<<1;
5120 roundBitsMask
= lastBitMask
- 1;
5122 roundingMode
= STATUS(float_rounding_mode
);
5123 if ( roundingMode
== float_round_nearest_even
) {
5124 if ( lastBitMask
) {
5125 add128( z
.high
, z
.low
, 0, lastBitMask
>>1, &z
.high
, &z
.low
);
5126 if ( ( z
.low
& roundBitsMask
) == 0 ) z
.low
&= ~ lastBitMask
;
5129 if ( (int64_t) z
.low
< 0 ) {
5131 if ( (uint64_t) ( z
.low
<<1 ) == 0 ) z
.high
&= ~1;
5135 else if ( roundingMode
!= float_round_to_zero
) {
5136 if ( extractFloat128Sign( z
)
5137 ^ ( roundingMode
== float_round_up
) ) {
5138 add128( z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
5141 z
.low
&= ~ roundBitsMask
;
5144 if ( aExp
< 0x3FFF ) {
5145 if ( ( ( (uint64_t) ( a
.high
<<1 ) ) | a
.low
) == 0 ) return a
;
5146 STATUS(float_exception_flags
) |= float_flag_inexact
;
5147 aSign
= extractFloat128Sign( a
);
5148 switch ( STATUS(float_rounding_mode
) ) {
5149 case float_round_nearest_even
:
5150 if ( ( aExp
== 0x3FFE )
5151 && ( extractFloat128Frac0( a
)
5152 | extractFloat128Frac1( a
) )
5154 return packFloat128( aSign
, 0x3FFF, 0, 0 );
5157 case float_round_down
:
5159 aSign
? packFloat128( 1, 0x3FFF, 0, 0 )
5160 : packFloat128( 0, 0, 0, 0 );
5161 case float_round_up
:
5163 aSign
? packFloat128( 1, 0, 0, 0 )
5164 : packFloat128( 0, 0x3FFF, 0, 0 );
5166 return packFloat128( aSign
, 0, 0, 0 );
5169 lastBitMask
<<= 0x402F - aExp
;
5170 roundBitsMask
= lastBitMask
- 1;
5173 roundingMode
= STATUS(float_rounding_mode
);
5174 if ( roundingMode
== float_round_nearest_even
) {
5175 z
.high
+= lastBitMask
>>1;
5176 if ( ( ( z
.high
& roundBitsMask
) | a
.low
) == 0 ) {
5177 z
.high
&= ~ lastBitMask
;
5180 else if ( roundingMode
!= float_round_to_zero
) {
5181 if ( extractFloat128Sign( z
)
5182 ^ ( roundingMode
== float_round_up
) ) {
5183 z
.high
|= ( a
.low
!= 0 );
5184 z
.high
+= roundBitsMask
;
5187 z
.high
&= ~ roundBitsMask
;
5189 if ( ( z
.low
!= a
.low
) || ( z
.high
!= a
.high
) ) {
5190 STATUS(float_exception_flags
) |= float_flag_inexact
;
5196 /*----------------------------------------------------------------------------
5197 | Returns the result of adding the absolute values of the quadruple-precision
5198 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
5199 | before being returned. `zSign' is ignored if the result is a NaN.
5200 | The addition is performed according to the IEC/IEEE Standard for Binary
5201 | Floating-Point Arithmetic.
5202 *----------------------------------------------------------------------------*/
5204 static float128
addFloat128Sigs( float128 a
, float128 b
, flag zSign STATUS_PARAM
)
5206 int32 aExp
, bExp
, zExp
;
5207 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
5210 aSig1
= extractFloat128Frac1( a
);
5211 aSig0
= extractFloat128Frac0( a
);
5212 aExp
= extractFloat128Exp( a
);
5213 bSig1
= extractFloat128Frac1( b
);
5214 bSig0
= extractFloat128Frac0( b
);
5215 bExp
= extractFloat128Exp( b
);
5216 expDiff
= aExp
- bExp
;
5217 if ( 0 < expDiff
) {
5218 if ( aExp
== 0x7FFF ) {
5219 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
5226 bSig0
|= LIT64( 0x0001000000000000 );
5228 shift128ExtraRightJamming(
5229 bSig0
, bSig1
, 0, expDiff
, &bSig0
, &bSig1
, &zSig2
);
5232 else if ( expDiff
< 0 ) {
5233 if ( bExp
== 0x7FFF ) {
5234 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
5235 return packFloat128( zSign
, 0x7FFF, 0, 0 );
5241 aSig0
|= LIT64( 0x0001000000000000 );
5243 shift128ExtraRightJamming(
5244 aSig0
, aSig1
, 0, - expDiff
, &aSig0
, &aSig1
, &zSig2
);
5248 if ( aExp
== 0x7FFF ) {
5249 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
5250 return propagateFloat128NaN( a
, b STATUS_VAR
);
5254 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
5256 if (STATUS(flush_to_zero
)) {
5257 if (zSig0
| zSig1
) {
5258 float_raise(float_flag_output_denormal STATUS_VAR
);
5260 return packFloat128(zSign
, 0, 0, 0);
5262 return packFloat128( zSign
, 0, zSig0
, zSig1
);
5265 zSig0
|= LIT64( 0x0002000000000000 );
5269 aSig0
|= LIT64( 0x0001000000000000 );
5270 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
5272 if ( zSig0
< LIT64( 0x0002000000000000 ) ) goto roundAndPack
;
5275 shift128ExtraRightJamming(
5276 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
5278 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2 STATUS_VAR
);
5282 /*----------------------------------------------------------------------------
5283 | Returns the result of subtracting the absolute values of the quadruple-
5284 | precision floating-point values `a' and `b'. If `zSign' is 1, the
5285 | difference is negated before being returned. `zSign' is ignored if the
5286 | result is a NaN. The subtraction is performed according to the IEC/IEEE
5287 | Standard for Binary Floating-Point Arithmetic.
5288 *----------------------------------------------------------------------------*/
5290 static float128
subFloat128Sigs( float128 a
, float128 b
, flag zSign STATUS_PARAM
)
5292 int32 aExp
, bExp
, zExp
;
5293 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
;
5297 aSig1
= extractFloat128Frac1( a
);
5298 aSig0
= extractFloat128Frac0( a
);
5299 aExp
= extractFloat128Exp( a
);
5300 bSig1
= extractFloat128Frac1( b
);
5301 bSig0
= extractFloat128Frac0( b
);
5302 bExp
= extractFloat128Exp( b
);
5303 expDiff
= aExp
- bExp
;
5304 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
5305 shortShift128Left( bSig0
, bSig1
, 14, &bSig0
, &bSig1
);
5306 if ( 0 < expDiff
) goto aExpBigger
;
5307 if ( expDiff
< 0 ) goto bExpBigger
;
5308 if ( aExp
== 0x7FFF ) {
5309 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
5310 return propagateFloat128NaN( a
, b STATUS_VAR
);
5312 float_raise( float_flag_invalid STATUS_VAR
);
5313 z
.low
= float128_default_nan_low
;
5314 z
.high
= float128_default_nan_high
;
5321 if ( bSig0
< aSig0
) goto aBigger
;
5322 if ( aSig0
< bSig0
) goto bBigger
;
5323 if ( bSig1
< aSig1
) goto aBigger
;
5324 if ( aSig1
< bSig1
) goto bBigger
;
5325 return packFloat128( STATUS(float_rounding_mode
) == float_round_down
, 0, 0, 0 );
5327 if ( bExp
== 0x7FFF ) {
5328 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
5329 return packFloat128( zSign
^ 1, 0x7FFF, 0, 0 );
5335 aSig0
|= LIT64( 0x4000000000000000 );
5337 shift128RightJamming( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
5338 bSig0
|= LIT64( 0x4000000000000000 );
5340 sub128( bSig0
, bSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
5343 goto normalizeRoundAndPack
;
5345 if ( aExp
== 0x7FFF ) {
5346 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
5353 bSig0
|= LIT64( 0x4000000000000000 );
5355 shift128RightJamming( bSig0
, bSig1
, expDiff
, &bSig0
, &bSig1
);
5356 aSig0
|= LIT64( 0x4000000000000000 );
5358 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
5360 normalizeRoundAndPack
:
5362 return normalizeRoundAndPackFloat128( zSign
, zExp
- 14, zSig0
, zSig1 STATUS_VAR
);
5366 /*----------------------------------------------------------------------------
5367 | Returns the result of adding the quadruple-precision floating-point values
5368 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
5369 | for Binary Floating-Point Arithmetic.
5370 *----------------------------------------------------------------------------*/
5372 float128
float128_add( float128 a
, float128 b STATUS_PARAM
)
5376 aSign
= extractFloat128Sign( a
);
5377 bSign
= extractFloat128Sign( b
);
5378 if ( aSign
== bSign
) {
5379 return addFloat128Sigs( a
, b
, aSign STATUS_VAR
);
5382 return subFloat128Sigs( a
, b
, aSign STATUS_VAR
);
5387 /*----------------------------------------------------------------------------
5388 | Returns the result of subtracting the quadruple-precision floating-point
5389 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5390 | Standard for Binary Floating-Point Arithmetic.
5391 *----------------------------------------------------------------------------*/
5393 float128
float128_sub( float128 a
, float128 b STATUS_PARAM
)
5397 aSign
= extractFloat128Sign( a
);
5398 bSign
= extractFloat128Sign( b
);
5399 if ( aSign
== bSign
) {
5400 return subFloat128Sigs( a
, b
, aSign STATUS_VAR
);
5403 return addFloat128Sigs( a
, b
, aSign STATUS_VAR
);
5408 /*----------------------------------------------------------------------------
5409 | Returns the result of multiplying the quadruple-precision floating-point
5410 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5411 | Standard for Binary Floating-Point Arithmetic.
5412 *----------------------------------------------------------------------------*/
5414 float128
float128_mul( float128 a
, float128 b STATUS_PARAM
)
5416 flag aSign
, bSign
, zSign
;
5417 int32 aExp
, bExp
, zExp
;
5418 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
, zSig3
;
5421 aSig1
= extractFloat128Frac1( a
);
5422 aSig0
= extractFloat128Frac0( a
);
5423 aExp
= extractFloat128Exp( a
);
5424 aSign
= extractFloat128Sign( a
);
5425 bSig1
= extractFloat128Frac1( b
);
5426 bSig0
= extractFloat128Frac0( b
);
5427 bExp
= extractFloat128Exp( b
);
5428 bSign
= extractFloat128Sign( b
);
5429 zSign
= aSign
^ bSign
;
5430 if ( aExp
== 0x7FFF ) {
5431 if ( ( aSig0
| aSig1
)
5432 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
5433 return propagateFloat128NaN( a
, b STATUS_VAR
);
5435 if ( ( bExp
| bSig0
| bSig1
) == 0 ) goto invalid
;
5436 return packFloat128( zSign
, 0x7FFF, 0, 0 );
5438 if ( bExp
== 0x7FFF ) {
5439 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
5440 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
5442 float_raise( float_flag_invalid STATUS_VAR
);
5443 z
.low
= float128_default_nan_low
;
5444 z
.high
= float128_default_nan_high
;
5447 return packFloat128( zSign
, 0x7FFF, 0, 0 );
5450 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
5451 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
5454 if ( ( bSig0
| bSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
5455 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
5457 zExp
= aExp
+ bExp
- 0x4000;
5458 aSig0
|= LIT64( 0x0001000000000000 );
5459 shortShift128Left( bSig0
, bSig1
, 16, &bSig0
, &bSig1
);
5460 mul128To256( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
, &zSig2
, &zSig3
);
5461 add128( zSig0
, zSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
5462 zSig2
|= ( zSig3
!= 0 );
5463 if ( LIT64( 0x0002000000000000 ) <= zSig0
) {
5464 shift128ExtraRightJamming(
5465 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
5468 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2 STATUS_VAR
);
5472 /*----------------------------------------------------------------------------
5473 | Returns the result of dividing the quadruple-precision floating-point value
5474 | `a' by the corresponding value `b'. The operation is performed according to
5475 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5476 *----------------------------------------------------------------------------*/
5478 float128
float128_div( float128 a
, float128 b STATUS_PARAM
)
5480 flag aSign
, bSign
, zSign
;
5481 int32 aExp
, bExp
, zExp
;
5482 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
5483 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
5486 aSig1
= extractFloat128Frac1( a
);
5487 aSig0
= extractFloat128Frac0( a
);
5488 aExp
= extractFloat128Exp( a
);
5489 aSign
= extractFloat128Sign( a
);
5490 bSig1
= extractFloat128Frac1( b
);
5491 bSig0
= extractFloat128Frac0( b
);
5492 bExp
= extractFloat128Exp( b
);
5493 bSign
= extractFloat128Sign( b
);
5494 zSign
= aSign
^ bSign
;
5495 if ( aExp
== 0x7FFF ) {
5496 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
5497 if ( bExp
== 0x7FFF ) {
5498 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
5501 return packFloat128( zSign
, 0x7FFF, 0, 0 );
5503 if ( bExp
== 0x7FFF ) {
5504 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
5505 return packFloat128( zSign
, 0, 0, 0 );
5508 if ( ( bSig0
| bSig1
) == 0 ) {
5509 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
5511 float_raise( float_flag_invalid STATUS_VAR
);
5512 z
.low
= float128_default_nan_low
;
5513 z
.high
= float128_default_nan_high
;
5516 float_raise( float_flag_divbyzero STATUS_VAR
);
5517 return packFloat128( zSign
, 0x7FFF, 0, 0 );
5519 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
5522 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
5523 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
5525 zExp
= aExp
- bExp
+ 0x3FFD;
5527 aSig0
| LIT64( 0x0001000000000000 ), aSig1
, 15, &aSig0
, &aSig1
);
5529 bSig0
| LIT64( 0x0001000000000000 ), bSig1
, 15, &bSig0
, &bSig1
);
5530 if ( le128( bSig0
, bSig1
, aSig0
, aSig1
) ) {
5531 shift128Right( aSig0
, aSig1
, 1, &aSig0
, &aSig1
);
5534 zSig0
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
5535 mul128By64To192( bSig0
, bSig1
, zSig0
, &term0
, &term1
, &term2
);
5536 sub192( aSig0
, aSig1
, 0, term0
, term1
, term2
, &rem0
, &rem1
, &rem2
);
5537 while ( (int64_t) rem0
< 0 ) {
5539 add192( rem0
, rem1
, rem2
, 0, bSig0
, bSig1
, &rem0
, &rem1
, &rem2
);
5541 zSig1
= estimateDiv128To64( rem1
, rem2
, bSig0
);
5542 if ( ( zSig1
& 0x3FFF ) <= 4 ) {
5543 mul128By64To192( bSig0
, bSig1
, zSig1
, &term1
, &term2
, &term3
);
5544 sub192( rem1
, rem2
, 0, term1
, term2
, term3
, &rem1
, &rem2
, &rem3
);
5545 while ( (int64_t) rem1
< 0 ) {
5547 add192( rem1
, rem2
, rem3
, 0, bSig0
, bSig1
, &rem1
, &rem2
, &rem3
);
5549 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
5551 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 15, &zSig0
, &zSig1
, &zSig2
);
5552 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2 STATUS_VAR
);
5556 /*----------------------------------------------------------------------------
5557 | Returns the remainder of the quadruple-precision floating-point value `a'
5558 | with respect to the corresponding value `b'. The operation is performed
5559 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5560 *----------------------------------------------------------------------------*/
5562 float128
float128_rem( float128 a
, float128 b STATUS_PARAM
)
5565 int32 aExp
, bExp
, expDiff
;
5566 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, q
, term0
, term1
, term2
;
5567 uint64_t allZero
, alternateASig0
, alternateASig1
, sigMean1
;
5571 aSig1
= extractFloat128Frac1( a
);
5572 aSig0
= extractFloat128Frac0( a
);
5573 aExp
= extractFloat128Exp( a
);
5574 aSign
= extractFloat128Sign( a
);
5575 bSig1
= extractFloat128Frac1( b
);
5576 bSig0
= extractFloat128Frac0( b
);
5577 bExp
= extractFloat128Exp( b
);
5578 if ( aExp
== 0x7FFF ) {
5579 if ( ( aSig0
| aSig1
)
5580 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
5581 return propagateFloat128NaN( a
, b STATUS_VAR
);
5585 if ( bExp
== 0x7FFF ) {
5586 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
5590 if ( ( bSig0
| bSig1
) == 0 ) {
5592 float_raise( float_flag_invalid STATUS_VAR
);
5593 z
.low
= float128_default_nan_low
;
5594 z
.high
= float128_default_nan_high
;
5597 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
5600 if ( ( aSig0
| aSig1
) == 0 ) return a
;
5601 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
5603 expDiff
= aExp
- bExp
;
5604 if ( expDiff
< -1 ) return a
;
5606 aSig0
| LIT64( 0x0001000000000000 ),
5608 15 - ( expDiff
< 0 ),
5613 bSig0
| LIT64( 0x0001000000000000 ), bSig1
, 15, &bSig0
, &bSig1
);
5614 q
= le128( bSig0
, bSig1
, aSig0
, aSig1
);
5615 if ( q
) sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
5617 while ( 0 < expDiff
) {
5618 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
5619 q
= ( 4 < q
) ? q
- 4 : 0;
5620 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
5621 shortShift192Left( term0
, term1
, term2
, 61, &term1
, &term2
, &allZero
);
5622 shortShift128Left( aSig0
, aSig1
, 61, &aSig0
, &allZero
);
5623 sub128( aSig0
, 0, term1
, term2
, &aSig0
, &aSig1
);
5626 if ( -64 < expDiff
) {
5627 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
5628 q
= ( 4 < q
) ? q
- 4 : 0;
5630 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
5632 if ( expDiff
< 0 ) {
5633 shift128Right( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
5636 shortShift128Left( aSig0
, aSig1
, expDiff
, &aSig0
, &aSig1
);
5638 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
5639 sub128( aSig0
, aSig1
, term1
, term2
, &aSig0
, &aSig1
);
5642 shift128Right( aSig0
, aSig1
, 12, &aSig0
, &aSig1
);
5643 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
5646 alternateASig0
= aSig0
;
5647 alternateASig1
= aSig1
;
5649 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
5650 } while ( 0 <= (int64_t) aSig0
);
5652 aSig0
, aSig1
, alternateASig0
, alternateASig1
, (uint64_t *)&sigMean0
, &sigMean1
);
5653 if ( ( sigMean0
< 0 )
5654 || ( ( ( sigMean0
| sigMean1
) == 0 ) && ( q
& 1 ) ) ) {
5655 aSig0
= alternateASig0
;
5656 aSig1
= alternateASig1
;
5658 zSign
= ( (int64_t) aSig0
< 0 );
5659 if ( zSign
) sub128( 0, 0, aSig0
, aSig1
, &aSig0
, &aSig1
);
5661 normalizeRoundAndPackFloat128( aSign
^ zSign
, bExp
- 4, aSig0
, aSig1 STATUS_VAR
);
5665 /*----------------------------------------------------------------------------
5666 | Returns the square root of the quadruple-precision floating-point value `a'.
5667 | The operation is performed according to the IEC/IEEE Standard for Binary
5668 | Floating-Point Arithmetic.
5669 *----------------------------------------------------------------------------*/
5671 float128
float128_sqrt( float128 a STATUS_PARAM
)
5675 uint64_t aSig0
, aSig1
, zSig0
, zSig1
, zSig2
, doubleZSig0
;
5676 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
5679 aSig1
= extractFloat128Frac1( a
);
5680 aSig0
= extractFloat128Frac0( a
);
5681 aExp
= extractFloat128Exp( a
);
5682 aSign
= extractFloat128Sign( a
);
5683 if ( aExp
== 0x7FFF ) {
5684 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, a STATUS_VAR
);
5685 if ( ! aSign
) return a
;
5689 if ( ( aExp
| aSig0
| aSig1
) == 0 ) return a
;
5691 float_raise( float_flag_invalid STATUS_VAR
);
5692 z
.low
= float128_default_nan_low
;
5693 z
.high
= float128_default_nan_high
;
5697 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( 0, 0, 0, 0 );
5698 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
5700 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFE;
5701 aSig0
|= LIT64( 0x0001000000000000 );
5702 zSig0
= estimateSqrt32( aExp
, aSig0
>>17 );
5703 shortShift128Left( aSig0
, aSig1
, 13 - ( aExp
& 1 ), &aSig0
, &aSig1
);
5704 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
5705 doubleZSig0
= zSig0
<<1;
5706 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
5707 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
5708 while ( (int64_t) rem0
< 0 ) {
5711 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
5713 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
5714 if ( ( zSig1
& 0x1FFF ) <= 5 ) {
5715 if ( zSig1
== 0 ) zSig1
= 1;
5716 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
5717 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
5718 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
5719 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
5720 while ( (int64_t) rem1
< 0 ) {
5722 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
5724 term2
|= doubleZSig0
;
5725 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
5727 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
5729 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 14, &zSig0
, &zSig1
, &zSig2
);
5730 return roundAndPackFloat128( 0, zExp
, zSig0
, zSig1
, zSig2 STATUS_VAR
);
5734 /*----------------------------------------------------------------------------
5735 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5736 | the corresponding value `b', and 0 otherwise. The invalid exception is
5737 | raised if either operand is a NaN. Otherwise, the comparison is performed
5738 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5739 *----------------------------------------------------------------------------*/
5741 int float128_eq( float128 a
, float128 b STATUS_PARAM
)
5744 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5745 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5746 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5747 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5749 float_raise( float_flag_invalid STATUS_VAR
);
5754 && ( ( a
.high
== b
.high
)
5756 && ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
5761 /*----------------------------------------------------------------------------
5762 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5763 | or equal to the corresponding value `b', and 0 otherwise. The invalid
5764 | exception is raised if either operand is a NaN. The comparison is performed
5765 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5766 *----------------------------------------------------------------------------*/
5768 int float128_le( float128 a
, float128 b STATUS_PARAM
)
5772 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5773 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5774 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5775 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5777 float_raise( float_flag_invalid STATUS_VAR
);
5780 aSign
= extractFloat128Sign( a
);
5781 bSign
= extractFloat128Sign( b
);
5782 if ( aSign
!= bSign
) {
5785 || ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5789 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
5790 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
5794 /*----------------------------------------------------------------------------
5795 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5796 | the corresponding value `b', and 0 otherwise. The invalid exception is
5797 | raised if either operand is a NaN. The comparison is performed according
5798 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5799 *----------------------------------------------------------------------------*/
5801 int float128_lt( float128 a
, float128 b STATUS_PARAM
)
5805 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5806 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5807 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5808 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5810 float_raise( float_flag_invalid STATUS_VAR
);
5813 aSign
= extractFloat128Sign( a
);
5814 bSign
= extractFloat128Sign( b
);
5815 if ( aSign
!= bSign
) {
5818 && ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5822 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
5823 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
5827 /*----------------------------------------------------------------------------
5828 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
5829 | be compared, and 0 otherwise. The invalid exception is raised if either
5830 | operand is a NaN. The comparison is performed according to the IEC/IEEE
5831 | Standard for Binary Floating-Point Arithmetic.
5832 *----------------------------------------------------------------------------*/
5834 int float128_unordered( float128 a
, float128 b STATUS_PARAM
)
5836 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5837 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5838 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5839 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5841 float_raise( float_flag_invalid STATUS_VAR
);
5847 /*----------------------------------------------------------------------------
5848 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5849 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5850 | exception. The comparison is performed according to the IEC/IEEE Standard
5851 | for Binary Floating-Point Arithmetic.
5852 *----------------------------------------------------------------------------*/
5854 int float128_eq_quiet( float128 a
, float128 b STATUS_PARAM
)
5857 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5858 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5859 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5860 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5862 if ( float128_is_signaling_nan( a
)
5863 || float128_is_signaling_nan( b
) ) {
5864 float_raise( float_flag_invalid STATUS_VAR
);
5870 && ( ( a
.high
== b
.high
)
5872 && ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
5877 /*----------------------------------------------------------------------------
5878 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5879 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5880 | cause an exception. Otherwise, the comparison is performed according to the
5881 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5882 *----------------------------------------------------------------------------*/
5884 int float128_le_quiet( float128 a
, float128 b STATUS_PARAM
)
5888 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5889 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5890 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5891 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5893 if ( float128_is_signaling_nan( a
)
5894 || float128_is_signaling_nan( b
) ) {
5895 float_raise( float_flag_invalid STATUS_VAR
);
5899 aSign
= extractFloat128Sign( a
);
5900 bSign
= extractFloat128Sign( b
);
5901 if ( aSign
!= bSign
) {
5904 || ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5908 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
5909 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
5913 /*----------------------------------------------------------------------------
5914 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5915 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5916 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
5917 | Standard for Binary Floating-Point Arithmetic.
5918 *----------------------------------------------------------------------------*/
5920 int float128_lt_quiet( float128 a
, float128 b STATUS_PARAM
)
5924 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5925 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5926 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5927 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5929 if ( float128_is_signaling_nan( a
)
5930 || float128_is_signaling_nan( b
) ) {
5931 float_raise( float_flag_invalid STATUS_VAR
);
5935 aSign
= extractFloat128Sign( a
);
5936 bSign
= extractFloat128Sign( b
);
5937 if ( aSign
!= bSign
) {
5940 && ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5944 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
5945 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
5949 /*----------------------------------------------------------------------------
5950 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
5951 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
5952 | comparison is performed according to the IEC/IEEE Standard for Binary
5953 | Floating-Point Arithmetic.
5954 *----------------------------------------------------------------------------*/
5956 int float128_unordered_quiet( float128 a
, float128 b STATUS_PARAM
)
5958 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5959 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5960 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5961 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5963 if ( float128_is_signaling_nan( a
)
5964 || float128_is_signaling_nan( b
) ) {
5965 float_raise( float_flag_invalid STATUS_VAR
);
5972 /* misc functions */
5973 float32
uint32_to_float32( uint32 a STATUS_PARAM
)
5975 return int64_to_float32(a STATUS_VAR
);
5978 float64
uint32_to_float64( uint32 a STATUS_PARAM
)
5980 return int64_to_float64(a STATUS_VAR
);
5983 uint32
float32_to_uint32( float32 a STATUS_PARAM
)
5988 v
= float32_to_int64(a STATUS_VAR
);
5991 float_raise( float_flag_invalid STATUS_VAR
);
5992 } else if (v
> 0xffffffff) {
5994 float_raise( float_flag_invalid STATUS_VAR
);
6001 uint32
float32_to_uint32_round_to_zero( float32 a STATUS_PARAM
)
6006 v
= float32_to_int64_round_to_zero(a STATUS_VAR
);
6009 float_raise( float_flag_invalid STATUS_VAR
);
6010 } else if (v
> 0xffffffff) {
6012 float_raise( float_flag_invalid STATUS_VAR
);
6019 uint16
float32_to_uint16_round_to_zero( float32 a STATUS_PARAM
)
6024 v
= float32_to_int64_round_to_zero(a STATUS_VAR
);
6027 float_raise( float_flag_invalid STATUS_VAR
);
6028 } else if (v
> 0xffff) {
6030 float_raise( float_flag_invalid STATUS_VAR
);
6037 uint32
float64_to_uint32( float64 a STATUS_PARAM
)
6042 v
= float64_to_int64(a STATUS_VAR
);
6045 float_raise( float_flag_invalid STATUS_VAR
);
6046 } else if (v
> 0xffffffff) {
6048 float_raise( float_flag_invalid STATUS_VAR
);
6055 uint32
float64_to_uint32_round_to_zero( float64 a STATUS_PARAM
)
6060 v
= float64_to_int64_round_to_zero(a STATUS_VAR
);
6063 float_raise( float_flag_invalid STATUS_VAR
);
6064 } else if (v
> 0xffffffff) {
6066 float_raise( float_flag_invalid STATUS_VAR
);
6073 uint16
float64_to_uint16_round_to_zero( float64 a STATUS_PARAM
)
6078 v
= float64_to_int64_round_to_zero(a STATUS_VAR
);
6081 float_raise( float_flag_invalid STATUS_VAR
);
6082 } else if (v
> 0xffff) {
6084 float_raise( float_flag_invalid STATUS_VAR
);
6091 /* FIXME: This looks broken. */
6092 uint64_t float64_to_uint64 (float64 a STATUS_PARAM
)
6096 v
= float64_val(int64_to_float64(INT64_MIN STATUS_VAR
));
6097 v
+= float64_val(a
);
6098 v
= float64_to_int64(make_float64(v
) STATUS_VAR
);
6100 return v
- INT64_MIN
;
6103 uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM
)
6107 v
= float64_val(int64_to_float64(INT64_MIN STATUS_VAR
));
6108 v
+= float64_val(a
);
6109 v
= float64_to_int64_round_to_zero(make_float64(v
) STATUS_VAR
);
6111 return v
- INT64_MIN
;
6114 #define COMPARE(s, nan_exp) \
6115 INLINE int float ## s ## _compare_internal( float ## s a, float ## s b, \
6116 int is_quiet STATUS_PARAM ) \
6118 flag aSign, bSign; \
6119 uint ## s ## _t av, bv; \
6120 a = float ## s ## _squash_input_denormal(a STATUS_VAR); \
6121 b = float ## s ## _squash_input_denormal(b STATUS_VAR); \
6123 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \
6124 extractFloat ## s ## Frac( a ) ) || \
6125 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \
6126 extractFloat ## s ## Frac( b ) )) { \
6128 float ## s ## _is_signaling_nan( a ) || \
6129 float ## s ## _is_signaling_nan( b ) ) { \
6130 float_raise( float_flag_invalid STATUS_VAR); \
6132 return float_relation_unordered; \
6134 aSign = extractFloat ## s ## Sign( a ); \
6135 bSign = extractFloat ## s ## Sign( b ); \
6136 av = float ## s ## _val(a); \
6137 bv = float ## s ## _val(b); \
6138 if ( aSign != bSign ) { \
6139 if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) { \
6141 return float_relation_equal; \
6143 return 1 - (2 * aSign); \
6147 return float_relation_equal; \
6149 return 1 - 2 * (aSign ^ ( av < bv )); \
6154 int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM ) \
6156 return float ## s ## _compare_internal(a, b, 0 STATUS_VAR); \
6159 int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM ) \
6161 return float ## s ## _compare_internal(a, b, 1 STATUS_VAR); \
6167 INLINE
int floatx80_compare_internal( floatx80 a
, floatx80 b
,
6168 int is_quiet STATUS_PARAM
)
6172 if (( ( extractFloatx80Exp( a
) == 0x7fff ) &&
6173 ( extractFloatx80Frac( a
)<<1 ) ) ||
6174 ( ( extractFloatx80Exp( b
) == 0x7fff ) &&
6175 ( extractFloatx80Frac( b
)<<1 ) )) {
6177 floatx80_is_signaling_nan( a
) ||
6178 floatx80_is_signaling_nan( b
) ) {
6179 float_raise( float_flag_invalid STATUS_VAR
);
6181 return float_relation_unordered
;
6183 aSign
= extractFloatx80Sign( a
);
6184 bSign
= extractFloatx80Sign( b
);
6185 if ( aSign
!= bSign
) {
6187 if ( ( ( (uint16_t) ( ( a
.high
| b
.high
) << 1 ) ) == 0) &&
6188 ( ( a
.low
| b
.low
) == 0 ) ) {
6190 return float_relation_equal
;
6192 return 1 - (2 * aSign
);
6195 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
6196 return float_relation_equal
;
6198 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
6203 int floatx80_compare( floatx80 a
, floatx80 b STATUS_PARAM
)
6205 return floatx80_compare_internal(a
, b
, 0 STATUS_VAR
);
6208 int floatx80_compare_quiet( floatx80 a
, floatx80 b STATUS_PARAM
)
6210 return floatx80_compare_internal(a
, b
, 1 STATUS_VAR
);
6213 INLINE
int float128_compare_internal( float128 a
, float128 b
,
6214 int is_quiet STATUS_PARAM
)
6218 if (( ( extractFloat128Exp( a
) == 0x7fff ) &&
6219 ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) ) ||
6220 ( ( extractFloat128Exp( b
) == 0x7fff ) &&
6221 ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )) {
6223 float128_is_signaling_nan( a
) ||
6224 float128_is_signaling_nan( b
) ) {
6225 float_raise( float_flag_invalid STATUS_VAR
);
6227 return float_relation_unordered
;
6229 aSign
= extractFloat128Sign( a
);
6230 bSign
= extractFloat128Sign( b
);
6231 if ( aSign
!= bSign
) {
6232 if ( ( ( ( a
.high
| b
.high
)<<1 ) | a
.low
| b
.low
) == 0 ) {
6234 return float_relation_equal
;
6236 return 1 - (2 * aSign
);
6239 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
6240 return float_relation_equal
;
6242 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
6247 int float128_compare( float128 a
, float128 b STATUS_PARAM
)
6249 return float128_compare_internal(a
, b
, 0 STATUS_VAR
);
6252 int float128_compare_quiet( float128 a
, float128 b STATUS_PARAM
)
6254 return float128_compare_internal(a
, b
, 1 STATUS_VAR
);
6257 /* min() and max() functions. These can't be implemented as
6258 * 'compare and pick one input' because that would mishandle
6259 * NaNs and +0 vs -0.
6261 #define MINMAX(s, nan_exp) \
6262 INLINE float ## s float ## s ## _minmax(float ## s a, float ## s b, \
6263 int ismin STATUS_PARAM ) \
6265 flag aSign, bSign; \
6266 uint ## s ## _t av, bv; \
6267 a = float ## s ## _squash_input_denormal(a STATUS_VAR); \
6268 b = float ## s ## _squash_input_denormal(b STATUS_VAR); \
6269 if (float ## s ## _is_any_nan(a) || \
6270 float ## s ## _is_any_nan(b)) { \
6271 return propagateFloat ## s ## NaN(a, b STATUS_VAR); \
6273 aSign = extractFloat ## s ## Sign(a); \
6274 bSign = extractFloat ## s ## Sign(b); \
6275 av = float ## s ## _val(a); \
6276 bv = float ## s ## _val(b); \
6277 if (aSign != bSign) { \
6279 return aSign ? a : b; \
6281 return aSign ? b : a; \
6285 return (aSign ^ (av < bv)) ? a : b; \
6287 return (aSign ^ (av < bv)) ? b : a; \
6292 float ## s float ## s ## _min(float ## s a, float ## s b STATUS_PARAM) \
6294 return float ## s ## _minmax(a, b, 1 STATUS_VAR); \
6297 float ## s float ## s ## _max(float ## s a, float ## s b STATUS_PARAM) \
6299 return float ## s ## _minmax(a, b, 0 STATUS_VAR); \
6306 /* Multiply A by 2 raised to the power N. */
6307 float32
float32_scalbn( float32 a
, int n STATUS_PARAM
)
6313 a
= float32_squash_input_denormal(a STATUS_VAR
);
6314 aSig
= extractFloat32Frac( a
);
6315 aExp
= extractFloat32Exp( a
);
6316 aSign
= extractFloat32Sign( a
);
6318 if ( aExp
== 0xFF ) {
6320 return propagateFloat32NaN( a
, a STATUS_VAR
);
6326 else if ( aSig
== 0 )
6331 } else if (n
< -0x200) {
6337 return normalizeRoundAndPackFloat32( aSign
, aExp
, aSig STATUS_VAR
);
6340 float64
float64_scalbn( float64 a
, int n STATUS_PARAM
)
6346 a
= float64_squash_input_denormal(a STATUS_VAR
);
6347 aSig
= extractFloat64Frac( a
);
6348 aExp
= extractFloat64Exp( a
);
6349 aSign
= extractFloat64Sign( a
);
6351 if ( aExp
== 0x7FF ) {
6353 return propagateFloat64NaN( a
, a STATUS_VAR
);
6358 aSig
|= LIT64( 0x0010000000000000 );
6359 else if ( aSig
== 0 )
6364 } else if (n
< -0x1000) {
6370 return normalizeRoundAndPackFloat64( aSign
, aExp
, aSig STATUS_VAR
);
6373 floatx80
floatx80_scalbn( floatx80 a
, int n STATUS_PARAM
)
6379 aSig
= extractFloatx80Frac( a
);
6380 aExp
= extractFloatx80Exp( a
);
6381 aSign
= extractFloatx80Sign( a
);
6383 if ( aExp
== 0x7FFF ) {
6385 return propagateFloatx80NaN( a
, a STATUS_VAR
);
6390 if (aExp
== 0 && aSig
== 0)
6395 } else if (n
< -0x10000) {
6400 return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision
),
6401 aSign
, aExp
, aSig
, 0 STATUS_VAR
);
6404 float128
float128_scalbn( float128 a
, int n STATUS_PARAM
)
6408 uint64_t aSig0
, aSig1
;
6410 aSig1
= extractFloat128Frac1( a
);
6411 aSig0
= extractFloat128Frac0( a
);
6412 aExp
= extractFloat128Exp( a
);
6413 aSign
= extractFloat128Sign( a
);
6414 if ( aExp
== 0x7FFF ) {
6415 if ( aSig0
| aSig1
) {
6416 return propagateFloat128NaN( a
, a STATUS_VAR
);
6421 aSig0
|= LIT64( 0x0001000000000000 );
6422 else if ( aSig0
== 0 && aSig1
== 0 )
6427 } else if (n
< -0x10000) {
6432 return normalizeRoundAndPackFloat128( aSign
, aExp
, aSig0
, aSig1