2 /*============================================================================
4 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
7 Written by John R. Hauser. This work was made possible in part by the
8 International Computer Science Institute, located at Suite 600, 1947 Center
9 Street, Berkeley, California 94704. Funding was partially provided by the
10 National Science Foundation under grant MIP-9311980. The original version
11 of this code was written as part of a project to build a fixed-point vector
12 processor in collaboration with the University of California at Berkeley,
13 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
14 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
15 arithmetic/SoftFloat.html'.
17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
18 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
19 RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
20 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
21 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
22 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
23 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
24 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
26 Derivative works are acceptable, even for commercial purposes, so long as
27 (1) the source code for the derivative work includes prominent notice that
28 the work is derivative, and (2) the source code includes prominent notice with
29 these four paragraphs for those parts of this code that are retained.
31 =============================================================================*/
33 #include "softfloat.h"
35 /*----------------------------------------------------------------------------
36 | Primitive arithmetic functions, including multi-word arithmetic, and
37 | division and square root approximations. (Can be specialized to target if
39 *----------------------------------------------------------------------------*/
40 #include "softfloat-macros.h"
42 /*----------------------------------------------------------------------------
43 | Functions and definitions to determine: (1) whether tininess for underflow
44 | is detected before or after rounding by default, (2) what (if anything)
45 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
46 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
47 | are propagated from function inputs to output. These details are target-
49 *----------------------------------------------------------------------------*/
50 #include "softfloat-specialize.h"
52 void set_float_rounding_mode(int val STATUS_PARAM
)
54 STATUS(float_rounding_mode
) = val
;
57 void set_float_exception_flags(int val STATUS_PARAM
)
59 STATUS(float_exception_flags
) = val
;
63 void set_floatx80_rounding_precision(int val STATUS_PARAM
)
65 STATUS(floatx80_rounding_precision
) = val
;
69 /*----------------------------------------------------------------------------
70 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
71 | and 7, and returns the properly rounded 32-bit integer corresponding to the
72 | input. If `zSign' is 1, the input is negated before being converted to an
73 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
74 | is simply rounded to an integer, with the inexact exception raised if the
75 | input cannot be represented exactly as an integer. However, if the fixed-
76 | point input is too large, the invalid exception is raised and the largest
77 | positive or negative integer is returned.
78 *----------------------------------------------------------------------------*/
80 static int32
roundAndPackInt32( flag zSign
, bits64 absZ STATUS_PARAM
)
83 flag roundNearestEven
;
84 int8 roundIncrement
, roundBits
;
87 roundingMode
= STATUS(float_rounding_mode
);
88 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
89 roundIncrement
= 0x40;
90 if ( ! roundNearestEven
) {
91 if ( roundingMode
== float_round_to_zero
) {
95 roundIncrement
= 0x7F;
97 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
100 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
104 roundBits
= absZ
& 0x7F;
105 absZ
= ( absZ
+ roundIncrement
)>>7;
106 absZ
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
108 if ( zSign
) z
= - z
;
109 if ( ( absZ
>>32 ) || ( z
&& ( ( z
< 0 ) ^ zSign
) ) ) {
110 float_raise( float_flag_invalid STATUS_VAR
);
111 return zSign
? (sbits32
) 0x80000000 : 0x7FFFFFFF;
113 if ( roundBits
) STATUS(float_exception_flags
) |= float_flag_inexact
;
118 /*----------------------------------------------------------------------------
119 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
120 | `absZ1', with binary point between bits 63 and 64 (between the input words),
121 | and returns the properly rounded 64-bit integer corresponding to the input.
122 | If `zSign' is 1, the input is negated before being converted to an integer.
123 | Ordinarily, the fixed-point input is simply rounded to an integer, with
124 | the inexact exception raised if the input cannot be represented exactly as
125 | an integer. However, if the fixed-point input is too large, the invalid
126 | exception is raised and the largest positive or negative integer is
128 *----------------------------------------------------------------------------*/
130 static int64
roundAndPackInt64( flag zSign
, bits64 absZ0
, bits64 absZ1 STATUS_PARAM
)
133 flag roundNearestEven
, increment
;
136 roundingMode
= STATUS(float_rounding_mode
);
137 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
138 increment
= ( (sbits64
) absZ1
< 0 );
139 if ( ! roundNearestEven
) {
140 if ( roundingMode
== float_round_to_zero
) {
145 increment
= ( roundingMode
== float_round_down
) && absZ1
;
148 increment
= ( roundingMode
== float_round_up
) && absZ1
;
154 if ( absZ0
== 0 ) goto overflow
;
155 absZ0
&= ~ ( ( (bits64
) ( absZ1
<<1 ) == 0 ) & roundNearestEven
);
158 if ( zSign
) z
= - z
;
159 if ( z
&& ( ( z
< 0 ) ^ zSign
) ) {
161 float_raise( float_flag_invalid STATUS_VAR
);
163 zSign
? (sbits64
) LIT64( 0x8000000000000000 )
164 : LIT64( 0x7FFFFFFFFFFFFFFF );
166 if ( absZ1
) STATUS(float_exception_flags
) |= float_flag_inexact
;
171 /*----------------------------------------------------------------------------
172 | Returns the fraction bits of the single-precision floating-point value `a'.
173 *----------------------------------------------------------------------------*/
175 INLINE bits32
extractFloat32Frac( float32 a
)
178 return float32_val(a
) & 0x007FFFFF;
182 /*----------------------------------------------------------------------------
183 | Returns the exponent bits of the single-precision floating-point value `a'.
184 *----------------------------------------------------------------------------*/
186 INLINE int16
extractFloat32Exp( float32 a
)
189 return ( float32_val(a
)>>23 ) & 0xFF;
193 /*----------------------------------------------------------------------------
194 | Returns the sign bit of the single-precision floating-point value `a'.
195 *----------------------------------------------------------------------------*/
197 INLINE flag
extractFloat32Sign( float32 a
)
200 return float32_val(a
)>>31;
204 /*----------------------------------------------------------------------------
205 | Normalizes the subnormal single-precision floating-point value represented
206 | by the denormalized significand `aSig'. The normalized exponent and
207 | significand are stored at the locations pointed to by `zExpPtr' and
208 | `zSigPtr', respectively.
209 *----------------------------------------------------------------------------*/
212 normalizeFloat32Subnormal( bits32 aSig
, int16
*zExpPtr
, bits32
*zSigPtr
)
216 shiftCount
= countLeadingZeros32( aSig
) - 8;
217 *zSigPtr
= aSig
<<shiftCount
;
218 *zExpPtr
= 1 - shiftCount
;
222 /*----------------------------------------------------------------------------
223 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
224 | single-precision floating-point value, returning the result. After being
225 | shifted into the proper positions, the three fields are simply added
226 | together to form the result. This means that any integer portion of `zSig'
227 | will be added into the exponent. Since a properly normalized significand
228 | will have an integer portion equal to 1, the `zExp' input should be 1 less
229 | than the desired result exponent whenever `zSig' is a complete, normalized
231 *----------------------------------------------------------------------------*/
233 INLINE float32
packFloat32( flag zSign
, int16 zExp
, bits32 zSig
)
237 ( ( (bits32
) zSign
)<<31 ) + ( ( (bits32
) zExp
)<<23 ) + zSig
);
241 /*----------------------------------------------------------------------------
242 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
243 | and significand `zSig', and returns the proper single-precision floating-
244 | point value corresponding to the abstract input. Ordinarily, the abstract
245 | value is simply rounded and packed into the single-precision format, with
246 | the inexact exception raised if the abstract input cannot be represented
247 | exactly. However, if the abstract value is too large, the overflow and
248 | inexact exceptions are raised and an infinity or maximal finite value is
249 | returned. If the abstract value is too small, the input value is rounded to
250 | a subnormal number, and the underflow and inexact exceptions are raised if
251 | the abstract input cannot be represented exactly as a subnormal single-
252 | precision floating-point number.
253 | The input significand `zSig' has its binary point between bits 30
254 | and 29, which is 7 bits to the left of the usual location. This shifted
255 | significand must be normalized or smaller. If `zSig' is not normalized,
256 | `zExp' must be 0; in that case, the result returned is a subnormal number,
257 | and it must not require rounding. In the usual case that `zSig' is
258 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
259 | The handling of underflow and overflow follows the IEC/IEEE Standard for
260 | Binary Floating-Point Arithmetic.
261 *----------------------------------------------------------------------------*/
263 static float32
roundAndPackFloat32( flag zSign
, int16 zExp
, bits32 zSig STATUS_PARAM
)
266 flag roundNearestEven
;
267 int8 roundIncrement
, roundBits
;
270 roundingMode
= STATUS(float_rounding_mode
);
271 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
272 roundIncrement
= 0x40;
273 if ( ! roundNearestEven
) {
274 if ( roundingMode
== float_round_to_zero
) {
278 roundIncrement
= 0x7F;
280 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
283 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
287 roundBits
= zSig
& 0x7F;
288 if ( 0xFD <= (bits16
) zExp
) {
290 || ( ( zExp
== 0xFD )
291 && ( (sbits32
) ( zSig
+ roundIncrement
) < 0 ) )
293 float_raise( float_flag_overflow
| float_flag_inexact STATUS_VAR
);
294 return packFloat32( zSign
, 0xFF, - ( roundIncrement
== 0 ));
298 ( STATUS(float_detect_tininess
) == float_tininess_before_rounding
)
300 || ( zSig
+ roundIncrement
< 0x80000000 );
301 shift32RightJamming( zSig
, - zExp
, &zSig
);
303 roundBits
= zSig
& 0x7F;
304 if ( isTiny
&& roundBits
) float_raise( float_flag_underflow STATUS_VAR
);
307 if ( roundBits
) STATUS(float_exception_flags
) |= float_flag_inexact
;
308 zSig
= ( zSig
+ roundIncrement
)>>7;
309 zSig
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
310 if ( zSig
== 0 ) zExp
= 0;
311 return packFloat32( zSign
, zExp
, zSig
);
315 /*----------------------------------------------------------------------------
316 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
317 | and significand `zSig', and returns the proper single-precision floating-
318 | point value corresponding to the abstract input. This routine is just like
319 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
320 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
321 | floating-point exponent.
322 *----------------------------------------------------------------------------*/
325 normalizeRoundAndPackFloat32( flag zSign
, int16 zExp
, bits32 zSig STATUS_PARAM
)
329 shiftCount
= countLeadingZeros32( zSig
) - 1;
330 return roundAndPackFloat32( zSign
, zExp
- shiftCount
, zSig
<<shiftCount STATUS_VAR
);
334 /*----------------------------------------------------------------------------
335 | Returns the fraction bits of the double-precision floating-point value `a'.
336 *----------------------------------------------------------------------------*/
338 INLINE bits64
extractFloat64Frac( float64 a
)
341 return float64_val(a
) & LIT64( 0x000FFFFFFFFFFFFF );
345 /*----------------------------------------------------------------------------
346 | Returns the exponent bits of the double-precision floating-point value `a'.
347 *----------------------------------------------------------------------------*/
349 INLINE int16
extractFloat64Exp( float64 a
)
352 return ( float64_val(a
)>>52 ) & 0x7FF;
356 /*----------------------------------------------------------------------------
357 | Returns the sign bit of the double-precision floating-point value `a'.
358 *----------------------------------------------------------------------------*/
360 INLINE flag
extractFloat64Sign( float64 a
)
363 return float64_val(a
)>>63;
367 /*----------------------------------------------------------------------------
368 | Normalizes the subnormal double-precision floating-point value represented
369 | by the denormalized significand `aSig'. The normalized exponent and
370 | significand are stored at the locations pointed to by `zExpPtr' and
371 | `zSigPtr', respectively.
372 *----------------------------------------------------------------------------*/
375 normalizeFloat64Subnormal( bits64 aSig
, int16
*zExpPtr
, bits64
*zSigPtr
)
379 shiftCount
= countLeadingZeros64( aSig
) - 11;
380 *zSigPtr
= aSig
<<shiftCount
;
381 *zExpPtr
= 1 - shiftCount
;
385 /*----------------------------------------------------------------------------
386 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
387 | double-precision floating-point value, returning the result. After being
388 | shifted into the proper positions, the three fields are simply added
389 | together to form the result. This means that any integer portion of `zSig'
390 | will be added into the exponent. Since a properly normalized significand
391 | will have an integer portion equal to 1, the `zExp' input should be 1 less
392 | than the desired result exponent whenever `zSig' is a complete, normalized
394 *----------------------------------------------------------------------------*/
396 INLINE float64
packFloat64( flag zSign
, int16 zExp
, bits64 zSig
)
400 ( ( (bits64
) zSign
)<<63 ) + ( ( (bits64
) zExp
)<<52 ) + zSig
);
404 /*----------------------------------------------------------------------------
405 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
406 | and significand `zSig', and returns the proper double-precision floating-
407 | point value corresponding to the abstract input. Ordinarily, the abstract
408 | value is simply rounded and packed into the double-precision format, with
409 | the inexact exception raised if the abstract input cannot be represented
410 | exactly. However, if the abstract value is too large, the overflow and
411 | inexact exceptions are raised and an infinity or maximal finite value is
412 | returned. If the abstract value is too small, the input value is rounded
413 | to a subnormal number, and the underflow and inexact exceptions are raised
414 | if the abstract input cannot be represented exactly as a subnormal double-
415 | precision floating-point number.
416 | The input significand `zSig' has its binary point between bits 62
417 | and 61, which is 10 bits to the left of the usual location. This shifted
418 | significand must be normalized or smaller. If `zSig' is not normalized,
419 | `zExp' must be 0; in that case, the result returned is a subnormal number,
420 | and it must not require rounding. In the usual case that `zSig' is
421 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
422 | The handling of underflow and overflow follows the IEC/IEEE Standard for
423 | Binary Floating-Point Arithmetic.
424 *----------------------------------------------------------------------------*/
426 static float64
roundAndPackFloat64( flag zSign
, int16 zExp
, bits64 zSig STATUS_PARAM
)
429 flag roundNearestEven
;
430 int16 roundIncrement
, roundBits
;
433 roundingMode
= STATUS(float_rounding_mode
);
434 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
435 roundIncrement
= 0x200;
436 if ( ! roundNearestEven
) {
437 if ( roundingMode
== float_round_to_zero
) {
441 roundIncrement
= 0x3FF;
443 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
446 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
450 roundBits
= zSig
& 0x3FF;
451 if ( 0x7FD <= (bits16
) zExp
) {
452 if ( ( 0x7FD < zExp
)
453 || ( ( zExp
== 0x7FD )
454 && ( (sbits64
) ( zSig
+ roundIncrement
) < 0 ) )
456 float_raise( float_flag_overflow
| float_flag_inexact STATUS_VAR
);
457 return packFloat64( zSign
, 0x7FF, - ( roundIncrement
== 0 ));
461 ( STATUS(float_detect_tininess
) == float_tininess_before_rounding
)
463 || ( zSig
+ roundIncrement
< LIT64( 0x8000000000000000 ) );
464 shift64RightJamming( zSig
, - zExp
, &zSig
);
466 roundBits
= zSig
& 0x3FF;
467 if ( isTiny
&& roundBits
) float_raise( float_flag_underflow STATUS_VAR
);
470 if ( roundBits
) STATUS(float_exception_flags
) |= float_flag_inexact
;
471 zSig
= ( zSig
+ roundIncrement
)>>10;
472 zSig
&= ~ ( ( ( roundBits
^ 0x200 ) == 0 ) & roundNearestEven
);
473 if ( zSig
== 0 ) zExp
= 0;
474 return packFloat64( zSign
, zExp
, zSig
);
478 /*----------------------------------------------------------------------------
479 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
480 | and significand `zSig', and returns the proper double-precision floating-
481 | point value corresponding to the abstract input. This routine is just like
482 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
483 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
484 | floating-point exponent.
485 *----------------------------------------------------------------------------*/
488 normalizeRoundAndPackFloat64( flag zSign
, int16 zExp
, bits64 zSig STATUS_PARAM
)
492 shiftCount
= countLeadingZeros64( zSig
) - 1;
493 return roundAndPackFloat64( zSign
, zExp
- shiftCount
, zSig
<<shiftCount STATUS_VAR
);
499 /*----------------------------------------------------------------------------
500 | Returns the fraction bits of the extended double-precision floating-point
502 *----------------------------------------------------------------------------*/
504 INLINE bits64
extractFloatx80Frac( floatx80 a
)
511 /*----------------------------------------------------------------------------
512 | Returns the exponent bits of the extended double-precision floating-point
514 *----------------------------------------------------------------------------*/
516 INLINE int32
extractFloatx80Exp( floatx80 a
)
519 return a
.high
& 0x7FFF;
523 /*----------------------------------------------------------------------------
524 | Returns the sign bit of the extended double-precision floating-point value
526 *----------------------------------------------------------------------------*/
528 INLINE flag
extractFloatx80Sign( floatx80 a
)
535 /*----------------------------------------------------------------------------
536 | Normalizes the subnormal extended double-precision floating-point value
537 | represented by the denormalized significand `aSig'. The normalized exponent
538 | and significand are stored at the locations pointed to by `zExpPtr' and
539 | `zSigPtr', respectively.
540 *----------------------------------------------------------------------------*/
543 normalizeFloatx80Subnormal( bits64 aSig
, int32
*zExpPtr
, bits64
*zSigPtr
)
547 shiftCount
= countLeadingZeros64( aSig
);
548 *zSigPtr
= aSig
<<shiftCount
;
549 *zExpPtr
= 1 - shiftCount
;
553 /*----------------------------------------------------------------------------
554 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
555 | extended double-precision floating-point value, returning the result.
556 *----------------------------------------------------------------------------*/
558 INLINE floatx80
packFloatx80( flag zSign
, int32 zExp
, bits64 zSig
)
563 z
.high
= ( ( (bits16
) zSign
)<<15 ) + zExp
;
568 /*----------------------------------------------------------------------------
569 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
570 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
571 | and returns the proper extended double-precision floating-point value
572 | corresponding to the abstract input. Ordinarily, the abstract value is
573 | rounded and packed into the extended double-precision format, with the
574 | inexact exception raised if the abstract input cannot be represented
575 | exactly. However, if the abstract value is too large, the overflow and
576 | inexact exceptions are raised and an infinity or maximal finite value is
577 | returned. If the abstract value is too small, the input value is rounded to
578 | a subnormal number, and the underflow and inexact exceptions are raised if
579 | the abstract input cannot be represented exactly as a subnormal extended
580 | double-precision floating-point number.
581 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
582 | number of bits as single or double precision, respectively. Otherwise, the
583 | result is rounded to the full precision of the extended double-precision
585 | The input significand must be normalized or smaller. If the input
586 | significand is not normalized, `zExp' must be 0; in that case, the result
587 | returned is a subnormal number, and it must not require rounding. The
588 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
589 | Floating-Point Arithmetic.
590 *----------------------------------------------------------------------------*/
593 roundAndPackFloatx80(
594 int8 roundingPrecision
, flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1
598 flag roundNearestEven
, increment
, isTiny
;
599 int64 roundIncrement
, roundMask
, roundBits
;
601 roundingMode
= STATUS(float_rounding_mode
);
602 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
603 if ( roundingPrecision
== 80 ) goto precision80
;
604 if ( roundingPrecision
== 64 ) {
605 roundIncrement
= LIT64( 0x0000000000000400 );
606 roundMask
= LIT64( 0x00000000000007FF );
608 else if ( roundingPrecision
== 32 ) {
609 roundIncrement
= LIT64( 0x0000008000000000 );
610 roundMask
= LIT64( 0x000000FFFFFFFFFF );
615 zSig0
|= ( zSig1
!= 0 );
616 if ( ! roundNearestEven
) {
617 if ( roundingMode
== float_round_to_zero
) {
621 roundIncrement
= roundMask
;
623 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
626 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
630 roundBits
= zSig0
& roundMask
;
631 if ( 0x7FFD <= (bits32
) ( zExp
- 1 ) ) {
632 if ( ( 0x7FFE < zExp
)
633 || ( ( zExp
== 0x7FFE ) && ( zSig0
+ roundIncrement
< zSig0
) )
639 ( STATUS(float_detect_tininess
) == float_tininess_before_rounding
)
641 || ( zSig0
<= zSig0
+ roundIncrement
);
642 shift64RightJamming( zSig0
, 1 - zExp
, &zSig0
);
644 roundBits
= zSig0
& roundMask
;
645 if ( isTiny
&& roundBits
) float_raise( float_flag_underflow STATUS_VAR
);
646 if ( roundBits
) STATUS(float_exception_flags
) |= float_flag_inexact
;
647 zSig0
+= roundIncrement
;
648 if ( (sbits64
) zSig0
< 0 ) zExp
= 1;
649 roundIncrement
= roundMask
+ 1;
650 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
651 roundMask
|= roundIncrement
;
653 zSig0
&= ~ roundMask
;
654 return packFloatx80( zSign
, zExp
, zSig0
);
657 if ( roundBits
) STATUS(float_exception_flags
) |= float_flag_inexact
;
658 zSig0
+= roundIncrement
;
659 if ( zSig0
< roundIncrement
) {
661 zSig0
= LIT64( 0x8000000000000000 );
663 roundIncrement
= roundMask
+ 1;
664 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
665 roundMask
|= roundIncrement
;
667 zSig0
&= ~ roundMask
;
668 if ( zSig0
== 0 ) zExp
= 0;
669 return packFloatx80( zSign
, zExp
, zSig0
);
671 increment
= ( (sbits64
) zSig1
< 0 );
672 if ( ! roundNearestEven
) {
673 if ( roundingMode
== float_round_to_zero
) {
678 increment
= ( roundingMode
== float_round_down
) && zSig1
;
681 increment
= ( roundingMode
== float_round_up
) && zSig1
;
685 if ( 0x7FFD <= (bits32
) ( zExp
- 1 ) ) {
686 if ( ( 0x7FFE < zExp
)
687 || ( ( zExp
== 0x7FFE )
688 && ( zSig0
== LIT64( 0xFFFFFFFFFFFFFFFF ) )
694 float_raise( float_flag_overflow
| float_flag_inexact STATUS_VAR
);
695 if ( ( roundingMode
== float_round_to_zero
)
696 || ( zSign
&& ( roundingMode
== float_round_up
) )
697 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
699 return packFloatx80( zSign
, 0x7FFE, ~ roundMask
);
701 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
705 ( STATUS(float_detect_tininess
) == float_tininess_before_rounding
)
708 || ( zSig0
< LIT64( 0xFFFFFFFFFFFFFFFF ) );
709 shift64ExtraRightJamming( zSig0
, zSig1
, 1 - zExp
, &zSig0
, &zSig1
);
711 if ( isTiny
&& zSig1
) float_raise( float_flag_underflow STATUS_VAR
);
712 if ( zSig1
) STATUS(float_exception_flags
) |= float_flag_inexact
;
713 if ( roundNearestEven
) {
714 increment
= ( (sbits64
) zSig1
< 0 );
718 increment
= ( roundingMode
== float_round_down
) && zSig1
;
721 increment
= ( roundingMode
== float_round_up
) && zSig1
;
727 ~ ( ( (bits64
) ( zSig1
<<1 ) == 0 ) & roundNearestEven
);
728 if ( (sbits64
) zSig0
< 0 ) zExp
= 1;
730 return packFloatx80( zSign
, zExp
, zSig0
);
733 if ( zSig1
) STATUS(float_exception_flags
) |= float_flag_inexact
;
738 zSig0
= LIT64( 0x8000000000000000 );
741 zSig0
&= ~ ( ( (bits64
) ( zSig1
<<1 ) == 0 ) & roundNearestEven
);
745 if ( zSig0
== 0 ) zExp
= 0;
747 return packFloatx80( zSign
, zExp
, zSig0
);
751 /*----------------------------------------------------------------------------
752 | Takes an abstract floating-point value having sign `zSign', exponent
753 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
754 | and returns the proper extended double-precision floating-point value
755 | corresponding to the abstract input. This routine is just like
756 | `roundAndPackFloatx80' except that the input significand does not have to be
758 *----------------------------------------------------------------------------*/
761 normalizeRoundAndPackFloatx80(
762 int8 roundingPrecision
, flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1
772 shiftCount
= countLeadingZeros64( zSig0
);
773 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
776 roundAndPackFloatx80( roundingPrecision
, zSign
, zExp
, zSig0
, zSig1 STATUS_VAR
);
784 /*----------------------------------------------------------------------------
785 | Returns the least-significant 64 fraction bits of the quadruple-precision
786 | floating-point value `a'.
787 *----------------------------------------------------------------------------*/
789 INLINE bits64
extractFloat128Frac1( float128 a
)
796 /*----------------------------------------------------------------------------
797 | Returns the most-significant 48 fraction bits of the quadruple-precision
798 | floating-point value `a'.
799 *----------------------------------------------------------------------------*/
801 INLINE bits64
extractFloat128Frac0( float128 a
)
804 return a
.high
& LIT64( 0x0000FFFFFFFFFFFF );
808 /*----------------------------------------------------------------------------
809 | Returns the exponent bits of the quadruple-precision floating-point value
811 *----------------------------------------------------------------------------*/
813 INLINE int32
extractFloat128Exp( float128 a
)
816 return ( a
.high
>>48 ) & 0x7FFF;
820 /*----------------------------------------------------------------------------
821 | Returns the sign bit of the quadruple-precision floating-point value `a'.
822 *----------------------------------------------------------------------------*/
824 INLINE flag
extractFloat128Sign( float128 a
)
831 /*----------------------------------------------------------------------------
832 | Normalizes the subnormal quadruple-precision floating-point value
833 | represented by the denormalized significand formed by the concatenation of
834 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
835 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
836 | significand are stored at the location pointed to by `zSig0Ptr', and the
837 | least significant 64 bits of the normalized significand are stored at the
838 | location pointed to by `zSig1Ptr'.
839 *----------------------------------------------------------------------------*/
842 normalizeFloat128Subnormal(
853 shiftCount
= countLeadingZeros64( aSig1
) - 15;
854 if ( shiftCount
< 0 ) {
855 *zSig0Ptr
= aSig1
>>( - shiftCount
);
856 *zSig1Ptr
= aSig1
<<( shiftCount
& 63 );
859 *zSig0Ptr
= aSig1
<<shiftCount
;
862 *zExpPtr
= - shiftCount
- 63;
865 shiftCount
= countLeadingZeros64( aSig0
) - 15;
866 shortShift128Left( aSig0
, aSig1
, shiftCount
, zSig0Ptr
, zSig1Ptr
);
867 *zExpPtr
= 1 - shiftCount
;
872 /*----------------------------------------------------------------------------
873 | Packs the sign `zSign', the exponent `zExp', and the significand formed
874 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
875 | floating-point value, returning the result. After being shifted into the
876 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
877 | added together to form the most significant 32 bits of the result. This
878 | means that any integer portion of `zSig0' will be added into the exponent.
879 | Since a properly normalized significand will have an integer portion equal
880 | to 1, the `zExp' input should be 1 less than the desired result exponent
881 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
883 *----------------------------------------------------------------------------*/
886 packFloat128( flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1
)
891 z
.high
= ( ( (bits64
) zSign
)<<63 ) + ( ( (bits64
) zExp
)<<48 ) + zSig0
;
896 /*----------------------------------------------------------------------------
897 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
898 | and extended significand formed by the concatenation of `zSig0', `zSig1',
899 | and `zSig2', and returns the proper quadruple-precision floating-point value
900 | corresponding to the abstract input. Ordinarily, the abstract value is
901 | simply rounded and packed into the quadruple-precision format, with the
902 | inexact exception raised if the abstract input cannot be represented
903 | exactly. However, if the abstract value is too large, the overflow and
904 | inexact exceptions are raised and an infinity or maximal finite value is
905 | returned. If the abstract value is too small, the input value is rounded to
906 | a subnormal number, and the underflow and inexact exceptions are raised if
907 | the abstract input cannot be represented exactly as a subnormal quadruple-
908 | precision floating-point number.
909 | The input significand must be normalized or smaller. If the input
910 | significand is not normalized, `zExp' must be 0; in that case, the result
911 | returned is a subnormal number, and it must not require rounding. In the
912 | usual case that the input significand is normalized, `zExp' must be 1 less
913 | than the ``true'' floating-point exponent. The handling of underflow and
914 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
915 *----------------------------------------------------------------------------*/
918 roundAndPackFloat128(
919 flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1
, bits64 zSig2 STATUS_PARAM
)
922 flag roundNearestEven
, increment
, isTiny
;
924 roundingMode
= STATUS(float_rounding_mode
);
925 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
926 increment
= ( (sbits64
) zSig2
< 0 );
927 if ( ! roundNearestEven
) {
928 if ( roundingMode
== float_round_to_zero
) {
933 increment
= ( roundingMode
== float_round_down
) && zSig2
;
936 increment
= ( roundingMode
== float_round_up
) && zSig2
;
940 if ( 0x7FFD <= (bits32
) zExp
) {
941 if ( ( 0x7FFD < zExp
)
942 || ( ( zExp
== 0x7FFD )
944 LIT64( 0x0001FFFFFFFFFFFF ),
945 LIT64( 0xFFFFFFFFFFFFFFFF ),
952 float_raise( float_flag_overflow
| float_flag_inexact STATUS_VAR
);
953 if ( ( roundingMode
== float_round_to_zero
)
954 || ( zSign
&& ( roundingMode
== float_round_up
) )
955 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
961 LIT64( 0x0000FFFFFFFFFFFF ),
962 LIT64( 0xFFFFFFFFFFFFFFFF )
965 return packFloat128( zSign
, 0x7FFF, 0, 0 );
969 ( STATUS(float_detect_tininess
) == float_tininess_before_rounding
)
975 LIT64( 0x0001FFFFFFFFFFFF ),
976 LIT64( 0xFFFFFFFFFFFFFFFF )
978 shift128ExtraRightJamming(
979 zSig0
, zSig1
, zSig2
, - zExp
, &zSig0
, &zSig1
, &zSig2
);
981 if ( isTiny
&& zSig2
) float_raise( float_flag_underflow STATUS_VAR
);
982 if ( roundNearestEven
) {
983 increment
= ( (sbits64
) zSig2
< 0 );
987 increment
= ( roundingMode
== float_round_down
) && zSig2
;
990 increment
= ( roundingMode
== float_round_up
) && zSig2
;
995 if ( zSig2
) STATUS(float_exception_flags
) |= float_flag_inexact
;
997 add128( zSig0
, zSig1
, 0, 1, &zSig0
, &zSig1
);
998 zSig1
&= ~ ( ( zSig2
+ zSig2
== 0 ) & roundNearestEven
);
1001 if ( ( zSig0
| zSig1
) == 0 ) zExp
= 0;
1003 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
1007 /*----------------------------------------------------------------------------
1008 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1009 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1010 | returns the proper quadruple-precision floating-point value corresponding
1011 | to the abstract input. This routine is just like `roundAndPackFloat128'
1012 | except that the input significand has fewer bits and does not have to be
1013 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1015 *----------------------------------------------------------------------------*/
1018 normalizeRoundAndPackFloat128(
1019 flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1 STATUS_PARAM
)
1029 shiftCount
= countLeadingZeros64( zSig0
) - 15;
1030 if ( 0 <= shiftCount
) {
1032 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
1035 shift128ExtraRightJamming(
1036 zSig0
, zSig1
, 0, - shiftCount
, &zSig0
, &zSig1
, &zSig2
);
1039 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2 STATUS_VAR
);
1045 /*----------------------------------------------------------------------------
1046 | Returns the result of converting the 32-bit two's complement integer `a'
1047 | to the single-precision floating-point format. The conversion is performed
1048 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1049 *----------------------------------------------------------------------------*/
1051 float32
int32_to_float32( int32 a STATUS_PARAM
)
1055 if ( a
== 0 ) return float32_zero
;
1056 if ( a
== (sbits32
) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1058 return normalizeRoundAndPackFloat32( zSign
, 0x9C, zSign
? - a
: a STATUS_VAR
);
1062 /*----------------------------------------------------------------------------
1063 | Returns the result of converting the 32-bit two's complement integer `a'
1064 | to the double-precision floating-point format. The conversion is performed
1065 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1066 *----------------------------------------------------------------------------*/
1068 float64
int32_to_float64( int32 a STATUS_PARAM
)
1075 if ( a
== 0 ) return float64_zero
;
1077 absA
= zSign
? - a
: a
;
1078 shiftCount
= countLeadingZeros32( absA
) + 21;
1080 return packFloat64( zSign
, 0x432 - shiftCount
, zSig
<<shiftCount
);
1086 /*----------------------------------------------------------------------------
1087 | Returns the result of converting the 32-bit two's complement integer `a'
1088 | to the extended double-precision floating-point format. The conversion
1089 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1091 *----------------------------------------------------------------------------*/
1093 floatx80
int32_to_floatx80( int32 a STATUS_PARAM
)
1100 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
1102 absA
= zSign
? - a
: a
;
1103 shiftCount
= countLeadingZeros32( absA
) + 32;
1105 return packFloatx80( zSign
, 0x403E - shiftCount
, zSig
<<shiftCount
);
1113 /*----------------------------------------------------------------------------
1114 | Returns the result of converting the 32-bit two's complement integer `a' to
1115 | the quadruple-precision floating-point format. The conversion is performed
1116 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1117 *----------------------------------------------------------------------------*/
1119 float128
int32_to_float128( int32 a STATUS_PARAM
)
1126 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
1128 absA
= zSign
? - a
: a
;
1129 shiftCount
= countLeadingZeros32( absA
) + 17;
1131 return packFloat128( zSign
, 0x402E - shiftCount
, zSig0
<<shiftCount
, 0 );
1137 /*----------------------------------------------------------------------------
1138 | Returns the result of converting the 64-bit two's complement integer `a'
1139 | to the single-precision floating-point format. The conversion is performed
1140 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1141 *----------------------------------------------------------------------------*/
1143 float32
int64_to_float32( int64 a STATUS_PARAM
)
1149 if ( a
== 0 ) return float32_zero
;
1151 absA
= zSign
? - a
: a
;
1152 shiftCount
= countLeadingZeros64( absA
) - 40;
1153 if ( 0 <= shiftCount
) {
1154 return packFloat32( zSign
, 0x95 - shiftCount
, absA
<<shiftCount
);
1158 if ( shiftCount
< 0 ) {
1159 shift64RightJamming( absA
, - shiftCount
, &absA
);
1162 absA
<<= shiftCount
;
1164 return roundAndPackFloat32( zSign
, 0x9C - shiftCount
, absA STATUS_VAR
);
1169 float32
uint64_to_float32( uint64 a STATUS_PARAM
)
1173 if ( a
== 0 ) return float32_zero
;
1174 shiftCount
= countLeadingZeros64( a
) - 40;
1175 if ( 0 <= shiftCount
) {
1176 return packFloat32( 1 > 0, 0x95 - shiftCount
, a
<<shiftCount
);
1180 if ( shiftCount
< 0 ) {
1181 shift64RightJamming( a
, - shiftCount
, &a
);
1186 return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount
, a STATUS_VAR
);
1190 /*----------------------------------------------------------------------------
1191 | Returns the result of converting the 64-bit two's complement integer `a'
1192 | to the double-precision floating-point format. The conversion is performed
1193 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1194 *----------------------------------------------------------------------------*/
1196 float64
int64_to_float64( int64 a STATUS_PARAM
)
1200 if ( a
== 0 ) return float64_zero
;
1201 if ( a
== (sbits64
) LIT64( 0x8000000000000000 ) ) {
1202 return packFloat64( 1, 0x43E, 0 );
1205 return normalizeRoundAndPackFloat64( zSign
, 0x43C, zSign
? - a
: a STATUS_VAR
);
1209 float64
uint64_to_float64( uint64 a STATUS_PARAM
)
1211 if ( a
== 0 ) return float64_zero
;
1212 return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR
);
1218 /*----------------------------------------------------------------------------
1219 | Returns the result of converting the 64-bit two's complement integer `a'
1220 | to the extended double-precision floating-point format. The conversion
1221 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1223 *----------------------------------------------------------------------------*/
1225 floatx80
int64_to_floatx80( int64 a STATUS_PARAM
)
1231 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
1233 absA
= zSign
? - a
: a
;
1234 shiftCount
= countLeadingZeros64( absA
);
1235 return packFloatx80( zSign
, 0x403E - shiftCount
, absA
<<shiftCount
);
1243 /*----------------------------------------------------------------------------
1244 | Returns the result of converting the 64-bit two's complement integer `a' to
1245 | the quadruple-precision floating-point format. The conversion is performed
1246 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1247 *----------------------------------------------------------------------------*/
1249 float128
int64_to_float128( int64 a STATUS_PARAM
)
1255 bits64 zSig0
, zSig1
;
1257 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
1259 absA
= zSign
? - a
: a
;
1260 shiftCount
= countLeadingZeros64( absA
) + 49;
1261 zExp
= 0x406E - shiftCount
;
1262 if ( 64 <= shiftCount
) {
1271 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
1272 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
1278 /*----------------------------------------------------------------------------
1279 | Returns the result of converting the single-precision floating-point value
1280 | `a' to the 32-bit two's complement integer format. The conversion is
1281 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1282 | Arithmetic---which means in particular that the conversion is rounded
1283 | according to the current rounding mode. If `a' is a NaN, the largest
1284 | positive integer is returned. Otherwise, if the conversion overflows, the
1285 | largest integer with the same sign as `a' is returned.
1286 *----------------------------------------------------------------------------*/
1288 int32
float32_to_int32( float32 a STATUS_PARAM
)
1291 int16 aExp
, shiftCount
;
1295 aSig
= extractFloat32Frac( a
);
1296 aExp
= extractFloat32Exp( a
);
1297 aSign
= extractFloat32Sign( a
);
1298 if ( ( aExp
== 0xFF ) && aSig
) aSign
= 0;
1299 if ( aExp
) aSig
|= 0x00800000;
1300 shiftCount
= 0xAF - aExp
;
1303 if ( 0 < shiftCount
) shift64RightJamming( aSig64
, shiftCount
, &aSig64
);
1304 return roundAndPackInt32( aSign
, aSig64 STATUS_VAR
);
1308 /*----------------------------------------------------------------------------
1309 | Returns the result of converting the single-precision floating-point value
1310 | `a' to the 32-bit two's complement integer format. The conversion is
1311 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1312 | Arithmetic, except that the conversion is always rounded toward zero.
1313 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1314 | the conversion overflows, the largest integer with the same sign as `a' is
1316 *----------------------------------------------------------------------------*/
1318 int32
float32_to_int32_round_to_zero( float32 a STATUS_PARAM
)
1321 int16 aExp
, shiftCount
;
1325 aSig
= extractFloat32Frac( a
);
1326 aExp
= extractFloat32Exp( a
);
1327 aSign
= extractFloat32Sign( a
);
1328 shiftCount
= aExp
- 0x9E;
1329 if ( 0 <= shiftCount
) {
1330 if ( float32_val(a
) != 0xCF000000 ) {
1331 float_raise( float_flag_invalid STATUS_VAR
);
1332 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) return 0x7FFFFFFF;
1334 return (sbits32
) 0x80000000;
1336 else if ( aExp
<= 0x7E ) {
1337 if ( aExp
| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
1340 aSig
= ( aSig
| 0x00800000 )<<8;
1341 z
= aSig
>>( - shiftCount
);
1342 if ( (bits32
) ( aSig
<<( shiftCount
& 31 ) ) ) {
1343 STATUS(float_exception_flags
) |= float_flag_inexact
;
1345 if ( aSign
) z
= - z
;
1350 /*----------------------------------------------------------------------------
1351 | Returns the result of converting the single-precision floating-point value
1352 | `a' to the 64-bit two's complement integer format. The conversion is
1353 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1354 | Arithmetic---which means in particular that the conversion is rounded
1355 | according to the current rounding mode. If `a' is a NaN, the largest
1356 | positive integer is returned. Otherwise, if the conversion overflows, the
1357 | largest integer with the same sign as `a' is returned.
1358 *----------------------------------------------------------------------------*/
1360 int64
float32_to_int64( float32 a STATUS_PARAM
)
1363 int16 aExp
, shiftCount
;
1365 bits64 aSig64
, aSigExtra
;
1367 aSig
= extractFloat32Frac( a
);
1368 aExp
= extractFloat32Exp( a
);
1369 aSign
= extractFloat32Sign( a
);
1370 shiftCount
= 0xBE - aExp
;
1371 if ( shiftCount
< 0 ) {
1372 float_raise( float_flag_invalid STATUS_VAR
);
1373 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) {
1374 return LIT64( 0x7FFFFFFFFFFFFFFF );
1376 return (sbits64
) LIT64( 0x8000000000000000 );
1378 if ( aExp
) aSig
|= 0x00800000;
1381 shift64ExtraRightJamming( aSig64
, 0, shiftCount
, &aSig64
, &aSigExtra
);
1382 return roundAndPackInt64( aSign
, aSig64
, aSigExtra STATUS_VAR
);
1386 /*----------------------------------------------------------------------------
1387 | Returns the result of converting the single-precision floating-point value
1388 | `a' to the 64-bit two's complement integer format. The conversion is
1389 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1390 | Arithmetic, except that the conversion is always rounded toward zero. If
1391 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1392 | conversion overflows, the largest integer with the same sign as `a' is
1394 *----------------------------------------------------------------------------*/
1396 int64
float32_to_int64_round_to_zero( float32 a STATUS_PARAM
)
1399 int16 aExp
, shiftCount
;
1404 aSig
= extractFloat32Frac( a
);
1405 aExp
= extractFloat32Exp( a
);
1406 aSign
= extractFloat32Sign( a
);
1407 shiftCount
= aExp
- 0xBE;
1408 if ( 0 <= shiftCount
) {
1409 if ( float32_val(a
) != 0xDF000000 ) {
1410 float_raise( float_flag_invalid STATUS_VAR
);
1411 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) {
1412 return LIT64( 0x7FFFFFFFFFFFFFFF );
1415 return (sbits64
) LIT64( 0x8000000000000000 );
1417 else if ( aExp
<= 0x7E ) {
1418 if ( aExp
| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
1421 aSig64
= aSig
| 0x00800000;
1423 z
= aSig64
>>( - shiftCount
);
1424 if ( (bits64
) ( aSig64
<<( shiftCount
& 63 ) ) ) {
1425 STATUS(float_exception_flags
) |= float_flag_inexact
;
1427 if ( aSign
) z
= - z
;
1432 /*----------------------------------------------------------------------------
1433 | Returns the result of converting the single-precision floating-point value
1434 | `a' to the double-precision floating-point format. The conversion is
1435 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1437 *----------------------------------------------------------------------------*/
1439 float64
float32_to_float64( float32 a STATUS_PARAM
)
1445 aSig
= extractFloat32Frac( a
);
1446 aExp
= extractFloat32Exp( a
);
1447 aSign
= extractFloat32Sign( a
);
1448 if ( aExp
== 0xFF ) {
1449 if ( aSig
) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR
));
1450 return packFloat64( aSign
, 0x7FF, 0 );
1453 if ( aSig
== 0 ) return packFloat64( aSign
, 0, 0 );
1454 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1457 return packFloat64( aSign
, aExp
+ 0x380, ( (bits64
) aSig
)<<29 );
1463 /*----------------------------------------------------------------------------
1464 | Returns the result of converting the single-precision floating-point value
1465 | `a' to the extended double-precision floating-point format. The conversion
1466 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1468 *----------------------------------------------------------------------------*/
1470 floatx80
float32_to_floatx80( float32 a STATUS_PARAM
)
1476 aSig
= extractFloat32Frac( a
);
1477 aExp
= extractFloat32Exp( a
);
1478 aSign
= extractFloat32Sign( a
);
1479 if ( aExp
== 0xFF ) {
1480 if ( aSig
) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR
) );
1481 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
1484 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
1485 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1488 return packFloatx80( aSign
, aExp
+ 0x3F80, ( (bits64
) aSig
)<<40 );
1496 /*----------------------------------------------------------------------------
1497 | Returns the result of converting the single-precision floating-point value
1498 | `a' to the double-precision floating-point format. The conversion is
1499 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1501 *----------------------------------------------------------------------------*/
1503 float128
float32_to_float128( float32 a STATUS_PARAM
)
1509 aSig
= extractFloat32Frac( a
);
1510 aExp
= extractFloat32Exp( a
);
1511 aSign
= extractFloat32Sign( a
);
1512 if ( aExp
== 0xFF ) {
1513 if ( aSig
) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR
) );
1514 return packFloat128( aSign
, 0x7FFF, 0, 0 );
1517 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
1518 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1521 return packFloat128( aSign
, aExp
+ 0x3F80, ( (bits64
) aSig
)<<25, 0 );
1527 /*----------------------------------------------------------------------------
1528 | Rounds the single-precision floating-point value `a' to an integer, and
1529 | returns the result as a single-precision floating-point value. The
1530 | operation is performed according to the IEC/IEEE Standard for Binary
1531 | Floating-Point Arithmetic.
1532 *----------------------------------------------------------------------------*/
1534 float32
float32_round_to_int( float32 a STATUS_PARAM
)
1538 bits32 lastBitMask
, roundBitsMask
;
1542 aExp
= extractFloat32Exp( a
);
1543 if ( 0x96 <= aExp
) {
1544 if ( ( aExp
== 0xFF ) && extractFloat32Frac( a
) ) {
1545 return propagateFloat32NaN( a
, a STATUS_VAR
);
1549 if ( aExp
<= 0x7E ) {
1550 if ( (bits32
) ( float32_val(a
)<<1 ) == 0 ) return a
;
1551 STATUS(float_exception_flags
) |= float_flag_inexact
;
1552 aSign
= extractFloat32Sign( a
);
1553 switch ( STATUS(float_rounding_mode
) ) {
1554 case float_round_nearest_even
:
1555 if ( ( aExp
== 0x7E ) && extractFloat32Frac( a
) ) {
1556 return packFloat32( aSign
, 0x7F, 0 );
1559 case float_round_down
:
1560 return make_float32(aSign
? 0xBF800000 : 0);
1561 case float_round_up
:
1562 return make_float32(aSign
? 0x80000000 : 0x3F800000);
1564 return packFloat32( aSign
, 0, 0 );
1567 lastBitMask
<<= 0x96 - aExp
;
1568 roundBitsMask
= lastBitMask
- 1;
1570 roundingMode
= STATUS(float_rounding_mode
);
1571 if ( roundingMode
== float_round_nearest_even
) {
1572 z
+= lastBitMask
>>1;
1573 if ( ( z
& roundBitsMask
) == 0 ) z
&= ~ lastBitMask
;
1575 else if ( roundingMode
!= float_round_to_zero
) {
1576 if ( extractFloat32Sign( make_float32(z
) ) ^ ( roundingMode
== float_round_up
) ) {
1580 z
&= ~ roundBitsMask
;
1581 if ( z
!= float32_val(a
) ) STATUS(float_exception_flags
) |= float_flag_inexact
;
1582 return make_float32(z
);
1586 /*----------------------------------------------------------------------------
1587 | Returns the result of adding the absolute values of the single-precision
1588 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1589 | before being returned. `zSign' is ignored if the result is a NaN.
1590 | The addition is performed according to the IEC/IEEE Standard for Binary
1591 | Floating-Point Arithmetic.
1592 *----------------------------------------------------------------------------*/
1594 static float32
addFloat32Sigs( float32 a
, float32 b
, flag zSign STATUS_PARAM
)
1596 int16 aExp
, bExp
, zExp
;
1597 bits32 aSig
, bSig
, zSig
;
1600 aSig
= extractFloat32Frac( a
);
1601 aExp
= extractFloat32Exp( a
);
1602 bSig
= extractFloat32Frac( b
);
1603 bExp
= extractFloat32Exp( b
);
1604 expDiff
= aExp
- bExp
;
1607 if ( 0 < expDiff
) {
1608 if ( aExp
== 0xFF ) {
1609 if ( aSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1618 shift32RightJamming( bSig
, expDiff
, &bSig
);
1621 else if ( expDiff
< 0 ) {
1622 if ( bExp
== 0xFF ) {
1623 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1624 return packFloat32( zSign
, 0xFF, 0 );
1632 shift32RightJamming( aSig
, - expDiff
, &aSig
);
1636 if ( aExp
== 0xFF ) {
1637 if ( aSig
| bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1640 if ( aExp
== 0 ) return packFloat32( zSign
, 0, ( aSig
+ bSig
)>>6 );
1641 zSig
= 0x40000000 + aSig
+ bSig
;
1646 zSig
= ( aSig
+ bSig
)<<1;
1648 if ( (sbits32
) zSig
< 0 ) {
1653 return roundAndPackFloat32( zSign
, zExp
, zSig STATUS_VAR
);
1657 /*----------------------------------------------------------------------------
1658 | Returns the result of subtracting the absolute values of the single-
1659 | precision floating-point values `a' and `b'. If `zSign' is 1, the
1660 | difference is negated before being returned. `zSign' is ignored if the
1661 | result is a NaN. The subtraction is performed according to the IEC/IEEE
1662 | Standard for Binary Floating-Point Arithmetic.
1663 *----------------------------------------------------------------------------*/
1665 static float32
subFloat32Sigs( float32 a
, float32 b
, flag zSign STATUS_PARAM
)
1667 int16 aExp
, bExp
, zExp
;
1668 bits32 aSig
, bSig
, zSig
;
1671 aSig
= extractFloat32Frac( a
);
1672 aExp
= extractFloat32Exp( a
);
1673 bSig
= extractFloat32Frac( b
);
1674 bExp
= extractFloat32Exp( b
);
1675 expDiff
= aExp
- bExp
;
1678 if ( 0 < expDiff
) goto aExpBigger
;
1679 if ( expDiff
< 0 ) goto bExpBigger
;
1680 if ( aExp
== 0xFF ) {
1681 if ( aSig
| bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1682 float_raise( float_flag_invalid STATUS_VAR
);
1683 return float32_default_nan
;
1689 if ( bSig
< aSig
) goto aBigger
;
1690 if ( aSig
< bSig
) goto bBigger
;
1691 return packFloat32( STATUS(float_rounding_mode
) == float_round_down
, 0, 0 );
1693 if ( bExp
== 0xFF ) {
1694 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1695 return packFloat32( zSign
^ 1, 0xFF, 0 );
1703 shift32RightJamming( aSig
, - expDiff
, &aSig
);
1709 goto normalizeRoundAndPack
;
1711 if ( aExp
== 0xFF ) {
1712 if ( aSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1721 shift32RightJamming( bSig
, expDiff
, &bSig
);
1726 normalizeRoundAndPack
:
1728 return normalizeRoundAndPackFloat32( zSign
, zExp
, zSig STATUS_VAR
);
1732 /*----------------------------------------------------------------------------
1733 | Returns the result of adding the single-precision floating-point values `a'
1734 | and `b'. The operation is performed according to the IEC/IEEE Standard for
1735 | Binary Floating-Point Arithmetic.
1736 *----------------------------------------------------------------------------*/
1738 float32
float32_add( float32 a
, float32 b STATUS_PARAM
)
1742 aSign
= extractFloat32Sign( a
);
1743 bSign
= extractFloat32Sign( b
);
1744 if ( aSign
== bSign
) {
1745 return addFloat32Sigs( a
, b
, aSign STATUS_VAR
);
1748 return subFloat32Sigs( a
, b
, aSign STATUS_VAR
);
1753 /*----------------------------------------------------------------------------
1754 | Returns the result of subtracting the single-precision floating-point values
1755 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1756 | for Binary Floating-Point Arithmetic.
1757 *----------------------------------------------------------------------------*/
1759 float32
float32_sub( float32 a
, float32 b STATUS_PARAM
)
1763 aSign
= extractFloat32Sign( a
);
1764 bSign
= extractFloat32Sign( b
);
1765 if ( aSign
== bSign
) {
1766 return subFloat32Sigs( a
, b
, aSign STATUS_VAR
);
1769 return addFloat32Sigs( a
, b
, aSign STATUS_VAR
);
1774 /*----------------------------------------------------------------------------
1775 | Returns the result of multiplying the single-precision floating-point values
1776 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1777 | for Binary Floating-Point Arithmetic.
1778 *----------------------------------------------------------------------------*/
1780 float32
float32_mul( float32 a
, float32 b STATUS_PARAM
)
1782 flag aSign
, bSign
, zSign
;
1783 int16 aExp
, bExp
, zExp
;
1788 aSig
= extractFloat32Frac( a
);
1789 aExp
= extractFloat32Exp( a
);
1790 aSign
= extractFloat32Sign( a
);
1791 bSig
= extractFloat32Frac( b
);
1792 bExp
= extractFloat32Exp( b
);
1793 bSign
= extractFloat32Sign( b
);
1794 zSign
= aSign
^ bSign
;
1795 if ( aExp
== 0xFF ) {
1796 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
1797 return propagateFloat32NaN( a
, b STATUS_VAR
);
1799 if ( ( bExp
| bSig
) == 0 ) {
1800 float_raise( float_flag_invalid STATUS_VAR
);
1801 return float32_default_nan
;
1803 return packFloat32( zSign
, 0xFF, 0 );
1805 if ( bExp
== 0xFF ) {
1806 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1807 if ( ( aExp
| aSig
) == 0 ) {
1808 float_raise( float_flag_invalid STATUS_VAR
);
1809 return float32_default_nan
;
1811 return packFloat32( zSign
, 0xFF, 0 );
1814 if ( aSig
== 0 ) return packFloat32( zSign
, 0, 0 );
1815 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1818 if ( bSig
== 0 ) return packFloat32( zSign
, 0, 0 );
1819 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
1821 zExp
= aExp
+ bExp
- 0x7F;
1822 aSig
= ( aSig
| 0x00800000 )<<7;
1823 bSig
= ( bSig
| 0x00800000 )<<8;
1824 shift64RightJamming( ( (bits64
) aSig
) * bSig
, 32, &zSig64
);
1826 if ( 0 <= (sbits32
) ( zSig
<<1 ) ) {
1830 return roundAndPackFloat32( zSign
, zExp
, zSig STATUS_VAR
);
1834 /*----------------------------------------------------------------------------
1835 | Returns the result of dividing the single-precision floating-point value `a'
1836 | by the corresponding value `b'. The operation is performed according to the
1837 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1838 *----------------------------------------------------------------------------*/
1840 float32
float32_div( float32 a
, float32 b STATUS_PARAM
)
1842 flag aSign
, bSign
, zSign
;
1843 int16 aExp
, bExp
, zExp
;
1844 bits32 aSig
, bSig
, zSig
;
1846 aSig
= extractFloat32Frac( a
);
1847 aExp
= extractFloat32Exp( a
);
1848 aSign
= extractFloat32Sign( a
);
1849 bSig
= extractFloat32Frac( b
);
1850 bExp
= extractFloat32Exp( b
);
1851 bSign
= extractFloat32Sign( b
);
1852 zSign
= aSign
^ bSign
;
1853 if ( aExp
== 0xFF ) {
1854 if ( aSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1855 if ( bExp
== 0xFF ) {
1856 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1857 float_raise( float_flag_invalid STATUS_VAR
);
1858 return float32_default_nan
;
1860 return packFloat32( zSign
, 0xFF, 0 );
1862 if ( bExp
== 0xFF ) {
1863 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1864 return packFloat32( zSign
, 0, 0 );
1868 if ( ( aExp
| aSig
) == 0 ) {
1869 float_raise( float_flag_invalid STATUS_VAR
);
1870 return float32_default_nan
;
1872 float_raise( float_flag_divbyzero STATUS_VAR
);
1873 return packFloat32( zSign
, 0xFF, 0 );
1875 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
1878 if ( aSig
== 0 ) return packFloat32( zSign
, 0, 0 );
1879 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1881 zExp
= aExp
- bExp
+ 0x7D;
1882 aSig
= ( aSig
| 0x00800000 )<<7;
1883 bSig
= ( bSig
| 0x00800000 )<<8;
1884 if ( bSig
<= ( aSig
+ aSig
) ) {
1888 zSig
= ( ( (bits64
) aSig
)<<32 ) / bSig
;
1889 if ( ( zSig
& 0x3F ) == 0 ) {
1890 zSig
|= ( (bits64
) bSig
* zSig
!= ( (bits64
) aSig
)<<32 );
1892 return roundAndPackFloat32( zSign
, zExp
, zSig STATUS_VAR
);
1896 /*----------------------------------------------------------------------------
1897 | Returns the remainder of the single-precision floating-point value `a'
1898 | with respect to the corresponding value `b'. The operation is performed
1899 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1900 *----------------------------------------------------------------------------*/
1902 float32
float32_rem( float32 a
, float32 b STATUS_PARAM
)
1904 flag aSign
, bSign
, zSign
;
1905 int16 aExp
, bExp
, expDiff
;
1908 bits64 aSig64
, bSig64
, q64
;
1909 bits32 alternateASig
;
1912 aSig
= extractFloat32Frac( a
);
1913 aExp
= extractFloat32Exp( a
);
1914 aSign
= extractFloat32Sign( a
);
1915 bSig
= extractFloat32Frac( b
);
1916 bExp
= extractFloat32Exp( b
);
1917 bSign
= extractFloat32Sign( b
);
1918 if ( aExp
== 0xFF ) {
1919 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
1920 return propagateFloat32NaN( a
, b STATUS_VAR
);
1922 float_raise( float_flag_invalid STATUS_VAR
);
1923 return float32_default_nan
;
1925 if ( bExp
== 0xFF ) {
1926 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1931 float_raise( float_flag_invalid STATUS_VAR
);
1932 return float32_default_nan
;
1934 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
1937 if ( aSig
== 0 ) return a
;
1938 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1940 expDiff
= aExp
- bExp
;
1943 if ( expDiff
< 32 ) {
1946 if ( expDiff
< 0 ) {
1947 if ( expDiff
< -1 ) return a
;
1950 q
= ( bSig
<= aSig
);
1951 if ( q
) aSig
-= bSig
;
1952 if ( 0 < expDiff
) {
1953 q
= ( ( (bits64
) aSig
)<<32 ) / bSig
;
1956 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
1964 if ( bSig
<= aSig
) aSig
-= bSig
;
1965 aSig64
= ( (bits64
) aSig
)<<40;
1966 bSig64
= ( (bits64
) bSig
)<<40;
1968 while ( 0 < expDiff
) {
1969 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
1970 q64
= ( 2 < q64
) ? q64
- 2 : 0;
1971 aSig64
= - ( ( bSig
* q64
)<<38 );
1975 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
1976 q64
= ( 2 < q64
) ? q64
- 2 : 0;
1977 q
= q64
>>( 64 - expDiff
);
1979 aSig
= ( ( aSig64
>>33 )<<( expDiff
- 1 ) ) - bSig
* q
;
1982 alternateASig
= aSig
;
1985 } while ( 0 <= (sbits32
) aSig
);
1986 sigMean
= aSig
+ alternateASig
;
1987 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
1988 aSig
= alternateASig
;
1990 zSign
= ( (sbits32
) aSig
< 0 );
1991 if ( zSign
) aSig
= - aSig
;
1992 return normalizeRoundAndPackFloat32( aSign
^ zSign
, bExp
, aSig STATUS_VAR
);
1996 /*----------------------------------------------------------------------------
1997 | Returns the square root of the single-precision floating-point value `a'.
1998 | The operation is performed according to the IEC/IEEE Standard for Binary
1999 | Floating-Point Arithmetic.
2000 *----------------------------------------------------------------------------*/
2002 float32
float32_sqrt( float32 a STATUS_PARAM
)
2009 aSig
= extractFloat32Frac( a
);
2010 aExp
= extractFloat32Exp( a
);
2011 aSign
= extractFloat32Sign( a
);
2012 if ( aExp
== 0xFF ) {
2013 if ( aSig
) return propagateFloat32NaN( a
, float32_zero STATUS_VAR
);
2014 if ( ! aSign
) return a
;
2015 float_raise( float_flag_invalid STATUS_VAR
);
2016 return float32_default_nan
;
2019 if ( ( aExp
| aSig
) == 0 ) return a
;
2020 float_raise( float_flag_invalid STATUS_VAR
);
2021 return float32_default_nan
;
2024 if ( aSig
== 0 ) return float32_zero
;
2025 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
2027 zExp
= ( ( aExp
- 0x7F )>>1 ) + 0x7E;
2028 aSig
= ( aSig
| 0x00800000 )<<8;
2029 zSig
= estimateSqrt32( aExp
, aSig
) + 2;
2030 if ( ( zSig
& 0x7F ) <= 5 ) {
2036 term
= ( (bits64
) zSig
) * zSig
;
2037 rem
= ( ( (bits64
) aSig
)<<32 ) - term
;
2038 while ( (sbits64
) rem
< 0 ) {
2040 rem
+= ( ( (bits64
) zSig
)<<1 ) | 1;
2042 zSig
|= ( rem
!= 0 );
2044 shift32RightJamming( zSig
, 1, &zSig
);
2046 return roundAndPackFloat32( 0, zExp
, zSig STATUS_VAR
);
2050 /*----------------------------------------------------------------------------
2051 | Returns 1 if the single-precision floating-point value `a' is equal to
2052 | the corresponding value `b', and 0 otherwise. The comparison is performed
2053 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2054 *----------------------------------------------------------------------------*/
2056 int float32_eq( float32 a
, float32 b STATUS_PARAM
)
2059 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2060 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2062 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
2063 float_raise( float_flag_invalid STATUS_VAR
);
2067 return ( float32_val(a
) == float32_val(b
) ) ||
2068 ( (bits32
) ( ( float32_val(a
) | float32_val(b
) )<<1 ) == 0 );
2072 /*----------------------------------------------------------------------------
2073 | Returns 1 if the single-precision floating-point value `a' is less than
2074 | or equal to the corresponding value `b', and 0 otherwise. The comparison
2075 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2077 *----------------------------------------------------------------------------*/
2079 int float32_le( float32 a
, float32 b STATUS_PARAM
)
2084 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2085 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2087 float_raise( float_flag_invalid STATUS_VAR
);
2090 aSign
= extractFloat32Sign( a
);
2091 bSign
= extractFloat32Sign( b
);
2092 av
= float32_val(a
);
2093 bv
= float32_val(b
);
2094 if ( aSign
!= bSign
) return aSign
|| ( (bits32
) ( ( av
| bv
)<<1 ) == 0 );
2095 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
2099 /*----------------------------------------------------------------------------
2100 | Returns 1 if the single-precision floating-point value `a' is less than
2101 | the corresponding value `b', and 0 otherwise. The comparison is performed
2102 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2103 *----------------------------------------------------------------------------*/
2105 int float32_lt( float32 a
, float32 b STATUS_PARAM
)
2110 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2111 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2113 float_raise( float_flag_invalid STATUS_VAR
);
2116 aSign
= extractFloat32Sign( a
);
2117 bSign
= extractFloat32Sign( b
);
2118 av
= float32_val(a
);
2119 bv
= float32_val(b
);
2120 if ( aSign
!= bSign
) return aSign
&& ( (bits32
) ( ( av
| bv
)<<1 ) != 0 );
2121 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
2125 /*----------------------------------------------------------------------------
2126 | Returns 1 if the single-precision floating-point value `a' is equal to
2127 | the corresponding value `b', and 0 otherwise. The invalid exception is
2128 | raised if either operand is a NaN. Otherwise, the comparison is performed
2129 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2130 *----------------------------------------------------------------------------*/
2132 int float32_eq_signaling( float32 a
, float32 b STATUS_PARAM
)
2136 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2137 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2139 float_raise( float_flag_invalid STATUS_VAR
);
2142 av
= float32_val(a
);
2143 bv
= float32_val(b
);
2144 return ( av
== bv
) || ( (bits32
) ( ( av
| bv
)<<1 ) == 0 );
2148 /*----------------------------------------------------------------------------
2149 | Returns 1 if the single-precision floating-point value `a' is less than or
2150 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2151 | cause an exception. Otherwise, the comparison is performed according to the
2152 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2153 *----------------------------------------------------------------------------*/
2155 int float32_le_quiet( float32 a
, float32 b STATUS_PARAM
)
2160 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2161 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2163 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
2164 float_raise( float_flag_invalid STATUS_VAR
);
2168 aSign
= extractFloat32Sign( a
);
2169 bSign
= extractFloat32Sign( b
);
2170 av
= float32_val(a
);
2171 bv
= float32_val(b
);
2172 if ( aSign
!= bSign
) return aSign
|| ( (bits32
) ( ( av
| bv
)<<1 ) == 0 );
2173 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
2177 /*----------------------------------------------------------------------------
2178 | Returns 1 if the single-precision floating-point value `a' is less than
2179 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2180 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
2181 | Standard for Binary Floating-Point Arithmetic.
2182 *----------------------------------------------------------------------------*/
2184 int float32_lt_quiet( float32 a
, float32 b STATUS_PARAM
)
2189 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2190 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2192 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
2193 float_raise( float_flag_invalid STATUS_VAR
);
2197 aSign
= extractFloat32Sign( a
);
2198 bSign
= extractFloat32Sign( b
);
2199 av
= float32_val(a
);
2200 bv
= float32_val(b
);
2201 if ( aSign
!= bSign
) return aSign
&& ( (bits32
) ( ( av
| bv
)<<1 ) != 0 );
2202 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
2206 /*----------------------------------------------------------------------------
2207 | Returns the result of converting the double-precision floating-point value
2208 | `a' to the 32-bit two's complement integer format. The conversion is
2209 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2210 | Arithmetic---which means in particular that the conversion is rounded
2211 | according to the current rounding mode. If `a' is a NaN, the largest
2212 | positive integer is returned. Otherwise, if the conversion overflows, the
2213 | largest integer with the same sign as `a' is returned.
2214 *----------------------------------------------------------------------------*/
2216 int32
float64_to_int32( float64 a STATUS_PARAM
)
2219 int16 aExp
, shiftCount
;
2222 aSig
= extractFloat64Frac( a
);
2223 aExp
= extractFloat64Exp( a
);
2224 aSign
= extractFloat64Sign( a
);
2225 if ( ( aExp
== 0x7FF ) && aSig
) aSign
= 0;
2226 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
2227 shiftCount
= 0x42C - aExp
;
2228 if ( 0 < shiftCount
) shift64RightJamming( aSig
, shiftCount
, &aSig
);
2229 return roundAndPackInt32( aSign
, aSig STATUS_VAR
);
2233 /*----------------------------------------------------------------------------
2234 | Returns the result of converting the double-precision floating-point value
2235 | `a' to the 32-bit two's complement integer format. The conversion is
2236 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2237 | Arithmetic, except that the conversion is always rounded toward zero.
2238 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2239 | the conversion overflows, the largest integer with the same sign as `a' is
2241 *----------------------------------------------------------------------------*/
2243 int32
float64_to_int32_round_to_zero( float64 a STATUS_PARAM
)
2246 int16 aExp
, shiftCount
;
2247 bits64 aSig
, savedASig
;
2250 aSig
= extractFloat64Frac( a
);
2251 aExp
= extractFloat64Exp( a
);
2252 aSign
= extractFloat64Sign( a
);
2253 if ( 0x41E < aExp
) {
2254 if ( ( aExp
== 0x7FF ) && aSig
) aSign
= 0;
2257 else if ( aExp
< 0x3FF ) {
2258 if ( aExp
|| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
2261 aSig
|= LIT64( 0x0010000000000000 );
2262 shiftCount
= 0x433 - aExp
;
2264 aSig
>>= shiftCount
;
2266 if ( aSign
) z
= - z
;
2267 if ( ( z
< 0 ) ^ aSign
) {
2269 float_raise( float_flag_invalid STATUS_VAR
);
2270 return aSign
? (sbits32
) 0x80000000 : 0x7FFFFFFF;
2272 if ( ( aSig
<<shiftCount
) != savedASig
) {
2273 STATUS(float_exception_flags
) |= float_flag_inexact
;
2279 /*----------------------------------------------------------------------------
2280 | Returns the result of converting the double-precision floating-point value
2281 | `a' to the 64-bit two's complement integer format. The conversion is
2282 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2283 | Arithmetic---which means in particular that the conversion is rounded
2284 | according to the current rounding mode. If `a' is a NaN, the largest
2285 | positive integer is returned. Otherwise, if the conversion overflows, the
2286 | largest integer with the same sign as `a' is returned.
2287 *----------------------------------------------------------------------------*/
2289 int64
float64_to_int64( float64 a STATUS_PARAM
)
2292 int16 aExp
, shiftCount
;
2293 bits64 aSig
, aSigExtra
;
2295 aSig
= extractFloat64Frac( a
);
2296 aExp
= extractFloat64Exp( a
);
2297 aSign
= extractFloat64Sign( a
);
2298 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
2299 shiftCount
= 0x433 - aExp
;
2300 if ( shiftCount
<= 0 ) {
2301 if ( 0x43E < aExp
) {
2302 float_raise( float_flag_invalid STATUS_VAR
);
2304 || ( ( aExp
== 0x7FF )
2305 && ( aSig
!= LIT64( 0x0010000000000000 ) ) )
2307 return LIT64( 0x7FFFFFFFFFFFFFFF );
2309 return (sbits64
) LIT64( 0x8000000000000000 );
2312 aSig
<<= - shiftCount
;
2315 shift64ExtraRightJamming( aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
2317 return roundAndPackInt64( aSign
, aSig
, aSigExtra STATUS_VAR
);
2321 /*----------------------------------------------------------------------------
2322 | Returns the result of converting the double-precision floating-point value
2323 | `a' to the 64-bit two's complement integer format. The conversion is
2324 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2325 | Arithmetic, except that the conversion is always rounded toward zero.
2326 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2327 | the conversion overflows, the largest integer with the same sign as `a' is
2329 *----------------------------------------------------------------------------*/
2331 int64
float64_to_int64_round_to_zero( float64 a STATUS_PARAM
)
2334 int16 aExp
, shiftCount
;
2338 aSig
= extractFloat64Frac( a
);
2339 aExp
= extractFloat64Exp( a
);
2340 aSign
= extractFloat64Sign( a
);
2341 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
2342 shiftCount
= aExp
- 0x433;
2343 if ( 0 <= shiftCount
) {
2344 if ( 0x43E <= aExp
) {
2345 if ( float64_val(a
) != LIT64( 0xC3E0000000000000 ) ) {
2346 float_raise( float_flag_invalid STATUS_VAR
);
2348 || ( ( aExp
== 0x7FF )
2349 && ( aSig
!= LIT64( 0x0010000000000000 ) ) )
2351 return LIT64( 0x7FFFFFFFFFFFFFFF );
2354 return (sbits64
) LIT64( 0x8000000000000000 );
2356 z
= aSig
<<shiftCount
;
2359 if ( aExp
< 0x3FE ) {
2360 if ( aExp
| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
2363 z
= aSig
>>( - shiftCount
);
2364 if ( (bits64
) ( aSig
<<( shiftCount
& 63 ) ) ) {
2365 STATUS(float_exception_flags
) |= float_flag_inexact
;
2368 if ( aSign
) z
= - z
;
2373 /*----------------------------------------------------------------------------
2374 | Returns the result of converting the double-precision floating-point value
2375 | `a' to the single-precision floating-point format. The conversion is
2376 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2378 *----------------------------------------------------------------------------*/
2380 float32
float64_to_float32( float64 a STATUS_PARAM
)
2387 aSig
= extractFloat64Frac( a
);
2388 aExp
= extractFloat64Exp( a
);
2389 aSign
= extractFloat64Sign( a
);
2390 if ( aExp
== 0x7FF ) {
2391 if ( aSig
) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR
) );
2392 return packFloat32( aSign
, 0xFF, 0 );
2394 shift64RightJamming( aSig
, 22, &aSig
);
2396 if ( aExp
|| zSig
) {
2400 return roundAndPackFloat32( aSign
, aExp
, zSig STATUS_VAR
);
2406 /*----------------------------------------------------------------------------
2407 | Returns the result of converting the double-precision floating-point value
2408 | `a' to the extended double-precision floating-point format. The conversion
2409 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2411 *----------------------------------------------------------------------------*/
2413 floatx80
float64_to_floatx80( float64 a STATUS_PARAM
)
2419 aSig
= extractFloat64Frac( a
);
2420 aExp
= extractFloat64Exp( a
);
2421 aSign
= extractFloat64Sign( a
);
2422 if ( aExp
== 0x7FF ) {
2423 if ( aSig
) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR
) );
2424 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
2427 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
2428 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2432 aSign
, aExp
+ 0x3C00, ( aSig
| LIT64( 0x0010000000000000 ) )<<11 );
2440 /*----------------------------------------------------------------------------
2441 | Returns the result of converting the double-precision floating-point value
2442 | `a' to the quadruple-precision floating-point format. The conversion is
2443 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2445 *----------------------------------------------------------------------------*/
2447 float128
float64_to_float128( float64 a STATUS_PARAM
)
2451 bits64 aSig
, zSig0
, zSig1
;
2453 aSig
= extractFloat64Frac( a
);
2454 aExp
= extractFloat64Exp( a
);
2455 aSign
= extractFloat64Sign( a
);
2456 if ( aExp
== 0x7FF ) {
2457 if ( aSig
) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR
) );
2458 return packFloat128( aSign
, 0x7FFF, 0, 0 );
2461 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
2462 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2465 shift128Right( aSig
, 0, 4, &zSig0
, &zSig1
);
2466 return packFloat128( aSign
, aExp
+ 0x3C00, zSig0
, zSig1
);
2472 /*----------------------------------------------------------------------------
2473 | Rounds the double-precision floating-point value `a' to an integer, and
2474 | returns the result as a double-precision floating-point value. The
2475 | operation is performed according to the IEC/IEEE Standard for Binary
2476 | Floating-Point Arithmetic.
2477 *----------------------------------------------------------------------------*/
2479 float64
float64_round_to_int( float64 a STATUS_PARAM
)
2483 bits64 lastBitMask
, roundBitsMask
;
2487 aExp
= extractFloat64Exp( a
);
2488 if ( 0x433 <= aExp
) {
2489 if ( ( aExp
== 0x7FF ) && extractFloat64Frac( a
) ) {
2490 return propagateFloat64NaN( a
, a STATUS_VAR
);
2494 if ( aExp
< 0x3FF ) {
2495 if ( (bits64
) ( float64_val(a
)<<1 ) == 0 ) return a
;
2496 STATUS(float_exception_flags
) |= float_flag_inexact
;
2497 aSign
= extractFloat64Sign( a
);
2498 switch ( STATUS(float_rounding_mode
) ) {
2499 case float_round_nearest_even
:
2500 if ( ( aExp
== 0x3FE ) && extractFloat64Frac( a
) ) {
2501 return packFloat64( aSign
, 0x3FF, 0 );
2504 case float_round_down
:
2505 return make_float64(aSign
? LIT64( 0xBFF0000000000000 ) : 0);
2506 case float_round_up
:
2507 return make_float64(
2508 aSign
? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
2510 return packFloat64( aSign
, 0, 0 );
2513 lastBitMask
<<= 0x433 - aExp
;
2514 roundBitsMask
= lastBitMask
- 1;
2516 roundingMode
= STATUS(float_rounding_mode
);
2517 if ( roundingMode
== float_round_nearest_even
) {
2518 z
+= lastBitMask
>>1;
2519 if ( ( z
& roundBitsMask
) == 0 ) z
&= ~ lastBitMask
;
2521 else if ( roundingMode
!= float_round_to_zero
) {
2522 if ( extractFloat64Sign( make_float64(z
) ) ^ ( roundingMode
== float_round_up
) ) {
2526 z
&= ~ roundBitsMask
;
2527 if ( z
!= float64_val(a
) )
2528 STATUS(float_exception_flags
) |= float_flag_inexact
;
2529 return make_float64(z
);
2533 float64
float64_trunc_to_int( float64 a STATUS_PARAM
)
2537 oldmode
= STATUS(float_rounding_mode
);
2538 STATUS(float_rounding_mode
) = float_round_to_zero
;
2539 res
= float64_round_to_int(a STATUS_VAR
);
2540 STATUS(float_rounding_mode
) = oldmode
;
2544 /*----------------------------------------------------------------------------
2545 | Returns the result of adding the absolute values of the double-precision
2546 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
2547 | before being returned. `zSign' is ignored if the result is a NaN.
2548 | The addition is performed according to the IEC/IEEE Standard for Binary
2549 | Floating-Point Arithmetic.
2550 *----------------------------------------------------------------------------*/
2552 static float64
addFloat64Sigs( float64 a
, float64 b
, flag zSign STATUS_PARAM
)
2554 int16 aExp
, bExp
, zExp
;
2555 bits64 aSig
, bSig
, zSig
;
2558 aSig
= extractFloat64Frac( a
);
2559 aExp
= extractFloat64Exp( a
);
2560 bSig
= extractFloat64Frac( b
);
2561 bExp
= extractFloat64Exp( b
);
2562 expDiff
= aExp
- bExp
;
2565 if ( 0 < expDiff
) {
2566 if ( aExp
== 0x7FF ) {
2567 if ( aSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
2574 bSig
|= LIT64( 0x2000000000000000 );
2576 shift64RightJamming( bSig
, expDiff
, &bSig
);
2579 else if ( expDiff
< 0 ) {
2580 if ( bExp
== 0x7FF ) {
2581 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
2582 return packFloat64( zSign
, 0x7FF, 0 );
2588 aSig
|= LIT64( 0x2000000000000000 );
2590 shift64RightJamming( aSig
, - expDiff
, &aSig
);
2594 if ( aExp
== 0x7FF ) {
2595 if ( aSig
| bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
2598 if ( aExp
== 0 ) return packFloat64( zSign
, 0, ( aSig
+ bSig
)>>9 );
2599 zSig
= LIT64( 0x4000000000000000 ) + aSig
+ bSig
;
2603 aSig
|= LIT64( 0x2000000000000000 );
2604 zSig
= ( aSig
+ bSig
)<<1;
2606 if ( (sbits64
) zSig
< 0 ) {
2611 return roundAndPackFloat64( zSign
, zExp
, zSig STATUS_VAR
);
2615 /*----------------------------------------------------------------------------
2616 | Returns the result of subtracting the absolute values of the double-
2617 | precision floating-point values `a' and `b'. If `zSign' is 1, the
2618 | difference is negated before being returned. `zSign' is ignored if the
2619 | result is a NaN. The subtraction is performed according to the IEC/IEEE
2620 | Standard for Binary Floating-Point Arithmetic.
2621 *----------------------------------------------------------------------------*/
2623 static float64
subFloat64Sigs( float64 a
, float64 b
, flag zSign STATUS_PARAM
)
2625 int16 aExp
, bExp
, zExp
;
2626 bits64 aSig
, bSig
, zSig
;
2629 aSig
= extractFloat64Frac( a
);
2630 aExp
= extractFloat64Exp( a
);
2631 bSig
= extractFloat64Frac( b
);
2632 bExp
= extractFloat64Exp( b
);
2633 expDiff
= aExp
- bExp
;
2636 if ( 0 < expDiff
) goto aExpBigger
;
2637 if ( expDiff
< 0 ) goto bExpBigger
;
2638 if ( aExp
== 0x7FF ) {
2639 if ( aSig
| bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
2640 float_raise( float_flag_invalid STATUS_VAR
);
2641 return float64_default_nan
;
2647 if ( bSig
< aSig
) goto aBigger
;
2648 if ( aSig
< bSig
) goto bBigger
;
2649 return packFloat64( STATUS(float_rounding_mode
) == float_round_down
, 0, 0 );
2651 if ( bExp
== 0x7FF ) {
2652 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
2653 return packFloat64( zSign
^ 1, 0x7FF, 0 );
2659 aSig
|= LIT64( 0x4000000000000000 );
2661 shift64RightJamming( aSig
, - expDiff
, &aSig
);
2662 bSig
|= LIT64( 0x4000000000000000 );
2667 goto normalizeRoundAndPack
;
2669 if ( aExp
== 0x7FF ) {
2670 if ( aSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
2677 bSig
|= LIT64( 0x4000000000000000 );
2679 shift64RightJamming( bSig
, expDiff
, &bSig
);
2680 aSig
|= LIT64( 0x4000000000000000 );
2684 normalizeRoundAndPack
:
2686 return normalizeRoundAndPackFloat64( zSign
, zExp
, zSig STATUS_VAR
);
2690 /*----------------------------------------------------------------------------
2691 | Returns the result of adding the double-precision floating-point values `a'
2692 | and `b'. The operation is performed according to the IEC/IEEE Standard for
2693 | Binary Floating-Point Arithmetic.
2694 *----------------------------------------------------------------------------*/
2696 float64
float64_add( float64 a
, float64 b STATUS_PARAM
)
2700 aSign
= extractFloat64Sign( a
);
2701 bSign
= extractFloat64Sign( b
);
2702 if ( aSign
== bSign
) {
2703 return addFloat64Sigs( a
, b
, aSign STATUS_VAR
);
2706 return subFloat64Sigs( a
, b
, aSign STATUS_VAR
);
2711 /*----------------------------------------------------------------------------
2712 | Returns the result of subtracting the double-precision floating-point values
2713 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2714 | for Binary Floating-Point Arithmetic.
2715 *----------------------------------------------------------------------------*/
2717 float64
float64_sub( float64 a
, float64 b STATUS_PARAM
)
2721 aSign
= extractFloat64Sign( a
);
2722 bSign
= extractFloat64Sign( b
);
2723 if ( aSign
== bSign
) {
2724 return subFloat64Sigs( a
, b
, aSign STATUS_VAR
);
2727 return addFloat64Sigs( a
, b
, aSign STATUS_VAR
);
2732 /*----------------------------------------------------------------------------
2733 | Returns the result of multiplying the double-precision floating-point values
2734 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2735 | for Binary Floating-Point Arithmetic.
2736 *----------------------------------------------------------------------------*/
2738 float64
float64_mul( float64 a
, float64 b STATUS_PARAM
)
2740 flag aSign
, bSign
, zSign
;
2741 int16 aExp
, bExp
, zExp
;
2742 bits64 aSig
, bSig
, zSig0
, zSig1
;
2744 aSig
= extractFloat64Frac( a
);
2745 aExp
= extractFloat64Exp( a
);
2746 aSign
= extractFloat64Sign( a
);
2747 bSig
= extractFloat64Frac( b
);
2748 bExp
= extractFloat64Exp( b
);
2749 bSign
= extractFloat64Sign( b
);
2750 zSign
= aSign
^ bSign
;
2751 if ( aExp
== 0x7FF ) {
2752 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
2753 return propagateFloat64NaN( a
, b STATUS_VAR
);
2755 if ( ( bExp
| bSig
) == 0 ) {
2756 float_raise( float_flag_invalid STATUS_VAR
);
2757 return float64_default_nan
;
2759 return packFloat64( zSign
, 0x7FF, 0 );
2761 if ( bExp
== 0x7FF ) {
2762 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
2763 if ( ( aExp
| aSig
) == 0 ) {
2764 float_raise( float_flag_invalid STATUS_VAR
);
2765 return float64_default_nan
;
2767 return packFloat64( zSign
, 0x7FF, 0 );
2770 if ( aSig
== 0 ) return packFloat64( zSign
, 0, 0 );
2771 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2774 if ( bSig
== 0 ) return packFloat64( zSign
, 0, 0 );
2775 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
2777 zExp
= aExp
+ bExp
- 0x3FF;
2778 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<10;
2779 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
2780 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
2781 zSig0
|= ( zSig1
!= 0 );
2782 if ( 0 <= (sbits64
) ( zSig0
<<1 ) ) {
2786 return roundAndPackFloat64( zSign
, zExp
, zSig0 STATUS_VAR
);
2790 /*----------------------------------------------------------------------------
2791 | Returns the result of dividing the double-precision floating-point value `a'
2792 | by the corresponding value `b'. The operation is performed according to
2793 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2794 *----------------------------------------------------------------------------*/
2796 float64
float64_div( float64 a
, float64 b STATUS_PARAM
)
2798 flag aSign
, bSign
, zSign
;
2799 int16 aExp
, bExp
, zExp
;
2800 bits64 aSig
, bSig
, zSig
;
2802 bits64 term0
, term1
;
2804 aSig
= extractFloat64Frac( a
);
2805 aExp
= extractFloat64Exp( a
);
2806 aSign
= extractFloat64Sign( a
);
2807 bSig
= extractFloat64Frac( b
);
2808 bExp
= extractFloat64Exp( b
);
2809 bSign
= extractFloat64Sign( b
);
2810 zSign
= aSign
^ bSign
;
2811 if ( aExp
== 0x7FF ) {
2812 if ( aSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
2813 if ( bExp
== 0x7FF ) {
2814 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
2815 float_raise( float_flag_invalid STATUS_VAR
);
2816 return float64_default_nan
;
2818 return packFloat64( zSign
, 0x7FF, 0 );
2820 if ( bExp
== 0x7FF ) {
2821 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
2822 return packFloat64( zSign
, 0, 0 );
2826 if ( ( aExp
| aSig
) == 0 ) {
2827 float_raise( float_flag_invalid STATUS_VAR
);
2828 return float64_default_nan
;
2830 float_raise( float_flag_divbyzero STATUS_VAR
);
2831 return packFloat64( zSign
, 0x7FF, 0 );
2833 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
2836 if ( aSig
== 0 ) return packFloat64( zSign
, 0, 0 );
2837 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2839 zExp
= aExp
- bExp
+ 0x3FD;
2840 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<10;
2841 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
2842 if ( bSig
<= ( aSig
+ aSig
) ) {
2846 zSig
= estimateDiv128To64( aSig
, 0, bSig
);
2847 if ( ( zSig
& 0x1FF ) <= 2 ) {
2848 mul64To128( bSig
, zSig
, &term0
, &term1
);
2849 sub128( aSig
, 0, term0
, term1
, &rem0
, &rem1
);
2850 while ( (sbits64
) rem0
< 0 ) {
2852 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
2854 zSig
|= ( rem1
!= 0 );
2856 return roundAndPackFloat64( zSign
, zExp
, zSig STATUS_VAR
);
2860 /*----------------------------------------------------------------------------
2861 | Returns the remainder of the double-precision floating-point value `a'
2862 | with respect to the corresponding value `b'. The operation is performed
2863 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2864 *----------------------------------------------------------------------------*/
2866 float64
float64_rem( float64 a
, float64 b STATUS_PARAM
)
2868 flag aSign
, bSign
, zSign
;
2869 int16 aExp
, bExp
, expDiff
;
2871 bits64 q
, alternateASig
;
2874 aSig
= extractFloat64Frac( a
);
2875 aExp
= extractFloat64Exp( a
);
2876 aSign
= extractFloat64Sign( a
);
2877 bSig
= extractFloat64Frac( b
);
2878 bExp
= extractFloat64Exp( b
);
2879 bSign
= extractFloat64Sign( b
);
2880 if ( aExp
== 0x7FF ) {
2881 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
2882 return propagateFloat64NaN( a
, b STATUS_VAR
);
2884 float_raise( float_flag_invalid STATUS_VAR
);
2885 return float64_default_nan
;
2887 if ( bExp
== 0x7FF ) {
2888 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
2893 float_raise( float_flag_invalid STATUS_VAR
);
2894 return float64_default_nan
;
2896 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
2899 if ( aSig
== 0 ) return a
;
2900 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2902 expDiff
= aExp
- bExp
;
2903 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<11;
2904 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
2905 if ( expDiff
< 0 ) {
2906 if ( expDiff
< -1 ) return a
;
2909 q
= ( bSig
<= aSig
);
2910 if ( q
) aSig
-= bSig
;
2912 while ( 0 < expDiff
) {
2913 q
= estimateDiv128To64( aSig
, 0, bSig
);
2914 q
= ( 2 < q
) ? q
- 2 : 0;
2915 aSig
= - ( ( bSig
>>2 ) * q
);
2919 if ( 0 < expDiff
) {
2920 q
= estimateDiv128To64( aSig
, 0, bSig
);
2921 q
= ( 2 < q
) ? q
- 2 : 0;
2924 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
2931 alternateASig
= aSig
;
2934 } while ( 0 <= (sbits64
) aSig
);
2935 sigMean
= aSig
+ alternateASig
;
2936 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
2937 aSig
= alternateASig
;
2939 zSign
= ( (sbits64
) aSig
< 0 );
2940 if ( zSign
) aSig
= - aSig
;
2941 return normalizeRoundAndPackFloat64( aSign
^ zSign
, bExp
, aSig STATUS_VAR
);
2945 /*----------------------------------------------------------------------------
2946 | Returns the square root of the double-precision floating-point value `a'.
2947 | The operation is performed according to the IEC/IEEE Standard for Binary
2948 | Floating-Point Arithmetic.
2949 *----------------------------------------------------------------------------*/
2951 float64
float64_sqrt( float64 a STATUS_PARAM
)
2955 bits64 aSig
, zSig
, doubleZSig
;
2956 bits64 rem0
, rem1
, term0
, term1
;
2958 aSig
= extractFloat64Frac( a
);
2959 aExp
= extractFloat64Exp( a
);
2960 aSign
= extractFloat64Sign( a
);
2961 if ( aExp
== 0x7FF ) {
2962 if ( aSig
) return propagateFloat64NaN( a
, a STATUS_VAR
);
2963 if ( ! aSign
) return a
;
2964 float_raise( float_flag_invalid STATUS_VAR
);
2965 return float64_default_nan
;
2968 if ( ( aExp
| aSig
) == 0 ) return a
;
2969 float_raise( float_flag_invalid STATUS_VAR
);
2970 return float64_default_nan
;
2973 if ( aSig
== 0 ) return float64_zero
;
2974 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2976 zExp
= ( ( aExp
- 0x3FF )>>1 ) + 0x3FE;
2977 aSig
|= LIT64( 0x0010000000000000 );
2978 zSig
= estimateSqrt32( aExp
, aSig
>>21 );
2979 aSig
<<= 9 - ( aExp
& 1 );
2980 zSig
= estimateDiv128To64( aSig
, 0, zSig
<<32 ) + ( zSig
<<30 );
2981 if ( ( zSig
& 0x1FF ) <= 5 ) {
2982 doubleZSig
= zSig
<<1;
2983 mul64To128( zSig
, zSig
, &term0
, &term1
);
2984 sub128( aSig
, 0, term0
, term1
, &rem0
, &rem1
);
2985 while ( (sbits64
) rem0
< 0 ) {
2988 add128( rem0
, rem1
, zSig
>>63, doubleZSig
| 1, &rem0
, &rem1
);
2990 zSig
|= ( ( rem0
| rem1
) != 0 );
2992 return roundAndPackFloat64( 0, zExp
, zSig STATUS_VAR
);
2996 /*----------------------------------------------------------------------------
2997 | Returns 1 if the double-precision floating-point value `a' is equal to the
2998 | corresponding value `b', and 0 otherwise. The comparison is performed
2999 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3000 *----------------------------------------------------------------------------*/
3002 int float64_eq( float64 a
, float64 b STATUS_PARAM
)
3006 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3007 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3009 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
3010 float_raise( float_flag_invalid STATUS_VAR
);
3014 av
= float64_val(a
);
3015 bv
= float64_val(b
);
3016 return ( av
== bv
) || ( (bits64
) ( ( av
| bv
)<<1 ) == 0 );
3020 /*----------------------------------------------------------------------------
3021 | Returns 1 if the double-precision floating-point value `a' is less than or
3022 | equal to the corresponding value `b', and 0 otherwise. The comparison is
3023 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3025 *----------------------------------------------------------------------------*/
3027 int float64_le( float64 a
, float64 b STATUS_PARAM
)
3032 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3033 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3035 float_raise( float_flag_invalid STATUS_VAR
);
3038 aSign
= extractFloat64Sign( a
);
3039 bSign
= extractFloat64Sign( b
);
3040 av
= float64_val(a
);
3041 bv
= float64_val(b
);
3042 if ( aSign
!= bSign
) return aSign
|| ( (bits64
) ( ( av
| bv
)<<1 ) == 0 );
3043 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
3047 /*----------------------------------------------------------------------------
3048 | Returns 1 if the double-precision floating-point value `a' is less than
3049 | the corresponding value `b', and 0 otherwise. The comparison is performed
3050 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3051 *----------------------------------------------------------------------------*/
3053 int float64_lt( float64 a
, float64 b STATUS_PARAM
)
3058 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3059 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3061 float_raise( float_flag_invalid STATUS_VAR
);
3064 aSign
= extractFloat64Sign( a
);
3065 bSign
= extractFloat64Sign( b
);
3066 av
= float64_val(a
);
3067 bv
= float64_val(b
);
3068 if ( aSign
!= bSign
) return aSign
&& ( (bits64
) ( ( av
| bv
)<<1 ) != 0 );
3069 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
3073 /*----------------------------------------------------------------------------
3074 | Returns 1 if the double-precision floating-point value `a' is equal to the
3075 | corresponding value `b', and 0 otherwise. The invalid exception is raised
3076 | if either operand is a NaN. Otherwise, the comparison is performed
3077 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3078 *----------------------------------------------------------------------------*/
3080 int float64_eq_signaling( float64 a
, float64 b STATUS_PARAM
)
3084 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3085 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3087 float_raise( float_flag_invalid STATUS_VAR
);
3090 av
= float64_val(a
);
3091 bv
= float64_val(b
);
3092 return ( av
== bv
) || ( (bits64
) ( ( av
| bv
)<<1 ) == 0 );
3096 /*----------------------------------------------------------------------------
3097 | Returns 1 if the double-precision floating-point value `a' is less than or
3098 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3099 | cause an exception. Otherwise, the comparison is performed according to the
3100 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3101 *----------------------------------------------------------------------------*/
3103 int float64_le_quiet( float64 a
, float64 b STATUS_PARAM
)
3108 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3109 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3111 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
3112 float_raise( float_flag_invalid STATUS_VAR
);
3116 aSign
= extractFloat64Sign( a
);
3117 bSign
= extractFloat64Sign( b
);
3118 av
= float64_val(a
);
3119 bv
= float64_val(b
);
3120 if ( aSign
!= bSign
) return aSign
|| ( (bits64
) ( ( av
| bv
)<<1 ) == 0 );
3121 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
3125 /*----------------------------------------------------------------------------
3126 | Returns 1 if the double-precision floating-point value `a' is less than
3127 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3128 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
3129 | Standard for Binary Floating-Point Arithmetic.
3130 *----------------------------------------------------------------------------*/
3132 int float64_lt_quiet( float64 a
, float64 b STATUS_PARAM
)
3137 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3138 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3140 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
3141 float_raise( float_flag_invalid STATUS_VAR
);
3145 aSign
= extractFloat64Sign( a
);
3146 bSign
= extractFloat64Sign( b
);
3147 av
= float64_val(a
);
3148 bv
= float64_val(b
);
3149 if ( aSign
!= bSign
) return aSign
&& ( (bits64
) ( ( av
| bv
)<<1 ) != 0 );
3150 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
3156 /*----------------------------------------------------------------------------
3157 | Returns the result of converting the extended double-precision floating-
3158 | point value `a' to the 32-bit two's complement integer format. The
3159 | conversion is performed according to the IEC/IEEE Standard for Binary
3160 | Floating-Point Arithmetic---which means in particular that the conversion
3161 | is rounded according to the current rounding mode. If `a' is a NaN, the
3162 | largest positive integer is returned. Otherwise, if the conversion
3163 | overflows, the largest integer with the same sign as `a' is returned.
3164 *----------------------------------------------------------------------------*/
3166 int32
floatx80_to_int32( floatx80 a STATUS_PARAM
)
3169 int32 aExp
, shiftCount
;
3172 aSig
= extractFloatx80Frac( a
);
3173 aExp
= extractFloatx80Exp( a
);
3174 aSign
= extractFloatx80Sign( a
);
3175 if ( ( aExp
== 0x7FFF ) && (bits64
) ( aSig
<<1 ) ) aSign
= 0;
3176 shiftCount
= 0x4037 - aExp
;
3177 if ( shiftCount
<= 0 ) shiftCount
= 1;
3178 shift64RightJamming( aSig
, shiftCount
, &aSig
);
3179 return roundAndPackInt32( aSign
, aSig STATUS_VAR
);
3183 /*----------------------------------------------------------------------------
3184 | Returns the result of converting the extended double-precision floating-
3185 | point value `a' to the 32-bit two's complement integer format. The
3186 | conversion is performed according to the IEC/IEEE Standard for Binary
3187 | Floating-Point Arithmetic, except that the conversion is always rounded
3188 | toward zero. If `a' is a NaN, the largest positive integer is returned.
3189 | Otherwise, if the conversion overflows, the largest integer with the same
3190 | sign as `a' is returned.
3191 *----------------------------------------------------------------------------*/
3193 int32
floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM
)
3196 int32 aExp
, shiftCount
;
3197 bits64 aSig
, savedASig
;
3200 aSig
= extractFloatx80Frac( a
);
3201 aExp
= extractFloatx80Exp( a
);
3202 aSign
= extractFloatx80Sign( a
);
3203 if ( 0x401E < aExp
) {
3204 if ( ( aExp
== 0x7FFF ) && (bits64
) ( aSig
<<1 ) ) aSign
= 0;
3207 else if ( aExp
< 0x3FFF ) {
3208 if ( aExp
|| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
3211 shiftCount
= 0x403E - aExp
;
3213 aSig
>>= shiftCount
;
3215 if ( aSign
) z
= - z
;
3216 if ( ( z
< 0 ) ^ aSign
) {
3218 float_raise( float_flag_invalid STATUS_VAR
);
3219 return aSign
? (sbits32
) 0x80000000 : 0x7FFFFFFF;
3221 if ( ( aSig
<<shiftCount
) != savedASig
) {
3222 STATUS(float_exception_flags
) |= float_flag_inexact
;
3228 /*----------------------------------------------------------------------------
3229 | Returns the result of converting the extended double-precision floating-
3230 | point value `a' to the 64-bit two's complement integer format. The
3231 | conversion is performed according to the IEC/IEEE Standard for Binary
3232 | Floating-Point Arithmetic---which means in particular that the conversion
3233 | is rounded according to the current rounding mode. If `a' is a NaN,
3234 | the largest positive integer is returned. Otherwise, if the conversion
3235 | overflows, the largest integer with the same sign as `a' is returned.
3236 *----------------------------------------------------------------------------*/
3238 int64
floatx80_to_int64( floatx80 a STATUS_PARAM
)
3241 int32 aExp
, shiftCount
;
3242 bits64 aSig
, aSigExtra
;
3244 aSig
= extractFloatx80Frac( a
);
3245 aExp
= extractFloatx80Exp( a
);
3246 aSign
= extractFloatx80Sign( a
);
3247 shiftCount
= 0x403E - aExp
;
3248 if ( shiftCount
<= 0 ) {
3250 float_raise( float_flag_invalid STATUS_VAR
);
3252 || ( ( aExp
== 0x7FFF )
3253 && ( aSig
!= LIT64( 0x8000000000000000 ) ) )
3255 return LIT64( 0x7FFFFFFFFFFFFFFF );
3257 return (sbits64
) LIT64( 0x8000000000000000 );
3262 shift64ExtraRightJamming( aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
3264 return roundAndPackInt64( aSign
, aSig
, aSigExtra STATUS_VAR
);
3268 /*----------------------------------------------------------------------------
3269 | Returns the result of converting the extended double-precision floating-
3270 | point value `a' to the 64-bit two's complement integer format. The
3271 | conversion is performed according to the IEC/IEEE Standard for Binary
3272 | Floating-Point Arithmetic, except that the conversion is always rounded
3273 | toward zero. If `a' is a NaN, the largest positive integer is returned.
3274 | Otherwise, if the conversion overflows, the largest integer with the same
3275 | sign as `a' is returned.
3276 *----------------------------------------------------------------------------*/
3278 int64
floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM
)
3281 int32 aExp
, shiftCount
;
3285 aSig
= extractFloatx80Frac( a
);
3286 aExp
= extractFloatx80Exp( a
);
3287 aSign
= extractFloatx80Sign( a
);
3288 shiftCount
= aExp
- 0x403E;
3289 if ( 0 <= shiftCount
) {
3290 aSig
&= LIT64( 0x7FFFFFFFFFFFFFFF );
3291 if ( ( a
.high
!= 0xC03E ) || aSig
) {
3292 float_raise( float_flag_invalid STATUS_VAR
);
3293 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && aSig
) ) {
3294 return LIT64( 0x7FFFFFFFFFFFFFFF );
3297 return (sbits64
) LIT64( 0x8000000000000000 );
3299 else if ( aExp
< 0x3FFF ) {
3300 if ( aExp
| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
3303 z
= aSig
>>( - shiftCount
);
3304 if ( (bits64
) ( aSig
<<( shiftCount
& 63 ) ) ) {
3305 STATUS(float_exception_flags
) |= float_flag_inexact
;
3307 if ( aSign
) z
= - z
;
3312 /*----------------------------------------------------------------------------
3313 | Returns the result of converting the extended double-precision floating-
3314 | point value `a' to the single-precision floating-point format. The
3315 | conversion is performed according to the IEC/IEEE Standard for Binary
3316 | Floating-Point Arithmetic.
3317 *----------------------------------------------------------------------------*/
3319 float32
floatx80_to_float32( floatx80 a STATUS_PARAM
)
3325 aSig
= extractFloatx80Frac( a
);
3326 aExp
= extractFloatx80Exp( a
);
3327 aSign
= extractFloatx80Sign( a
);
3328 if ( aExp
== 0x7FFF ) {
3329 if ( (bits64
) ( aSig
<<1 ) ) {
3330 return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR
) );
3332 return packFloat32( aSign
, 0xFF, 0 );
3334 shift64RightJamming( aSig
, 33, &aSig
);
3335 if ( aExp
|| aSig
) aExp
-= 0x3F81;
3336 return roundAndPackFloat32( aSign
, aExp
, aSig STATUS_VAR
);
3340 /*----------------------------------------------------------------------------
3341 | Returns the result of converting the extended double-precision floating-
3342 | point value `a' to the double-precision floating-point format. The
3343 | conversion is performed according to the IEC/IEEE Standard for Binary
3344 | Floating-Point Arithmetic.
3345 *----------------------------------------------------------------------------*/
3347 float64
floatx80_to_float64( floatx80 a STATUS_PARAM
)
3353 aSig
= extractFloatx80Frac( a
);
3354 aExp
= extractFloatx80Exp( a
);
3355 aSign
= extractFloatx80Sign( a
);
3356 if ( aExp
== 0x7FFF ) {
3357 if ( (bits64
) ( aSig
<<1 ) ) {
3358 return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR
) );
3360 return packFloat64( aSign
, 0x7FF, 0 );
3362 shift64RightJamming( aSig
, 1, &zSig
);
3363 if ( aExp
|| aSig
) aExp
-= 0x3C01;
3364 return roundAndPackFloat64( aSign
, aExp
, zSig STATUS_VAR
);
3370 /*----------------------------------------------------------------------------
3371 | Returns the result of converting the extended double-precision floating-
3372 | point value `a' to the quadruple-precision floating-point format. The
3373 | conversion is performed according to the IEC/IEEE Standard for Binary
3374 | Floating-Point Arithmetic.
3375 *----------------------------------------------------------------------------*/
3377 float128
floatx80_to_float128( floatx80 a STATUS_PARAM
)
3381 bits64 aSig
, zSig0
, zSig1
;
3383 aSig
= extractFloatx80Frac( a
);
3384 aExp
= extractFloatx80Exp( a
);
3385 aSign
= extractFloatx80Sign( a
);
3386 if ( ( aExp
== 0x7FFF ) && (bits64
) ( aSig
<<1 ) ) {
3387 return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR
) );
3389 shift128Right( aSig
<<1, 0, 16, &zSig0
, &zSig1
);
3390 return packFloat128( aSign
, aExp
, zSig0
, zSig1
);
3396 /*----------------------------------------------------------------------------
3397 | Rounds the extended double-precision floating-point value `a' to an integer,
3398 | and returns the result as an extended quadruple-precision floating-point
3399 | value. The operation is performed according to the IEC/IEEE Standard for
3400 | Binary Floating-Point Arithmetic.
3401 *----------------------------------------------------------------------------*/
3403 floatx80
floatx80_round_to_int( floatx80 a STATUS_PARAM
)
3407 bits64 lastBitMask
, roundBitsMask
;
3411 aExp
= extractFloatx80Exp( a
);
3412 if ( 0x403E <= aExp
) {
3413 if ( ( aExp
== 0x7FFF ) && (bits64
) ( extractFloatx80Frac( a
)<<1 ) ) {
3414 return propagateFloatx80NaN( a
, a STATUS_VAR
);
3418 if ( aExp
< 0x3FFF ) {
3420 && ( (bits64
) ( extractFloatx80Frac( a
)<<1 ) == 0 ) ) {
3423 STATUS(float_exception_flags
) |= float_flag_inexact
;
3424 aSign
= extractFloatx80Sign( a
);
3425 switch ( STATUS(float_rounding_mode
) ) {
3426 case float_round_nearest_even
:
3427 if ( ( aExp
== 0x3FFE ) && (bits64
) ( extractFloatx80Frac( a
)<<1 )
3430 packFloatx80( aSign
, 0x3FFF, LIT64( 0x8000000000000000 ) );
3433 case float_round_down
:
3436 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3437 : packFloatx80( 0, 0, 0 );
3438 case float_round_up
:
3440 aSign
? packFloatx80( 1, 0, 0 )
3441 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3443 return packFloatx80( aSign
, 0, 0 );
3446 lastBitMask
<<= 0x403E - aExp
;
3447 roundBitsMask
= lastBitMask
- 1;
3449 roundingMode
= STATUS(float_rounding_mode
);
3450 if ( roundingMode
== float_round_nearest_even
) {
3451 z
.low
+= lastBitMask
>>1;
3452 if ( ( z
.low
& roundBitsMask
) == 0 ) z
.low
&= ~ lastBitMask
;
3454 else if ( roundingMode
!= float_round_to_zero
) {
3455 if ( extractFloatx80Sign( z
) ^ ( roundingMode
== float_round_up
) ) {
3456 z
.low
+= roundBitsMask
;
3459 z
.low
&= ~ roundBitsMask
;
3462 z
.low
= LIT64( 0x8000000000000000 );
3464 if ( z
.low
!= a
.low
) STATUS(float_exception_flags
) |= float_flag_inexact
;
3469 /*----------------------------------------------------------------------------
3470 | Returns the result of adding the absolute values of the extended double-
3471 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
3472 | negated before being returned. `zSign' is ignored if the result is a NaN.
3473 | The addition is performed according to the IEC/IEEE Standard for Binary
3474 | Floating-Point Arithmetic.
3475 *----------------------------------------------------------------------------*/
3477 static floatx80
addFloatx80Sigs( floatx80 a
, floatx80 b
, flag zSign STATUS_PARAM
)
3479 int32 aExp
, bExp
, zExp
;
3480 bits64 aSig
, bSig
, zSig0
, zSig1
;
3483 aSig
= extractFloatx80Frac( a
);
3484 aExp
= extractFloatx80Exp( a
);
3485 bSig
= extractFloatx80Frac( b
);
3486 bExp
= extractFloatx80Exp( b
);
3487 expDiff
= aExp
- bExp
;
3488 if ( 0 < expDiff
) {
3489 if ( aExp
== 0x7FFF ) {
3490 if ( (bits64
) ( aSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
3493 if ( bExp
== 0 ) --expDiff
;
3494 shift64ExtraRightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
3497 else if ( expDiff
< 0 ) {
3498 if ( bExp
== 0x7FFF ) {
3499 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
3500 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
3502 if ( aExp
== 0 ) ++expDiff
;
3503 shift64ExtraRightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
3507 if ( aExp
== 0x7FFF ) {
3508 if ( (bits64
) ( ( aSig
| bSig
)<<1 ) ) {
3509 return propagateFloatx80NaN( a
, b STATUS_VAR
);
3514 zSig0
= aSig
+ bSig
;
3516 normalizeFloatx80Subnormal( zSig0
, &zExp
, &zSig0
);
3522 zSig0
= aSig
+ bSig
;
3523 if ( (sbits64
) zSig0
< 0 ) goto roundAndPack
;
3525 shift64ExtraRightJamming( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
3526 zSig0
|= LIT64( 0x8000000000000000 );
3530 roundAndPackFloatx80(
3531 STATUS(floatx80_rounding_precision
), zSign
, zExp
, zSig0
, zSig1 STATUS_VAR
);
3535 /*----------------------------------------------------------------------------
3536 | Returns the result of subtracting the absolute values of the extended
3537 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
3538 | difference is negated before being returned. `zSign' is ignored if the
3539 | result is a NaN. The subtraction is performed according to the IEC/IEEE
3540 | Standard for Binary Floating-Point Arithmetic.
3541 *----------------------------------------------------------------------------*/
3543 static floatx80
subFloatx80Sigs( floatx80 a
, floatx80 b
, flag zSign STATUS_PARAM
)
3545 int32 aExp
, bExp
, zExp
;
3546 bits64 aSig
, bSig
, zSig0
, zSig1
;
3550 aSig
= extractFloatx80Frac( a
);
3551 aExp
= extractFloatx80Exp( a
);
3552 bSig
= extractFloatx80Frac( b
);
3553 bExp
= extractFloatx80Exp( b
);
3554 expDiff
= aExp
- bExp
;
3555 if ( 0 < expDiff
) goto aExpBigger
;
3556 if ( expDiff
< 0 ) goto bExpBigger
;
3557 if ( aExp
== 0x7FFF ) {
3558 if ( (bits64
) ( ( aSig
| bSig
)<<1 ) ) {
3559 return propagateFloatx80NaN( a
, b STATUS_VAR
);
3561 float_raise( float_flag_invalid STATUS_VAR
);
3562 z
.low
= floatx80_default_nan_low
;
3563 z
.high
= floatx80_default_nan_high
;
3571 if ( bSig
< aSig
) goto aBigger
;
3572 if ( aSig
< bSig
) goto bBigger
;
3573 return packFloatx80( STATUS(float_rounding_mode
) == float_round_down
, 0, 0 );
3575 if ( bExp
== 0x7FFF ) {
3576 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
3577 return packFloatx80( zSign
^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3579 if ( aExp
== 0 ) ++expDiff
;
3580 shift128RightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
3582 sub128( bSig
, 0, aSig
, zSig1
, &zSig0
, &zSig1
);
3585 goto normalizeRoundAndPack
;
3587 if ( aExp
== 0x7FFF ) {
3588 if ( (bits64
) ( aSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
3591 if ( bExp
== 0 ) --expDiff
;
3592 shift128RightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
3594 sub128( aSig
, 0, bSig
, zSig1
, &zSig0
, &zSig1
);
3596 normalizeRoundAndPack
:
3598 normalizeRoundAndPackFloatx80(
3599 STATUS(floatx80_rounding_precision
), zSign
, zExp
, zSig0
, zSig1 STATUS_VAR
);
3603 /*----------------------------------------------------------------------------
3604 | Returns the result of adding the extended double-precision floating-point
3605 | values `a' and `b'. The operation is performed according to the IEC/IEEE
3606 | Standard for Binary Floating-Point Arithmetic.
3607 *----------------------------------------------------------------------------*/
3609 floatx80
floatx80_add( floatx80 a
, floatx80 b STATUS_PARAM
)
3613 aSign
= extractFloatx80Sign( a
);
3614 bSign
= extractFloatx80Sign( b
);
3615 if ( aSign
== bSign
) {
3616 return addFloatx80Sigs( a
, b
, aSign STATUS_VAR
);
3619 return subFloatx80Sigs( a
, b
, aSign STATUS_VAR
);
3624 /*----------------------------------------------------------------------------
3625 | Returns the result of subtracting the extended double-precision floating-
3626 | point values `a' and `b'. The operation is performed according to the
3627 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3628 *----------------------------------------------------------------------------*/
3630 floatx80
floatx80_sub( floatx80 a
, floatx80 b STATUS_PARAM
)
3634 aSign
= extractFloatx80Sign( a
);
3635 bSign
= extractFloatx80Sign( b
);
3636 if ( aSign
== bSign
) {
3637 return subFloatx80Sigs( a
, b
, aSign STATUS_VAR
);
3640 return addFloatx80Sigs( a
, b
, aSign STATUS_VAR
);
3645 /*----------------------------------------------------------------------------
3646 | Returns the result of multiplying the extended double-precision floating-
3647 | point values `a' and `b'. The operation is performed according to the
3648 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3649 *----------------------------------------------------------------------------*/
3651 floatx80
floatx80_mul( floatx80 a
, floatx80 b STATUS_PARAM
)
3653 flag aSign
, bSign
, zSign
;
3654 int32 aExp
, bExp
, zExp
;
3655 bits64 aSig
, bSig
, zSig0
, zSig1
;
3658 aSig
= extractFloatx80Frac( a
);
3659 aExp
= extractFloatx80Exp( a
);
3660 aSign
= extractFloatx80Sign( a
);
3661 bSig
= extractFloatx80Frac( b
);
3662 bExp
= extractFloatx80Exp( b
);
3663 bSign
= extractFloatx80Sign( b
);
3664 zSign
= aSign
^ bSign
;
3665 if ( aExp
== 0x7FFF ) {
3666 if ( (bits64
) ( aSig
<<1 )
3667 || ( ( bExp
== 0x7FFF ) && (bits64
) ( bSig
<<1 ) ) ) {
3668 return propagateFloatx80NaN( a
, b STATUS_VAR
);
3670 if ( ( bExp
| bSig
) == 0 ) goto invalid
;
3671 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
3673 if ( bExp
== 0x7FFF ) {
3674 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
3675 if ( ( aExp
| aSig
) == 0 ) {
3677 float_raise( float_flag_invalid STATUS_VAR
);
3678 z
.low
= floatx80_default_nan_low
;
3679 z
.high
= floatx80_default_nan_high
;
3682 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
3685 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
3686 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
3689 if ( bSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
3690 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
3692 zExp
= aExp
+ bExp
- 0x3FFE;
3693 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
3694 if ( 0 < (sbits64
) zSig0
) {
3695 shortShift128Left( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
3699 roundAndPackFloatx80(
3700 STATUS(floatx80_rounding_precision
), zSign
, zExp
, zSig0
, zSig1 STATUS_VAR
);
3704 /*----------------------------------------------------------------------------
3705 | Returns the result of dividing the extended double-precision floating-point
3706 | value `a' by the corresponding value `b'. The operation is performed
3707 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3708 *----------------------------------------------------------------------------*/
3710 floatx80
floatx80_div( floatx80 a
, floatx80 b STATUS_PARAM
)
3712 flag aSign
, bSign
, zSign
;
3713 int32 aExp
, bExp
, zExp
;
3714 bits64 aSig
, bSig
, zSig0
, zSig1
;
3715 bits64 rem0
, rem1
, rem2
, term0
, term1
, term2
;
3718 aSig
= extractFloatx80Frac( a
);
3719 aExp
= extractFloatx80Exp( a
);
3720 aSign
= extractFloatx80Sign( a
);
3721 bSig
= extractFloatx80Frac( b
);
3722 bExp
= extractFloatx80Exp( b
);
3723 bSign
= extractFloatx80Sign( b
);
3724 zSign
= aSign
^ bSign
;
3725 if ( aExp
== 0x7FFF ) {
3726 if ( (bits64
) ( aSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
3727 if ( bExp
== 0x7FFF ) {
3728 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
3731 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
3733 if ( bExp
== 0x7FFF ) {
3734 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
3735 return packFloatx80( zSign
, 0, 0 );
3739 if ( ( aExp
| aSig
) == 0 ) {
3741 float_raise( float_flag_invalid STATUS_VAR
);
3742 z
.low
= floatx80_default_nan_low
;
3743 z
.high
= floatx80_default_nan_high
;
3746 float_raise( float_flag_divbyzero STATUS_VAR
);
3747 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
3749 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
3752 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
3753 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
3755 zExp
= aExp
- bExp
+ 0x3FFE;
3757 if ( bSig
<= aSig
) {
3758 shift128Right( aSig
, 0, 1, &aSig
, &rem1
);
3761 zSig0
= estimateDiv128To64( aSig
, rem1
, bSig
);
3762 mul64To128( bSig
, zSig0
, &term0
, &term1
);
3763 sub128( aSig
, rem1
, term0
, term1
, &rem0
, &rem1
);
3764 while ( (sbits64
) rem0
< 0 ) {
3766 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
3768 zSig1
= estimateDiv128To64( rem1
, 0, bSig
);
3769 if ( (bits64
) ( zSig1
<<1 ) <= 8 ) {
3770 mul64To128( bSig
, zSig1
, &term1
, &term2
);
3771 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
3772 while ( (sbits64
) rem1
< 0 ) {
3774 add128( rem1
, rem2
, 0, bSig
, &rem1
, &rem2
);
3776 zSig1
|= ( ( rem1
| rem2
) != 0 );
3779 roundAndPackFloatx80(
3780 STATUS(floatx80_rounding_precision
), zSign
, zExp
, zSig0
, zSig1 STATUS_VAR
);
3784 /*----------------------------------------------------------------------------
3785 | Returns the remainder of the extended double-precision floating-point value
3786 | `a' with respect to the corresponding value `b'. The operation is performed
3787 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3788 *----------------------------------------------------------------------------*/
3790 floatx80
floatx80_rem( floatx80 a
, floatx80 b STATUS_PARAM
)
3792 flag aSign
, bSign
, zSign
;
3793 int32 aExp
, bExp
, expDiff
;
3794 bits64 aSig0
, aSig1
, bSig
;
3795 bits64 q
, term0
, term1
, alternateASig0
, alternateASig1
;
3798 aSig0
= extractFloatx80Frac( a
);
3799 aExp
= extractFloatx80Exp( a
);
3800 aSign
= extractFloatx80Sign( a
);
3801 bSig
= extractFloatx80Frac( b
);
3802 bExp
= extractFloatx80Exp( b
);
3803 bSign
= extractFloatx80Sign( b
);
3804 if ( aExp
== 0x7FFF ) {
3805 if ( (bits64
) ( aSig0
<<1 )
3806 || ( ( bExp
== 0x7FFF ) && (bits64
) ( bSig
<<1 ) ) ) {
3807 return propagateFloatx80NaN( a
, b STATUS_VAR
);
3811 if ( bExp
== 0x7FFF ) {
3812 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
3818 float_raise( float_flag_invalid STATUS_VAR
);
3819 z
.low
= floatx80_default_nan_low
;
3820 z
.high
= floatx80_default_nan_high
;
3823 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
3826 if ( (bits64
) ( aSig0
<<1 ) == 0 ) return a
;
3827 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
3829 bSig
|= LIT64( 0x8000000000000000 );
3831 expDiff
= aExp
- bExp
;
3833 if ( expDiff
< 0 ) {
3834 if ( expDiff
< -1 ) return a
;
3835 shift128Right( aSig0
, 0, 1, &aSig0
, &aSig1
);
3838 q
= ( bSig
<= aSig0
);
3839 if ( q
) aSig0
-= bSig
;
3841 while ( 0 < expDiff
) {
3842 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
3843 q
= ( 2 < q
) ? q
- 2 : 0;
3844 mul64To128( bSig
, q
, &term0
, &term1
);
3845 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
3846 shortShift128Left( aSig0
, aSig1
, 62, &aSig0
, &aSig1
);
3850 if ( 0 < expDiff
) {
3851 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
3852 q
= ( 2 < q
) ? q
- 2 : 0;
3854 mul64To128( bSig
, q
<<( 64 - expDiff
), &term0
, &term1
);
3855 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
3856 shortShift128Left( 0, bSig
, 64 - expDiff
, &term0
, &term1
);
3857 while ( le128( term0
, term1
, aSig0
, aSig1
) ) {
3859 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
3866 sub128( term0
, term1
, aSig0
, aSig1
, &alternateASig0
, &alternateASig1
);
3867 if ( lt128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
3868 || ( eq128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
3871 aSig0
= alternateASig0
;
3872 aSig1
= alternateASig1
;
3876 normalizeRoundAndPackFloatx80(
3877 80, zSign
, bExp
+ expDiff
, aSig0
, aSig1 STATUS_VAR
);
3881 /*----------------------------------------------------------------------------
3882 | Returns the square root of the extended double-precision floating-point
3883 | value `a'. The operation is performed according to the IEC/IEEE Standard
3884 | for Binary Floating-Point Arithmetic.
3885 *----------------------------------------------------------------------------*/
3887 floatx80
floatx80_sqrt( floatx80 a STATUS_PARAM
)
3891 bits64 aSig0
, aSig1
, zSig0
, zSig1
, doubleZSig0
;
3892 bits64 rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
3895 aSig0
= extractFloatx80Frac( a
);
3896 aExp
= extractFloatx80Exp( a
);
3897 aSign
= extractFloatx80Sign( a
);
3898 if ( aExp
== 0x7FFF ) {
3899 if ( (bits64
) ( aSig0
<<1 ) ) return propagateFloatx80NaN( a
, a STATUS_VAR
);
3900 if ( ! aSign
) return a
;
3904 if ( ( aExp
| aSig0
) == 0 ) return a
;
3906 float_raise( float_flag_invalid STATUS_VAR
);
3907 z
.low
= floatx80_default_nan_low
;
3908 z
.high
= floatx80_default_nan_high
;
3912 if ( aSig0
== 0 ) return packFloatx80( 0, 0, 0 );
3913 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
3915 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFF;
3916 zSig0
= estimateSqrt32( aExp
, aSig0
>>32 );
3917 shift128Right( aSig0
, 0, 2 + ( aExp
& 1 ), &aSig0
, &aSig1
);
3918 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
3919 doubleZSig0
= zSig0
<<1;
3920 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
3921 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
3922 while ( (sbits64
) rem0
< 0 ) {
3925 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
3927 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
3928 if ( ( zSig1
& LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
3929 if ( zSig1
== 0 ) zSig1
= 1;
3930 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
3931 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
3932 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
3933 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
3934 while ( (sbits64
) rem1
< 0 ) {
3936 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
3938 term2
|= doubleZSig0
;
3939 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
3941 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
3943 shortShift128Left( 0, zSig1
, 1, &zSig0
, &zSig1
);
3944 zSig0
|= doubleZSig0
;
3946 roundAndPackFloatx80(
3947 STATUS(floatx80_rounding_precision
), 0, zExp
, zSig0
, zSig1 STATUS_VAR
);
3951 /*----------------------------------------------------------------------------
3952 | Returns 1 if the extended double-precision floating-point value `a' is
3953 | equal to the corresponding value `b', and 0 otherwise. The comparison is
3954 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3956 *----------------------------------------------------------------------------*/
3958 int floatx80_eq( floatx80 a
, floatx80 b STATUS_PARAM
)
3961 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
3962 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
3963 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
3964 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
3966 if ( floatx80_is_signaling_nan( a
)
3967 || floatx80_is_signaling_nan( b
) ) {
3968 float_raise( float_flag_invalid STATUS_VAR
);
3974 && ( ( a
.high
== b
.high
)
3976 && ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
3981 /*----------------------------------------------------------------------------
3982 | Returns 1 if the extended double-precision floating-point value `a' is
3983 | less than or equal to the corresponding value `b', and 0 otherwise. The
3984 | comparison is performed according to the IEC/IEEE Standard for Binary
3985 | Floating-Point Arithmetic.
3986 *----------------------------------------------------------------------------*/
3988 int floatx80_le( floatx80 a
, floatx80 b STATUS_PARAM
)
3992 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
3993 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
3994 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
3995 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
3997 float_raise( float_flag_invalid STATUS_VAR
);
4000 aSign
= extractFloatx80Sign( a
);
4001 bSign
= extractFloatx80Sign( b
);
4002 if ( aSign
!= bSign
) {
4005 || ( ( ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
4009 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
4010 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
4014 /*----------------------------------------------------------------------------
4015 | Returns 1 if the extended double-precision floating-point value `a' is
4016 | less than the corresponding value `b', and 0 otherwise. The comparison
4017 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4019 *----------------------------------------------------------------------------*/
4021 int floatx80_lt( floatx80 a
, floatx80 b STATUS_PARAM
)
4025 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4026 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
4027 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4028 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
4030 float_raise( float_flag_invalid STATUS_VAR
);
4033 aSign
= extractFloatx80Sign( a
);
4034 bSign
= extractFloatx80Sign( b
);
4035 if ( aSign
!= bSign
) {
4038 && ( ( ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
4042 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
4043 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
4047 /*----------------------------------------------------------------------------
4048 | Returns 1 if the extended double-precision floating-point value `a' is equal
4049 | to the corresponding value `b', and 0 otherwise. The invalid exception is
4050 | raised if either operand is a NaN. Otherwise, the comparison is performed
4051 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4052 *----------------------------------------------------------------------------*/
4054 int floatx80_eq_signaling( floatx80 a
, floatx80 b STATUS_PARAM
)
4057 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4058 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
4059 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4060 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
4062 float_raise( float_flag_invalid STATUS_VAR
);
4067 && ( ( a
.high
== b
.high
)
4069 && ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
4074 /*----------------------------------------------------------------------------
4075 | Returns 1 if the extended double-precision floating-point value `a' is less
4076 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
4077 | do not cause an exception. Otherwise, the comparison is performed according
4078 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4079 *----------------------------------------------------------------------------*/
4081 int floatx80_le_quiet( floatx80 a
, floatx80 b STATUS_PARAM
)
4085 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4086 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
4087 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4088 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
4090 if ( floatx80_is_signaling_nan( a
)
4091 || floatx80_is_signaling_nan( b
) ) {
4092 float_raise( float_flag_invalid STATUS_VAR
);
4096 aSign
= extractFloatx80Sign( a
);
4097 bSign
= extractFloatx80Sign( b
);
4098 if ( aSign
!= bSign
) {
4101 || ( ( ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
4105 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
4106 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
4110 /*----------------------------------------------------------------------------
4111 | Returns 1 if the extended double-precision floating-point value `a' is less
4112 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
4113 | an exception. Otherwise, the comparison is performed according to the
4114 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4115 *----------------------------------------------------------------------------*/
4117 int floatx80_lt_quiet( floatx80 a
, floatx80 b STATUS_PARAM
)
4121 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4122 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
4123 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4124 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
4126 if ( floatx80_is_signaling_nan( a
)
4127 || floatx80_is_signaling_nan( b
) ) {
4128 float_raise( float_flag_invalid STATUS_VAR
);
4132 aSign
= extractFloatx80Sign( a
);
4133 bSign
= extractFloatx80Sign( b
);
4134 if ( aSign
!= bSign
) {
4137 && ( ( ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
4141 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
4142 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
4150 /*----------------------------------------------------------------------------
4151 | Returns the result of converting the quadruple-precision floating-point
4152 | value `a' to the 32-bit two's complement integer format. The conversion
4153 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4154 | Arithmetic---which means in particular that the conversion is rounded
4155 | according to the current rounding mode. If `a' is a NaN, the largest
4156 | positive integer is returned. Otherwise, if the conversion overflows, the
4157 | largest integer with the same sign as `a' is returned.
4158 *----------------------------------------------------------------------------*/
4160 int32
float128_to_int32( float128 a STATUS_PARAM
)
4163 int32 aExp
, shiftCount
;
4164 bits64 aSig0
, aSig1
;
4166 aSig1
= extractFloat128Frac1( a
);
4167 aSig0
= extractFloat128Frac0( a
);
4168 aExp
= extractFloat128Exp( a
);
4169 aSign
= extractFloat128Sign( a
);
4170 if ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) aSign
= 0;
4171 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
4172 aSig0
|= ( aSig1
!= 0 );
4173 shiftCount
= 0x4028 - aExp
;
4174 if ( 0 < shiftCount
) shift64RightJamming( aSig0
, shiftCount
, &aSig0
);
4175 return roundAndPackInt32( aSign
, aSig0 STATUS_VAR
);
4179 /*----------------------------------------------------------------------------
4180 | Returns the result of converting the quadruple-precision floating-point
4181 | value `a' to the 32-bit two's complement integer format. The conversion
4182 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4183 | Arithmetic, except that the conversion is always rounded toward zero. If
4184 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
4185 | conversion overflows, the largest integer with the same sign as `a' is
4187 *----------------------------------------------------------------------------*/
4189 int32
float128_to_int32_round_to_zero( float128 a STATUS_PARAM
)
4192 int32 aExp
, shiftCount
;
4193 bits64 aSig0
, aSig1
, savedASig
;
4196 aSig1
= extractFloat128Frac1( a
);
4197 aSig0
= extractFloat128Frac0( a
);
4198 aExp
= extractFloat128Exp( a
);
4199 aSign
= extractFloat128Sign( a
);
4200 aSig0
|= ( aSig1
!= 0 );
4201 if ( 0x401E < aExp
) {
4202 if ( ( aExp
== 0x7FFF ) && aSig0
) aSign
= 0;
4205 else if ( aExp
< 0x3FFF ) {
4206 if ( aExp
|| aSig0
) STATUS(float_exception_flags
) |= float_flag_inexact
;
4209 aSig0
|= LIT64( 0x0001000000000000 );
4210 shiftCount
= 0x402F - aExp
;
4212 aSig0
>>= shiftCount
;
4214 if ( aSign
) z
= - z
;
4215 if ( ( z
< 0 ) ^ aSign
) {
4217 float_raise( float_flag_invalid STATUS_VAR
);
4218 return aSign
? (sbits32
) 0x80000000 : 0x7FFFFFFF;
4220 if ( ( aSig0
<<shiftCount
) != savedASig
) {
4221 STATUS(float_exception_flags
) |= float_flag_inexact
;
4227 /*----------------------------------------------------------------------------
4228 | Returns the result of converting the quadruple-precision floating-point
4229 | value `a' to the 64-bit two's complement integer format. The conversion
4230 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4231 | Arithmetic---which means in particular that the conversion is rounded
4232 | according to the current rounding mode. If `a' is a NaN, the largest
4233 | positive integer is returned. Otherwise, if the conversion overflows, the
4234 | largest integer with the same sign as `a' is returned.
4235 *----------------------------------------------------------------------------*/
4237 int64
float128_to_int64( float128 a STATUS_PARAM
)
4240 int32 aExp
, shiftCount
;
4241 bits64 aSig0
, aSig1
;
4243 aSig1
= extractFloat128Frac1( a
);
4244 aSig0
= extractFloat128Frac0( a
);
4245 aExp
= extractFloat128Exp( a
);
4246 aSign
= extractFloat128Sign( a
);
4247 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
4248 shiftCount
= 0x402F - aExp
;
4249 if ( shiftCount
<= 0 ) {
4250 if ( 0x403E < aExp
) {
4251 float_raise( float_flag_invalid STATUS_VAR
);
4253 || ( ( aExp
== 0x7FFF )
4254 && ( aSig1
|| ( aSig0
!= LIT64( 0x0001000000000000 ) ) )
4257 return LIT64( 0x7FFFFFFFFFFFFFFF );
4259 return (sbits64
) LIT64( 0x8000000000000000 );
4261 shortShift128Left( aSig0
, aSig1
, - shiftCount
, &aSig0
, &aSig1
);
4264 shift64ExtraRightJamming( aSig0
, aSig1
, shiftCount
, &aSig0
, &aSig1
);
4266 return roundAndPackInt64( aSign
, aSig0
, aSig1 STATUS_VAR
);
4270 /*----------------------------------------------------------------------------
4271 | Returns the result of converting the quadruple-precision floating-point
4272 | value `a' to the 64-bit two's complement integer format. The conversion
4273 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4274 | Arithmetic, except that the conversion is always rounded toward zero.
4275 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
4276 | the conversion overflows, the largest integer with the same sign as `a' is
4278 *----------------------------------------------------------------------------*/
4280 int64
float128_to_int64_round_to_zero( float128 a STATUS_PARAM
)
4283 int32 aExp
, shiftCount
;
4284 bits64 aSig0
, aSig1
;
4287 aSig1
= extractFloat128Frac1( a
);
4288 aSig0
= extractFloat128Frac0( a
);
4289 aExp
= extractFloat128Exp( a
);
4290 aSign
= extractFloat128Sign( a
);
4291 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
4292 shiftCount
= aExp
- 0x402F;
4293 if ( 0 < shiftCount
) {
4294 if ( 0x403E <= aExp
) {
4295 aSig0
&= LIT64( 0x0000FFFFFFFFFFFF );
4296 if ( ( a
.high
== LIT64( 0xC03E000000000000 ) )
4297 && ( aSig1
< LIT64( 0x0002000000000000 ) ) ) {
4298 if ( aSig1
) STATUS(float_exception_flags
) |= float_flag_inexact
;
4301 float_raise( float_flag_invalid STATUS_VAR
);
4302 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) ) {
4303 return LIT64( 0x7FFFFFFFFFFFFFFF );
4306 return (sbits64
) LIT64( 0x8000000000000000 );
4308 z
= ( aSig0
<<shiftCount
) | ( aSig1
>>( ( - shiftCount
) & 63 ) );
4309 if ( (bits64
) ( aSig1
<<shiftCount
) ) {
4310 STATUS(float_exception_flags
) |= float_flag_inexact
;
4314 if ( aExp
< 0x3FFF ) {
4315 if ( aExp
| aSig0
| aSig1
) {
4316 STATUS(float_exception_flags
) |= float_flag_inexact
;
4320 z
= aSig0
>>( - shiftCount
);
4322 || ( shiftCount
&& (bits64
) ( aSig0
<<( shiftCount
& 63 ) ) ) ) {
4323 STATUS(float_exception_flags
) |= float_flag_inexact
;
4326 if ( aSign
) z
= - z
;
4331 /*----------------------------------------------------------------------------
4332 | Returns the result of converting the quadruple-precision floating-point
4333 | value `a' to the single-precision floating-point format. The conversion
4334 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4336 *----------------------------------------------------------------------------*/
4338 float32
float128_to_float32( float128 a STATUS_PARAM
)
4342 bits64 aSig0
, aSig1
;
4345 aSig1
= extractFloat128Frac1( a
);
4346 aSig0
= extractFloat128Frac0( a
);
4347 aExp
= extractFloat128Exp( a
);
4348 aSign
= extractFloat128Sign( a
);
4349 if ( aExp
== 0x7FFF ) {
4350 if ( aSig0
| aSig1
) {
4351 return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR
) );
4353 return packFloat32( aSign
, 0xFF, 0 );
4355 aSig0
|= ( aSig1
!= 0 );
4356 shift64RightJamming( aSig0
, 18, &aSig0
);
4358 if ( aExp
|| zSig
) {
4362 return roundAndPackFloat32( aSign
, aExp
, zSig STATUS_VAR
);
4366 /*----------------------------------------------------------------------------
4367 | Returns the result of converting the quadruple-precision floating-point
4368 | value `a' to the double-precision floating-point format. The conversion
4369 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4371 *----------------------------------------------------------------------------*/
4373 float64
float128_to_float64( float128 a STATUS_PARAM
)
4377 bits64 aSig0
, aSig1
;
4379 aSig1
= extractFloat128Frac1( a
);
4380 aSig0
= extractFloat128Frac0( a
);
4381 aExp
= extractFloat128Exp( a
);
4382 aSign
= extractFloat128Sign( a
);
4383 if ( aExp
== 0x7FFF ) {
4384 if ( aSig0
| aSig1
) {
4385 return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR
) );
4387 return packFloat64( aSign
, 0x7FF, 0 );
4389 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
4390 aSig0
|= ( aSig1
!= 0 );
4391 if ( aExp
|| aSig0
) {
4392 aSig0
|= LIT64( 0x4000000000000000 );
4395 return roundAndPackFloat64( aSign
, aExp
, aSig0 STATUS_VAR
);
4401 /*----------------------------------------------------------------------------
4402 | Returns the result of converting the quadruple-precision floating-point
4403 | value `a' to the extended double-precision floating-point format. The
4404 | conversion is performed according to the IEC/IEEE Standard for Binary
4405 | Floating-Point Arithmetic.
4406 *----------------------------------------------------------------------------*/
4408 floatx80
float128_to_floatx80( float128 a STATUS_PARAM
)
4412 bits64 aSig0
, aSig1
;
4414 aSig1
= extractFloat128Frac1( a
);
4415 aSig0
= extractFloat128Frac0( a
);
4416 aExp
= extractFloat128Exp( a
);
4417 aSign
= extractFloat128Sign( a
);
4418 if ( aExp
== 0x7FFF ) {
4419 if ( aSig0
| aSig1
) {
4420 return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR
) );
4422 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
4425 if ( ( aSig0
| aSig1
) == 0 ) return packFloatx80( aSign
, 0, 0 );
4426 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
4429 aSig0
|= LIT64( 0x0001000000000000 );
4431 shortShift128Left( aSig0
, aSig1
, 15, &aSig0
, &aSig1
);
4432 return roundAndPackFloatx80( 80, aSign
, aExp
, aSig0
, aSig1 STATUS_VAR
);
4438 /*----------------------------------------------------------------------------
4439 | Rounds the quadruple-precision floating-point value `a' to an integer, and
4440 | returns the result as a quadruple-precision floating-point value. The
4441 | operation is performed according to the IEC/IEEE Standard for Binary
4442 | Floating-Point Arithmetic.
4443 *----------------------------------------------------------------------------*/
4445 float128
float128_round_to_int( float128 a STATUS_PARAM
)
4449 bits64 lastBitMask
, roundBitsMask
;
4453 aExp
= extractFloat128Exp( a
);
4454 if ( 0x402F <= aExp
) {
4455 if ( 0x406F <= aExp
) {
4456 if ( ( aExp
== 0x7FFF )
4457 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) )
4459 return propagateFloat128NaN( a
, a STATUS_VAR
);
4464 lastBitMask
= ( lastBitMask
<<( 0x406E - aExp
) )<<1;
4465 roundBitsMask
= lastBitMask
- 1;
4467 roundingMode
= STATUS(float_rounding_mode
);
4468 if ( roundingMode
== float_round_nearest_even
) {
4469 if ( lastBitMask
) {
4470 add128( z
.high
, z
.low
, 0, lastBitMask
>>1, &z
.high
, &z
.low
);
4471 if ( ( z
.low
& roundBitsMask
) == 0 ) z
.low
&= ~ lastBitMask
;
4474 if ( (sbits64
) z
.low
< 0 ) {
4476 if ( (bits64
) ( z
.low
<<1 ) == 0 ) z
.high
&= ~1;
4480 else if ( roundingMode
!= float_round_to_zero
) {
4481 if ( extractFloat128Sign( z
)
4482 ^ ( roundingMode
== float_round_up
) ) {
4483 add128( z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
4486 z
.low
&= ~ roundBitsMask
;
4489 if ( aExp
< 0x3FFF ) {
4490 if ( ( ( (bits64
) ( a
.high
<<1 ) ) | a
.low
) == 0 ) return a
;
4491 STATUS(float_exception_flags
) |= float_flag_inexact
;
4492 aSign
= extractFloat128Sign( a
);
4493 switch ( STATUS(float_rounding_mode
) ) {
4494 case float_round_nearest_even
:
4495 if ( ( aExp
== 0x3FFE )
4496 && ( extractFloat128Frac0( a
)
4497 | extractFloat128Frac1( a
) )
4499 return packFloat128( aSign
, 0x3FFF, 0, 0 );
4502 case float_round_down
:
4504 aSign
? packFloat128( 1, 0x3FFF, 0, 0 )
4505 : packFloat128( 0, 0, 0, 0 );
4506 case float_round_up
:
4508 aSign
? packFloat128( 1, 0, 0, 0 )
4509 : packFloat128( 0, 0x3FFF, 0, 0 );
4511 return packFloat128( aSign
, 0, 0, 0 );
4514 lastBitMask
<<= 0x402F - aExp
;
4515 roundBitsMask
= lastBitMask
- 1;
4518 roundingMode
= STATUS(float_rounding_mode
);
4519 if ( roundingMode
== float_round_nearest_even
) {
4520 z
.high
+= lastBitMask
>>1;
4521 if ( ( ( z
.high
& roundBitsMask
) | a
.low
) == 0 ) {
4522 z
.high
&= ~ lastBitMask
;
4525 else if ( roundingMode
!= float_round_to_zero
) {
4526 if ( extractFloat128Sign( z
)
4527 ^ ( roundingMode
== float_round_up
) ) {
4528 z
.high
|= ( a
.low
!= 0 );
4529 z
.high
+= roundBitsMask
;
4532 z
.high
&= ~ roundBitsMask
;
4534 if ( ( z
.low
!= a
.low
) || ( z
.high
!= a
.high
) ) {
4535 STATUS(float_exception_flags
) |= float_flag_inexact
;
4541 /*----------------------------------------------------------------------------
4542 | Returns the result of adding the absolute values of the quadruple-precision
4543 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
4544 | before being returned. `zSign' is ignored if the result is a NaN.
4545 | The addition is performed according to the IEC/IEEE Standard for Binary
4546 | Floating-Point Arithmetic.
4547 *----------------------------------------------------------------------------*/
4549 static float128
addFloat128Sigs( float128 a
, float128 b
, flag zSign STATUS_PARAM
)
4551 int32 aExp
, bExp
, zExp
;
4552 bits64 aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
4555 aSig1
= extractFloat128Frac1( a
);
4556 aSig0
= extractFloat128Frac0( a
);
4557 aExp
= extractFloat128Exp( a
);
4558 bSig1
= extractFloat128Frac1( b
);
4559 bSig0
= extractFloat128Frac0( b
);
4560 bExp
= extractFloat128Exp( b
);
4561 expDiff
= aExp
- bExp
;
4562 if ( 0 < expDiff
) {
4563 if ( aExp
== 0x7FFF ) {
4564 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
4571 bSig0
|= LIT64( 0x0001000000000000 );
4573 shift128ExtraRightJamming(
4574 bSig0
, bSig1
, 0, expDiff
, &bSig0
, &bSig1
, &zSig2
);
4577 else if ( expDiff
< 0 ) {
4578 if ( bExp
== 0x7FFF ) {
4579 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
4580 return packFloat128( zSign
, 0x7FFF, 0, 0 );
4586 aSig0
|= LIT64( 0x0001000000000000 );
4588 shift128ExtraRightJamming(
4589 aSig0
, aSig1
, 0, - expDiff
, &aSig0
, &aSig1
, &zSig2
);
4593 if ( aExp
== 0x7FFF ) {
4594 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
4595 return propagateFloat128NaN( a
, b STATUS_VAR
);
4599 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
4600 if ( aExp
== 0 ) return packFloat128( zSign
, 0, zSig0
, zSig1
);
4602 zSig0
|= LIT64( 0x0002000000000000 );
4606 aSig0
|= LIT64( 0x0001000000000000 );
4607 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
4609 if ( zSig0
< LIT64( 0x0002000000000000 ) ) goto roundAndPack
;
4612 shift128ExtraRightJamming(
4613 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
4615 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2 STATUS_VAR
);
4619 /*----------------------------------------------------------------------------
4620 | Returns the result of subtracting the absolute values of the quadruple-
4621 | precision floating-point values `a' and `b'. If `zSign' is 1, the
4622 | difference is negated before being returned. `zSign' is ignored if the
4623 | result is a NaN. The subtraction is performed according to the IEC/IEEE
4624 | Standard for Binary Floating-Point Arithmetic.
4625 *----------------------------------------------------------------------------*/
4627 static float128
subFloat128Sigs( float128 a
, float128 b
, flag zSign STATUS_PARAM
)
4629 int32 aExp
, bExp
, zExp
;
4630 bits64 aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
;
4634 aSig1
= extractFloat128Frac1( a
);
4635 aSig0
= extractFloat128Frac0( a
);
4636 aExp
= extractFloat128Exp( a
);
4637 bSig1
= extractFloat128Frac1( b
);
4638 bSig0
= extractFloat128Frac0( b
);
4639 bExp
= extractFloat128Exp( b
);
4640 expDiff
= aExp
- bExp
;
4641 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
4642 shortShift128Left( bSig0
, bSig1
, 14, &bSig0
, &bSig1
);
4643 if ( 0 < expDiff
) goto aExpBigger
;
4644 if ( expDiff
< 0 ) goto bExpBigger
;
4645 if ( aExp
== 0x7FFF ) {
4646 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
4647 return propagateFloat128NaN( a
, b STATUS_VAR
);
4649 float_raise( float_flag_invalid STATUS_VAR
);
4650 z
.low
= float128_default_nan_low
;
4651 z
.high
= float128_default_nan_high
;
4658 if ( bSig0
< aSig0
) goto aBigger
;
4659 if ( aSig0
< bSig0
) goto bBigger
;
4660 if ( bSig1
< aSig1
) goto aBigger
;
4661 if ( aSig1
< bSig1
) goto bBigger
;
4662 return packFloat128( STATUS(float_rounding_mode
) == float_round_down
, 0, 0, 0 );
4664 if ( bExp
== 0x7FFF ) {
4665 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
4666 return packFloat128( zSign
^ 1, 0x7FFF, 0, 0 );
4672 aSig0
|= LIT64( 0x4000000000000000 );
4674 shift128RightJamming( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
4675 bSig0
|= LIT64( 0x4000000000000000 );
4677 sub128( bSig0
, bSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
4680 goto normalizeRoundAndPack
;
4682 if ( aExp
== 0x7FFF ) {
4683 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
4690 bSig0
|= LIT64( 0x4000000000000000 );
4692 shift128RightJamming( bSig0
, bSig1
, expDiff
, &bSig0
, &bSig1
);
4693 aSig0
|= LIT64( 0x4000000000000000 );
4695 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
4697 normalizeRoundAndPack
:
4699 return normalizeRoundAndPackFloat128( zSign
, zExp
- 14, zSig0
, zSig1 STATUS_VAR
);
4703 /*----------------------------------------------------------------------------
4704 | Returns the result of adding the quadruple-precision floating-point values
4705 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
4706 | for Binary Floating-Point Arithmetic.
4707 *----------------------------------------------------------------------------*/
4709 float128
float128_add( float128 a
, float128 b STATUS_PARAM
)
4713 aSign
= extractFloat128Sign( a
);
4714 bSign
= extractFloat128Sign( b
);
4715 if ( aSign
== bSign
) {
4716 return addFloat128Sigs( a
, b
, aSign STATUS_VAR
);
4719 return subFloat128Sigs( a
, b
, aSign STATUS_VAR
);
4724 /*----------------------------------------------------------------------------
4725 | Returns the result of subtracting the quadruple-precision floating-point
4726 | values `a' and `b'. The operation is performed according to the IEC/IEEE
4727 | Standard for Binary Floating-Point Arithmetic.
4728 *----------------------------------------------------------------------------*/
4730 float128
float128_sub( float128 a
, float128 b STATUS_PARAM
)
4734 aSign
= extractFloat128Sign( a
);
4735 bSign
= extractFloat128Sign( b
);
4736 if ( aSign
== bSign
) {
4737 return subFloat128Sigs( a
, b
, aSign STATUS_VAR
);
4740 return addFloat128Sigs( a
, b
, aSign STATUS_VAR
);
4745 /*----------------------------------------------------------------------------
4746 | Returns the result of multiplying the quadruple-precision floating-point
4747 | values `a' and `b'. The operation is performed according to the IEC/IEEE
4748 | Standard for Binary Floating-Point Arithmetic.
4749 *----------------------------------------------------------------------------*/
4751 float128
float128_mul( float128 a
, float128 b STATUS_PARAM
)
4753 flag aSign
, bSign
, zSign
;
4754 int32 aExp
, bExp
, zExp
;
4755 bits64 aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
, zSig3
;
4758 aSig1
= extractFloat128Frac1( a
);
4759 aSig0
= extractFloat128Frac0( a
);
4760 aExp
= extractFloat128Exp( a
);
4761 aSign
= extractFloat128Sign( a
);
4762 bSig1
= extractFloat128Frac1( b
);
4763 bSig0
= extractFloat128Frac0( b
);
4764 bExp
= extractFloat128Exp( b
);
4765 bSign
= extractFloat128Sign( b
);
4766 zSign
= aSign
^ bSign
;
4767 if ( aExp
== 0x7FFF ) {
4768 if ( ( aSig0
| aSig1
)
4769 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
4770 return propagateFloat128NaN( a
, b STATUS_VAR
);
4772 if ( ( bExp
| bSig0
| bSig1
) == 0 ) goto invalid
;
4773 return packFloat128( zSign
, 0x7FFF, 0, 0 );
4775 if ( bExp
== 0x7FFF ) {
4776 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
4777 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
4779 float_raise( float_flag_invalid STATUS_VAR
);
4780 z
.low
= float128_default_nan_low
;
4781 z
.high
= float128_default_nan_high
;
4784 return packFloat128( zSign
, 0x7FFF, 0, 0 );
4787 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
4788 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
4791 if ( ( bSig0
| bSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
4792 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
4794 zExp
= aExp
+ bExp
- 0x4000;
4795 aSig0
|= LIT64( 0x0001000000000000 );
4796 shortShift128Left( bSig0
, bSig1
, 16, &bSig0
, &bSig1
);
4797 mul128To256( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
, &zSig2
, &zSig3
);
4798 add128( zSig0
, zSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
4799 zSig2
|= ( zSig3
!= 0 );
4800 if ( LIT64( 0x0002000000000000 ) <= zSig0
) {
4801 shift128ExtraRightJamming(
4802 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
4805 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2 STATUS_VAR
);
4809 /*----------------------------------------------------------------------------
4810 | Returns the result of dividing the quadruple-precision floating-point value
4811 | `a' by the corresponding value `b'. The operation is performed according to
4812 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4813 *----------------------------------------------------------------------------*/
4815 float128
float128_div( float128 a
, float128 b STATUS_PARAM
)
4817 flag aSign
, bSign
, zSign
;
4818 int32 aExp
, bExp
, zExp
;
4819 bits64 aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
4820 bits64 rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
4823 aSig1
= extractFloat128Frac1( a
);
4824 aSig0
= extractFloat128Frac0( a
);
4825 aExp
= extractFloat128Exp( a
);
4826 aSign
= extractFloat128Sign( a
);
4827 bSig1
= extractFloat128Frac1( b
);
4828 bSig0
= extractFloat128Frac0( b
);
4829 bExp
= extractFloat128Exp( b
);
4830 bSign
= extractFloat128Sign( b
);
4831 zSign
= aSign
^ bSign
;
4832 if ( aExp
== 0x7FFF ) {
4833 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
4834 if ( bExp
== 0x7FFF ) {
4835 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
4838 return packFloat128( zSign
, 0x7FFF, 0, 0 );
4840 if ( bExp
== 0x7FFF ) {
4841 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
4842 return packFloat128( zSign
, 0, 0, 0 );
4845 if ( ( bSig0
| bSig1
) == 0 ) {
4846 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
4848 float_raise( float_flag_invalid STATUS_VAR
);
4849 z
.low
= float128_default_nan_low
;
4850 z
.high
= float128_default_nan_high
;
4853 float_raise( float_flag_divbyzero STATUS_VAR
);
4854 return packFloat128( zSign
, 0x7FFF, 0, 0 );
4856 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
4859 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
4860 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
4862 zExp
= aExp
- bExp
+ 0x3FFD;
4864 aSig0
| LIT64( 0x0001000000000000 ), aSig1
, 15, &aSig0
, &aSig1
);
4866 bSig0
| LIT64( 0x0001000000000000 ), bSig1
, 15, &bSig0
, &bSig1
);
4867 if ( le128( bSig0
, bSig1
, aSig0
, aSig1
) ) {
4868 shift128Right( aSig0
, aSig1
, 1, &aSig0
, &aSig1
);
4871 zSig0
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
4872 mul128By64To192( bSig0
, bSig1
, zSig0
, &term0
, &term1
, &term2
);
4873 sub192( aSig0
, aSig1
, 0, term0
, term1
, term2
, &rem0
, &rem1
, &rem2
);
4874 while ( (sbits64
) rem0
< 0 ) {
4876 add192( rem0
, rem1
, rem2
, 0, bSig0
, bSig1
, &rem0
, &rem1
, &rem2
);
4878 zSig1
= estimateDiv128To64( rem1
, rem2
, bSig0
);
4879 if ( ( zSig1
& 0x3FFF ) <= 4 ) {
4880 mul128By64To192( bSig0
, bSig1
, zSig1
, &term1
, &term2
, &term3
);
4881 sub192( rem1
, rem2
, 0, term1
, term2
, term3
, &rem1
, &rem2
, &rem3
);
4882 while ( (sbits64
) rem1
< 0 ) {
4884 add192( rem1
, rem2
, rem3
, 0, bSig0
, bSig1
, &rem1
, &rem2
, &rem3
);
4886 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
4888 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 15, &zSig0
, &zSig1
, &zSig2
);
4889 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2 STATUS_VAR
);
4893 /*----------------------------------------------------------------------------
4894 | Returns the remainder of the quadruple-precision floating-point value `a'
4895 | with respect to the corresponding value `b'. The operation is performed
4896 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4897 *----------------------------------------------------------------------------*/
4899 float128
float128_rem( float128 a
, float128 b STATUS_PARAM
)
4901 flag aSign
, bSign
, zSign
;
4902 int32 aExp
, bExp
, expDiff
;
4903 bits64 aSig0
, aSig1
, bSig0
, bSig1
, q
, term0
, term1
, term2
;
4904 bits64 allZero
, alternateASig0
, alternateASig1
, sigMean1
;
4908 aSig1
= extractFloat128Frac1( a
);
4909 aSig0
= extractFloat128Frac0( a
);
4910 aExp
= extractFloat128Exp( a
);
4911 aSign
= extractFloat128Sign( a
);
4912 bSig1
= extractFloat128Frac1( b
);
4913 bSig0
= extractFloat128Frac0( b
);
4914 bExp
= extractFloat128Exp( b
);
4915 bSign
= extractFloat128Sign( b
);
4916 if ( aExp
== 0x7FFF ) {
4917 if ( ( aSig0
| aSig1
)
4918 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
4919 return propagateFloat128NaN( a
, b STATUS_VAR
);
4923 if ( bExp
== 0x7FFF ) {
4924 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
4928 if ( ( bSig0
| bSig1
) == 0 ) {
4930 float_raise( float_flag_invalid STATUS_VAR
);
4931 z
.low
= float128_default_nan_low
;
4932 z
.high
= float128_default_nan_high
;
4935 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
4938 if ( ( aSig0
| aSig1
) == 0 ) return a
;
4939 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
4941 expDiff
= aExp
- bExp
;
4942 if ( expDiff
< -1 ) return a
;
4944 aSig0
| LIT64( 0x0001000000000000 ),
4946 15 - ( expDiff
< 0 ),
4951 bSig0
| LIT64( 0x0001000000000000 ), bSig1
, 15, &bSig0
, &bSig1
);
4952 q
= le128( bSig0
, bSig1
, aSig0
, aSig1
);
4953 if ( q
) sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
4955 while ( 0 < expDiff
) {
4956 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
4957 q
= ( 4 < q
) ? q
- 4 : 0;
4958 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
4959 shortShift192Left( term0
, term1
, term2
, 61, &term1
, &term2
, &allZero
);
4960 shortShift128Left( aSig0
, aSig1
, 61, &aSig0
, &allZero
);
4961 sub128( aSig0
, 0, term1
, term2
, &aSig0
, &aSig1
);
4964 if ( -64 < expDiff
) {
4965 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
4966 q
= ( 4 < q
) ? q
- 4 : 0;
4968 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
4970 if ( expDiff
< 0 ) {
4971 shift128Right( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
4974 shortShift128Left( aSig0
, aSig1
, expDiff
, &aSig0
, &aSig1
);
4976 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
4977 sub128( aSig0
, aSig1
, term1
, term2
, &aSig0
, &aSig1
);
4980 shift128Right( aSig0
, aSig1
, 12, &aSig0
, &aSig1
);
4981 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
4984 alternateASig0
= aSig0
;
4985 alternateASig1
= aSig1
;
4987 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
4988 } while ( 0 <= (sbits64
) aSig0
);
4990 aSig0
, aSig1
, alternateASig0
, alternateASig1
, (bits64
*)&sigMean0
, &sigMean1
);
4991 if ( ( sigMean0
< 0 )
4992 || ( ( ( sigMean0
| sigMean1
) == 0 ) && ( q
& 1 ) ) ) {
4993 aSig0
= alternateASig0
;
4994 aSig1
= alternateASig1
;
4996 zSign
= ( (sbits64
) aSig0
< 0 );
4997 if ( zSign
) sub128( 0, 0, aSig0
, aSig1
, &aSig0
, &aSig1
);
4999 normalizeRoundAndPackFloat128( aSign
^ zSign
, bExp
- 4, aSig0
, aSig1 STATUS_VAR
);
5003 /*----------------------------------------------------------------------------
5004 | Returns the square root of the quadruple-precision floating-point value `a'.
5005 | The operation is performed according to the IEC/IEEE Standard for Binary
5006 | Floating-Point Arithmetic.
5007 *----------------------------------------------------------------------------*/
5009 float128
float128_sqrt( float128 a STATUS_PARAM
)
5013 bits64 aSig0
, aSig1
, zSig0
, zSig1
, zSig2
, doubleZSig0
;
5014 bits64 rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
5017 aSig1
= extractFloat128Frac1( a
);
5018 aSig0
= extractFloat128Frac0( a
);
5019 aExp
= extractFloat128Exp( a
);
5020 aSign
= extractFloat128Sign( a
);
5021 if ( aExp
== 0x7FFF ) {
5022 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, a STATUS_VAR
);
5023 if ( ! aSign
) return a
;
5027 if ( ( aExp
| aSig0
| aSig1
) == 0 ) return a
;
5029 float_raise( float_flag_invalid STATUS_VAR
);
5030 z
.low
= float128_default_nan_low
;
5031 z
.high
= float128_default_nan_high
;
5035 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( 0, 0, 0, 0 );
5036 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
5038 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFE;
5039 aSig0
|= LIT64( 0x0001000000000000 );
5040 zSig0
= estimateSqrt32( aExp
, aSig0
>>17 );
5041 shortShift128Left( aSig0
, aSig1
, 13 - ( aExp
& 1 ), &aSig0
, &aSig1
);
5042 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
5043 doubleZSig0
= zSig0
<<1;
5044 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
5045 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
5046 while ( (sbits64
) rem0
< 0 ) {
5049 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
5051 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
5052 if ( ( zSig1
& 0x1FFF ) <= 5 ) {
5053 if ( zSig1
== 0 ) zSig1
= 1;
5054 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
5055 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
5056 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
5057 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
5058 while ( (sbits64
) rem1
< 0 ) {
5060 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
5062 term2
|= doubleZSig0
;
5063 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
5065 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
5067 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 14, &zSig0
, &zSig1
, &zSig2
);
5068 return roundAndPackFloat128( 0, zExp
, zSig0
, zSig1
, zSig2 STATUS_VAR
);
5072 /*----------------------------------------------------------------------------
5073 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5074 | the corresponding value `b', and 0 otherwise. The comparison is performed
5075 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5076 *----------------------------------------------------------------------------*/
5078 int float128_eq( float128 a
, float128 b STATUS_PARAM
)
5081 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5082 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5083 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5084 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5086 if ( float128_is_signaling_nan( a
)
5087 || float128_is_signaling_nan( b
) ) {
5088 float_raise( float_flag_invalid STATUS_VAR
);
5094 && ( ( a
.high
== b
.high
)
5096 && ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
5101 /*----------------------------------------------------------------------------
5102 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5103 | or equal to the corresponding value `b', and 0 otherwise. The comparison
5104 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5106 *----------------------------------------------------------------------------*/
5108 int float128_le( float128 a
, float128 b STATUS_PARAM
)
5112 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5113 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5114 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5115 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5117 float_raise( float_flag_invalid STATUS_VAR
);
5120 aSign
= extractFloat128Sign( a
);
5121 bSign
= extractFloat128Sign( b
);
5122 if ( aSign
!= bSign
) {
5125 || ( ( ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5129 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
5130 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
5134 /*----------------------------------------------------------------------------
5135 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5136 | the corresponding value `b', and 0 otherwise. The comparison is performed
5137 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5138 *----------------------------------------------------------------------------*/
5140 int float128_lt( float128 a
, float128 b STATUS_PARAM
)
5144 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5145 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5146 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5147 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5149 float_raise( float_flag_invalid STATUS_VAR
);
5152 aSign
= extractFloat128Sign( a
);
5153 bSign
= extractFloat128Sign( b
);
5154 if ( aSign
!= bSign
) {
5157 && ( ( ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5161 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
5162 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
5166 /*----------------------------------------------------------------------------
5167 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5168 | the corresponding value `b', and 0 otherwise. The invalid exception is
5169 | raised if either operand is a NaN. Otherwise, the comparison is performed
5170 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5171 *----------------------------------------------------------------------------*/
5173 int float128_eq_signaling( float128 a
, float128 b STATUS_PARAM
)
5176 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5177 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5178 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5179 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5181 float_raise( float_flag_invalid STATUS_VAR
);
5186 && ( ( a
.high
== b
.high
)
5188 && ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
5193 /*----------------------------------------------------------------------------
5194 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5195 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5196 | cause an exception. Otherwise, the comparison is performed according to the
5197 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5198 *----------------------------------------------------------------------------*/
5200 int float128_le_quiet( float128 a
, float128 b STATUS_PARAM
)
5204 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5205 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5206 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5207 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5209 if ( float128_is_signaling_nan( a
)
5210 || float128_is_signaling_nan( b
) ) {
5211 float_raise( float_flag_invalid STATUS_VAR
);
5215 aSign
= extractFloat128Sign( a
);
5216 bSign
= extractFloat128Sign( b
);
5217 if ( aSign
!= bSign
) {
5220 || ( ( ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5224 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
5225 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
5229 /*----------------------------------------------------------------------------
5230 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5231 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5232 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
5233 | Standard for Binary Floating-Point Arithmetic.
5234 *----------------------------------------------------------------------------*/
5236 int float128_lt_quiet( float128 a
, float128 b STATUS_PARAM
)
5240 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5241 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5242 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5243 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5245 if ( float128_is_signaling_nan( a
)
5246 || float128_is_signaling_nan( b
) ) {
5247 float_raise( float_flag_invalid STATUS_VAR
);
5251 aSign
= extractFloat128Sign( a
);
5252 bSign
= extractFloat128Sign( b
);
5253 if ( aSign
!= bSign
) {
5256 && ( ( ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5260 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
5261 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
5267 /* misc functions */
5268 float32
uint32_to_float32( unsigned int a STATUS_PARAM
)
5270 return int64_to_float32(a STATUS_VAR
);
5273 float64
uint32_to_float64( unsigned int a STATUS_PARAM
)
5275 return int64_to_float64(a STATUS_VAR
);
5278 unsigned int float32_to_uint32( float32 a STATUS_PARAM
)
5283 v
= float32_to_int64(a STATUS_VAR
);
5286 float_raise( float_flag_invalid STATUS_VAR
);
5287 } else if (v
> 0xffffffff) {
5289 float_raise( float_flag_invalid STATUS_VAR
);
5296 unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM
)
5301 v
= float32_to_int64_round_to_zero(a STATUS_VAR
);
5304 float_raise( float_flag_invalid STATUS_VAR
);
5305 } else if (v
> 0xffffffff) {
5307 float_raise( float_flag_invalid STATUS_VAR
);
5314 unsigned int float64_to_uint32( float64 a STATUS_PARAM
)
5319 v
= float64_to_int64(a STATUS_VAR
);
5322 float_raise( float_flag_invalid STATUS_VAR
);
5323 } else if (v
> 0xffffffff) {
5325 float_raise( float_flag_invalid STATUS_VAR
);
5332 unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM
)
5337 v
= float64_to_int64_round_to_zero(a STATUS_VAR
);
5340 float_raise( float_flag_invalid STATUS_VAR
);
5341 } else if (v
> 0xffffffff) {
5343 float_raise( float_flag_invalid STATUS_VAR
);
5350 /* FIXME: This looks broken. */
5351 uint64_t float64_to_uint64 (float64 a STATUS_PARAM
)
5355 v
= float64_val(int64_to_float64(INT64_MIN STATUS_VAR
));
5356 v
+= float64_val(a
);
5357 v
= float64_to_int64(make_float64(v
) STATUS_VAR
);
5359 return v
- INT64_MIN
;
5362 uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM
)
5366 v
= float64_val(int64_to_float64(INT64_MIN STATUS_VAR
));
5367 v
+= float64_val(a
);
5368 v
= float64_to_int64_round_to_zero(make_float64(v
) STATUS_VAR
);
5370 return v
- INT64_MIN
;
5373 #define COMPARE(s, nan_exp) \
5374 INLINE int float ## s ## _compare_internal( float ## s a, float ## s b, \
5375 int is_quiet STATUS_PARAM ) \
5377 flag aSign, bSign; \
5380 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \
5381 extractFloat ## s ## Frac( a ) ) || \
5382 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \
5383 extractFloat ## s ## Frac( b ) )) { \
5385 float ## s ## _is_signaling_nan( a ) || \
5386 float ## s ## _is_signaling_nan( b ) ) { \
5387 float_raise( float_flag_invalid STATUS_VAR); \
5389 return float_relation_unordered; \
5391 aSign = extractFloat ## s ## Sign( a ); \
5392 bSign = extractFloat ## s ## Sign( b ); \
5393 av = float ## s ## _val(a); \
5394 bv = float ## s ## _val(b); \
5395 if ( aSign != bSign ) { \
5396 if ( (bits ## s) ( ( av | bv )<<1 ) == 0 ) { \
5398 return float_relation_equal; \
5400 return 1 - (2 * aSign); \
5404 return float_relation_equal; \
5406 return 1 - 2 * (aSign ^ ( av < bv )); \
5411 int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM ) \
5413 return float ## s ## _compare_internal(a, b, 0 STATUS_VAR); \
5416 int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM ) \
5418 return float ## s ## _compare_internal(a, b, 1 STATUS_VAR); \
5424 INLINE
int float128_compare_internal( float128 a
, float128 b
,
5425 int is_quiet STATUS_PARAM
)
5429 if (( ( extractFloat128Exp( a
) == 0x7fff ) &&
5430 ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) ) ||
5431 ( ( extractFloat128Exp( b
) == 0x7fff ) &&
5432 ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )) {
5434 float128_is_signaling_nan( a
) ||
5435 float128_is_signaling_nan( b
) ) {
5436 float_raise( float_flag_invalid STATUS_VAR
);
5438 return float_relation_unordered
;
5440 aSign
= extractFloat128Sign( a
);
5441 bSign
= extractFloat128Sign( b
);
5442 if ( aSign
!= bSign
) {
5443 if ( ( ( ( a
.high
| b
.high
)<<1 ) | a
.low
| b
.low
) == 0 ) {
5445 return float_relation_equal
;
5447 return 1 - (2 * aSign
);
5450 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
5451 return float_relation_equal
;
5453 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
5458 int float128_compare( float128 a
, float128 b STATUS_PARAM
)
5460 return float128_compare_internal(a
, b
, 0 STATUS_VAR
);
5463 int float128_compare_quiet( float128 a
, float128 b STATUS_PARAM
)
5465 return float128_compare_internal(a
, b
, 1 STATUS_VAR
);
5468 /* Multiply A by 2 raised to the power N. */
5469 float32
float32_scalbn( float32 a
, int n STATUS_PARAM
)
5475 aSig
= extractFloat32Frac( a
);
5476 aExp
= extractFloat32Exp( a
);
5477 aSign
= extractFloat32Sign( a
);
5479 if ( aExp
== 0xFF ) {
5483 return roundAndPackFloat32( aSign
, aExp
, aSig STATUS_VAR
);
5486 float64
float64_scalbn( float64 a
, int n STATUS_PARAM
)
5492 aSig
= extractFloat64Frac( a
);
5493 aExp
= extractFloat64Exp( a
);
5494 aSign
= extractFloat64Sign( a
);
5496 if ( aExp
== 0x7FF ) {
5500 return roundAndPackFloat64( aSign
, aExp
, aSig STATUS_VAR
);
5504 floatx80
floatx80_scalbn( floatx80 a
, int n STATUS_PARAM
)
5510 aSig
= extractFloatx80Frac( a
);
5511 aExp
= extractFloatx80Exp( a
);
5512 aSign
= extractFloatx80Sign( a
);
5514 if ( aExp
== 0x7FF ) {
5518 return roundAndPackFloatx80( STATUS(floatx80_rounding_precision
),
5519 aSign
, aExp
, aSig
, 0 STATUS_VAR
);
5524 float128
float128_scalbn( float128 a
, int n STATUS_PARAM
)
5528 bits64 aSig0
, aSig1
;
5530 aSig1
= extractFloat128Frac1( a
);
5531 aSig0
= extractFloat128Frac0( a
);
5532 aExp
= extractFloat128Exp( a
);
5533 aSign
= extractFloat128Sign( a
);
5534 if ( aExp
== 0x7FFF ) {
5538 return roundAndPackFloat128( aSign
, aExp
, aSig0
, aSig1
, 0 STATUS_VAR
);