2 ===============================================================================
4 This C source file is part of the SoftFloat IEC/IEEE Floating-point
5 Arithmetic Package, Release 2.
7 Written by John R. Hauser. This work was made possible in part by the
8 International Computer Science Institute, located at Suite 600, 1947 Center
9 Street, Berkeley, California 94704. Funding was partially provided by the
10 National Science Foundation under grant MIP-9311980. The original version
11 of this code was written as part of a project to build a fixed-point vector
12 processor in collaboration with the University of California at Berkeley,
13 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
14 is available through the web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
15 arithmetic/softfloat.html'.
17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
18 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
19 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
20 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
21 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
23 Derivative works are acceptable, even for commercial purposes, so long as
24 (1) they include prominent notice that the work is derivative, and (2) they
25 include prominent notice akin to these three paragraphs for those parts of
26 this code that are retained.
28 ===============================================================================
31 #include <asm/div64.h>
35 //#include "softfloat.h"
38 -------------------------------------------------------------------------------
39 Primitive arithmetic functions, including multi-word arithmetic, and
40 division and square root approximations. (Can be specialized to target if
42 -------------------------------------------------------------------------------
44 #include "softfloat-macros"
47 -------------------------------------------------------------------------------
48 Functions and definitions to determine: (1) whether tininess for underflow
49 is detected before or after rounding by default, (2) what (if anything)
50 happens when exceptions are raised, (3) how signaling NaNs are distinguished
51 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
52 are propagated from function inputs to output. These details are target-
54 -------------------------------------------------------------------------------
56 #include "softfloat-specialize"
59 -------------------------------------------------------------------------------
60 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
61 and 7, and returns the properly rounded 32-bit integer corresponding to the
62 input. If `zSign' is nonzero, the input is negated before being converted
63 to an integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point
64 input is simply rounded to an integer, with the inexact exception raised if
65 the input cannot be represented exactly as an integer. If the fixed-point
66 input is too large, however, the invalid exception is raised and the largest
67 positive or negative integer is returned.
68 -------------------------------------------------------------------------------
70 static int32
roundAndPackInt32( struct roundingData
*roundData
, flag zSign
, bits64 absZ
)
73 flag roundNearestEven
;
74 int8 roundIncrement
, roundBits
;
77 roundingMode
= roundData
->mode
;
78 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
79 roundIncrement
= 0x40;
80 if ( ! roundNearestEven
) {
81 if ( roundingMode
== float_round_to_zero
) {
85 roundIncrement
= 0x7F;
87 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
90 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
94 roundBits
= absZ
& 0x7F;
95 absZ
= ( absZ
+ roundIncrement
)>>7;
96 absZ
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
99 if ( ( absZ
>>32 ) || ( z
&& ( ( z
< 0 ) ^ zSign
) ) ) {
100 roundData
->exception
|= float_flag_invalid
;
101 return zSign
? 0x80000000 : 0x7FFFFFFF;
103 if ( roundBits
) roundData
->exception
|= float_flag_inexact
;
109 -------------------------------------------------------------------------------
110 Returns the fraction bits of the single-precision floating-point value `a'.
111 -------------------------------------------------------------------------------
113 INLINE bits32
extractFloat32Frac( float32 a
)
116 return a
& 0x007FFFFF;
121 -------------------------------------------------------------------------------
122 Returns the exponent bits of the single-precision floating-point value `a'.
123 -------------------------------------------------------------------------------
125 INLINE int16
extractFloat32Exp( float32 a
)
128 return ( a
>>23 ) & 0xFF;
133 -------------------------------------------------------------------------------
134 Returns the sign bit of the single-precision floating-point value `a'.
135 -------------------------------------------------------------------------------
139 -------------------------------------------------------------------------------
140 Normalizes the subnormal single-precision floating-point value represented
141 by the denormalized significand `aSig'. The normalized exponent and
142 significand are stored at the locations pointed to by `zExpPtr' and
143 `zSigPtr', respectively.
144 -------------------------------------------------------------------------------
147 normalizeFloat32Subnormal( bits32 aSig
, int16
*zExpPtr
, bits32
*zSigPtr
)
151 shiftCount
= countLeadingZeros32( aSig
) - 8;
152 *zSigPtr
= aSig
<<shiftCount
;
153 *zExpPtr
= 1 - shiftCount
;
158 -------------------------------------------------------------------------------
159 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
160 single-precision floating-point value, returning the result. After being
161 shifted into the proper positions, the three fields are simply added
162 together to form the result. This means that any integer portion of `zSig'
163 will be added into the exponent. Since a properly normalized significand
164 will have an integer portion equal to 1, the `zExp' input should be 1 less
165 than the desired result exponent whenever `zSig' is a complete, normalized
167 -------------------------------------------------------------------------------
169 INLINE float32
packFloat32( flag zSign
, int16 zExp
, bits32 zSig
)
171 return ( ( (bits32
) zSign
)<<31 ) + ( ( (bits32
) zExp
)<<23 ) + zSig
;
175 -------------------------------------------------------------------------------
176 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
177 and significand `zSig', and returns the proper single-precision floating-
178 point value corresponding to the abstract input. Ordinarily, the abstract
179 value is simply rounded and packed into the single-precision format, with
180 the inexact exception raised if the abstract input cannot be represented
181 exactly. If the abstract value is too large, however, the overflow and
182 inexact exceptions are raised and an infinity or maximal finite value is
183 returned. If the abstract value is too small, the input value is rounded to
184 a subnormal number, and the underflow and inexact exceptions are raised if
185 the abstract input cannot be represented exactly as a subnormal single-
186 precision floating-point number.
187 The input significand `zSig' has its binary point between bits 30
188 and 29, which is 7 bits to the left of the usual location. This shifted
189 significand must be normalized or smaller. If `zSig' is not normalized,
190 `zExp' must be 0; in that case, the result returned is a subnormal number,
191 and it must not require rounding. In the usual case that `zSig' is
192 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
193 The handling of underflow and overflow follows the IEC/IEEE Standard for
194 Binary Floating-point Arithmetic.
195 -------------------------------------------------------------------------------
197 static float32
roundAndPackFloat32( struct roundingData
*roundData
, flag zSign
, int16 zExp
, bits32 zSig
)
200 flag roundNearestEven
;
201 int8 roundIncrement
, roundBits
;
204 roundingMode
= roundData
->mode
;
205 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
206 roundIncrement
= 0x40;
207 if ( ! roundNearestEven
) {
208 if ( roundingMode
== float_round_to_zero
) {
212 roundIncrement
= 0x7F;
214 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
217 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
221 roundBits
= zSig
& 0x7F;
222 if ( 0xFD <= (bits16
) zExp
) {
224 || ( ( zExp
== 0xFD )
225 && ( (sbits32
) ( zSig
+ roundIncrement
) < 0 ) )
227 roundData
->exception
|= float_flag_overflow
| float_flag_inexact
;
228 return packFloat32( zSign
, 0xFF, 0 ) - ( roundIncrement
== 0 );
232 ( float_detect_tininess
== float_tininess_before_rounding
)
234 || ( zSig
+ roundIncrement
< 0x80000000 );
235 shift32RightJamming( zSig
, - zExp
, &zSig
);
237 roundBits
= zSig
& 0x7F;
238 if ( isTiny
&& roundBits
) roundData
->exception
|= float_flag_underflow
;
241 if ( roundBits
) roundData
->exception
|= float_flag_inexact
;
242 zSig
= ( zSig
+ roundIncrement
)>>7;
243 zSig
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
244 if ( zSig
== 0 ) zExp
= 0;
245 return packFloat32( zSign
, zExp
, zSig
);
250 -------------------------------------------------------------------------------
251 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
252 and significand `zSig', and returns the proper single-precision floating-
253 point value corresponding to the abstract input. This routine is just like
254 `roundAndPackFloat32' except that `zSig' does not have to be normalized in
255 any way. In all cases, `zExp' must be 1 less than the ``true'' floating-
257 -------------------------------------------------------------------------------
260 normalizeRoundAndPackFloat32( struct roundingData
*roundData
, flag zSign
, int16 zExp
, bits32 zSig
)
264 shiftCount
= countLeadingZeros32( zSig
) - 1;
265 return roundAndPackFloat32( roundData
, zSign
, zExp
- shiftCount
, zSig
<<shiftCount
);
270 -------------------------------------------------------------------------------
271 Returns the fraction bits of the double-precision floating-point value `a'.
272 -------------------------------------------------------------------------------
274 INLINE bits64
extractFloat64Frac( float64 a
)
277 return a
& LIT64( 0x000FFFFFFFFFFFFF );
282 -------------------------------------------------------------------------------
283 Returns the exponent bits of the double-precision floating-point value `a'.
284 -------------------------------------------------------------------------------
286 INLINE int16
extractFloat64Exp( float64 a
)
289 return ( a
>>52 ) & 0x7FF;
294 -------------------------------------------------------------------------------
295 Returns the sign bit of the double-precision floating-point value `a'.
296 -------------------------------------------------------------------------------
300 -------------------------------------------------------------------------------
301 Normalizes the subnormal double-precision floating-point value represented
302 by the denormalized significand `aSig'. The normalized exponent and
303 significand are stored at the locations pointed to by `zExpPtr' and
304 `zSigPtr', respectively.
305 -------------------------------------------------------------------------------
308 normalizeFloat64Subnormal( bits64 aSig
, int16
*zExpPtr
, bits64
*zSigPtr
)
312 shiftCount
= countLeadingZeros64( aSig
) - 11;
313 *zSigPtr
= aSig
<<shiftCount
;
314 *zExpPtr
= 1 - shiftCount
;
319 -------------------------------------------------------------------------------
320 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
321 double-precision floating-point value, returning the result. After being
322 shifted into the proper positions, the three fields are simply added
323 together to form the result. This means that any integer portion of `zSig'
324 will be added into the exponent. Since a properly normalized significand
325 will have an integer portion equal to 1, the `zExp' input should be 1 less
326 than the desired result exponent whenever `zSig' is a complete, normalized
328 -------------------------------------------------------------------------------
330 INLINE float64
packFloat64( flag zSign
, int16 zExp
, bits64 zSig
)
333 return ( ( (bits64
) zSign
)<<63 ) + ( ( (bits64
) zExp
)<<52 ) + zSig
;
338 -------------------------------------------------------------------------------
339 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
340 and significand `zSig', and returns the proper double-precision floating-
341 point value corresponding to the abstract input. Ordinarily, the abstract
342 value is simply rounded and packed into the double-precision format, with
343 the inexact exception raised if the abstract input cannot be represented
344 exactly. If the abstract value is too large, however, the overflow and
345 inexact exceptions are raised and an infinity or maximal finite value is
346 returned. If the abstract value is too small, the input value is rounded to
347 a subnormal number, and the underflow and inexact exceptions are raised if
348 the abstract input cannot be represented exactly as a subnormal double-
349 precision floating-point number.
350 The input significand `zSig' has its binary point between bits 62
351 and 61, which is 10 bits to the left of the usual location. This shifted
352 significand must be normalized or smaller. If `zSig' is not normalized,
353 `zExp' must be 0; in that case, the result returned is a subnormal number,
354 and it must not require rounding. In the usual case that `zSig' is
355 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
356 The handling of underflow and overflow follows the IEC/IEEE Standard for
357 Binary Floating-point Arithmetic.
358 -------------------------------------------------------------------------------
360 static float64
roundAndPackFloat64( struct roundingData
*roundData
, flag zSign
, int16 zExp
, bits64 zSig
)
363 flag roundNearestEven
;
364 int16 roundIncrement
, roundBits
;
367 roundingMode
= roundData
->mode
;
368 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
369 roundIncrement
= 0x200;
370 if ( ! roundNearestEven
) {
371 if ( roundingMode
== float_round_to_zero
) {
375 roundIncrement
= 0x3FF;
377 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
380 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
384 roundBits
= zSig
& 0x3FF;
385 if ( 0x7FD <= (bits16
) zExp
) {
386 if ( ( 0x7FD < zExp
)
387 || ( ( zExp
== 0x7FD )
388 && ( (sbits64
) ( zSig
+ roundIncrement
) < 0 ) )
390 //register int lr = __builtin_return_address(0);
391 //printk("roundAndPackFloat64 called from 0x%08x\n",lr);
392 roundData
->exception
|= float_flag_overflow
| float_flag_inexact
;
393 return packFloat64( zSign
, 0x7FF, 0 ) - ( roundIncrement
== 0 );
397 ( float_detect_tininess
== float_tininess_before_rounding
)
399 || ( zSig
+ roundIncrement
< LIT64( 0x8000000000000000 ) );
400 shift64RightJamming( zSig
, - zExp
, &zSig
);
402 roundBits
= zSig
& 0x3FF;
403 if ( isTiny
&& roundBits
) roundData
->exception
|= float_flag_underflow
;
406 if ( roundBits
) roundData
->exception
|= float_flag_inexact
;
407 zSig
= ( zSig
+ roundIncrement
)>>10;
408 zSig
&= ~ ( ( ( roundBits
^ 0x200 ) == 0 ) & roundNearestEven
);
409 if ( zSig
== 0 ) zExp
= 0;
410 return packFloat64( zSign
, zExp
, zSig
);
415 -------------------------------------------------------------------------------
416 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
417 and significand `zSig', and returns the proper double-precision floating-
418 point value corresponding to the abstract input. This routine is just like
419 `roundAndPackFloat64' except that `zSig' does not have to be normalized in
420 any way. In all cases, `zExp' must be 1 less than the ``true'' floating-
422 -------------------------------------------------------------------------------
425 normalizeRoundAndPackFloat64( struct roundingData
*roundData
, flag zSign
, int16 zExp
, bits64 zSig
)
429 shiftCount
= countLeadingZeros64( zSig
) - 1;
430 return roundAndPackFloat64( roundData
, zSign
, zExp
- shiftCount
, zSig
<<shiftCount
);
437 -------------------------------------------------------------------------------
438 Returns the fraction bits of the extended double-precision floating-point
440 -------------------------------------------------------------------------------
442 INLINE bits64
extractFloatx80Frac( floatx80 a
)
450 -------------------------------------------------------------------------------
451 Returns the exponent bits of the extended double-precision floating-point
453 -------------------------------------------------------------------------------
455 INLINE int32
extractFloatx80Exp( floatx80 a
)
458 return a
.high
& 0x7FFF;
463 -------------------------------------------------------------------------------
464 Returns the sign bit of the extended double-precision floating-point value
466 -------------------------------------------------------------------------------
468 INLINE flag
extractFloatx80Sign( floatx80 a
)
476 -------------------------------------------------------------------------------
477 Normalizes the subnormal extended double-precision floating-point value
478 represented by the denormalized significand `aSig'. The normalized exponent
479 and significand are stored at the locations pointed to by `zExpPtr' and
480 `zSigPtr', respectively.
481 -------------------------------------------------------------------------------
484 normalizeFloatx80Subnormal( bits64 aSig
, int32
*zExpPtr
, bits64
*zSigPtr
)
488 shiftCount
= countLeadingZeros64( aSig
);
489 *zSigPtr
= aSig
<<shiftCount
;
490 *zExpPtr
= 1 - shiftCount
;
495 -------------------------------------------------------------------------------
496 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
497 extended double-precision floating-point value, returning the result.
498 -------------------------------------------------------------------------------
500 INLINE floatx80
packFloatx80( flag zSign
, int32 zExp
, bits64 zSig
)
505 z
.high
= ( ( (bits16
) zSign
)<<15 ) + zExp
;
512 -------------------------------------------------------------------------------
513 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
514 and extended significand formed by the concatenation of `zSig0' and `zSig1',
515 and returns the proper extended double-precision floating-point value
516 corresponding to the abstract input. Ordinarily, the abstract value is
517 rounded and packed into the extended double-precision format, with the
518 inexact exception raised if the abstract input cannot be represented
519 exactly. If the abstract value is too large, however, the overflow and
520 inexact exceptions are raised and an infinity or maximal finite value is
521 returned. If the abstract value is too small, the input value is rounded to
522 a subnormal number, and the underflow and inexact exceptions are raised if
523 the abstract input cannot be represented exactly as a subnormal extended
524 double-precision floating-point number.
525 If `roundingPrecision' is 32 or 64, the result is rounded to the same
526 number of bits as single or double precision, respectively. Otherwise, the
527 result is rounded to the full precision of the extended double-precision
529 The input significand must be normalized or smaller. If the input
530 significand is not normalized, `zExp' must be 0; in that case, the result
531 returned is a subnormal number, and it must not require rounding. The
532 handling of underflow and overflow follows the IEC/IEEE Standard for Binary
533 Floating-point Arithmetic.
534 -------------------------------------------------------------------------------
537 roundAndPackFloatx80(
538 struct roundingData
*roundData
, flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1
541 int8 roundingMode
, roundingPrecision
;
542 flag roundNearestEven
, increment
, isTiny
;
543 int64 roundIncrement
, roundMask
, roundBits
;
545 roundingMode
= roundData
->mode
;
546 roundingPrecision
= roundData
->precision
;
547 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
548 if ( roundingPrecision
== 80 ) goto precision80
;
549 if ( roundingPrecision
== 64 ) {
550 roundIncrement
= LIT64( 0x0000000000000400 );
551 roundMask
= LIT64( 0x00000000000007FF );
553 else if ( roundingPrecision
== 32 ) {
554 roundIncrement
= LIT64( 0x0000008000000000 );
555 roundMask
= LIT64( 0x000000FFFFFFFFFF );
560 zSig0
|= ( zSig1
!= 0 );
561 if ( ! roundNearestEven
) {
562 if ( roundingMode
== float_round_to_zero
) {
566 roundIncrement
= roundMask
;
568 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
571 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
575 roundBits
= zSig0
& roundMask
;
576 if ( 0x7FFD <= (bits32
) ( zExp
- 1 ) ) {
577 if ( ( 0x7FFE < zExp
)
578 || ( ( zExp
== 0x7FFE ) && ( zSig0
+ roundIncrement
< zSig0
) )
584 ( float_detect_tininess
== float_tininess_before_rounding
)
586 || ( zSig0
<= zSig0
+ roundIncrement
);
587 shift64RightJamming( zSig0
, 1 - zExp
, &zSig0
);
589 roundBits
= zSig0
& roundMask
;
590 if ( isTiny
&& roundBits
) roundData
->exception
|= float_flag_underflow
;
591 if ( roundBits
) roundData
->exception
|= float_flag_inexact
;
592 zSig0
+= roundIncrement
;
593 if ( (sbits64
) zSig0
< 0 ) zExp
= 1;
594 roundIncrement
= roundMask
+ 1;
595 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
596 roundMask
|= roundIncrement
;
598 zSig0
&= ~ roundMask
;
599 return packFloatx80( zSign
, zExp
, zSig0
);
602 if ( roundBits
) roundData
->exception
|= float_flag_inexact
;
603 zSig0
+= roundIncrement
;
604 if ( zSig0
< roundIncrement
) {
606 zSig0
= LIT64( 0x8000000000000000 );
608 roundIncrement
= roundMask
+ 1;
609 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
610 roundMask
|= roundIncrement
;
612 zSig0
&= ~ roundMask
;
613 if ( zSig0
== 0 ) zExp
= 0;
614 return packFloatx80( zSign
, zExp
, zSig0
);
616 increment
= ( (sbits64
) zSig1
< 0 );
617 if ( ! roundNearestEven
) {
618 if ( roundingMode
== float_round_to_zero
) {
623 increment
= ( roundingMode
== float_round_down
) && zSig1
;
626 increment
= ( roundingMode
== float_round_up
) && zSig1
;
630 if ( 0x7FFD <= (bits32
) ( zExp
- 1 ) ) {
631 if ( ( 0x7FFE < zExp
)
632 || ( ( zExp
== 0x7FFE )
633 && ( zSig0
== LIT64( 0xFFFFFFFFFFFFFFFF ) )
639 roundData
->exception
|= float_flag_overflow
| float_flag_inexact
;
640 if ( ( roundingMode
== float_round_to_zero
)
641 || ( zSign
&& ( roundingMode
== float_round_up
) )
642 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
644 return packFloatx80( zSign
, 0x7FFE, ~ roundMask
);
646 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
650 ( float_detect_tininess
== float_tininess_before_rounding
)
653 || ( zSig0
< LIT64( 0xFFFFFFFFFFFFFFFF ) );
654 shift64ExtraRightJamming( zSig0
, zSig1
, 1 - zExp
, &zSig0
, &zSig1
);
656 if ( isTiny
&& zSig1
) roundData
->exception
|= float_flag_underflow
;
657 if ( zSig1
) roundData
->exception
|= float_flag_inexact
;
658 if ( roundNearestEven
) {
659 increment
= ( (sbits64
) zSig1
< 0 );
663 increment
= ( roundingMode
== float_round_down
) && zSig1
;
666 increment
= ( roundingMode
== float_round_up
) && zSig1
;
671 zSig0
&= ~ ( ( zSig1
+ zSig1
== 0 ) & roundNearestEven
);
672 if ( (sbits64
) zSig0
< 0 ) zExp
= 1;
674 return packFloatx80( zSign
, zExp
, zSig0
);
677 if ( zSig1
) roundData
->exception
|= float_flag_inexact
;
682 zSig0
= LIT64( 0x8000000000000000 );
685 zSig0
&= ~ ( ( zSig1
+ zSig1
== 0 ) & roundNearestEven
);
689 if ( zSig0
== 0 ) zExp
= 0;
692 return packFloatx80( zSign
, zExp
, zSig0
);
696 -------------------------------------------------------------------------------
697 Takes an abstract floating-point value having sign `zSign', exponent
698 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
699 and returns the proper extended double-precision floating-point value
700 corresponding to the abstract input. This routine is just like
701 `roundAndPackFloatx80' except that the input significand does not have to be
703 -------------------------------------------------------------------------------
706 normalizeRoundAndPackFloatx80(
707 struct roundingData
*roundData
, flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1
717 shiftCount
= countLeadingZeros64( zSig0
);
718 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
721 roundAndPackFloatx80( roundData
, zSign
, zExp
, zSig0
, zSig1
);
728 -------------------------------------------------------------------------------
729 Returns the result of converting the 32-bit two's complement integer `a' to
730 the single-precision floating-point format. The conversion is performed
731 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
732 -------------------------------------------------------------------------------
734 float32
int32_to_float32(struct roundingData
*roundData
, int32 a
)
738 if ( a
== 0 ) return 0;
739 if ( a
== 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
741 return normalizeRoundAndPackFloat32( roundData
, zSign
, 0x9C, zSign
? - a
: a
);
746 -------------------------------------------------------------------------------
747 Returns the result of converting the 32-bit two's complement integer `a' to
748 the double-precision floating-point format. The conversion is performed
749 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
750 -------------------------------------------------------------------------------
752 float64
int32_to_float64( int32 a
)
759 if ( a
== 0 ) return 0;
761 absA
= aSign
? - a
: a
;
762 shiftCount
= countLeadingZeros32( absA
) + 21;
764 return packFloat64( aSign
, 0x432 - shiftCount
, zSig
<<shiftCount
);
771 -------------------------------------------------------------------------------
772 Returns the result of converting the 32-bit two's complement integer `a'
773 to the extended double-precision floating-point format. The conversion
774 is performed according to the IEC/IEEE Standard for Binary Floating-point
776 -------------------------------------------------------------------------------
778 floatx80
int32_to_floatx80( int32 a
)
785 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
787 absA
= zSign
? - a
: a
;
788 shiftCount
= countLeadingZeros32( absA
) + 32;
790 return packFloatx80( zSign
, 0x403E - shiftCount
, zSig
<<shiftCount
);
797 -------------------------------------------------------------------------------
798 Returns the result of converting the single-precision floating-point value
799 `a' to the 32-bit two's complement integer format. The conversion is
800 performed according to the IEC/IEEE Standard for Binary Floating-point
801 Arithmetic---which means in particular that the conversion is rounded
802 according to the current rounding mode. If `a' is a NaN, the largest
803 positive integer is returned. Otherwise, if the conversion overflows, the
804 largest integer with the same sign as `a' is returned.
805 -------------------------------------------------------------------------------
807 int32
float32_to_int32( struct roundingData
*roundData
, float32 a
)
810 int16 aExp
, shiftCount
;
814 aSig
= extractFloat32Frac( a
);
815 aExp
= extractFloat32Exp( a
);
816 aSign
= extractFloat32Sign( a
);
817 if ( ( aExp
== 0x7FF ) && aSig
) aSign
= 0;
818 if ( aExp
) aSig
|= 0x00800000;
819 shiftCount
= 0xAF - aExp
;
822 if ( 0 < shiftCount
) shift64RightJamming( zSig
, shiftCount
, &zSig
);
823 return roundAndPackInt32( roundData
, aSign
, zSig
);
828 -------------------------------------------------------------------------------
829 Returns the result of converting the single-precision floating-point value
830 `a' to the 32-bit two's complement integer format. The conversion is
831 performed according to the IEC/IEEE Standard for Binary Floating-point
832 Arithmetic, except that the conversion is always rounded toward zero. If
833 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
834 conversion overflows, the largest integer with the same sign as `a' is
836 -------------------------------------------------------------------------------
838 int32
float32_to_int32_round_to_zero( float32 a
)
841 int16 aExp
, shiftCount
;
845 aSig
= extractFloat32Frac( a
);
846 aExp
= extractFloat32Exp( a
);
847 aSign
= extractFloat32Sign( a
);
848 shiftCount
= aExp
- 0x9E;
849 if ( 0 <= shiftCount
) {
850 if ( a
== 0xCF000000 ) return 0x80000000;
851 float_raise( float_flag_invalid
);
852 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) return 0x7FFFFFFF;
855 else if ( aExp
<= 0x7E ) {
856 if ( aExp
| aSig
) float_raise( float_flag_inexact
);
859 aSig
= ( aSig
| 0x00800000 )<<8;
860 z
= aSig
>>( - shiftCount
);
861 if ( (bits32
) ( aSig
<<( shiftCount
& 31 ) ) ) {
862 float_raise( float_flag_inexact
);
864 return aSign
? - z
: z
;
869 -------------------------------------------------------------------------------
870 Returns the result of converting the single-precision floating-point value
871 `a' to the double-precision floating-point format. The conversion is
872 performed according to the IEC/IEEE Standard for Binary Floating-point
874 -------------------------------------------------------------------------------
876 float64
float32_to_float64( float32 a
)
882 aSig
= extractFloat32Frac( a
);
883 aExp
= extractFloat32Exp( a
);
884 aSign
= extractFloat32Sign( a
);
885 if ( aExp
== 0xFF ) {
886 if ( aSig
) return commonNaNToFloat64( float32ToCommonNaN( a
) );
887 return packFloat64( aSign
, 0x7FF, 0 );
890 if ( aSig
== 0 ) return packFloat64( aSign
, 0, 0 );
891 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
894 return packFloat64( aSign
, aExp
+ 0x380, ( (bits64
) aSig
)<<29 );
901 -------------------------------------------------------------------------------
902 Returns the result of converting the single-precision floating-point value
903 `a' to the extended double-precision floating-point format. The conversion
904 is performed according to the IEC/IEEE Standard for Binary Floating-point
906 -------------------------------------------------------------------------------
908 floatx80
float32_to_floatx80( float32 a
)
914 aSig
= extractFloat32Frac( a
);
915 aExp
= extractFloat32Exp( a
);
916 aSign
= extractFloat32Sign( a
);
917 if ( aExp
== 0xFF ) {
918 if ( aSig
) return commonNaNToFloatx80( float32ToCommonNaN( a
) );
919 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
922 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
923 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
926 return packFloatx80( aSign
, aExp
+ 0x3F80, ( (bits64
) aSig
)<<40 );
933 -------------------------------------------------------------------------------
934 Rounds the single-precision floating-point value `a' to an integer, and
935 returns the result as a single-precision floating-point value. The
936 operation is performed according to the IEC/IEEE Standard for Binary
937 Floating-point Arithmetic.
938 -------------------------------------------------------------------------------
940 float32
float32_round_to_int( struct roundingData
*roundData
, float32 a
)
944 bits32 lastBitMask
, roundBitsMask
;
948 aExp
= extractFloat32Exp( a
);
949 if ( 0x96 <= aExp
) {
950 if ( ( aExp
== 0xFF ) && extractFloat32Frac( a
) ) {
951 return propagateFloat32NaN( a
, a
);
955 roundingMode
= roundData
->mode
;
956 if ( aExp
<= 0x7E ) {
957 if ( (bits32
) ( a
<<1 ) == 0 ) return a
;
958 roundData
->exception
|= float_flag_inexact
;
959 aSign
= extractFloat32Sign( a
);
960 switch ( roundingMode
) {
961 case float_round_nearest_even
:
962 if ( ( aExp
== 0x7E ) && extractFloat32Frac( a
) ) {
963 return packFloat32( aSign
, 0x7F, 0 );
966 case float_round_down
:
967 return aSign
? 0xBF800000 : 0;
969 return aSign
? 0x80000000 : 0x3F800000;
971 return packFloat32( aSign
, 0, 0 );
974 lastBitMask
<<= 0x96 - aExp
;
975 roundBitsMask
= lastBitMask
- 1;
977 if ( roundingMode
== float_round_nearest_even
) {
979 if ( ( z
& roundBitsMask
) == 0 ) z
&= ~ lastBitMask
;
981 else if ( roundingMode
!= float_round_to_zero
) {
982 if ( extractFloat32Sign( z
) ^ ( roundingMode
== float_round_up
) ) {
986 z
&= ~ roundBitsMask
;
987 if ( z
!= a
) roundData
->exception
|= float_flag_inexact
;
993 -------------------------------------------------------------------------------
994 Returns the result of adding the absolute values of the single-precision
995 floating-point values `a' and `b'. If `zSign' is true, the sum is negated
996 before being returned. `zSign' is ignored if the result is a NaN. The
997 addition is performed according to the IEC/IEEE Standard for Binary
998 Floating-point Arithmetic.
999 -------------------------------------------------------------------------------
1001 static float32
addFloat32Sigs( struct roundingData
*roundData
, float32 a
, float32 b
, flag zSign
)
1003 int16 aExp
, bExp
, zExp
;
1004 bits32 aSig
, bSig
, zSig
;
1007 aSig
= extractFloat32Frac( a
);
1008 aExp
= extractFloat32Exp( a
);
1009 bSig
= extractFloat32Frac( b
);
1010 bExp
= extractFloat32Exp( b
);
1011 expDiff
= aExp
- bExp
;
1014 if ( 0 < expDiff
) {
1015 if ( aExp
== 0xFF ) {
1016 if ( aSig
) return propagateFloat32NaN( a
, b
);
1025 shift32RightJamming( bSig
, expDiff
, &bSig
);
1028 else if ( expDiff
< 0 ) {
1029 if ( bExp
== 0xFF ) {
1030 if ( bSig
) return propagateFloat32NaN( a
, b
);
1031 return packFloat32( zSign
, 0xFF, 0 );
1039 shift32RightJamming( aSig
, - expDiff
, &aSig
);
1043 if ( aExp
== 0xFF ) {
1044 if ( aSig
| bSig
) return propagateFloat32NaN( a
, b
);
1047 if ( aExp
== 0 ) return packFloat32( zSign
, 0, ( aSig
+ bSig
)>>6 );
1048 zSig
= 0x40000000 + aSig
+ bSig
;
1053 zSig
= ( aSig
+ bSig
)<<1;
1055 if ( (sbits32
) zSig
< 0 ) {
1060 return roundAndPackFloat32( roundData
, zSign
, zExp
, zSig
);
1065 -------------------------------------------------------------------------------
1066 Returns the result of subtracting the absolute values of the single-
1067 precision floating-point values `a' and `b'. If `zSign' is true, the
1068 difference is negated before being returned. `zSign' is ignored if the
1069 result is a NaN. The subtraction is performed according to the IEC/IEEE
1070 Standard for Binary Floating-point Arithmetic.
1071 -------------------------------------------------------------------------------
1073 static float32
subFloat32Sigs( struct roundingData
*roundData
, float32 a
, float32 b
, flag zSign
)
1075 int16 aExp
, bExp
, zExp
;
1076 bits32 aSig
, bSig
, zSig
;
1079 aSig
= extractFloat32Frac( a
);
1080 aExp
= extractFloat32Exp( a
);
1081 bSig
= extractFloat32Frac( b
);
1082 bExp
= extractFloat32Exp( b
);
1083 expDiff
= aExp
- bExp
;
1086 if ( 0 < expDiff
) goto aExpBigger
;
1087 if ( expDiff
< 0 ) goto bExpBigger
;
1088 if ( aExp
== 0xFF ) {
1089 if ( aSig
| bSig
) return propagateFloat32NaN( a
, b
);
1090 roundData
->exception
|= float_flag_invalid
;
1091 return float32_default_nan
;
1097 if ( bSig
< aSig
) goto aBigger
;
1098 if ( aSig
< bSig
) goto bBigger
;
1099 return packFloat32( roundData
->mode
== float_round_down
, 0, 0 );
1101 if ( bExp
== 0xFF ) {
1102 if ( bSig
) return propagateFloat32NaN( a
, b
);
1103 return packFloat32( zSign
^ 1, 0xFF, 0 );
1111 shift32RightJamming( aSig
, - expDiff
, &aSig
);
1117 goto normalizeRoundAndPack
;
1119 if ( aExp
== 0xFF ) {
1120 if ( aSig
) return propagateFloat32NaN( a
, b
);
1129 shift32RightJamming( bSig
, expDiff
, &bSig
);
1134 normalizeRoundAndPack
:
1136 return normalizeRoundAndPackFloat32( roundData
, zSign
, zExp
, zSig
);
1141 -------------------------------------------------------------------------------
1142 Returns the result of adding the single-precision floating-point values `a'
1143 and `b'. The operation is performed according to the IEC/IEEE Standard for
1144 Binary Floating-point Arithmetic.
1145 -------------------------------------------------------------------------------
1147 float32
float32_add( struct roundingData
*roundData
, float32 a
, float32 b
)
1151 aSign
= extractFloat32Sign( a
);
1152 bSign
= extractFloat32Sign( b
);
1153 if ( aSign
== bSign
) {
1154 return addFloat32Sigs( roundData
, a
, b
, aSign
);
1157 return subFloat32Sigs( roundData
, a
, b
, aSign
);
1163 -------------------------------------------------------------------------------
1164 Returns the result of subtracting the single-precision floating-point values
1165 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1166 for Binary Floating-point Arithmetic.
1167 -------------------------------------------------------------------------------
1169 float32
float32_sub( struct roundingData
*roundData
, float32 a
, float32 b
)
1173 aSign
= extractFloat32Sign( a
);
1174 bSign
= extractFloat32Sign( b
);
1175 if ( aSign
== bSign
) {
1176 return subFloat32Sigs( roundData
, a
, b
, aSign
);
1179 return addFloat32Sigs( roundData
, a
, b
, aSign
);
1185 -------------------------------------------------------------------------------
1186 Returns the result of multiplying the single-precision floating-point values
1187 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1188 for Binary Floating-point Arithmetic.
1189 -------------------------------------------------------------------------------
1191 float32
float32_mul( struct roundingData
*roundData
, float32 a
, float32 b
)
1193 flag aSign
, bSign
, zSign
;
1194 int16 aExp
, bExp
, zExp
;
1199 aSig
= extractFloat32Frac( a
);
1200 aExp
= extractFloat32Exp( a
);
1201 aSign
= extractFloat32Sign( a
);
1202 bSig
= extractFloat32Frac( b
);
1203 bExp
= extractFloat32Exp( b
);
1204 bSign
= extractFloat32Sign( b
);
1205 zSign
= aSign
^ bSign
;
1206 if ( aExp
== 0xFF ) {
1207 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
1208 return propagateFloat32NaN( a
, b
);
1210 if ( ( bExp
| bSig
) == 0 ) {
1211 roundData
->exception
|= float_flag_invalid
;
1212 return float32_default_nan
;
1214 return packFloat32( zSign
, 0xFF, 0 );
1216 if ( bExp
== 0xFF ) {
1217 if ( bSig
) return propagateFloat32NaN( a
, b
);
1218 if ( ( aExp
| aSig
) == 0 ) {
1219 roundData
->exception
|= float_flag_invalid
;
1220 return float32_default_nan
;
1222 return packFloat32( zSign
, 0xFF, 0 );
1225 if ( aSig
== 0 ) return packFloat32( zSign
, 0, 0 );
1226 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1229 if ( bSig
== 0 ) return packFloat32( zSign
, 0, 0 );
1230 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
1232 zExp
= aExp
+ bExp
- 0x7F;
1233 aSig
= ( aSig
| 0x00800000 )<<7;
1234 bSig
= ( bSig
| 0x00800000 )<<8;
1235 shift64RightJamming( ( (bits64
) aSig
) * bSig
, 32, &zSig64
);
1237 if ( 0 <= (sbits32
) ( zSig
<<1 ) ) {
1241 return roundAndPackFloat32( roundData
, zSign
, zExp
, zSig
);
1246 -------------------------------------------------------------------------------
1247 Returns the result of dividing the single-precision floating-point value `a'
1248 by the corresponding value `b'. The operation is performed according to the
1249 IEC/IEEE Standard for Binary Floating-point Arithmetic.
1250 -------------------------------------------------------------------------------
1252 float32
float32_div( struct roundingData
*roundData
, float32 a
, float32 b
)
1254 flag aSign
, bSign
, zSign
;
1255 int16 aExp
, bExp
, zExp
;
1256 bits32 aSig
, bSig
, zSig
;
1258 aSig
= extractFloat32Frac( a
);
1259 aExp
= extractFloat32Exp( a
);
1260 aSign
= extractFloat32Sign( a
);
1261 bSig
= extractFloat32Frac( b
);
1262 bExp
= extractFloat32Exp( b
);
1263 bSign
= extractFloat32Sign( b
);
1264 zSign
= aSign
^ bSign
;
1265 if ( aExp
== 0xFF ) {
1266 if ( aSig
) return propagateFloat32NaN( a
, b
);
1267 if ( bExp
== 0xFF ) {
1268 if ( bSig
) return propagateFloat32NaN( a
, b
);
1269 roundData
->exception
|= float_flag_invalid
;
1270 return float32_default_nan
;
1272 return packFloat32( zSign
, 0xFF, 0 );
1274 if ( bExp
== 0xFF ) {
1275 if ( bSig
) return propagateFloat32NaN( a
, b
);
1276 return packFloat32( zSign
, 0, 0 );
1280 if ( ( aExp
| aSig
) == 0 ) {
1281 roundData
->exception
|= float_flag_invalid
;
1282 return float32_default_nan
;
1284 roundData
->exception
|= float_flag_divbyzero
;
1285 return packFloat32( zSign
, 0xFF, 0 );
1287 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
1290 if ( aSig
== 0 ) return packFloat32( zSign
, 0, 0 );
1291 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1293 zExp
= aExp
- bExp
+ 0x7D;
1294 aSig
= ( aSig
| 0x00800000 )<<7;
1295 bSig
= ( bSig
| 0x00800000 )<<8;
1296 if ( bSig
<= ( aSig
+ aSig
) ) {
1301 bits64 tmp
= ( (bits64
) aSig
)<<32;
1302 do_div( tmp
, bSig
);
1305 if ( ( zSig
& 0x3F ) == 0 ) {
1306 zSig
|= ( ( (bits64
) bSig
) * zSig
!= ( (bits64
) aSig
)<<32 );
1308 return roundAndPackFloat32( roundData
, zSign
, zExp
, zSig
);
1313 -------------------------------------------------------------------------------
1314 Returns the remainder of the single-precision floating-point value `a'
1315 with respect to the corresponding value `b'. The operation is performed
1316 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1317 -------------------------------------------------------------------------------
1319 float32
float32_rem( struct roundingData
*roundData
, float32 a
, float32 b
)
1321 flag aSign
, bSign
, zSign
;
1322 int16 aExp
, bExp
, expDiff
;
1325 bits64 aSig64
, bSig64
, q64
;
1326 bits32 alternateASig
;
1329 aSig
= extractFloat32Frac( a
);
1330 aExp
= extractFloat32Exp( a
);
1331 aSign
= extractFloat32Sign( a
);
1332 bSig
= extractFloat32Frac( b
);
1333 bExp
= extractFloat32Exp( b
);
1334 bSign
= extractFloat32Sign( b
);
1335 if ( aExp
== 0xFF ) {
1336 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
1337 return propagateFloat32NaN( a
, b
);
1339 roundData
->exception
|= float_flag_invalid
;
1340 return float32_default_nan
;
1342 if ( bExp
== 0xFF ) {
1343 if ( bSig
) return propagateFloat32NaN( a
, b
);
1348 roundData
->exception
|= float_flag_invalid
;
1349 return float32_default_nan
;
1351 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
1354 if ( aSig
== 0 ) return a
;
1355 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1357 expDiff
= aExp
- bExp
;
1360 if ( expDiff
< 32 ) {
1363 if ( expDiff
< 0 ) {
1364 if ( expDiff
< -1 ) return a
;
1367 q
= ( bSig
<= aSig
);
1368 if ( q
) aSig
-= bSig
;
1369 if ( 0 < expDiff
) {
1370 bits64 tmp
= ( (bits64
) aSig
)<<32;
1371 do_div( tmp
, bSig
);
1375 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
1383 if ( bSig
<= aSig
) aSig
-= bSig
;
1384 aSig64
= ( (bits64
) aSig
)<<40;
1385 bSig64
= ( (bits64
) bSig
)<<40;
1387 while ( 0 < expDiff
) {
1388 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
1389 q64
= ( 2 < q64
) ? q64
- 2 : 0;
1390 aSig64
= - ( ( bSig
* q64
)<<38 );
1394 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
1395 q64
= ( 2 < q64
) ? q64
- 2 : 0;
1396 q
= q64
>>( 64 - expDiff
);
1398 aSig
= ( ( aSig64
>>33 )<<( expDiff
- 1 ) ) - bSig
* q
;
1401 alternateASig
= aSig
;
1404 } while ( 0 <= (sbits32
) aSig
);
1405 sigMean
= aSig
+ alternateASig
;
1406 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
1407 aSig
= alternateASig
;
1409 zSign
= ( (sbits32
) aSig
< 0 );
1410 if ( zSign
) aSig
= - aSig
;
1411 return normalizeRoundAndPackFloat32( roundData
, aSign
^ zSign
, bExp
, aSig
);
1416 -------------------------------------------------------------------------------
1417 Returns the square root of the single-precision floating-point value `a'.
1418 The operation is performed according to the IEC/IEEE Standard for Binary
1419 Floating-point Arithmetic.
1420 -------------------------------------------------------------------------------
1422 float32
float32_sqrt( struct roundingData
*roundData
, float32 a
)
1429 aSig
= extractFloat32Frac( a
);
1430 aExp
= extractFloat32Exp( a
);
1431 aSign
= extractFloat32Sign( a
);
1432 if ( aExp
== 0xFF ) {
1433 if ( aSig
) return propagateFloat32NaN( a
, 0 );
1434 if ( ! aSign
) return a
;
1435 roundData
->exception
|= float_flag_invalid
;
1436 return float32_default_nan
;
1439 if ( ( aExp
| aSig
) == 0 ) return a
;
1440 roundData
->exception
|= float_flag_invalid
;
1441 return float32_default_nan
;
1444 if ( aSig
== 0 ) return 0;
1445 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1447 zExp
= ( ( aExp
- 0x7F )>>1 ) + 0x7E;
1448 aSig
= ( aSig
| 0x00800000 )<<8;
1449 zSig
= estimateSqrt32( aExp
, aSig
) + 2;
1450 if ( ( zSig
& 0x7F ) <= 5 ) {
1456 term
= ( (bits64
) zSig
) * zSig
;
1457 rem
= ( ( (bits64
) aSig
)<<32 ) - term
;
1458 while ( (sbits64
) rem
< 0 ) {
1460 rem
+= ( ( (bits64
) zSig
)<<1 ) | 1;
1462 zSig
|= ( rem
!= 0 );
1465 shift32RightJamming( zSig
, 1, &zSig
);
1466 return roundAndPackFloat32( roundData
, 0, zExp
, zSig
);
1471 -------------------------------------------------------------------------------
1472 Returns 1 if the single-precision floating-point value `a' is equal to the
1473 corresponding value `b', and 0 otherwise. The comparison is performed
1474 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1475 -------------------------------------------------------------------------------
1477 flag
float32_eq( float32 a
, float32 b
)
1480 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
1481 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
1483 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
1484 float_raise( float_flag_invalid
);
1488 return ( a
== b
) || ( (bits32
) ( ( a
| b
)<<1 ) == 0 );
1493 -------------------------------------------------------------------------------
1494 Returns 1 if the single-precision floating-point value `a' is less than or
1495 equal to the corresponding value `b', and 0 otherwise. The comparison is
1496 performed according to the IEC/IEEE Standard for Binary Floating-point
1498 -------------------------------------------------------------------------------
1500 flag
float32_le( float32 a
, float32 b
)
1504 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
1505 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
1507 float_raise( float_flag_invalid
);
1510 aSign
= extractFloat32Sign( a
);
1511 bSign
= extractFloat32Sign( b
);
1512 if ( aSign
!= bSign
) return aSign
|| ( (bits32
) ( ( a
| b
)<<1 ) == 0 );
1513 return ( a
== b
) || ( aSign
^ ( a
< b
) );
1518 -------------------------------------------------------------------------------
1519 Returns 1 if the single-precision floating-point value `a' is less than
1520 the corresponding value `b', and 0 otherwise. The comparison is performed
1521 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1522 -------------------------------------------------------------------------------
1524 flag
float32_lt( float32 a
, float32 b
)
1528 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
1529 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
1531 float_raise( float_flag_invalid
);
1534 aSign
= extractFloat32Sign( a
);
1535 bSign
= extractFloat32Sign( b
);
1536 if ( aSign
!= bSign
) return aSign
&& ( (bits32
) ( ( a
| b
)<<1 ) != 0 );
1537 return ( a
!= b
) && ( aSign
^ ( a
< b
) );
1542 -------------------------------------------------------------------------------
1543 Returns 1 if the single-precision floating-point value `a' is equal to the
1544 corresponding value `b', and 0 otherwise. The invalid exception is raised
1545 if either operand is a NaN. Otherwise, the comparison is performed
1546 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1547 -------------------------------------------------------------------------------
1549 flag
float32_eq_signaling( float32 a
, float32 b
)
1552 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
1553 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
1555 float_raise( float_flag_invalid
);
1558 return ( a
== b
) || ( (bits32
) ( ( a
| b
)<<1 ) == 0 );
1563 -------------------------------------------------------------------------------
1564 Returns 1 if the single-precision floating-point value `a' is less than or
1565 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
1566 cause an exception. Otherwise, the comparison is performed according to the
1567 IEC/IEEE Standard for Binary Floating-point Arithmetic.
1568 -------------------------------------------------------------------------------
1570 flag
float32_le_quiet( float32 a
, float32 b
)
1575 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
1576 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
1578 /* Do nothing, even if NaN as we're quiet */
1581 aSign
= extractFloat32Sign( a
);
1582 bSign
= extractFloat32Sign( b
);
1583 if ( aSign
!= bSign
) return aSign
|| ( (bits32
) ( ( a
| b
)<<1 ) == 0 );
1584 return ( a
== b
) || ( aSign
^ ( a
< b
) );
1589 -------------------------------------------------------------------------------
1590 Returns 1 if the single-precision floating-point value `a' is less than
1591 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
1592 exception. Otherwise, the comparison is performed according to the IEC/IEEE
1593 Standard for Binary Floating-point Arithmetic.
1594 -------------------------------------------------------------------------------
1596 flag
float32_lt_quiet( float32 a
, float32 b
)
1600 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
1601 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
1603 /* Do nothing, even if NaN as we're quiet */
1606 aSign
= extractFloat32Sign( a
);
1607 bSign
= extractFloat32Sign( b
);
1608 if ( aSign
!= bSign
) return aSign
&& ( (bits32
) ( ( a
| b
)<<1 ) != 0 );
1609 return ( a
!= b
) && ( aSign
^ ( a
< b
) );
1614 -------------------------------------------------------------------------------
1615 Returns the result of converting the double-precision floating-point value
1616 `a' to the 32-bit two's complement integer format. The conversion is
1617 performed according to the IEC/IEEE Standard for Binary Floating-point
1618 Arithmetic---which means in particular that the conversion is rounded
1619 according to the current rounding mode. If `a' is a NaN, the largest
1620 positive integer is returned. Otherwise, if the conversion overflows, the
1621 largest integer with the same sign as `a' is returned.
1622 -------------------------------------------------------------------------------
1624 int32
float64_to_int32( struct roundingData
*roundData
, float64 a
)
1627 int16 aExp
, shiftCount
;
1630 aSig
= extractFloat64Frac( a
);
1631 aExp
= extractFloat64Exp( a
);
1632 aSign
= extractFloat64Sign( a
);
1633 if ( ( aExp
== 0x7FF ) && aSig
) aSign
= 0;
1634 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
1635 shiftCount
= 0x42C - aExp
;
1636 if ( 0 < shiftCount
) shift64RightJamming( aSig
, shiftCount
, &aSig
);
1637 return roundAndPackInt32( roundData
, aSign
, aSig
);
1642 -------------------------------------------------------------------------------
1643 Returns the result of converting the double-precision floating-point value
1644 `a' to the 32-bit two's complement integer format. The conversion is
1645 performed according to the IEC/IEEE Standard for Binary Floating-point
1646 Arithmetic, except that the conversion is always rounded toward zero. If
1647 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1648 conversion overflows, the largest integer with the same sign as `a' is
1650 -------------------------------------------------------------------------------
1652 int32
float64_to_int32_round_to_zero( float64 a
)
1655 int16 aExp
, shiftCount
;
1656 bits64 aSig
, savedASig
;
1659 aSig
= extractFloat64Frac( a
);
1660 aExp
= extractFloat64Exp( a
);
1661 aSign
= extractFloat64Sign( a
);
1662 shiftCount
= 0x433 - aExp
;
1663 if ( shiftCount
< 21 ) {
1664 if ( ( aExp
== 0x7FF ) && aSig
) aSign
= 0;
1667 else if ( 52 < shiftCount
) {
1668 if ( aExp
|| aSig
) float_raise( float_flag_inexact
);
1671 aSig
|= LIT64( 0x0010000000000000 );
1673 aSig
>>= shiftCount
;
1675 if ( aSign
) z
= - z
;
1676 if ( ( z
< 0 ) ^ aSign
) {
1678 float_raise( float_flag_invalid
);
1679 return aSign
? 0x80000000 : 0x7FFFFFFF;
1681 if ( ( aSig
<<shiftCount
) != savedASig
) {
1682 float_raise( float_flag_inexact
);
1689 -------------------------------------------------------------------------------
1690 Returns the result of converting the double-precision floating-point value
1691 `a' to the 32-bit two's complement unsigned integer format. The conversion
1692 is performed according to the IEC/IEEE Standard for Binary Floating-point
1693 Arithmetic---which means in particular that the conversion is rounded
1694 according to the current rounding mode. If `a' is a NaN, the largest
1695 positive integer is returned. Otherwise, if the conversion overflows, the
1696 largest positive integer is returned.
1697 -------------------------------------------------------------------------------
1699 int32
float64_to_uint32( struct roundingData
*roundData
, float64 a
)
1702 int16 aExp
, shiftCount
;
1705 aSig
= extractFloat64Frac( a
);
1706 aExp
= extractFloat64Exp( a
);
1707 aSign
= 0; //extractFloat64Sign( a );
1708 //if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1709 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
1710 shiftCount
= 0x42C - aExp
;
1711 if ( 0 < shiftCount
) shift64RightJamming( aSig
, shiftCount
, &aSig
);
1712 return roundAndPackInt32( roundData
, aSign
, aSig
);
1716 -------------------------------------------------------------------------------
1717 Returns the result of converting the double-precision floating-point value
1718 `a' to the 32-bit two's complement integer format. The conversion is
1719 performed according to the IEC/IEEE Standard for Binary Floating-point
1720 Arithmetic, except that the conversion is always rounded toward zero. If
1721 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1722 conversion overflows, the largest positive integer is returned.
1723 -------------------------------------------------------------------------------
1725 int32
float64_to_uint32_round_to_zero( float64 a
)
1728 int16 aExp
, shiftCount
;
1729 bits64 aSig
, savedASig
;
1732 aSig
= extractFloat64Frac( a
);
1733 aExp
= extractFloat64Exp( a
);
1734 aSign
= extractFloat64Sign( a
);
1735 shiftCount
= 0x433 - aExp
;
1736 if ( shiftCount
< 21 ) {
1737 if ( ( aExp
== 0x7FF ) && aSig
) aSign
= 0;
1740 else if ( 52 < shiftCount
) {
1741 if ( aExp
|| aSig
) float_raise( float_flag_inexact
);
1744 aSig
|= LIT64( 0x0010000000000000 );
1746 aSig
>>= shiftCount
;
1748 if ( aSign
) z
= - z
;
1749 if ( ( z
< 0 ) ^ aSign
) {
1751 float_raise( float_flag_invalid
);
1752 return aSign
? 0x80000000 : 0x7FFFFFFF;
1754 if ( ( aSig
<<shiftCount
) != savedASig
) {
1755 float_raise( float_flag_inexact
);
1761 -------------------------------------------------------------------------------
1762 Returns the result of converting the double-precision floating-point value
1763 `a' to the single-precision floating-point format. The conversion is
1764 performed according to the IEC/IEEE Standard for Binary Floating-point
1766 -------------------------------------------------------------------------------
1768 float32
float64_to_float32( struct roundingData
*roundData
, float64 a
)
1775 aSig
= extractFloat64Frac( a
);
1776 aExp
= extractFloat64Exp( a
);
1777 aSign
= extractFloat64Sign( a
);
1778 if ( aExp
== 0x7FF ) {
1779 if ( aSig
) return commonNaNToFloat32( float64ToCommonNaN( a
) );
1780 return packFloat32( aSign
, 0xFF, 0 );
1782 shift64RightJamming( aSig
, 22, &aSig
);
1784 if ( aExp
|| zSig
) {
1788 return roundAndPackFloat32( roundData
, aSign
, aExp
, zSig
);
1795 -------------------------------------------------------------------------------
1796 Returns the result of converting the double-precision floating-point value
1797 `a' to the extended double-precision floating-point format. The conversion
1798 is performed according to the IEC/IEEE Standard for Binary Floating-point
1800 -------------------------------------------------------------------------------
1802 floatx80
float64_to_floatx80( float64 a
)
1808 aSig
= extractFloat64Frac( a
);
1809 aExp
= extractFloat64Exp( a
);
1810 aSign
= extractFloat64Sign( a
);
1811 if ( aExp
== 0x7FF ) {
1812 if ( aSig
) return commonNaNToFloatx80( float64ToCommonNaN( a
) );
1813 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
1816 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
1817 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
1821 aSign
, aExp
+ 0x3C00, ( aSig
| LIT64( 0x0010000000000000 ) )<<11 );
1828 -------------------------------------------------------------------------------
1829 Rounds the double-precision floating-point value `a' to an integer, and
1830 returns the result as a double-precision floating-point value. The
1831 operation is performed according to the IEC/IEEE Standard for Binary
1832 Floating-point Arithmetic.
1833 -------------------------------------------------------------------------------
1835 float64
float64_round_to_int( struct roundingData
*roundData
, float64 a
)
1839 bits64 lastBitMask
, roundBitsMask
;
1843 aExp
= extractFloat64Exp( a
);
1844 if ( 0x433 <= aExp
) {
1845 if ( ( aExp
== 0x7FF ) && extractFloat64Frac( a
) ) {
1846 return propagateFloat64NaN( a
, a
);
1850 if ( aExp
<= 0x3FE ) {
1851 if ( (bits64
) ( a
<<1 ) == 0 ) return a
;
1852 roundData
->exception
|= float_flag_inexact
;
1853 aSign
= extractFloat64Sign( a
);
1854 switch ( roundData
->mode
) {
1855 case float_round_nearest_even
:
1856 if ( ( aExp
== 0x3FE ) && extractFloat64Frac( a
) ) {
1857 return packFloat64( aSign
, 0x3FF, 0 );
1860 case float_round_down
:
1861 return aSign
? LIT64( 0xBFF0000000000000 ) : 0;
1862 case float_round_up
:
1864 aSign
? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
1866 return packFloat64( aSign
, 0, 0 );
1869 lastBitMask
<<= 0x433 - aExp
;
1870 roundBitsMask
= lastBitMask
- 1;
1872 roundingMode
= roundData
->mode
;
1873 if ( roundingMode
== float_round_nearest_even
) {
1874 z
+= lastBitMask
>>1;
1875 if ( ( z
& roundBitsMask
) == 0 ) z
&= ~ lastBitMask
;
1877 else if ( roundingMode
!= float_round_to_zero
) {
1878 if ( extractFloat64Sign( z
) ^ ( roundingMode
== float_round_up
) ) {
1882 z
&= ~ roundBitsMask
;
1883 if ( z
!= a
) roundData
->exception
|= float_flag_inexact
;
1889 -------------------------------------------------------------------------------
1890 Returns the result of adding the absolute values of the double-precision
1891 floating-point values `a' and `b'. If `zSign' is true, the sum is negated
1892 before being returned. `zSign' is ignored if the result is a NaN. The
1893 addition is performed according to the IEC/IEEE Standard for Binary
1894 Floating-point Arithmetic.
1895 -------------------------------------------------------------------------------
1897 static float64
addFloat64Sigs( struct roundingData
*roundData
, float64 a
, float64 b
, flag zSign
)
1899 int16 aExp
, bExp
, zExp
;
1900 bits64 aSig
, bSig
, zSig
;
1903 aSig
= extractFloat64Frac( a
);
1904 aExp
= extractFloat64Exp( a
);
1905 bSig
= extractFloat64Frac( b
);
1906 bExp
= extractFloat64Exp( b
);
1907 expDiff
= aExp
- bExp
;
1910 if ( 0 < expDiff
) {
1911 if ( aExp
== 0x7FF ) {
1912 if ( aSig
) return propagateFloat64NaN( a
, b
);
1919 bSig
|= LIT64( 0x2000000000000000 );
1921 shift64RightJamming( bSig
, expDiff
, &bSig
);
1924 else if ( expDiff
< 0 ) {
1925 if ( bExp
== 0x7FF ) {
1926 if ( bSig
) return propagateFloat64NaN( a
, b
);
1927 return packFloat64( zSign
, 0x7FF, 0 );
1933 aSig
|= LIT64( 0x2000000000000000 );
1935 shift64RightJamming( aSig
, - expDiff
, &aSig
);
1939 if ( aExp
== 0x7FF ) {
1940 if ( aSig
| bSig
) return propagateFloat64NaN( a
, b
);
1943 if ( aExp
== 0 ) return packFloat64( zSign
, 0, ( aSig
+ bSig
)>>9 );
1944 zSig
= LIT64( 0x4000000000000000 ) + aSig
+ bSig
;
1948 aSig
|= LIT64( 0x2000000000000000 );
1949 zSig
= ( aSig
+ bSig
)<<1;
1951 if ( (sbits64
) zSig
< 0 ) {
1956 return roundAndPackFloat64( roundData
, zSign
, zExp
, zSig
);
1961 -------------------------------------------------------------------------------
1962 Returns the result of subtracting the absolute values of the double-
1963 precision floating-point values `a' and `b'. If `zSign' is true, the
1964 difference is negated before being returned. `zSign' is ignored if the
1965 result is a NaN. The subtraction is performed according to the IEC/IEEE
1966 Standard for Binary Floating-point Arithmetic.
1967 -------------------------------------------------------------------------------
1969 static float64
subFloat64Sigs( struct roundingData
*roundData
, float64 a
, float64 b
, flag zSign
)
1971 int16 aExp
, bExp
, zExp
;
1972 bits64 aSig
, bSig
, zSig
;
1975 aSig
= extractFloat64Frac( a
);
1976 aExp
= extractFloat64Exp( a
);
1977 bSig
= extractFloat64Frac( b
);
1978 bExp
= extractFloat64Exp( b
);
1979 expDiff
= aExp
- bExp
;
1982 if ( 0 < expDiff
) goto aExpBigger
;
1983 if ( expDiff
< 0 ) goto bExpBigger
;
1984 if ( aExp
== 0x7FF ) {
1985 if ( aSig
| bSig
) return propagateFloat64NaN( a
, b
);
1986 roundData
->exception
|= float_flag_invalid
;
1987 return float64_default_nan
;
1993 if ( bSig
< aSig
) goto aBigger
;
1994 if ( aSig
< bSig
) goto bBigger
;
1995 return packFloat64( roundData
->mode
== float_round_down
, 0, 0 );
1997 if ( bExp
== 0x7FF ) {
1998 if ( bSig
) return propagateFloat64NaN( a
, b
);
1999 return packFloat64( zSign
^ 1, 0x7FF, 0 );
2005 aSig
|= LIT64( 0x4000000000000000 );
2007 shift64RightJamming( aSig
, - expDiff
, &aSig
);
2008 bSig
|= LIT64( 0x4000000000000000 );
2013 goto normalizeRoundAndPack
;
2015 if ( aExp
== 0x7FF ) {
2016 if ( aSig
) return propagateFloat64NaN( a
, b
);
2023 bSig
|= LIT64( 0x4000000000000000 );
2025 shift64RightJamming( bSig
, expDiff
, &bSig
);
2026 aSig
|= LIT64( 0x4000000000000000 );
2030 normalizeRoundAndPack
:
2032 return normalizeRoundAndPackFloat64( roundData
, zSign
, zExp
, zSig
);
2037 -------------------------------------------------------------------------------
2038 Returns the result of adding the double-precision floating-point values `a'
2039 and `b'. The operation is performed according to the IEC/IEEE Standard for
2040 Binary Floating-point Arithmetic.
2041 -------------------------------------------------------------------------------
2043 float64
float64_add( struct roundingData
*roundData
, float64 a
, float64 b
)
2047 aSign
= extractFloat64Sign( a
);
2048 bSign
= extractFloat64Sign( b
);
2049 if ( aSign
== bSign
) {
2050 return addFloat64Sigs( roundData
, a
, b
, aSign
);
2053 return subFloat64Sigs( roundData
, a
, b
, aSign
);
2059 -------------------------------------------------------------------------------
2060 Returns the result of subtracting the double-precision floating-point values
2061 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2062 for Binary Floating-point Arithmetic.
2063 -------------------------------------------------------------------------------
2065 float64
float64_sub( struct roundingData
*roundData
, float64 a
, float64 b
)
2069 aSign
= extractFloat64Sign( a
);
2070 bSign
= extractFloat64Sign( b
);
2071 if ( aSign
== bSign
) {
2072 return subFloat64Sigs( roundData
, a
, b
, aSign
);
2075 return addFloat64Sigs( roundData
, a
, b
, aSign
);
2081 -------------------------------------------------------------------------------
2082 Returns the result of multiplying the double-precision floating-point values
2083 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2084 for Binary Floating-point Arithmetic.
2085 -------------------------------------------------------------------------------
2087 float64
float64_mul( struct roundingData
*roundData
, float64 a
, float64 b
)
2089 flag aSign
, bSign
, zSign
;
2090 int16 aExp
, bExp
, zExp
;
2091 bits64 aSig
, bSig
, zSig0
, zSig1
;
2093 aSig
= extractFloat64Frac( a
);
2094 aExp
= extractFloat64Exp( a
);
2095 aSign
= extractFloat64Sign( a
);
2096 bSig
= extractFloat64Frac( b
);
2097 bExp
= extractFloat64Exp( b
);
2098 bSign
= extractFloat64Sign( b
);
2099 zSign
= aSign
^ bSign
;
2100 if ( aExp
== 0x7FF ) {
2101 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
2102 return propagateFloat64NaN( a
, b
);
2104 if ( ( bExp
| bSig
) == 0 ) {
2105 roundData
->exception
|= float_flag_invalid
;
2106 return float64_default_nan
;
2108 return packFloat64( zSign
, 0x7FF, 0 );
2110 if ( bExp
== 0x7FF ) {
2111 if ( bSig
) return propagateFloat64NaN( a
, b
);
2112 if ( ( aExp
| aSig
) == 0 ) {
2113 roundData
->exception
|= float_flag_invalid
;
2114 return float64_default_nan
;
2116 return packFloat64( zSign
, 0x7FF, 0 );
2119 if ( aSig
== 0 ) return packFloat64( zSign
, 0, 0 );
2120 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2123 if ( bSig
== 0 ) return packFloat64( zSign
, 0, 0 );
2124 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
2126 zExp
= aExp
+ bExp
- 0x3FF;
2127 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<10;
2128 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
2129 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
2130 zSig0
|= ( zSig1
!= 0 );
2131 if ( 0 <= (sbits64
) ( zSig0
<<1 ) ) {
2135 return roundAndPackFloat64( roundData
, zSign
, zExp
, zSig0
);
2140 -------------------------------------------------------------------------------
2141 Returns the result of dividing the double-precision floating-point value `a'
2142 by the corresponding value `b'. The operation is performed according to
2143 the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2144 -------------------------------------------------------------------------------
2146 float64
float64_div( struct roundingData
*roundData
, float64 a
, float64 b
)
2148 flag aSign
, bSign
, zSign
;
2149 int16 aExp
, bExp
, zExp
;
2150 bits64 aSig
, bSig
, zSig
;
2152 bits64 term0
, term1
;
2154 aSig
= extractFloat64Frac( a
);
2155 aExp
= extractFloat64Exp( a
);
2156 aSign
= extractFloat64Sign( a
);
2157 bSig
= extractFloat64Frac( b
);
2158 bExp
= extractFloat64Exp( b
);
2159 bSign
= extractFloat64Sign( b
);
2160 zSign
= aSign
^ bSign
;
2161 if ( aExp
== 0x7FF ) {
2162 if ( aSig
) return propagateFloat64NaN( a
, b
);
2163 if ( bExp
== 0x7FF ) {
2164 if ( bSig
) return propagateFloat64NaN( a
, b
);
2165 roundData
->exception
|= float_flag_invalid
;
2166 return float64_default_nan
;
2168 return packFloat64( zSign
, 0x7FF, 0 );
2170 if ( bExp
== 0x7FF ) {
2171 if ( bSig
) return propagateFloat64NaN( a
, b
);
2172 return packFloat64( zSign
, 0, 0 );
2176 if ( ( aExp
| aSig
) == 0 ) {
2177 roundData
->exception
|= float_flag_invalid
;
2178 return float64_default_nan
;
2180 roundData
->exception
|= float_flag_divbyzero
;
2181 return packFloat64( zSign
, 0x7FF, 0 );
2183 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
2186 if ( aSig
== 0 ) return packFloat64( zSign
, 0, 0 );
2187 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2189 zExp
= aExp
- bExp
+ 0x3FD;
2190 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<10;
2191 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
2192 if ( bSig
<= ( aSig
+ aSig
) ) {
2196 zSig
= estimateDiv128To64( aSig
, 0, bSig
);
2197 if ( ( zSig
& 0x1FF ) <= 2 ) {
2198 mul64To128( bSig
, zSig
, &term0
, &term1
);
2199 sub128( aSig
, 0, term0
, term1
, &rem0
, &rem1
);
2200 while ( (sbits64
) rem0
< 0 ) {
2202 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
2204 zSig
|= ( rem1
!= 0 );
2206 return roundAndPackFloat64( roundData
, zSign
, zExp
, zSig
);
2211 -------------------------------------------------------------------------------
2212 Returns the remainder of the double-precision floating-point value `a'
2213 with respect to the corresponding value `b'. The operation is performed
2214 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2215 -------------------------------------------------------------------------------
2217 float64
float64_rem( struct roundingData
*roundData
, float64 a
, float64 b
)
2219 flag aSign
, bSign
, zSign
;
2220 int16 aExp
, bExp
, expDiff
;
2222 bits64 q
, alternateASig
;
2225 aSig
= extractFloat64Frac( a
);
2226 aExp
= extractFloat64Exp( a
);
2227 aSign
= extractFloat64Sign( a
);
2228 bSig
= extractFloat64Frac( b
);
2229 bExp
= extractFloat64Exp( b
);
2230 bSign
= extractFloat64Sign( b
);
2231 if ( aExp
== 0x7FF ) {
2232 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
2233 return propagateFloat64NaN( a
, b
);
2235 roundData
->exception
|= float_flag_invalid
;
2236 return float64_default_nan
;
2238 if ( bExp
== 0x7FF ) {
2239 if ( bSig
) return propagateFloat64NaN( a
, b
);
2244 roundData
->exception
|= float_flag_invalid
;
2245 return float64_default_nan
;
2247 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
2250 if ( aSig
== 0 ) return a
;
2251 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2253 expDiff
= aExp
- bExp
;
2254 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<11;
2255 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
2256 if ( expDiff
< 0 ) {
2257 if ( expDiff
< -1 ) return a
;
2260 q
= ( bSig
<= aSig
);
2261 if ( q
) aSig
-= bSig
;
2263 while ( 0 < expDiff
) {
2264 q
= estimateDiv128To64( aSig
, 0, bSig
);
2265 q
= ( 2 < q
) ? q
- 2 : 0;
2266 aSig
= - ( ( bSig
>>2 ) * q
);
2270 if ( 0 < expDiff
) {
2271 q
= estimateDiv128To64( aSig
, 0, bSig
);
2272 q
= ( 2 < q
) ? q
- 2 : 0;
2275 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
2282 alternateASig
= aSig
;
2285 } while ( 0 <= (sbits64
) aSig
);
2286 sigMean
= aSig
+ alternateASig
;
2287 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
2288 aSig
= alternateASig
;
2290 zSign
= ( (sbits64
) aSig
< 0 );
2291 if ( zSign
) aSig
= - aSig
;
2292 return normalizeRoundAndPackFloat64( roundData
, aSign
^ zSign
, bExp
, aSig
);
2297 -------------------------------------------------------------------------------
2298 Returns the square root of the double-precision floating-point value `a'.
2299 The operation is performed according to the IEC/IEEE Standard for Binary
2300 Floating-point Arithmetic.
2301 -------------------------------------------------------------------------------
2303 float64
float64_sqrt( struct roundingData
*roundData
, float64 a
)
2308 bits64 rem0
, rem1
, term0
, term1
; //, shiftedRem;
2311 aSig
= extractFloat64Frac( a
);
2312 aExp
= extractFloat64Exp( a
);
2313 aSign
= extractFloat64Sign( a
);
2314 if ( aExp
== 0x7FF ) {
2315 if ( aSig
) return propagateFloat64NaN( a
, a
);
2316 if ( ! aSign
) return a
;
2317 roundData
->exception
|= float_flag_invalid
;
2318 return float64_default_nan
;
2321 if ( ( aExp
| aSig
) == 0 ) return a
;
2322 roundData
->exception
|= float_flag_invalid
;
2323 return float64_default_nan
;
2326 if ( aSig
== 0 ) return 0;
2327 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2329 zExp
= ( ( aExp
- 0x3FF )>>1 ) + 0x3FE;
2330 aSig
|= LIT64( 0x0010000000000000 );
2331 zSig
= estimateSqrt32( aExp
, aSig
>>21 );
2333 aSig
<<= 9 - ( aExp
& 1 );
2334 zSig
= estimateDiv128To64( aSig
, 0, zSig
) + zSig
+ 2;
2335 if ( ( zSig
& 0x3FF ) <= 5 ) {
2337 zSig
= LIT64( 0xFFFFFFFFFFFFFFFF );
2341 mul64To128( zSig
, zSig
, &term0
, &term1
);
2342 sub128( aSig
, 0, term0
, term1
, &rem0
, &rem1
);
2343 while ( (sbits64
) rem0
< 0 ) {
2345 shortShift128Left( 0, zSig
, 1, &term0
, &term1
);
2347 add128( rem0
, rem1
, term0
, term1
, &rem0
, &rem1
);
2349 zSig
|= ( ( rem0
| rem1
) != 0 );
2352 shift64RightJamming( zSig
, 1, &zSig
);
2353 return roundAndPackFloat64( roundData
, 0, zExp
, zSig
);
2358 -------------------------------------------------------------------------------
2359 Returns 1 if the double-precision floating-point value `a' is equal to the
2360 corresponding value `b', and 0 otherwise. The comparison is performed
2361 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2362 -------------------------------------------------------------------------------
2364 flag
float64_eq( float64 a
, float64 b
)
2367 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
2368 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
2370 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
2371 float_raise( float_flag_invalid
);
2375 return ( a
== b
) || ( (bits64
) ( ( a
| b
)<<1 ) == 0 );
2380 -------------------------------------------------------------------------------
2381 Returns 1 if the double-precision floating-point value `a' is less than or
2382 equal to the corresponding value `b', and 0 otherwise. The comparison is
2383 performed according to the IEC/IEEE Standard for Binary Floating-point
2385 -------------------------------------------------------------------------------
2387 flag
float64_le( float64 a
, float64 b
)
2391 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
2392 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
2394 float_raise( float_flag_invalid
);
2397 aSign
= extractFloat64Sign( a
);
2398 bSign
= extractFloat64Sign( b
);
2399 if ( aSign
!= bSign
) return aSign
|| ( (bits64
) ( ( a
| b
)<<1 ) == 0 );
2400 return ( a
== b
) || ( aSign
^ ( a
< b
) );
2405 -------------------------------------------------------------------------------
2406 Returns 1 if the double-precision floating-point value `a' is less than
2407 the corresponding value `b', and 0 otherwise. The comparison is performed
2408 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2409 -------------------------------------------------------------------------------
2411 flag
float64_lt( float64 a
, float64 b
)
2415 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
2416 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
2418 float_raise( float_flag_invalid
);
2421 aSign
= extractFloat64Sign( a
);
2422 bSign
= extractFloat64Sign( b
);
2423 if ( aSign
!= bSign
) return aSign
&& ( (bits64
) ( ( a
| b
)<<1 ) != 0 );
2424 return ( a
!= b
) && ( aSign
^ ( a
< b
) );
2429 -------------------------------------------------------------------------------
2430 Returns 1 if the double-precision floating-point value `a' is equal to the
2431 corresponding value `b', and 0 otherwise. The invalid exception is raised
2432 if either operand is a NaN. Otherwise, the comparison is performed
2433 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2434 -------------------------------------------------------------------------------
2436 flag
float64_eq_signaling( float64 a
, float64 b
)
2439 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
2440 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
2442 float_raise( float_flag_invalid
);
2445 return ( a
== b
) || ( (bits64
) ( ( a
| b
)<<1 ) == 0 );
2450 -------------------------------------------------------------------------------
2451 Returns 1 if the double-precision floating-point value `a' is less than or
2452 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2453 cause an exception. Otherwise, the comparison is performed according to the
2454 IEC/IEEE Standard for Binary Floating-point Arithmetic.
2455 -------------------------------------------------------------------------------
2457 flag
float64_le_quiet( float64 a
, float64 b
)
2462 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
2463 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
2465 /* Do nothing, even if NaN as we're quiet */
2468 aSign
= extractFloat64Sign( a
);
2469 bSign
= extractFloat64Sign( b
);
2470 if ( aSign
!= bSign
) return aSign
|| ( (bits64
) ( ( a
| b
)<<1 ) == 0 );
2471 return ( a
== b
) || ( aSign
^ ( a
< b
) );
2476 -------------------------------------------------------------------------------
2477 Returns 1 if the double-precision floating-point value `a' is less than
2478 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2479 exception. Otherwise, the comparison is performed according to the IEC/IEEE
2480 Standard for Binary Floating-point Arithmetic.
2481 -------------------------------------------------------------------------------
2483 flag
float64_lt_quiet( float64 a
, float64 b
)
2487 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
2488 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
2490 /* Do nothing, even if NaN as we're quiet */
2493 aSign
= extractFloat64Sign( a
);
2494 bSign
= extractFloat64Sign( b
);
2495 if ( aSign
!= bSign
) return aSign
&& ( (bits64
) ( ( a
| b
)<<1 ) != 0 );
2496 return ( a
!= b
) && ( aSign
^ ( a
< b
) );
2503 -------------------------------------------------------------------------------
2504 Returns the result of converting the extended double-precision floating-
2505 point value `a' to the 32-bit two's complement integer format. The
2506 conversion is performed according to the IEC/IEEE Standard for Binary
2507 Floating-point Arithmetic---which means in particular that the conversion
2508 is rounded according to the current rounding mode. If `a' is a NaN, the
2509 largest positive integer is returned. Otherwise, if the conversion
2510 overflows, the largest integer with the same sign as `a' is returned.
2511 -------------------------------------------------------------------------------
2513 int32
floatx80_to_int32( struct roundingData
*roundData
, floatx80 a
)
2516 int32 aExp
, shiftCount
;
2519 aSig
= extractFloatx80Frac( a
);
2520 aExp
= extractFloatx80Exp( a
);
2521 aSign
= extractFloatx80Sign( a
);
2522 if ( ( aExp
== 0x7FFF ) && (bits64
) ( aSig
<<1 ) ) aSign
= 0;
2523 shiftCount
= 0x4037 - aExp
;
2524 if ( shiftCount
<= 0 ) shiftCount
= 1;
2525 shift64RightJamming( aSig
, shiftCount
, &aSig
);
2526 return roundAndPackInt32( roundData
, aSign
, aSig
);
2531 -------------------------------------------------------------------------------
2532 Returns the result of converting the extended double-precision floating-
2533 point value `a' to the 32-bit two's complement integer format. The
2534 conversion is performed according to the IEC/IEEE Standard for Binary
2535 Floating-point Arithmetic, except that the conversion is always rounded
2536 toward zero. If `a' is a NaN, the largest positive integer is returned.
2537 Otherwise, if the conversion overflows, the largest integer with the same
2538 sign as `a' is returned.
2539 -------------------------------------------------------------------------------
2541 int32
floatx80_to_int32_round_to_zero( floatx80 a
)
2544 int32 aExp
, shiftCount
;
2545 bits64 aSig
, savedASig
;
2548 aSig
= extractFloatx80Frac( a
);
2549 aExp
= extractFloatx80Exp( a
);
2550 aSign
= extractFloatx80Sign( a
);
2551 shiftCount
= 0x403E - aExp
;
2552 if ( shiftCount
< 32 ) {
2553 if ( ( aExp
== 0x7FFF ) && (bits64
) ( aSig
<<1 ) ) aSign
= 0;
2556 else if ( 63 < shiftCount
) {
2557 if ( aExp
|| aSig
) float_raise( float_flag_inexact
);
2561 aSig
>>= shiftCount
;
2563 if ( aSign
) z
= - z
;
2564 if ( ( z
< 0 ) ^ aSign
) {
2566 float_raise( float_flag_invalid
);
2567 return aSign
? 0x80000000 : 0x7FFFFFFF;
2569 if ( ( aSig
<<shiftCount
) != savedASig
) {
2570 float_raise( float_flag_inexact
);
2577 -------------------------------------------------------------------------------
2578 Returns the result of converting the extended double-precision floating-
2579 point value `a' to the single-precision floating-point format. The
2580 conversion is performed according to the IEC/IEEE Standard for Binary
2581 Floating-point Arithmetic.
2582 -------------------------------------------------------------------------------
2584 float32
floatx80_to_float32( struct roundingData
*roundData
, floatx80 a
)
2590 aSig
= extractFloatx80Frac( a
);
2591 aExp
= extractFloatx80Exp( a
);
2592 aSign
= extractFloatx80Sign( a
);
2593 if ( aExp
== 0x7FFF ) {
2594 if ( (bits64
) ( aSig
<<1 ) ) {
2595 return commonNaNToFloat32( floatx80ToCommonNaN( a
) );
2597 return packFloat32( aSign
, 0xFF, 0 );
2599 shift64RightJamming( aSig
, 33, &aSig
);
2600 if ( aExp
|| aSig
) aExp
-= 0x3F81;
2601 return roundAndPackFloat32( roundData
, aSign
, aExp
, aSig
);
2606 -------------------------------------------------------------------------------
2607 Returns the result of converting the extended double-precision floating-
2608 point value `a' to the double-precision floating-point format. The
2609 conversion is performed according to the IEC/IEEE Standard for Binary
2610 Floating-point Arithmetic.
2611 -------------------------------------------------------------------------------
2613 float64
floatx80_to_float64( struct roundingData
*roundData
, floatx80 a
)
2619 aSig
= extractFloatx80Frac( a
);
2620 aExp
= extractFloatx80Exp( a
);
2621 aSign
= extractFloatx80Sign( a
);
2622 if ( aExp
== 0x7FFF ) {
2623 if ( (bits64
) ( aSig
<<1 ) ) {
2624 return commonNaNToFloat64( floatx80ToCommonNaN( a
) );
2626 return packFloat64( aSign
, 0x7FF, 0 );
2628 shift64RightJamming( aSig
, 1, &zSig
);
2629 if ( aExp
|| aSig
) aExp
-= 0x3C01;
2630 return roundAndPackFloat64( roundData
, aSign
, aExp
, zSig
);
2635 -------------------------------------------------------------------------------
2636 Rounds the extended double-precision floating-point value `a' to an integer,
2637 and returns the result as an extended quadruple-precision floating-point
2638 value. The operation is performed according to the IEC/IEEE Standard for
2639 Binary Floating-point Arithmetic.
2640 -------------------------------------------------------------------------------
2642 floatx80
floatx80_round_to_int( struct roundingData
*roundData
, floatx80 a
)
2646 bits64 lastBitMask
, roundBitsMask
;
2650 aExp
= extractFloatx80Exp( a
);
2651 if ( 0x403E <= aExp
) {
2652 if ( ( aExp
== 0x7FFF ) && (bits64
) ( extractFloatx80Frac( a
)<<1 ) ) {
2653 return propagateFloatx80NaN( a
, a
);
2657 if ( aExp
<= 0x3FFE ) {
2659 && ( (bits64
) ( extractFloatx80Frac( a
)<<1 ) == 0 ) ) {
2662 roundData
->exception
|= float_flag_inexact
;
2663 aSign
= extractFloatx80Sign( a
);
2664 switch ( roundData
->mode
) {
2665 case float_round_nearest_even
:
2666 if ( ( aExp
== 0x3FFE ) && (bits64
) ( extractFloatx80Frac( a
)<<1 )
2669 packFloatx80( aSign
, 0x3FFF, LIT64( 0x8000000000000000 ) );
2672 case float_round_down
:
2675 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
2676 : packFloatx80( 0, 0, 0 );
2677 case float_round_up
:
2679 aSign
? packFloatx80( 1, 0, 0 )
2680 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
2682 return packFloatx80( aSign
, 0, 0 );
2685 lastBitMask
<<= 0x403E - aExp
;
2686 roundBitsMask
= lastBitMask
- 1;
2688 roundingMode
= roundData
->mode
;
2689 if ( roundingMode
== float_round_nearest_even
) {
2690 z
.low
+= lastBitMask
>>1;
2691 if ( ( z
.low
& roundBitsMask
) == 0 ) z
.low
&= ~ lastBitMask
;
2693 else if ( roundingMode
!= float_round_to_zero
) {
2694 if ( extractFloatx80Sign( z
) ^ ( roundingMode
== float_round_up
) ) {
2695 z
.low
+= roundBitsMask
;
2698 z
.low
&= ~ roundBitsMask
;
2701 z
.low
= LIT64( 0x8000000000000000 );
2703 if ( z
.low
!= a
.low
) roundData
->exception
|= float_flag_inexact
;
2709 -------------------------------------------------------------------------------
2710 Returns the result of adding the absolute values of the extended double-
2711 precision floating-point values `a' and `b'. If `zSign' is true, the sum is
2712 negated before being returned. `zSign' is ignored if the result is a NaN.
2713 The addition is performed according to the IEC/IEEE Standard for Binary
2714 Floating-point Arithmetic.
2715 -------------------------------------------------------------------------------
2717 static floatx80
addFloatx80Sigs( struct roundingData
*roundData
, floatx80 a
, floatx80 b
, flag zSign
)
2719 int32 aExp
, bExp
, zExp
;
2720 bits64 aSig
, bSig
, zSig0
, zSig1
;
2723 aSig
= extractFloatx80Frac( a
);
2724 aExp
= extractFloatx80Exp( a
);
2725 bSig
= extractFloatx80Frac( b
);
2726 bExp
= extractFloatx80Exp( b
);
2727 expDiff
= aExp
- bExp
;
2728 if ( 0 < expDiff
) {
2729 if ( aExp
== 0x7FFF ) {
2730 if ( (bits64
) ( aSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
2733 if ( bExp
== 0 ) --expDiff
;
2734 shift64ExtraRightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
2737 else if ( expDiff
< 0 ) {
2738 if ( bExp
== 0x7FFF ) {
2739 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
2740 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
2742 if ( aExp
== 0 ) ++expDiff
;
2743 shift64ExtraRightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
2747 if ( aExp
== 0x7FFF ) {
2748 if ( (bits64
) ( ( aSig
| bSig
)<<1 ) ) {
2749 return propagateFloatx80NaN( a
, b
);
2754 zSig0
= aSig
+ bSig
;
2756 normalizeFloatx80Subnormal( zSig0
, &zExp
, &zSig0
);
2763 zSig0
= aSig
+ bSig
;
2765 if ( (sbits64
) zSig0
< 0 ) goto roundAndPack
;
2767 shift64ExtraRightJamming( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
2768 zSig0
|= LIT64( 0x8000000000000000 );
2772 roundAndPackFloatx80(
2773 roundData
, zSign
, zExp
, zSig0
, zSig1
);
2778 -------------------------------------------------------------------------------
2779 Returns the result of subtracting the absolute values of the extended
2780 double-precision floating-point values `a' and `b'. If `zSign' is true,
2781 the difference is negated before being returned. `zSign' is ignored if the
2782 result is a NaN. The subtraction is performed according to the IEC/IEEE
2783 Standard for Binary Floating-point Arithmetic.
2784 -------------------------------------------------------------------------------
2786 static floatx80
subFloatx80Sigs( struct roundingData
*roundData
, floatx80 a
, floatx80 b
, flag zSign
)
2788 int32 aExp
, bExp
, zExp
;
2789 bits64 aSig
, bSig
, zSig0
, zSig1
;
2793 aSig
= extractFloatx80Frac( a
);
2794 aExp
= extractFloatx80Exp( a
);
2795 bSig
= extractFloatx80Frac( b
);
2796 bExp
= extractFloatx80Exp( b
);
2797 expDiff
= aExp
- bExp
;
2798 if ( 0 < expDiff
) goto aExpBigger
;
2799 if ( expDiff
< 0 ) goto bExpBigger
;
2800 if ( aExp
== 0x7FFF ) {
2801 if ( (bits64
) ( ( aSig
| bSig
)<<1 ) ) {
2802 return propagateFloatx80NaN( a
, b
);
2804 roundData
->exception
|= float_flag_invalid
;
2805 z
.low
= floatx80_default_nan_low
;
2806 z
.high
= floatx80_default_nan_high
;
2815 if ( bSig
< aSig
) goto aBigger
;
2816 if ( aSig
< bSig
) goto bBigger
;
2817 return packFloatx80( roundData
->mode
== float_round_down
, 0, 0 );
2819 if ( bExp
== 0x7FFF ) {
2820 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
2821 return packFloatx80( zSign
^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
2823 if ( aExp
== 0 ) ++expDiff
;
2824 shift128RightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
2826 sub128( bSig
, 0, aSig
, zSig1
, &zSig0
, &zSig1
);
2829 goto normalizeRoundAndPack
;
2831 if ( aExp
== 0x7FFF ) {
2832 if ( (bits64
) ( aSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
2835 if ( bExp
== 0 ) --expDiff
;
2836 shift128RightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
2838 sub128( aSig
, 0, bSig
, zSig1
, &zSig0
, &zSig1
);
2840 normalizeRoundAndPack
:
2842 normalizeRoundAndPackFloatx80(
2843 roundData
, zSign
, zExp
, zSig0
, zSig1
);
2848 -------------------------------------------------------------------------------
2849 Returns the result of adding the extended double-precision floating-point
2850 values `a' and `b'. The operation is performed according to the IEC/IEEE
2851 Standard for Binary Floating-point Arithmetic.
2852 -------------------------------------------------------------------------------
2854 floatx80
floatx80_add( struct roundingData
*roundData
, floatx80 a
, floatx80 b
)
2858 aSign
= extractFloatx80Sign( a
);
2859 bSign
= extractFloatx80Sign( b
);
2860 if ( aSign
== bSign
) {
2861 return addFloatx80Sigs( roundData
, a
, b
, aSign
);
2864 return subFloatx80Sigs( roundData
, a
, b
, aSign
);
2870 -------------------------------------------------------------------------------
2871 Returns the result of subtracting the extended double-precision floating-
2872 point values `a' and `b'. The operation is performed according to the
2873 IEC/IEEE Standard for Binary Floating-point Arithmetic.
2874 -------------------------------------------------------------------------------
2876 floatx80
floatx80_sub( struct roundingData
*roundData
, floatx80 a
, floatx80 b
)
2880 aSign
= extractFloatx80Sign( a
);
2881 bSign
= extractFloatx80Sign( b
);
2882 if ( aSign
== bSign
) {
2883 return subFloatx80Sigs( roundData
, a
, b
, aSign
);
2886 return addFloatx80Sigs( roundData
, a
, b
, aSign
);
2892 -------------------------------------------------------------------------------
2893 Returns the result of multiplying the extended double-precision floating-
2894 point values `a' and `b'. The operation is performed according to the
2895 IEC/IEEE Standard for Binary Floating-point Arithmetic.
2896 -------------------------------------------------------------------------------
2898 floatx80
floatx80_mul( struct roundingData
*roundData
, floatx80 a
, floatx80 b
)
2900 flag aSign
, bSign
, zSign
;
2901 int32 aExp
, bExp
, zExp
;
2902 bits64 aSig
, bSig
, zSig0
, zSig1
;
2905 aSig
= extractFloatx80Frac( a
);
2906 aExp
= extractFloatx80Exp( a
);
2907 aSign
= extractFloatx80Sign( a
);
2908 bSig
= extractFloatx80Frac( b
);
2909 bExp
= extractFloatx80Exp( b
);
2910 bSign
= extractFloatx80Sign( b
);
2911 zSign
= aSign
^ bSign
;
2912 if ( aExp
== 0x7FFF ) {
2913 if ( (bits64
) ( aSig
<<1 )
2914 || ( ( bExp
== 0x7FFF ) && (bits64
) ( bSig
<<1 ) ) ) {
2915 return propagateFloatx80NaN( a
, b
);
2917 if ( ( bExp
| bSig
) == 0 ) goto invalid
;
2918 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
2920 if ( bExp
== 0x7FFF ) {
2921 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
2922 if ( ( aExp
| aSig
) == 0 ) {
2924 roundData
->exception
|= float_flag_invalid
;
2925 z
.low
= floatx80_default_nan_low
;
2926 z
.high
= floatx80_default_nan_high
;
2930 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
2933 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
2934 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
2937 if ( bSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
2938 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
2940 zExp
= aExp
+ bExp
- 0x3FFE;
2941 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
2942 if ( 0 < (sbits64
) zSig0
) {
2943 shortShift128Left( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
2947 roundAndPackFloatx80(
2948 roundData
, zSign
, zExp
, zSig0
, zSig1
);
2953 -------------------------------------------------------------------------------
2954 Returns the result of dividing the extended double-precision floating-point
2955 value `a' by the corresponding value `b'. The operation is performed
2956 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2957 -------------------------------------------------------------------------------
2959 floatx80
floatx80_div( struct roundingData
*roundData
, floatx80 a
, floatx80 b
)
2961 flag aSign
, bSign
, zSign
;
2962 int32 aExp
, bExp
, zExp
;
2963 bits64 aSig
, bSig
, zSig0
, zSig1
;
2964 bits64 rem0
, rem1
, rem2
, term0
, term1
, term2
;
2967 aSig
= extractFloatx80Frac( a
);
2968 aExp
= extractFloatx80Exp( a
);
2969 aSign
= extractFloatx80Sign( a
);
2970 bSig
= extractFloatx80Frac( b
);
2971 bExp
= extractFloatx80Exp( b
);
2972 bSign
= extractFloatx80Sign( b
);
2973 zSign
= aSign
^ bSign
;
2974 if ( aExp
== 0x7FFF ) {
2975 if ( (bits64
) ( aSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
2976 if ( bExp
== 0x7FFF ) {
2977 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
2980 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
2982 if ( bExp
== 0x7FFF ) {
2983 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
2984 return packFloatx80( zSign
, 0, 0 );
2988 if ( ( aExp
| aSig
) == 0 ) {
2990 roundData
->exception
|= float_flag_invalid
;
2991 z
.low
= floatx80_default_nan_low
;
2992 z
.high
= floatx80_default_nan_high
;
2996 roundData
->exception
|= float_flag_divbyzero
;
2997 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
2999 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
3002 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
3003 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
3005 zExp
= aExp
- bExp
+ 0x3FFE;
3007 if ( bSig
<= aSig
) {
3008 shift128Right( aSig
, 0, 1, &aSig
, &rem1
);
3011 zSig0
= estimateDiv128To64( aSig
, rem1
, bSig
);
3012 mul64To128( bSig
, zSig0
, &term0
, &term1
);
3013 sub128( aSig
, rem1
, term0
, term1
, &rem0
, &rem1
);
3014 while ( (sbits64
) rem0
< 0 ) {
3016 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
3018 zSig1
= estimateDiv128To64( rem1
, 0, bSig
);
3019 if ( (bits64
) ( zSig1
<<1 ) <= 8 ) {
3020 mul64To128( bSig
, zSig1
, &term1
, &term2
);
3021 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
3022 while ( (sbits64
) rem1
< 0 ) {
3024 add128( rem1
, rem2
, 0, bSig
, &rem1
, &rem2
);
3026 zSig1
|= ( ( rem1
| rem2
) != 0 );
3029 roundAndPackFloatx80(
3030 roundData
, zSign
, zExp
, zSig0
, zSig1
);
3035 -------------------------------------------------------------------------------
3036 Returns the remainder of the extended double-precision floating-point value
3037 `a' with respect to the corresponding value `b'. The operation is performed
3038 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3039 -------------------------------------------------------------------------------
3041 floatx80
floatx80_rem( struct roundingData
*roundData
, floatx80 a
, floatx80 b
)
3043 flag aSign
, bSign
, zSign
;
3044 int32 aExp
, bExp
, expDiff
;
3045 bits64 aSig0
, aSig1
, bSig
;
3046 bits64 q
, term0
, term1
, alternateASig0
, alternateASig1
;
3049 aSig0
= extractFloatx80Frac( a
);
3050 aExp
= extractFloatx80Exp( a
);
3051 aSign
= extractFloatx80Sign( a
);
3052 bSig
= extractFloatx80Frac( b
);
3053 bExp
= extractFloatx80Exp( b
);
3054 bSign
= extractFloatx80Sign( b
);
3055 if ( aExp
== 0x7FFF ) {
3056 if ( (bits64
) ( aSig0
<<1 )
3057 || ( ( bExp
== 0x7FFF ) && (bits64
) ( bSig
<<1 ) ) ) {
3058 return propagateFloatx80NaN( a
, b
);
3062 if ( bExp
== 0x7FFF ) {
3063 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
3069 roundData
->exception
|= float_flag_invalid
;
3070 z
.low
= floatx80_default_nan_low
;
3071 z
.high
= floatx80_default_nan_high
;
3075 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
3078 if ( (bits64
) ( aSig0
<<1 ) == 0 ) return a
;
3079 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
3081 bSig
|= LIT64( 0x8000000000000000 );
3083 expDiff
= aExp
- bExp
;
3085 if ( expDiff
< 0 ) {
3086 if ( expDiff
< -1 ) return a
;
3087 shift128Right( aSig0
, 0, 1, &aSig0
, &aSig1
);
3090 q
= ( bSig
<= aSig0
);
3091 if ( q
) aSig0
-= bSig
;
3093 while ( 0 < expDiff
) {
3094 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
3095 q
= ( 2 < q
) ? q
- 2 : 0;
3096 mul64To128( bSig
, q
, &term0
, &term1
);
3097 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
3098 shortShift128Left( aSig0
, aSig1
, 62, &aSig0
, &aSig1
);
3102 if ( 0 < expDiff
) {
3103 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
3104 q
= ( 2 < q
) ? q
- 2 : 0;
3106 mul64To128( bSig
, q
<<( 64 - expDiff
), &term0
, &term1
);
3107 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
3108 shortShift128Left( 0, bSig
, 64 - expDiff
, &term0
, &term1
);
3109 while ( le128( term0
, term1
, aSig0
, aSig1
) ) {
3111 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
3118 sub128( term0
, term1
, aSig0
, aSig1
, &alternateASig0
, &alternateASig1
);
3119 if ( lt128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
3120 || ( eq128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
3123 aSig0
= alternateASig0
;
3124 aSig1
= alternateASig1
;
3129 normalizeRoundAndPackFloatx80(
3130 roundData
, zSign
, bExp
+ expDiff
, aSig0
, aSig1
);
3135 -------------------------------------------------------------------------------
3136 Returns the square root of the extended double-precision floating-point
3137 value `a'. The operation is performed according to the IEC/IEEE Standard
3138 for Binary Floating-point Arithmetic.
3139 -------------------------------------------------------------------------------
3141 floatx80
floatx80_sqrt( struct roundingData
*roundData
, floatx80 a
)
3145 bits64 aSig0
, aSig1
, zSig0
, zSig1
;
3146 bits64 rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
3147 bits64 shiftedRem0
, shiftedRem1
;
3150 aSig0
= extractFloatx80Frac( a
);
3151 aExp
= extractFloatx80Exp( a
);
3152 aSign
= extractFloatx80Sign( a
);
3153 if ( aExp
== 0x7FFF ) {
3154 if ( (bits64
) ( aSig0
<<1 ) ) return propagateFloatx80NaN( a
, a
);
3155 if ( ! aSign
) return a
;
3159 if ( ( aExp
| aSig0
) == 0 ) return a
;
3161 roundData
->exception
|= float_flag_invalid
;
3162 z
.low
= floatx80_default_nan_low
;
3163 z
.high
= floatx80_default_nan_high
;
3168 if ( aSig0
== 0 ) return packFloatx80( 0, 0, 0 );
3169 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
3171 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFF;
3172 zSig0
= estimateSqrt32( aExp
, aSig0
>>32 );
3175 shift128Right( aSig0
, 0, ( aExp
& 1 ) + 2, &aSig0
, &aSig1
);
3176 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
) + zSig0
+ 4;
3177 if ( 0 <= (sbits64
) zSig0
) zSig0
= LIT64( 0xFFFFFFFFFFFFFFFF );
3178 shortShift128Left( aSig0
, aSig1
, 2, &aSig0
, &aSig1
);
3179 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
3180 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
3181 while ( (sbits64
) rem0
< 0 ) {
3183 shortShift128Left( 0, zSig0
, 1, &term0
, &term1
);
3185 add128( rem0
, rem1
, term0
, term1
, &rem0
, &rem1
);
3187 shortShift128Left( rem0
, rem1
, 63, &shiftedRem0
, &shiftedRem1
);
3188 zSig1
= estimateDiv128To64( shiftedRem0
, shiftedRem1
, zSig0
);
3189 if ( (bits64
) ( zSig1
<<1 ) <= 10 ) {
3190 if ( zSig1
== 0 ) zSig1
= 1;
3191 mul64To128( zSig0
, zSig1
, &term1
, &term2
);
3192 shortShift128Left( term1
, term2
, 1, &term1
, &term2
);
3193 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
3194 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
3195 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
3196 while ( (sbits64
) rem1
< 0 ) {
3198 shortShift192Left( 0, zSig0
, zSig1
, 1, &term1
, &term2
, &term3
);
3201 rem1
, rem2
, rem3
, term1
, term2
, term3
, &rem1
, &rem2
, &rem3
);
3203 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
3206 roundAndPackFloatx80(
3207 roundData
, 0, zExp
, zSig0
, zSig1
);
3212 -------------------------------------------------------------------------------
3213 Returns 1 if the extended double-precision floating-point value `a' is
3214 equal to the corresponding value `b', and 0 otherwise. The comparison is
3215 performed according to the IEC/IEEE Standard for Binary Floating-point
3217 -------------------------------------------------------------------------------
3219 flag
floatx80_eq( floatx80 a
, floatx80 b
)
3222 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
3223 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
3224 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
3225 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
3227 if ( floatx80_is_signaling_nan( a
)
3228 || floatx80_is_signaling_nan( b
) ) {
3229 float_raise( float_flag_invalid
);
3235 && ( ( a
.high
== b
.high
)
3237 && ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
3243 -------------------------------------------------------------------------------
3244 Returns 1 if the extended double-precision floating-point value `a' is
3245 less than or equal to the corresponding value `b', and 0 otherwise. The
3246 comparison is performed according to the IEC/IEEE Standard for Binary
3247 Floating-point Arithmetic.
3248 -------------------------------------------------------------------------------
3250 flag
floatx80_le( floatx80 a
, floatx80 b
)
3254 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
3255 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
3256 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
3257 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
3259 float_raise( float_flag_invalid
);
3262 aSign
= extractFloatx80Sign( a
);
3263 bSign
= extractFloatx80Sign( b
);
3264 if ( aSign
!= bSign
) {
3267 || ( ( ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
3271 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
3272 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
3277 -------------------------------------------------------------------------------
3278 Returns 1 if the extended double-precision floating-point value `a' is
3279 less than the corresponding value `b', and 0 otherwise. The comparison
3280 is performed according to the IEC/IEEE Standard for Binary Floating-point
3282 -------------------------------------------------------------------------------
3284 flag
floatx80_lt( floatx80 a
, floatx80 b
)
3288 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
3289 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
3290 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
3291 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
3293 float_raise( float_flag_invalid
);
3296 aSign
= extractFloatx80Sign( a
);
3297 bSign
= extractFloatx80Sign( b
);
3298 if ( aSign
!= bSign
) {
3301 && ( ( ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
3305 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
3306 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
3311 -------------------------------------------------------------------------------
3312 Returns 1 if the extended double-precision floating-point value `a' is equal
3313 to the corresponding value `b', and 0 otherwise. The invalid exception is
3314 raised if either operand is a NaN. Otherwise, the comparison is performed
3315 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3316 -------------------------------------------------------------------------------
3318 flag
floatx80_eq_signaling( floatx80 a
, floatx80 b
)
3321 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
3322 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
3323 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
3324 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
3326 float_raise( float_flag_invalid
);
3331 && ( ( a
.high
== b
.high
)
3333 && ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
3339 -------------------------------------------------------------------------------
3340 Returns 1 if the extended double-precision floating-point value `a' is less
3341 than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
3342 do not cause an exception. Otherwise, the comparison is performed according
3343 to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3344 -------------------------------------------------------------------------------
3346 flag
floatx80_le_quiet( floatx80 a
, floatx80 b
)
3350 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
3351 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
3352 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
3353 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
3355 /* Do nothing, even if NaN as we're quiet */
3358 aSign
= extractFloatx80Sign( a
);
3359 bSign
= extractFloatx80Sign( b
);
3360 if ( aSign
!= bSign
) {
3363 || ( ( ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
3367 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
3368 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
3373 -------------------------------------------------------------------------------
3374 Returns 1 if the extended double-precision floating-point value `a' is less
3375 than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
3376 an exception. Otherwise, the comparison is performed according to the
3377 IEC/IEEE Standard for Binary Floating-point Arithmetic.
3378 -------------------------------------------------------------------------------
3380 flag
floatx80_lt_quiet( floatx80 a
, floatx80 b
)
3384 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
3385 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
3386 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
3387 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
3389 /* Do nothing, even if NaN as we're quiet */
3392 aSign
= extractFloatx80Sign( a
);
3393 bSign
= extractFloatx80Sign( b
);
3394 if ( aSign
!= bSign
) {
3397 && ( ( ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
3401 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
3402 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);