GUI: Fix Tomato RAF theme for all builds. Compilation typo.
[tomato.git] / release / src-rt-6.x.4708 / linux / linux-2.6.36 / arch / arm / nwfpe / softfloat.c
blobeff49745ae4021a730f426faf46fd893fa2ba3ff
1 /*
2 ===============================================================================
4 This C source file is part of the SoftFloat IEC/IEEE Floating-point
5 Arithmetic Package, Release 2.
7 Written by John R. Hauser. This work was made possible in part by the
8 International Computer Science Institute, located at Suite 600, 1947 Center
9 Street, Berkeley, California 94704. Funding was partially provided by the
10 National Science Foundation under grant MIP-9311980. The original version
11 of this code was written as part of a project to build a fixed-point vector
12 processor in collaboration with the University of California at Berkeley,
13 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
14 is available through the web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
15 arithmetic/softfloat.html'.
17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
18 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
19 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
20 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
21 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
23 Derivative works are acceptable, even for commercial purposes, so long as
24 (1) they include prominent notice that the work is derivative, and (2) they
25 include prominent notice akin to these three paragraphs for those parts of
26 this code that are retained.
28 ===============================================================================
31 #include <asm/div64.h>
33 #include "fpa11.h"
34 //#include "milieu.h"
35 //#include "softfloat.h"
38 -------------------------------------------------------------------------------
39 Primitive arithmetic functions, including multi-word arithmetic, and
40 division and square root approximations. (Can be specialized to target if
41 desired.)
42 -------------------------------------------------------------------------------
44 #include "softfloat-macros"
47 -------------------------------------------------------------------------------
48 Functions and definitions to determine: (1) whether tininess for underflow
49 is detected before or after rounding by default, (2) what (if anything)
50 happens when exceptions are raised, (3) how signaling NaNs are distinguished
51 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
52 are propagated from function inputs to output. These details are target-
53 specific.
54 -------------------------------------------------------------------------------
56 #include "softfloat-specialize"
59 -------------------------------------------------------------------------------
60 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
61 and 7, and returns the properly rounded 32-bit integer corresponding to the
62 input. If `zSign' is nonzero, the input is negated before being converted
63 to an integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point
64 input is simply rounded to an integer, with the inexact exception raised if
65 the input cannot be represented exactly as an integer. If the fixed-point
66 input is too large, however, the invalid exception is raised and the largest
67 positive or negative integer is returned.
68 -------------------------------------------------------------------------------
70 static int32 roundAndPackInt32( struct roundingData *roundData, flag zSign, bits64 absZ )
72 int8 roundingMode;
73 flag roundNearestEven;
74 int8 roundIncrement, roundBits;
75 int32 z;
77 roundingMode = roundData->mode;
78 roundNearestEven = ( roundingMode == float_round_nearest_even );
79 roundIncrement = 0x40;
80 if ( ! roundNearestEven ) {
81 if ( roundingMode == float_round_to_zero ) {
82 roundIncrement = 0;
84 else {
85 roundIncrement = 0x7F;
86 if ( zSign ) {
87 if ( roundingMode == float_round_up ) roundIncrement = 0;
89 else {
90 if ( roundingMode == float_round_down ) roundIncrement = 0;
94 roundBits = absZ & 0x7F;
95 absZ = ( absZ + roundIncrement )>>7;
96 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
97 z = absZ;
98 if ( zSign ) z = - z;
99 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
100 roundData->exception |= float_flag_invalid;
101 return zSign ? 0x80000000 : 0x7FFFFFFF;
103 if ( roundBits ) roundData->exception |= float_flag_inexact;
104 return z;
109 -------------------------------------------------------------------------------
110 Returns the fraction bits of the single-precision floating-point value `a'.
111 -------------------------------------------------------------------------------
113 INLINE bits32 extractFloat32Frac( float32 a )
116 return a & 0x007FFFFF;
121 -------------------------------------------------------------------------------
122 Returns the exponent bits of the single-precision floating-point value `a'.
123 -------------------------------------------------------------------------------
125 INLINE int16 extractFloat32Exp( float32 a )
128 return ( a>>23 ) & 0xFF;
133 -------------------------------------------------------------------------------
134 Returns the sign bit of the single-precision floating-point value `a'.
135 -------------------------------------------------------------------------------
139 -------------------------------------------------------------------------------
140 Normalizes the subnormal single-precision floating-point value represented
141 by the denormalized significand `aSig'. The normalized exponent and
142 significand are stored at the locations pointed to by `zExpPtr' and
143 `zSigPtr', respectively.
144 -------------------------------------------------------------------------------
146 static void
147 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
149 int8 shiftCount;
151 shiftCount = countLeadingZeros32( aSig ) - 8;
152 *zSigPtr = aSig<<shiftCount;
153 *zExpPtr = 1 - shiftCount;
158 -------------------------------------------------------------------------------
159 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
160 single-precision floating-point value, returning the result. After being
161 shifted into the proper positions, the three fields are simply added
162 together to form the result. This means that any integer portion of `zSig'
163 will be added into the exponent. Since a properly normalized significand
164 will have an integer portion equal to 1, the `zExp' input should be 1 less
165 than the desired result exponent whenever `zSig' is a complete, normalized
166 significand.
167 -------------------------------------------------------------------------------
169 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
171 return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
175 -------------------------------------------------------------------------------
176 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
177 and significand `zSig', and returns the proper single-precision floating-
178 point value corresponding to the abstract input. Ordinarily, the abstract
179 value is simply rounded and packed into the single-precision format, with
180 the inexact exception raised if the abstract input cannot be represented
181 exactly. If the abstract value is too large, however, the overflow and
182 inexact exceptions are raised and an infinity or maximal finite value is
183 returned. If the abstract value is too small, the input value is rounded to
184 a subnormal number, and the underflow and inexact exceptions are raised if
185 the abstract input cannot be represented exactly as a subnormal single-
186 precision floating-point number.
187 The input significand `zSig' has its binary point between bits 30
188 and 29, which is 7 bits to the left of the usual location. This shifted
189 significand must be normalized or smaller. If `zSig' is not normalized,
190 `zExp' must be 0; in that case, the result returned is a subnormal number,
191 and it must not require rounding. In the usual case that `zSig' is
192 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
193 The handling of underflow and overflow follows the IEC/IEEE Standard for
194 Binary Floating-point Arithmetic.
195 -------------------------------------------------------------------------------
197 static float32 roundAndPackFloat32( struct roundingData *roundData, flag zSign, int16 zExp, bits32 zSig )
199 int8 roundingMode;
200 flag roundNearestEven;
201 int8 roundIncrement, roundBits;
202 flag isTiny;
204 roundingMode = roundData->mode;
205 roundNearestEven = ( roundingMode == float_round_nearest_even );
206 roundIncrement = 0x40;
207 if ( ! roundNearestEven ) {
208 if ( roundingMode == float_round_to_zero ) {
209 roundIncrement = 0;
211 else {
212 roundIncrement = 0x7F;
213 if ( zSign ) {
214 if ( roundingMode == float_round_up ) roundIncrement = 0;
216 else {
217 if ( roundingMode == float_round_down ) roundIncrement = 0;
221 roundBits = zSig & 0x7F;
222 if ( 0xFD <= (bits16) zExp ) {
223 if ( ( 0xFD < zExp )
224 || ( ( zExp == 0xFD )
225 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
227 roundData->exception |= float_flag_overflow | float_flag_inexact;
228 return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
230 if ( zExp < 0 ) {
231 isTiny =
232 ( float_detect_tininess == float_tininess_before_rounding )
233 || ( zExp < -1 )
234 || ( zSig + roundIncrement < 0x80000000 );
235 shift32RightJamming( zSig, - zExp, &zSig );
236 zExp = 0;
237 roundBits = zSig & 0x7F;
238 if ( isTiny && roundBits ) roundData->exception |= float_flag_underflow;
241 if ( roundBits ) roundData->exception |= float_flag_inexact;
242 zSig = ( zSig + roundIncrement )>>7;
243 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
244 if ( zSig == 0 ) zExp = 0;
245 return packFloat32( zSign, zExp, zSig );
250 -------------------------------------------------------------------------------
251 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
252 and significand `zSig', and returns the proper single-precision floating-
253 point value corresponding to the abstract input. This routine is just like
254 `roundAndPackFloat32' except that `zSig' does not have to be normalized in
255 any way. In all cases, `zExp' must be 1 less than the ``true'' floating-
256 point exponent.
257 -------------------------------------------------------------------------------
259 static float32
260 normalizeRoundAndPackFloat32( struct roundingData *roundData, flag zSign, int16 zExp, bits32 zSig )
262 int8 shiftCount;
264 shiftCount = countLeadingZeros32( zSig ) - 1;
265 return roundAndPackFloat32( roundData, zSign, zExp - shiftCount, zSig<<shiftCount );
270 -------------------------------------------------------------------------------
271 Returns the fraction bits of the double-precision floating-point value `a'.
272 -------------------------------------------------------------------------------
274 INLINE bits64 extractFloat64Frac( float64 a )
277 return a & LIT64( 0x000FFFFFFFFFFFFF );
282 -------------------------------------------------------------------------------
283 Returns the exponent bits of the double-precision floating-point value `a'.
284 -------------------------------------------------------------------------------
286 INLINE int16 extractFloat64Exp( float64 a )
289 return ( a>>52 ) & 0x7FF;
294 -------------------------------------------------------------------------------
295 Returns the sign bit of the double-precision floating-point value `a'.
296 -------------------------------------------------------------------------------
300 -------------------------------------------------------------------------------
301 Normalizes the subnormal double-precision floating-point value represented
302 by the denormalized significand `aSig'. The normalized exponent and
303 significand are stored at the locations pointed to by `zExpPtr' and
304 `zSigPtr', respectively.
305 -------------------------------------------------------------------------------
307 static void
308 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
310 int8 shiftCount;
312 shiftCount = countLeadingZeros64( aSig ) - 11;
313 *zSigPtr = aSig<<shiftCount;
314 *zExpPtr = 1 - shiftCount;
319 -------------------------------------------------------------------------------
320 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
321 double-precision floating-point value, returning the result. After being
322 shifted into the proper positions, the three fields are simply added
323 together to form the result. This means that any integer portion of `zSig'
324 will be added into the exponent. Since a properly normalized significand
325 will have an integer portion equal to 1, the `zExp' input should be 1 less
326 than the desired result exponent whenever `zSig' is a complete, normalized
327 significand.
328 -------------------------------------------------------------------------------
330 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
333 return ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig;
338 -------------------------------------------------------------------------------
339 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
340 and significand `zSig', and returns the proper double-precision floating-
341 point value corresponding to the abstract input. Ordinarily, the abstract
342 value is simply rounded and packed into the double-precision format, with
343 the inexact exception raised if the abstract input cannot be represented
344 exactly. If the abstract value is too large, however, the overflow and
345 inexact exceptions are raised and an infinity or maximal finite value is
346 returned. If the abstract value is too small, the input value is rounded to
347 a subnormal number, and the underflow and inexact exceptions are raised if
348 the abstract input cannot be represented exactly as a subnormal double-
349 precision floating-point number.
350 The input significand `zSig' has its binary point between bits 62
351 and 61, which is 10 bits to the left of the usual location. This shifted
352 significand must be normalized or smaller. If `zSig' is not normalized,
353 `zExp' must be 0; in that case, the result returned is a subnormal number,
354 and it must not require rounding. In the usual case that `zSig' is
355 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
356 The handling of underflow and overflow follows the IEC/IEEE Standard for
357 Binary Floating-point Arithmetic.
358 -------------------------------------------------------------------------------
360 static float64 roundAndPackFloat64( struct roundingData *roundData, flag zSign, int16 zExp, bits64 zSig )
362 int8 roundingMode;
363 flag roundNearestEven;
364 int16 roundIncrement, roundBits;
365 flag isTiny;
367 roundingMode = roundData->mode;
368 roundNearestEven = ( roundingMode == float_round_nearest_even );
369 roundIncrement = 0x200;
370 if ( ! roundNearestEven ) {
371 if ( roundingMode == float_round_to_zero ) {
372 roundIncrement = 0;
374 else {
375 roundIncrement = 0x3FF;
376 if ( zSign ) {
377 if ( roundingMode == float_round_up ) roundIncrement = 0;
379 else {
380 if ( roundingMode == float_round_down ) roundIncrement = 0;
384 roundBits = zSig & 0x3FF;
385 if ( 0x7FD <= (bits16) zExp ) {
386 if ( ( 0x7FD < zExp )
387 || ( ( zExp == 0x7FD )
388 && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
390 //register int lr = __builtin_return_address(0);
391 //printk("roundAndPackFloat64 called from 0x%08x\n",lr);
392 roundData->exception |= float_flag_overflow | float_flag_inexact;
393 return packFloat64( zSign, 0x7FF, 0 ) - ( roundIncrement == 0 );
395 if ( zExp < 0 ) {
396 isTiny =
397 ( float_detect_tininess == float_tininess_before_rounding )
398 || ( zExp < -1 )
399 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
400 shift64RightJamming( zSig, - zExp, &zSig );
401 zExp = 0;
402 roundBits = zSig & 0x3FF;
403 if ( isTiny && roundBits ) roundData->exception |= float_flag_underflow;
406 if ( roundBits ) roundData->exception |= float_flag_inexact;
407 zSig = ( zSig + roundIncrement )>>10;
408 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
409 if ( zSig == 0 ) zExp = 0;
410 return packFloat64( zSign, zExp, zSig );
415 -------------------------------------------------------------------------------
416 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
417 and significand `zSig', and returns the proper double-precision floating-
418 point value corresponding to the abstract input. This routine is just like
419 `roundAndPackFloat64' except that `zSig' does not have to be normalized in
420 any way. In all cases, `zExp' must be 1 less than the ``true'' floating-
421 point exponent.
422 -------------------------------------------------------------------------------
424 static float64
425 normalizeRoundAndPackFloat64( struct roundingData *roundData, flag zSign, int16 zExp, bits64 zSig )
427 int8 shiftCount;
429 shiftCount = countLeadingZeros64( zSig ) - 1;
430 return roundAndPackFloat64( roundData, zSign, zExp - shiftCount, zSig<<shiftCount );
434 #ifdef FLOATX80
437 -------------------------------------------------------------------------------
438 Returns the fraction bits of the extended double-precision floating-point
439 value `a'.
440 -------------------------------------------------------------------------------
442 INLINE bits64 extractFloatx80Frac( floatx80 a )
445 return a.low;
450 -------------------------------------------------------------------------------
451 Returns the exponent bits of the extended double-precision floating-point
452 value `a'.
453 -------------------------------------------------------------------------------
455 INLINE int32 extractFloatx80Exp( floatx80 a )
458 return a.high & 0x7FFF;
463 -------------------------------------------------------------------------------
464 Returns the sign bit of the extended double-precision floating-point value
465 `a'.
466 -------------------------------------------------------------------------------
468 INLINE flag extractFloatx80Sign( floatx80 a )
471 return a.high>>15;
476 -------------------------------------------------------------------------------
477 Normalizes the subnormal extended double-precision floating-point value
478 represented by the denormalized significand `aSig'. The normalized exponent
479 and significand are stored at the locations pointed to by `zExpPtr' and
480 `zSigPtr', respectively.
481 -------------------------------------------------------------------------------
483 static void
484 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
486 int8 shiftCount;
488 shiftCount = countLeadingZeros64( aSig );
489 *zSigPtr = aSig<<shiftCount;
490 *zExpPtr = 1 - shiftCount;
495 -------------------------------------------------------------------------------
496 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
497 extended double-precision floating-point value, returning the result.
498 -------------------------------------------------------------------------------
500 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
502 floatx80 z;
504 z.low = zSig;
505 z.high = ( ( (bits16) zSign )<<15 ) + zExp;
506 z.__padding = 0;
507 return z;
512 -------------------------------------------------------------------------------
513 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
514 and extended significand formed by the concatenation of `zSig0' and `zSig1',
515 and returns the proper extended double-precision floating-point value
516 corresponding to the abstract input. Ordinarily, the abstract value is
517 rounded and packed into the extended double-precision format, with the
518 inexact exception raised if the abstract input cannot be represented
519 exactly. If the abstract value is too large, however, the overflow and
520 inexact exceptions are raised and an infinity or maximal finite value is
521 returned. If the abstract value is too small, the input value is rounded to
522 a subnormal number, and the underflow and inexact exceptions are raised if
523 the abstract input cannot be represented exactly as a subnormal extended
524 double-precision floating-point number.
525 If `roundingPrecision' is 32 or 64, the result is rounded to the same
526 number of bits as single or double precision, respectively. Otherwise, the
527 result is rounded to the full precision of the extended double-precision
528 format.
529 The input significand must be normalized or smaller. If the input
530 significand is not normalized, `zExp' must be 0; in that case, the result
531 returned is a subnormal number, and it must not require rounding. The
532 handling of underflow and overflow follows the IEC/IEEE Standard for Binary
533 Floating-point Arithmetic.
534 -------------------------------------------------------------------------------
536 static floatx80
537 roundAndPackFloatx80(
538 struct roundingData *roundData, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
541 int8 roundingMode, roundingPrecision;
542 flag roundNearestEven, increment, isTiny;
543 int64 roundIncrement, roundMask, roundBits;
545 roundingMode = roundData->mode;
546 roundingPrecision = roundData->precision;
547 roundNearestEven = ( roundingMode == float_round_nearest_even );
548 if ( roundingPrecision == 80 ) goto precision80;
549 if ( roundingPrecision == 64 ) {
550 roundIncrement = LIT64( 0x0000000000000400 );
551 roundMask = LIT64( 0x00000000000007FF );
553 else if ( roundingPrecision == 32 ) {
554 roundIncrement = LIT64( 0x0000008000000000 );
555 roundMask = LIT64( 0x000000FFFFFFFFFF );
557 else {
558 goto precision80;
560 zSig0 |= ( zSig1 != 0 );
561 if ( ! roundNearestEven ) {
562 if ( roundingMode == float_round_to_zero ) {
563 roundIncrement = 0;
565 else {
566 roundIncrement = roundMask;
567 if ( zSign ) {
568 if ( roundingMode == float_round_up ) roundIncrement = 0;
570 else {
571 if ( roundingMode == float_round_down ) roundIncrement = 0;
575 roundBits = zSig0 & roundMask;
576 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
577 if ( ( 0x7FFE < zExp )
578 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
580 goto overflow;
582 if ( zExp <= 0 ) {
583 isTiny =
584 ( float_detect_tininess == float_tininess_before_rounding )
585 || ( zExp < 0 )
586 || ( zSig0 <= zSig0 + roundIncrement );
587 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
588 zExp = 0;
589 roundBits = zSig0 & roundMask;
590 if ( isTiny && roundBits ) roundData->exception |= float_flag_underflow;
591 if ( roundBits ) roundData->exception |= float_flag_inexact;
592 zSig0 += roundIncrement;
593 if ( (sbits64) zSig0 < 0 ) zExp = 1;
594 roundIncrement = roundMask + 1;
595 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
596 roundMask |= roundIncrement;
598 zSig0 &= ~ roundMask;
599 return packFloatx80( zSign, zExp, zSig0 );
602 if ( roundBits ) roundData->exception |= float_flag_inexact;
603 zSig0 += roundIncrement;
604 if ( zSig0 < roundIncrement ) {
605 ++zExp;
606 zSig0 = LIT64( 0x8000000000000000 );
608 roundIncrement = roundMask + 1;
609 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
610 roundMask |= roundIncrement;
612 zSig0 &= ~ roundMask;
613 if ( zSig0 == 0 ) zExp = 0;
614 return packFloatx80( zSign, zExp, zSig0 );
615 precision80:
616 increment = ( (sbits64) zSig1 < 0 );
617 if ( ! roundNearestEven ) {
618 if ( roundingMode == float_round_to_zero ) {
619 increment = 0;
621 else {
622 if ( zSign ) {
623 increment = ( roundingMode == float_round_down ) && zSig1;
625 else {
626 increment = ( roundingMode == float_round_up ) && zSig1;
630 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
631 if ( ( 0x7FFE < zExp )
632 || ( ( zExp == 0x7FFE )
633 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
634 && increment
637 roundMask = 0;
638 overflow:
639 roundData->exception |= float_flag_overflow | float_flag_inexact;
640 if ( ( roundingMode == float_round_to_zero )
641 || ( zSign && ( roundingMode == float_round_up ) )
642 || ( ! zSign && ( roundingMode == float_round_down ) )
644 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
646 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
648 if ( zExp <= 0 ) {
649 isTiny =
650 ( float_detect_tininess == float_tininess_before_rounding )
651 || ( zExp < 0 )
652 || ! increment
653 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
654 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
655 zExp = 0;
656 if ( isTiny && zSig1 ) roundData->exception |= float_flag_underflow;
657 if ( zSig1 ) roundData->exception |= float_flag_inexact;
658 if ( roundNearestEven ) {
659 increment = ( (sbits64) zSig1 < 0 );
661 else {
662 if ( zSign ) {
663 increment = ( roundingMode == float_round_down ) && zSig1;
665 else {
666 increment = ( roundingMode == float_round_up ) && zSig1;
669 if ( increment ) {
670 ++zSig0;
671 zSig0 &= ~ ( ( zSig1 + zSig1 == 0 ) & roundNearestEven );
672 if ( (sbits64) zSig0 < 0 ) zExp = 1;
674 return packFloatx80( zSign, zExp, zSig0 );
677 if ( zSig1 ) roundData->exception |= float_flag_inexact;
678 if ( increment ) {
679 ++zSig0;
680 if ( zSig0 == 0 ) {
681 ++zExp;
682 zSig0 = LIT64( 0x8000000000000000 );
684 else {
685 zSig0 &= ~ ( ( zSig1 + zSig1 == 0 ) & roundNearestEven );
688 else {
689 if ( zSig0 == 0 ) zExp = 0;
692 return packFloatx80( zSign, zExp, zSig0 );
696 -------------------------------------------------------------------------------
697 Takes an abstract floating-point value having sign `zSign', exponent
698 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
699 and returns the proper extended double-precision floating-point value
700 corresponding to the abstract input. This routine is just like
701 `roundAndPackFloatx80' except that the input significand does not have to be
702 normalized.
703 -------------------------------------------------------------------------------
705 static floatx80
706 normalizeRoundAndPackFloatx80(
707 struct roundingData *roundData, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
710 int8 shiftCount;
712 if ( zSig0 == 0 ) {
713 zSig0 = zSig1;
714 zSig1 = 0;
715 zExp -= 64;
717 shiftCount = countLeadingZeros64( zSig0 );
718 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
719 zExp -= shiftCount;
720 return
721 roundAndPackFloatx80( roundData, zSign, zExp, zSig0, zSig1 );
725 #endif
728 -------------------------------------------------------------------------------
729 Returns the result of converting the 32-bit two's complement integer `a' to
730 the single-precision floating-point format. The conversion is performed
731 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
732 -------------------------------------------------------------------------------
734 float32 int32_to_float32(struct roundingData *roundData, int32 a)
736 flag zSign;
738 if ( a == 0 ) return 0;
739 if ( a == 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
740 zSign = ( a < 0 );
741 return normalizeRoundAndPackFloat32( roundData, zSign, 0x9C, zSign ? - a : a );
746 -------------------------------------------------------------------------------
747 Returns the result of converting the 32-bit two's complement integer `a' to
748 the double-precision floating-point format. The conversion is performed
749 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
750 -------------------------------------------------------------------------------
752 float64 int32_to_float64( int32 a )
754 flag aSign;
755 uint32 absA;
756 int8 shiftCount;
757 bits64 zSig;
759 if ( a == 0 ) return 0;
760 aSign = ( a < 0 );
761 absA = aSign ? - a : a;
762 shiftCount = countLeadingZeros32( absA ) + 21;
763 zSig = absA;
764 return packFloat64( aSign, 0x432 - shiftCount, zSig<<shiftCount );
768 #ifdef FLOATX80
771 -------------------------------------------------------------------------------
772 Returns the result of converting the 32-bit two's complement integer `a'
773 to the extended double-precision floating-point format. The conversion
774 is performed according to the IEC/IEEE Standard for Binary Floating-point
775 Arithmetic.
776 -------------------------------------------------------------------------------
778 floatx80 int32_to_floatx80( int32 a )
780 flag zSign;
781 uint32 absA;
782 int8 shiftCount;
783 bits64 zSig;
785 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
786 zSign = ( a < 0 );
787 absA = zSign ? - a : a;
788 shiftCount = countLeadingZeros32( absA ) + 32;
789 zSig = absA;
790 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
794 #endif
797 -------------------------------------------------------------------------------
798 Returns the result of converting the single-precision floating-point value
799 `a' to the 32-bit two's complement integer format. The conversion is
800 performed according to the IEC/IEEE Standard for Binary Floating-point
801 Arithmetic---which means in particular that the conversion is rounded
802 according to the current rounding mode. If `a' is a NaN, the largest
803 positive integer is returned. Otherwise, if the conversion overflows, the
804 largest integer with the same sign as `a' is returned.
805 -------------------------------------------------------------------------------
807 int32 float32_to_int32( struct roundingData *roundData, float32 a )
809 flag aSign;
810 int16 aExp, shiftCount;
811 bits32 aSig;
812 bits64 zSig;
814 aSig = extractFloat32Frac( a );
815 aExp = extractFloat32Exp( a );
816 aSign = extractFloat32Sign( a );
817 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
818 if ( aExp ) aSig |= 0x00800000;
819 shiftCount = 0xAF - aExp;
820 zSig = aSig;
821 zSig <<= 32;
822 if ( 0 < shiftCount ) shift64RightJamming( zSig, shiftCount, &zSig );
823 return roundAndPackInt32( roundData, aSign, zSig );
828 -------------------------------------------------------------------------------
829 Returns the result of converting the single-precision floating-point value
830 `a' to the 32-bit two's complement integer format. The conversion is
831 performed according to the IEC/IEEE Standard for Binary Floating-point
832 Arithmetic, except that the conversion is always rounded toward zero. If
833 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
834 conversion overflows, the largest integer with the same sign as `a' is
835 returned.
836 -------------------------------------------------------------------------------
838 int32 float32_to_int32_round_to_zero( float32 a )
840 flag aSign;
841 int16 aExp, shiftCount;
842 bits32 aSig;
843 int32 z;
845 aSig = extractFloat32Frac( a );
846 aExp = extractFloat32Exp( a );
847 aSign = extractFloat32Sign( a );
848 shiftCount = aExp - 0x9E;
849 if ( 0 <= shiftCount ) {
850 if ( a == 0xCF000000 ) return 0x80000000;
851 float_raise( float_flag_invalid );
852 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
853 return 0x80000000;
855 else if ( aExp <= 0x7E ) {
856 if ( aExp | aSig ) float_raise( float_flag_inexact );
857 return 0;
859 aSig = ( aSig | 0x00800000 )<<8;
860 z = aSig>>( - shiftCount );
861 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
862 float_raise( float_flag_inexact );
864 return aSign ? - z : z;
869 -------------------------------------------------------------------------------
870 Returns the result of converting the single-precision floating-point value
871 `a' to the double-precision floating-point format. The conversion is
872 performed according to the IEC/IEEE Standard for Binary Floating-point
873 Arithmetic.
874 -------------------------------------------------------------------------------
876 float64 float32_to_float64( float32 a )
878 flag aSign;
879 int16 aExp;
880 bits32 aSig;
882 aSig = extractFloat32Frac( a );
883 aExp = extractFloat32Exp( a );
884 aSign = extractFloat32Sign( a );
885 if ( aExp == 0xFF ) {
886 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
887 return packFloat64( aSign, 0x7FF, 0 );
889 if ( aExp == 0 ) {
890 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
891 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
892 --aExp;
894 return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
898 #ifdef FLOATX80
901 -------------------------------------------------------------------------------
902 Returns the result of converting the single-precision floating-point value
903 `a' to the extended double-precision floating-point format. The conversion
904 is performed according to the IEC/IEEE Standard for Binary Floating-point
905 Arithmetic.
906 -------------------------------------------------------------------------------
908 floatx80 float32_to_floatx80( float32 a )
910 flag aSign;
911 int16 aExp;
912 bits32 aSig;
914 aSig = extractFloat32Frac( a );
915 aExp = extractFloat32Exp( a );
916 aSign = extractFloat32Sign( a );
917 if ( aExp == 0xFF ) {
918 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
919 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
921 if ( aExp == 0 ) {
922 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
923 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
925 aSig |= 0x00800000;
926 return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
930 #endif
933 -------------------------------------------------------------------------------
934 Rounds the single-precision floating-point value `a' to an integer, and
935 returns the result as a single-precision floating-point value. The
936 operation is performed according to the IEC/IEEE Standard for Binary
937 Floating-point Arithmetic.
938 -------------------------------------------------------------------------------
940 float32 float32_round_to_int( struct roundingData *roundData, float32 a )
942 flag aSign;
943 int16 aExp;
944 bits32 lastBitMask, roundBitsMask;
945 int8 roundingMode;
946 float32 z;
948 aExp = extractFloat32Exp( a );
949 if ( 0x96 <= aExp ) {
950 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
951 return propagateFloat32NaN( a, a );
953 return a;
955 roundingMode = roundData->mode;
956 if ( aExp <= 0x7E ) {
957 if ( (bits32) ( a<<1 ) == 0 ) return a;
958 roundData->exception |= float_flag_inexact;
959 aSign = extractFloat32Sign( a );
960 switch ( roundingMode ) {
961 case float_round_nearest_even:
962 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
963 return packFloat32( aSign, 0x7F, 0 );
965 break;
966 case float_round_down:
967 return aSign ? 0xBF800000 : 0;
968 case float_round_up:
969 return aSign ? 0x80000000 : 0x3F800000;
971 return packFloat32( aSign, 0, 0 );
973 lastBitMask = 1;
974 lastBitMask <<= 0x96 - aExp;
975 roundBitsMask = lastBitMask - 1;
976 z = a;
977 if ( roundingMode == float_round_nearest_even ) {
978 z += lastBitMask>>1;
979 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
981 else if ( roundingMode != float_round_to_zero ) {
982 if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
983 z += roundBitsMask;
986 z &= ~ roundBitsMask;
987 if ( z != a ) roundData->exception |= float_flag_inexact;
988 return z;
993 -------------------------------------------------------------------------------
994 Returns the result of adding the absolute values of the single-precision
995 floating-point values `a' and `b'. If `zSign' is true, the sum is negated
996 before being returned. `zSign' is ignored if the result is a NaN. The
997 addition is performed according to the IEC/IEEE Standard for Binary
998 Floating-point Arithmetic.
999 -------------------------------------------------------------------------------
1001 static float32 addFloat32Sigs( struct roundingData *roundData, float32 a, float32 b, flag zSign )
1003 int16 aExp, bExp, zExp;
1004 bits32 aSig, bSig, zSig;
1005 int16 expDiff;
1007 aSig = extractFloat32Frac( a );
1008 aExp = extractFloat32Exp( a );
1009 bSig = extractFloat32Frac( b );
1010 bExp = extractFloat32Exp( b );
1011 expDiff = aExp - bExp;
1012 aSig <<= 6;
1013 bSig <<= 6;
1014 if ( 0 < expDiff ) {
1015 if ( aExp == 0xFF ) {
1016 if ( aSig ) return propagateFloat32NaN( a, b );
1017 return a;
1019 if ( bExp == 0 ) {
1020 --expDiff;
1022 else {
1023 bSig |= 0x20000000;
1025 shift32RightJamming( bSig, expDiff, &bSig );
1026 zExp = aExp;
1028 else if ( expDiff < 0 ) {
1029 if ( bExp == 0xFF ) {
1030 if ( bSig ) return propagateFloat32NaN( a, b );
1031 return packFloat32( zSign, 0xFF, 0 );
1033 if ( aExp == 0 ) {
1034 ++expDiff;
1036 else {
1037 aSig |= 0x20000000;
1039 shift32RightJamming( aSig, - expDiff, &aSig );
1040 zExp = bExp;
1042 else {
1043 if ( aExp == 0xFF ) {
1044 if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1045 return a;
1047 if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1048 zSig = 0x40000000 + aSig + bSig;
1049 zExp = aExp;
1050 goto roundAndPack;
1052 aSig |= 0x20000000;
1053 zSig = ( aSig + bSig )<<1;
1054 --zExp;
1055 if ( (sbits32) zSig < 0 ) {
1056 zSig = aSig + bSig;
1057 ++zExp;
1059 roundAndPack:
1060 return roundAndPackFloat32( roundData, zSign, zExp, zSig );
1065 -------------------------------------------------------------------------------
1066 Returns the result of subtracting the absolute values of the single-
1067 precision floating-point values `a' and `b'. If `zSign' is true, the
1068 difference is negated before being returned. `zSign' is ignored if the
1069 result is a NaN. The subtraction is performed according to the IEC/IEEE
1070 Standard for Binary Floating-point Arithmetic.
1071 -------------------------------------------------------------------------------
1073 static float32 subFloat32Sigs( struct roundingData *roundData, float32 a, float32 b, flag zSign )
1075 int16 aExp, bExp, zExp;
1076 bits32 aSig, bSig, zSig;
1077 int16 expDiff;
1079 aSig = extractFloat32Frac( a );
1080 aExp = extractFloat32Exp( a );
1081 bSig = extractFloat32Frac( b );
1082 bExp = extractFloat32Exp( b );
1083 expDiff = aExp - bExp;
1084 aSig <<= 7;
1085 bSig <<= 7;
1086 if ( 0 < expDiff ) goto aExpBigger;
1087 if ( expDiff < 0 ) goto bExpBigger;
1088 if ( aExp == 0xFF ) {
1089 if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1090 roundData->exception |= float_flag_invalid;
1091 return float32_default_nan;
1093 if ( aExp == 0 ) {
1094 aExp = 1;
1095 bExp = 1;
1097 if ( bSig < aSig ) goto aBigger;
1098 if ( aSig < bSig ) goto bBigger;
1099 return packFloat32( roundData->mode == float_round_down, 0, 0 );
1100 bExpBigger:
1101 if ( bExp == 0xFF ) {
1102 if ( bSig ) return propagateFloat32NaN( a, b );
1103 return packFloat32( zSign ^ 1, 0xFF, 0 );
1105 if ( aExp == 0 ) {
1106 ++expDiff;
1108 else {
1109 aSig |= 0x40000000;
1111 shift32RightJamming( aSig, - expDiff, &aSig );
1112 bSig |= 0x40000000;
1113 bBigger:
1114 zSig = bSig - aSig;
1115 zExp = bExp;
1116 zSign ^= 1;
1117 goto normalizeRoundAndPack;
1118 aExpBigger:
1119 if ( aExp == 0xFF ) {
1120 if ( aSig ) return propagateFloat32NaN( a, b );
1121 return a;
1123 if ( bExp == 0 ) {
1124 --expDiff;
1126 else {
1127 bSig |= 0x40000000;
1129 shift32RightJamming( bSig, expDiff, &bSig );
1130 aSig |= 0x40000000;
1131 aBigger:
1132 zSig = aSig - bSig;
1133 zExp = aExp;
1134 normalizeRoundAndPack:
1135 --zExp;
1136 return normalizeRoundAndPackFloat32( roundData, zSign, zExp, zSig );
1141 -------------------------------------------------------------------------------
1142 Returns the result of adding the single-precision floating-point values `a'
1143 and `b'. The operation is performed according to the IEC/IEEE Standard for
1144 Binary Floating-point Arithmetic.
1145 -------------------------------------------------------------------------------
1147 float32 float32_add( struct roundingData *roundData, float32 a, float32 b )
1149 flag aSign, bSign;
1151 aSign = extractFloat32Sign( a );
1152 bSign = extractFloat32Sign( b );
1153 if ( aSign == bSign ) {
1154 return addFloat32Sigs( roundData, a, b, aSign );
1156 else {
1157 return subFloat32Sigs( roundData, a, b, aSign );
1163 -------------------------------------------------------------------------------
1164 Returns the result of subtracting the single-precision floating-point values
1165 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1166 for Binary Floating-point Arithmetic.
1167 -------------------------------------------------------------------------------
1169 float32 float32_sub( struct roundingData *roundData, float32 a, float32 b )
1171 flag aSign, bSign;
1173 aSign = extractFloat32Sign( a );
1174 bSign = extractFloat32Sign( b );
1175 if ( aSign == bSign ) {
1176 return subFloat32Sigs( roundData, a, b, aSign );
1178 else {
1179 return addFloat32Sigs( roundData, a, b, aSign );
1185 -------------------------------------------------------------------------------
1186 Returns the result of multiplying the single-precision floating-point values
1187 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1188 for Binary Floating-point Arithmetic.
1189 -------------------------------------------------------------------------------
1191 float32 float32_mul( struct roundingData *roundData, float32 a, float32 b )
1193 flag aSign, bSign, zSign;
1194 int16 aExp, bExp, zExp;
1195 bits32 aSig, bSig;
1196 bits64 zSig64;
1197 bits32 zSig;
1199 aSig = extractFloat32Frac( a );
1200 aExp = extractFloat32Exp( a );
1201 aSign = extractFloat32Sign( a );
1202 bSig = extractFloat32Frac( b );
1203 bExp = extractFloat32Exp( b );
1204 bSign = extractFloat32Sign( b );
1205 zSign = aSign ^ bSign;
1206 if ( aExp == 0xFF ) {
1207 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1208 return propagateFloat32NaN( a, b );
1210 if ( ( bExp | bSig ) == 0 ) {
1211 roundData->exception |= float_flag_invalid;
1212 return float32_default_nan;
1214 return packFloat32( zSign, 0xFF, 0 );
1216 if ( bExp == 0xFF ) {
1217 if ( bSig ) return propagateFloat32NaN( a, b );
1218 if ( ( aExp | aSig ) == 0 ) {
1219 roundData->exception |= float_flag_invalid;
1220 return float32_default_nan;
1222 return packFloat32( zSign, 0xFF, 0 );
1224 if ( aExp == 0 ) {
1225 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1226 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1228 if ( bExp == 0 ) {
1229 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1230 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1232 zExp = aExp + bExp - 0x7F;
1233 aSig = ( aSig | 0x00800000 )<<7;
1234 bSig = ( bSig | 0x00800000 )<<8;
1235 shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1236 zSig = zSig64;
1237 if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1238 zSig <<= 1;
1239 --zExp;
1241 return roundAndPackFloat32( roundData, zSign, zExp, zSig );
1246 -------------------------------------------------------------------------------
1247 Returns the result of dividing the single-precision floating-point value `a'
1248 by the corresponding value `b'. The operation is performed according to the
1249 IEC/IEEE Standard for Binary Floating-point Arithmetic.
1250 -------------------------------------------------------------------------------
1252 float32 float32_div( struct roundingData *roundData, float32 a, float32 b )
1254 flag aSign, bSign, zSign;
1255 int16 aExp, bExp, zExp;
1256 bits32 aSig, bSig, zSig;
1258 aSig = extractFloat32Frac( a );
1259 aExp = extractFloat32Exp( a );
1260 aSign = extractFloat32Sign( a );
1261 bSig = extractFloat32Frac( b );
1262 bExp = extractFloat32Exp( b );
1263 bSign = extractFloat32Sign( b );
1264 zSign = aSign ^ bSign;
1265 if ( aExp == 0xFF ) {
1266 if ( aSig ) return propagateFloat32NaN( a, b );
1267 if ( bExp == 0xFF ) {
1268 if ( bSig ) return propagateFloat32NaN( a, b );
1269 roundData->exception |= float_flag_invalid;
1270 return float32_default_nan;
1272 return packFloat32( zSign, 0xFF, 0 );
1274 if ( bExp == 0xFF ) {
1275 if ( bSig ) return propagateFloat32NaN( a, b );
1276 return packFloat32( zSign, 0, 0 );
1278 if ( bExp == 0 ) {
1279 if ( bSig == 0 ) {
1280 if ( ( aExp | aSig ) == 0 ) {
1281 roundData->exception |= float_flag_invalid;
1282 return float32_default_nan;
1284 roundData->exception |= float_flag_divbyzero;
1285 return packFloat32( zSign, 0xFF, 0 );
1287 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1289 if ( aExp == 0 ) {
1290 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1291 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1293 zExp = aExp - bExp + 0x7D;
1294 aSig = ( aSig | 0x00800000 )<<7;
1295 bSig = ( bSig | 0x00800000 )<<8;
1296 if ( bSig <= ( aSig + aSig ) ) {
1297 aSig >>= 1;
1298 ++zExp;
1301 bits64 tmp = ( (bits64) aSig )<<32;
1302 do_div( tmp, bSig );
1303 zSig = tmp;
1305 if ( ( zSig & 0x3F ) == 0 ) {
1306 zSig |= ( ( (bits64) bSig ) * zSig != ( (bits64) aSig )<<32 );
1308 return roundAndPackFloat32( roundData, zSign, zExp, zSig );
1313 -------------------------------------------------------------------------------
1314 Returns the remainder of the single-precision floating-point value `a'
1315 with respect to the corresponding value `b'. The operation is performed
1316 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1317 -------------------------------------------------------------------------------
1319 float32 float32_rem( struct roundingData *roundData, float32 a, float32 b )
1321 flag aSign, bSign, zSign;
1322 int16 aExp, bExp, expDiff;
1323 bits32 aSig, bSig;
1324 bits32 q;
1325 bits64 aSig64, bSig64, q64;
1326 bits32 alternateASig;
1327 sbits32 sigMean;
1329 aSig = extractFloat32Frac( a );
1330 aExp = extractFloat32Exp( a );
1331 aSign = extractFloat32Sign( a );
1332 bSig = extractFloat32Frac( b );
1333 bExp = extractFloat32Exp( b );
1334 bSign = extractFloat32Sign( b );
1335 if ( aExp == 0xFF ) {
1336 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1337 return propagateFloat32NaN( a, b );
1339 roundData->exception |= float_flag_invalid;
1340 return float32_default_nan;
1342 if ( bExp == 0xFF ) {
1343 if ( bSig ) return propagateFloat32NaN( a, b );
1344 return a;
1346 if ( bExp == 0 ) {
1347 if ( bSig == 0 ) {
1348 roundData->exception |= float_flag_invalid;
1349 return float32_default_nan;
1351 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1353 if ( aExp == 0 ) {
1354 if ( aSig == 0 ) return a;
1355 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1357 expDiff = aExp - bExp;
1358 aSig |= 0x00800000;
1359 bSig |= 0x00800000;
1360 if ( expDiff < 32 ) {
1361 aSig <<= 8;
1362 bSig <<= 8;
1363 if ( expDiff < 0 ) {
1364 if ( expDiff < -1 ) return a;
1365 aSig >>= 1;
1367 q = ( bSig <= aSig );
1368 if ( q ) aSig -= bSig;
1369 if ( 0 < expDiff ) {
1370 bits64 tmp = ( (bits64) aSig )<<32;
1371 do_div( tmp, bSig );
1372 q = tmp;
1373 q >>= 32 - expDiff;
1374 bSig >>= 2;
1375 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1377 else {
1378 aSig >>= 2;
1379 bSig >>= 2;
1382 else {
1383 if ( bSig <= aSig ) aSig -= bSig;
1384 aSig64 = ( (bits64) aSig )<<40;
1385 bSig64 = ( (bits64) bSig )<<40;
1386 expDiff -= 64;
1387 while ( 0 < expDiff ) {
1388 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1389 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1390 aSig64 = - ( ( bSig * q64 )<<38 );
1391 expDiff -= 62;
1393 expDiff += 64;
1394 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1395 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1396 q = q64>>( 64 - expDiff );
1397 bSig <<= 6;
1398 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
1400 do {
1401 alternateASig = aSig;
1402 ++q;
1403 aSig -= bSig;
1404 } while ( 0 <= (sbits32) aSig );
1405 sigMean = aSig + alternateASig;
1406 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
1407 aSig = alternateASig;
1409 zSign = ( (sbits32) aSig < 0 );
1410 if ( zSign ) aSig = - aSig;
1411 return normalizeRoundAndPackFloat32( roundData, aSign ^ zSign, bExp, aSig );
1416 -------------------------------------------------------------------------------
1417 Returns the square root of the single-precision floating-point value `a'.
1418 The operation is performed according to the IEC/IEEE Standard for Binary
1419 Floating-point Arithmetic.
1420 -------------------------------------------------------------------------------
1422 float32 float32_sqrt( struct roundingData *roundData, float32 a )
1424 flag aSign;
1425 int16 aExp, zExp;
1426 bits32 aSig, zSig;
1427 bits64 rem, term;
1429 aSig = extractFloat32Frac( a );
1430 aExp = extractFloat32Exp( a );
1431 aSign = extractFloat32Sign( a );
1432 if ( aExp == 0xFF ) {
1433 if ( aSig ) return propagateFloat32NaN( a, 0 );
1434 if ( ! aSign ) return a;
1435 roundData->exception |= float_flag_invalid;
1436 return float32_default_nan;
1438 if ( aSign ) {
1439 if ( ( aExp | aSig ) == 0 ) return a;
1440 roundData->exception |= float_flag_invalid;
1441 return float32_default_nan;
1443 if ( aExp == 0 ) {
1444 if ( aSig == 0 ) return 0;
1445 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1447 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
1448 aSig = ( aSig | 0x00800000 )<<8;
1449 zSig = estimateSqrt32( aExp, aSig ) + 2;
1450 if ( ( zSig & 0x7F ) <= 5 ) {
1451 if ( zSig < 2 ) {
1452 zSig = 0xFFFFFFFF;
1454 else {
1455 aSig >>= aExp & 1;
1456 term = ( (bits64) zSig ) * zSig;
1457 rem = ( ( (bits64) aSig )<<32 ) - term;
1458 while ( (sbits64) rem < 0 ) {
1459 --zSig;
1460 rem += ( ( (bits64) zSig )<<1 ) | 1;
1462 zSig |= ( rem != 0 );
1465 shift32RightJamming( zSig, 1, &zSig );
1466 return roundAndPackFloat32( roundData, 0, zExp, zSig );
1471 -------------------------------------------------------------------------------
1472 Returns 1 if the single-precision floating-point value `a' is equal to the
1473 corresponding value `b', and 0 otherwise. The comparison is performed
1474 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1475 -------------------------------------------------------------------------------
1477 flag float32_eq( float32 a, float32 b )
1480 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1481 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1483 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1484 float_raise( float_flag_invalid );
1486 return 0;
1488 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1493 -------------------------------------------------------------------------------
1494 Returns 1 if the single-precision floating-point value `a' is less than or
1495 equal to the corresponding value `b', and 0 otherwise. The comparison is
1496 performed according to the IEC/IEEE Standard for Binary Floating-point
1497 Arithmetic.
1498 -------------------------------------------------------------------------------
1500 flag float32_le( float32 a, float32 b )
1502 flag aSign, bSign;
1504 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1505 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1507 float_raise( float_flag_invalid );
1508 return 0;
1510 aSign = extractFloat32Sign( a );
1511 bSign = extractFloat32Sign( b );
1512 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1513 return ( a == b ) || ( aSign ^ ( a < b ) );
1518 -------------------------------------------------------------------------------
1519 Returns 1 if the single-precision floating-point value `a' is less than
1520 the corresponding value `b', and 0 otherwise. The comparison is performed
1521 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1522 -------------------------------------------------------------------------------
1524 flag float32_lt( float32 a, float32 b )
1526 flag aSign, bSign;
1528 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1529 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1531 float_raise( float_flag_invalid );
1532 return 0;
1534 aSign = extractFloat32Sign( a );
1535 bSign = extractFloat32Sign( b );
1536 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1537 return ( a != b ) && ( aSign ^ ( a < b ) );
1542 -------------------------------------------------------------------------------
1543 Returns 1 if the single-precision floating-point value `a' is equal to the
1544 corresponding value `b', and 0 otherwise. The invalid exception is raised
1545 if either operand is a NaN. Otherwise, the comparison is performed
1546 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1547 -------------------------------------------------------------------------------
1549 flag float32_eq_signaling( float32 a, float32 b )
1552 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1553 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1555 float_raise( float_flag_invalid );
1556 return 0;
1558 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1563 -------------------------------------------------------------------------------
1564 Returns 1 if the single-precision floating-point value `a' is less than or
1565 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
1566 cause an exception. Otherwise, the comparison is performed according to the
1567 IEC/IEEE Standard for Binary Floating-point Arithmetic.
1568 -------------------------------------------------------------------------------
1570 flag float32_le_quiet( float32 a, float32 b )
1572 flag aSign, bSign;
1573 //int16 aExp, bExp;
1575 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1576 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1578 /* Do nothing, even if NaN as we're quiet */
1579 return 0;
1581 aSign = extractFloat32Sign( a );
1582 bSign = extractFloat32Sign( b );
1583 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1584 return ( a == b ) || ( aSign ^ ( a < b ) );
1589 -------------------------------------------------------------------------------
1590 Returns 1 if the single-precision floating-point value `a' is less than
1591 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
1592 exception. Otherwise, the comparison is performed according to the IEC/IEEE
1593 Standard for Binary Floating-point Arithmetic.
1594 -------------------------------------------------------------------------------
1596 flag float32_lt_quiet( float32 a, float32 b )
1598 flag aSign, bSign;
1600 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1601 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1603 /* Do nothing, even if NaN as we're quiet */
1604 return 0;
1606 aSign = extractFloat32Sign( a );
1607 bSign = extractFloat32Sign( b );
1608 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1609 return ( a != b ) && ( aSign ^ ( a < b ) );
1614 -------------------------------------------------------------------------------
1615 Returns the result of converting the double-precision floating-point value
1616 `a' to the 32-bit two's complement integer format. The conversion is
1617 performed according to the IEC/IEEE Standard for Binary Floating-point
1618 Arithmetic---which means in particular that the conversion is rounded
1619 according to the current rounding mode. If `a' is a NaN, the largest
1620 positive integer is returned. Otherwise, if the conversion overflows, the
1621 largest integer with the same sign as `a' is returned.
1622 -------------------------------------------------------------------------------
1624 int32 float64_to_int32( struct roundingData *roundData, float64 a )
1626 flag aSign;
1627 int16 aExp, shiftCount;
1628 bits64 aSig;
1630 aSig = extractFloat64Frac( a );
1631 aExp = extractFloat64Exp( a );
1632 aSign = extractFloat64Sign( a );
1633 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1634 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
1635 shiftCount = 0x42C - aExp;
1636 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
1637 return roundAndPackInt32( roundData, aSign, aSig );
1642 -------------------------------------------------------------------------------
1643 Returns the result of converting the double-precision floating-point value
1644 `a' to the 32-bit two's complement integer format. The conversion is
1645 performed according to the IEC/IEEE Standard for Binary Floating-point
1646 Arithmetic, except that the conversion is always rounded toward zero. If
1647 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1648 conversion overflows, the largest integer with the same sign as `a' is
1649 returned.
1650 -------------------------------------------------------------------------------
1652 int32 float64_to_int32_round_to_zero( float64 a )
1654 flag aSign;
1655 int16 aExp, shiftCount;
1656 bits64 aSig, savedASig;
1657 int32 z;
1659 aSig = extractFloat64Frac( a );
1660 aExp = extractFloat64Exp( a );
1661 aSign = extractFloat64Sign( a );
1662 shiftCount = 0x433 - aExp;
1663 if ( shiftCount < 21 ) {
1664 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1665 goto invalid;
1667 else if ( 52 < shiftCount ) {
1668 if ( aExp || aSig ) float_raise( float_flag_inexact );
1669 return 0;
1671 aSig |= LIT64( 0x0010000000000000 );
1672 savedASig = aSig;
1673 aSig >>= shiftCount;
1674 z = aSig;
1675 if ( aSign ) z = - z;
1676 if ( ( z < 0 ) ^ aSign ) {
1677 invalid:
1678 float_raise( float_flag_invalid );
1679 return aSign ? 0x80000000 : 0x7FFFFFFF;
1681 if ( ( aSig<<shiftCount ) != savedASig ) {
1682 float_raise( float_flag_inexact );
1684 return z;
1689 -------------------------------------------------------------------------------
1690 Returns the result of converting the double-precision floating-point value
1691 `a' to the 32-bit two's complement unsigned integer format. The conversion
1692 is performed according to the IEC/IEEE Standard for Binary Floating-point
1693 Arithmetic---which means in particular that the conversion is rounded
1694 according to the current rounding mode. If `a' is a NaN, the largest
1695 positive integer is returned. Otherwise, if the conversion overflows, the
1696 largest positive integer is returned.
1697 -------------------------------------------------------------------------------
1699 int32 float64_to_uint32( struct roundingData *roundData, float64 a )
1701 flag aSign;
1702 int16 aExp, shiftCount;
1703 bits64 aSig;
1705 aSig = extractFloat64Frac( a );
1706 aExp = extractFloat64Exp( a );
1707 aSign = 0; //extractFloat64Sign( a );
1708 //if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1709 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
1710 shiftCount = 0x42C - aExp;
1711 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
1712 return roundAndPackInt32( roundData, aSign, aSig );
1716 -------------------------------------------------------------------------------
1717 Returns the result of converting the double-precision floating-point value
1718 `a' to the 32-bit two's complement integer format. The conversion is
1719 performed according to the IEC/IEEE Standard for Binary Floating-point
1720 Arithmetic, except that the conversion is always rounded toward zero. If
1721 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1722 conversion overflows, the largest positive integer is returned.
1723 -------------------------------------------------------------------------------
1725 int32 float64_to_uint32_round_to_zero( float64 a )
1727 flag aSign;
1728 int16 aExp, shiftCount;
1729 bits64 aSig, savedASig;
1730 int32 z;
1732 aSig = extractFloat64Frac( a );
1733 aExp = extractFloat64Exp( a );
1734 aSign = extractFloat64Sign( a );
1735 shiftCount = 0x433 - aExp;
1736 if ( shiftCount < 21 ) {
1737 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1738 goto invalid;
1740 else if ( 52 < shiftCount ) {
1741 if ( aExp || aSig ) float_raise( float_flag_inexact );
1742 return 0;
1744 aSig |= LIT64( 0x0010000000000000 );
1745 savedASig = aSig;
1746 aSig >>= shiftCount;
1747 z = aSig;
1748 if ( aSign ) z = - z;
1749 if ( ( z < 0 ) ^ aSign ) {
1750 invalid:
1751 float_raise( float_flag_invalid );
1752 return aSign ? 0x80000000 : 0x7FFFFFFF;
1754 if ( ( aSig<<shiftCount ) != savedASig ) {
1755 float_raise( float_flag_inexact );
1757 return z;
1761 -------------------------------------------------------------------------------
1762 Returns the result of converting the double-precision floating-point value
1763 `a' to the single-precision floating-point format. The conversion is
1764 performed according to the IEC/IEEE Standard for Binary Floating-point
1765 Arithmetic.
1766 -------------------------------------------------------------------------------
1768 float32 float64_to_float32( struct roundingData *roundData, float64 a )
1770 flag aSign;
1771 int16 aExp;
1772 bits64 aSig;
1773 bits32 zSig;
1775 aSig = extractFloat64Frac( a );
1776 aExp = extractFloat64Exp( a );
1777 aSign = extractFloat64Sign( a );
1778 if ( aExp == 0x7FF ) {
1779 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
1780 return packFloat32( aSign, 0xFF, 0 );
1782 shift64RightJamming( aSig, 22, &aSig );
1783 zSig = aSig;
1784 if ( aExp || zSig ) {
1785 zSig |= 0x40000000;
1786 aExp -= 0x381;
1788 return roundAndPackFloat32( roundData, aSign, aExp, zSig );
1792 #ifdef FLOATX80
1795 -------------------------------------------------------------------------------
1796 Returns the result of converting the double-precision floating-point value
1797 `a' to the extended double-precision floating-point format. The conversion
1798 is performed according to the IEC/IEEE Standard for Binary Floating-point
1799 Arithmetic.
1800 -------------------------------------------------------------------------------
1802 floatx80 float64_to_floatx80( float64 a )
1804 flag aSign;
1805 int16 aExp;
1806 bits64 aSig;
1808 aSig = extractFloat64Frac( a );
1809 aExp = extractFloat64Exp( a );
1810 aSign = extractFloat64Sign( a );
1811 if ( aExp == 0x7FF ) {
1812 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
1813 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1815 if ( aExp == 0 ) {
1816 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1817 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
1819 return
1820 packFloatx80(
1821 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
1825 #endif
1828 -------------------------------------------------------------------------------
1829 Rounds the double-precision floating-point value `a' to an integer, and
1830 returns the result as a double-precision floating-point value. The
1831 operation is performed according to the IEC/IEEE Standard for Binary
1832 Floating-point Arithmetic.
1833 -------------------------------------------------------------------------------
1835 float64 float64_round_to_int( struct roundingData *roundData, float64 a )
1837 flag aSign;
1838 int16 aExp;
1839 bits64 lastBitMask, roundBitsMask;
1840 int8 roundingMode;
1841 float64 z;
1843 aExp = extractFloat64Exp( a );
1844 if ( 0x433 <= aExp ) {
1845 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
1846 return propagateFloat64NaN( a, a );
1848 return a;
1850 if ( aExp <= 0x3FE ) {
1851 if ( (bits64) ( a<<1 ) == 0 ) return a;
1852 roundData->exception |= float_flag_inexact;
1853 aSign = extractFloat64Sign( a );
1854 switch ( roundData->mode ) {
1855 case float_round_nearest_even:
1856 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
1857 return packFloat64( aSign, 0x3FF, 0 );
1859 break;
1860 case float_round_down:
1861 return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
1862 case float_round_up:
1863 return
1864 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
1866 return packFloat64( aSign, 0, 0 );
1868 lastBitMask = 1;
1869 lastBitMask <<= 0x433 - aExp;
1870 roundBitsMask = lastBitMask - 1;
1871 z = a;
1872 roundingMode = roundData->mode;
1873 if ( roundingMode == float_round_nearest_even ) {
1874 z += lastBitMask>>1;
1875 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1877 else if ( roundingMode != float_round_to_zero ) {
1878 if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1879 z += roundBitsMask;
1882 z &= ~ roundBitsMask;
1883 if ( z != a ) roundData->exception |= float_flag_inexact;
1884 return z;
1889 -------------------------------------------------------------------------------
1890 Returns the result of adding the absolute values of the double-precision
1891 floating-point values `a' and `b'. If `zSign' is true, the sum is negated
1892 before being returned. `zSign' is ignored if the result is a NaN. The
1893 addition is performed according to the IEC/IEEE Standard for Binary
1894 Floating-point Arithmetic.
1895 -------------------------------------------------------------------------------
1897 static float64 addFloat64Sigs( struct roundingData *roundData, float64 a, float64 b, flag zSign )
1899 int16 aExp, bExp, zExp;
1900 bits64 aSig, bSig, zSig;
1901 int16 expDiff;
1903 aSig = extractFloat64Frac( a );
1904 aExp = extractFloat64Exp( a );
1905 bSig = extractFloat64Frac( b );
1906 bExp = extractFloat64Exp( b );
1907 expDiff = aExp - bExp;
1908 aSig <<= 9;
1909 bSig <<= 9;
1910 if ( 0 < expDiff ) {
1911 if ( aExp == 0x7FF ) {
1912 if ( aSig ) return propagateFloat64NaN( a, b );
1913 return a;
1915 if ( bExp == 0 ) {
1916 --expDiff;
1918 else {
1919 bSig |= LIT64( 0x2000000000000000 );
1921 shift64RightJamming( bSig, expDiff, &bSig );
1922 zExp = aExp;
1924 else if ( expDiff < 0 ) {
1925 if ( bExp == 0x7FF ) {
1926 if ( bSig ) return propagateFloat64NaN( a, b );
1927 return packFloat64( zSign, 0x7FF, 0 );
1929 if ( aExp == 0 ) {
1930 ++expDiff;
1932 else {
1933 aSig |= LIT64( 0x2000000000000000 );
1935 shift64RightJamming( aSig, - expDiff, &aSig );
1936 zExp = bExp;
1938 else {
1939 if ( aExp == 0x7FF ) {
1940 if ( aSig | bSig ) return propagateFloat64NaN( a, b );
1941 return a;
1943 if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
1944 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
1945 zExp = aExp;
1946 goto roundAndPack;
1948 aSig |= LIT64( 0x2000000000000000 );
1949 zSig = ( aSig + bSig )<<1;
1950 --zExp;
1951 if ( (sbits64) zSig < 0 ) {
1952 zSig = aSig + bSig;
1953 ++zExp;
1955 roundAndPack:
1956 return roundAndPackFloat64( roundData, zSign, zExp, zSig );
1961 -------------------------------------------------------------------------------
1962 Returns the result of subtracting the absolute values of the double-
1963 precision floating-point values `a' and `b'. If `zSign' is true, the
1964 difference is negated before being returned. `zSign' is ignored if the
1965 result is a NaN. The subtraction is performed according to the IEC/IEEE
1966 Standard for Binary Floating-point Arithmetic.
1967 -------------------------------------------------------------------------------
1969 static float64 subFloat64Sigs( struct roundingData *roundData, float64 a, float64 b, flag zSign )
1971 int16 aExp, bExp, zExp;
1972 bits64 aSig, bSig, zSig;
1973 int16 expDiff;
1975 aSig = extractFloat64Frac( a );
1976 aExp = extractFloat64Exp( a );
1977 bSig = extractFloat64Frac( b );
1978 bExp = extractFloat64Exp( b );
1979 expDiff = aExp - bExp;
1980 aSig <<= 10;
1981 bSig <<= 10;
1982 if ( 0 < expDiff ) goto aExpBigger;
1983 if ( expDiff < 0 ) goto bExpBigger;
1984 if ( aExp == 0x7FF ) {
1985 if ( aSig | bSig ) return propagateFloat64NaN( a, b );
1986 roundData->exception |= float_flag_invalid;
1987 return float64_default_nan;
1989 if ( aExp == 0 ) {
1990 aExp = 1;
1991 bExp = 1;
1993 if ( bSig < aSig ) goto aBigger;
1994 if ( aSig < bSig ) goto bBigger;
1995 return packFloat64( roundData->mode == float_round_down, 0, 0 );
1996 bExpBigger:
1997 if ( bExp == 0x7FF ) {
1998 if ( bSig ) return propagateFloat64NaN( a, b );
1999 return packFloat64( zSign ^ 1, 0x7FF, 0 );
2001 if ( aExp == 0 ) {
2002 ++expDiff;
2004 else {
2005 aSig |= LIT64( 0x4000000000000000 );
2007 shift64RightJamming( aSig, - expDiff, &aSig );
2008 bSig |= LIT64( 0x4000000000000000 );
2009 bBigger:
2010 zSig = bSig - aSig;
2011 zExp = bExp;
2012 zSign ^= 1;
2013 goto normalizeRoundAndPack;
2014 aExpBigger:
2015 if ( aExp == 0x7FF ) {
2016 if ( aSig ) return propagateFloat64NaN( a, b );
2017 return a;
2019 if ( bExp == 0 ) {
2020 --expDiff;
2022 else {
2023 bSig |= LIT64( 0x4000000000000000 );
2025 shift64RightJamming( bSig, expDiff, &bSig );
2026 aSig |= LIT64( 0x4000000000000000 );
2027 aBigger:
2028 zSig = aSig - bSig;
2029 zExp = aExp;
2030 normalizeRoundAndPack:
2031 --zExp;
2032 return normalizeRoundAndPackFloat64( roundData, zSign, zExp, zSig );
2037 -------------------------------------------------------------------------------
2038 Returns the result of adding the double-precision floating-point values `a'
2039 and `b'. The operation is performed according to the IEC/IEEE Standard for
2040 Binary Floating-point Arithmetic.
2041 -------------------------------------------------------------------------------
2043 float64 float64_add( struct roundingData *roundData, float64 a, float64 b )
2045 flag aSign, bSign;
2047 aSign = extractFloat64Sign( a );
2048 bSign = extractFloat64Sign( b );
2049 if ( aSign == bSign ) {
2050 return addFloat64Sigs( roundData, a, b, aSign );
2052 else {
2053 return subFloat64Sigs( roundData, a, b, aSign );
2059 -------------------------------------------------------------------------------
2060 Returns the result of subtracting the double-precision floating-point values
2061 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2062 for Binary Floating-point Arithmetic.
2063 -------------------------------------------------------------------------------
2065 float64 float64_sub( struct roundingData *roundData, float64 a, float64 b )
2067 flag aSign, bSign;
2069 aSign = extractFloat64Sign( a );
2070 bSign = extractFloat64Sign( b );
2071 if ( aSign == bSign ) {
2072 return subFloat64Sigs( roundData, a, b, aSign );
2074 else {
2075 return addFloat64Sigs( roundData, a, b, aSign );
2081 -------------------------------------------------------------------------------
2082 Returns the result of multiplying the double-precision floating-point values
2083 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2084 for Binary Floating-point Arithmetic.
2085 -------------------------------------------------------------------------------
2087 float64 float64_mul( struct roundingData *roundData, float64 a, float64 b )
2089 flag aSign, bSign, zSign;
2090 int16 aExp, bExp, zExp;
2091 bits64 aSig, bSig, zSig0, zSig1;
2093 aSig = extractFloat64Frac( a );
2094 aExp = extractFloat64Exp( a );
2095 aSign = extractFloat64Sign( a );
2096 bSig = extractFloat64Frac( b );
2097 bExp = extractFloat64Exp( b );
2098 bSign = extractFloat64Sign( b );
2099 zSign = aSign ^ bSign;
2100 if ( aExp == 0x7FF ) {
2101 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2102 return propagateFloat64NaN( a, b );
2104 if ( ( bExp | bSig ) == 0 ) {
2105 roundData->exception |= float_flag_invalid;
2106 return float64_default_nan;
2108 return packFloat64( zSign, 0x7FF, 0 );
2110 if ( bExp == 0x7FF ) {
2111 if ( bSig ) return propagateFloat64NaN( a, b );
2112 if ( ( aExp | aSig ) == 0 ) {
2113 roundData->exception |= float_flag_invalid;
2114 return float64_default_nan;
2116 return packFloat64( zSign, 0x7FF, 0 );
2118 if ( aExp == 0 ) {
2119 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2120 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2122 if ( bExp == 0 ) {
2123 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2124 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2126 zExp = aExp + bExp - 0x3FF;
2127 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2128 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2129 mul64To128( aSig, bSig, &zSig0, &zSig1 );
2130 zSig0 |= ( zSig1 != 0 );
2131 if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2132 zSig0 <<= 1;
2133 --zExp;
2135 return roundAndPackFloat64( roundData, zSign, zExp, zSig0 );
2140 -------------------------------------------------------------------------------
2141 Returns the result of dividing the double-precision floating-point value `a'
2142 by the corresponding value `b'. The operation is performed according to
2143 the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2144 -------------------------------------------------------------------------------
2146 float64 float64_div( struct roundingData *roundData, float64 a, float64 b )
2148 flag aSign, bSign, zSign;
2149 int16 aExp, bExp, zExp;
2150 bits64 aSig, bSig, zSig;
2151 bits64 rem0, rem1;
2152 bits64 term0, term1;
2154 aSig = extractFloat64Frac( a );
2155 aExp = extractFloat64Exp( a );
2156 aSign = extractFloat64Sign( a );
2157 bSig = extractFloat64Frac( b );
2158 bExp = extractFloat64Exp( b );
2159 bSign = extractFloat64Sign( b );
2160 zSign = aSign ^ bSign;
2161 if ( aExp == 0x7FF ) {
2162 if ( aSig ) return propagateFloat64NaN( a, b );
2163 if ( bExp == 0x7FF ) {
2164 if ( bSig ) return propagateFloat64NaN( a, b );
2165 roundData->exception |= float_flag_invalid;
2166 return float64_default_nan;
2168 return packFloat64( zSign, 0x7FF, 0 );
2170 if ( bExp == 0x7FF ) {
2171 if ( bSig ) return propagateFloat64NaN( a, b );
2172 return packFloat64( zSign, 0, 0 );
2174 if ( bExp == 0 ) {
2175 if ( bSig == 0 ) {
2176 if ( ( aExp | aSig ) == 0 ) {
2177 roundData->exception |= float_flag_invalid;
2178 return float64_default_nan;
2180 roundData->exception |= float_flag_divbyzero;
2181 return packFloat64( zSign, 0x7FF, 0 );
2183 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2185 if ( aExp == 0 ) {
2186 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2187 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2189 zExp = aExp - bExp + 0x3FD;
2190 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2191 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2192 if ( bSig <= ( aSig + aSig ) ) {
2193 aSig >>= 1;
2194 ++zExp;
2196 zSig = estimateDiv128To64( aSig, 0, bSig );
2197 if ( ( zSig & 0x1FF ) <= 2 ) {
2198 mul64To128( bSig, zSig, &term0, &term1 );
2199 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2200 while ( (sbits64) rem0 < 0 ) {
2201 --zSig;
2202 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2204 zSig |= ( rem1 != 0 );
2206 return roundAndPackFloat64( roundData, zSign, zExp, zSig );
2211 -------------------------------------------------------------------------------
2212 Returns the remainder of the double-precision floating-point value `a'
2213 with respect to the corresponding value `b'. The operation is performed
2214 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2215 -------------------------------------------------------------------------------
2217 float64 float64_rem( struct roundingData *roundData, float64 a, float64 b )
2219 flag aSign, bSign, zSign;
2220 int16 aExp, bExp, expDiff;
2221 bits64 aSig, bSig;
2222 bits64 q, alternateASig;
2223 sbits64 sigMean;
2225 aSig = extractFloat64Frac( a );
2226 aExp = extractFloat64Exp( a );
2227 aSign = extractFloat64Sign( a );
2228 bSig = extractFloat64Frac( b );
2229 bExp = extractFloat64Exp( b );
2230 bSign = extractFloat64Sign( b );
2231 if ( aExp == 0x7FF ) {
2232 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2233 return propagateFloat64NaN( a, b );
2235 roundData->exception |= float_flag_invalid;
2236 return float64_default_nan;
2238 if ( bExp == 0x7FF ) {
2239 if ( bSig ) return propagateFloat64NaN( a, b );
2240 return a;
2242 if ( bExp == 0 ) {
2243 if ( bSig == 0 ) {
2244 roundData->exception |= float_flag_invalid;
2245 return float64_default_nan;
2247 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2249 if ( aExp == 0 ) {
2250 if ( aSig == 0 ) return a;
2251 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2253 expDiff = aExp - bExp;
2254 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
2255 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2256 if ( expDiff < 0 ) {
2257 if ( expDiff < -1 ) return a;
2258 aSig >>= 1;
2260 q = ( bSig <= aSig );
2261 if ( q ) aSig -= bSig;
2262 expDiff -= 64;
2263 while ( 0 < expDiff ) {
2264 q = estimateDiv128To64( aSig, 0, bSig );
2265 q = ( 2 < q ) ? q - 2 : 0;
2266 aSig = - ( ( bSig>>2 ) * q );
2267 expDiff -= 62;
2269 expDiff += 64;
2270 if ( 0 < expDiff ) {
2271 q = estimateDiv128To64( aSig, 0, bSig );
2272 q = ( 2 < q ) ? q - 2 : 0;
2273 q >>= 64 - expDiff;
2274 bSig >>= 2;
2275 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2277 else {
2278 aSig >>= 2;
2279 bSig >>= 2;
2281 do {
2282 alternateASig = aSig;
2283 ++q;
2284 aSig -= bSig;
2285 } while ( 0 <= (sbits64) aSig );
2286 sigMean = aSig + alternateASig;
2287 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2288 aSig = alternateASig;
2290 zSign = ( (sbits64) aSig < 0 );
2291 if ( zSign ) aSig = - aSig;
2292 return normalizeRoundAndPackFloat64( roundData, aSign ^ zSign, bExp, aSig );
2297 -------------------------------------------------------------------------------
2298 Returns the square root of the double-precision floating-point value `a'.
2299 The operation is performed according to the IEC/IEEE Standard for Binary
2300 Floating-point Arithmetic.
2301 -------------------------------------------------------------------------------
2303 float64 float64_sqrt( struct roundingData *roundData, float64 a )
2305 flag aSign;
2306 int16 aExp, zExp;
2307 bits64 aSig, zSig;
2308 bits64 rem0, rem1, term0, term1; //, shiftedRem;
2309 //float64 z;
2311 aSig = extractFloat64Frac( a );
2312 aExp = extractFloat64Exp( a );
2313 aSign = extractFloat64Sign( a );
2314 if ( aExp == 0x7FF ) {
2315 if ( aSig ) return propagateFloat64NaN( a, a );
2316 if ( ! aSign ) return a;
2317 roundData->exception |= float_flag_invalid;
2318 return float64_default_nan;
2320 if ( aSign ) {
2321 if ( ( aExp | aSig ) == 0 ) return a;
2322 roundData->exception |= float_flag_invalid;
2323 return float64_default_nan;
2325 if ( aExp == 0 ) {
2326 if ( aSig == 0 ) return 0;
2327 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2329 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
2330 aSig |= LIT64( 0x0010000000000000 );
2331 zSig = estimateSqrt32( aExp, aSig>>21 );
2332 zSig <<= 31;
2333 aSig <<= 9 - ( aExp & 1 );
2334 zSig = estimateDiv128To64( aSig, 0, zSig ) + zSig + 2;
2335 if ( ( zSig & 0x3FF ) <= 5 ) {
2336 if ( zSig < 2 ) {
2337 zSig = LIT64( 0xFFFFFFFFFFFFFFFF );
2339 else {
2340 aSig <<= 2;
2341 mul64To128( zSig, zSig, &term0, &term1 );
2342 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2343 while ( (sbits64) rem0 < 0 ) {
2344 --zSig;
2345 shortShift128Left( 0, zSig, 1, &term0, &term1 );
2346 term1 |= 1;
2347 add128( rem0, rem1, term0, term1, &rem0, &rem1 );
2349 zSig |= ( ( rem0 | rem1 ) != 0 );
2352 shift64RightJamming( zSig, 1, &zSig );
2353 return roundAndPackFloat64( roundData, 0, zExp, zSig );
2358 -------------------------------------------------------------------------------
2359 Returns 1 if the double-precision floating-point value `a' is equal to the
2360 corresponding value `b', and 0 otherwise. The comparison is performed
2361 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2362 -------------------------------------------------------------------------------
2364 flag float64_eq( float64 a, float64 b )
2367 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2368 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2370 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2371 float_raise( float_flag_invalid );
2373 return 0;
2375 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2380 -------------------------------------------------------------------------------
2381 Returns 1 if the double-precision floating-point value `a' is less than or
2382 equal to the corresponding value `b', and 0 otherwise. The comparison is
2383 performed according to the IEC/IEEE Standard for Binary Floating-point
2384 Arithmetic.
2385 -------------------------------------------------------------------------------
2387 flag float64_le( float64 a, float64 b )
2389 flag aSign, bSign;
2391 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2392 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2394 float_raise( float_flag_invalid );
2395 return 0;
2397 aSign = extractFloat64Sign( a );
2398 bSign = extractFloat64Sign( b );
2399 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2400 return ( a == b ) || ( aSign ^ ( a < b ) );
2405 -------------------------------------------------------------------------------
2406 Returns 1 if the double-precision floating-point value `a' is less than
2407 the corresponding value `b', and 0 otherwise. The comparison is performed
2408 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2409 -------------------------------------------------------------------------------
2411 flag float64_lt( float64 a, float64 b )
2413 flag aSign, bSign;
2415 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2416 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2418 float_raise( float_flag_invalid );
2419 return 0;
2421 aSign = extractFloat64Sign( a );
2422 bSign = extractFloat64Sign( b );
2423 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
2424 return ( a != b ) && ( aSign ^ ( a < b ) );
2429 -------------------------------------------------------------------------------
2430 Returns 1 if the double-precision floating-point value `a' is equal to the
2431 corresponding value `b', and 0 otherwise. The invalid exception is raised
2432 if either operand is a NaN. Otherwise, the comparison is performed
2433 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2434 -------------------------------------------------------------------------------
2436 flag float64_eq_signaling( float64 a, float64 b )
2439 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2440 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2442 float_raise( float_flag_invalid );
2443 return 0;
2445 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2450 -------------------------------------------------------------------------------
2451 Returns 1 if the double-precision floating-point value `a' is less than or
2452 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2453 cause an exception. Otherwise, the comparison is performed according to the
2454 IEC/IEEE Standard for Binary Floating-point Arithmetic.
2455 -------------------------------------------------------------------------------
2457 flag float64_le_quiet( float64 a, float64 b )
2459 flag aSign, bSign;
2460 //int16 aExp, bExp;
2462 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2463 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2465 /* Do nothing, even if NaN as we're quiet */
2466 return 0;
2468 aSign = extractFloat64Sign( a );
2469 bSign = extractFloat64Sign( b );
2470 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2471 return ( a == b ) || ( aSign ^ ( a < b ) );
2476 -------------------------------------------------------------------------------
2477 Returns 1 if the double-precision floating-point value `a' is less than
2478 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2479 exception. Otherwise, the comparison is performed according to the IEC/IEEE
2480 Standard for Binary Floating-point Arithmetic.
2481 -------------------------------------------------------------------------------
2483 flag float64_lt_quiet( float64 a, float64 b )
2485 flag aSign, bSign;
2487 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2488 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2490 /* Do nothing, even if NaN as we're quiet */
2491 return 0;
2493 aSign = extractFloat64Sign( a );
2494 bSign = extractFloat64Sign( b );
2495 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
2496 return ( a != b ) && ( aSign ^ ( a < b ) );
2500 #ifdef FLOATX80
2503 -------------------------------------------------------------------------------
2504 Returns the result of converting the extended double-precision floating-
2505 point value `a' to the 32-bit two's complement integer format. The
2506 conversion is performed according to the IEC/IEEE Standard for Binary
2507 Floating-point Arithmetic---which means in particular that the conversion
2508 is rounded according to the current rounding mode. If `a' is a NaN, the
2509 largest positive integer is returned. Otherwise, if the conversion
2510 overflows, the largest integer with the same sign as `a' is returned.
2511 -------------------------------------------------------------------------------
2513 int32 floatx80_to_int32( struct roundingData *roundData, floatx80 a )
2515 flag aSign;
2516 int32 aExp, shiftCount;
2517 bits64 aSig;
2519 aSig = extractFloatx80Frac( a );
2520 aExp = extractFloatx80Exp( a );
2521 aSign = extractFloatx80Sign( a );
2522 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
2523 shiftCount = 0x4037 - aExp;
2524 if ( shiftCount <= 0 ) shiftCount = 1;
2525 shift64RightJamming( aSig, shiftCount, &aSig );
2526 return roundAndPackInt32( roundData, aSign, aSig );
2531 -------------------------------------------------------------------------------
2532 Returns the result of converting the extended double-precision floating-
2533 point value `a' to the 32-bit two's complement integer format. The
2534 conversion is performed according to the IEC/IEEE Standard for Binary
2535 Floating-point Arithmetic, except that the conversion is always rounded
2536 toward zero. If `a' is a NaN, the largest positive integer is returned.
2537 Otherwise, if the conversion overflows, the largest integer with the same
2538 sign as `a' is returned.
2539 -------------------------------------------------------------------------------
2541 int32 floatx80_to_int32_round_to_zero( floatx80 a )
2543 flag aSign;
2544 int32 aExp, shiftCount;
2545 bits64 aSig, savedASig;
2546 int32 z;
2548 aSig = extractFloatx80Frac( a );
2549 aExp = extractFloatx80Exp( a );
2550 aSign = extractFloatx80Sign( a );
2551 shiftCount = 0x403E - aExp;
2552 if ( shiftCount < 32 ) {
2553 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
2554 goto invalid;
2556 else if ( 63 < shiftCount ) {
2557 if ( aExp || aSig ) float_raise( float_flag_inexact );
2558 return 0;
2560 savedASig = aSig;
2561 aSig >>= shiftCount;
2562 z = aSig;
2563 if ( aSign ) z = - z;
2564 if ( ( z < 0 ) ^ aSign ) {
2565 invalid:
2566 float_raise( float_flag_invalid );
2567 return aSign ? 0x80000000 : 0x7FFFFFFF;
2569 if ( ( aSig<<shiftCount ) != savedASig ) {
2570 float_raise( float_flag_inexact );
2572 return z;
2577 -------------------------------------------------------------------------------
2578 Returns the result of converting the extended double-precision floating-
2579 point value `a' to the single-precision floating-point format. The
2580 conversion is performed according to the IEC/IEEE Standard for Binary
2581 Floating-point Arithmetic.
2582 -------------------------------------------------------------------------------
2584 float32 floatx80_to_float32( struct roundingData *roundData, floatx80 a )
2586 flag aSign;
2587 int32 aExp;
2588 bits64 aSig;
2590 aSig = extractFloatx80Frac( a );
2591 aExp = extractFloatx80Exp( a );
2592 aSign = extractFloatx80Sign( a );
2593 if ( aExp == 0x7FFF ) {
2594 if ( (bits64) ( aSig<<1 ) ) {
2595 return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
2597 return packFloat32( aSign, 0xFF, 0 );
2599 shift64RightJamming( aSig, 33, &aSig );
2600 if ( aExp || aSig ) aExp -= 0x3F81;
2601 return roundAndPackFloat32( roundData, aSign, aExp, aSig );
2606 -------------------------------------------------------------------------------
2607 Returns the result of converting the extended double-precision floating-
2608 point value `a' to the double-precision floating-point format. The
2609 conversion is performed according to the IEC/IEEE Standard for Binary
2610 Floating-point Arithmetic.
2611 -------------------------------------------------------------------------------
2613 float64 floatx80_to_float64( struct roundingData *roundData, floatx80 a )
2615 flag aSign;
2616 int32 aExp;
2617 bits64 aSig, zSig;
2619 aSig = extractFloatx80Frac( a );
2620 aExp = extractFloatx80Exp( a );
2621 aSign = extractFloatx80Sign( a );
2622 if ( aExp == 0x7FFF ) {
2623 if ( (bits64) ( aSig<<1 ) ) {
2624 return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
2626 return packFloat64( aSign, 0x7FF, 0 );
2628 shift64RightJamming( aSig, 1, &zSig );
2629 if ( aExp || aSig ) aExp -= 0x3C01;
2630 return roundAndPackFloat64( roundData, aSign, aExp, zSig );
2635 -------------------------------------------------------------------------------
2636 Rounds the extended double-precision floating-point value `a' to an integer,
2637 and returns the result as an extended quadruple-precision floating-point
2638 value. The operation is performed according to the IEC/IEEE Standard for
2639 Binary Floating-point Arithmetic.
2640 -------------------------------------------------------------------------------
2642 floatx80 floatx80_round_to_int( struct roundingData *roundData, floatx80 a )
2644 flag aSign;
2645 int32 aExp;
2646 bits64 lastBitMask, roundBitsMask;
2647 int8 roundingMode;
2648 floatx80 z;
2650 aExp = extractFloatx80Exp( a );
2651 if ( 0x403E <= aExp ) {
2652 if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
2653 return propagateFloatx80NaN( a, a );
2655 return a;
2657 if ( aExp <= 0x3FFE ) {
2658 if ( ( aExp == 0 )
2659 && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
2660 return a;
2662 roundData->exception |= float_flag_inexact;
2663 aSign = extractFloatx80Sign( a );
2664 switch ( roundData->mode ) {
2665 case float_round_nearest_even:
2666 if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
2668 return
2669 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
2671 break;
2672 case float_round_down:
2673 return
2674 aSign ?
2675 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
2676 : packFloatx80( 0, 0, 0 );
2677 case float_round_up:
2678 return
2679 aSign ? packFloatx80( 1, 0, 0 )
2680 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
2682 return packFloatx80( aSign, 0, 0 );
2684 lastBitMask = 1;
2685 lastBitMask <<= 0x403E - aExp;
2686 roundBitsMask = lastBitMask - 1;
2687 z = a;
2688 roundingMode = roundData->mode;
2689 if ( roundingMode == float_round_nearest_even ) {
2690 z.low += lastBitMask>>1;
2691 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
2693 else if ( roundingMode != float_round_to_zero ) {
2694 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2695 z.low += roundBitsMask;
2698 z.low &= ~ roundBitsMask;
2699 if ( z.low == 0 ) {
2700 ++z.high;
2701 z.low = LIT64( 0x8000000000000000 );
2703 if ( z.low != a.low ) roundData->exception |= float_flag_inexact;
2704 return z;
2709 -------------------------------------------------------------------------------
2710 Returns the result of adding the absolute values of the extended double-
2711 precision floating-point values `a' and `b'. If `zSign' is true, the sum is
2712 negated before being returned. `zSign' is ignored if the result is a NaN.
2713 The addition is performed according to the IEC/IEEE Standard for Binary
2714 Floating-point Arithmetic.
2715 -------------------------------------------------------------------------------
2717 static floatx80 addFloatx80Sigs( struct roundingData *roundData, floatx80 a, floatx80 b, flag zSign )
2719 int32 aExp, bExp, zExp;
2720 bits64 aSig, bSig, zSig0, zSig1;
2721 int32 expDiff;
2723 aSig = extractFloatx80Frac( a );
2724 aExp = extractFloatx80Exp( a );
2725 bSig = extractFloatx80Frac( b );
2726 bExp = extractFloatx80Exp( b );
2727 expDiff = aExp - bExp;
2728 if ( 0 < expDiff ) {
2729 if ( aExp == 0x7FFF ) {
2730 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
2731 return a;
2733 if ( bExp == 0 ) --expDiff;
2734 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
2735 zExp = aExp;
2737 else if ( expDiff < 0 ) {
2738 if ( bExp == 0x7FFF ) {
2739 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
2740 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2742 if ( aExp == 0 ) ++expDiff;
2743 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
2744 zExp = bExp;
2746 else {
2747 if ( aExp == 0x7FFF ) {
2748 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
2749 return propagateFloatx80NaN( a, b );
2751 return a;
2753 zSig1 = 0;
2754 zSig0 = aSig + bSig;
2755 if ( aExp == 0 ) {
2756 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
2757 goto roundAndPack;
2759 zExp = aExp;
2760 goto shiftRight1;
2763 zSig0 = aSig + bSig;
2765 if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
2766 shiftRight1:
2767 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
2768 zSig0 |= LIT64( 0x8000000000000000 );
2769 ++zExp;
2770 roundAndPack:
2771 return
2772 roundAndPackFloatx80(
2773 roundData, zSign, zExp, zSig0, zSig1 );
2778 -------------------------------------------------------------------------------
2779 Returns the result of subtracting the absolute values of the extended
2780 double-precision floating-point values `a' and `b'. If `zSign' is true,
2781 the difference is negated before being returned. `zSign' is ignored if the
2782 result is a NaN. The subtraction is performed according to the IEC/IEEE
2783 Standard for Binary Floating-point Arithmetic.
2784 -------------------------------------------------------------------------------
2786 static floatx80 subFloatx80Sigs( struct roundingData *roundData, floatx80 a, floatx80 b, flag zSign )
2788 int32 aExp, bExp, zExp;
2789 bits64 aSig, bSig, zSig0, zSig1;
2790 int32 expDiff;
2791 floatx80 z;
2793 aSig = extractFloatx80Frac( a );
2794 aExp = extractFloatx80Exp( a );
2795 bSig = extractFloatx80Frac( b );
2796 bExp = extractFloatx80Exp( b );
2797 expDiff = aExp - bExp;
2798 if ( 0 < expDiff ) goto aExpBigger;
2799 if ( expDiff < 0 ) goto bExpBigger;
2800 if ( aExp == 0x7FFF ) {
2801 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
2802 return propagateFloatx80NaN( a, b );
2804 roundData->exception |= float_flag_invalid;
2805 z.low = floatx80_default_nan_low;
2806 z.high = floatx80_default_nan_high;
2807 z.__padding = 0;
2808 return z;
2810 if ( aExp == 0 ) {
2811 aExp = 1;
2812 bExp = 1;
2814 zSig1 = 0;
2815 if ( bSig < aSig ) goto aBigger;
2816 if ( aSig < bSig ) goto bBigger;
2817 return packFloatx80( roundData->mode == float_round_down, 0, 0 );
2818 bExpBigger:
2819 if ( bExp == 0x7FFF ) {
2820 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
2821 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
2823 if ( aExp == 0 ) ++expDiff;
2824 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
2825 bBigger:
2826 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
2827 zExp = bExp;
2828 zSign ^= 1;
2829 goto normalizeRoundAndPack;
2830 aExpBigger:
2831 if ( aExp == 0x7FFF ) {
2832 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
2833 return a;
2835 if ( bExp == 0 ) --expDiff;
2836 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
2837 aBigger:
2838 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
2839 zExp = aExp;
2840 normalizeRoundAndPack:
2841 return
2842 normalizeRoundAndPackFloatx80(
2843 roundData, zSign, zExp, zSig0, zSig1 );
2848 -------------------------------------------------------------------------------
2849 Returns the result of adding the extended double-precision floating-point
2850 values `a' and `b'. The operation is performed according to the IEC/IEEE
2851 Standard for Binary Floating-point Arithmetic.
2852 -------------------------------------------------------------------------------
2854 floatx80 floatx80_add( struct roundingData *roundData, floatx80 a, floatx80 b )
2856 flag aSign, bSign;
2858 aSign = extractFloatx80Sign( a );
2859 bSign = extractFloatx80Sign( b );
2860 if ( aSign == bSign ) {
2861 return addFloatx80Sigs( roundData, a, b, aSign );
2863 else {
2864 return subFloatx80Sigs( roundData, a, b, aSign );
2870 -------------------------------------------------------------------------------
2871 Returns the result of subtracting the extended double-precision floating-
2872 point values `a' and `b'. The operation is performed according to the
2873 IEC/IEEE Standard for Binary Floating-point Arithmetic.
2874 -------------------------------------------------------------------------------
2876 floatx80 floatx80_sub( struct roundingData *roundData, floatx80 a, floatx80 b )
2878 flag aSign, bSign;
2880 aSign = extractFloatx80Sign( a );
2881 bSign = extractFloatx80Sign( b );
2882 if ( aSign == bSign ) {
2883 return subFloatx80Sigs( roundData, a, b, aSign );
2885 else {
2886 return addFloatx80Sigs( roundData, a, b, aSign );
2892 -------------------------------------------------------------------------------
2893 Returns the result of multiplying the extended double-precision floating-
2894 point values `a' and `b'. The operation is performed according to the
2895 IEC/IEEE Standard for Binary Floating-point Arithmetic.
2896 -------------------------------------------------------------------------------
2898 floatx80 floatx80_mul( struct roundingData *roundData, floatx80 a, floatx80 b )
2900 flag aSign, bSign, zSign;
2901 int32 aExp, bExp, zExp;
2902 bits64 aSig, bSig, zSig0, zSig1;
2903 floatx80 z;
2905 aSig = extractFloatx80Frac( a );
2906 aExp = extractFloatx80Exp( a );
2907 aSign = extractFloatx80Sign( a );
2908 bSig = extractFloatx80Frac( b );
2909 bExp = extractFloatx80Exp( b );
2910 bSign = extractFloatx80Sign( b );
2911 zSign = aSign ^ bSign;
2912 if ( aExp == 0x7FFF ) {
2913 if ( (bits64) ( aSig<<1 )
2914 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
2915 return propagateFloatx80NaN( a, b );
2917 if ( ( bExp | bSig ) == 0 ) goto invalid;
2918 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2920 if ( bExp == 0x7FFF ) {
2921 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
2922 if ( ( aExp | aSig ) == 0 ) {
2923 invalid:
2924 roundData->exception |= float_flag_invalid;
2925 z.low = floatx80_default_nan_low;
2926 z.high = floatx80_default_nan_high;
2927 z.__padding = 0;
2928 return z;
2930 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2932 if ( aExp == 0 ) {
2933 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
2934 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
2936 if ( bExp == 0 ) {
2937 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
2938 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
2940 zExp = aExp + bExp - 0x3FFE;
2941 mul64To128( aSig, bSig, &zSig0, &zSig1 );
2942 if ( 0 < (sbits64) zSig0 ) {
2943 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
2944 --zExp;
2946 return
2947 roundAndPackFloatx80(
2948 roundData, zSign, zExp, zSig0, zSig1 );
2953 -------------------------------------------------------------------------------
2954 Returns the result of dividing the extended double-precision floating-point
2955 value `a' by the corresponding value `b'. The operation is performed
2956 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2957 -------------------------------------------------------------------------------
2959 floatx80 floatx80_div( struct roundingData *roundData, floatx80 a, floatx80 b )
2961 flag aSign, bSign, zSign;
2962 int32 aExp, bExp, zExp;
2963 bits64 aSig, bSig, zSig0, zSig1;
2964 bits64 rem0, rem1, rem2, term0, term1, term2;
2965 floatx80 z;
2967 aSig = extractFloatx80Frac( a );
2968 aExp = extractFloatx80Exp( a );
2969 aSign = extractFloatx80Sign( a );
2970 bSig = extractFloatx80Frac( b );
2971 bExp = extractFloatx80Exp( b );
2972 bSign = extractFloatx80Sign( b );
2973 zSign = aSign ^ bSign;
2974 if ( aExp == 0x7FFF ) {
2975 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
2976 if ( bExp == 0x7FFF ) {
2977 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
2978 goto invalid;
2980 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2982 if ( bExp == 0x7FFF ) {
2983 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
2984 return packFloatx80( zSign, 0, 0 );
2986 if ( bExp == 0 ) {
2987 if ( bSig == 0 ) {
2988 if ( ( aExp | aSig ) == 0 ) {
2989 invalid:
2990 roundData->exception |= float_flag_invalid;
2991 z.low = floatx80_default_nan_low;
2992 z.high = floatx80_default_nan_high;
2993 z.__padding = 0;
2994 return z;
2996 roundData->exception |= float_flag_divbyzero;
2997 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2999 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3001 if ( aExp == 0 ) {
3002 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3003 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3005 zExp = aExp - bExp + 0x3FFE;
3006 rem1 = 0;
3007 if ( bSig <= aSig ) {
3008 shift128Right( aSig, 0, 1, &aSig, &rem1 );
3009 ++zExp;
3011 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3012 mul64To128( bSig, zSig0, &term0, &term1 );
3013 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3014 while ( (sbits64) rem0 < 0 ) {
3015 --zSig0;
3016 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3018 zSig1 = estimateDiv128To64( rem1, 0, bSig );
3019 if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3020 mul64To128( bSig, zSig1, &term1, &term2 );
3021 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3022 while ( (sbits64) rem1 < 0 ) {
3023 --zSig1;
3024 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3026 zSig1 |= ( ( rem1 | rem2 ) != 0 );
3028 return
3029 roundAndPackFloatx80(
3030 roundData, zSign, zExp, zSig0, zSig1 );
3035 -------------------------------------------------------------------------------
3036 Returns the remainder of the extended double-precision floating-point value
3037 `a' with respect to the corresponding value `b'. The operation is performed
3038 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3039 -------------------------------------------------------------------------------
3041 floatx80 floatx80_rem( struct roundingData *roundData, floatx80 a, floatx80 b )
3043 flag aSign, bSign, zSign;
3044 int32 aExp, bExp, expDiff;
3045 bits64 aSig0, aSig1, bSig;
3046 bits64 q, term0, term1, alternateASig0, alternateASig1;
3047 floatx80 z;
3049 aSig0 = extractFloatx80Frac( a );
3050 aExp = extractFloatx80Exp( a );
3051 aSign = extractFloatx80Sign( a );
3052 bSig = extractFloatx80Frac( b );
3053 bExp = extractFloatx80Exp( b );
3054 bSign = extractFloatx80Sign( b );
3055 if ( aExp == 0x7FFF ) {
3056 if ( (bits64) ( aSig0<<1 )
3057 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3058 return propagateFloatx80NaN( a, b );
3060 goto invalid;
3062 if ( bExp == 0x7FFF ) {
3063 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3064 return a;
3066 if ( bExp == 0 ) {
3067 if ( bSig == 0 ) {
3068 invalid:
3069 roundData->exception |= float_flag_invalid;
3070 z.low = floatx80_default_nan_low;
3071 z.high = floatx80_default_nan_high;
3072 z.__padding = 0;
3073 return z;
3075 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3077 if ( aExp == 0 ) {
3078 if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3079 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3081 bSig |= LIT64( 0x8000000000000000 );
3082 zSign = aSign;
3083 expDiff = aExp - bExp;
3084 aSig1 = 0;
3085 if ( expDiff < 0 ) {
3086 if ( expDiff < -1 ) return a;
3087 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3088 expDiff = 0;
3090 q = ( bSig <= aSig0 );
3091 if ( q ) aSig0 -= bSig;
3092 expDiff -= 64;
3093 while ( 0 < expDiff ) {
3094 q = estimateDiv128To64( aSig0, aSig1, bSig );
3095 q = ( 2 < q ) ? q - 2 : 0;
3096 mul64To128( bSig, q, &term0, &term1 );
3097 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3098 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3099 expDiff -= 62;
3101 expDiff += 64;
3102 if ( 0 < expDiff ) {
3103 q = estimateDiv128To64( aSig0, aSig1, bSig );
3104 q = ( 2 < q ) ? q - 2 : 0;
3105 q >>= 64 - expDiff;
3106 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3107 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3108 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3109 while ( le128( term0, term1, aSig0, aSig1 ) ) {
3110 ++q;
3111 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3114 else {
3115 term1 = 0;
3116 term0 = bSig;
3118 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3119 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3120 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3121 && ( q & 1 ) )
3123 aSig0 = alternateASig0;
3124 aSig1 = alternateASig1;
3125 zSign = ! zSign;
3128 return
3129 normalizeRoundAndPackFloatx80(
3130 roundData, zSign, bExp + expDiff, aSig0, aSig1 );
3135 -------------------------------------------------------------------------------
3136 Returns the square root of the extended double-precision floating-point
3137 value `a'. The operation is performed according to the IEC/IEEE Standard
3138 for Binary Floating-point Arithmetic.
3139 -------------------------------------------------------------------------------
3141 floatx80 floatx80_sqrt( struct roundingData *roundData, floatx80 a )
3143 flag aSign;
3144 int32 aExp, zExp;
3145 bits64 aSig0, aSig1, zSig0, zSig1;
3146 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3147 bits64 shiftedRem0, shiftedRem1;
3148 floatx80 z;
3150 aSig0 = extractFloatx80Frac( a );
3151 aExp = extractFloatx80Exp( a );
3152 aSign = extractFloatx80Sign( a );
3153 if ( aExp == 0x7FFF ) {
3154 if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
3155 if ( ! aSign ) return a;
3156 goto invalid;
3158 if ( aSign ) {
3159 if ( ( aExp | aSig0 ) == 0 ) return a;
3160 invalid:
3161 roundData->exception |= float_flag_invalid;
3162 z.low = floatx80_default_nan_low;
3163 z.high = floatx80_default_nan_high;
3164 z.__padding = 0;
3165 return z;
3167 if ( aExp == 0 ) {
3168 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
3169 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3171 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
3172 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
3173 zSig0 <<= 31;
3174 aSig1 = 0;
3175 shift128Right( aSig0, 0, ( aExp & 1 ) + 2, &aSig0, &aSig1 );
3176 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0 ) + zSig0 + 4;
3177 if ( 0 <= (sbits64) zSig0 ) zSig0 = LIT64( 0xFFFFFFFFFFFFFFFF );
3178 shortShift128Left( aSig0, aSig1, 2, &aSig0, &aSig1 );
3179 mul64To128( zSig0, zSig0, &term0, &term1 );
3180 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
3181 while ( (sbits64) rem0 < 0 ) {
3182 --zSig0;
3183 shortShift128Left( 0, zSig0, 1, &term0, &term1 );
3184 term1 |= 1;
3185 add128( rem0, rem1, term0, term1, &rem0, &rem1 );
3187 shortShift128Left( rem0, rem1, 63, &shiftedRem0, &shiftedRem1 );
3188 zSig1 = estimateDiv128To64( shiftedRem0, shiftedRem1, zSig0 );
3189 if ( (bits64) ( zSig1<<1 ) <= 10 ) {
3190 if ( zSig1 == 0 ) zSig1 = 1;
3191 mul64To128( zSig0, zSig1, &term1, &term2 );
3192 shortShift128Left( term1, term2, 1, &term1, &term2 );
3193 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3194 mul64To128( zSig1, zSig1, &term2, &term3 );
3195 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
3196 while ( (sbits64) rem1 < 0 ) {
3197 --zSig1;
3198 shortShift192Left( 0, zSig0, zSig1, 1, &term1, &term2, &term3 );
3199 term3 |= 1;
3200 add192(
3201 rem1, rem2, rem3, term1, term2, term3, &rem1, &rem2, &rem3 );
3203 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
3205 return
3206 roundAndPackFloatx80(
3207 roundData, 0, zExp, zSig0, zSig1 );
3212 -------------------------------------------------------------------------------
3213 Returns 1 if the extended double-precision floating-point value `a' is
3214 equal to the corresponding value `b', and 0 otherwise. The comparison is
3215 performed according to the IEC/IEEE Standard for Binary Floating-point
3216 Arithmetic.
3217 -------------------------------------------------------------------------------
3219 flag floatx80_eq( floatx80 a, floatx80 b )
3222 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3223 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3224 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3225 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3227 if ( floatx80_is_signaling_nan( a )
3228 || floatx80_is_signaling_nan( b ) ) {
3229 float_raise( float_flag_invalid );
3231 return 0;
3233 return
3234 ( a.low == b.low )
3235 && ( ( a.high == b.high )
3236 || ( ( a.low == 0 )
3237 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3243 -------------------------------------------------------------------------------
3244 Returns 1 if the extended double-precision floating-point value `a' is
3245 less than or equal to the corresponding value `b', and 0 otherwise. The
3246 comparison is performed according to the IEC/IEEE Standard for Binary
3247 Floating-point Arithmetic.
3248 -------------------------------------------------------------------------------
3250 flag floatx80_le( floatx80 a, floatx80 b )
3252 flag aSign, bSign;
3254 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3255 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3256 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3257 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3259 float_raise( float_flag_invalid );
3260 return 0;
3262 aSign = extractFloatx80Sign( a );
3263 bSign = extractFloatx80Sign( b );
3264 if ( aSign != bSign ) {
3265 return
3266 aSign
3267 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3268 == 0 );
3270 return
3271 aSign ? le128( b.high, b.low, a.high, a.low )
3272 : le128( a.high, a.low, b.high, b.low );
3277 -------------------------------------------------------------------------------
3278 Returns 1 if the extended double-precision floating-point value `a' is
3279 less than the corresponding value `b', and 0 otherwise. The comparison
3280 is performed according to the IEC/IEEE Standard for Binary Floating-point
3281 Arithmetic.
3282 -------------------------------------------------------------------------------
3284 flag floatx80_lt( floatx80 a, floatx80 b )
3286 flag aSign, bSign;
3288 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3289 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3290 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3291 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3293 float_raise( float_flag_invalid );
3294 return 0;
3296 aSign = extractFloatx80Sign( a );
3297 bSign = extractFloatx80Sign( b );
3298 if ( aSign != bSign ) {
3299 return
3300 aSign
3301 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3302 != 0 );
3304 return
3305 aSign ? lt128( b.high, b.low, a.high, a.low )
3306 : lt128( a.high, a.low, b.high, b.low );
3311 -------------------------------------------------------------------------------
3312 Returns 1 if the extended double-precision floating-point value `a' is equal
3313 to the corresponding value `b', and 0 otherwise. The invalid exception is
3314 raised if either operand is a NaN. Otherwise, the comparison is performed
3315 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3316 -------------------------------------------------------------------------------
3318 flag floatx80_eq_signaling( floatx80 a, floatx80 b )
3321 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3322 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3323 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3324 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3326 float_raise( float_flag_invalid );
3327 return 0;
3329 return
3330 ( a.low == b.low )
3331 && ( ( a.high == b.high )
3332 || ( ( a.low == 0 )
3333 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3339 -------------------------------------------------------------------------------
3340 Returns 1 if the extended double-precision floating-point value `a' is less
3341 than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
3342 do not cause an exception. Otherwise, the comparison is performed according
3343 to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3344 -------------------------------------------------------------------------------
3346 flag floatx80_le_quiet( floatx80 a, floatx80 b )
3348 flag aSign, bSign;
3350 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3351 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3352 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3353 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3355 /* Do nothing, even if NaN as we're quiet */
3356 return 0;
3358 aSign = extractFloatx80Sign( a );
3359 bSign = extractFloatx80Sign( b );
3360 if ( aSign != bSign ) {
3361 return
3362 aSign
3363 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3364 == 0 );
3366 return
3367 aSign ? le128( b.high, b.low, a.high, a.low )
3368 : le128( a.high, a.low, b.high, b.low );
3373 -------------------------------------------------------------------------------
3374 Returns 1 if the extended double-precision floating-point value `a' is less
3375 than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
3376 an exception. Otherwise, the comparison is performed according to the
3377 IEC/IEEE Standard for Binary Floating-point Arithmetic.
3378 -------------------------------------------------------------------------------
3380 flag floatx80_lt_quiet( floatx80 a, floatx80 b )
3382 flag aSign, bSign;
3384 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3385 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3386 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3387 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3389 /* Do nothing, even if NaN as we're quiet */
3390 return 0;
3392 aSign = extractFloatx80Sign( a );
3393 bSign = extractFloatx80Sign( b );
3394 if ( aSign != bSign ) {
3395 return
3396 aSign
3397 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3398 != 0 );
3400 return
3401 aSign ? lt128( b.high, b.low, a.high, a.low )
3402 : lt128( a.high, a.low, b.high, b.low );
3406 #endif