* add p cc
[mascara-docs.git] / compilers / pcc / pcc-libs-1.0.0 / libsoftfloat / bits32 / softfloat.c
blobbf9c37e4f0a6d01ed3f4596f62b31e5fe55381f7
1 /* $NetBSD: softfloat.c,v 1.1 2002/05/21 23:51:07 bjh21 Exp $ */
3 /*
4 * This version hacked for use with gcc -msoft-float by bjh21.
5 * (Mostly a case of #ifdefing out things GCC doesn't need or provides
6 * itself).
7 */
9 /*
10 * Things you may want to define:
12 * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with
13 * -msoft-float) to work. Include "softfloat-for-gcc.h" to get them
14 * properly renamed.
18 * This differs from the standard bits32/softfloat.c in that float64
19 * is defined to be a 64-bit integer rather than a structure. The
20 * structure is float64s, with translation between the two going via
21 * float64u.
25 ===============================================================================
27 This C source file is part of the SoftFloat IEC/IEEE Floating-Point
28 Arithmetic Package, Release 2a.
30 Written by John R. Hauser. This work was made possible in part by the
31 International Computer Science Institute, located at Suite 600, 1947 Center
32 Street, Berkeley, California 94704. Funding was partially provided by the
33 National Science Foundation under grant MIP-9311980. The original version
34 of this code was written as part of a project to build a fixed-point vector
35 processor in collaboration with the University of California at Berkeley,
36 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
37 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
38 arithmetic/SoftFloat.html'.
40 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
41 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
42 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
43 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
44 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
46 Derivative works are acceptable, even for commercial purposes, so long as
47 (1) they include prominent notice that the work is derivative, and (2) they
48 include prominent notice akin to these four paragraphs for those parts of
49 this code that are retained.
51 ===============================================================================
54 #include <sys/cdefs.h>
55 #if defined(LIBC_SCCS) && !defined(lint)
56 __RCSID("$NetBSD: softfloat.c,v 1.1 2002/05/21 23:51:07 bjh21 Exp $");
57 #endif /* LIBC_SCCS and not lint */
59 #ifdef SOFTFLOAT_FOR_GCC
60 #include "softfloat-for-gcc.h"
61 #endif
63 #include "milieu.h"
64 #include "softfloat.h"
67 * Conversions between floats as stored in memory and floats as
68 * SoftFloat uses them
70 #ifndef FLOAT64_DEMANGLE
71 #define FLOAT64_DEMANGLE(a) (a)
72 #endif
73 #ifndef FLOAT64_MANGLE
74 #define FLOAT64_MANGLE(a) (a)
75 #endif
78 -------------------------------------------------------------------------------
79 Floating-point rounding mode and exception flags.
80 -------------------------------------------------------------------------------
82 fp_rnd float_rounding_mode = float_round_nearest_even;
83 fp_except float_exception_flags = 0;
86 -------------------------------------------------------------------------------
87 Primitive arithmetic functions, including multi-word arithmetic, and
88 division and square root approximations. (Can be specialized to target if
89 desired.)
90 -------------------------------------------------------------------------------
92 #include "softfloat-macros"
95 -------------------------------------------------------------------------------
96 Functions and definitions to determine: (1) whether tininess for underflow
97 is detected before or after rounding by default, (2) what (if anything)
98 happens when exceptions are raised, (3) how signaling NaNs are distinguished
99 from quiet NaNs, (4) the default generated quiet NaNs, and (4) how NaNs
100 are propagated from function inputs to output. These details are target-
101 specific.
102 -------------------------------------------------------------------------------
104 #include "softfloat-specialize"
107 -------------------------------------------------------------------------------
108 Returns the fraction bits of the single-precision floating-point value `a'.
109 -------------------------------------------------------------------------------
111 INLINE bits32 extractFloat32Frac( float32 a )
114 return a & 0x007FFFFF;
119 -------------------------------------------------------------------------------
120 Returns the exponent bits of the single-precision floating-point value `a'.
121 -------------------------------------------------------------------------------
123 INLINE int16 extractFloat32Exp( float32 a )
126 return ( a>>23 ) & 0xFF;
131 -------------------------------------------------------------------------------
132 Returns the sign bit of the single-precision floating-point value `a'.
133 -------------------------------------------------------------------------------
135 INLINE flag extractFloat32Sign( float32 a )
138 return a>>31;
143 -------------------------------------------------------------------------------
144 Normalizes the subnormal single-precision floating-point value represented
145 by the denormalized significand `aSig'. The normalized exponent and
146 significand are stored at the locations pointed to by `zExpPtr' and
147 `zSigPtr', respectively.
148 -------------------------------------------------------------------------------
150 static void
151 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
153 int8 shiftCount;
155 shiftCount = countLeadingZeros32( aSig ) - 8;
156 *zSigPtr = aSig<<shiftCount;
157 *zExpPtr = 1 - shiftCount;
162 -------------------------------------------------------------------------------
163 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
164 single-precision floating-point value, returning the result. After being
165 shifted into the proper positions, the three fields are simply added
166 together to form the result. This means that any integer portion of `zSig'
167 will be added into the exponent. Since a properly normalized significand
168 will have an integer portion equal to 1, the `zExp' input should be 1 less
169 than the desired result exponent whenever `zSig' is a complete, normalized
170 significand.
171 -------------------------------------------------------------------------------
173 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
176 return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
181 -------------------------------------------------------------------------------
182 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
183 and significand `zSig', and returns the proper single-precision floating-
184 point value corresponding to the abstract input. Ordinarily, the abstract
185 value is simply rounded and packed into the single-precision format, with
186 the inexact exception raised if the abstract input cannot be represented
187 exactly. However, if the abstract value is too large, the overflow and
188 inexact exceptions are raised and an infinity or maximal finite value is
189 returned. If the abstract value is too small, the input value is rounded to
190 a subnormal number, and the underflow and inexact exceptions are raised if
191 the abstract input cannot be represented exactly as a subnormal single-
192 precision floating-point number.
193 The input significand `zSig' has its binary point between bits 30
194 and 29, which is 7 bits to the left of the usual location. This shifted
195 significand must be normalized or smaller. If `zSig' is not normalized,
196 `zExp' must be 0; in that case, the result returned is a subnormal number,
197 and it must not require rounding. In the usual case that `zSig' is
198 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
199 The handling of underflow and overflow follows the IEC/IEEE Standard for
200 Binary Floating-Point Arithmetic.
201 -------------------------------------------------------------------------------
203 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
205 int8 roundingMode;
206 flag roundNearestEven;
207 int8 roundIncrement, roundBits;
208 flag isTiny;
210 roundingMode = float_rounding_mode;
211 roundNearestEven = roundingMode == float_round_nearest_even;
212 roundIncrement = 0x40;
213 if ( ! roundNearestEven ) {
214 if ( roundingMode == float_round_to_zero ) {
215 roundIncrement = 0;
217 else {
218 roundIncrement = 0x7F;
219 if ( zSign ) {
220 if ( roundingMode == float_round_up ) roundIncrement = 0;
222 else {
223 if ( roundingMode == float_round_down ) roundIncrement = 0;
227 roundBits = zSig & 0x7F;
228 if ( 0xFD <= (bits16) zExp ) {
229 if ( ( 0xFD < zExp )
230 || ( ( zExp == 0xFD )
231 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
233 float_raise( float_flag_overflow | float_flag_inexact );
234 return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
236 if ( zExp < 0 ) {
237 isTiny =
238 ( float_detect_tininess == float_tininess_before_rounding )
239 || ( zExp < -1 )
240 || ( zSig + roundIncrement < 0x80000000 );
241 shift32RightJamming( zSig, - zExp, &zSig );
242 zExp = 0;
243 roundBits = zSig & 0x7F;
244 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
247 if ( roundBits ) float_exception_flags |= float_flag_inexact;
248 zSig = ( zSig + roundIncrement )>>7;
249 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
250 if ( zSig == 0 ) zExp = 0;
251 return packFloat32( zSign, zExp, zSig );
256 -------------------------------------------------------------------------------
257 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
258 and significand `zSig', and returns the proper single-precision floating-
259 point value corresponding to the abstract input. This routine is just like
260 `roundAndPackFloat32' except that `zSig' does not have to be normalized.
261 Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
262 floating-point exponent.
263 -------------------------------------------------------------------------------
265 static float32
266 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
268 int8 shiftCount;
270 shiftCount = countLeadingZeros32( zSig ) - 1;
271 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
276 -------------------------------------------------------------------------------
277 Returns the least-significant 32 fraction bits of the double-precision
278 floating-point value `a'.
279 -------------------------------------------------------------------------------
281 INLINE bits32 extractFloat64Frac1( float64 a )
284 return FLOAT64_DEMANGLE(a) & LIT64( 0x00000000FFFFFFFF );
289 -------------------------------------------------------------------------------
290 Returns the most-significant 20 fraction bits of the double-precision
291 floating-point value `a'.
292 -------------------------------------------------------------------------------
294 INLINE bits32 extractFloat64Frac0( float64 a )
297 return ( FLOAT64_DEMANGLE(a)>>32 ) & 0x000FFFFF;
302 -------------------------------------------------------------------------------
303 Returns the exponent bits of the double-precision floating-point value `a'.
304 -------------------------------------------------------------------------------
306 INLINE int16 extractFloat64Exp( float64 a )
309 return ( FLOAT64_DEMANGLE(a)>>52 ) & 0x7FF;
314 -------------------------------------------------------------------------------
315 Returns the sign bit of the double-precision floating-point value `a'.
316 -------------------------------------------------------------------------------
318 INLINE flag extractFloat64Sign( float64 a )
321 return FLOAT64_DEMANGLE(a)>>63;
326 -------------------------------------------------------------------------------
327 Normalizes the subnormal double-precision floating-point value represented
328 by the denormalized significand formed by the concatenation of `aSig0' and
329 `aSig1'. The normalized exponent is stored at the location pointed to by
330 `zExpPtr'. The most significant 21 bits of the normalized significand are
331 stored at the location pointed to by `zSig0Ptr', and the least significant
332 32 bits of the normalized significand are stored at the location pointed to
333 by `zSig1Ptr'.
334 -------------------------------------------------------------------------------
336 static void
337 normalizeFloat64Subnormal(
338 bits32 aSig0,
339 bits32 aSig1,
340 int16 *zExpPtr,
341 bits32 *zSig0Ptr,
342 bits32 *zSig1Ptr
345 int8 shiftCount;
347 if ( aSig0 == 0 ) {
348 shiftCount = countLeadingZeros32( aSig1 ) - 11;
349 if ( shiftCount < 0 ) {
350 *zSig0Ptr = aSig1>>( - shiftCount );
351 *zSig1Ptr = aSig1<<( shiftCount & 31 );
353 else {
354 *zSig0Ptr = aSig1<<shiftCount;
355 *zSig1Ptr = 0;
357 *zExpPtr = - shiftCount - 31;
359 else {
360 shiftCount = countLeadingZeros32( aSig0 ) - 11;
361 shortShift64Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
362 *zExpPtr = 1 - shiftCount;
368 -------------------------------------------------------------------------------
369 Packs the sign `zSign', the exponent `zExp', and the significand formed by
370 the concatenation of `zSig0' and `zSig1' into a double-precision floating-
371 point value, returning the result. After being shifted into the proper
372 positions, the three fields `zSign', `zExp', and `zSig0' are simply added
373 together to form the most significant 32 bits of the result. This means
374 that any integer portion of `zSig0' will be added into the exponent. Since
375 a properly normalized significand will have an integer portion equal to 1,
376 the `zExp' input should be 1 less than the desired result exponent whenever
377 `zSig0' and `zSig1' concatenated form a complete, normalized significand.
378 -------------------------------------------------------------------------------
380 INLINE float64
381 packFloat64( flag zSign, int16 zExp, bits32 zSig0, bits32 zSig1 )
384 return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) +
385 ( ( (bits64) zExp )<<52 ) +
386 ( ( (bits64) zSig0 )<<32 ) + zSig1 );
392 -------------------------------------------------------------------------------
393 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
394 and extended significand formed by the concatenation of `zSig0', `zSig1',
395 and `zSig2', and returns the proper double-precision floating-point value
396 corresponding to the abstract input. Ordinarily, the abstract value is
397 simply rounded and packed into the double-precision format, with the inexact
398 exception raised if the abstract input cannot be represented exactly.
399 However, if the abstract value is too large, the overflow and inexact
400 exceptions are raised and an infinity or maximal finite value is returned.
401 If the abstract value is too small, the input value is rounded to a
402 subnormal number, and the underflow and inexact exceptions are raised if the
403 abstract input cannot be represented exactly as a subnormal double-precision
404 floating-point number.
405 The input significand must be normalized or smaller. If the input
406 significand is not normalized, `zExp' must be 0; in that case, the result
407 returned is a subnormal number, and it must not require rounding. In the
408 usual case that the input significand is normalized, `zExp' must be 1 less
409 than the ``true'' floating-point exponent. The handling of underflow and
410 overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
411 -------------------------------------------------------------------------------
413 static float64
414 roundAndPackFloat64(
415 flag zSign, int16 zExp, bits32 zSig0, bits32 zSig1, bits32 zSig2 )
417 int8 roundingMode;
418 flag roundNearestEven, increment, isTiny;
420 roundingMode = float_rounding_mode;
421 roundNearestEven = ( roundingMode == float_round_nearest_even );
422 increment = ( (sbits32) zSig2 < 0 );
423 if ( ! roundNearestEven ) {
424 if ( roundingMode == float_round_to_zero ) {
425 increment = 0;
427 else {
428 if ( zSign ) {
429 increment = ( roundingMode == float_round_down ) && zSig2;
431 else {
432 increment = ( roundingMode == float_round_up ) && zSig2;
436 if ( 0x7FD <= (bits16) zExp ) {
437 if ( ( 0x7FD < zExp )
438 || ( ( zExp == 0x7FD )
439 && eq64( 0x001FFFFF, 0xFFFFFFFF, zSig0, zSig1 )
440 && increment
443 float_raise( float_flag_overflow | float_flag_inexact );
444 if ( ( roundingMode == float_round_to_zero )
445 || ( zSign && ( roundingMode == float_round_up ) )
446 || ( ! zSign && ( roundingMode == float_round_down ) )
448 return packFloat64( zSign, 0x7FE, 0x000FFFFF, 0xFFFFFFFF );
450 return packFloat64( zSign, 0x7FF, 0, 0 );
452 if ( zExp < 0 ) {
453 isTiny =
454 ( float_detect_tininess == float_tininess_before_rounding )
455 || ( zExp < -1 )
456 || ! increment
457 || lt64( zSig0, zSig1, 0x001FFFFF, 0xFFFFFFFF );
458 shift64ExtraRightJamming(
459 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
460 zExp = 0;
461 if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
462 if ( roundNearestEven ) {
463 increment = ( (sbits32) zSig2 < 0 );
465 else {
466 if ( zSign ) {
467 increment = ( roundingMode == float_round_down ) && zSig2;
469 else {
470 increment = ( roundingMode == float_round_up ) && zSig2;
475 if ( zSig2 ) float_exception_flags |= float_flag_inexact;
476 if ( increment ) {
477 add64( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
478 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
480 else {
481 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
483 return packFloat64( zSign, zExp, zSig0, zSig1 );
488 -------------------------------------------------------------------------------
489 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
490 and significand formed by the concatenation of `zSig0' and `zSig1', and
491 returns the proper double-precision floating-point value corresponding
492 to the abstract input. This routine is just like `roundAndPackFloat64'
493 except that the input significand has fewer bits and does not have to be
494 normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
495 point exponent.
496 -------------------------------------------------------------------------------
498 static float64
499 normalizeRoundAndPackFloat64(
500 flag zSign, int16 zExp, bits32 zSig0, bits32 zSig1 )
502 int8 shiftCount;
503 bits32 zSig2;
505 if ( zSig0 == 0 ) {
506 zSig0 = zSig1;
507 zSig1 = 0;
508 zExp -= 32;
510 shiftCount = countLeadingZeros32( zSig0 ) - 11;
511 if ( 0 <= shiftCount ) {
512 zSig2 = 0;
513 shortShift64Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
515 else {
516 shift64ExtraRightJamming(
517 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
519 zExp -= shiftCount;
520 return roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2 );
525 -------------------------------------------------------------------------------
526 Returns the result of converting the 32-bit two's complement integer `a' to
527 the single-precision floating-point format. The conversion is performed
528 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
529 -------------------------------------------------------------------------------
531 float32 int32_to_float32( int32 a )
533 flag zSign;
535 if ( a == 0 ) return 0;
536 if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
537 zSign = ( a < 0 );
538 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
543 -------------------------------------------------------------------------------
544 Returns the result of converting the 32-bit two's complement integer `a' to
545 the double-precision floating-point format. The conversion is performed
546 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
547 -------------------------------------------------------------------------------
549 float64 int32_to_float64( int32 a )
551 flag zSign;
552 bits32 absA;
553 int8 shiftCount;
554 bits32 zSig0, zSig1;
556 if ( a == 0 ) return packFloat64( 0, 0, 0, 0 );
557 zSign = ( a < 0 );
558 absA = zSign ? - a : a;
559 shiftCount = countLeadingZeros32( absA ) - 11;
560 if ( 0 <= shiftCount ) {
561 zSig0 = absA<<shiftCount;
562 zSig1 = 0;
564 else {
565 shift64Right( absA, 0, - shiftCount, &zSig0, &zSig1 );
567 return packFloat64( zSign, 0x412 - shiftCount, zSig0, zSig1 );
571 #ifndef SOFTFLOAT_FOR_GCC
573 -------------------------------------------------------------------------------
574 Returns the result of converting the single-precision floating-point value
575 `a' to the 32-bit two's complement integer format. The conversion is
576 performed according to the IEC/IEEE Standard for Binary Floating-Point
577 Arithmetic---which means in particular that the conversion is rounded
578 according to the current rounding mode. If `a' is a NaN, the largest
579 positive integer is returned. Otherwise, if the conversion overflows, the
580 largest integer with the same sign as `a' is returned.
581 -------------------------------------------------------------------------------
583 int32 float32_to_int32( float32 a )
585 flag aSign;
586 int16 aExp, shiftCount;
587 bits32 aSig, aSigExtra;
588 int32 z;
589 int8 roundingMode;
591 aSig = extractFloat32Frac( a );
592 aExp = extractFloat32Exp( a );
593 aSign = extractFloat32Sign( a );
594 shiftCount = aExp - 0x96;
595 if ( 0 <= shiftCount ) {
596 if ( 0x9E <= aExp ) {
597 if ( a != 0xCF000000 ) {
598 float_raise( float_flag_invalid );
599 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
600 return 0x7FFFFFFF;
603 return (sbits32) 0x80000000;
605 z = ( aSig | 0x00800000 )<<shiftCount;
606 if ( aSign ) z = - z;
608 else {
609 if ( aExp < 0x7E ) {
610 aSigExtra = aExp | aSig;
611 z = 0;
613 else {
614 aSig |= 0x00800000;
615 aSigExtra = aSig<<( shiftCount & 31 );
616 z = aSig>>( - shiftCount );
618 if ( aSigExtra ) float_exception_flags |= float_flag_inexact;
619 roundingMode = float_rounding_mode;
620 if ( roundingMode == float_round_nearest_even ) {
621 if ( (sbits32) aSigExtra < 0 ) {
622 ++z;
623 if ( (bits32) ( aSigExtra<<1 ) == 0 ) z &= ~1;
625 if ( aSign ) z = - z;
627 else {
628 aSigExtra = ( aSigExtra != 0 );
629 if ( aSign ) {
630 z += ( roundingMode == float_round_down ) & aSigExtra;
631 z = - z;
633 else {
634 z += ( roundingMode == float_round_up ) & aSigExtra;
638 return z;
641 #endif
644 -------------------------------------------------------------------------------
645 Returns the result of converting the single-precision floating-point value
646 `a' to the 32-bit two's complement integer format. The conversion is
647 performed according to the IEC/IEEE Standard for Binary Floating-Point
648 Arithmetic, except that the conversion is always rounded toward zero.
649 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
650 the conversion overflows, the largest integer with the same sign as `a' is
651 returned.
652 -------------------------------------------------------------------------------
654 int32 float32_to_int32_round_to_zero( float32 a )
656 flag aSign;
657 int16 aExp, shiftCount;
658 bits32 aSig;
659 int32 z;
661 aSig = extractFloat32Frac( a );
662 aExp = extractFloat32Exp( a );
663 aSign = extractFloat32Sign( a );
664 shiftCount = aExp - 0x9E;
665 if ( 0 <= shiftCount ) {
666 if ( a != 0xCF000000 ) {
667 float_raise( float_flag_invalid );
668 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
670 return (sbits32) 0x80000000;
672 else if ( aExp <= 0x7E ) {
673 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
674 return 0;
676 aSig = ( aSig | 0x00800000 )<<8;
677 z = aSig>>( - shiftCount );
678 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
679 float_exception_flags |= float_flag_inexact;
681 if ( aSign ) z = - z;
682 return z;
687 -------------------------------------------------------------------------------
688 Returns the result of converting the single-precision floating-point value
689 `a' to the double-precision floating-point format. The conversion is
690 performed according to the IEC/IEEE Standard for Binary Floating-Point
691 Arithmetic.
692 -------------------------------------------------------------------------------
694 float64 float32_to_float64( float32 a )
696 flag aSign;
697 int16 aExp;
698 bits32 aSig, zSig0, zSig1;
700 aSig = extractFloat32Frac( a );
701 aExp = extractFloat32Exp( a );
702 aSign = extractFloat32Sign( a );
703 if ( aExp == 0xFF ) {
704 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
705 return packFloat64( aSign, 0x7FF, 0, 0 );
707 if ( aExp == 0 ) {
708 if ( aSig == 0 ) return packFloat64( aSign, 0, 0, 0 );
709 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
710 --aExp;
712 shift64Right( aSig, 0, 3, &zSig0, &zSig1 );
713 return packFloat64( aSign, aExp + 0x380, zSig0, zSig1 );
717 #ifndef SOFTFLOAT_FOR_GCC
719 -------------------------------------------------------------------------------
720 Rounds the single-precision floating-point value `a' to an integer,
721 and returns the result as a single-precision floating-point value. The
722 operation is performed according to the IEC/IEEE Standard for Binary
723 Floating-Point Arithmetic.
724 -------------------------------------------------------------------------------
726 float32 float32_round_to_int( float32 a )
728 flag aSign;
729 int16 aExp;
730 bits32 lastBitMask, roundBitsMask;
731 int8 roundingMode;
732 float32 z;
734 aExp = extractFloat32Exp( a );
735 if ( 0x96 <= aExp ) {
736 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
737 return propagateFloat32NaN( a, a );
739 return a;
741 if ( aExp <= 0x7E ) {
742 if ( (bits32) ( a<<1 ) == 0 ) return a;
743 float_exception_flags |= float_flag_inexact;
744 aSign = extractFloat32Sign( a );
745 switch ( float_rounding_mode ) {
746 case float_round_nearest_even:
747 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
748 return packFloat32( aSign, 0x7F, 0 );
750 break;
751 case float_round_to_zero:
752 break;
753 case float_round_down:
754 return aSign ? 0xBF800000 : 0;
755 case float_round_up:
756 return aSign ? 0x80000000 : 0x3F800000;
758 return packFloat32( aSign, 0, 0 );
760 lastBitMask = 1;
761 lastBitMask <<= 0x96 - aExp;
762 roundBitsMask = lastBitMask - 1;
763 z = a;
764 roundingMode = float_rounding_mode;
765 if ( roundingMode == float_round_nearest_even ) {
766 z += lastBitMask>>1;
767 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
769 else if ( roundingMode != float_round_to_zero ) {
770 if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
771 z += roundBitsMask;
774 z &= ~ roundBitsMask;
775 if ( z != a ) float_exception_flags |= float_flag_inexact;
776 return z;
779 #endif
782 -------------------------------------------------------------------------------
783 Returns the result of adding the absolute values of the single-precision
784 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
785 before being returned. `zSign' is ignored if the result is a NaN.
786 The addition is performed according to the IEC/IEEE Standard for Binary
787 Floating-Point Arithmetic.
788 -------------------------------------------------------------------------------
790 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
792 int16 aExp, bExp, zExp;
793 bits32 aSig, bSig, zSig;
794 int16 expDiff;
796 aSig = extractFloat32Frac( a );
797 aExp = extractFloat32Exp( a );
798 bSig = extractFloat32Frac( b );
799 bExp = extractFloat32Exp( b );
800 expDiff = aExp - bExp;
801 aSig <<= 6;
802 bSig <<= 6;
803 if ( 0 < expDiff ) {
804 if ( aExp == 0xFF ) {
805 if ( aSig ) return propagateFloat32NaN( a, b );
806 return a;
808 if ( bExp == 0 ) {
809 --expDiff;
811 else {
812 bSig |= 0x20000000;
814 shift32RightJamming( bSig, expDiff, &bSig );
815 zExp = aExp;
817 else if ( expDiff < 0 ) {
818 if ( bExp == 0xFF ) {
819 if ( bSig ) return propagateFloat32NaN( a, b );
820 return packFloat32( zSign, 0xFF, 0 );
822 if ( aExp == 0 ) {
823 ++expDiff;
825 else {
826 aSig |= 0x20000000;
828 shift32RightJamming( aSig, - expDiff, &aSig );
829 zExp = bExp;
831 else {
832 if ( aExp == 0xFF ) {
833 if ( aSig | bSig ) return propagateFloat32NaN( a, b );
834 return a;
836 if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
837 zSig = 0x40000000 + aSig + bSig;
838 zExp = aExp;
839 goto roundAndPack;
841 aSig |= 0x20000000;
842 zSig = ( aSig + bSig )<<1;
843 --zExp;
844 if ( (sbits32) zSig < 0 ) {
845 zSig = aSig + bSig;
846 ++zExp;
848 roundAndPack:
849 return roundAndPackFloat32( zSign, zExp, zSig );
854 -------------------------------------------------------------------------------
855 Returns the result of subtracting the absolute values of the single-
856 precision floating-point values `a' and `b'. If `zSign' is 1, the
857 difference is negated before being returned. `zSign' is ignored if the
858 result is a NaN. The subtraction is performed according to the IEC/IEEE
859 Standard for Binary Floating-Point Arithmetic.
860 -------------------------------------------------------------------------------
862 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
864 int16 aExp, bExp, zExp;
865 bits32 aSig, bSig, zSig;
866 int16 expDiff;
868 aSig = extractFloat32Frac( a );
869 aExp = extractFloat32Exp( a );
870 bSig = extractFloat32Frac( b );
871 bExp = extractFloat32Exp( b );
872 expDiff = aExp - bExp;
873 aSig <<= 7;
874 bSig <<= 7;
875 if ( 0 < expDiff ) goto aExpBigger;
876 if ( expDiff < 0 ) goto bExpBigger;
877 if ( aExp == 0xFF ) {
878 if ( aSig | bSig ) return propagateFloat32NaN( a, b );
879 float_raise( float_flag_invalid );
880 return float32_default_nan;
882 if ( aExp == 0 ) {
883 aExp = 1;
884 bExp = 1;
886 if ( bSig < aSig ) goto aBigger;
887 if ( aSig < bSig ) goto bBigger;
888 return packFloat32( float_rounding_mode == float_round_down, 0, 0 );
889 bExpBigger:
890 if ( bExp == 0xFF ) {
891 if ( bSig ) return propagateFloat32NaN( a, b );
892 return packFloat32( zSign ^ 1, 0xFF, 0 );
894 if ( aExp == 0 ) {
895 ++expDiff;
897 else {
898 aSig |= 0x40000000;
900 shift32RightJamming( aSig, - expDiff, &aSig );
901 bSig |= 0x40000000;
902 bBigger:
903 zSig = bSig - aSig;
904 zExp = bExp;
905 zSign ^= 1;
906 goto normalizeRoundAndPack;
907 aExpBigger:
908 if ( aExp == 0xFF ) {
909 if ( aSig ) return propagateFloat32NaN( a, b );
910 return a;
912 if ( bExp == 0 ) {
913 --expDiff;
915 else {
916 bSig |= 0x40000000;
918 shift32RightJamming( bSig, expDiff, &bSig );
919 aSig |= 0x40000000;
920 aBigger:
921 zSig = aSig - bSig;
922 zExp = aExp;
923 normalizeRoundAndPack:
924 --zExp;
925 return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
930 -------------------------------------------------------------------------------
931 Returns the result of adding the single-precision floating-point values `a'
932 and `b'. The operation is performed according to the IEC/IEEE Standard for
933 Binary Floating-Point Arithmetic.
934 -------------------------------------------------------------------------------
936 float32 float32_add( float32 a, float32 b )
938 flag aSign, bSign;
940 aSign = extractFloat32Sign( a );
941 bSign = extractFloat32Sign( b );
942 if ( aSign == bSign ) {
943 return addFloat32Sigs( a, b, aSign );
945 else {
946 return subFloat32Sigs( a, b, aSign );
952 -------------------------------------------------------------------------------
953 Returns the result of subtracting the single-precision floating-point values
954 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
955 for Binary Floating-Point Arithmetic.
956 -------------------------------------------------------------------------------
958 float32 float32_sub( float32 a, float32 b )
960 flag aSign, bSign;
962 aSign = extractFloat32Sign( a );
963 bSign = extractFloat32Sign( b );
964 if ( aSign == bSign ) {
965 return subFloat32Sigs( a, b, aSign );
967 else {
968 return addFloat32Sigs( a, b, aSign );
974 -------------------------------------------------------------------------------
975 Returns the result of multiplying the single-precision floating-point values
976 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
977 for Binary Floating-Point Arithmetic.
978 -------------------------------------------------------------------------------
980 float32 float32_mul( float32 a, float32 b )
982 flag aSign, bSign, zSign;
983 int16 aExp, bExp, zExp;
984 bits32 aSig, bSig, zSig0, zSig1;
986 aSig = extractFloat32Frac( a );
987 aExp = extractFloat32Exp( a );
988 aSign = extractFloat32Sign( a );
989 bSig = extractFloat32Frac( b );
990 bExp = extractFloat32Exp( b );
991 bSign = extractFloat32Sign( b );
992 zSign = aSign ^ bSign;
993 if ( aExp == 0xFF ) {
994 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
995 return propagateFloat32NaN( a, b );
997 if ( ( bExp | bSig ) == 0 ) {
998 float_raise( float_flag_invalid );
999 return float32_default_nan;
1001 return packFloat32( zSign, 0xFF, 0 );
1003 if ( bExp == 0xFF ) {
1004 if ( bSig ) return propagateFloat32NaN( a, b );
1005 if ( ( aExp | aSig ) == 0 ) {
1006 float_raise( float_flag_invalid );
1007 return float32_default_nan;
1009 return packFloat32( zSign, 0xFF, 0 );
1011 if ( aExp == 0 ) {
1012 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1013 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1015 if ( bExp == 0 ) {
1016 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1017 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1019 zExp = aExp + bExp - 0x7F;
1020 aSig = ( aSig | 0x00800000 )<<7;
1021 bSig = ( bSig | 0x00800000 )<<8;
1022 mul32To64( aSig, bSig, &zSig0, &zSig1 );
1023 zSig0 |= ( zSig1 != 0 );
1024 if ( 0 <= (sbits32) ( zSig0<<1 ) ) {
1025 zSig0 <<= 1;
1026 --zExp;
1028 return roundAndPackFloat32( zSign, zExp, zSig0 );
1033 -------------------------------------------------------------------------------
1034 Returns the result of dividing the single-precision floating-point value `a'
1035 by the corresponding value `b'. The operation is performed according to the
1036 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1037 -------------------------------------------------------------------------------
1039 float32 float32_div( float32 a, float32 b )
1041 flag aSign, bSign, zSign;
1042 int16 aExp, bExp, zExp;
1043 bits32 aSig, bSig, zSig, rem0, rem1, term0, term1;
1045 aSig = extractFloat32Frac( a );
1046 aExp = extractFloat32Exp( a );
1047 aSign = extractFloat32Sign( a );
1048 bSig = extractFloat32Frac( b );
1049 bExp = extractFloat32Exp( b );
1050 bSign = extractFloat32Sign( b );
1051 zSign = aSign ^ bSign;
1052 if ( aExp == 0xFF ) {
1053 if ( aSig ) return propagateFloat32NaN( a, b );
1054 if ( bExp == 0xFF ) {
1055 if ( bSig ) return propagateFloat32NaN( a, b );
1056 float_raise( float_flag_invalid );
1057 return float32_default_nan;
1059 return packFloat32( zSign, 0xFF, 0 );
1061 if ( bExp == 0xFF ) {
1062 if ( bSig ) return propagateFloat32NaN( a, b );
1063 return packFloat32( zSign, 0, 0 );
1065 if ( bExp == 0 ) {
1066 if ( bSig == 0 ) {
1067 if ( ( aExp | aSig ) == 0 ) {
1068 float_raise( float_flag_invalid );
1069 return float32_default_nan;
1071 float_raise( float_flag_divbyzero );
1072 return packFloat32( zSign, 0xFF, 0 );
1074 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1076 if ( aExp == 0 ) {
1077 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1078 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1080 zExp = aExp - bExp + 0x7D;
1081 aSig = ( aSig | 0x00800000 )<<7;
1082 bSig = ( bSig | 0x00800000 )<<8;
1083 if ( bSig <= ( aSig + aSig ) ) {
1084 aSig >>= 1;
1085 ++zExp;
1087 zSig = estimateDiv64To32( aSig, 0, bSig );
1088 if ( ( zSig & 0x3F ) <= 2 ) {
1089 mul32To64( bSig, zSig, &term0, &term1 );
1090 sub64( aSig, 0, term0, term1, &rem0, &rem1 );
1091 while ( (sbits32) rem0 < 0 ) {
1092 --zSig;
1093 add64( rem0, rem1, 0, bSig, &rem0, &rem1 );
1095 zSig |= ( rem1 != 0 );
1097 return roundAndPackFloat32( zSign, zExp, zSig );
1101 #ifndef SOFTFLOAT_FOR_GCC
1103 -------------------------------------------------------------------------------
1104 Returns the remainder of the single-precision floating-point value `a'
1105 with respect to the corresponding value `b'. The operation is performed
1106 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1107 -------------------------------------------------------------------------------
1109 float32 float32_rem( float32 a, float32 b )
1111 flag aSign, bSign, zSign;
1112 int16 aExp, bExp, expDiff;
1113 bits32 aSig, bSig, q, allZero, alternateASig;
1114 sbits32 sigMean;
1116 aSig = extractFloat32Frac( a );
1117 aExp = extractFloat32Exp( a );
1118 aSign = extractFloat32Sign( a );
1119 bSig = extractFloat32Frac( b );
1120 bExp = extractFloat32Exp( b );
1121 bSign = extractFloat32Sign( b );
1122 if ( aExp == 0xFF ) {
1123 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1124 return propagateFloat32NaN( a, b );
1126 float_raise( float_flag_invalid );
1127 return float32_default_nan;
1129 if ( bExp == 0xFF ) {
1130 if ( bSig ) return propagateFloat32NaN( a, b );
1131 return a;
1133 if ( bExp == 0 ) {
1134 if ( bSig == 0 ) {
1135 float_raise( float_flag_invalid );
1136 return float32_default_nan;
1138 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1140 if ( aExp == 0 ) {
1141 if ( aSig == 0 ) return a;
1142 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1144 expDiff = aExp - bExp;
1145 aSig = ( aSig | 0x00800000 )<<8;
1146 bSig = ( bSig | 0x00800000 )<<8;
1147 if ( expDiff < 0 ) {
1148 if ( expDiff < -1 ) return a;
1149 aSig >>= 1;
1151 q = ( bSig <= aSig );
1152 if ( q ) aSig -= bSig;
1153 expDiff -= 32;
1154 while ( 0 < expDiff ) {
1155 q = estimateDiv64To32( aSig, 0, bSig );
1156 q = ( 2 < q ) ? q - 2 : 0;
1157 aSig = - ( ( bSig>>2 ) * q );
1158 expDiff -= 30;
1160 expDiff += 32;
1161 if ( 0 < expDiff ) {
1162 q = estimateDiv64To32( aSig, 0, bSig );
1163 q = ( 2 < q ) ? q - 2 : 0;
1164 q >>= 32 - expDiff;
1165 bSig >>= 2;
1166 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1168 else {
1169 aSig >>= 2;
1170 bSig >>= 2;
1172 do {
1173 alternateASig = aSig;
1174 ++q;
1175 aSig -= bSig;
1176 } while ( 0 <= (sbits32) aSig );
1177 sigMean = aSig + alternateASig;
1178 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
1179 aSig = alternateASig;
1181 zSign = ( (sbits32) aSig < 0 );
1182 if ( zSign ) aSig = - aSig;
1183 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
1186 #endif
1188 #ifndef SOFTFLOAT_FOR_GCC
1190 -------------------------------------------------------------------------------
1191 Returns the square root of the single-precision floating-point value `a'.
1192 The operation is performed according to the IEC/IEEE Standard for Binary
1193 Floating-Point Arithmetic.
1194 -------------------------------------------------------------------------------
1196 float32 float32_sqrt( float32 a )
1198 flag aSign;
1199 int16 aExp, zExp;
1200 bits32 aSig, zSig, rem0, rem1, term0, term1;
1202 aSig = extractFloat32Frac( a );
1203 aExp = extractFloat32Exp( a );
1204 aSign = extractFloat32Sign( a );
1205 if ( aExp == 0xFF ) {
1206 if ( aSig ) return propagateFloat32NaN( a, 0 );
1207 if ( ! aSign ) return a;
1208 float_raise( float_flag_invalid );
1209 return float32_default_nan;
1211 if ( aSign ) {
1212 if ( ( aExp | aSig ) == 0 ) return a;
1213 float_raise( float_flag_invalid );
1214 return float32_default_nan;
1216 if ( aExp == 0 ) {
1217 if ( aSig == 0 ) return 0;
1218 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1220 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
1221 aSig = ( aSig | 0x00800000 )<<8;
1222 zSig = estimateSqrt32( aExp, aSig ) + 2;
1223 if ( ( zSig & 0x7F ) <= 5 ) {
1224 if ( zSig < 2 ) {
1225 zSig = 0x7FFFFFFF;
1226 goto roundAndPack;
1228 else {
1229 aSig >>= aExp & 1;
1230 mul32To64( zSig, zSig, &term0, &term1 );
1231 sub64( aSig, 0, term0, term1, &rem0, &rem1 );
1232 while ( (sbits32) rem0 < 0 ) {
1233 --zSig;
1234 shortShift64Left( 0, zSig, 1, &term0, &term1 );
1235 term1 |= 1;
1236 add64( rem0, rem1, term0, term1, &rem0, &rem1 );
1238 zSig |= ( ( rem0 | rem1 ) != 0 );
1241 shift32RightJamming( zSig, 1, &zSig );
1242 roundAndPack:
1243 return roundAndPackFloat32( 0, zExp, zSig );
1246 #endif
1249 -------------------------------------------------------------------------------
1250 Returns 1 if the single-precision floating-point value `a' is equal to
1251 the corresponding value `b', and 0 otherwise. The comparison is performed
1252 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1253 -------------------------------------------------------------------------------
1255 flag float32_eq( float32 a, float32 b )
1258 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1259 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1261 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1262 float_raise( float_flag_invalid );
1264 return 0;
1266 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1271 -------------------------------------------------------------------------------
1272 Returns 1 if the single-precision floating-point value `a' is less than
1273 or equal to the corresponding value `b', and 0 otherwise. The comparison
1274 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1275 Arithmetic.
1276 -------------------------------------------------------------------------------
1278 flag float32_le( float32 a, float32 b )
1280 flag aSign, bSign;
1282 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1283 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1285 float_raise( float_flag_invalid );
1286 return 0;
1288 aSign = extractFloat32Sign( a );
1289 bSign = extractFloat32Sign( b );
1290 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1291 return ( a == b ) || ( aSign ^ ( a < b ) );
1296 -------------------------------------------------------------------------------
1297 Returns 1 if the single-precision floating-point value `a' is less than
1298 the corresponding value `b', and 0 otherwise. The comparison is performed
1299 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1300 -------------------------------------------------------------------------------
1302 flag float32_lt( float32 a, float32 b )
1304 flag aSign, bSign;
1306 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1307 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1309 float_raise( float_flag_invalid );
1310 return 0;
1312 aSign = extractFloat32Sign( a );
1313 bSign = extractFloat32Sign( b );
1314 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1315 return ( a != b ) && ( aSign ^ ( a < b ) );
1319 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1321 -------------------------------------------------------------------------------
1322 Returns 1 if the single-precision floating-point value `a' is equal to
1323 the corresponding value `b', and 0 otherwise. The invalid exception is
1324 raised if either operand is a NaN. Otherwise, the comparison is performed
1325 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1326 -------------------------------------------------------------------------------
1328 flag float32_eq_signaling( float32 a, float32 b )
1331 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1332 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1334 float_raise( float_flag_invalid );
1335 return 0;
1337 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1342 -------------------------------------------------------------------------------
1343 Returns 1 if the single-precision floating-point value `a' is less than or
1344 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
1345 cause an exception. Otherwise, the comparison is performed according to the
1346 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1347 -------------------------------------------------------------------------------
1349 flag float32_le_quiet( float32 a, float32 b )
1351 flag aSign, bSign;
1352 int16 aExp, bExp;
1354 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1355 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1357 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1358 float_raise( float_flag_invalid );
1360 return 0;
1362 aSign = extractFloat32Sign( a );
1363 bSign = extractFloat32Sign( b );
1364 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1365 return ( a == b ) || ( aSign ^ ( a < b ) );
1370 -------------------------------------------------------------------------------
1371 Returns 1 if the single-precision floating-point value `a' is less than
1372 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
1373 exception. Otherwise, the comparison is performed according to the IEC/IEEE
1374 Standard for Binary Floating-Point Arithmetic.
1375 -------------------------------------------------------------------------------
1377 flag float32_lt_quiet( float32 a, float32 b )
1379 flag aSign, bSign;
1381 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1382 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1384 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1385 float_raise( float_flag_invalid );
1387 return 0;
1389 aSign = extractFloat32Sign( a );
1390 bSign = extractFloat32Sign( b );
1391 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1392 return ( a != b ) && ( aSign ^ ( a < b ) );
1395 #endif /* !SOFTFLOAT_FOR_GCC */
1397 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1399 -------------------------------------------------------------------------------
1400 Returns the result of converting the double-precision floating-point value
1401 `a' to the 32-bit two's complement integer format. The conversion is
1402 performed according to the IEC/IEEE Standard for Binary Floating-Point
1403 Arithmetic---which means in particular that the conversion is rounded
1404 according to the current rounding mode. If `a' is a NaN, the largest
1405 positive integer is returned. Otherwise, if the conversion overflows, the
1406 largest integer with the same sign as `a' is returned.
1407 -------------------------------------------------------------------------------
1409 int32 float64_to_int32( float64 a )
1411 flag aSign;
1412 int16 aExp, shiftCount;
1413 bits32 aSig0, aSig1, absZ, aSigExtra;
1414 int32 z;
1415 int8 roundingMode;
1417 aSig1 = extractFloat64Frac1( a );
1418 aSig0 = extractFloat64Frac0( a );
1419 aExp = extractFloat64Exp( a );
1420 aSign = extractFloat64Sign( a );
1421 shiftCount = aExp - 0x413;
1422 if ( 0 <= shiftCount ) {
1423 if ( 0x41E < aExp ) {
1424 if ( ( aExp == 0x7FF ) && ( aSig0 | aSig1 ) ) aSign = 0;
1425 goto invalid;
1427 shortShift64Left(
1428 aSig0 | 0x00100000, aSig1, shiftCount, &absZ, &aSigExtra );
1429 if ( 0x80000000 < absZ ) goto invalid;
1431 else {
1432 aSig1 = ( aSig1 != 0 );
1433 if ( aExp < 0x3FE ) {
1434 aSigExtra = aExp | aSig0 | aSig1;
1435 absZ = 0;
1437 else {
1438 aSig0 |= 0x00100000;
1439 aSigExtra = ( aSig0<<( shiftCount & 31 ) ) | aSig1;
1440 absZ = aSig0>>( - shiftCount );
1443 roundingMode = float_rounding_mode;
1444 if ( roundingMode == float_round_nearest_even ) {
1445 if ( (sbits32) aSigExtra < 0 ) {
1446 ++absZ;
1447 if ( (bits32) ( aSigExtra<<1 ) == 0 ) absZ &= ~1;
1449 z = aSign ? - absZ : absZ;
1451 else {
1452 aSigExtra = ( aSigExtra != 0 );
1453 if ( aSign ) {
1454 z = - ( absZ
1455 + ( ( roundingMode == float_round_down ) & aSigExtra ) );
1457 else {
1458 z = absZ + ( ( roundingMode == float_round_up ) & aSigExtra );
1461 if ( ( aSign ^ ( z < 0 ) ) && z ) {
1462 invalid:
1463 float_raise( float_flag_invalid );
1464 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
1466 if ( aSigExtra ) float_exception_flags |= float_flag_inexact;
1467 return z;
1470 #endif /* !SOFTFLOAT_FOR_GCC */
1473 -------------------------------------------------------------------------------
1474 Returns the result of converting the double-precision floating-point value
1475 `a' to the 32-bit two's complement integer format. The conversion is
1476 performed according to the IEC/IEEE Standard for Binary Floating-Point
1477 Arithmetic, except that the conversion is always rounded toward zero.
1478 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1479 the conversion overflows, the largest integer with the same sign as `a' is
1480 returned.
1481 -------------------------------------------------------------------------------
1483 int32 float64_to_int32_round_to_zero( float64 a )
1485 flag aSign;
1486 int16 aExp, shiftCount;
1487 bits32 aSig0, aSig1, absZ, aSigExtra;
1488 int32 z;
1490 aSig1 = extractFloat64Frac1( a );
1491 aSig0 = extractFloat64Frac0( a );
1492 aExp = extractFloat64Exp( a );
1493 aSign = extractFloat64Sign( a );
1494 shiftCount = aExp - 0x413;
1495 if ( 0 <= shiftCount ) {
1496 if ( 0x41E < aExp ) {
1497 if ( ( aExp == 0x7FF ) && ( aSig0 | aSig1 ) ) aSign = 0;
1498 goto invalid;
1500 shortShift64Left(
1501 aSig0 | 0x00100000, aSig1, shiftCount, &absZ, &aSigExtra );
1503 else {
1504 if ( aExp < 0x3FF ) {
1505 if ( aExp | aSig0 | aSig1 ) {
1506 float_exception_flags |= float_flag_inexact;
1508 return 0;
1510 aSig0 |= 0x00100000;
1511 aSigExtra = ( aSig0<<( shiftCount & 31 ) ) | aSig1;
1512 absZ = aSig0>>( - shiftCount );
1514 z = aSign ? - absZ : absZ;
1515 if ( ( aSign ^ ( z < 0 ) ) && z ) {
1516 invalid:
1517 float_raise( float_flag_invalid );
1518 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
1520 if ( aSigExtra ) float_exception_flags |= float_flag_inexact;
1521 return z;
1526 -------------------------------------------------------------------------------
1527 Returns the result of converting the double-precision floating-point value
1528 `a' to the single-precision floating-point format. The conversion is
1529 performed according to the IEC/IEEE Standard for Binary Floating-Point
1530 Arithmetic.
1531 -------------------------------------------------------------------------------
1533 float32 float64_to_float32( float64 a )
1535 flag aSign;
1536 int16 aExp;
1537 bits32 aSig0, aSig1, zSig;
1538 bits32 allZero;
1540 aSig1 = extractFloat64Frac1( a );
1541 aSig0 = extractFloat64Frac0( a );
1542 aExp = extractFloat64Exp( a );
1543 aSign = extractFloat64Sign( a );
1544 if ( aExp == 0x7FF ) {
1545 if ( aSig0 | aSig1 ) {
1546 return commonNaNToFloat32( float64ToCommonNaN( a ) );
1548 return packFloat32( aSign, 0xFF, 0 );
1550 shift64RightJamming( aSig0, aSig1, 22, &allZero, &zSig );
1551 if ( aExp ) zSig |= 0x40000000;
1552 return roundAndPackFloat32( aSign, aExp - 0x381, zSig );
1556 #ifndef SOFTFLOAT_FOR_GCC
1558 -------------------------------------------------------------------------------
1559 Rounds the double-precision floating-point value `a' to an integer,
1560 and returns the result as a double-precision floating-point value. The
1561 operation is performed according to the IEC/IEEE Standard for Binary
1562 Floating-Point Arithmetic.
1563 -------------------------------------------------------------------------------
1565 float64 float64_round_to_int( float64 a )
1567 flag aSign;
1568 int16 aExp;
1569 bits32 lastBitMask, roundBitsMask;
1570 int8 roundingMode;
1571 float64 z;
1573 aExp = extractFloat64Exp( a );
1574 if ( 0x413 <= aExp ) {
1575 if ( 0x433 <= aExp ) {
1576 if ( ( aExp == 0x7FF )
1577 && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) ) {
1578 return propagateFloat64NaN( a, a );
1580 return a;
1582 lastBitMask = 1;
1583 lastBitMask = ( lastBitMask<<( 0x432 - aExp ) )<<1;
1584 roundBitsMask = lastBitMask - 1;
1585 z = a;
1586 roundingMode = float_rounding_mode;
1587 if ( roundingMode == float_round_nearest_even ) {
1588 if ( lastBitMask ) {
1589 add64( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
1590 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
1592 else {
1593 if ( (sbits32) z.low < 0 ) {
1594 ++z.high;
1595 if ( (bits32) ( z.low<<1 ) == 0 ) z.high &= ~1;
1599 else if ( roundingMode != float_round_to_zero ) {
1600 if ( extractFloat64Sign( z )
1601 ^ ( roundingMode == float_round_up ) ) {
1602 add64( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
1605 z.low &= ~ roundBitsMask;
1607 else {
1608 if ( aExp <= 0x3FE ) {
1609 if ( ( ( (bits32) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
1610 float_exception_flags |= float_flag_inexact;
1611 aSign = extractFloat64Sign( a );
1612 switch ( float_rounding_mode ) {
1613 case float_round_nearest_even:
1614 if ( ( aExp == 0x3FE )
1615 && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) )
1617 return packFloat64( aSign, 0x3FF, 0, 0 );
1619 break;
1620 case float_round_down:
1621 return
1622 aSign ? packFloat64( 1, 0x3FF, 0, 0 )
1623 : packFloat64( 0, 0, 0, 0 );
1624 case float_round_up:
1625 return
1626 aSign ? packFloat64( 1, 0, 0, 0 )
1627 : packFloat64( 0, 0x3FF, 0, 0 );
1629 return packFloat64( aSign, 0, 0, 0 );
1631 lastBitMask = 1;
1632 lastBitMask <<= 0x413 - aExp;
1633 roundBitsMask = lastBitMask - 1;
1634 z.low = 0;
1635 z.high = a.high;
1636 roundingMode = float_rounding_mode;
1637 if ( roundingMode == float_round_nearest_even ) {
1638 z.high += lastBitMask>>1;
1639 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
1640 z.high &= ~ lastBitMask;
1643 else if ( roundingMode != float_round_to_zero ) {
1644 if ( extractFloat64Sign( z )
1645 ^ ( roundingMode == float_round_up ) ) {
1646 z.high |= ( a.low != 0 );
1647 z.high += roundBitsMask;
1650 z.high &= ~ roundBitsMask;
1652 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
1653 float_exception_flags |= float_flag_inexact;
1655 return z;
1658 #endif
1661 -------------------------------------------------------------------------------
1662 Returns the result of adding the absolute values of the double-precision
1663 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1664 before being returned. `zSign' is ignored if the result is a NaN.
1665 The addition is performed according to the IEC/IEEE Standard for Binary
1666 Floating-Point Arithmetic.
1667 -------------------------------------------------------------------------------
1669 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
1671 int16 aExp, bExp, zExp;
1672 bits32 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
1673 int16 expDiff;
1675 aSig1 = extractFloat64Frac1( a );
1676 aSig0 = extractFloat64Frac0( a );
1677 aExp = extractFloat64Exp( a );
1678 bSig1 = extractFloat64Frac1( b );
1679 bSig0 = extractFloat64Frac0( b );
1680 bExp = extractFloat64Exp( b );
1681 expDiff = aExp - bExp;
1682 if ( 0 < expDiff ) {
1683 if ( aExp == 0x7FF ) {
1684 if ( aSig0 | aSig1 ) return propagateFloat64NaN( a, b );
1685 return a;
1687 if ( bExp == 0 ) {
1688 --expDiff;
1690 else {
1691 bSig0 |= 0x00100000;
1693 shift64ExtraRightJamming(
1694 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
1695 zExp = aExp;
1697 else if ( expDiff < 0 ) {
1698 if ( bExp == 0x7FF ) {
1699 if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b );
1700 return packFloat64( zSign, 0x7FF, 0, 0 );
1702 if ( aExp == 0 ) {
1703 ++expDiff;
1705 else {
1706 aSig0 |= 0x00100000;
1708 shift64ExtraRightJamming(
1709 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
1710 zExp = bExp;
1712 else {
1713 if ( aExp == 0x7FF ) {
1714 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
1715 return propagateFloat64NaN( a, b );
1717 return a;
1719 add64( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
1720 if ( aExp == 0 ) return packFloat64( zSign, 0, zSig0, zSig1 );
1721 zSig2 = 0;
1722 zSig0 |= 0x00200000;
1723 zExp = aExp;
1724 goto shiftRight1;
1726 aSig0 |= 0x00100000;
1727 add64( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
1728 --zExp;
1729 if ( zSig0 < 0x00200000 ) goto roundAndPack;
1730 ++zExp;
1731 shiftRight1:
1732 shift64ExtraRightJamming( zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
1733 roundAndPack:
1734 return roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2 );
1739 -------------------------------------------------------------------------------
1740 Returns the result of subtracting the absolute values of the double-
1741 precision floating-point values `a' and `b'. If `zSign' is 1, the
1742 difference is negated before being returned. `zSign' is ignored if the
1743 result is a NaN. The subtraction is performed according to the IEC/IEEE
1744 Standard for Binary Floating-Point Arithmetic.
1745 -------------------------------------------------------------------------------
1747 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
1749 int16 aExp, bExp, zExp;
1750 bits32 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
1751 int16 expDiff;
1753 aSig1 = extractFloat64Frac1( a );
1754 aSig0 = extractFloat64Frac0( a );
1755 aExp = extractFloat64Exp( a );
1756 bSig1 = extractFloat64Frac1( b );
1757 bSig0 = extractFloat64Frac0( b );
1758 bExp = extractFloat64Exp( b );
1759 expDiff = aExp - bExp;
1760 shortShift64Left( aSig0, aSig1, 10, &aSig0, &aSig1 );
1761 shortShift64Left( bSig0, bSig1, 10, &bSig0, &bSig1 );
1762 if ( 0 < expDiff ) goto aExpBigger;
1763 if ( expDiff < 0 ) goto bExpBigger;
1764 if ( aExp == 0x7FF ) {
1765 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
1766 return propagateFloat64NaN( a, b );
1768 float_raise( float_flag_invalid );
1769 return float64_default_nan;
1771 if ( aExp == 0 ) {
1772 aExp = 1;
1773 bExp = 1;
1775 if ( bSig0 < aSig0 ) goto aBigger;
1776 if ( aSig0 < bSig0 ) goto bBigger;
1777 if ( bSig1 < aSig1 ) goto aBigger;
1778 if ( aSig1 < bSig1 ) goto bBigger;
1779 return packFloat64( float_rounding_mode == float_round_down, 0, 0, 0 );
1780 bExpBigger:
1781 if ( bExp == 0x7FF ) {
1782 if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b );
1783 return packFloat64( zSign ^ 1, 0x7FF, 0, 0 );
1785 if ( aExp == 0 ) {
1786 ++expDiff;
1788 else {
1789 aSig0 |= 0x40000000;
1791 shift64RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
1792 bSig0 |= 0x40000000;
1793 bBigger:
1794 sub64( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
1795 zExp = bExp;
1796 zSign ^= 1;
1797 goto normalizeRoundAndPack;
1798 aExpBigger:
1799 if ( aExp == 0x7FF ) {
1800 if ( aSig0 | aSig1 ) return propagateFloat64NaN( a, b );
1801 return a;
1803 if ( bExp == 0 ) {
1804 --expDiff;
1806 else {
1807 bSig0 |= 0x40000000;
1809 shift64RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
1810 aSig0 |= 0x40000000;
1811 aBigger:
1812 sub64( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
1813 zExp = aExp;
1814 normalizeRoundAndPack:
1815 --zExp;
1816 return normalizeRoundAndPackFloat64( zSign, zExp - 10, zSig0, zSig1 );
1821 -------------------------------------------------------------------------------
1822 Returns the result of adding the double-precision floating-point values `a'
1823 and `b'. The operation is performed according to the IEC/IEEE Standard for
1824 Binary Floating-Point Arithmetic.
1825 -------------------------------------------------------------------------------
1827 float64 float64_add( float64 a, float64 b )
1829 flag aSign, bSign;
1831 aSign = extractFloat64Sign( a );
1832 bSign = extractFloat64Sign( b );
1833 if ( aSign == bSign ) {
1834 return addFloat64Sigs( a, b, aSign );
1836 else {
1837 return subFloat64Sigs( a, b, aSign );
1843 -------------------------------------------------------------------------------
1844 Returns the result of subtracting the double-precision floating-point values
1845 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1846 for Binary Floating-Point Arithmetic.
1847 -------------------------------------------------------------------------------
1849 float64 float64_sub( float64 a, float64 b )
1851 flag aSign, bSign;
1853 aSign = extractFloat64Sign( a );
1854 bSign = extractFloat64Sign( b );
1855 if ( aSign == bSign ) {
1856 return subFloat64Sigs( a, b, aSign );
1858 else {
1859 return addFloat64Sigs( a, b, aSign );
1865 -------------------------------------------------------------------------------
1866 Returns the result of multiplying the double-precision floating-point values
1867 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1868 for Binary Floating-Point Arithmetic.
1869 -------------------------------------------------------------------------------
1871 float64 float64_mul( float64 a, float64 b )
1873 flag aSign, bSign, zSign;
1874 int16 aExp, bExp, zExp;
1875 bits32 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
1877 aSig1 = extractFloat64Frac1( a );
1878 aSig0 = extractFloat64Frac0( a );
1879 aExp = extractFloat64Exp( a );
1880 aSign = extractFloat64Sign( a );
1881 bSig1 = extractFloat64Frac1( b );
1882 bSig0 = extractFloat64Frac0( b );
1883 bExp = extractFloat64Exp( b );
1884 bSign = extractFloat64Sign( b );
1885 zSign = aSign ^ bSign;
1886 if ( aExp == 0x7FF ) {
1887 if ( ( aSig0 | aSig1 )
1888 || ( ( bExp == 0x7FF ) && ( bSig0 | bSig1 ) ) ) {
1889 return propagateFloat64NaN( a, b );
1891 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
1892 return packFloat64( zSign, 0x7FF, 0, 0 );
1894 if ( bExp == 0x7FF ) {
1895 if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b );
1896 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
1897 invalid:
1898 float_raise( float_flag_invalid );
1899 return float64_default_nan;
1901 return packFloat64( zSign, 0x7FF, 0, 0 );
1903 if ( aExp == 0 ) {
1904 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat64( zSign, 0, 0, 0 );
1905 normalizeFloat64Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
1907 if ( bExp == 0 ) {
1908 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat64( zSign, 0, 0, 0 );
1909 normalizeFloat64Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
1911 zExp = aExp + bExp - 0x400;
1912 aSig0 |= 0x00100000;
1913 shortShift64Left( bSig0, bSig1, 12, &bSig0, &bSig1 );
1914 mul64To128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
1915 add64( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
1916 zSig2 |= ( zSig3 != 0 );
1917 if ( 0x00200000 <= zSig0 ) {
1918 shift64ExtraRightJamming(
1919 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
1920 ++zExp;
1922 return roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2 );
1927 -------------------------------------------------------------------------------
1928 Returns the result of dividing the double-precision floating-point value `a'
1929 by the corresponding value `b'. The operation is performed according to the
1930 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1931 -------------------------------------------------------------------------------
1933 float64 float64_div( float64 a, float64 b )
1935 flag aSign, bSign, zSign;
1936 int16 aExp, bExp, zExp;
1937 bits32 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
1938 bits32 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
1940 aSig1 = extractFloat64Frac1( a );
1941 aSig0 = extractFloat64Frac0( a );
1942 aExp = extractFloat64Exp( a );
1943 aSign = extractFloat64Sign( a );
1944 bSig1 = extractFloat64Frac1( b );
1945 bSig0 = extractFloat64Frac0( b );
1946 bExp = extractFloat64Exp( b );
1947 bSign = extractFloat64Sign( b );
1948 zSign = aSign ^ bSign;
1949 if ( aExp == 0x7FF ) {
1950 if ( aSig0 | aSig1 ) return propagateFloat64NaN( a, b );
1951 if ( bExp == 0x7FF ) {
1952 if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b );
1953 goto invalid;
1955 return packFloat64( zSign, 0x7FF, 0, 0 );
1957 if ( bExp == 0x7FF ) {
1958 if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b );
1959 return packFloat64( zSign, 0, 0, 0 );
1961 if ( bExp == 0 ) {
1962 if ( ( bSig0 | bSig1 ) == 0 ) {
1963 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
1964 invalid:
1965 float_raise( float_flag_invalid );
1966 return float64_default_nan;
1968 float_raise( float_flag_divbyzero );
1969 return packFloat64( zSign, 0x7FF, 0, 0 );
1971 normalizeFloat64Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
1973 if ( aExp == 0 ) {
1974 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat64( zSign, 0, 0, 0 );
1975 normalizeFloat64Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
1977 zExp = aExp - bExp + 0x3FD;
1978 shortShift64Left( aSig0 | 0x00100000, aSig1, 11, &aSig0, &aSig1 );
1979 shortShift64Left( bSig0 | 0x00100000, bSig1, 11, &bSig0, &bSig1 );
1980 if ( le64( bSig0, bSig1, aSig0, aSig1 ) ) {
1981 shift64Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
1982 ++zExp;
1984 zSig0 = estimateDiv64To32( aSig0, aSig1, bSig0 );
1985 mul64By32To96( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
1986 sub96( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
1987 while ( (sbits32) rem0 < 0 ) {
1988 --zSig0;
1989 add96( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
1991 zSig1 = estimateDiv64To32( rem1, rem2, bSig0 );
1992 if ( ( zSig1 & 0x3FF ) <= 4 ) {
1993 mul64By32To96( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
1994 sub96( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
1995 while ( (sbits32) rem1 < 0 ) {
1996 --zSig1;
1997 add96( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
1999 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
2001 shift64ExtraRightJamming( zSig0, zSig1, 0, 11, &zSig0, &zSig1, &zSig2 );
2002 return roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2 );
2006 #ifndef SOFTFLOAT_FOR_GCC
2008 -------------------------------------------------------------------------------
2009 Returns the remainder of the double-precision floating-point value `a'
2010 with respect to the corresponding value `b'. The operation is performed
2011 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2012 -------------------------------------------------------------------------------
2014 float64 float64_rem( float64 a, float64 b )
2016 flag aSign, bSign, zSign;
2017 int16 aExp, bExp, expDiff;
2018 bits32 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
2019 bits32 allZero, alternateASig0, alternateASig1, sigMean1;
2020 sbits32 sigMean0;
2021 float64 z;
2023 aSig1 = extractFloat64Frac1( a );
2024 aSig0 = extractFloat64Frac0( a );
2025 aExp = extractFloat64Exp( a );
2026 aSign = extractFloat64Sign( a );
2027 bSig1 = extractFloat64Frac1( b );
2028 bSig0 = extractFloat64Frac0( b );
2029 bExp = extractFloat64Exp( b );
2030 bSign = extractFloat64Sign( b );
2031 if ( aExp == 0x7FF ) {
2032 if ( ( aSig0 | aSig1 )
2033 || ( ( bExp == 0x7FF ) && ( bSig0 | bSig1 ) ) ) {
2034 return propagateFloat64NaN( a, b );
2036 goto invalid;
2038 if ( bExp == 0x7FF ) {
2039 if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b );
2040 return a;
2042 if ( bExp == 0 ) {
2043 if ( ( bSig0 | bSig1 ) == 0 ) {
2044 invalid:
2045 float_raise( float_flag_invalid );
2046 return float64_default_nan;
2048 normalizeFloat64Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
2050 if ( aExp == 0 ) {
2051 if ( ( aSig0 | aSig1 ) == 0 ) return a;
2052 normalizeFloat64Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
2054 expDiff = aExp - bExp;
2055 if ( expDiff < -1 ) return a;
2056 shortShift64Left(
2057 aSig0 | 0x00100000, aSig1, 11 - ( expDiff < 0 ), &aSig0, &aSig1 );
2058 shortShift64Left( bSig0 | 0x00100000, bSig1, 11, &bSig0, &bSig1 );
2059 q = le64( bSig0, bSig1, aSig0, aSig1 );
2060 if ( q ) sub64( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
2061 expDiff -= 32;
2062 while ( 0 < expDiff ) {
2063 q = estimateDiv64To32( aSig0, aSig1, bSig0 );
2064 q = ( 4 < q ) ? q - 4 : 0;
2065 mul64By32To96( bSig0, bSig1, q, &term0, &term1, &term2 );
2066 shortShift96Left( term0, term1, term2, 29, &term1, &term2, &allZero );
2067 shortShift64Left( aSig0, aSig1, 29, &aSig0, &allZero );
2068 sub64( aSig0, 0, term1, term2, &aSig0, &aSig1 );
2069 expDiff -= 29;
2071 if ( -32 < expDiff ) {
2072 q = estimateDiv64To32( aSig0, aSig1, bSig0 );
2073 q = ( 4 < q ) ? q - 4 : 0;
2074 q >>= - expDiff;
2075 shift64Right( bSig0, bSig1, 8, &bSig0, &bSig1 );
2076 expDiff += 24;
2077 if ( expDiff < 0 ) {
2078 shift64Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
2080 else {
2081 shortShift64Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
2083 mul64By32To96( bSig0, bSig1, q, &term0, &term1, &term2 );
2084 sub64( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
2086 else {
2087 shift64Right( aSig0, aSig1, 8, &aSig0, &aSig1 );
2088 shift64Right( bSig0, bSig1, 8, &bSig0, &bSig1 );
2090 do {
2091 alternateASig0 = aSig0;
2092 alternateASig1 = aSig1;
2093 ++q;
2094 sub64( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
2095 } while ( 0 <= (sbits32) aSig0 );
2096 add64(
2097 aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 );
2098 if ( ( sigMean0 < 0 )
2099 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
2100 aSig0 = alternateASig0;
2101 aSig1 = alternateASig1;
2103 zSign = ( (sbits32) aSig0 < 0 );
2104 if ( zSign ) sub64( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
2105 return
2106 normalizeRoundAndPackFloat64( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
2109 #endif
2111 #ifndef SOFTFLOAT_FOR_GCC
2113 -------------------------------------------------------------------------------
2114 Returns the square root of the double-precision floating-point value `a'.
2115 The operation is performed according to the IEC/IEEE Standard for Binary
2116 Floating-Point Arithmetic.
2117 -------------------------------------------------------------------------------
2119 float64 float64_sqrt( float64 a )
2121 flag aSign;
2122 int16 aExp, zExp;
2123 bits32 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
2124 bits32 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
2125 float64 z;
2127 aSig1 = extractFloat64Frac1( a );
2128 aSig0 = extractFloat64Frac0( a );
2129 aExp = extractFloat64Exp( a );
2130 aSign = extractFloat64Sign( a );
2131 if ( aExp == 0x7FF ) {
2132 if ( aSig0 | aSig1 ) return propagateFloat64NaN( a, a );
2133 if ( ! aSign ) return a;
2134 goto invalid;
2136 if ( aSign ) {
2137 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
2138 invalid:
2139 float_raise( float_flag_invalid );
2140 return float64_default_nan;
2142 if ( aExp == 0 ) {
2143 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat64( 0, 0, 0, 0 );
2144 normalizeFloat64Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
2146 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
2147 aSig0 |= 0x00100000;
2148 shortShift64Left( aSig0, aSig1, 11, &term0, &term1 );
2149 zSig0 = ( estimateSqrt32( aExp, term0 )>>1 ) + 1;
2150 if ( zSig0 == 0 ) zSig0 = 0x7FFFFFFF;
2151 doubleZSig0 = zSig0 + zSig0;
2152 shortShift64Left( aSig0, aSig1, 9 - ( aExp & 1 ), &aSig0, &aSig1 );
2153 mul32To64( zSig0, zSig0, &term0, &term1 );
2154 sub64( aSig0, aSig1, term0, term1, &rem0, &rem1 );
2155 while ( (sbits32) rem0 < 0 ) {
2156 --zSig0;
2157 doubleZSig0 -= 2;
2158 add64( rem0, rem1, 0, doubleZSig0 | 1, &rem0, &rem1 );
2160 zSig1 = estimateDiv64To32( rem1, 0, doubleZSig0 );
2161 if ( ( zSig1 & 0x1FF ) <= 5 ) {
2162 if ( zSig1 == 0 ) zSig1 = 1;
2163 mul32To64( doubleZSig0, zSig1, &term1, &term2 );
2164 sub64( rem1, 0, term1, term2, &rem1, &rem2 );
2165 mul32To64( zSig1, zSig1, &term2, &term3 );
2166 sub96( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
2167 while ( (sbits32) rem1 < 0 ) {
2168 --zSig1;
2169 shortShift64Left( 0, zSig1, 1, &term2, &term3 );
2170 term3 |= 1;
2171 term2 |= doubleZSig0;
2172 add96( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
2174 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
2176 shift64ExtraRightJamming( zSig0, zSig1, 0, 10, &zSig0, &zSig1, &zSig2 );
2177 return roundAndPackFloat64( 0, zExp, zSig0, zSig1, zSig2 );
2180 #endif
2183 -------------------------------------------------------------------------------
2184 Returns 1 if the double-precision floating-point value `a' is equal to
2185 the corresponding value `b', and 0 otherwise. The comparison is performed
2186 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2187 -------------------------------------------------------------------------------
2189 flag float64_eq( float64 a, float64 b )
2192 if ( ( ( extractFloat64Exp( a ) == 0x7FF )
2193 && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) )
2194 || ( ( extractFloat64Exp( b ) == 0x7FF )
2195 && ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) )
2197 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2198 float_raise( float_flag_invalid );
2200 return 0;
2202 return ( a == b ) ||
2203 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 );
2208 -------------------------------------------------------------------------------
2209 Returns 1 if the double-precision floating-point value `a' is less than
2210 or equal to the corresponding value `b', and 0 otherwise. The comparison
2211 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2212 Arithmetic.
2213 -------------------------------------------------------------------------------
2215 flag float64_le( float64 a, float64 b )
2217 flag aSign, bSign;
2219 if ( ( ( extractFloat64Exp( a ) == 0x7FF )
2220 && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) )
2221 || ( ( extractFloat64Exp( b ) == 0x7FF )
2222 && ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) )
2224 float_raise( float_flag_invalid );
2225 return 0;
2227 aSign = extractFloat64Sign( a );
2228 bSign = extractFloat64Sign( b );
2229 if ( aSign != bSign )
2230 return aSign ||
2231 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) ==
2232 0 );
2233 return ( a == b ) ||
2234 ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
2238 -------------------------------------------------------------------------------
2239 Returns 1 if the double-precision floating-point value `a' is less than
2240 the corresponding value `b', and 0 otherwise. The comparison is performed
2241 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2242 -------------------------------------------------------------------------------
2244 flag float64_lt( float64 a, float64 b )
2246 flag aSign, bSign;
2248 if ( ( ( extractFloat64Exp( a ) == 0x7FF )
2249 && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) )
2250 || ( ( extractFloat64Exp( b ) == 0x7FF )
2251 && ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) )
2253 float_raise( float_flag_invalid );
2254 return 0;
2256 aSign = extractFloat64Sign( a );
2257 bSign = extractFloat64Sign( b );
2258 if ( aSign != bSign )
2259 return aSign &&
2260 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) !=
2261 0 );
2262 return ( a != b ) &&
2263 ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
2267 #ifndef SOFTFLOAT_FOR_GCC
2269 -------------------------------------------------------------------------------
2270 Returns 1 if the double-precision floating-point value `a' is equal to
2271 the corresponding value `b', and 0 otherwise. The invalid exception is
2272 raised if either operand is a NaN. Otherwise, the comparison is performed
2273 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2274 -------------------------------------------------------------------------------
2276 flag float64_eq_signaling( float64 a, float64 b )
2279 if ( ( ( extractFloat64Exp( a ) == 0x7FF )
2280 && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) )
2281 || ( ( extractFloat64Exp( b ) == 0x7FF )
2282 && ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) )
2284 float_raise( float_flag_invalid );
2285 return 0;
2287 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2292 -------------------------------------------------------------------------------
2293 Returns 1 if the double-precision floating-point value `a' is less than or
2294 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2295 cause an exception. Otherwise, the comparison is performed according to the
2296 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2297 -------------------------------------------------------------------------------
2299 flag float64_le_quiet( float64 a, float64 b )
2301 flag aSign, bSign;
2303 if ( ( ( extractFloat64Exp( a ) == 0x7FF )
2304 && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) )
2305 || ( ( extractFloat64Exp( b ) == 0x7FF )
2306 && ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) )
2308 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2309 float_raise( float_flag_invalid );
2311 return 0;
2313 aSign = extractFloat64Sign( a );
2314 bSign = extractFloat64Sign( b );
2315 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2316 return ( a == b ) || ( aSign ^ ( a < b ) );
2321 -------------------------------------------------------------------------------
2322 Returns 1 if the double-precision floating-point value `a' is less than
2323 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2324 exception. Otherwise, the comparison is performed according to the IEC/IEEE
2325 Standard for Binary Floating-Point Arithmetic.
2326 -------------------------------------------------------------------------------
2328 flag float64_lt_quiet( float64 a, float64 b )
2330 flag aSign, bSign;
2332 if ( ( ( extractFloat64Exp( a ) == 0x7FF )
2333 && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) )
2334 || ( ( extractFloat64Exp( b ) == 0x7FF )
2335 && ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) )
2337 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2338 float_raise( float_flag_invalid );
2340 return 0;
2342 aSign = extractFloat64Sign( a );
2343 bSign = extractFloat64Sign( b );
2344 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
2345 return ( a != b ) && ( aSign ^ ( a < b ) );
2349 #endif