2 /*============================================================================
4 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
7 Written by John R. Hauser. This work was made possible in part by the
8 International Computer Science Institute, located at Suite 600, 1947 Center
9 Street, Berkeley, California 94704. Funding was partially provided by the
10 National Science Foundation under grant MIP-9311980. The original version
11 of this code was written as part of a project to build a fixed-point vector
12 processor in collaboration with the University of California at Berkeley,
13 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
14 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
15 arithmetic/SoftFloat.html'.
17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
18 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
19 RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
20 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
21 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
22 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
23 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
24 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
26 Derivative works are acceptable, even for commercial purposes, so long as
27 (1) the source code for the derivative work includes prominent notice that
28 the work is derivative, and (2) the source code includes prominent notice with
29 these four paragraphs for those parts of this code that are retained.
31 =============================================================================*/
33 /* FIXME: Flush-To-Zero only effects results. Denormal inputs should also
34 be flushed to zero. */
35 #include "softfloat.h"
37 /*----------------------------------------------------------------------------
38 | Primitive arithmetic functions, including multi-word arithmetic, and
39 | division and square root approximations. (Can be specialized to target if
41 *----------------------------------------------------------------------------*/
42 #include "softfloat-macros.h"
44 /*----------------------------------------------------------------------------
45 | Functions and definitions to determine: (1) whether tininess for underflow
46 | is detected before or after rounding by default, (2) what (if anything)
47 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
48 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
49 | are propagated from function inputs to output. These details are target-
51 *----------------------------------------------------------------------------*/
52 #include "softfloat-specialize.h"
54 void set_float_rounding_mode(int val STATUS_PARAM
)
56 STATUS(float_rounding_mode
) = val
;
59 void set_float_exception_flags(int val STATUS_PARAM
)
61 STATUS(float_exception_flags
) = val
;
65 void set_floatx80_rounding_precision(int val STATUS_PARAM
)
67 STATUS(floatx80_rounding_precision
) = val
;
71 /*----------------------------------------------------------------------------
72 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
73 | and 7, and returns the properly rounded 32-bit integer corresponding to the
74 | input. If `zSign' is 1, the input is negated before being converted to an
75 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
76 | is simply rounded to an integer, with the inexact exception raised if the
77 | input cannot be represented exactly as an integer. However, if the fixed-
78 | point input is too large, the invalid exception is raised and the largest
79 | positive or negative integer is returned.
80 *----------------------------------------------------------------------------*/
82 static int32
roundAndPackInt32( flag zSign
, bits64 absZ STATUS_PARAM
)
85 flag roundNearestEven
;
86 int8 roundIncrement
, roundBits
;
89 roundingMode
= STATUS(float_rounding_mode
);
90 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
91 roundIncrement
= 0x40;
92 if ( ! roundNearestEven
) {
93 if ( roundingMode
== float_round_to_zero
) {
97 roundIncrement
= 0x7F;
99 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
102 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
106 roundBits
= absZ
& 0x7F;
107 absZ
= ( absZ
+ roundIncrement
)>>7;
108 absZ
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
110 if ( zSign
) z
= - z
;
111 if ( ( absZ
>>32 ) || ( z
&& ( ( z
< 0 ) ^ zSign
) ) ) {
112 float_raise( float_flag_invalid STATUS_VAR
);
113 return zSign
? (sbits32
) 0x80000000 : 0x7FFFFFFF;
115 if ( roundBits
) STATUS(float_exception_flags
) |= float_flag_inexact
;
120 /*----------------------------------------------------------------------------
121 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
122 | `absZ1', with binary point between bits 63 and 64 (between the input words),
123 | and returns the properly rounded 64-bit integer corresponding to the input.
124 | If `zSign' is 1, the input is negated before being converted to an integer.
125 | Ordinarily, the fixed-point input is simply rounded to an integer, with
126 | the inexact exception raised if the input cannot be represented exactly as
127 | an integer. However, if the fixed-point input is too large, the invalid
128 | exception is raised and the largest positive or negative integer is
130 *----------------------------------------------------------------------------*/
132 static int64
roundAndPackInt64( flag zSign
, bits64 absZ0
, bits64 absZ1 STATUS_PARAM
)
135 flag roundNearestEven
, increment
;
138 roundingMode
= STATUS(float_rounding_mode
);
139 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
140 increment
= ( (sbits64
) absZ1
< 0 );
141 if ( ! roundNearestEven
) {
142 if ( roundingMode
== float_round_to_zero
) {
147 increment
= ( roundingMode
== float_round_down
) && absZ1
;
150 increment
= ( roundingMode
== float_round_up
) && absZ1
;
156 if ( absZ0
== 0 ) goto overflow
;
157 absZ0
&= ~ ( ( (bits64
) ( absZ1
<<1 ) == 0 ) & roundNearestEven
);
160 if ( zSign
) z
= - z
;
161 if ( z
&& ( ( z
< 0 ) ^ zSign
) ) {
163 float_raise( float_flag_invalid STATUS_VAR
);
165 zSign
? (sbits64
) LIT64( 0x8000000000000000 )
166 : LIT64( 0x7FFFFFFFFFFFFFFF );
168 if ( absZ1
) STATUS(float_exception_flags
) |= float_flag_inexact
;
173 /*----------------------------------------------------------------------------
174 | Returns the fraction bits of the single-precision floating-point value `a'.
175 *----------------------------------------------------------------------------*/
177 INLINE bits32
extractFloat32Frac( float32 a
)
180 return float32_val(a
) & 0x007FFFFF;
184 /*----------------------------------------------------------------------------
185 | Returns the exponent bits of the single-precision floating-point value `a'.
186 *----------------------------------------------------------------------------*/
188 INLINE int16
extractFloat32Exp( float32 a
)
191 return ( float32_val(a
)>>23 ) & 0xFF;
195 /*----------------------------------------------------------------------------
196 | Returns the sign bit of the single-precision floating-point value `a'.
197 *----------------------------------------------------------------------------*/
199 INLINE flag
extractFloat32Sign( float32 a
)
202 return float32_val(a
)>>31;
206 /*----------------------------------------------------------------------------
207 | Normalizes the subnormal single-precision floating-point value represented
208 | by the denormalized significand `aSig'. The normalized exponent and
209 | significand are stored at the locations pointed to by `zExpPtr' and
210 | `zSigPtr', respectively.
211 *----------------------------------------------------------------------------*/
214 normalizeFloat32Subnormal( bits32 aSig
, int16
*zExpPtr
, bits32
*zSigPtr
)
218 shiftCount
= countLeadingZeros32( aSig
) - 8;
219 *zSigPtr
= aSig
<<shiftCount
;
220 *zExpPtr
= 1 - shiftCount
;
224 /*----------------------------------------------------------------------------
225 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
226 | single-precision floating-point value, returning the result. After being
227 | shifted into the proper positions, the three fields are simply added
228 | together to form the result. This means that any integer portion of `zSig'
229 | will be added into the exponent. Since a properly normalized significand
230 | will have an integer portion equal to 1, the `zExp' input should be 1 less
231 | than the desired result exponent whenever `zSig' is a complete, normalized
233 *----------------------------------------------------------------------------*/
235 INLINE float32
packFloat32( flag zSign
, int16 zExp
, bits32 zSig
)
239 ( ( (bits32
) zSign
)<<31 ) + ( ( (bits32
) zExp
)<<23 ) + zSig
);
243 /*----------------------------------------------------------------------------
244 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
245 | and significand `zSig', and returns the proper single-precision floating-
246 | point value corresponding to the abstract input. Ordinarily, the abstract
247 | value is simply rounded and packed into the single-precision format, with
248 | the inexact exception raised if the abstract input cannot be represented
249 | exactly. However, if the abstract value is too large, the overflow and
250 | inexact exceptions are raised and an infinity or maximal finite value is
251 | returned. If the abstract value is too small, the input value is rounded to
252 | a subnormal number, and the underflow and inexact exceptions are raised if
253 | the abstract input cannot be represented exactly as a subnormal single-
254 | precision floating-point number.
255 | The input significand `zSig' has its binary point between bits 30
256 | and 29, which is 7 bits to the left of the usual location. This shifted
257 | significand must be normalized or smaller. If `zSig' is not normalized,
258 | `zExp' must be 0; in that case, the result returned is a subnormal number,
259 | and it must not require rounding. In the usual case that `zSig' is
260 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
261 | The handling of underflow and overflow follows the IEC/IEEE Standard for
262 | Binary Floating-Point Arithmetic.
263 *----------------------------------------------------------------------------*/
265 static float32
roundAndPackFloat32( flag zSign
, int16 zExp
, bits32 zSig STATUS_PARAM
)
268 flag roundNearestEven
;
269 int8 roundIncrement
, roundBits
;
272 roundingMode
= STATUS(float_rounding_mode
);
273 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
274 roundIncrement
= 0x40;
275 if ( ! roundNearestEven
) {
276 if ( roundingMode
== float_round_to_zero
) {
280 roundIncrement
= 0x7F;
282 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
285 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
289 roundBits
= zSig
& 0x7F;
290 if ( 0xFD <= (bits16
) zExp
) {
292 || ( ( zExp
== 0xFD )
293 && ( (sbits32
) ( zSig
+ roundIncrement
) < 0 ) )
295 float_raise( float_flag_overflow
| float_flag_inexact STATUS_VAR
);
296 return packFloat32( zSign
, 0xFF, - ( roundIncrement
== 0 ));
299 if ( STATUS(flush_to_zero
) ) return packFloat32( zSign
, 0, 0 );
301 ( STATUS(float_detect_tininess
) == float_tininess_before_rounding
)
303 || ( zSig
+ roundIncrement
< 0x80000000 );
304 shift32RightJamming( zSig
, - zExp
, &zSig
);
306 roundBits
= zSig
& 0x7F;
307 if ( isTiny
&& roundBits
) float_raise( float_flag_underflow STATUS_VAR
);
310 if ( roundBits
) STATUS(float_exception_flags
) |= float_flag_inexact
;
311 zSig
= ( zSig
+ roundIncrement
)>>7;
312 zSig
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
313 if ( zSig
== 0 ) zExp
= 0;
314 return packFloat32( zSign
, zExp
, zSig
);
318 /*----------------------------------------------------------------------------
319 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
320 | and significand `zSig', and returns the proper single-precision floating-
321 | point value corresponding to the abstract input. This routine is just like
322 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
323 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
324 | floating-point exponent.
325 *----------------------------------------------------------------------------*/
328 normalizeRoundAndPackFloat32( flag zSign
, int16 zExp
, bits32 zSig STATUS_PARAM
)
332 shiftCount
= countLeadingZeros32( zSig
) - 1;
333 return roundAndPackFloat32( zSign
, zExp
- shiftCount
, zSig
<<shiftCount STATUS_VAR
);
337 /*----------------------------------------------------------------------------
338 | Returns the fraction bits of the double-precision floating-point value `a'.
339 *----------------------------------------------------------------------------*/
341 INLINE bits64
extractFloat64Frac( float64 a
)
344 return float64_val(a
) & LIT64( 0x000FFFFFFFFFFFFF );
348 /*----------------------------------------------------------------------------
349 | Returns the exponent bits of the double-precision floating-point value `a'.
350 *----------------------------------------------------------------------------*/
352 INLINE int16
extractFloat64Exp( float64 a
)
355 return ( float64_val(a
)>>52 ) & 0x7FF;
359 /*----------------------------------------------------------------------------
360 | Returns the sign bit of the double-precision floating-point value `a'.
361 *----------------------------------------------------------------------------*/
363 INLINE flag
extractFloat64Sign( float64 a
)
366 return float64_val(a
)>>63;
370 /*----------------------------------------------------------------------------
371 | Normalizes the subnormal double-precision floating-point value represented
372 | by the denormalized significand `aSig'. The normalized exponent and
373 | significand are stored at the locations pointed to by `zExpPtr' and
374 | `zSigPtr', respectively.
375 *----------------------------------------------------------------------------*/
378 normalizeFloat64Subnormal( bits64 aSig
, int16
*zExpPtr
, bits64
*zSigPtr
)
382 shiftCount
= countLeadingZeros64( aSig
) - 11;
383 *zSigPtr
= aSig
<<shiftCount
;
384 *zExpPtr
= 1 - shiftCount
;
388 /*----------------------------------------------------------------------------
389 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
390 | double-precision floating-point value, returning the result. After being
391 | shifted into the proper positions, the three fields are simply added
392 | together to form the result. This means that any integer portion of `zSig'
393 | will be added into the exponent. Since a properly normalized significand
394 | will have an integer portion equal to 1, the `zExp' input should be 1 less
395 | than the desired result exponent whenever `zSig' is a complete, normalized
397 *----------------------------------------------------------------------------*/
399 INLINE float64
packFloat64( flag zSign
, int16 zExp
, bits64 zSig
)
403 ( ( (bits64
) zSign
)<<63 ) + ( ( (bits64
) zExp
)<<52 ) + zSig
);
407 /*----------------------------------------------------------------------------
408 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
409 | and significand `zSig', and returns the proper double-precision floating-
410 | point value corresponding to the abstract input. Ordinarily, the abstract
411 | value is simply rounded and packed into the double-precision format, with
412 | the inexact exception raised if the abstract input cannot be represented
413 | exactly. However, if the abstract value is too large, the overflow and
414 | inexact exceptions are raised and an infinity or maximal finite value is
415 | returned. If the abstract value is too small, the input value is rounded
416 | to a subnormal number, and the underflow and inexact exceptions are raised
417 | if the abstract input cannot be represented exactly as a subnormal double-
418 | precision floating-point number.
419 | The input significand `zSig' has its binary point between bits 62
420 | and 61, which is 10 bits to the left of the usual location. This shifted
421 | significand must be normalized or smaller. If `zSig' is not normalized,
422 | `zExp' must be 0; in that case, the result returned is a subnormal number,
423 | and it must not require rounding. In the usual case that `zSig' is
424 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
425 | The handling of underflow and overflow follows the IEC/IEEE Standard for
426 | Binary Floating-Point Arithmetic.
427 *----------------------------------------------------------------------------*/
429 static float64
roundAndPackFloat64( flag zSign
, int16 zExp
, bits64 zSig STATUS_PARAM
)
432 flag roundNearestEven
;
433 int16 roundIncrement
, roundBits
;
436 roundingMode
= STATUS(float_rounding_mode
);
437 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
438 roundIncrement
= 0x200;
439 if ( ! roundNearestEven
) {
440 if ( roundingMode
== float_round_to_zero
) {
444 roundIncrement
= 0x3FF;
446 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
449 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
453 roundBits
= zSig
& 0x3FF;
454 if ( 0x7FD <= (bits16
) zExp
) {
455 if ( ( 0x7FD < zExp
)
456 || ( ( zExp
== 0x7FD )
457 && ( (sbits64
) ( zSig
+ roundIncrement
) < 0 ) )
459 float_raise( float_flag_overflow
| float_flag_inexact STATUS_VAR
);
460 return packFloat64( zSign
, 0x7FF, - ( roundIncrement
== 0 ));
463 if ( STATUS(flush_to_zero
) ) return packFloat64( zSign
, 0, 0 );
465 ( STATUS(float_detect_tininess
) == float_tininess_before_rounding
)
467 || ( zSig
+ roundIncrement
< LIT64( 0x8000000000000000 ) );
468 shift64RightJamming( zSig
, - zExp
, &zSig
);
470 roundBits
= zSig
& 0x3FF;
471 if ( isTiny
&& roundBits
) float_raise( float_flag_underflow STATUS_VAR
);
474 if ( roundBits
) STATUS(float_exception_flags
) |= float_flag_inexact
;
475 zSig
= ( zSig
+ roundIncrement
)>>10;
476 zSig
&= ~ ( ( ( roundBits
^ 0x200 ) == 0 ) & roundNearestEven
);
477 if ( zSig
== 0 ) zExp
= 0;
478 return packFloat64( zSign
, zExp
, zSig
);
482 /*----------------------------------------------------------------------------
483 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
484 | and significand `zSig', and returns the proper double-precision floating-
485 | point value corresponding to the abstract input. This routine is just like
486 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
487 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
488 | floating-point exponent.
489 *----------------------------------------------------------------------------*/
492 normalizeRoundAndPackFloat64( flag zSign
, int16 zExp
, bits64 zSig STATUS_PARAM
)
496 shiftCount
= countLeadingZeros64( zSig
) - 1;
497 return roundAndPackFloat64( zSign
, zExp
- shiftCount
, zSig
<<shiftCount STATUS_VAR
);
503 /*----------------------------------------------------------------------------
504 | Returns the fraction bits of the extended double-precision floating-point
506 *----------------------------------------------------------------------------*/
508 INLINE bits64
extractFloatx80Frac( floatx80 a
)
515 /*----------------------------------------------------------------------------
516 | Returns the exponent bits of the extended double-precision floating-point
518 *----------------------------------------------------------------------------*/
520 INLINE int32
extractFloatx80Exp( floatx80 a
)
523 return a
.high
& 0x7FFF;
527 /*----------------------------------------------------------------------------
528 | Returns the sign bit of the extended double-precision floating-point value
530 *----------------------------------------------------------------------------*/
532 INLINE flag
extractFloatx80Sign( floatx80 a
)
539 /*----------------------------------------------------------------------------
540 | Normalizes the subnormal extended double-precision floating-point value
541 | represented by the denormalized significand `aSig'. The normalized exponent
542 | and significand are stored at the locations pointed to by `zExpPtr' and
543 | `zSigPtr', respectively.
544 *----------------------------------------------------------------------------*/
547 normalizeFloatx80Subnormal( bits64 aSig
, int32
*zExpPtr
, bits64
*zSigPtr
)
551 shiftCount
= countLeadingZeros64( aSig
);
552 *zSigPtr
= aSig
<<shiftCount
;
553 *zExpPtr
= 1 - shiftCount
;
557 /*----------------------------------------------------------------------------
558 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
559 | extended double-precision floating-point value, returning the result.
560 *----------------------------------------------------------------------------*/
562 INLINE floatx80
packFloatx80( flag zSign
, int32 zExp
, bits64 zSig
)
567 z
.high
= ( ( (bits16
) zSign
)<<15 ) + zExp
;
572 /*----------------------------------------------------------------------------
573 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
574 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
575 | and returns the proper extended double-precision floating-point value
576 | corresponding to the abstract input. Ordinarily, the abstract value is
577 | rounded and packed into the extended double-precision format, with the
578 | inexact exception raised if the abstract input cannot be represented
579 | exactly. However, if the abstract value is too large, the overflow and
580 | inexact exceptions are raised and an infinity or maximal finite value is
581 | returned. If the abstract value is too small, the input value is rounded to
582 | a subnormal number, and the underflow and inexact exceptions are raised if
583 | the abstract input cannot be represented exactly as a subnormal extended
584 | double-precision floating-point number.
585 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
586 | number of bits as single or double precision, respectively. Otherwise, the
587 | result is rounded to the full precision of the extended double-precision
589 | The input significand must be normalized or smaller. If the input
590 | significand is not normalized, `zExp' must be 0; in that case, the result
591 | returned is a subnormal number, and it must not require rounding. The
592 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
593 | Floating-Point Arithmetic.
594 *----------------------------------------------------------------------------*/
597 roundAndPackFloatx80(
598 int8 roundingPrecision
, flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1
602 flag roundNearestEven
, increment
, isTiny
;
603 int64 roundIncrement
, roundMask
, roundBits
;
605 roundingMode
= STATUS(float_rounding_mode
);
606 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
607 if ( roundingPrecision
== 80 ) goto precision80
;
608 if ( roundingPrecision
== 64 ) {
609 roundIncrement
= LIT64( 0x0000000000000400 );
610 roundMask
= LIT64( 0x00000000000007FF );
612 else if ( roundingPrecision
== 32 ) {
613 roundIncrement
= LIT64( 0x0000008000000000 );
614 roundMask
= LIT64( 0x000000FFFFFFFFFF );
619 zSig0
|= ( zSig1
!= 0 );
620 if ( ! roundNearestEven
) {
621 if ( roundingMode
== float_round_to_zero
) {
625 roundIncrement
= roundMask
;
627 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
630 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
634 roundBits
= zSig0
& roundMask
;
635 if ( 0x7FFD <= (bits32
) ( zExp
- 1 ) ) {
636 if ( ( 0x7FFE < zExp
)
637 || ( ( zExp
== 0x7FFE ) && ( zSig0
+ roundIncrement
< zSig0
) )
642 if ( STATUS(flush_to_zero
) ) return packFloatx80( zSign
, 0, 0 );
644 ( STATUS(float_detect_tininess
) == float_tininess_before_rounding
)
646 || ( zSig0
<= zSig0
+ roundIncrement
);
647 shift64RightJamming( zSig0
, 1 - zExp
, &zSig0
);
649 roundBits
= zSig0
& roundMask
;
650 if ( isTiny
&& roundBits
) float_raise( float_flag_underflow STATUS_VAR
);
651 if ( roundBits
) STATUS(float_exception_flags
) |= float_flag_inexact
;
652 zSig0
+= roundIncrement
;
653 if ( (sbits64
) zSig0
< 0 ) zExp
= 1;
654 roundIncrement
= roundMask
+ 1;
655 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
656 roundMask
|= roundIncrement
;
658 zSig0
&= ~ roundMask
;
659 return packFloatx80( zSign
, zExp
, zSig0
);
662 if ( roundBits
) STATUS(float_exception_flags
) |= float_flag_inexact
;
663 zSig0
+= roundIncrement
;
664 if ( zSig0
< roundIncrement
) {
666 zSig0
= LIT64( 0x8000000000000000 );
668 roundIncrement
= roundMask
+ 1;
669 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
670 roundMask
|= roundIncrement
;
672 zSig0
&= ~ roundMask
;
673 if ( zSig0
== 0 ) zExp
= 0;
674 return packFloatx80( zSign
, zExp
, zSig0
);
676 increment
= ( (sbits64
) zSig1
< 0 );
677 if ( ! roundNearestEven
) {
678 if ( roundingMode
== float_round_to_zero
) {
683 increment
= ( roundingMode
== float_round_down
) && zSig1
;
686 increment
= ( roundingMode
== float_round_up
) && zSig1
;
690 if ( 0x7FFD <= (bits32
) ( zExp
- 1 ) ) {
691 if ( ( 0x7FFE < zExp
)
692 || ( ( zExp
== 0x7FFE )
693 && ( zSig0
== LIT64( 0xFFFFFFFFFFFFFFFF ) )
699 float_raise( float_flag_overflow
| float_flag_inexact STATUS_VAR
);
700 if ( ( roundingMode
== float_round_to_zero
)
701 || ( zSign
&& ( roundingMode
== float_round_up
) )
702 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
704 return packFloatx80( zSign
, 0x7FFE, ~ roundMask
);
706 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
710 ( STATUS(float_detect_tininess
) == float_tininess_before_rounding
)
713 || ( zSig0
< LIT64( 0xFFFFFFFFFFFFFFFF ) );
714 shift64ExtraRightJamming( zSig0
, zSig1
, 1 - zExp
, &zSig0
, &zSig1
);
716 if ( isTiny
&& zSig1
) float_raise( float_flag_underflow STATUS_VAR
);
717 if ( zSig1
) STATUS(float_exception_flags
) |= float_flag_inexact
;
718 if ( roundNearestEven
) {
719 increment
= ( (sbits64
) zSig1
< 0 );
723 increment
= ( roundingMode
== float_round_down
) && zSig1
;
726 increment
= ( roundingMode
== float_round_up
) && zSig1
;
732 ~ ( ( (bits64
) ( zSig1
<<1 ) == 0 ) & roundNearestEven
);
733 if ( (sbits64
) zSig0
< 0 ) zExp
= 1;
735 return packFloatx80( zSign
, zExp
, zSig0
);
738 if ( zSig1
) STATUS(float_exception_flags
) |= float_flag_inexact
;
743 zSig0
= LIT64( 0x8000000000000000 );
746 zSig0
&= ~ ( ( (bits64
) ( zSig1
<<1 ) == 0 ) & roundNearestEven
);
750 if ( zSig0
== 0 ) zExp
= 0;
752 return packFloatx80( zSign
, zExp
, zSig0
);
756 /*----------------------------------------------------------------------------
757 | Takes an abstract floating-point value having sign `zSign', exponent
758 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
759 | and returns the proper extended double-precision floating-point value
760 | corresponding to the abstract input. This routine is just like
761 | `roundAndPackFloatx80' except that the input significand does not have to be
763 *----------------------------------------------------------------------------*/
766 normalizeRoundAndPackFloatx80(
767 int8 roundingPrecision
, flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1
777 shiftCount
= countLeadingZeros64( zSig0
);
778 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
781 roundAndPackFloatx80( roundingPrecision
, zSign
, zExp
, zSig0
, zSig1 STATUS_VAR
);
789 /*----------------------------------------------------------------------------
790 | Returns the least-significant 64 fraction bits of the quadruple-precision
791 | floating-point value `a'.
792 *----------------------------------------------------------------------------*/
794 INLINE bits64
extractFloat128Frac1( float128 a
)
801 /*----------------------------------------------------------------------------
802 | Returns the most-significant 48 fraction bits of the quadruple-precision
803 | floating-point value `a'.
804 *----------------------------------------------------------------------------*/
806 INLINE bits64
extractFloat128Frac0( float128 a
)
809 return a
.high
& LIT64( 0x0000FFFFFFFFFFFF );
813 /*----------------------------------------------------------------------------
814 | Returns the exponent bits of the quadruple-precision floating-point value
816 *----------------------------------------------------------------------------*/
818 INLINE int32
extractFloat128Exp( float128 a
)
821 return ( a
.high
>>48 ) & 0x7FFF;
825 /*----------------------------------------------------------------------------
826 | Returns the sign bit of the quadruple-precision floating-point value `a'.
827 *----------------------------------------------------------------------------*/
829 INLINE flag
extractFloat128Sign( float128 a
)
836 /*----------------------------------------------------------------------------
837 | Normalizes the subnormal quadruple-precision floating-point value
838 | represented by the denormalized significand formed by the concatenation of
839 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
840 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
841 | significand are stored at the location pointed to by `zSig0Ptr', and the
842 | least significant 64 bits of the normalized significand are stored at the
843 | location pointed to by `zSig1Ptr'.
844 *----------------------------------------------------------------------------*/
847 normalizeFloat128Subnormal(
858 shiftCount
= countLeadingZeros64( aSig1
) - 15;
859 if ( shiftCount
< 0 ) {
860 *zSig0Ptr
= aSig1
>>( - shiftCount
);
861 *zSig1Ptr
= aSig1
<<( shiftCount
& 63 );
864 *zSig0Ptr
= aSig1
<<shiftCount
;
867 *zExpPtr
= - shiftCount
- 63;
870 shiftCount
= countLeadingZeros64( aSig0
) - 15;
871 shortShift128Left( aSig0
, aSig1
, shiftCount
, zSig0Ptr
, zSig1Ptr
);
872 *zExpPtr
= 1 - shiftCount
;
877 /*----------------------------------------------------------------------------
878 | Packs the sign `zSign', the exponent `zExp', and the significand formed
879 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
880 | floating-point value, returning the result. After being shifted into the
881 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
882 | added together to form the most significant 32 bits of the result. This
883 | means that any integer portion of `zSig0' will be added into the exponent.
884 | Since a properly normalized significand will have an integer portion equal
885 | to 1, the `zExp' input should be 1 less than the desired result exponent
886 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
888 *----------------------------------------------------------------------------*/
891 packFloat128( flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1
)
896 z
.high
= ( ( (bits64
) zSign
)<<63 ) + ( ( (bits64
) zExp
)<<48 ) + zSig0
;
901 /*----------------------------------------------------------------------------
902 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
903 | and extended significand formed by the concatenation of `zSig0', `zSig1',
904 | and `zSig2', and returns the proper quadruple-precision floating-point value
905 | corresponding to the abstract input. Ordinarily, the abstract value is
906 | simply rounded and packed into the quadruple-precision format, with the
907 | inexact exception raised if the abstract input cannot be represented
908 | exactly. However, if the abstract value is too large, the overflow and
909 | inexact exceptions are raised and an infinity or maximal finite value is
910 | returned. If the abstract value is too small, the input value is rounded to
911 | a subnormal number, and the underflow and inexact exceptions are raised if
912 | the abstract input cannot be represented exactly as a subnormal quadruple-
913 | precision floating-point number.
914 | The input significand must be normalized or smaller. If the input
915 | significand is not normalized, `zExp' must be 0; in that case, the result
916 | returned is a subnormal number, and it must not require rounding. In the
917 | usual case that the input significand is normalized, `zExp' must be 1 less
918 | than the ``true'' floating-point exponent. The handling of underflow and
919 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
920 *----------------------------------------------------------------------------*/
923 roundAndPackFloat128(
924 flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1
, bits64 zSig2 STATUS_PARAM
)
927 flag roundNearestEven
, increment
, isTiny
;
929 roundingMode
= STATUS(float_rounding_mode
);
930 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
931 increment
= ( (sbits64
) zSig2
< 0 );
932 if ( ! roundNearestEven
) {
933 if ( roundingMode
== float_round_to_zero
) {
938 increment
= ( roundingMode
== float_round_down
) && zSig2
;
941 increment
= ( roundingMode
== float_round_up
) && zSig2
;
945 if ( 0x7FFD <= (bits32
) zExp
) {
946 if ( ( 0x7FFD < zExp
)
947 || ( ( zExp
== 0x7FFD )
949 LIT64( 0x0001FFFFFFFFFFFF ),
950 LIT64( 0xFFFFFFFFFFFFFFFF ),
957 float_raise( float_flag_overflow
| float_flag_inexact STATUS_VAR
);
958 if ( ( roundingMode
== float_round_to_zero
)
959 || ( zSign
&& ( roundingMode
== float_round_up
) )
960 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
966 LIT64( 0x0000FFFFFFFFFFFF ),
967 LIT64( 0xFFFFFFFFFFFFFFFF )
970 return packFloat128( zSign
, 0x7FFF, 0, 0 );
973 if ( STATUS(flush_to_zero
) ) return packFloat128( zSign
, 0, 0, 0 );
975 ( STATUS(float_detect_tininess
) == float_tininess_before_rounding
)
981 LIT64( 0x0001FFFFFFFFFFFF ),
982 LIT64( 0xFFFFFFFFFFFFFFFF )
984 shift128ExtraRightJamming(
985 zSig0
, zSig1
, zSig2
, - zExp
, &zSig0
, &zSig1
, &zSig2
);
987 if ( isTiny
&& zSig2
) float_raise( float_flag_underflow STATUS_VAR
);
988 if ( roundNearestEven
) {
989 increment
= ( (sbits64
) zSig2
< 0 );
993 increment
= ( roundingMode
== float_round_down
) && zSig2
;
996 increment
= ( roundingMode
== float_round_up
) && zSig2
;
1001 if ( zSig2
) STATUS(float_exception_flags
) |= float_flag_inexact
;
1003 add128( zSig0
, zSig1
, 0, 1, &zSig0
, &zSig1
);
1004 zSig1
&= ~ ( ( zSig2
+ zSig2
== 0 ) & roundNearestEven
);
1007 if ( ( zSig0
| zSig1
) == 0 ) zExp
= 0;
1009 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
1013 /*----------------------------------------------------------------------------
1014 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1015 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1016 | returns the proper quadruple-precision floating-point value corresponding
1017 | to the abstract input. This routine is just like `roundAndPackFloat128'
1018 | except that the input significand has fewer bits and does not have to be
1019 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1021 *----------------------------------------------------------------------------*/
1024 normalizeRoundAndPackFloat128(
1025 flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1 STATUS_PARAM
)
1035 shiftCount
= countLeadingZeros64( zSig0
) - 15;
1036 if ( 0 <= shiftCount
) {
1038 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
1041 shift128ExtraRightJamming(
1042 zSig0
, zSig1
, 0, - shiftCount
, &zSig0
, &zSig1
, &zSig2
);
1045 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2 STATUS_VAR
);
1051 /*----------------------------------------------------------------------------
1052 | Returns the result of converting the 32-bit two's complement integer `a'
1053 | to the single-precision floating-point format. The conversion is performed
1054 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1055 *----------------------------------------------------------------------------*/
1057 float32
int32_to_float32( int32 a STATUS_PARAM
)
1061 if ( a
== 0 ) return float32_zero
;
1062 if ( a
== (sbits32
) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1064 return normalizeRoundAndPackFloat32( zSign
, 0x9C, zSign
? - a
: a STATUS_VAR
);
1068 /*----------------------------------------------------------------------------
1069 | Returns the result of converting the 32-bit two's complement integer `a'
1070 | to the double-precision floating-point format. The conversion is performed
1071 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1072 *----------------------------------------------------------------------------*/
1074 float64
int32_to_float64( int32 a STATUS_PARAM
)
1081 if ( a
== 0 ) return float64_zero
;
1083 absA
= zSign
? - a
: a
;
1084 shiftCount
= countLeadingZeros32( absA
) + 21;
1086 return packFloat64( zSign
, 0x432 - shiftCount
, zSig
<<shiftCount
);
1092 /*----------------------------------------------------------------------------
1093 | Returns the result of converting the 32-bit two's complement integer `a'
1094 | to the extended double-precision floating-point format. The conversion
1095 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1097 *----------------------------------------------------------------------------*/
1099 floatx80
int32_to_floatx80( int32 a STATUS_PARAM
)
1106 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
1108 absA
= zSign
? - a
: a
;
1109 shiftCount
= countLeadingZeros32( absA
) + 32;
1111 return packFloatx80( zSign
, 0x403E - shiftCount
, zSig
<<shiftCount
);
1119 /*----------------------------------------------------------------------------
1120 | Returns the result of converting the 32-bit two's complement integer `a' to
1121 | the quadruple-precision floating-point format. The conversion is performed
1122 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1123 *----------------------------------------------------------------------------*/
1125 float128
int32_to_float128( int32 a STATUS_PARAM
)
1132 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
1134 absA
= zSign
? - a
: a
;
1135 shiftCount
= countLeadingZeros32( absA
) + 17;
1137 return packFloat128( zSign
, 0x402E - shiftCount
, zSig0
<<shiftCount
, 0 );
1143 /*----------------------------------------------------------------------------
1144 | Returns the result of converting the 64-bit two's complement integer `a'
1145 | to the single-precision floating-point format. The conversion is performed
1146 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1147 *----------------------------------------------------------------------------*/
1149 float32
int64_to_float32( int64 a STATUS_PARAM
)
1155 if ( a
== 0 ) return float32_zero
;
1157 absA
= zSign
? - a
: a
;
1158 shiftCount
= countLeadingZeros64( absA
) - 40;
1159 if ( 0 <= shiftCount
) {
1160 return packFloat32( zSign
, 0x95 - shiftCount
, absA
<<shiftCount
);
1164 if ( shiftCount
< 0 ) {
1165 shift64RightJamming( absA
, - shiftCount
, &absA
);
1168 absA
<<= shiftCount
;
1170 return roundAndPackFloat32( zSign
, 0x9C - shiftCount
, absA STATUS_VAR
);
1175 float32
uint64_to_float32( uint64 a STATUS_PARAM
)
1179 if ( a
== 0 ) return float32_zero
;
1180 shiftCount
= countLeadingZeros64( a
) - 40;
1181 if ( 0 <= shiftCount
) {
1182 return packFloat32( 1 > 0, 0x95 - shiftCount
, a
<<shiftCount
);
1186 if ( shiftCount
< 0 ) {
1187 shift64RightJamming( a
, - shiftCount
, &a
);
1192 return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount
, a STATUS_VAR
);
1196 /*----------------------------------------------------------------------------
1197 | Returns the result of converting the 64-bit two's complement integer `a'
1198 | to the double-precision floating-point format. The conversion is performed
1199 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1200 *----------------------------------------------------------------------------*/
1202 float64
int64_to_float64( int64 a STATUS_PARAM
)
1206 if ( a
== 0 ) return float64_zero
;
1207 if ( a
== (sbits64
) LIT64( 0x8000000000000000 ) ) {
1208 return packFloat64( 1, 0x43E, 0 );
1211 return normalizeRoundAndPackFloat64( zSign
, 0x43C, zSign
? - a
: a STATUS_VAR
);
1215 float64
uint64_to_float64( uint64 a STATUS_PARAM
)
1217 if ( a
== 0 ) return float64_zero
;
1218 return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR
);
1224 /*----------------------------------------------------------------------------
1225 | Returns the result of converting the 64-bit two's complement integer `a'
1226 | to the extended double-precision floating-point format. The conversion
1227 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1229 *----------------------------------------------------------------------------*/
1231 floatx80
int64_to_floatx80( int64 a STATUS_PARAM
)
1237 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
1239 absA
= zSign
? - a
: a
;
1240 shiftCount
= countLeadingZeros64( absA
);
1241 return packFloatx80( zSign
, 0x403E - shiftCount
, absA
<<shiftCount
);
1249 /*----------------------------------------------------------------------------
1250 | Returns the result of converting the 64-bit two's complement integer `a' to
1251 | the quadruple-precision floating-point format. The conversion is performed
1252 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1253 *----------------------------------------------------------------------------*/
1255 float128
int64_to_float128( int64 a STATUS_PARAM
)
1261 bits64 zSig0
, zSig1
;
1263 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
1265 absA
= zSign
? - a
: a
;
1266 shiftCount
= countLeadingZeros64( absA
) + 49;
1267 zExp
= 0x406E - shiftCount
;
1268 if ( 64 <= shiftCount
) {
1277 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
1278 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
1284 /*----------------------------------------------------------------------------
1285 | Returns the result of converting the single-precision floating-point value
1286 | `a' to the 32-bit two's complement integer format. The conversion is
1287 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1288 | Arithmetic---which means in particular that the conversion is rounded
1289 | according to the current rounding mode. If `a' is a NaN, the largest
1290 | positive integer is returned. Otherwise, if the conversion overflows, the
1291 | largest integer with the same sign as `a' is returned.
1292 *----------------------------------------------------------------------------*/
1294 int32
float32_to_int32( float32 a STATUS_PARAM
)
1297 int16 aExp
, shiftCount
;
1301 aSig
= extractFloat32Frac( a
);
1302 aExp
= extractFloat32Exp( a
);
1303 aSign
= extractFloat32Sign( a
);
1304 if ( ( aExp
== 0xFF ) && aSig
) aSign
= 0;
1305 if ( aExp
) aSig
|= 0x00800000;
1306 shiftCount
= 0xAF - aExp
;
1309 if ( 0 < shiftCount
) shift64RightJamming( aSig64
, shiftCount
, &aSig64
);
1310 return roundAndPackInt32( aSign
, aSig64 STATUS_VAR
);
1314 /*----------------------------------------------------------------------------
1315 | Returns the result of converting the single-precision floating-point value
1316 | `a' to the 32-bit two's complement integer format. The conversion is
1317 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1318 | Arithmetic, except that the conversion is always rounded toward zero.
1319 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1320 | the conversion overflows, the largest integer with the same sign as `a' is
1322 *----------------------------------------------------------------------------*/
1324 int32
float32_to_int32_round_to_zero( float32 a STATUS_PARAM
)
1327 int16 aExp
, shiftCount
;
1331 aSig
= extractFloat32Frac( a
);
1332 aExp
= extractFloat32Exp( a
);
1333 aSign
= extractFloat32Sign( a
);
1334 shiftCount
= aExp
- 0x9E;
1335 if ( 0 <= shiftCount
) {
1336 if ( float32_val(a
) != 0xCF000000 ) {
1337 float_raise( float_flag_invalid STATUS_VAR
);
1338 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) return 0x7FFFFFFF;
1340 return (sbits32
) 0x80000000;
1342 else if ( aExp
<= 0x7E ) {
1343 if ( aExp
| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
1346 aSig
= ( aSig
| 0x00800000 )<<8;
1347 z
= aSig
>>( - shiftCount
);
1348 if ( (bits32
) ( aSig
<<( shiftCount
& 31 ) ) ) {
1349 STATUS(float_exception_flags
) |= float_flag_inexact
;
1351 if ( aSign
) z
= - z
;
1356 /*----------------------------------------------------------------------------
1357 | Returns the result of converting the single-precision floating-point value
1358 | `a' to the 64-bit two's complement integer format. The conversion is
1359 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1360 | Arithmetic---which means in particular that the conversion is rounded
1361 | according to the current rounding mode. If `a' is a NaN, the largest
1362 | positive integer is returned. Otherwise, if the conversion overflows, the
1363 | largest integer with the same sign as `a' is returned.
1364 *----------------------------------------------------------------------------*/
1366 int64
float32_to_int64( float32 a STATUS_PARAM
)
1369 int16 aExp
, shiftCount
;
1371 bits64 aSig64
, aSigExtra
;
1373 aSig
= extractFloat32Frac( a
);
1374 aExp
= extractFloat32Exp( a
);
1375 aSign
= extractFloat32Sign( a
);
1376 shiftCount
= 0xBE - aExp
;
1377 if ( shiftCount
< 0 ) {
1378 float_raise( float_flag_invalid STATUS_VAR
);
1379 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) {
1380 return LIT64( 0x7FFFFFFFFFFFFFFF );
1382 return (sbits64
) LIT64( 0x8000000000000000 );
1384 if ( aExp
) aSig
|= 0x00800000;
1387 shift64ExtraRightJamming( aSig64
, 0, shiftCount
, &aSig64
, &aSigExtra
);
1388 return roundAndPackInt64( aSign
, aSig64
, aSigExtra STATUS_VAR
);
1392 /*----------------------------------------------------------------------------
1393 | Returns the result of converting the single-precision floating-point value
1394 | `a' to the 64-bit two's complement integer format. The conversion is
1395 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1396 | Arithmetic, except that the conversion is always rounded toward zero. If
1397 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1398 | conversion overflows, the largest integer with the same sign as `a' is
1400 *----------------------------------------------------------------------------*/
1402 int64
float32_to_int64_round_to_zero( float32 a STATUS_PARAM
)
1405 int16 aExp
, shiftCount
;
1410 aSig
= extractFloat32Frac( a
);
1411 aExp
= extractFloat32Exp( a
);
1412 aSign
= extractFloat32Sign( a
);
1413 shiftCount
= aExp
- 0xBE;
1414 if ( 0 <= shiftCount
) {
1415 if ( float32_val(a
) != 0xDF000000 ) {
1416 float_raise( float_flag_invalid STATUS_VAR
);
1417 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) {
1418 return LIT64( 0x7FFFFFFFFFFFFFFF );
1421 return (sbits64
) LIT64( 0x8000000000000000 );
1423 else if ( aExp
<= 0x7E ) {
1424 if ( aExp
| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
1427 aSig64
= aSig
| 0x00800000;
1429 z
= aSig64
>>( - shiftCount
);
1430 if ( (bits64
) ( aSig64
<<( shiftCount
& 63 ) ) ) {
1431 STATUS(float_exception_flags
) |= float_flag_inexact
;
1433 if ( aSign
) z
= - z
;
1438 /*----------------------------------------------------------------------------
1439 | Returns the result of converting the single-precision floating-point value
1440 | `a' to the double-precision floating-point format. The conversion is
1441 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1443 *----------------------------------------------------------------------------*/
1445 float64
float32_to_float64( float32 a STATUS_PARAM
)
1451 aSig
= extractFloat32Frac( a
);
1452 aExp
= extractFloat32Exp( a
);
1453 aSign
= extractFloat32Sign( a
);
1454 if ( aExp
== 0xFF ) {
1455 if ( aSig
) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR
));
1456 return packFloat64( aSign
, 0x7FF, 0 );
1459 if ( aSig
== 0 ) return packFloat64( aSign
, 0, 0 );
1460 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1463 return packFloat64( aSign
, aExp
+ 0x380, ( (bits64
) aSig
)<<29 );
1469 /*----------------------------------------------------------------------------
1470 | Returns the result of converting the single-precision floating-point value
1471 | `a' to the extended double-precision floating-point format. The conversion
1472 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1474 *----------------------------------------------------------------------------*/
1476 floatx80
float32_to_floatx80( float32 a STATUS_PARAM
)
1482 aSig
= extractFloat32Frac( a
);
1483 aExp
= extractFloat32Exp( a
);
1484 aSign
= extractFloat32Sign( a
);
1485 if ( aExp
== 0xFF ) {
1486 if ( aSig
) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR
) );
1487 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
1490 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
1491 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1494 return packFloatx80( aSign
, aExp
+ 0x3F80, ( (bits64
) aSig
)<<40 );
1502 /*----------------------------------------------------------------------------
1503 | Returns the result of converting the single-precision floating-point value
1504 | `a' to the double-precision floating-point format. The conversion is
1505 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1507 *----------------------------------------------------------------------------*/
1509 float128
float32_to_float128( float32 a STATUS_PARAM
)
1515 aSig
= extractFloat32Frac( a
);
1516 aExp
= extractFloat32Exp( a
);
1517 aSign
= extractFloat32Sign( a
);
1518 if ( aExp
== 0xFF ) {
1519 if ( aSig
) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR
) );
1520 return packFloat128( aSign
, 0x7FFF, 0, 0 );
1523 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
1524 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1527 return packFloat128( aSign
, aExp
+ 0x3F80, ( (bits64
) aSig
)<<25, 0 );
1533 /*----------------------------------------------------------------------------
1534 | Rounds the single-precision floating-point value `a' to an integer, and
1535 | returns the result as a single-precision floating-point value. The
1536 | operation is performed according to the IEC/IEEE Standard for Binary
1537 | Floating-Point Arithmetic.
1538 *----------------------------------------------------------------------------*/
1540 float32
float32_round_to_int( float32 a STATUS_PARAM
)
1544 bits32 lastBitMask
, roundBitsMask
;
1548 aExp
= extractFloat32Exp( a
);
1549 if ( 0x96 <= aExp
) {
1550 if ( ( aExp
== 0xFF ) && extractFloat32Frac( a
) ) {
1551 return propagateFloat32NaN( a
, a STATUS_VAR
);
1555 if ( aExp
<= 0x7E ) {
1556 if ( (bits32
) ( float32_val(a
)<<1 ) == 0 ) return a
;
1557 STATUS(float_exception_flags
) |= float_flag_inexact
;
1558 aSign
= extractFloat32Sign( a
);
1559 switch ( STATUS(float_rounding_mode
) ) {
1560 case float_round_nearest_even
:
1561 if ( ( aExp
== 0x7E ) && extractFloat32Frac( a
) ) {
1562 return packFloat32( aSign
, 0x7F, 0 );
1565 case float_round_down
:
1566 return make_float32(aSign
? 0xBF800000 : 0);
1567 case float_round_up
:
1568 return make_float32(aSign
? 0x80000000 : 0x3F800000);
1570 return packFloat32( aSign
, 0, 0 );
1573 lastBitMask
<<= 0x96 - aExp
;
1574 roundBitsMask
= lastBitMask
- 1;
1576 roundingMode
= STATUS(float_rounding_mode
);
1577 if ( roundingMode
== float_round_nearest_even
) {
1578 z
+= lastBitMask
>>1;
1579 if ( ( z
& roundBitsMask
) == 0 ) z
&= ~ lastBitMask
;
1581 else if ( roundingMode
!= float_round_to_zero
) {
1582 if ( extractFloat32Sign( make_float32(z
) ) ^ ( roundingMode
== float_round_up
) ) {
1586 z
&= ~ roundBitsMask
;
1587 if ( z
!= float32_val(a
) ) STATUS(float_exception_flags
) |= float_flag_inexact
;
1588 return make_float32(z
);
1592 /*----------------------------------------------------------------------------
1593 | Returns the result of adding the absolute values of the single-precision
1594 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1595 | before being returned. `zSign' is ignored if the result is a NaN.
1596 | The addition is performed according to the IEC/IEEE Standard for Binary
1597 | Floating-Point Arithmetic.
1598 *----------------------------------------------------------------------------*/
1600 static float32
addFloat32Sigs( float32 a
, float32 b
, flag zSign STATUS_PARAM
)
1602 int16 aExp
, bExp
, zExp
;
1603 bits32 aSig
, bSig
, zSig
;
1606 aSig
= extractFloat32Frac( a
);
1607 aExp
= extractFloat32Exp( a
);
1608 bSig
= extractFloat32Frac( b
);
1609 bExp
= extractFloat32Exp( b
);
1610 expDiff
= aExp
- bExp
;
1613 if ( 0 < expDiff
) {
1614 if ( aExp
== 0xFF ) {
1615 if ( aSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1624 shift32RightJamming( bSig
, expDiff
, &bSig
);
1627 else if ( expDiff
< 0 ) {
1628 if ( bExp
== 0xFF ) {
1629 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1630 return packFloat32( zSign
, 0xFF, 0 );
1638 shift32RightJamming( aSig
, - expDiff
, &aSig
);
1642 if ( aExp
== 0xFF ) {
1643 if ( aSig
| bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1647 if ( STATUS(flush_to_zero
) ) return packFloat32( zSign
, 0, 0 );
1648 return packFloat32( zSign
, 0, ( aSig
+ bSig
)>>6 );
1650 zSig
= 0x40000000 + aSig
+ bSig
;
1655 zSig
= ( aSig
+ bSig
)<<1;
1657 if ( (sbits32
) zSig
< 0 ) {
1662 return roundAndPackFloat32( zSign
, zExp
, zSig STATUS_VAR
);
1666 /*----------------------------------------------------------------------------
1667 | Returns the result of subtracting the absolute values of the single-
1668 | precision floating-point values `a' and `b'. If `zSign' is 1, the
1669 | difference is negated before being returned. `zSign' is ignored if the
1670 | result is a NaN. The subtraction is performed according to the IEC/IEEE
1671 | Standard for Binary Floating-Point Arithmetic.
1672 *----------------------------------------------------------------------------*/
1674 static float32
subFloat32Sigs( float32 a
, float32 b
, flag zSign STATUS_PARAM
)
1676 int16 aExp
, bExp
, zExp
;
1677 bits32 aSig
, bSig
, zSig
;
1680 aSig
= extractFloat32Frac( a
);
1681 aExp
= extractFloat32Exp( a
);
1682 bSig
= extractFloat32Frac( b
);
1683 bExp
= extractFloat32Exp( b
);
1684 expDiff
= aExp
- bExp
;
1687 if ( 0 < expDiff
) goto aExpBigger
;
1688 if ( expDiff
< 0 ) goto bExpBigger
;
1689 if ( aExp
== 0xFF ) {
1690 if ( aSig
| bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1691 float_raise( float_flag_invalid STATUS_VAR
);
1692 return float32_default_nan
;
1698 if ( bSig
< aSig
) goto aBigger
;
1699 if ( aSig
< bSig
) goto bBigger
;
1700 return packFloat32( STATUS(float_rounding_mode
) == float_round_down
, 0, 0 );
1702 if ( bExp
== 0xFF ) {
1703 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1704 return packFloat32( zSign
^ 1, 0xFF, 0 );
1712 shift32RightJamming( aSig
, - expDiff
, &aSig
);
1718 goto normalizeRoundAndPack
;
1720 if ( aExp
== 0xFF ) {
1721 if ( aSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1730 shift32RightJamming( bSig
, expDiff
, &bSig
);
1735 normalizeRoundAndPack
:
1737 return normalizeRoundAndPackFloat32( zSign
, zExp
, zSig STATUS_VAR
);
1741 /*----------------------------------------------------------------------------
1742 | Returns the result of adding the single-precision floating-point values `a'
1743 | and `b'. The operation is performed according to the IEC/IEEE Standard for
1744 | Binary Floating-Point Arithmetic.
1745 *----------------------------------------------------------------------------*/
1747 float32
float32_add( float32 a
, float32 b STATUS_PARAM
)
1751 aSign
= extractFloat32Sign( a
);
1752 bSign
= extractFloat32Sign( b
);
1753 if ( aSign
== bSign
) {
1754 return addFloat32Sigs( a
, b
, aSign STATUS_VAR
);
1757 return subFloat32Sigs( a
, b
, aSign STATUS_VAR
);
1762 /*----------------------------------------------------------------------------
1763 | Returns the result of subtracting the single-precision floating-point values
1764 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1765 | for Binary Floating-Point Arithmetic.
1766 *----------------------------------------------------------------------------*/
1768 float32
float32_sub( float32 a
, float32 b STATUS_PARAM
)
1772 aSign
= extractFloat32Sign( a
);
1773 bSign
= extractFloat32Sign( b
);
1774 if ( aSign
== bSign
) {
1775 return subFloat32Sigs( a
, b
, aSign STATUS_VAR
);
1778 return addFloat32Sigs( a
, b
, aSign STATUS_VAR
);
1783 /*----------------------------------------------------------------------------
1784 | Returns the result of multiplying the single-precision floating-point values
1785 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1786 | for Binary Floating-Point Arithmetic.
1787 *----------------------------------------------------------------------------*/
1789 float32
float32_mul( float32 a
, float32 b STATUS_PARAM
)
1791 flag aSign
, bSign
, zSign
;
1792 int16 aExp
, bExp
, zExp
;
1797 aSig
= extractFloat32Frac( a
);
1798 aExp
= extractFloat32Exp( a
);
1799 aSign
= extractFloat32Sign( a
);
1800 bSig
= extractFloat32Frac( b
);
1801 bExp
= extractFloat32Exp( b
);
1802 bSign
= extractFloat32Sign( b
);
1803 zSign
= aSign
^ bSign
;
1804 if ( aExp
== 0xFF ) {
1805 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
1806 return propagateFloat32NaN( a
, b STATUS_VAR
);
1808 if ( ( bExp
| bSig
) == 0 ) {
1809 float_raise( float_flag_invalid STATUS_VAR
);
1810 return float32_default_nan
;
1812 return packFloat32( zSign
, 0xFF, 0 );
1814 if ( bExp
== 0xFF ) {
1815 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1816 if ( ( aExp
| aSig
) == 0 ) {
1817 float_raise( float_flag_invalid STATUS_VAR
);
1818 return float32_default_nan
;
1820 return packFloat32( zSign
, 0xFF, 0 );
1823 if ( aSig
== 0 ) return packFloat32( zSign
, 0, 0 );
1824 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1827 if ( bSig
== 0 ) return packFloat32( zSign
, 0, 0 );
1828 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
1830 zExp
= aExp
+ bExp
- 0x7F;
1831 aSig
= ( aSig
| 0x00800000 )<<7;
1832 bSig
= ( bSig
| 0x00800000 )<<8;
1833 shift64RightJamming( ( (bits64
) aSig
) * bSig
, 32, &zSig64
);
1835 if ( 0 <= (sbits32
) ( zSig
<<1 ) ) {
1839 return roundAndPackFloat32( zSign
, zExp
, zSig STATUS_VAR
);
1843 /*----------------------------------------------------------------------------
1844 | Returns the result of dividing the single-precision floating-point value `a'
1845 | by the corresponding value `b'. The operation is performed according to the
1846 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1847 *----------------------------------------------------------------------------*/
1849 float32
float32_div( float32 a
, float32 b STATUS_PARAM
)
1851 flag aSign
, bSign
, zSign
;
1852 int16 aExp
, bExp
, zExp
;
1853 bits32 aSig
, bSig
, zSig
;
1855 aSig
= extractFloat32Frac( a
);
1856 aExp
= extractFloat32Exp( a
);
1857 aSign
= extractFloat32Sign( a
);
1858 bSig
= extractFloat32Frac( b
);
1859 bExp
= extractFloat32Exp( b
);
1860 bSign
= extractFloat32Sign( b
);
1861 zSign
= aSign
^ bSign
;
1862 if ( aExp
== 0xFF ) {
1863 if ( aSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1864 if ( bExp
== 0xFF ) {
1865 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1866 float_raise( float_flag_invalid STATUS_VAR
);
1867 return float32_default_nan
;
1869 return packFloat32( zSign
, 0xFF, 0 );
1871 if ( bExp
== 0xFF ) {
1872 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1873 return packFloat32( zSign
, 0, 0 );
1877 if ( ( aExp
| aSig
) == 0 ) {
1878 float_raise( float_flag_invalid STATUS_VAR
);
1879 return float32_default_nan
;
1881 float_raise( float_flag_divbyzero STATUS_VAR
);
1882 return packFloat32( zSign
, 0xFF, 0 );
1884 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
1887 if ( aSig
== 0 ) return packFloat32( zSign
, 0, 0 );
1888 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1890 zExp
= aExp
- bExp
+ 0x7D;
1891 aSig
= ( aSig
| 0x00800000 )<<7;
1892 bSig
= ( bSig
| 0x00800000 )<<8;
1893 if ( bSig
<= ( aSig
+ aSig
) ) {
1897 zSig
= ( ( (bits64
) aSig
)<<32 ) / bSig
;
1898 if ( ( zSig
& 0x3F ) == 0 ) {
1899 zSig
|= ( (bits64
) bSig
* zSig
!= ( (bits64
) aSig
)<<32 );
1901 return roundAndPackFloat32( zSign
, zExp
, zSig STATUS_VAR
);
1905 /*----------------------------------------------------------------------------
1906 | Returns the remainder of the single-precision floating-point value `a'
1907 | with respect to the corresponding value `b'. The operation is performed
1908 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1909 *----------------------------------------------------------------------------*/
1911 float32
float32_rem( float32 a
, float32 b STATUS_PARAM
)
1913 flag aSign
, bSign
, zSign
;
1914 int16 aExp
, bExp
, expDiff
;
1917 bits64 aSig64
, bSig64
, q64
;
1918 bits32 alternateASig
;
1921 aSig
= extractFloat32Frac( a
);
1922 aExp
= extractFloat32Exp( a
);
1923 aSign
= extractFloat32Sign( a
);
1924 bSig
= extractFloat32Frac( b
);
1925 bExp
= extractFloat32Exp( b
);
1926 bSign
= extractFloat32Sign( b
);
1927 if ( aExp
== 0xFF ) {
1928 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
1929 return propagateFloat32NaN( a
, b STATUS_VAR
);
1931 float_raise( float_flag_invalid STATUS_VAR
);
1932 return float32_default_nan
;
1934 if ( bExp
== 0xFF ) {
1935 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1940 float_raise( float_flag_invalid STATUS_VAR
);
1941 return float32_default_nan
;
1943 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
1946 if ( aSig
== 0 ) return a
;
1947 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1949 expDiff
= aExp
- bExp
;
1952 if ( expDiff
< 32 ) {
1955 if ( expDiff
< 0 ) {
1956 if ( expDiff
< -1 ) return a
;
1959 q
= ( bSig
<= aSig
);
1960 if ( q
) aSig
-= bSig
;
1961 if ( 0 < expDiff
) {
1962 q
= ( ( (bits64
) aSig
)<<32 ) / bSig
;
1965 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
1973 if ( bSig
<= aSig
) aSig
-= bSig
;
1974 aSig64
= ( (bits64
) aSig
)<<40;
1975 bSig64
= ( (bits64
) bSig
)<<40;
1977 while ( 0 < expDiff
) {
1978 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
1979 q64
= ( 2 < q64
) ? q64
- 2 : 0;
1980 aSig64
= - ( ( bSig
* q64
)<<38 );
1984 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
1985 q64
= ( 2 < q64
) ? q64
- 2 : 0;
1986 q
= q64
>>( 64 - expDiff
);
1988 aSig
= ( ( aSig64
>>33 )<<( expDiff
- 1 ) ) - bSig
* q
;
1991 alternateASig
= aSig
;
1994 } while ( 0 <= (sbits32
) aSig
);
1995 sigMean
= aSig
+ alternateASig
;
1996 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
1997 aSig
= alternateASig
;
1999 zSign
= ( (sbits32
) aSig
< 0 );
2000 if ( zSign
) aSig
= - aSig
;
2001 return normalizeRoundAndPackFloat32( aSign
^ zSign
, bExp
, aSig STATUS_VAR
);
2005 /*----------------------------------------------------------------------------
2006 | Returns the square root of the single-precision floating-point value `a'.
2007 | The operation is performed according to the IEC/IEEE Standard for Binary
2008 | Floating-Point Arithmetic.
2009 *----------------------------------------------------------------------------*/
2011 float32
float32_sqrt( float32 a STATUS_PARAM
)
2018 aSig
= extractFloat32Frac( a
);
2019 aExp
= extractFloat32Exp( a
);
2020 aSign
= extractFloat32Sign( a
);
2021 if ( aExp
== 0xFF ) {
2022 if ( aSig
) return propagateFloat32NaN( a
, float32_zero STATUS_VAR
);
2023 if ( ! aSign
) return a
;
2024 float_raise( float_flag_invalid STATUS_VAR
);
2025 return float32_default_nan
;
2028 if ( ( aExp
| aSig
) == 0 ) return a
;
2029 float_raise( float_flag_invalid STATUS_VAR
);
2030 return float32_default_nan
;
2033 if ( aSig
== 0 ) return float32_zero
;
2034 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
2036 zExp
= ( ( aExp
- 0x7F )>>1 ) + 0x7E;
2037 aSig
= ( aSig
| 0x00800000 )<<8;
2038 zSig
= estimateSqrt32( aExp
, aSig
) + 2;
2039 if ( ( zSig
& 0x7F ) <= 5 ) {
2045 term
= ( (bits64
) zSig
) * zSig
;
2046 rem
= ( ( (bits64
) aSig
)<<32 ) - term
;
2047 while ( (sbits64
) rem
< 0 ) {
2049 rem
+= ( ( (bits64
) zSig
)<<1 ) | 1;
2051 zSig
|= ( rem
!= 0 );
2053 shift32RightJamming( zSig
, 1, &zSig
);
2055 return roundAndPackFloat32( 0, zExp
, zSig STATUS_VAR
);
2059 /*----------------------------------------------------------------------------
2060 | Returns the binary log of the single-precision floating-point value `a'.
2061 | The operation is performed according to the IEC/IEEE Standard for Binary
2062 | Floating-Point Arithmetic.
2063 *----------------------------------------------------------------------------*/
2064 float32
float32_log2( float32 a STATUS_PARAM
)
2068 bits32 aSig
, zSig
, i
;
2070 aSig
= extractFloat32Frac( a
);
2071 aExp
= extractFloat32Exp( a
);
2072 aSign
= extractFloat32Sign( a
);
2075 if ( aSig
== 0 ) return packFloat32( 1, 0xFF, 0 );
2076 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
2079 float_raise( float_flag_invalid STATUS_VAR
);
2080 return float32_default_nan
;
2082 if ( aExp
== 0xFF ) {
2083 if ( aSig
) return propagateFloat32NaN( a
, float32_zero STATUS_VAR
);
2092 for (i
= 1 << 22; i
> 0; i
>>= 1) {
2093 aSig
= ( (bits64
)aSig
* aSig
) >> 23;
2094 if ( aSig
& 0x01000000 ) {
2103 return normalizeRoundAndPackFloat32( zSign
, 0x85, zSig STATUS_VAR
);
2106 /*----------------------------------------------------------------------------
2107 | Returns 1 if the single-precision floating-point value `a' is equal to
2108 | the corresponding value `b', and 0 otherwise. The comparison is performed
2109 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2110 *----------------------------------------------------------------------------*/
2112 int float32_eq( float32 a
, float32 b STATUS_PARAM
)
2115 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2116 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2118 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
2119 float_raise( float_flag_invalid STATUS_VAR
);
2123 return ( float32_val(a
) == float32_val(b
) ) ||
2124 ( (bits32
) ( ( float32_val(a
) | float32_val(b
) )<<1 ) == 0 );
2128 /*----------------------------------------------------------------------------
2129 | Returns 1 if the single-precision floating-point value `a' is less than
2130 | or equal to the corresponding value `b', and 0 otherwise. The comparison
2131 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2133 *----------------------------------------------------------------------------*/
2135 int float32_le( float32 a
, float32 b STATUS_PARAM
)
2140 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2141 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2143 float_raise( float_flag_invalid STATUS_VAR
);
2146 aSign
= extractFloat32Sign( a
);
2147 bSign
= extractFloat32Sign( b
);
2148 av
= float32_val(a
);
2149 bv
= float32_val(b
);
2150 if ( aSign
!= bSign
) return aSign
|| ( (bits32
) ( ( av
| bv
)<<1 ) == 0 );
2151 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
2155 /*----------------------------------------------------------------------------
2156 | Returns 1 if the single-precision floating-point value `a' is less than
2157 | the corresponding value `b', and 0 otherwise. The comparison is performed
2158 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2159 *----------------------------------------------------------------------------*/
2161 int float32_lt( float32 a
, float32 b STATUS_PARAM
)
2166 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2167 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2169 float_raise( float_flag_invalid STATUS_VAR
);
2172 aSign
= extractFloat32Sign( a
);
2173 bSign
= extractFloat32Sign( b
);
2174 av
= float32_val(a
);
2175 bv
= float32_val(b
);
2176 if ( aSign
!= bSign
) return aSign
&& ( (bits32
) ( ( av
| bv
)<<1 ) != 0 );
2177 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
2181 /*----------------------------------------------------------------------------
2182 | Returns 1 if the single-precision floating-point value `a' is equal to
2183 | the corresponding value `b', and 0 otherwise. The invalid exception is
2184 | raised if either operand is a NaN. Otherwise, the comparison is performed
2185 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2186 *----------------------------------------------------------------------------*/
2188 int float32_eq_signaling( float32 a
, float32 b STATUS_PARAM
)
2192 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2193 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2195 float_raise( float_flag_invalid STATUS_VAR
);
2198 av
= float32_val(a
);
2199 bv
= float32_val(b
);
2200 return ( av
== bv
) || ( (bits32
) ( ( av
| bv
)<<1 ) == 0 );
2204 /*----------------------------------------------------------------------------
2205 | Returns 1 if the single-precision floating-point value `a' is less than or
2206 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2207 | cause an exception. Otherwise, the comparison is performed according to the
2208 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2209 *----------------------------------------------------------------------------*/
2211 int float32_le_quiet( float32 a
, float32 b STATUS_PARAM
)
2216 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2217 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2219 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
2220 float_raise( float_flag_invalid STATUS_VAR
);
2224 aSign
= extractFloat32Sign( a
);
2225 bSign
= extractFloat32Sign( b
);
2226 av
= float32_val(a
);
2227 bv
= float32_val(b
);
2228 if ( aSign
!= bSign
) return aSign
|| ( (bits32
) ( ( av
| bv
)<<1 ) == 0 );
2229 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
2233 /*----------------------------------------------------------------------------
2234 | Returns 1 if the single-precision floating-point value `a' is less than
2235 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2236 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
2237 | Standard for Binary Floating-Point Arithmetic.
2238 *----------------------------------------------------------------------------*/
2240 int float32_lt_quiet( float32 a
, float32 b STATUS_PARAM
)
2245 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2246 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2248 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
2249 float_raise( float_flag_invalid STATUS_VAR
);
2253 aSign
= extractFloat32Sign( a
);
2254 bSign
= extractFloat32Sign( b
);
2255 av
= float32_val(a
);
2256 bv
= float32_val(b
);
2257 if ( aSign
!= bSign
) return aSign
&& ( (bits32
) ( ( av
| bv
)<<1 ) != 0 );
2258 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
2262 /*----------------------------------------------------------------------------
2263 | Returns the result of converting the double-precision floating-point value
2264 | `a' to the 32-bit two's complement integer format. The conversion is
2265 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2266 | Arithmetic---which means in particular that the conversion is rounded
2267 | according to the current rounding mode. If `a' is a NaN, the largest
2268 | positive integer is returned. Otherwise, if the conversion overflows, the
2269 | largest integer with the same sign as `a' is returned.
2270 *----------------------------------------------------------------------------*/
2272 int32
float64_to_int32( float64 a STATUS_PARAM
)
2275 int16 aExp
, shiftCount
;
2278 aSig
= extractFloat64Frac( a
);
2279 aExp
= extractFloat64Exp( a
);
2280 aSign
= extractFloat64Sign( a
);
2281 if ( ( aExp
== 0x7FF ) && aSig
) aSign
= 0;
2282 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
2283 shiftCount
= 0x42C - aExp
;
2284 if ( 0 < shiftCount
) shift64RightJamming( aSig
, shiftCount
, &aSig
);
2285 return roundAndPackInt32( aSign
, aSig STATUS_VAR
);
2289 /*----------------------------------------------------------------------------
2290 | Returns the result of converting the double-precision floating-point value
2291 | `a' to the 32-bit two's complement integer format. The conversion is
2292 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2293 | Arithmetic, except that the conversion is always rounded toward zero.
2294 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2295 | the conversion overflows, the largest integer with the same sign as `a' is
2297 *----------------------------------------------------------------------------*/
2299 int32
float64_to_int32_round_to_zero( float64 a STATUS_PARAM
)
2302 int16 aExp
, shiftCount
;
2303 bits64 aSig
, savedASig
;
2306 aSig
= extractFloat64Frac( a
);
2307 aExp
= extractFloat64Exp( a
);
2308 aSign
= extractFloat64Sign( a
);
2309 if ( 0x41E < aExp
) {
2310 if ( ( aExp
== 0x7FF ) && aSig
) aSign
= 0;
2313 else if ( aExp
< 0x3FF ) {
2314 if ( aExp
|| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
2317 aSig
|= LIT64( 0x0010000000000000 );
2318 shiftCount
= 0x433 - aExp
;
2320 aSig
>>= shiftCount
;
2322 if ( aSign
) z
= - z
;
2323 if ( ( z
< 0 ) ^ aSign
) {
2325 float_raise( float_flag_invalid STATUS_VAR
);
2326 return aSign
? (sbits32
) 0x80000000 : 0x7FFFFFFF;
2328 if ( ( aSig
<<shiftCount
) != savedASig
) {
2329 STATUS(float_exception_flags
) |= float_flag_inexact
;
2335 /*----------------------------------------------------------------------------
2336 | Returns the result of converting the double-precision floating-point value
2337 | `a' to the 64-bit two's complement integer format. The conversion is
2338 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2339 | Arithmetic---which means in particular that the conversion is rounded
2340 | according to the current rounding mode. If `a' is a NaN, the largest
2341 | positive integer is returned. Otherwise, if the conversion overflows, the
2342 | largest integer with the same sign as `a' is returned.
2343 *----------------------------------------------------------------------------*/
2345 int64
float64_to_int64( float64 a STATUS_PARAM
)
2348 int16 aExp
, shiftCount
;
2349 bits64 aSig
, aSigExtra
;
2351 aSig
= extractFloat64Frac( a
);
2352 aExp
= extractFloat64Exp( a
);
2353 aSign
= extractFloat64Sign( a
);
2354 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
2355 shiftCount
= 0x433 - aExp
;
2356 if ( shiftCount
<= 0 ) {
2357 if ( 0x43E < aExp
) {
2358 float_raise( float_flag_invalid STATUS_VAR
);
2360 || ( ( aExp
== 0x7FF )
2361 && ( aSig
!= LIT64( 0x0010000000000000 ) ) )
2363 return LIT64( 0x7FFFFFFFFFFFFFFF );
2365 return (sbits64
) LIT64( 0x8000000000000000 );
2368 aSig
<<= - shiftCount
;
2371 shift64ExtraRightJamming( aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
2373 return roundAndPackInt64( aSign
, aSig
, aSigExtra STATUS_VAR
);
2377 /*----------------------------------------------------------------------------
2378 | Returns the result of converting the double-precision floating-point value
2379 | `a' to the 64-bit two's complement integer format. The conversion is
2380 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2381 | Arithmetic, except that the conversion is always rounded toward zero.
2382 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2383 | the conversion overflows, the largest integer with the same sign as `a' is
2385 *----------------------------------------------------------------------------*/
2387 int64
float64_to_int64_round_to_zero( float64 a STATUS_PARAM
)
2390 int16 aExp
, shiftCount
;
2394 aSig
= extractFloat64Frac( a
);
2395 aExp
= extractFloat64Exp( a
);
2396 aSign
= extractFloat64Sign( a
);
2397 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
2398 shiftCount
= aExp
- 0x433;
2399 if ( 0 <= shiftCount
) {
2400 if ( 0x43E <= aExp
) {
2401 if ( float64_val(a
) != LIT64( 0xC3E0000000000000 ) ) {
2402 float_raise( float_flag_invalid STATUS_VAR
);
2404 || ( ( aExp
== 0x7FF )
2405 && ( aSig
!= LIT64( 0x0010000000000000 ) ) )
2407 return LIT64( 0x7FFFFFFFFFFFFFFF );
2410 return (sbits64
) LIT64( 0x8000000000000000 );
2412 z
= aSig
<<shiftCount
;
2415 if ( aExp
< 0x3FE ) {
2416 if ( aExp
| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
2419 z
= aSig
>>( - shiftCount
);
2420 if ( (bits64
) ( aSig
<<( shiftCount
& 63 ) ) ) {
2421 STATUS(float_exception_flags
) |= float_flag_inexact
;
2424 if ( aSign
) z
= - z
;
2429 /*----------------------------------------------------------------------------
2430 | Returns the result of converting the double-precision floating-point value
2431 | `a' to the single-precision floating-point format. The conversion is
2432 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2434 *----------------------------------------------------------------------------*/
2436 float32
float64_to_float32( float64 a STATUS_PARAM
)
2443 aSig
= extractFloat64Frac( a
);
2444 aExp
= extractFloat64Exp( a
);
2445 aSign
= extractFloat64Sign( a
);
2446 if ( aExp
== 0x7FF ) {
2447 if ( aSig
) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR
) );
2448 return packFloat32( aSign
, 0xFF, 0 );
2450 shift64RightJamming( aSig
, 22, &aSig
);
2452 if ( aExp
|| zSig
) {
2456 return roundAndPackFloat32( aSign
, aExp
, zSig STATUS_VAR
);
2461 /*----------------------------------------------------------------------------
2462 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2463 | half-precision floating-point value, returning the result. After being
2464 | shifted into the proper positions, the three fields are simply added
2465 | together to form the result. This means that any integer portion of `zSig'
2466 | will be added into the exponent. Since a properly normalized significand
2467 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2468 | than the desired result exponent whenever `zSig' is a complete, normalized
2470 *----------------------------------------------------------------------------*/
2471 static bits16
packFloat16(flag zSign
, int16 zExp
, bits16 zSig
)
2473 return (((bits32
)zSign
) << 15) + (((bits32
)zExp
) << 10) + zSig
;
2476 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
2477 The latter gains extra exponent range by omitting the NaN/Inf encodings. */
2479 float32
float16_to_float32( bits16 a
, flag ieee STATUS_PARAM
)
2486 aExp
= (a
>> 10) & 0x1f;
2489 if (aExp
== 0x1f && ieee
) {
2491 /* Make sure correct exceptions are raised. */
2492 float32ToCommonNaN(a STATUS_VAR
);
2495 return packFloat32(aSign
, 0xff, aSig
<< 13);
2501 return packFloat32(aSign
, 0, 0);
2504 shiftCount
= countLeadingZeros32( aSig
) - 21;
2505 aSig
= aSig
<< shiftCount
;
2508 return packFloat32( aSign
, aExp
+ 0x70, aSig
<< 13);
2511 bits16
float32_to_float16( float32 a
, flag ieee STATUS_PARAM
)
2520 aSig
= extractFloat32Frac( a
);
2521 aExp
= extractFloat32Exp( a
);
2522 aSign
= extractFloat32Sign( a
);
2523 if ( aExp
== 0xFF ) {
2525 /* Make sure correct exceptions are raised. */
2526 float32ToCommonNaN(a STATUS_VAR
);
2529 return packFloat16(aSign
, 0x1f, aSig
>> 13);
2531 if (aExp
== 0 && aSign
== 0) {
2532 return packFloat16(aSign
, 0, 0);
2534 /* Decimal point between bits 22 and 23. */
2548 float_raise( float_flag_underflow STATUS_VAR
);
2549 roundingMode
= STATUS(float_rounding_mode
);
2550 switch (roundingMode
) {
2551 case float_round_nearest_even
:
2552 increment
= (mask
+ 1) >> 1;
2553 if ((aSig
& mask
) == increment
) {
2554 increment
= aSig
& (increment
<< 1);
2557 case float_round_up
:
2558 increment
= aSign
? 0 : mask
;
2560 case float_round_down
:
2561 increment
= aSign
? mask
: 0;
2563 default: /* round_to_zero */
2568 if (aSig
>= 0x01000000) {
2572 } else if (aExp
< -14
2573 && STATUS(float_detect_tininess
) == float_tininess_before_rounding
) {
2574 float_raise( float_flag_underflow STATUS_VAR
);
2579 float_raise( float_flag_overflow
| float_flag_inexact STATUS_VAR
);
2580 return packFloat16(aSign
, 0x1f, 0);
2584 float_raise( float_flag_overflow
| float_flag_inexact STATUS_VAR
);
2585 return packFloat16(aSign
, 0x1f, 0x3ff);
2589 return packFloat16(aSign
, 0, 0);
2592 aSig
>>= -14 - aExp
;
2595 return packFloat16(aSign
, aExp
+ 14, aSig
>> 13);
2600 /*----------------------------------------------------------------------------
2601 | Returns the result of converting the double-precision floating-point value
2602 | `a' to the extended double-precision floating-point format. The conversion
2603 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2605 *----------------------------------------------------------------------------*/
2607 floatx80
float64_to_floatx80( float64 a STATUS_PARAM
)
2613 aSig
= extractFloat64Frac( a
);
2614 aExp
= extractFloat64Exp( a
);
2615 aSign
= extractFloat64Sign( a
);
2616 if ( aExp
== 0x7FF ) {
2617 if ( aSig
) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR
) );
2618 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
2621 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
2622 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2626 aSign
, aExp
+ 0x3C00, ( aSig
| LIT64( 0x0010000000000000 ) )<<11 );
2634 /*----------------------------------------------------------------------------
2635 | Returns the result of converting the double-precision floating-point value
2636 | `a' to the quadruple-precision floating-point format. The conversion is
2637 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2639 *----------------------------------------------------------------------------*/
2641 float128
float64_to_float128( float64 a STATUS_PARAM
)
2645 bits64 aSig
, zSig0
, zSig1
;
2647 aSig
= extractFloat64Frac( a
);
2648 aExp
= extractFloat64Exp( a
);
2649 aSign
= extractFloat64Sign( a
);
2650 if ( aExp
== 0x7FF ) {
2651 if ( aSig
) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR
) );
2652 return packFloat128( aSign
, 0x7FFF, 0, 0 );
2655 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
2656 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2659 shift128Right( aSig
, 0, 4, &zSig0
, &zSig1
);
2660 return packFloat128( aSign
, aExp
+ 0x3C00, zSig0
, zSig1
);
2666 /*----------------------------------------------------------------------------
2667 | Rounds the double-precision floating-point value `a' to an integer, and
2668 | returns the result as a double-precision floating-point value. The
2669 | operation is performed according to the IEC/IEEE Standard for Binary
2670 | Floating-Point Arithmetic.
2671 *----------------------------------------------------------------------------*/
2673 float64
float64_round_to_int( float64 a STATUS_PARAM
)
2677 bits64 lastBitMask
, roundBitsMask
;
2681 aExp
= extractFloat64Exp( a
);
2682 if ( 0x433 <= aExp
) {
2683 if ( ( aExp
== 0x7FF ) && extractFloat64Frac( a
) ) {
2684 return propagateFloat64NaN( a
, a STATUS_VAR
);
2688 if ( aExp
< 0x3FF ) {
2689 if ( (bits64
) ( float64_val(a
)<<1 ) == 0 ) return a
;
2690 STATUS(float_exception_flags
) |= float_flag_inexact
;
2691 aSign
= extractFloat64Sign( a
);
2692 switch ( STATUS(float_rounding_mode
) ) {
2693 case float_round_nearest_even
:
2694 if ( ( aExp
== 0x3FE ) && extractFloat64Frac( a
) ) {
2695 return packFloat64( aSign
, 0x3FF, 0 );
2698 case float_round_down
:
2699 return make_float64(aSign
? LIT64( 0xBFF0000000000000 ) : 0);
2700 case float_round_up
:
2701 return make_float64(
2702 aSign
? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
2704 return packFloat64( aSign
, 0, 0 );
2707 lastBitMask
<<= 0x433 - aExp
;
2708 roundBitsMask
= lastBitMask
- 1;
2710 roundingMode
= STATUS(float_rounding_mode
);
2711 if ( roundingMode
== float_round_nearest_even
) {
2712 z
+= lastBitMask
>>1;
2713 if ( ( z
& roundBitsMask
) == 0 ) z
&= ~ lastBitMask
;
2715 else if ( roundingMode
!= float_round_to_zero
) {
2716 if ( extractFloat64Sign( make_float64(z
) ) ^ ( roundingMode
== float_round_up
) ) {
2720 z
&= ~ roundBitsMask
;
2721 if ( z
!= float64_val(a
) )
2722 STATUS(float_exception_flags
) |= float_flag_inexact
;
2723 return make_float64(z
);
2727 float64
float64_trunc_to_int( float64 a STATUS_PARAM
)
2731 oldmode
= STATUS(float_rounding_mode
);
2732 STATUS(float_rounding_mode
) = float_round_to_zero
;
2733 res
= float64_round_to_int(a STATUS_VAR
);
2734 STATUS(float_rounding_mode
) = oldmode
;
2738 /*----------------------------------------------------------------------------
2739 | Returns the result of adding the absolute values of the double-precision
2740 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
2741 | before being returned. `zSign' is ignored if the result is a NaN.
2742 | The addition is performed according to the IEC/IEEE Standard for Binary
2743 | Floating-Point Arithmetic.
2744 *----------------------------------------------------------------------------*/
2746 static float64
addFloat64Sigs( float64 a
, float64 b
, flag zSign STATUS_PARAM
)
2748 int16 aExp
, bExp
, zExp
;
2749 bits64 aSig
, bSig
, zSig
;
2752 aSig
= extractFloat64Frac( a
);
2753 aExp
= extractFloat64Exp( a
);
2754 bSig
= extractFloat64Frac( b
);
2755 bExp
= extractFloat64Exp( b
);
2756 expDiff
= aExp
- bExp
;
2759 if ( 0 < expDiff
) {
2760 if ( aExp
== 0x7FF ) {
2761 if ( aSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
2768 bSig
|= LIT64( 0x2000000000000000 );
2770 shift64RightJamming( bSig
, expDiff
, &bSig
);
2773 else if ( expDiff
< 0 ) {
2774 if ( bExp
== 0x7FF ) {
2775 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
2776 return packFloat64( zSign
, 0x7FF, 0 );
2782 aSig
|= LIT64( 0x2000000000000000 );
2784 shift64RightJamming( aSig
, - expDiff
, &aSig
);
2788 if ( aExp
== 0x7FF ) {
2789 if ( aSig
| bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
2793 if ( STATUS(flush_to_zero
) ) return packFloat64( zSign
, 0, 0 );
2794 return packFloat64( zSign
, 0, ( aSig
+ bSig
)>>9 );
2796 zSig
= LIT64( 0x4000000000000000 ) + aSig
+ bSig
;
2800 aSig
|= LIT64( 0x2000000000000000 );
2801 zSig
= ( aSig
+ bSig
)<<1;
2803 if ( (sbits64
) zSig
< 0 ) {
2808 return roundAndPackFloat64( zSign
, zExp
, zSig STATUS_VAR
);
2812 /*----------------------------------------------------------------------------
2813 | Returns the result of subtracting the absolute values of the double-
2814 | precision floating-point values `a' and `b'. If `zSign' is 1, the
2815 | difference is negated before being returned. `zSign' is ignored if the
2816 | result is a NaN. The subtraction is performed according to the IEC/IEEE
2817 | Standard for Binary Floating-Point Arithmetic.
2818 *----------------------------------------------------------------------------*/
2820 static float64
subFloat64Sigs( float64 a
, float64 b
, flag zSign STATUS_PARAM
)
2822 int16 aExp
, bExp
, zExp
;
2823 bits64 aSig
, bSig
, zSig
;
2826 aSig
= extractFloat64Frac( a
);
2827 aExp
= extractFloat64Exp( a
);
2828 bSig
= extractFloat64Frac( b
);
2829 bExp
= extractFloat64Exp( b
);
2830 expDiff
= aExp
- bExp
;
2833 if ( 0 < expDiff
) goto aExpBigger
;
2834 if ( expDiff
< 0 ) goto bExpBigger
;
2835 if ( aExp
== 0x7FF ) {
2836 if ( aSig
| bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
2837 float_raise( float_flag_invalid STATUS_VAR
);
2838 return float64_default_nan
;
2844 if ( bSig
< aSig
) goto aBigger
;
2845 if ( aSig
< bSig
) goto bBigger
;
2846 return packFloat64( STATUS(float_rounding_mode
) == float_round_down
, 0, 0 );
2848 if ( bExp
== 0x7FF ) {
2849 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
2850 return packFloat64( zSign
^ 1, 0x7FF, 0 );
2856 aSig
|= LIT64( 0x4000000000000000 );
2858 shift64RightJamming( aSig
, - expDiff
, &aSig
);
2859 bSig
|= LIT64( 0x4000000000000000 );
2864 goto normalizeRoundAndPack
;
2866 if ( aExp
== 0x7FF ) {
2867 if ( aSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
2874 bSig
|= LIT64( 0x4000000000000000 );
2876 shift64RightJamming( bSig
, expDiff
, &bSig
);
2877 aSig
|= LIT64( 0x4000000000000000 );
2881 normalizeRoundAndPack
:
2883 return normalizeRoundAndPackFloat64( zSign
, zExp
, zSig STATUS_VAR
);
2887 /*----------------------------------------------------------------------------
2888 | Returns the result of adding the double-precision floating-point values `a'
2889 | and `b'. The operation is performed according to the IEC/IEEE Standard for
2890 | Binary Floating-Point Arithmetic.
2891 *----------------------------------------------------------------------------*/
2893 float64
float64_add( float64 a
, float64 b STATUS_PARAM
)
2897 aSign
= extractFloat64Sign( a
);
2898 bSign
= extractFloat64Sign( b
);
2899 if ( aSign
== bSign
) {
2900 return addFloat64Sigs( a
, b
, aSign STATUS_VAR
);
2903 return subFloat64Sigs( a
, b
, aSign STATUS_VAR
);
2908 /*----------------------------------------------------------------------------
2909 | Returns the result of subtracting the double-precision floating-point values
2910 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2911 | for Binary Floating-Point Arithmetic.
2912 *----------------------------------------------------------------------------*/
2914 float64
float64_sub( float64 a
, float64 b STATUS_PARAM
)
2918 aSign
= extractFloat64Sign( a
);
2919 bSign
= extractFloat64Sign( b
);
2920 if ( aSign
== bSign
) {
2921 return subFloat64Sigs( a
, b
, aSign STATUS_VAR
);
2924 return addFloat64Sigs( a
, b
, aSign STATUS_VAR
);
2929 /*----------------------------------------------------------------------------
2930 | Returns the result of multiplying the double-precision floating-point values
2931 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2932 | for Binary Floating-Point Arithmetic.
2933 *----------------------------------------------------------------------------*/
2935 float64
float64_mul( float64 a
, float64 b STATUS_PARAM
)
2937 flag aSign
, bSign
, zSign
;
2938 int16 aExp
, bExp
, zExp
;
2939 bits64 aSig
, bSig
, zSig0
, zSig1
;
2941 aSig
= extractFloat64Frac( a
);
2942 aExp
= extractFloat64Exp( a
);
2943 aSign
= extractFloat64Sign( a
);
2944 bSig
= extractFloat64Frac( b
);
2945 bExp
= extractFloat64Exp( b
);
2946 bSign
= extractFloat64Sign( b
);
2947 zSign
= aSign
^ bSign
;
2948 if ( aExp
== 0x7FF ) {
2949 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
2950 return propagateFloat64NaN( a
, b STATUS_VAR
);
2952 if ( ( bExp
| bSig
) == 0 ) {
2953 float_raise( float_flag_invalid STATUS_VAR
);
2954 return float64_default_nan
;
2956 return packFloat64( zSign
, 0x7FF, 0 );
2958 if ( bExp
== 0x7FF ) {
2959 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
2960 if ( ( aExp
| aSig
) == 0 ) {
2961 float_raise( float_flag_invalid STATUS_VAR
);
2962 return float64_default_nan
;
2964 return packFloat64( zSign
, 0x7FF, 0 );
2967 if ( aSig
== 0 ) return packFloat64( zSign
, 0, 0 );
2968 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2971 if ( bSig
== 0 ) return packFloat64( zSign
, 0, 0 );
2972 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
2974 zExp
= aExp
+ bExp
- 0x3FF;
2975 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<10;
2976 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
2977 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
2978 zSig0
|= ( zSig1
!= 0 );
2979 if ( 0 <= (sbits64
) ( zSig0
<<1 ) ) {
2983 return roundAndPackFloat64( zSign
, zExp
, zSig0 STATUS_VAR
);
2987 /*----------------------------------------------------------------------------
2988 | Returns the result of dividing the double-precision floating-point value `a'
2989 | by the corresponding value `b'. The operation is performed according to
2990 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2991 *----------------------------------------------------------------------------*/
2993 float64
float64_div( float64 a
, float64 b STATUS_PARAM
)
2995 flag aSign
, bSign
, zSign
;
2996 int16 aExp
, bExp
, zExp
;
2997 bits64 aSig
, bSig
, zSig
;
2999 bits64 term0
, term1
;
3001 aSig
= extractFloat64Frac( a
);
3002 aExp
= extractFloat64Exp( a
);
3003 aSign
= extractFloat64Sign( a
);
3004 bSig
= extractFloat64Frac( b
);
3005 bExp
= extractFloat64Exp( b
);
3006 bSign
= extractFloat64Sign( b
);
3007 zSign
= aSign
^ bSign
;
3008 if ( aExp
== 0x7FF ) {
3009 if ( aSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3010 if ( bExp
== 0x7FF ) {
3011 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3012 float_raise( float_flag_invalid STATUS_VAR
);
3013 return float64_default_nan
;
3015 return packFloat64( zSign
, 0x7FF, 0 );
3017 if ( bExp
== 0x7FF ) {
3018 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3019 return packFloat64( zSign
, 0, 0 );
3023 if ( ( aExp
| aSig
) == 0 ) {
3024 float_raise( float_flag_invalid STATUS_VAR
);
3025 return float64_default_nan
;
3027 float_raise( float_flag_divbyzero STATUS_VAR
);
3028 return packFloat64( zSign
, 0x7FF, 0 );
3030 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
3033 if ( aSig
== 0 ) return packFloat64( zSign
, 0, 0 );
3034 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
3036 zExp
= aExp
- bExp
+ 0x3FD;
3037 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<10;
3038 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
3039 if ( bSig
<= ( aSig
+ aSig
) ) {
3043 zSig
= estimateDiv128To64( aSig
, 0, bSig
);
3044 if ( ( zSig
& 0x1FF ) <= 2 ) {
3045 mul64To128( bSig
, zSig
, &term0
, &term1
);
3046 sub128( aSig
, 0, term0
, term1
, &rem0
, &rem1
);
3047 while ( (sbits64
) rem0
< 0 ) {
3049 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
3051 zSig
|= ( rem1
!= 0 );
3053 return roundAndPackFloat64( zSign
, zExp
, zSig STATUS_VAR
);
3057 /*----------------------------------------------------------------------------
3058 | Returns the remainder of the double-precision floating-point value `a'
3059 | with respect to the corresponding value `b'. The operation is performed
3060 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3061 *----------------------------------------------------------------------------*/
3063 float64
float64_rem( float64 a
, float64 b STATUS_PARAM
)
3065 flag aSign
, bSign
, zSign
;
3066 int16 aExp
, bExp
, expDiff
;
3068 bits64 q
, alternateASig
;
3071 aSig
= extractFloat64Frac( a
);
3072 aExp
= extractFloat64Exp( a
);
3073 aSign
= extractFloat64Sign( a
);
3074 bSig
= extractFloat64Frac( b
);
3075 bExp
= extractFloat64Exp( b
);
3076 bSign
= extractFloat64Sign( b
);
3077 if ( aExp
== 0x7FF ) {
3078 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
3079 return propagateFloat64NaN( a
, b STATUS_VAR
);
3081 float_raise( float_flag_invalid STATUS_VAR
);
3082 return float64_default_nan
;
3084 if ( bExp
== 0x7FF ) {
3085 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3090 float_raise( float_flag_invalid STATUS_VAR
);
3091 return float64_default_nan
;
3093 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
3096 if ( aSig
== 0 ) return a
;
3097 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
3099 expDiff
= aExp
- bExp
;
3100 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<11;
3101 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
3102 if ( expDiff
< 0 ) {
3103 if ( expDiff
< -1 ) return a
;
3106 q
= ( bSig
<= aSig
);
3107 if ( q
) aSig
-= bSig
;
3109 while ( 0 < expDiff
) {
3110 q
= estimateDiv128To64( aSig
, 0, bSig
);
3111 q
= ( 2 < q
) ? q
- 2 : 0;
3112 aSig
= - ( ( bSig
>>2 ) * q
);
3116 if ( 0 < expDiff
) {
3117 q
= estimateDiv128To64( aSig
, 0, bSig
);
3118 q
= ( 2 < q
) ? q
- 2 : 0;
3121 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
3128 alternateASig
= aSig
;
3131 } while ( 0 <= (sbits64
) aSig
);
3132 sigMean
= aSig
+ alternateASig
;
3133 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
3134 aSig
= alternateASig
;
3136 zSign
= ( (sbits64
) aSig
< 0 );
3137 if ( zSign
) aSig
= - aSig
;
3138 return normalizeRoundAndPackFloat64( aSign
^ zSign
, bExp
, aSig STATUS_VAR
);
3142 /*----------------------------------------------------------------------------
3143 | Returns the square root of the double-precision floating-point value `a'.
3144 | The operation is performed according to the IEC/IEEE Standard for Binary
3145 | Floating-Point Arithmetic.
3146 *----------------------------------------------------------------------------*/
3148 float64
float64_sqrt( float64 a STATUS_PARAM
)
3152 bits64 aSig
, zSig
, doubleZSig
;
3153 bits64 rem0
, rem1
, term0
, term1
;
3155 aSig
= extractFloat64Frac( a
);
3156 aExp
= extractFloat64Exp( a
);
3157 aSign
= extractFloat64Sign( a
);
3158 if ( aExp
== 0x7FF ) {
3159 if ( aSig
) return propagateFloat64NaN( a
, a STATUS_VAR
);
3160 if ( ! aSign
) return a
;
3161 float_raise( float_flag_invalid STATUS_VAR
);
3162 return float64_default_nan
;
3165 if ( ( aExp
| aSig
) == 0 ) return a
;
3166 float_raise( float_flag_invalid STATUS_VAR
);
3167 return float64_default_nan
;
3170 if ( aSig
== 0 ) return float64_zero
;
3171 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
3173 zExp
= ( ( aExp
- 0x3FF )>>1 ) + 0x3FE;
3174 aSig
|= LIT64( 0x0010000000000000 );
3175 zSig
= estimateSqrt32( aExp
, aSig
>>21 );
3176 aSig
<<= 9 - ( aExp
& 1 );
3177 zSig
= estimateDiv128To64( aSig
, 0, zSig
<<32 ) + ( zSig
<<30 );
3178 if ( ( zSig
& 0x1FF ) <= 5 ) {
3179 doubleZSig
= zSig
<<1;
3180 mul64To128( zSig
, zSig
, &term0
, &term1
);
3181 sub128( aSig
, 0, term0
, term1
, &rem0
, &rem1
);
3182 while ( (sbits64
) rem0
< 0 ) {
3185 add128( rem0
, rem1
, zSig
>>63, doubleZSig
| 1, &rem0
, &rem1
);
3187 zSig
|= ( ( rem0
| rem1
) != 0 );
3189 return roundAndPackFloat64( 0, zExp
, zSig STATUS_VAR
);
3193 /*----------------------------------------------------------------------------
3194 | Returns the binary log of the double-precision floating-point value `a'.
3195 | The operation is performed according to the IEC/IEEE Standard for Binary
3196 | Floating-Point Arithmetic.
3197 *----------------------------------------------------------------------------*/
3198 float64
float64_log2( float64 a STATUS_PARAM
)
3202 bits64 aSig
, aSig0
, aSig1
, zSig
, i
;
3204 aSig
= extractFloat64Frac( a
);
3205 aExp
= extractFloat64Exp( a
);
3206 aSign
= extractFloat64Sign( a
);
3209 if ( aSig
== 0 ) return packFloat64( 1, 0x7FF, 0 );
3210 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
3213 float_raise( float_flag_invalid STATUS_VAR
);
3214 return float64_default_nan
;
3216 if ( aExp
== 0x7FF ) {
3217 if ( aSig
) return propagateFloat64NaN( a
, float64_zero STATUS_VAR
);
3222 aSig
|= LIT64( 0x0010000000000000 );
3224 zSig
= (bits64
)aExp
<< 52;
3225 for (i
= 1LL << 51; i
> 0; i
>>= 1) {
3226 mul64To128( aSig
, aSig
, &aSig0
, &aSig1
);
3227 aSig
= ( aSig0
<< 12 ) | ( aSig1
>> 52 );
3228 if ( aSig
& LIT64( 0x0020000000000000 ) ) {
3236 return normalizeRoundAndPackFloat64( zSign
, 0x408, zSig STATUS_VAR
);
3239 /*----------------------------------------------------------------------------
3240 | Returns 1 if the double-precision floating-point value `a' is equal to the
3241 | corresponding value `b', and 0 otherwise. The comparison is performed
3242 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3243 *----------------------------------------------------------------------------*/
3245 int float64_eq( float64 a
, float64 b STATUS_PARAM
)
3249 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3250 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3252 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
3253 float_raise( float_flag_invalid STATUS_VAR
);
3257 av
= float64_val(a
);
3258 bv
= float64_val(b
);
3259 return ( av
== bv
) || ( (bits64
) ( ( av
| bv
)<<1 ) == 0 );
3263 /*----------------------------------------------------------------------------
3264 | Returns 1 if the double-precision floating-point value `a' is less than or
3265 | equal to the corresponding value `b', and 0 otherwise. The comparison is
3266 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3268 *----------------------------------------------------------------------------*/
3270 int float64_le( float64 a
, float64 b STATUS_PARAM
)
3275 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3276 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3278 float_raise( float_flag_invalid STATUS_VAR
);
3281 aSign
= extractFloat64Sign( a
);
3282 bSign
= extractFloat64Sign( b
);
3283 av
= float64_val(a
);
3284 bv
= float64_val(b
);
3285 if ( aSign
!= bSign
) return aSign
|| ( (bits64
) ( ( av
| bv
)<<1 ) == 0 );
3286 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
3290 /*----------------------------------------------------------------------------
3291 | Returns 1 if the double-precision floating-point value `a' is less than
3292 | the corresponding value `b', and 0 otherwise. The comparison is performed
3293 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3294 *----------------------------------------------------------------------------*/
3296 int float64_lt( float64 a
, float64 b STATUS_PARAM
)
3301 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3302 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3304 float_raise( float_flag_invalid STATUS_VAR
);
3307 aSign
= extractFloat64Sign( a
);
3308 bSign
= extractFloat64Sign( b
);
3309 av
= float64_val(a
);
3310 bv
= float64_val(b
);
3311 if ( aSign
!= bSign
) return aSign
&& ( (bits64
) ( ( av
| bv
)<<1 ) != 0 );
3312 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
3316 /*----------------------------------------------------------------------------
3317 | Returns 1 if the double-precision floating-point value `a' is equal to the
3318 | corresponding value `b', and 0 otherwise. The invalid exception is raised
3319 | if either operand is a NaN. Otherwise, the comparison is performed
3320 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3321 *----------------------------------------------------------------------------*/
3323 int float64_eq_signaling( float64 a
, float64 b STATUS_PARAM
)
3327 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3328 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3330 float_raise( float_flag_invalid STATUS_VAR
);
3333 av
= float64_val(a
);
3334 bv
= float64_val(b
);
3335 return ( av
== bv
) || ( (bits64
) ( ( av
| bv
)<<1 ) == 0 );
3339 /*----------------------------------------------------------------------------
3340 | Returns 1 if the double-precision floating-point value `a' is less than or
3341 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3342 | cause an exception. Otherwise, the comparison is performed according to the
3343 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3344 *----------------------------------------------------------------------------*/
3346 int float64_le_quiet( float64 a
, float64 b STATUS_PARAM
)
3351 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3352 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3354 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
3355 float_raise( float_flag_invalid STATUS_VAR
);
3359 aSign
= extractFloat64Sign( a
);
3360 bSign
= extractFloat64Sign( b
);
3361 av
= float64_val(a
);
3362 bv
= float64_val(b
);
3363 if ( aSign
!= bSign
) return aSign
|| ( (bits64
) ( ( av
| bv
)<<1 ) == 0 );
3364 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
3368 /*----------------------------------------------------------------------------
3369 | Returns 1 if the double-precision floating-point value `a' is less than
3370 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3371 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
3372 | Standard for Binary Floating-Point Arithmetic.
3373 *----------------------------------------------------------------------------*/
3375 int float64_lt_quiet( float64 a
, float64 b STATUS_PARAM
)
3380 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3381 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3383 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
3384 float_raise( float_flag_invalid STATUS_VAR
);
3388 aSign
= extractFloat64Sign( a
);
3389 bSign
= extractFloat64Sign( b
);
3390 av
= float64_val(a
);
3391 bv
= float64_val(b
);
3392 if ( aSign
!= bSign
) return aSign
&& ( (bits64
) ( ( av
| bv
)<<1 ) != 0 );
3393 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
3399 /*----------------------------------------------------------------------------
3400 | Returns the result of converting the extended double-precision floating-
3401 | point value `a' to the 32-bit two's complement integer format. The
3402 | conversion is performed according to the IEC/IEEE Standard for Binary
3403 | Floating-Point Arithmetic---which means in particular that the conversion
3404 | is rounded according to the current rounding mode. If `a' is a NaN, the
3405 | largest positive integer is returned. Otherwise, if the conversion
3406 | overflows, the largest integer with the same sign as `a' is returned.
3407 *----------------------------------------------------------------------------*/
3409 int32
floatx80_to_int32( floatx80 a STATUS_PARAM
)
3412 int32 aExp
, shiftCount
;
3415 aSig
= extractFloatx80Frac( a
);
3416 aExp
= extractFloatx80Exp( a
);
3417 aSign
= extractFloatx80Sign( a
);
3418 if ( ( aExp
== 0x7FFF ) && (bits64
) ( aSig
<<1 ) ) aSign
= 0;
3419 shiftCount
= 0x4037 - aExp
;
3420 if ( shiftCount
<= 0 ) shiftCount
= 1;
3421 shift64RightJamming( aSig
, shiftCount
, &aSig
);
3422 return roundAndPackInt32( aSign
, aSig STATUS_VAR
);
3426 /*----------------------------------------------------------------------------
3427 | Returns the result of converting the extended double-precision floating-
3428 | point value `a' to the 32-bit two's complement integer format. The
3429 | conversion is performed according to the IEC/IEEE Standard for Binary
3430 | Floating-Point Arithmetic, except that the conversion is always rounded
3431 | toward zero. If `a' is a NaN, the largest positive integer is returned.
3432 | Otherwise, if the conversion overflows, the largest integer with the same
3433 | sign as `a' is returned.
3434 *----------------------------------------------------------------------------*/
3436 int32
floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM
)
3439 int32 aExp
, shiftCount
;
3440 bits64 aSig
, savedASig
;
3443 aSig
= extractFloatx80Frac( a
);
3444 aExp
= extractFloatx80Exp( a
);
3445 aSign
= extractFloatx80Sign( a
);
3446 if ( 0x401E < aExp
) {
3447 if ( ( aExp
== 0x7FFF ) && (bits64
) ( aSig
<<1 ) ) aSign
= 0;
3450 else if ( aExp
< 0x3FFF ) {
3451 if ( aExp
|| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
3454 shiftCount
= 0x403E - aExp
;
3456 aSig
>>= shiftCount
;
3458 if ( aSign
) z
= - z
;
3459 if ( ( z
< 0 ) ^ aSign
) {
3461 float_raise( float_flag_invalid STATUS_VAR
);
3462 return aSign
? (sbits32
) 0x80000000 : 0x7FFFFFFF;
3464 if ( ( aSig
<<shiftCount
) != savedASig
) {
3465 STATUS(float_exception_flags
) |= float_flag_inexact
;
3471 /*----------------------------------------------------------------------------
3472 | Returns the result of converting the extended double-precision floating-
3473 | point value `a' to the 64-bit two's complement integer format. The
3474 | conversion is performed according to the IEC/IEEE Standard for Binary
3475 | Floating-Point Arithmetic---which means in particular that the conversion
3476 | is rounded according to the current rounding mode. If `a' is a NaN,
3477 | the largest positive integer is returned. Otherwise, if the conversion
3478 | overflows, the largest integer with the same sign as `a' is returned.
3479 *----------------------------------------------------------------------------*/
3481 int64
floatx80_to_int64( floatx80 a STATUS_PARAM
)
3484 int32 aExp
, shiftCount
;
3485 bits64 aSig
, aSigExtra
;
3487 aSig
= extractFloatx80Frac( a
);
3488 aExp
= extractFloatx80Exp( a
);
3489 aSign
= extractFloatx80Sign( a
);
3490 shiftCount
= 0x403E - aExp
;
3491 if ( shiftCount
<= 0 ) {
3493 float_raise( float_flag_invalid STATUS_VAR
);
3495 || ( ( aExp
== 0x7FFF )
3496 && ( aSig
!= LIT64( 0x8000000000000000 ) ) )
3498 return LIT64( 0x7FFFFFFFFFFFFFFF );
3500 return (sbits64
) LIT64( 0x8000000000000000 );
3505 shift64ExtraRightJamming( aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
3507 return roundAndPackInt64( aSign
, aSig
, aSigExtra STATUS_VAR
);
3511 /*----------------------------------------------------------------------------
3512 | Returns the result of converting the extended double-precision floating-
3513 | point value `a' to the 64-bit two's complement integer format. The
3514 | conversion is performed according to the IEC/IEEE Standard for Binary
3515 | Floating-Point Arithmetic, except that the conversion is always rounded
3516 | toward zero. If `a' is a NaN, the largest positive integer is returned.
3517 | Otherwise, if the conversion overflows, the largest integer with the same
3518 | sign as `a' is returned.
3519 *----------------------------------------------------------------------------*/
3521 int64
floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM
)
3524 int32 aExp
, shiftCount
;
3528 aSig
= extractFloatx80Frac( a
);
3529 aExp
= extractFloatx80Exp( a
);
3530 aSign
= extractFloatx80Sign( a
);
3531 shiftCount
= aExp
- 0x403E;
3532 if ( 0 <= shiftCount
) {
3533 aSig
&= LIT64( 0x7FFFFFFFFFFFFFFF );
3534 if ( ( a
.high
!= 0xC03E ) || aSig
) {
3535 float_raise( float_flag_invalid STATUS_VAR
);
3536 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && aSig
) ) {
3537 return LIT64( 0x7FFFFFFFFFFFFFFF );
3540 return (sbits64
) LIT64( 0x8000000000000000 );
3542 else if ( aExp
< 0x3FFF ) {
3543 if ( aExp
| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
3546 z
= aSig
>>( - shiftCount
);
3547 if ( (bits64
) ( aSig
<<( shiftCount
& 63 ) ) ) {
3548 STATUS(float_exception_flags
) |= float_flag_inexact
;
3550 if ( aSign
) z
= - z
;
3555 /*----------------------------------------------------------------------------
3556 | Returns the result of converting the extended double-precision floating-
3557 | point value `a' to the single-precision floating-point format. The
3558 | conversion is performed according to the IEC/IEEE Standard for Binary
3559 | Floating-Point Arithmetic.
3560 *----------------------------------------------------------------------------*/
3562 float32
floatx80_to_float32( floatx80 a STATUS_PARAM
)
3568 aSig
= extractFloatx80Frac( a
);
3569 aExp
= extractFloatx80Exp( a
);
3570 aSign
= extractFloatx80Sign( a
);
3571 if ( aExp
== 0x7FFF ) {
3572 if ( (bits64
) ( aSig
<<1 ) ) {
3573 return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR
) );
3575 return packFloat32( aSign
, 0xFF, 0 );
3577 shift64RightJamming( aSig
, 33, &aSig
);
3578 if ( aExp
|| aSig
) aExp
-= 0x3F81;
3579 return roundAndPackFloat32( aSign
, aExp
, aSig STATUS_VAR
);
3583 /*----------------------------------------------------------------------------
3584 | Returns the result of converting the extended double-precision floating-
3585 | point value `a' to the double-precision floating-point format. The
3586 | conversion is performed according to the IEC/IEEE Standard for Binary
3587 | Floating-Point Arithmetic.
3588 *----------------------------------------------------------------------------*/
3590 float64
floatx80_to_float64( floatx80 a STATUS_PARAM
)
3596 aSig
= extractFloatx80Frac( a
);
3597 aExp
= extractFloatx80Exp( a
);
3598 aSign
= extractFloatx80Sign( a
);
3599 if ( aExp
== 0x7FFF ) {
3600 if ( (bits64
) ( aSig
<<1 ) ) {
3601 return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR
) );
3603 return packFloat64( aSign
, 0x7FF, 0 );
3605 shift64RightJamming( aSig
, 1, &zSig
);
3606 if ( aExp
|| aSig
) aExp
-= 0x3C01;
3607 return roundAndPackFloat64( aSign
, aExp
, zSig STATUS_VAR
);
3613 /*----------------------------------------------------------------------------
3614 | Returns the result of converting the extended double-precision floating-
3615 | point value `a' to the quadruple-precision floating-point format. The
3616 | conversion is performed according to the IEC/IEEE Standard for Binary
3617 | Floating-Point Arithmetic.
3618 *----------------------------------------------------------------------------*/
3620 float128
floatx80_to_float128( floatx80 a STATUS_PARAM
)
3624 bits64 aSig
, zSig0
, zSig1
;
3626 aSig
= extractFloatx80Frac( a
);
3627 aExp
= extractFloatx80Exp( a
);
3628 aSign
= extractFloatx80Sign( a
);
3629 if ( ( aExp
== 0x7FFF ) && (bits64
) ( aSig
<<1 ) ) {
3630 return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR
) );
3632 shift128Right( aSig
<<1, 0, 16, &zSig0
, &zSig1
);
3633 return packFloat128( aSign
, aExp
, zSig0
, zSig1
);
3639 /*----------------------------------------------------------------------------
3640 | Rounds the extended double-precision floating-point value `a' to an integer,
3641 | and returns the result as an extended quadruple-precision floating-point
3642 | value. The operation is performed according to the IEC/IEEE Standard for
3643 | Binary Floating-Point Arithmetic.
3644 *----------------------------------------------------------------------------*/
3646 floatx80
floatx80_round_to_int( floatx80 a STATUS_PARAM
)
3650 bits64 lastBitMask
, roundBitsMask
;
3654 aExp
= extractFloatx80Exp( a
);
3655 if ( 0x403E <= aExp
) {
3656 if ( ( aExp
== 0x7FFF ) && (bits64
) ( extractFloatx80Frac( a
)<<1 ) ) {
3657 return propagateFloatx80NaN( a
, a STATUS_VAR
);
3661 if ( aExp
< 0x3FFF ) {
3663 && ( (bits64
) ( extractFloatx80Frac( a
)<<1 ) == 0 ) ) {
3666 STATUS(float_exception_flags
) |= float_flag_inexact
;
3667 aSign
= extractFloatx80Sign( a
);
3668 switch ( STATUS(float_rounding_mode
) ) {
3669 case float_round_nearest_even
:
3670 if ( ( aExp
== 0x3FFE ) && (bits64
) ( extractFloatx80Frac( a
)<<1 )
3673 packFloatx80( aSign
, 0x3FFF, LIT64( 0x8000000000000000 ) );
3676 case float_round_down
:
3679 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3680 : packFloatx80( 0, 0, 0 );
3681 case float_round_up
:
3683 aSign
? packFloatx80( 1, 0, 0 )
3684 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3686 return packFloatx80( aSign
, 0, 0 );
3689 lastBitMask
<<= 0x403E - aExp
;
3690 roundBitsMask
= lastBitMask
- 1;
3692 roundingMode
= STATUS(float_rounding_mode
);
3693 if ( roundingMode
== float_round_nearest_even
) {
3694 z
.low
+= lastBitMask
>>1;
3695 if ( ( z
.low
& roundBitsMask
) == 0 ) z
.low
&= ~ lastBitMask
;
3697 else if ( roundingMode
!= float_round_to_zero
) {
3698 if ( extractFloatx80Sign( z
) ^ ( roundingMode
== float_round_up
) ) {
3699 z
.low
+= roundBitsMask
;
3702 z
.low
&= ~ roundBitsMask
;
3705 z
.low
= LIT64( 0x8000000000000000 );
3707 if ( z
.low
!= a
.low
) STATUS(float_exception_flags
) |= float_flag_inexact
;
3712 /*----------------------------------------------------------------------------
3713 | Returns the result of adding the absolute values of the extended double-
3714 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
3715 | negated before being returned. `zSign' is ignored if the result is a NaN.
3716 | The addition is performed according to the IEC/IEEE Standard for Binary
3717 | Floating-Point Arithmetic.
3718 *----------------------------------------------------------------------------*/
3720 static floatx80
addFloatx80Sigs( floatx80 a
, floatx80 b
, flag zSign STATUS_PARAM
)
3722 int32 aExp
, bExp
, zExp
;
3723 bits64 aSig
, bSig
, zSig0
, zSig1
;
3726 aSig
= extractFloatx80Frac( a
);
3727 aExp
= extractFloatx80Exp( a
);
3728 bSig
= extractFloatx80Frac( b
);
3729 bExp
= extractFloatx80Exp( b
);
3730 expDiff
= aExp
- bExp
;
3731 if ( 0 < expDiff
) {
3732 if ( aExp
== 0x7FFF ) {
3733 if ( (bits64
) ( aSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
3736 if ( bExp
== 0 ) --expDiff
;
3737 shift64ExtraRightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
3740 else if ( expDiff
< 0 ) {
3741 if ( bExp
== 0x7FFF ) {
3742 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
3743 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
3745 if ( aExp
== 0 ) ++expDiff
;
3746 shift64ExtraRightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
3750 if ( aExp
== 0x7FFF ) {
3751 if ( (bits64
) ( ( aSig
| bSig
)<<1 ) ) {
3752 return propagateFloatx80NaN( a
, b STATUS_VAR
);
3757 zSig0
= aSig
+ bSig
;
3759 normalizeFloatx80Subnormal( zSig0
, &zExp
, &zSig0
);
3765 zSig0
= aSig
+ bSig
;
3766 if ( (sbits64
) zSig0
< 0 ) goto roundAndPack
;
3768 shift64ExtraRightJamming( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
3769 zSig0
|= LIT64( 0x8000000000000000 );
3773 roundAndPackFloatx80(
3774 STATUS(floatx80_rounding_precision
), zSign
, zExp
, zSig0
, zSig1 STATUS_VAR
);
3778 /*----------------------------------------------------------------------------
3779 | Returns the result of subtracting the absolute values of the extended
3780 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
3781 | difference is negated before being returned. `zSign' is ignored if the
3782 | result is a NaN. The subtraction is performed according to the IEC/IEEE
3783 | Standard for Binary Floating-Point Arithmetic.
3784 *----------------------------------------------------------------------------*/
3786 static floatx80
subFloatx80Sigs( floatx80 a
, floatx80 b
, flag zSign STATUS_PARAM
)
3788 int32 aExp
, bExp
, zExp
;
3789 bits64 aSig
, bSig
, zSig0
, zSig1
;
3793 aSig
= extractFloatx80Frac( a
);
3794 aExp
= extractFloatx80Exp( a
);
3795 bSig
= extractFloatx80Frac( b
);
3796 bExp
= extractFloatx80Exp( b
);
3797 expDiff
= aExp
- bExp
;
3798 if ( 0 < expDiff
) goto aExpBigger
;
3799 if ( expDiff
< 0 ) goto bExpBigger
;
3800 if ( aExp
== 0x7FFF ) {
3801 if ( (bits64
) ( ( aSig
| bSig
)<<1 ) ) {
3802 return propagateFloatx80NaN( a
, b STATUS_VAR
);
3804 float_raise( float_flag_invalid STATUS_VAR
);
3805 z
.low
= floatx80_default_nan_low
;
3806 z
.high
= floatx80_default_nan_high
;
3814 if ( bSig
< aSig
) goto aBigger
;
3815 if ( aSig
< bSig
) goto bBigger
;
3816 return packFloatx80( STATUS(float_rounding_mode
) == float_round_down
, 0, 0 );
3818 if ( bExp
== 0x7FFF ) {
3819 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
3820 return packFloatx80( zSign
^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3822 if ( aExp
== 0 ) ++expDiff
;
3823 shift128RightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
3825 sub128( bSig
, 0, aSig
, zSig1
, &zSig0
, &zSig1
);
3828 goto normalizeRoundAndPack
;
3830 if ( aExp
== 0x7FFF ) {
3831 if ( (bits64
) ( aSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
3834 if ( bExp
== 0 ) --expDiff
;
3835 shift128RightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
3837 sub128( aSig
, 0, bSig
, zSig1
, &zSig0
, &zSig1
);
3839 normalizeRoundAndPack
:
3841 normalizeRoundAndPackFloatx80(
3842 STATUS(floatx80_rounding_precision
), zSign
, zExp
, zSig0
, zSig1 STATUS_VAR
);
3846 /*----------------------------------------------------------------------------
3847 | Returns the result of adding the extended double-precision floating-point
3848 | values `a' and `b'. The operation is performed according to the IEC/IEEE
3849 | Standard for Binary Floating-Point Arithmetic.
3850 *----------------------------------------------------------------------------*/
3852 floatx80
floatx80_add( floatx80 a
, floatx80 b STATUS_PARAM
)
3856 aSign
= extractFloatx80Sign( a
);
3857 bSign
= extractFloatx80Sign( b
);
3858 if ( aSign
== bSign
) {
3859 return addFloatx80Sigs( a
, b
, aSign STATUS_VAR
);
3862 return subFloatx80Sigs( a
, b
, aSign STATUS_VAR
);
3867 /*----------------------------------------------------------------------------
3868 | Returns the result of subtracting the extended double-precision floating-
3869 | point values `a' and `b'. The operation is performed according to the
3870 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3871 *----------------------------------------------------------------------------*/
3873 floatx80
floatx80_sub( floatx80 a
, floatx80 b STATUS_PARAM
)
3877 aSign
= extractFloatx80Sign( a
);
3878 bSign
= extractFloatx80Sign( b
);
3879 if ( aSign
== bSign
) {
3880 return subFloatx80Sigs( a
, b
, aSign STATUS_VAR
);
3883 return addFloatx80Sigs( a
, b
, aSign STATUS_VAR
);
3888 /*----------------------------------------------------------------------------
3889 | Returns the result of multiplying the extended double-precision floating-
3890 | point values `a' and `b'. The operation is performed according to the
3891 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3892 *----------------------------------------------------------------------------*/
3894 floatx80
floatx80_mul( floatx80 a
, floatx80 b STATUS_PARAM
)
3896 flag aSign
, bSign
, zSign
;
3897 int32 aExp
, bExp
, zExp
;
3898 bits64 aSig
, bSig
, zSig0
, zSig1
;
3901 aSig
= extractFloatx80Frac( a
);
3902 aExp
= extractFloatx80Exp( a
);
3903 aSign
= extractFloatx80Sign( a
);
3904 bSig
= extractFloatx80Frac( b
);
3905 bExp
= extractFloatx80Exp( b
);
3906 bSign
= extractFloatx80Sign( b
);
3907 zSign
= aSign
^ bSign
;
3908 if ( aExp
== 0x7FFF ) {
3909 if ( (bits64
) ( aSig
<<1 )
3910 || ( ( bExp
== 0x7FFF ) && (bits64
) ( bSig
<<1 ) ) ) {
3911 return propagateFloatx80NaN( a
, b STATUS_VAR
);
3913 if ( ( bExp
| bSig
) == 0 ) goto invalid
;
3914 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
3916 if ( bExp
== 0x7FFF ) {
3917 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
3918 if ( ( aExp
| aSig
) == 0 ) {
3920 float_raise( float_flag_invalid STATUS_VAR
);
3921 z
.low
= floatx80_default_nan_low
;
3922 z
.high
= floatx80_default_nan_high
;
3925 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
3928 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
3929 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
3932 if ( bSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
3933 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
3935 zExp
= aExp
+ bExp
- 0x3FFE;
3936 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
3937 if ( 0 < (sbits64
) zSig0
) {
3938 shortShift128Left( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
3942 roundAndPackFloatx80(
3943 STATUS(floatx80_rounding_precision
), zSign
, zExp
, zSig0
, zSig1 STATUS_VAR
);
3947 /*----------------------------------------------------------------------------
3948 | Returns the result of dividing the extended double-precision floating-point
3949 | value `a' by the corresponding value `b'. The operation is performed
3950 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3951 *----------------------------------------------------------------------------*/
3953 floatx80
floatx80_div( floatx80 a
, floatx80 b STATUS_PARAM
)
3955 flag aSign
, bSign
, zSign
;
3956 int32 aExp
, bExp
, zExp
;
3957 bits64 aSig
, bSig
, zSig0
, zSig1
;
3958 bits64 rem0
, rem1
, rem2
, term0
, term1
, term2
;
3961 aSig
= extractFloatx80Frac( a
);
3962 aExp
= extractFloatx80Exp( a
);
3963 aSign
= extractFloatx80Sign( a
);
3964 bSig
= extractFloatx80Frac( b
);
3965 bExp
= extractFloatx80Exp( b
);
3966 bSign
= extractFloatx80Sign( b
);
3967 zSign
= aSign
^ bSign
;
3968 if ( aExp
== 0x7FFF ) {
3969 if ( (bits64
) ( aSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
3970 if ( bExp
== 0x7FFF ) {
3971 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
3974 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
3976 if ( bExp
== 0x7FFF ) {
3977 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
3978 return packFloatx80( zSign
, 0, 0 );
3982 if ( ( aExp
| aSig
) == 0 ) {
3984 float_raise( float_flag_invalid STATUS_VAR
);
3985 z
.low
= floatx80_default_nan_low
;
3986 z
.high
= floatx80_default_nan_high
;
3989 float_raise( float_flag_divbyzero STATUS_VAR
);
3990 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
3992 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
3995 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
3996 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
3998 zExp
= aExp
- bExp
+ 0x3FFE;
4000 if ( bSig
<= aSig
) {
4001 shift128Right( aSig
, 0, 1, &aSig
, &rem1
);
4004 zSig0
= estimateDiv128To64( aSig
, rem1
, bSig
);
4005 mul64To128( bSig
, zSig0
, &term0
, &term1
);
4006 sub128( aSig
, rem1
, term0
, term1
, &rem0
, &rem1
);
4007 while ( (sbits64
) rem0
< 0 ) {
4009 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
4011 zSig1
= estimateDiv128To64( rem1
, 0, bSig
);
4012 if ( (bits64
) ( zSig1
<<1 ) <= 8 ) {
4013 mul64To128( bSig
, zSig1
, &term1
, &term2
);
4014 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
4015 while ( (sbits64
) rem1
< 0 ) {
4017 add128( rem1
, rem2
, 0, bSig
, &rem1
, &rem2
);
4019 zSig1
|= ( ( rem1
| rem2
) != 0 );
4022 roundAndPackFloatx80(
4023 STATUS(floatx80_rounding_precision
), zSign
, zExp
, zSig0
, zSig1 STATUS_VAR
);
4027 /*----------------------------------------------------------------------------
4028 | Returns the remainder of the extended double-precision floating-point value
4029 | `a' with respect to the corresponding value `b'. The operation is performed
4030 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4031 *----------------------------------------------------------------------------*/
4033 floatx80
floatx80_rem( floatx80 a
, floatx80 b STATUS_PARAM
)
4035 flag aSign
, bSign
, zSign
;
4036 int32 aExp
, bExp
, expDiff
;
4037 bits64 aSig0
, aSig1
, bSig
;
4038 bits64 q
, term0
, term1
, alternateASig0
, alternateASig1
;
4041 aSig0
= extractFloatx80Frac( a
);
4042 aExp
= extractFloatx80Exp( a
);
4043 aSign
= extractFloatx80Sign( a
);
4044 bSig
= extractFloatx80Frac( b
);
4045 bExp
= extractFloatx80Exp( b
);
4046 bSign
= extractFloatx80Sign( b
);
4047 if ( aExp
== 0x7FFF ) {
4048 if ( (bits64
) ( aSig0
<<1 )
4049 || ( ( bExp
== 0x7FFF ) && (bits64
) ( bSig
<<1 ) ) ) {
4050 return propagateFloatx80NaN( a
, b STATUS_VAR
);
4054 if ( bExp
== 0x7FFF ) {
4055 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
4061 float_raise( float_flag_invalid STATUS_VAR
);
4062 z
.low
= floatx80_default_nan_low
;
4063 z
.high
= floatx80_default_nan_high
;
4066 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
4069 if ( (bits64
) ( aSig0
<<1 ) == 0 ) return a
;
4070 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
4072 bSig
|= LIT64( 0x8000000000000000 );
4074 expDiff
= aExp
- bExp
;
4076 if ( expDiff
< 0 ) {
4077 if ( expDiff
< -1 ) return a
;
4078 shift128Right( aSig0
, 0, 1, &aSig0
, &aSig1
);
4081 q
= ( bSig
<= aSig0
);
4082 if ( q
) aSig0
-= bSig
;
4084 while ( 0 < expDiff
) {
4085 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
4086 q
= ( 2 < q
) ? q
- 2 : 0;
4087 mul64To128( bSig
, q
, &term0
, &term1
);
4088 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
4089 shortShift128Left( aSig0
, aSig1
, 62, &aSig0
, &aSig1
);
4093 if ( 0 < expDiff
) {
4094 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
4095 q
= ( 2 < q
) ? q
- 2 : 0;
4097 mul64To128( bSig
, q
<<( 64 - expDiff
), &term0
, &term1
);
4098 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
4099 shortShift128Left( 0, bSig
, 64 - expDiff
, &term0
, &term1
);
4100 while ( le128( term0
, term1
, aSig0
, aSig1
) ) {
4102 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
4109 sub128( term0
, term1
, aSig0
, aSig1
, &alternateASig0
, &alternateASig1
);
4110 if ( lt128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
4111 || ( eq128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
4114 aSig0
= alternateASig0
;
4115 aSig1
= alternateASig1
;
4119 normalizeRoundAndPackFloatx80(
4120 80, zSign
, bExp
+ expDiff
, aSig0
, aSig1 STATUS_VAR
);
4124 /*----------------------------------------------------------------------------
4125 | Returns the square root of the extended double-precision floating-point
4126 | value `a'. The operation is performed according to the IEC/IEEE Standard
4127 | for Binary Floating-Point Arithmetic.
4128 *----------------------------------------------------------------------------*/
4130 floatx80
floatx80_sqrt( floatx80 a STATUS_PARAM
)
4134 bits64 aSig0
, aSig1
, zSig0
, zSig1
, doubleZSig0
;
4135 bits64 rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
4138 aSig0
= extractFloatx80Frac( a
);
4139 aExp
= extractFloatx80Exp( a
);
4140 aSign
= extractFloatx80Sign( a
);
4141 if ( aExp
== 0x7FFF ) {
4142 if ( (bits64
) ( aSig0
<<1 ) ) return propagateFloatx80NaN( a
, a STATUS_VAR
);
4143 if ( ! aSign
) return a
;
4147 if ( ( aExp
| aSig0
) == 0 ) return a
;
4149 float_raise( float_flag_invalid STATUS_VAR
);
4150 z
.low
= floatx80_default_nan_low
;
4151 z
.high
= floatx80_default_nan_high
;
4155 if ( aSig0
== 0 ) return packFloatx80( 0, 0, 0 );
4156 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
4158 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFF;
4159 zSig0
= estimateSqrt32( aExp
, aSig0
>>32 );
4160 shift128Right( aSig0
, 0, 2 + ( aExp
& 1 ), &aSig0
, &aSig1
);
4161 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
4162 doubleZSig0
= zSig0
<<1;
4163 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
4164 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
4165 while ( (sbits64
) rem0
< 0 ) {
4168 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
4170 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
4171 if ( ( zSig1
& LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4172 if ( zSig1
== 0 ) zSig1
= 1;
4173 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
4174 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
4175 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
4176 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
4177 while ( (sbits64
) rem1
< 0 ) {
4179 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
4181 term2
|= doubleZSig0
;
4182 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
4184 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
4186 shortShift128Left( 0, zSig1
, 1, &zSig0
, &zSig1
);
4187 zSig0
|= doubleZSig0
;
4189 roundAndPackFloatx80(
4190 STATUS(floatx80_rounding_precision
), 0, zExp
, zSig0
, zSig1 STATUS_VAR
);
4194 /*----------------------------------------------------------------------------
4195 | Returns 1 if the extended double-precision floating-point value `a' is
4196 | equal to the corresponding value `b', and 0 otherwise. The comparison is
4197 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4199 *----------------------------------------------------------------------------*/
4201 int floatx80_eq( floatx80 a
, floatx80 b STATUS_PARAM
)
4204 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4205 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
4206 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4207 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
4209 if ( floatx80_is_signaling_nan( a
)
4210 || floatx80_is_signaling_nan( b
) ) {
4211 float_raise( float_flag_invalid STATUS_VAR
);
4217 && ( ( a
.high
== b
.high
)
4219 && ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
4224 /*----------------------------------------------------------------------------
4225 | Returns 1 if the extended double-precision floating-point value `a' is
4226 | less than or equal to the corresponding value `b', and 0 otherwise. The
4227 | comparison is performed according to the IEC/IEEE Standard for Binary
4228 | Floating-Point Arithmetic.
4229 *----------------------------------------------------------------------------*/
4231 int floatx80_le( floatx80 a
, floatx80 b STATUS_PARAM
)
4235 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4236 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
4237 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4238 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
4240 float_raise( float_flag_invalid STATUS_VAR
);
4243 aSign
= extractFloatx80Sign( a
);
4244 bSign
= extractFloatx80Sign( b
);
4245 if ( aSign
!= bSign
) {
4248 || ( ( ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
4252 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
4253 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
4257 /*----------------------------------------------------------------------------
4258 | Returns 1 if the extended double-precision floating-point value `a' is
4259 | less than the corresponding value `b', and 0 otherwise. The comparison
4260 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4262 *----------------------------------------------------------------------------*/
4264 int floatx80_lt( floatx80 a
, floatx80 b STATUS_PARAM
)
4268 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4269 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
4270 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4271 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
4273 float_raise( float_flag_invalid STATUS_VAR
);
4276 aSign
= extractFloatx80Sign( a
);
4277 bSign
= extractFloatx80Sign( b
);
4278 if ( aSign
!= bSign
) {
4281 && ( ( ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
4285 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
4286 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
4290 /*----------------------------------------------------------------------------
4291 | Returns 1 if the extended double-precision floating-point value `a' is equal
4292 | to the corresponding value `b', and 0 otherwise. The invalid exception is
4293 | raised if either operand is a NaN. Otherwise, the comparison is performed
4294 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4295 *----------------------------------------------------------------------------*/
4297 int floatx80_eq_signaling( floatx80 a
, floatx80 b STATUS_PARAM
)
4300 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4301 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
4302 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4303 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
4305 float_raise( float_flag_invalid STATUS_VAR
);
4310 && ( ( a
.high
== b
.high
)
4312 && ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
4317 /*----------------------------------------------------------------------------
4318 | Returns 1 if the extended double-precision floating-point value `a' is less
4319 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
4320 | do not cause an exception. Otherwise, the comparison is performed according
4321 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4322 *----------------------------------------------------------------------------*/
4324 int floatx80_le_quiet( floatx80 a
, floatx80 b STATUS_PARAM
)
4328 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4329 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
4330 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4331 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
4333 if ( floatx80_is_signaling_nan( a
)
4334 || floatx80_is_signaling_nan( b
) ) {
4335 float_raise( float_flag_invalid STATUS_VAR
);
4339 aSign
= extractFloatx80Sign( a
);
4340 bSign
= extractFloatx80Sign( b
);
4341 if ( aSign
!= bSign
) {
4344 || ( ( ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
4348 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
4349 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
4353 /*----------------------------------------------------------------------------
4354 | Returns 1 if the extended double-precision floating-point value `a' is less
4355 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
4356 | an exception. Otherwise, the comparison is performed according to the
4357 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4358 *----------------------------------------------------------------------------*/
4360 int floatx80_lt_quiet( floatx80 a
, floatx80 b STATUS_PARAM
)
4364 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4365 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
4366 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4367 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
4369 if ( floatx80_is_signaling_nan( a
)
4370 || floatx80_is_signaling_nan( b
) ) {
4371 float_raise( float_flag_invalid STATUS_VAR
);
4375 aSign
= extractFloatx80Sign( a
);
4376 bSign
= extractFloatx80Sign( b
);
4377 if ( aSign
!= bSign
) {
4380 && ( ( ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
4384 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
4385 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
4393 /*----------------------------------------------------------------------------
4394 | Returns the result of converting the quadruple-precision floating-point
4395 | value `a' to the 32-bit two's complement integer format. The conversion
4396 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4397 | Arithmetic---which means in particular that the conversion is rounded
4398 | according to the current rounding mode. If `a' is a NaN, the largest
4399 | positive integer is returned. Otherwise, if the conversion overflows, the
4400 | largest integer with the same sign as `a' is returned.
4401 *----------------------------------------------------------------------------*/
4403 int32
float128_to_int32( float128 a STATUS_PARAM
)
4406 int32 aExp
, shiftCount
;
4407 bits64 aSig0
, aSig1
;
4409 aSig1
= extractFloat128Frac1( a
);
4410 aSig0
= extractFloat128Frac0( a
);
4411 aExp
= extractFloat128Exp( a
);
4412 aSign
= extractFloat128Sign( a
);
4413 if ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) aSign
= 0;
4414 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
4415 aSig0
|= ( aSig1
!= 0 );
4416 shiftCount
= 0x4028 - aExp
;
4417 if ( 0 < shiftCount
) shift64RightJamming( aSig0
, shiftCount
, &aSig0
);
4418 return roundAndPackInt32( aSign
, aSig0 STATUS_VAR
);
4422 /*----------------------------------------------------------------------------
4423 | Returns the result of converting the quadruple-precision floating-point
4424 | value `a' to the 32-bit two's complement integer format. The conversion
4425 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4426 | Arithmetic, except that the conversion is always rounded toward zero. If
4427 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
4428 | conversion overflows, the largest integer with the same sign as `a' is
4430 *----------------------------------------------------------------------------*/
4432 int32
float128_to_int32_round_to_zero( float128 a STATUS_PARAM
)
4435 int32 aExp
, shiftCount
;
4436 bits64 aSig0
, aSig1
, savedASig
;
4439 aSig1
= extractFloat128Frac1( a
);
4440 aSig0
= extractFloat128Frac0( a
);
4441 aExp
= extractFloat128Exp( a
);
4442 aSign
= extractFloat128Sign( a
);
4443 aSig0
|= ( aSig1
!= 0 );
4444 if ( 0x401E < aExp
) {
4445 if ( ( aExp
== 0x7FFF ) && aSig0
) aSign
= 0;
4448 else if ( aExp
< 0x3FFF ) {
4449 if ( aExp
|| aSig0
) STATUS(float_exception_flags
) |= float_flag_inexact
;
4452 aSig0
|= LIT64( 0x0001000000000000 );
4453 shiftCount
= 0x402F - aExp
;
4455 aSig0
>>= shiftCount
;
4457 if ( aSign
) z
= - z
;
4458 if ( ( z
< 0 ) ^ aSign
) {
4460 float_raise( float_flag_invalid STATUS_VAR
);
4461 return aSign
? (sbits32
) 0x80000000 : 0x7FFFFFFF;
4463 if ( ( aSig0
<<shiftCount
) != savedASig
) {
4464 STATUS(float_exception_flags
) |= float_flag_inexact
;
4470 /*----------------------------------------------------------------------------
4471 | Returns the result of converting the quadruple-precision floating-point
4472 | value `a' to the 64-bit two's complement integer format. The conversion
4473 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4474 | Arithmetic---which means in particular that the conversion is rounded
4475 | according to the current rounding mode. If `a' is a NaN, the largest
4476 | positive integer is returned. Otherwise, if the conversion overflows, the
4477 | largest integer with the same sign as `a' is returned.
4478 *----------------------------------------------------------------------------*/
4480 int64
float128_to_int64( float128 a STATUS_PARAM
)
4483 int32 aExp
, shiftCount
;
4484 bits64 aSig0
, aSig1
;
4486 aSig1
= extractFloat128Frac1( a
);
4487 aSig0
= extractFloat128Frac0( a
);
4488 aExp
= extractFloat128Exp( a
);
4489 aSign
= extractFloat128Sign( a
);
4490 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
4491 shiftCount
= 0x402F - aExp
;
4492 if ( shiftCount
<= 0 ) {
4493 if ( 0x403E < aExp
) {
4494 float_raise( float_flag_invalid STATUS_VAR
);
4496 || ( ( aExp
== 0x7FFF )
4497 && ( aSig1
|| ( aSig0
!= LIT64( 0x0001000000000000 ) ) )
4500 return LIT64( 0x7FFFFFFFFFFFFFFF );
4502 return (sbits64
) LIT64( 0x8000000000000000 );
4504 shortShift128Left( aSig0
, aSig1
, - shiftCount
, &aSig0
, &aSig1
);
4507 shift64ExtraRightJamming( aSig0
, aSig1
, shiftCount
, &aSig0
, &aSig1
);
4509 return roundAndPackInt64( aSign
, aSig0
, aSig1 STATUS_VAR
);
4513 /*----------------------------------------------------------------------------
4514 | Returns the result of converting the quadruple-precision floating-point
4515 | value `a' to the 64-bit two's complement integer format. The conversion
4516 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4517 | Arithmetic, except that the conversion is always rounded toward zero.
4518 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
4519 | the conversion overflows, the largest integer with the same sign as `a' is
4521 *----------------------------------------------------------------------------*/
4523 int64
float128_to_int64_round_to_zero( float128 a STATUS_PARAM
)
4526 int32 aExp
, shiftCount
;
4527 bits64 aSig0
, aSig1
;
4530 aSig1
= extractFloat128Frac1( a
);
4531 aSig0
= extractFloat128Frac0( a
);
4532 aExp
= extractFloat128Exp( a
);
4533 aSign
= extractFloat128Sign( a
);
4534 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
4535 shiftCount
= aExp
- 0x402F;
4536 if ( 0 < shiftCount
) {
4537 if ( 0x403E <= aExp
) {
4538 aSig0
&= LIT64( 0x0000FFFFFFFFFFFF );
4539 if ( ( a
.high
== LIT64( 0xC03E000000000000 ) )
4540 && ( aSig1
< LIT64( 0x0002000000000000 ) ) ) {
4541 if ( aSig1
) STATUS(float_exception_flags
) |= float_flag_inexact
;
4544 float_raise( float_flag_invalid STATUS_VAR
);
4545 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) ) {
4546 return LIT64( 0x7FFFFFFFFFFFFFFF );
4549 return (sbits64
) LIT64( 0x8000000000000000 );
4551 z
= ( aSig0
<<shiftCount
) | ( aSig1
>>( ( - shiftCount
) & 63 ) );
4552 if ( (bits64
) ( aSig1
<<shiftCount
) ) {
4553 STATUS(float_exception_flags
) |= float_flag_inexact
;
4557 if ( aExp
< 0x3FFF ) {
4558 if ( aExp
| aSig0
| aSig1
) {
4559 STATUS(float_exception_flags
) |= float_flag_inexact
;
4563 z
= aSig0
>>( - shiftCount
);
4565 || ( shiftCount
&& (bits64
) ( aSig0
<<( shiftCount
& 63 ) ) ) ) {
4566 STATUS(float_exception_flags
) |= float_flag_inexact
;
4569 if ( aSign
) z
= - z
;
4574 /*----------------------------------------------------------------------------
4575 | Returns the result of converting the quadruple-precision floating-point
4576 | value `a' to the single-precision floating-point format. The conversion
4577 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4579 *----------------------------------------------------------------------------*/
4581 float32
float128_to_float32( float128 a STATUS_PARAM
)
4585 bits64 aSig0
, aSig1
;
4588 aSig1
= extractFloat128Frac1( a
);
4589 aSig0
= extractFloat128Frac0( a
);
4590 aExp
= extractFloat128Exp( a
);
4591 aSign
= extractFloat128Sign( a
);
4592 if ( aExp
== 0x7FFF ) {
4593 if ( aSig0
| aSig1
) {
4594 return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR
) );
4596 return packFloat32( aSign
, 0xFF, 0 );
4598 aSig0
|= ( aSig1
!= 0 );
4599 shift64RightJamming( aSig0
, 18, &aSig0
);
4601 if ( aExp
|| zSig
) {
4605 return roundAndPackFloat32( aSign
, aExp
, zSig STATUS_VAR
);
4609 /*----------------------------------------------------------------------------
4610 | Returns the result of converting the quadruple-precision floating-point
4611 | value `a' to the double-precision floating-point format. The conversion
4612 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4614 *----------------------------------------------------------------------------*/
4616 float64
float128_to_float64( float128 a STATUS_PARAM
)
4620 bits64 aSig0
, aSig1
;
4622 aSig1
= extractFloat128Frac1( a
);
4623 aSig0
= extractFloat128Frac0( a
);
4624 aExp
= extractFloat128Exp( a
);
4625 aSign
= extractFloat128Sign( a
);
4626 if ( aExp
== 0x7FFF ) {
4627 if ( aSig0
| aSig1
) {
4628 return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR
) );
4630 return packFloat64( aSign
, 0x7FF, 0 );
4632 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
4633 aSig0
|= ( aSig1
!= 0 );
4634 if ( aExp
|| aSig0
) {
4635 aSig0
|= LIT64( 0x4000000000000000 );
4638 return roundAndPackFloat64( aSign
, aExp
, aSig0 STATUS_VAR
);
4644 /*----------------------------------------------------------------------------
4645 | Returns the result of converting the quadruple-precision floating-point
4646 | value `a' to the extended double-precision floating-point format. The
4647 | conversion is performed according to the IEC/IEEE Standard for Binary
4648 | Floating-Point Arithmetic.
4649 *----------------------------------------------------------------------------*/
4651 floatx80
float128_to_floatx80( float128 a STATUS_PARAM
)
4655 bits64 aSig0
, aSig1
;
4657 aSig1
= extractFloat128Frac1( a
);
4658 aSig0
= extractFloat128Frac0( a
);
4659 aExp
= extractFloat128Exp( a
);
4660 aSign
= extractFloat128Sign( a
);
4661 if ( aExp
== 0x7FFF ) {
4662 if ( aSig0
| aSig1
) {
4663 return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR
) );
4665 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
4668 if ( ( aSig0
| aSig1
) == 0 ) return packFloatx80( aSign
, 0, 0 );
4669 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
4672 aSig0
|= LIT64( 0x0001000000000000 );
4674 shortShift128Left( aSig0
, aSig1
, 15, &aSig0
, &aSig1
);
4675 return roundAndPackFloatx80( 80, aSign
, aExp
, aSig0
, aSig1 STATUS_VAR
);
4681 /*----------------------------------------------------------------------------
4682 | Rounds the quadruple-precision floating-point value `a' to an integer, and
4683 | returns the result as a quadruple-precision floating-point value. The
4684 | operation is performed according to the IEC/IEEE Standard for Binary
4685 | Floating-Point Arithmetic.
4686 *----------------------------------------------------------------------------*/
4688 float128
float128_round_to_int( float128 a STATUS_PARAM
)
4692 bits64 lastBitMask
, roundBitsMask
;
4696 aExp
= extractFloat128Exp( a
);
4697 if ( 0x402F <= aExp
) {
4698 if ( 0x406F <= aExp
) {
4699 if ( ( aExp
== 0x7FFF )
4700 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) )
4702 return propagateFloat128NaN( a
, a STATUS_VAR
);
4707 lastBitMask
= ( lastBitMask
<<( 0x406E - aExp
) )<<1;
4708 roundBitsMask
= lastBitMask
- 1;
4710 roundingMode
= STATUS(float_rounding_mode
);
4711 if ( roundingMode
== float_round_nearest_even
) {
4712 if ( lastBitMask
) {
4713 add128( z
.high
, z
.low
, 0, lastBitMask
>>1, &z
.high
, &z
.low
);
4714 if ( ( z
.low
& roundBitsMask
) == 0 ) z
.low
&= ~ lastBitMask
;
4717 if ( (sbits64
) z
.low
< 0 ) {
4719 if ( (bits64
) ( z
.low
<<1 ) == 0 ) z
.high
&= ~1;
4723 else if ( roundingMode
!= float_round_to_zero
) {
4724 if ( extractFloat128Sign( z
)
4725 ^ ( roundingMode
== float_round_up
) ) {
4726 add128( z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
4729 z
.low
&= ~ roundBitsMask
;
4732 if ( aExp
< 0x3FFF ) {
4733 if ( ( ( (bits64
) ( a
.high
<<1 ) ) | a
.low
) == 0 ) return a
;
4734 STATUS(float_exception_flags
) |= float_flag_inexact
;
4735 aSign
= extractFloat128Sign( a
);
4736 switch ( STATUS(float_rounding_mode
) ) {
4737 case float_round_nearest_even
:
4738 if ( ( aExp
== 0x3FFE )
4739 && ( extractFloat128Frac0( a
)
4740 | extractFloat128Frac1( a
) )
4742 return packFloat128( aSign
, 0x3FFF, 0, 0 );
4745 case float_round_down
:
4747 aSign
? packFloat128( 1, 0x3FFF, 0, 0 )
4748 : packFloat128( 0, 0, 0, 0 );
4749 case float_round_up
:
4751 aSign
? packFloat128( 1, 0, 0, 0 )
4752 : packFloat128( 0, 0x3FFF, 0, 0 );
4754 return packFloat128( aSign
, 0, 0, 0 );
4757 lastBitMask
<<= 0x402F - aExp
;
4758 roundBitsMask
= lastBitMask
- 1;
4761 roundingMode
= STATUS(float_rounding_mode
);
4762 if ( roundingMode
== float_round_nearest_even
) {
4763 z
.high
+= lastBitMask
>>1;
4764 if ( ( ( z
.high
& roundBitsMask
) | a
.low
) == 0 ) {
4765 z
.high
&= ~ lastBitMask
;
4768 else if ( roundingMode
!= float_round_to_zero
) {
4769 if ( extractFloat128Sign( z
)
4770 ^ ( roundingMode
== float_round_up
) ) {
4771 z
.high
|= ( a
.low
!= 0 );
4772 z
.high
+= roundBitsMask
;
4775 z
.high
&= ~ roundBitsMask
;
4777 if ( ( z
.low
!= a
.low
) || ( z
.high
!= a
.high
) ) {
4778 STATUS(float_exception_flags
) |= float_flag_inexact
;
4784 /*----------------------------------------------------------------------------
4785 | Returns the result of adding the absolute values of the quadruple-precision
4786 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
4787 | before being returned. `zSign' is ignored if the result is a NaN.
4788 | The addition is performed according to the IEC/IEEE Standard for Binary
4789 | Floating-Point Arithmetic.
4790 *----------------------------------------------------------------------------*/
4792 static float128
addFloat128Sigs( float128 a
, float128 b
, flag zSign STATUS_PARAM
)
4794 int32 aExp
, bExp
, zExp
;
4795 bits64 aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
4798 aSig1
= extractFloat128Frac1( a
);
4799 aSig0
= extractFloat128Frac0( a
);
4800 aExp
= extractFloat128Exp( a
);
4801 bSig1
= extractFloat128Frac1( b
);
4802 bSig0
= extractFloat128Frac0( b
);
4803 bExp
= extractFloat128Exp( b
);
4804 expDiff
= aExp
- bExp
;
4805 if ( 0 < expDiff
) {
4806 if ( aExp
== 0x7FFF ) {
4807 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
4814 bSig0
|= LIT64( 0x0001000000000000 );
4816 shift128ExtraRightJamming(
4817 bSig0
, bSig1
, 0, expDiff
, &bSig0
, &bSig1
, &zSig2
);
4820 else if ( expDiff
< 0 ) {
4821 if ( bExp
== 0x7FFF ) {
4822 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
4823 return packFloat128( zSign
, 0x7FFF, 0, 0 );
4829 aSig0
|= LIT64( 0x0001000000000000 );
4831 shift128ExtraRightJamming(
4832 aSig0
, aSig1
, 0, - expDiff
, &aSig0
, &aSig1
, &zSig2
);
4836 if ( aExp
== 0x7FFF ) {
4837 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
4838 return propagateFloat128NaN( a
, b STATUS_VAR
);
4842 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
4844 if ( STATUS(flush_to_zero
) ) return packFloat128( zSign
, 0, 0, 0 );
4845 return packFloat128( zSign
, 0, zSig0
, zSig1
);
4848 zSig0
|= LIT64( 0x0002000000000000 );
4852 aSig0
|= LIT64( 0x0001000000000000 );
4853 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
4855 if ( zSig0
< LIT64( 0x0002000000000000 ) ) goto roundAndPack
;
4858 shift128ExtraRightJamming(
4859 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
4861 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2 STATUS_VAR
);
4865 /*----------------------------------------------------------------------------
4866 | Returns the result of subtracting the absolute values of the quadruple-
4867 | precision floating-point values `a' and `b'. If `zSign' is 1, the
4868 | difference is negated before being returned. `zSign' is ignored if the
4869 | result is a NaN. The subtraction is performed according to the IEC/IEEE
4870 | Standard for Binary Floating-Point Arithmetic.
4871 *----------------------------------------------------------------------------*/
4873 static float128
subFloat128Sigs( float128 a
, float128 b
, flag zSign STATUS_PARAM
)
4875 int32 aExp
, bExp
, zExp
;
4876 bits64 aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
;
4880 aSig1
= extractFloat128Frac1( a
);
4881 aSig0
= extractFloat128Frac0( a
);
4882 aExp
= extractFloat128Exp( a
);
4883 bSig1
= extractFloat128Frac1( b
);
4884 bSig0
= extractFloat128Frac0( b
);
4885 bExp
= extractFloat128Exp( b
);
4886 expDiff
= aExp
- bExp
;
4887 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
4888 shortShift128Left( bSig0
, bSig1
, 14, &bSig0
, &bSig1
);
4889 if ( 0 < expDiff
) goto aExpBigger
;
4890 if ( expDiff
< 0 ) goto bExpBigger
;
4891 if ( aExp
== 0x7FFF ) {
4892 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
4893 return propagateFloat128NaN( a
, b STATUS_VAR
);
4895 float_raise( float_flag_invalid STATUS_VAR
);
4896 z
.low
= float128_default_nan_low
;
4897 z
.high
= float128_default_nan_high
;
4904 if ( bSig0
< aSig0
) goto aBigger
;
4905 if ( aSig0
< bSig0
) goto bBigger
;
4906 if ( bSig1
< aSig1
) goto aBigger
;
4907 if ( aSig1
< bSig1
) goto bBigger
;
4908 return packFloat128( STATUS(float_rounding_mode
) == float_round_down
, 0, 0, 0 );
4910 if ( bExp
== 0x7FFF ) {
4911 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
4912 return packFloat128( zSign
^ 1, 0x7FFF, 0, 0 );
4918 aSig0
|= LIT64( 0x4000000000000000 );
4920 shift128RightJamming( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
4921 bSig0
|= LIT64( 0x4000000000000000 );
4923 sub128( bSig0
, bSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
4926 goto normalizeRoundAndPack
;
4928 if ( aExp
== 0x7FFF ) {
4929 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
4936 bSig0
|= LIT64( 0x4000000000000000 );
4938 shift128RightJamming( bSig0
, bSig1
, expDiff
, &bSig0
, &bSig1
);
4939 aSig0
|= LIT64( 0x4000000000000000 );
4941 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
4943 normalizeRoundAndPack
:
4945 return normalizeRoundAndPackFloat128( zSign
, zExp
- 14, zSig0
, zSig1 STATUS_VAR
);
4949 /*----------------------------------------------------------------------------
4950 | Returns the result of adding the quadruple-precision floating-point values
4951 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
4952 | for Binary Floating-Point Arithmetic.
4953 *----------------------------------------------------------------------------*/
4955 float128
float128_add( float128 a
, float128 b STATUS_PARAM
)
4959 aSign
= extractFloat128Sign( a
);
4960 bSign
= extractFloat128Sign( b
);
4961 if ( aSign
== bSign
) {
4962 return addFloat128Sigs( a
, b
, aSign STATUS_VAR
);
4965 return subFloat128Sigs( a
, b
, aSign STATUS_VAR
);
4970 /*----------------------------------------------------------------------------
4971 | Returns the result of subtracting the quadruple-precision floating-point
4972 | values `a' and `b'. The operation is performed according to the IEC/IEEE
4973 | Standard for Binary Floating-Point Arithmetic.
4974 *----------------------------------------------------------------------------*/
4976 float128
float128_sub( float128 a
, float128 b STATUS_PARAM
)
4980 aSign
= extractFloat128Sign( a
);
4981 bSign
= extractFloat128Sign( b
);
4982 if ( aSign
== bSign
) {
4983 return subFloat128Sigs( a
, b
, aSign STATUS_VAR
);
4986 return addFloat128Sigs( a
, b
, aSign STATUS_VAR
);
4991 /*----------------------------------------------------------------------------
4992 | Returns the result of multiplying the quadruple-precision floating-point
4993 | values `a' and `b'. The operation is performed according to the IEC/IEEE
4994 | Standard for Binary Floating-Point Arithmetic.
4995 *----------------------------------------------------------------------------*/
4997 float128
float128_mul( float128 a
, float128 b STATUS_PARAM
)
4999 flag aSign
, bSign
, zSign
;
5000 int32 aExp
, bExp
, zExp
;
5001 bits64 aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
, zSig3
;
5004 aSig1
= extractFloat128Frac1( a
);
5005 aSig0
= extractFloat128Frac0( a
);
5006 aExp
= extractFloat128Exp( a
);
5007 aSign
= extractFloat128Sign( a
);
5008 bSig1
= extractFloat128Frac1( b
);
5009 bSig0
= extractFloat128Frac0( b
);
5010 bExp
= extractFloat128Exp( b
);
5011 bSign
= extractFloat128Sign( b
);
5012 zSign
= aSign
^ bSign
;
5013 if ( aExp
== 0x7FFF ) {
5014 if ( ( aSig0
| aSig1
)
5015 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
5016 return propagateFloat128NaN( a
, b STATUS_VAR
);
5018 if ( ( bExp
| bSig0
| bSig1
) == 0 ) goto invalid
;
5019 return packFloat128( zSign
, 0x7FFF, 0, 0 );
5021 if ( bExp
== 0x7FFF ) {
5022 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
5023 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
5025 float_raise( float_flag_invalid STATUS_VAR
);
5026 z
.low
= float128_default_nan_low
;
5027 z
.high
= float128_default_nan_high
;
5030 return packFloat128( zSign
, 0x7FFF, 0, 0 );
5033 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
5034 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
5037 if ( ( bSig0
| bSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
5038 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
5040 zExp
= aExp
+ bExp
- 0x4000;
5041 aSig0
|= LIT64( 0x0001000000000000 );
5042 shortShift128Left( bSig0
, bSig1
, 16, &bSig0
, &bSig1
);
5043 mul128To256( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
, &zSig2
, &zSig3
);
5044 add128( zSig0
, zSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
5045 zSig2
|= ( zSig3
!= 0 );
5046 if ( LIT64( 0x0002000000000000 ) <= zSig0
) {
5047 shift128ExtraRightJamming(
5048 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
5051 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2 STATUS_VAR
);
5055 /*----------------------------------------------------------------------------
5056 | Returns the result of dividing the quadruple-precision floating-point value
5057 | `a' by the corresponding value `b'. The operation is performed according to
5058 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5059 *----------------------------------------------------------------------------*/
5061 float128
float128_div( float128 a
, float128 b STATUS_PARAM
)
5063 flag aSign
, bSign
, zSign
;
5064 int32 aExp
, bExp
, zExp
;
5065 bits64 aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
5066 bits64 rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
5069 aSig1
= extractFloat128Frac1( a
);
5070 aSig0
= extractFloat128Frac0( a
);
5071 aExp
= extractFloat128Exp( a
);
5072 aSign
= extractFloat128Sign( a
);
5073 bSig1
= extractFloat128Frac1( b
);
5074 bSig0
= extractFloat128Frac0( b
);
5075 bExp
= extractFloat128Exp( b
);
5076 bSign
= extractFloat128Sign( b
);
5077 zSign
= aSign
^ bSign
;
5078 if ( aExp
== 0x7FFF ) {
5079 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
5080 if ( bExp
== 0x7FFF ) {
5081 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
5084 return packFloat128( zSign
, 0x7FFF, 0, 0 );
5086 if ( bExp
== 0x7FFF ) {
5087 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
5088 return packFloat128( zSign
, 0, 0, 0 );
5091 if ( ( bSig0
| bSig1
) == 0 ) {
5092 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
5094 float_raise( float_flag_invalid STATUS_VAR
);
5095 z
.low
= float128_default_nan_low
;
5096 z
.high
= float128_default_nan_high
;
5099 float_raise( float_flag_divbyzero STATUS_VAR
);
5100 return packFloat128( zSign
, 0x7FFF, 0, 0 );
5102 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
5105 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
5106 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
5108 zExp
= aExp
- bExp
+ 0x3FFD;
5110 aSig0
| LIT64( 0x0001000000000000 ), aSig1
, 15, &aSig0
, &aSig1
);
5112 bSig0
| LIT64( 0x0001000000000000 ), bSig1
, 15, &bSig0
, &bSig1
);
5113 if ( le128( bSig0
, bSig1
, aSig0
, aSig1
) ) {
5114 shift128Right( aSig0
, aSig1
, 1, &aSig0
, &aSig1
);
5117 zSig0
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
5118 mul128By64To192( bSig0
, bSig1
, zSig0
, &term0
, &term1
, &term2
);
5119 sub192( aSig0
, aSig1
, 0, term0
, term1
, term2
, &rem0
, &rem1
, &rem2
);
5120 while ( (sbits64
) rem0
< 0 ) {
5122 add192( rem0
, rem1
, rem2
, 0, bSig0
, bSig1
, &rem0
, &rem1
, &rem2
);
5124 zSig1
= estimateDiv128To64( rem1
, rem2
, bSig0
);
5125 if ( ( zSig1
& 0x3FFF ) <= 4 ) {
5126 mul128By64To192( bSig0
, bSig1
, zSig1
, &term1
, &term2
, &term3
);
5127 sub192( rem1
, rem2
, 0, term1
, term2
, term3
, &rem1
, &rem2
, &rem3
);
5128 while ( (sbits64
) rem1
< 0 ) {
5130 add192( rem1
, rem2
, rem3
, 0, bSig0
, bSig1
, &rem1
, &rem2
, &rem3
);
5132 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
5134 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 15, &zSig0
, &zSig1
, &zSig2
);
5135 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2 STATUS_VAR
);
5139 /*----------------------------------------------------------------------------
5140 | Returns the remainder of the quadruple-precision floating-point value `a'
5141 | with respect to the corresponding value `b'. The operation is performed
5142 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5143 *----------------------------------------------------------------------------*/
5145 float128
float128_rem( float128 a
, float128 b STATUS_PARAM
)
5147 flag aSign
, bSign
, zSign
;
5148 int32 aExp
, bExp
, expDiff
;
5149 bits64 aSig0
, aSig1
, bSig0
, bSig1
, q
, term0
, term1
, term2
;
5150 bits64 allZero
, alternateASig0
, alternateASig1
, sigMean1
;
5154 aSig1
= extractFloat128Frac1( a
);
5155 aSig0
= extractFloat128Frac0( a
);
5156 aExp
= extractFloat128Exp( a
);
5157 aSign
= extractFloat128Sign( a
);
5158 bSig1
= extractFloat128Frac1( b
);
5159 bSig0
= extractFloat128Frac0( b
);
5160 bExp
= extractFloat128Exp( b
);
5161 bSign
= extractFloat128Sign( b
);
5162 if ( aExp
== 0x7FFF ) {
5163 if ( ( aSig0
| aSig1
)
5164 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
5165 return propagateFloat128NaN( a
, b STATUS_VAR
);
5169 if ( bExp
== 0x7FFF ) {
5170 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
5174 if ( ( bSig0
| bSig1
) == 0 ) {
5176 float_raise( float_flag_invalid STATUS_VAR
);
5177 z
.low
= float128_default_nan_low
;
5178 z
.high
= float128_default_nan_high
;
5181 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
5184 if ( ( aSig0
| aSig1
) == 0 ) return a
;
5185 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
5187 expDiff
= aExp
- bExp
;
5188 if ( expDiff
< -1 ) return a
;
5190 aSig0
| LIT64( 0x0001000000000000 ),
5192 15 - ( expDiff
< 0 ),
5197 bSig0
| LIT64( 0x0001000000000000 ), bSig1
, 15, &bSig0
, &bSig1
);
5198 q
= le128( bSig0
, bSig1
, aSig0
, aSig1
);
5199 if ( q
) sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
5201 while ( 0 < expDiff
) {
5202 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
5203 q
= ( 4 < q
) ? q
- 4 : 0;
5204 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
5205 shortShift192Left( term0
, term1
, term2
, 61, &term1
, &term2
, &allZero
);
5206 shortShift128Left( aSig0
, aSig1
, 61, &aSig0
, &allZero
);
5207 sub128( aSig0
, 0, term1
, term2
, &aSig0
, &aSig1
);
5210 if ( -64 < expDiff
) {
5211 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
5212 q
= ( 4 < q
) ? q
- 4 : 0;
5214 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
5216 if ( expDiff
< 0 ) {
5217 shift128Right( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
5220 shortShift128Left( aSig0
, aSig1
, expDiff
, &aSig0
, &aSig1
);
5222 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
5223 sub128( aSig0
, aSig1
, term1
, term2
, &aSig0
, &aSig1
);
5226 shift128Right( aSig0
, aSig1
, 12, &aSig0
, &aSig1
);
5227 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
5230 alternateASig0
= aSig0
;
5231 alternateASig1
= aSig1
;
5233 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
5234 } while ( 0 <= (sbits64
) aSig0
);
5236 aSig0
, aSig1
, alternateASig0
, alternateASig1
, (bits64
*)&sigMean0
, &sigMean1
);
5237 if ( ( sigMean0
< 0 )
5238 || ( ( ( sigMean0
| sigMean1
) == 0 ) && ( q
& 1 ) ) ) {
5239 aSig0
= alternateASig0
;
5240 aSig1
= alternateASig1
;
5242 zSign
= ( (sbits64
) aSig0
< 0 );
5243 if ( zSign
) sub128( 0, 0, aSig0
, aSig1
, &aSig0
, &aSig1
);
5245 normalizeRoundAndPackFloat128( aSign
^ zSign
, bExp
- 4, aSig0
, aSig1 STATUS_VAR
);
5249 /*----------------------------------------------------------------------------
5250 | Returns the square root of the quadruple-precision floating-point value `a'.
5251 | The operation is performed according to the IEC/IEEE Standard for Binary
5252 | Floating-Point Arithmetic.
5253 *----------------------------------------------------------------------------*/
5255 float128
float128_sqrt( float128 a STATUS_PARAM
)
5259 bits64 aSig0
, aSig1
, zSig0
, zSig1
, zSig2
, doubleZSig0
;
5260 bits64 rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
5263 aSig1
= extractFloat128Frac1( a
);
5264 aSig0
= extractFloat128Frac0( a
);
5265 aExp
= extractFloat128Exp( a
);
5266 aSign
= extractFloat128Sign( a
);
5267 if ( aExp
== 0x7FFF ) {
5268 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, a STATUS_VAR
);
5269 if ( ! aSign
) return a
;
5273 if ( ( aExp
| aSig0
| aSig1
) == 0 ) return a
;
5275 float_raise( float_flag_invalid STATUS_VAR
);
5276 z
.low
= float128_default_nan_low
;
5277 z
.high
= float128_default_nan_high
;
5281 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( 0, 0, 0, 0 );
5282 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
5284 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFE;
5285 aSig0
|= LIT64( 0x0001000000000000 );
5286 zSig0
= estimateSqrt32( aExp
, aSig0
>>17 );
5287 shortShift128Left( aSig0
, aSig1
, 13 - ( aExp
& 1 ), &aSig0
, &aSig1
);
5288 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
5289 doubleZSig0
= zSig0
<<1;
5290 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
5291 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
5292 while ( (sbits64
) rem0
< 0 ) {
5295 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
5297 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
5298 if ( ( zSig1
& 0x1FFF ) <= 5 ) {
5299 if ( zSig1
== 0 ) zSig1
= 1;
5300 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
5301 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
5302 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
5303 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
5304 while ( (sbits64
) rem1
< 0 ) {
5306 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
5308 term2
|= doubleZSig0
;
5309 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
5311 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
5313 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 14, &zSig0
, &zSig1
, &zSig2
);
5314 return roundAndPackFloat128( 0, zExp
, zSig0
, zSig1
, zSig2 STATUS_VAR
);
5318 /*----------------------------------------------------------------------------
5319 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5320 | the corresponding value `b', and 0 otherwise. The comparison is performed
5321 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5322 *----------------------------------------------------------------------------*/
5324 int float128_eq( float128 a
, float128 b STATUS_PARAM
)
5327 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5328 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5329 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5330 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5332 if ( float128_is_signaling_nan( a
)
5333 || float128_is_signaling_nan( b
) ) {
5334 float_raise( float_flag_invalid STATUS_VAR
);
5340 && ( ( a
.high
== b
.high
)
5342 && ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
5347 /*----------------------------------------------------------------------------
5348 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5349 | or equal to the corresponding value `b', and 0 otherwise. The comparison
5350 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5352 *----------------------------------------------------------------------------*/
5354 int float128_le( float128 a
, float128 b STATUS_PARAM
)
5358 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5359 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5360 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5361 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5363 float_raise( float_flag_invalid STATUS_VAR
);
5366 aSign
= extractFloat128Sign( a
);
5367 bSign
= extractFloat128Sign( b
);
5368 if ( aSign
!= bSign
) {
5371 || ( ( ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5375 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
5376 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
5380 /*----------------------------------------------------------------------------
5381 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5382 | the corresponding value `b', and 0 otherwise. The comparison is performed
5383 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5384 *----------------------------------------------------------------------------*/
5386 int float128_lt( float128 a
, float128 b STATUS_PARAM
)
5390 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5391 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5392 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5393 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5395 float_raise( float_flag_invalid STATUS_VAR
);
5398 aSign
= extractFloat128Sign( a
);
5399 bSign
= extractFloat128Sign( b
);
5400 if ( aSign
!= bSign
) {
5403 && ( ( ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5407 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
5408 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
5412 /*----------------------------------------------------------------------------
5413 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5414 | the corresponding value `b', and 0 otherwise. The invalid exception is
5415 | raised if either operand is a NaN. Otherwise, the comparison is performed
5416 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5417 *----------------------------------------------------------------------------*/
5419 int float128_eq_signaling( float128 a
, float128 b STATUS_PARAM
)
5422 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5423 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5424 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5425 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5427 float_raise( float_flag_invalid STATUS_VAR
);
5432 && ( ( a
.high
== b
.high
)
5434 && ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
5439 /*----------------------------------------------------------------------------
5440 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5441 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5442 | cause an exception. Otherwise, the comparison is performed according to the
5443 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5444 *----------------------------------------------------------------------------*/
5446 int float128_le_quiet( float128 a
, float128 b STATUS_PARAM
)
5450 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5451 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5452 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5453 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5455 if ( float128_is_signaling_nan( a
)
5456 || float128_is_signaling_nan( b
) ) {
5457 float_raise( float_flag_invalid STATUS_VAR
);
5461 aSign
= extractFloat128Sign( a
);
5462 bSign
= extractFloat128Sign( b
);
5463 if ( aSign
!= bSign
) {
5466 || ( ( ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5470 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
5471 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
5475 /*----------------------------------------------------------------------------
5476 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5477 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5478 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
5479 | Standard for Binary Floating-Point Arithmetic.
5480 *----------------------------------------------------------------------------*/
5482 int float128_lt_quiet( float128 a
, float128 b STATUS_PARAM
)
5486 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5487 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5488 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5489 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5491 if ( float128_is_signaling_nan( a
)
5492 || float128_is_signaling_nan( b
) ) {
5493 float_raise( float_flag_invalid STATUS_VAR
);
5497 aSign
= extractFloat128Sign( a
);
5498 bSign
= extractFloat128Sign( b
);
5499 if ( aSign
!= bSign
) {
5502 && ( ( ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5506 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
5507 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
5513 /* misc functions */
5514 float32
uint32_to_float32( unsigned int a STATUS_PARAM
)
5516 return int64_to_float32(a STATUS_VAR
);
5519 float64
uint32_to_float64( unsigned int a STATUS_PARAM
)
5521 return int64_to_float64(a STATUS_VAR
);
5524 unsigned int float32_to_uint32( float32 a STATUS_PARAM
)
5529 v
= float32_to_int64(a STATUS_VAR
);
5532 float_raise( float_flag_invalid STATUS_VAR
);
5533 } else if (v
> 0xffffffff) {
5535 float_raise( float_flag_invalid STATUS_VAR
);
5542 unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM
)
5547 v
= float32_to_int64_round_to_zero(a STATUS_VAR
);
5550 float_raise( float_flag_invalid STATUS_VAR
);
5551 } else if (v
> 0xffffffff) {
5553 float_raise( float_flag_invalid STATUS_VAR
);
5560 unsigned int float64_to_uint32( float64 a STATUS_PARAM
)
5565 v
= float64_to_int64(a STATUS_VAR
);
5568 float_raise( float_flag_invalid STATUS_VAR
);
5569 } else if (v
> 0xffffffff) {
5571 float_raise( float_flag_invalid STATUS_VAR
);
5578 unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM
)
5583 v
= float64_to_int64_round_to_zero(a STATUS_VAR
);
5586 float_raise( float_flag_invalid STATUS_VAR
);
5587 } else if (v
> 0xffffffff) {
5589 float_raise( float_flag_invalid STATUS_VAR
);
5596 /* FIXME: This looks broken. */
5597 uint64_t float64_to_uint64 (float64 a STATUS_PARAM
)
5601 v
= float64_val(int64_to_float64(INT64_MIN STATUS_VAR
));
5602 v
+= float64_val(a
);
5603 v
= float64_to_int64(make_float64(v
) STATUS_VAR
);
5605 return v
- INT64_MIN
;
5608 uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM
)
5612 v
= float64_val(int64_to_float64(INT64_MIN STATUS_VAR
));
5613 v
+= float64_val(a
);
5614 v
= float64_to_int64_round_to_zero(make_float64(v
) STATUS_VAR
);
5616 return v
- INT64_MIN
;
5619 #define COMPARE(s, nan_exp) \
5620 INLINE int float ## s ## _compare_internal( float ## s a, float ## s b, \
5621 int is_quiet STATUS_PARAM ) \
5623 flag aSign, bSign; \
5626 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \
5627 extractFloat ## s ## Frac( a ) ) || \
5628 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \
5629 extractFloat ## s ## Frac( b ) )) { \
5631 float ## s ## _is_signaling_nan( a ) || \
5632 float ## s ## _is_signaling_nan( b ) ) { \
5633 float_raise( float_flag_invalid STATUS_VAR); \
5635 return float_relation_unordered; \
5637 aSign = extractFloat ## s ## Sign( a ); \
5638 bSign = extractFloat ## s ## Sign( b ); \
5639 av = float ## s ## _val(a); \
5640 bv = float ## s ## _val(b); \
5641 if ( aSign != bSign ) { \
5642 if ( (bits ## s) ( ( av | bv )<<1 ) == 0 ) { \
5644 return float_relation_equal; \
5646 return 1 - (2 * aSign); \
5650 return float_relation_equal; \
5652 return 1 - 2 * (aSign ^ ( av < bv )); \
5657 int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM ) \
5659 return float ## s ## _compare_internal(a, b, 0 STATUS_VAR); \
5662 int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM ) \
5664 return float ## s ## _compare_internal(a, b, 1 STATUS_VAR); \
5670 INLINE
int float128_compare_internal( float128 a
, float128 b
,
5671 int is_quiet STATUS_PARAM
)
5675 if (( ( extractFloat128Exp( a
) == 0x7fff ) &&
5676 ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) ) ||
5677 ( ( extractFloat128Exp( b
) == 0x7fff ) &&
5678 ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )) {
5680 float128_is_signaling_nan( a
) ||
5681 float128_is_signaling_nan( b
) ) {
5682 float_raise( float_flag_invalid STATUS_VAR
);
5684 return float_relation_unordered
;
5686 aSign
= extractFloat128Sign( a
);
5687 bSign
= extractFloat128Sign( b
);
5688 if ( aSign
!= bSign
) {
5689 if ( ( ( ( a
.high
| b
.high
)<<1 ) | a
.low
| b
.low
) == 0 ) {
5691 return float_relation_equal
;
5693 return 1 - (2 * aSign
);
5696 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
5697 return float_relation_equal
;
5699 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
5704 int float128_compare( float128 a
, float128 b STATUS_PARAM
)
5706 return float128_compare_internal(a
, b
, 0 STATUS_VAR
);
5709 int float128_compare_quiet( float128 a
, float128 b STATUS_PARAM
)
5711 return float128_compare_internal(a
, b
, 1 STATUS_VAR
);
5714 /* Multiply A by 2 raised to the power N. */
5715 float32
float32_scalbn( float32 a
, int n STATUS_PARAM
)
5721 aSig
= extractFloat32Frac( a
);
5722 aExp
= extractFloat32Exp( a
);
5723 aSign
= extractFloat32Sign( a
);
5725 if ( aExp
== 0xFF ) {
5730 else if ( aSig
== 0 )
5735 return normalizeRoundAndPackFloat32( aSign
, aExp
, aSig STATUS_VAR
);
5738 float64
float64_scalbn( float64 a
, int n STATUS_PARAM
)
5744 aSig
= extractFloat64Frac( a
);
5745 aExp
= extractFloat64Exp( a
);
5746 aSign
= extractFloat64Sign( a
);
5748 if ( aExp
== 0x7FF ) {
5752 aSig
|= LIT64( 0x0010000000000000 );
5753 else if ( aSig
== 0 )
5758 return normalizeRoundAndPackFloat64( aSign
, aExp
, aSig STATUS_VAR
);
5762 floatx80
floatx80_scalbn( floatx80 a
, int n STATUS_PARAM
)
5768 aSig
= extractFloatx80Frac( a
);
5769 aExp
= extractFloatx80Exp( a
);
5770 aSign
= extractFloatx80Sign( a
);
5772 if ( aExp
== 0x7FF ) {
5775 if (aExp
== 0 && aSig
== 0)
5779 return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision
),
5780 aSign
, aExp
, aSig
, 0 STATUS_VAR
);
5785 float128
float128_scalbn( float128 a
, int n STATUS_PARAM
)
5789 bits64 aSig0
, aSig1
;
5791 aSig1
= extractFloat128Frac1( a
);
5792 aSig0
= extractFloat128Frac0( a
);
5793 aExp
= extractFloat128Exp( a
);
5794 aSign
= extractFloat128Sign( a
);
5795 if ( aExp
== 0x7FFF ) {
5799 aSig0
|= LIT64( 0x0001000000000000 );
5800 else if ( aSig0
== 0 && aSig1
== 0 )
5804 return normalizeRoundAndPackFloat128( aSign
, aExp
, aSig0
, aSig1