2 /*============================================================================
4 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
7 Written by John R. Hauser. This work was made possible in part by the
8 International Computer Science Institute, located at Suite 600, 1947 Center
9 Street, Berkeley, California 94704. Funding was partially provided by the
10 National Science Foundation under grant MIP-9311980. The original version
11 of this code was written as part of a project to build a fixed-point vector
12 processor in collaboration with the University of California at Berkeley,
13 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
14 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
15 arithmetic/SoftFloat.html'.
17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
18 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
19 RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
20 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
21 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
22 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
23 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
24 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
26 Derivative works are acceptable, even for commercial purposes, so long as
27 (1) the source code for the derivative work includes prominent notice that
28 the work is derivative, and (2) the source code includes prominent notice with
29 these four paragraphs for those parts of this code that are retained.
31 =============================================================================*/
33 #include "softfloat.h"
35 /*----------------------------------------------------------------------------
36 | Primitive arithmetic functions, including multi-word arithmetic, and
37 | division and square root approximations. (Can be specialized to target if
39 *----------------------------------------------------------------------------*/
40 #include "softfloat-macros.h"
42 /*----------------------------------------------------------------------------
43 | Functions and definitions to determine: (1) whether tininess for underflow
44 | is detected before or after rounding by default, (2) what (if anything)
45 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
46 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
47 | are propagated from function inputs to output. These details are target-
49 *----------------------------------------------------------------------------*/
50 #include "softfloat-specialize.h"
52 void set_float_rounding_mode(int val STATUS_PARAM
)
54 STATUS(float_rounding_mode
) = val
;
57 void set_float_exception_flags(int val STATUS_PARAM
)
59 STATUS(float_exception_flags
) = val
;
63 void set_floatx80_rounding_precision(int val STATUS_PARAM
)
65 STATUS(floatx80_rounding_precision
) = val
;
69 /*----------------------------------------------------------------------------
70 | Returns the fraction bits of the half-precision floating-point value `a'.
71 *----------------------------------------------------------------------------*/
73 INLINE
uint32_t extractFloat16Frac(float16 a
)
75 return float16_val(a
) & 0x3ff;
78 /*----------------------------------------------------------------------------
79 | Returns the exponent bits of the half-precision floating-point value `a'.
80 *----------------------------------------------------------------------------*/
82 INLINE int16
extractFloat16Exp(float16 a
)
84 return (float16_val(a
) >> 10) & 0x1f;
87 /*----------------------------------------------------------------------------
88 | Returns the sign bit of the single-precision floating-point value `a'.
89 *----------------------------------------------------------------------------*/
91 INLINE flag
extractFloat16Sign(float16 a
)
93 return float16_val(a
)>>15;
96 /*----------------------------------------------------------------------------
97 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
98 | and 7, and returns the properly rounded 32-bit integer corresponding to the
99 | input. If `zSign' is 1, the input is negated before being converted to an
100 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
101 | is simply rounded to an integer, with the inexact exception raised if the
102 | input cannot be represented exactly as an integer. However, if the fixed-
103 | point input is too large, the invalid exception is raised and the largest
104 | positive or negative integer is returned.
105 *----------------------------------------------------------------------------*/
107 static int32
roundAndPackInt32( flag zSign
, bits64 absZ STATUS_PARAM
)
110 flag roundNearestEven
;
111 int8 roundIncrement
, roundBits
;
114 roundingMode
= STATUS(float_rounding_mode
);
115 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
116 roundIncrement
= 0x40;
117 if ( ! roundNearestEven
) {
118 if ( roundingMode
== float_round_to_zero
) {
122 roundIncrement
= 0x7F;
124 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
127 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
131 roundBits
= absZ
& 0x7F;
132 absZ
= ( absZ
+ roundIncrement
)>>7;
133 absZ
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
135 if ( zSign
) z
= - z
;
136 if ( ( absZ
>>32 ) || ( z
&& ( ( z
< 0 ) ^ zSign
) ) ) {
137 float_raise( float_flag_invalid STATUS_VAR
);
138 return zSign
? (sbits32
) 0x80000000 : 0x7FFFFFFF;
140 if ( roundBits
) STATUS(float_exception_flags
) |= float_flag_inexact
;
145 /*----------------------------------------------------------------------------
146 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
147 | `absZ1', with binary point between bits 63 and 64 (between the input words),
148 | and returns the properly rounded 64-bit integer corresponding to the input.
149 | If `zSign' is 1, the input is negated before being converted to an integer.
150 | Ordinarily, the fixed-point input is simply rounded to an integer, with
151 | the inexact exception raised if the input cannot be represented exactly as
152 | an integer. However, if the fixed-point input is too large, the invalid
153 | exception is raised and the largest positive or negative integer is
155 *----------------------------------------------------------------------------*/
157 static int64
roundAndPackInt64( flag zSign
, bits64 absZ0
, bits64 absZ1 STATUS_PARAM
)
160 flag roundNearestEven
, increment
;
163 roundingMode
= STATUS(float_rounding_mode
);
164 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
165 increment
= ( (sbits64
) absZ1
< 0 );
166 if ( ! roundNearestEven
) {
167 if ( roundingMode
== float_round_to_zero
) {
172 increment
= ( roundingMode
== float_round_down
) && absZ1
;
175 increment
= ( roundingMode
== float_round_up
) && absZ1
;
181 if ( absZ0
== 0 ) goto overflow
;
182 absZ0
&= ~ ( ( (bits64
) ( absZ1
<<1 ) == 0 ) & roundNearestEven
);
185 if ( zSign
) z
= - z
;
186 if ( z
&& ( ( z
< 0 ) ^ zSign
) ) {
188 float_raise( float_flag_invalid STATUS_VAR
);
190 zSign
? (sbits64
) LIT64( 0x8000000000000000 )
191 : LIT64( 0x7FFFFFFFFFFFFFFF );
193 if ( absZ1
) STATUS(float_exception_flags
) |= float_flag_inexact
;
198 /*----------------------------------------------------------------------------
199 | Returns the fraction bits of the single-precision floating-point value `a'.
200 *----------------------------------------------------------------------------*/
202 INLINE bits32
extractFloat32Frac( float32 a
)
205 return float32_val(a
) & 0x007FFFFF;
209 /*----------------------------------------------------------------------------
210 | Returns the exponent bits of the single-precision floating-point value `a'.
211 *----------------------------------------------------------------------------*/
213 INLINE int16
extractFloat32Exp( float32 a
)
216 return ( float32_val(a
)>>23 ) & 0xFF;
220 /*----------------------------------------------------------------------------
221 | Returns the sign bit of the single-precision floating-point value `a'.
222 *----------------------------------------------------------------------------*/
224 INLINE flag
extractFloat32Sign( float32 a
)
227 return float32_val(a
)>>31;
231 /*----------------------------------------------------------------------------
232 | If `a' is denormal and we are in flush-to-zero mode then set the
233 | input-denormal exception and return zero. Otherwise just return the value.
234 *----------------------------------------------------------------------------*/
235 static float32
float32_squash_input_denormal(float32 a STATUS_PARAM
)
237 if (STATUS(flush_inputs_to_zero
)) {
238 if (extractFloat32Exp(a
) == 0 && extractFloat32Frac(a
) != 0) {
239 float_raise(float_flag_input_denormal STATUS_VAR
);
240 return make_float32(float32_val(a
) & 0x80000000);
246 /*----------------------------------------------------------------------------
247 | Normalizes the subnormal single-precision floating-point value represented
248 | by the denormalized significand `aSig'. The normalized exponent and
249 | significand are stored at the locations pointed to by `zExpPtr' and
250 | `zSigPtr', respectively.
251 *----------------------------------------------------------------------------*/
254 normalizeFloat32Subnormal( bits32 aSig
, int16
*zExpPtr
, bits32
*zSigPtr
)
258 shiftCount
= countLeadingZeros32( aSig
) - 8;
259 *zSigPtr
= aSig
<<shiftCount
;
260 *zExpPtr
= 1 - shiftCount
;
264 /*----------------------------------------------------------------------------
265 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
266 | single-precision floating-point value, returning the result. After being
267 | shifted into the proper positions, the three fields are simply added
268 | together to form the result. This means that any integer portion of `zSig'
269 | will be added into the exponent. Since a properly normalized significand
270 | will have an integer portion equal to 1, the `zExp' input should be 1 less
271 | than the desired result exponent whenever `zSig' is a complete, normalized
273 *----------------------------------------------------------------------------*/
275 INLINE float32
packFloat32( flag zSign
, int16 zExp
, bits32 zSig
)
279 ( ( (bits32
) zSign
)<<31 ) + ( ( (bits32
) zExp
)<<23 ) + zSig
);
283 /*----------------------------------------------------------------------------
284 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
285 | and significand `zSig', and returns the proper single-precision floating-
286 | point value corresponding to the abstract input. Ordinarily, the abstract
287 | value is simply rounded and packed into the single-precision format, with
288 | the inexact exception raised if the abstract input cannot be represented
289 | exactly. However, if the abstract value is too large, the overflow and
290 | inexact exceptions are raised and an infinity or maximal finite value is
291 | returned. If the abstract value is too small, the input value is rounded to
292 | a subnormal number, and the underflow and inexact exceptions are raised if
293 | the abstract input cannot be represented exactly as a subnormal single-
294 | precision floating-point number.
295 | The input significand `zSig' has its binary point between bits 30
296 | and 29, which is 7 bits to the left of the usual location. This shifted
297 | significand must be normalized or smaller. If `zSig' is not normalized,
298 | `zExp' must be 0; in that case, the result returned is a subnormal number,
299 | and it must not require rounding. In the usual case that `zSig' is
300 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
301 | The handling of underflow and overflow follows the IEC/IEEE Standard for
302 | Binary Floating-Point Arithmetic.
303 *----------------------------------------------------------------------------*/
305 static float32
roundAndPackFloat32( flag zSign
, int16 zExp
, bits32 zSig STATUS_PARAM
)
308 flag roundNearestEven
;
309 int8 roundIncrement
, roundBits
;
312 roundingMode
= STATUS(float_rounding_mode
);
313 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
314 roundIncrement
= 0x40;
315 if ( ! roundNearestEven
) {
316 if ( roundingMode
== float_round_to_zero
) {
320 roundIncrement
= 0x7F;
322 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
325 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
329 roundBits
= zSig
& 0x7F;
330 if ( 0xFD <= (bits16
) zExp
) {
332 || ( ( zExp
== 0xFD )
333 && ( (sbits32
) ( zSig
+ roundIncrement
) < 0 ) )
335 float_raise( float_flag_overflow
| float_flag_inexact STATUS_VAR
);
336 return packFloat32( zSign
, 0xFF, - ( roundIncrement
== 0 ));
339 if ( STATUS(flush_to_zero
) ) return packFloat32( zSign
, 0, 0 );
341 ( STATUS(float_detect_tininess
) == float_tininess_before_rounding
)
343 || ( zSig
+ roundIncrement
< 0x80000000 );
344 shift32RightJamming( zSig
, - zExp
, &zSig
);
346 roundBits
= zSig
& 0x7F;
347 if ( isTiny
&& roundBits
) float_raise( float_flag_underflow STATUS_VAR
);
350 if ( roundBits
) STATUS(float_exception_flags
) |= float_flag_inexact
;
351 zSig
= ( zSig
+ roundIncrement
)>>7;
352 zSig
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
353 if ( zSig
== 0 ) zExp
= 0;
354 return packFloat32( zSign
, zExp
, zSig
);
358 /*----------------------------------------------------------------------------
359 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
360 | and significand `zSig', and returns the proper single-precision floating-
361 | point value corresponding to the abstract input. This routine is just like
362 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
363 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
364 | floating-point exponent.
365 *----------------------------------------------------------------------------*/
368 normalizeRoundAndPackFloat32( flag zSign
, int16 zExp
, bits32 zSig STATUS_PARAM
)
372 shiftCount
= countLeadingZeros32( zSig
) - 1;
373 return roundAndPackFloat32( zSign
, zExp
- shiftCount
, zSig
<<shiftCount STATUS_VAR
);
377 /*----------------------------------------------------------------------------
378 | Returns the fraction bits of the double-precision floating-point value `a'.
379 *----------------------------------------------------------------------------*/
381 INLINE bits64
extractFloat64Frac( float64 a
)
384 return float64_val(a
) & LIT64( 0x000FFFFFFFFFFFFF );
388 /*----------------------------------------------------------------------------
389 | Returns the exponent bits of the double-precision floating-point value `a'.
390 *----------------------------------------------------------------------------*/
392 INLINE int16
extractFloat64Exp( float64 a
)
395 return ( float64_val(a
)>>52 ) & 0x7FF;
399 /*----------------------------------------------------------------------------
400 | Returns the sign bit of the double-precision floating-point value `a'.
401 *----------------------------------------------------------------------------*/
403 INLINE flag
extractFloat64Sign( float64 a
)
406 return float64_val(a
)>>63;
410 /*----------------------------------------------------------------------------
411 | If `a' is denormal and we are in flush-to-zero mode then set the
412 | input-denormal exception and return zero. Otherwise just return the value.
413 *----------------------------------------------------------------------------*/
414 static float64
float64_squash_input_denormal(float64 a STATUS_PARAM
)
416 if (STATUS(flush_inputs_to_zero
)) {
417 if (extractFloat64Exp(a
) == 0 && extractFloat64Frac(a
) != 0) {
418 float_raise(float_flag_input_denormal STATUS_VAR
);
419 return make_float64(float64_val(a
) & (1ULL << 63));
425 /*----------------------------------------------------------------------------
426 | Normalizes the subnormal double-precision floating-point value represented
427 | by the denormalized significand `aSig'. The normalized exponent and
428 | significand are stored at the locations pointed to by `zExpPtr' and
429 | `zSigPtr', respectively.
430 *----------------------------------------------------------------------------*/
433 normalizeFloat64Subnormal( bits64 aSig
, int16
*zExpPtr
, bits64
*zSigPtr
)
437 shiftCount
= countLeadingZeros64( aSig
) - 11;
438 *zSigPtr
= aSig
<<shiftCount
;
439 *zExpPtr
= 1 - shiftCount
;
443 /*----------------------------------------------------------------------------
444 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
445 | double-precision floating-point value, returning the result. After being
446 | shifted into the proper positions, the three fields are simply added
447 | together to form the result. This means that any integer portion of `zSig'
448 | will be added into the exponent. Since a properly normalized significand
449 | will have an integer portion equal to 1, the `zExp' input should be 1 less
450 | than the desired result exponent whenever `zSig' is a complete, normalized
452 *----------------------------------------------------------------------------*/
454 INLINE float64
packFloat64( flag zSign
, int16 zExp
, bits64 zSig
)
458 ( ( (bits64
) zSign
)<<63 ) + ( ( (bits64
) zExp
)<<52 ) + zSig
);
462 /*----------------------------------------------------------------------------
463 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
464 | and significand `zSig', and returns the proper double-precision floating-
465 | point value corresponding to the abstract input. Ordinarily, the abstract
466 | value is simply rounded and packed into the double-precision format, with
467 | the inexact exception raised if the abstract input cannot be represented
468 | exactly. However, if the abstract value is too large, the overflow and
469 | inexact exceptions are raised and an infinity or maximal finite value is
470 | returned. If the abstract value is too small, the input value is rounded
471 | to a subnormal number, and the underflow and inexact exceptions are raised
472 | if the abstract input cannot be represented exactly as a subnormal double-
473 | precision floating-point number.
474 | The input significand `zSig' has its binary point between bits 62
475 | and 61, which is 10 bits to the left of the usual location. This shifted
476 | significand must be normalized or smaller. If `zSig' is not normalized,
477 | `zExp' must be 0; in that case, the result returned is a subnormal number,
478 | and it must not require rounding. In the usual case that `zSig' is
479 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
480 | The handling of underflow and overflow follows the IEC/IEEE Standard for
481 | Binary Floating-Point Arithmetic.
482 *----------------------------------------------------------------------------*/
484 static float64
roundAndPackFloat64( flag zSign
, int16 zExp
, bits64 zSig STATUS_PARAM
)
487 flag roundNearestEven
;
488 int16 roundIncrement
, roundBits
;
491 roundingMode
= STATUS(float_rounding_mode
);
492 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
493 roundIncrement
= 0x200;
494 if ( ! roundNearestEven
) {
495 if ( roundingMode
== float_round_to_zero
) {
499 roundIncrement
= 0x3FF;
501 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
504 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
508 roundBits
= zSig
& 0x3FF;
509 if ( 0x7FD <= (bits16
) zExp
) {
510 if ( ( 0x7FD < zExp
)
511 || ( ( zExp
== 0x7FD )
512 && ( (sbits64
) ( zSig
+ roundIncrement
) < 0 ) )
514 float_raise( float_flag_overflow
| float_flag_inexact STATUS_VAR
);
515 return packFloat64( zSign
, 0x7FF, - ( roundIncrement
== 0 ));
518 if ( STATUS(flush_to_zero
) ) return packFloat64( zSign
, 0, 0 );
520 ( STATUS(float_detect_tininess
) == float_tininess_before_rounding
)
522 || ( zSig
+ roundIncrement
< LIT64( 0x8000000000000000 ) );
523 shift64RightJamming( zSig
, - zExp
, &zSig
);
525 roundBits
= zSig
& 0x3FF;
526 if ( isTiny
&& roundBits
) float_raise( float_flag_underflow STATUS_VAR
);
529 if ( roundBits
) STATUS(float_exception_flags
) |= float_flag_inexact
;
530 zSig
= ( zSig
+ roundIncrement
)>>10;
531 zSig
&= ~ ( ( ( roundBits
^ 0x200 ) == 0 ) & roundNearestEven
);
532 if ( zSig
== 0 ) zExp
= 0;
533 return packFloat64( zSign
, zExp
, zSig
);
537 /*----------------------------------------------------------------------------
538 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
539 | and significand `zSig', and returns the proper double-precision floating-
540 | point value corresponding to the abstract input. This routine is just like
541 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
542 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
543 | floating-point exponent.
544 *----------------------------------------------------------------------------*/
547 normalizeRoundAndPackFloat64( flag zSign
, int16 zExp
, bits64 zSig STATUS_PARAM
)
551 shiftCount
= countLeadingZeros64( zSig
) - 1;
552 return roundAndPackFloat64( zSign
, zExp
- shiftCount
, zSig
<<shiftCount STATUS_VAR
);
558 /*----------------------------------------------------------------------------
559 | Returns the fraction bits of the extended double-precision floating-point
561 *----------------------------------------------------------------------------*/
563 INLINE bits64
extractFloatx80Frac( floatx80 a
)
570 /*----------------------------------------------------------------------------
571 | Returns the exponent bits of the extended double-precision floating-point
573 *----------------------------------------------------------------------------*/
575 INLINE int32
extractFloatx80Exp( floatx80 a
)
578 return a
.high
& 0x7FFF;
582 /*----------------------------------------------------------------------------
583 | Returns the sign bit of the extended double-precision floating-point value
585 *----------------------------------------------------------------------------*/
587 INLINE flag
extractFloatx80Sign( floatx80 a
)
594 /*----------------------------------------------------------------------------
595 | Normalizes the subnormal extended double-precision floating-point value
596 | represented by the denormalized significand `aSig'. The normalized exponent
597 | and significand are stored at the locations pointed to by `zExpPtr' and
598 | `zSigPtr', respectively.
599 *----------------------------------------------------------------------------*/
602 normalizeFloatx80Subnormal( bits64 aSig
, int32
*zExpPtr
, bits64
*zSigPtr
)
606 shiftCount
= countLeadingZeros64( aSig
);
607 *zSigPtr
= aSig
<<shiftCount
;
608 *zExpPtr
= 1 - shiftCount
;
612 /*----------------------------------------------------------------------------
613 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
614 | extended double-precision floating-point value, returning the result.
615 *----------------------------------------------------------------------------*/
617 INLINE floatx80
packFloatx80( flag zSign
, int32 zExp
, bits64 zSig
)
622 z
.high
= ( ( (bits16
) zSign
)<<15 ) + zExp
;
627 /*----------------------------------------------------------------------------
628 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
629 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
630 | and returns the proper extended double-precision floating-point value
631 | corresponding to the abstract input. Ordinarily, the abstract value is
632 | rounded and packed into the extended double-precision format, with the
633 | inexact exception raised if the abstract input cannot be represented
634 | exactly. However, if the abstract value is too large, the overflow and
635 | inexact exceptions are raised and an infinity or maximal finite value is
636 | returned. If the abstract value is too small, the input value is rounded to
637 | a subnormal number, and the underflow and inexact exceptions are raised if
638 | the abstract input cannot be represented exactly as a subnormal extended
639 | double-precision floating-point number.
640 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
641 | number of bits as single or double precision, respectively. Otherwise, the
642 | result is rounded to the full precision of the extended double-precision
644 | The input significand must be normalized or smaller. If the input
645 | significand is not normalized, `zExp' must be 0; in that case, the result
646 | returned is a subnormal number, and it must not require rounding. The
647 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
648 | Floating-Point Arithmetic.
649 *----------------------------------------------------------------------------*/
652 roundAndPackFloatx80(
653 int8 roundingPrecision
, flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1
657 flag roundNearestEven
, increment
, isTiny
;
658 int64 roundIncrement
, roundMask
, roundBits
;
660 roundingMode
= STATUS(float_rounding_mode
);
661 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
662 if ( roundingPrecision
== 80 ) goto precision80
;
663 if ( roundingPrecision
== 64 ) {
664 roundIncrement
= LIT64( 0x0000000000000400 );
665 roundMask
= LIT64( 0x00000000000007FF );
667 else if ( roundingPrecision
== 32 ) {
668 roundIncrement
= LIT64( 0x0000008000000000 );
669 roundMask
= LIT64( 0x000000FFFFFFFFFF );
674 zSig0
|= ( zSig1
!= 0 );
675 if ( ! roundNearestEven
) {
676 if ( roundingMode
== float_round_to_zero
) {
680 roundIncrement
= roundMask
;
682 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
685 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
689 roundBits
= zSig0
& roundMask
;
690 if ( 0x7FFD <= (bits32
) ( zExp
- 1 ) ) {
691 if ( ( 0x7FFE < zExp
)
692 || ( ( zExp
== 0x7FFE ) && ( zSig0
+ roundIncrement
< zSig0
) )
697 if ( STATUS(flush_to_zero
) ) return packFloatx80( zSign
, 0, 0 );
699 ( STATUS(float_detect_tininess
) == float_tininess_before_rounding
)
701 || ( zSig0
<= zSig0
+ roundIncrement
);
702 shift64RightJamming( zSig0
, 1 - zExp
, &zSig0
);
704 roundBits
= zSig0
& roundMask
;
705 if ( isTiny
&& roundBits
) float_raise( float_flag_underflow STATUS_VAR
);
706 if ( roundBits
) STATUS(float_exception_flags
) |= float_flag_inexact
;
707 zSig0
+= roundIncrement
;
708 if ( (sbits64
) zSig0
< 0 ) zExp
= 1;
709 roundIncrement
= roundMask
+ 1;
710 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
711 roundMask
|= roundIncrement
;
713 zSig0
&= ~ roundMask
;
714 return packFloatx80( zSign
, zExp
, zSig0
);
717 if ( roundBits
) STATUS(float_exception_flags
) |= float_flag_inexact
;
718 zSig0
+= roundIncrement
;
719 if ( zSig0
< roundIncrement
) {
721 zSig0
= LIT64( 0x8000000000000000 );
723 roundIncrement
= roundMask
+ 1;
724 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
725 roundMask
|= roundIncrement
;
727 zSig0
&= ~ roundMask
;
728 if ( zSig0
== 0 ) zExp
= 0;
729 return packFloatx80( zSign
, zExp
, zSig0
);
731 increment
= ( (sbits64
) zSig1
< 0 );
732 if ( ! roundNearestEven
) {
733 if ( roundingMode
== float_round_to_zero
) {
738 increment
= ( roundingMode
== float_round_down
) && zSig1
;
741 increment
= ( roundingMode
== float_round_up
) && zSig1
;
745 if ( 0x7FFD <= (bits32
) ( zExp
- 1 ) ) {
746 if ( ( 0x7FFE < zExp
)
747 || ( ( zExp
== 0x7FFE )
748 && ( zSig0
== LIT64( 0xFFFFFFFFFFFFFFFF ) )
754 float_raise( float_flag_overflow
| float_flag_inexact STATUS_VAR
);
755 if ( ( roundingMode
== float_round_to_zero
)
756 || ( zSign
&& ( roundingMode
== float_round_up
) )
757 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
759 return packFloatx80( zSign
, 0x7FFE, ~ roundMask
);
761 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
765 ( STATUS(float_detect_tininess
) == float_tininess_before_rounding
)
768 || ( zSig0
< LIT64( 0xFFFFFFFFFFFFFFFF ) );
769 shift64ExtraRightJamming( zSig0
, zSig1
, 1 - zExp
, &zSig0
, &zSig1
);
771 if ( isTiny
&& zSig1
) float_raise( float_flag_underflow STATUS_VAR
);
772 if ( zSig1
) STATUS(float_exception_flags
) |= float_flag_inexact
;
773 if ( roundNearestEven
) {
774 increment
= ( (sbits64
) zSig1
< 0 );
778 increment
= ( roundingMode
== float_round_down
) && zSig1
;
781 increment
= ( roundingMode
== float_round_up
) && zSig1
;
787 ~ ( ( (bits64
) ( zSig1
<<1 ) == 0 ) & roundNearestEven
);
788 if ( (sbits64
) zSig0
< 0 ) zExp
= 1;
790 return packFloatx80( zSign
, zExp
, zSig0
);
793 if ( zSig1
) STATUS(float_exception_flags
) |= float_flag_inexact
;
798 zSig0
= LIT64( 0x8000000000000000 );
801 zSig0
&= ~ ( ( (bits64
) ( zSig1
<<1 ) == 0 ) & roundNearestEven
);
805 if ( zSig0
== 0 ) zExp
= 0;
807 return packFloatx80( zSign
, zExp
, zSig0
);
811 /*----------------------------------------------------------------------------
812 | Takes an abstract floating-point value having sign `zSign', exponent
813 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
814 | and returns the proper extended double-precision floating-point value
815 | corresponding to the abstract input. This routine is just like
816 | `roundAndPackFloatx80' except that the input significand does not have to be
818 *----------------------------------------------------------------------------*/
821 normalizeRoundAndPackFloatx80(
822 int8 roundingPrecision
, flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1
832 shiftCount
= countLeadingZeros64( zSig0
);
833 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
836 roundAndPackFloatx80( roundingPrecision
, zSign
, zExp
, zSig0
, zSig1 STATUS_VAR
);
844 /*----------------------------------------------------------------------------
845 | Returns the least-significant 64 fraction bits of the quadruple-precision
846 | floating-point value `a'.
847 *----------------------------------------------------------------------------*/
849 INLINE bits64
extractFloat128Frac1( float128 a
)
856 /*----------------------------------------------------------------------------
857 | Returns the most-significant 48 fraction bits of the quadruple-precision
858 | floating-point value `a'.
859 *----------------------------------------------------------------------------*/
861 INLINE bits64
extractFloat128Frac0( float128 a
)
864 return a
.high
& LIT64( 0x0000FFFFFFFFFFFF );
868 /*----------------------------------------------------------------------------
869 | Returns the exponent bits of the quadruple-precision floating-point value
871 *----------------------------------------------------------------------------*/
873 INLINE int32
extractFloat128Exp( float128 a
)
876 return ( a
.high
>>48 ) & 0x7FFF;
880 /*----------------------------------------------------------------------------
881 | Returns the sign bit of the quadruple-precision floating-point value `a'.
882 *----------------------------------------------------------------------------*/
884 INLINE flag
extractFloat128Sign( float128 a
)
891 /*----------------------------------------------------------------------------
892 | Normalizes the subnormal quadruple-precision floating-point value
893 | represented by the denormalized significand formed by the concatenation of
894 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
895 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
896 | significand are stored at the location pointed to by `zSig0Ptr', and the
897 | least significant 64 bits of the normalized significand are stored at the
898 | location pointed to by `zSig1Ptr'.
899 *----------------------------------------------------------------------------*/
902 normalizeFloat128Subnormal(
913 shiftCount
= countLeadingZeros64( aSig1
) - 15;
914 if ( shiftCount
< 0 ) {
915 *zSig0Ptr
= aSig1
>>( - shiftCount
);
916 *zSig1Ptr
= aSig1
<<( shiftCount
& 63 );
919 *zSig0Ptr
= aSig1
<<shiftCount
;
922 *zExpPtr
= - shiftCount
- 63;
925 shiftCount
= countLeadingZeros64( aSig0
) - 15;
926 shortShift128Left( aSig0
, aSig1
, shiftCount
, zSig0Ptr
, zSig1Ptr
);
927 *zExpPtr
= 1 - shiftCount
;
932 /*----------------------------------------------------------------------------
933 | Packs the sign `zSign', the exponent `zExp', and the significand formed
934 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
935 | floating-point value, returning the result. After being shifted into the
936 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
937 | added together to form the most significant 32 bits of the result. This
938 | means that any integer portion of `zSig0' will be added into the exponent.
939 | Since a properly normalized significand will have an integer portion equal
940 | to 1, the `zExp' input should be 1 less than the desired result exponent
941 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
943 *----------------------------------------------------------------------------*/
946 packFloat128( flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1
)
951 z
.high
= ( ( (bits64
) zSign
)<<63 ) + ( ( (bits64
) zExp
)<<48 ) + zSig0
;
956 /*----------------------------------------------------------------------------
957 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
958 | and extended significand formed by the concatenation of `zSig0', `zSig1',
959 | and `zSig2', and returns the proper quadruple-precision floating-point value
960 | corresponding to the abstract input. Ordinarily, the abstract value is
961 | simply rounded and packed into the quadruple-precision format, with the
962 | inexact exception raised if the abstract input cannot be represented
963 | exactly. However, if the abstract value is too large, the overflow and
964 | inexact exceptions are raised and an infinity or maximal finite value is
965 | returned. If the abstract value is too small, the input value is rounded to
966 | a subnormal number, and the underflow and inexact exceptions are raised if
967 | the abstract input cannot be represented exactly as a subnormal quadruple-
968 | precision floating-point number.
969 | The input significand must be normalized or smaller. If the input
970 | significand is not normalized, `zExp' must be 0; in that case, the result
971 | returned is a subnormal number, and it must not require rounding. In the
972 | usual case that the input significand is normalized, `zExp' must be 1 less
973 | than the ``true'' floating-point exponent. The handling of underflow and
974 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
975 *----------------------------------------------------------------------------*/
978 roundAndPackFloat128(
979 flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1
, bits64 zSig2 STATUS_PARAM
)
982 flag roundNearestEven
, increment
, isTiny
;
984 roundingMode
= STATUS(float_rounding_mode
);
985 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
986 increment
= ( (sbits64
) zSig2
< 0 );
987 if ( ! roundNearestEven
) {
988 if ( roundingMode
== float_round_to_zero
) {
993 increment
= ( roundingMode
== float_round_down
) && zSig2
;
996 increment
= ( roundingMode
== float_round_up
) && zSig2
;
1000 if ( 0x7FFD <= (bits32
) zExp
) {
1001 if ( ( 0x7FFD < zExp
)
1002 || ( ( zExp
== 0x7FFD )
1004 LIT64( 0x0001FFFFFFFFFFFF ),
1005 LIT64( 0xFFFFFFFFFFFFFFFF ),
1012 float_raise( float_flag_overflow
| float_flag_inexact STATUS_VAR
);
1013 if ( ( roundingMode
== float_round_to_zero
)
1014 || ( zSign
&& ( roundingMode
== float_round_up
) )
1015 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
1021 LIT64( 0x0000FFFFFFFFFFFF ),
1022 LIT64( 0xFFFFFFFFFFFFFFFF )
1025 return packFloat128( zSign
, 0x7FFF, 0, 0 );
1028 if ( STATUS(flush_to_zero
) ) return packFloat128( zSign
, 0, 0, 0 );
1030 ( STATUS(float_detect_tininess
) == float_tininess_before_rounding
)
1036 LIT64( 0x0001FFFFFFFFFFFF ),
1037 LIT64( 0xFFFFFFFFFFFFFFFF )
1039 shift128ExtraRightJamming(
1040 zSig0
, zSig1
, zSig2
, - zExp
, &zSig0
, &zSig1
, &zSig2
);
1042 if ( isTiny
&& zSig2
) float_raise( float_flag_underflow STATUS_VAR
);
1043 if ( roundNearestEven
) {
1044 increment
= ( (sbits64
) zSig2
< 0 );
1048 increment
= ( roundingMode
== float_round_down
) && zSig2
;
1051 increment
= ( roundingMode
== float_round_up
) && zSig2
;
1056 if ( zSig2
) STATUS(float_exception_flags
) |= float_flag_inexact
;
1058 add128( zSig0
, zSig1
, 0, 1, &zSig0
, &zSig1
);
1059 zSig1
&= ~ ( ( zSig2
+ zSig2
== 0 ) & roundNearestEven
);
1062 if ( ( zSig0
| zSig1
) == 0 ) zExp
= 0;
1064 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
1068 /*----------------------------------------------------------------------------
1069 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1070 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1071 | returns the proper quadruple-precision floating-point value corresponding
1072 | to the abstract input. This routine is just like `roundAndPackFloat128'
1073 | except that the input significand has fewer bits and does not have to be
1074 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1076 *----------------------------------------------------------------------------*/
1079 normalizeRoundAndPackFloat128(
1080 flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1 STATUS_PARAM
)
1090 shiftCount
= countLeadingZeros64( zSig0
) - 15;
1091 if ( 0 <= shiftCount
) {
1093 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
1096 shift128ExtraRightJamming(
1097 zSig0
, zSig1
, 0, - shiftCount
, &zSig0
, &zSig1
, &zSig2
);
1100 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2 STATUS_VAR
);
1106 /*----------------------------------------------------------------------------
1107 | Returns the result of converting the 32-bit two's complement integer `a'
1108 | to the single-precision floating-point format. The conversion is performed
1109 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1110 *----------------------------------------------------------------------------*/
1112 float32
int32_to_float32( int32 a STATUS_PARAM
)
1116 if ( a
== 0 ) return float32_zero
;
1117 if ( a
== (sbits32
) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1119 return normalizeRoundAndPackFloat32( zSign
, 0x9C, zSign
? - a
: a STATUS_VAR
);
1123 /*----------------------------------------------------------------------------
1124 | Returns the result of converting the 32-bit two's complement integer `a'
1125 | to the double-precision floating-point format. The conversion is performed
1126 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1127 *----------------------------------------------------------------------------*/
1129 float64
int32_to_float64( int32 a STATUS_PARAM
)
1136 if ( a
== 0 ) return float64_zero
;
1138 absA
= zSign
? - a
: a
;
1139 shiftCount
= countLeadingZeros32( absA
) + 21;
1141 return packFloat64( zSign
, 0x432 - shiftCount
, zSig
<<shiftCount
);
1147 /*----------------------------------------------------------------------------
1148 | Returns the result of converting the 32-bit two's complement integer `a'
1149 | to the extended double-precision floating-point format. The conversion
1150 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1152 *----------------------------------------------------------------------------*/
1154 floatx80
int32_to_floatx80( int32 a STATUS_PARAM
)
1161 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
1163 absA
= zSign
? - a
: a
;
1164 shiftCount
= countLeadingZeros32( absA
) + 32;
1166 return packFloatx80( zSign
, 0x403E - shiftCount
, zSig
<<shiftCount
);
1174 /*----------------------------------------------------------------------------
1175 | Returns the result of converting the 32-bit two's complement integer `a' to
1176 | the quadruple-precision floating-point format. The conversion is performed
1177 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1178 *----------------------------------------------------------------------------*/
1180 float128
int32_to_float128( int32 a STATUS_PARAM
)
1187 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
1189 absA
= zSign
? - a
: a
;
1190 shiftCount
= countLeadingZeros32( absA
) + 17;
1192 return packFloat128( zSign
, 0x402E - shiftCount
, zSig0
<<shiftCount
, 0 );
1198 /*----------------------------------------------------------------------------
1199 | Returns the result of converting the 64-bit two's complement integer `a'
1200 | to the single-precision floating-point format. The conversion is performed
1201 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1202 *----------------------------------------------------------------------------*/
1204 float32
int64_to_float32( int64 a STATUS_PARAM
)
1210 if ( a
== 0 ) return float32_zero
;
1212 absA
= zSign
? - a
: a
;
1213 shiftCount
= countLeadingZeros64( absA
) - 40;
1214 if ( 0 <= shiftCount
) {
1215 return packFloat32( zSign
, 0x95 - shiftCount
, absA
<<shiftCount
);
1219 if ( shiftCount
< 0 ) {
1220 shift64RightJamming( absA
, - shiftCount
, &absA
);
1223 absA
<<= shiftCount
;
1225 return roundAndPackFloat32( zSign
, 0x9C - shiftCount
, absA STATUS_VAR
);
1230 float32
uint64_to_float32( uint64 a STATUS_PARAM
)
1234 if ( a
== 0 ) return float32_zero
;
1235 shiftCount
= countLeadingZeros64( a
) - 40;
1236 if ( 0 <= shiftCount
) {
1237 return packFloat32( 1 > 0, 0x95 - shiftCount
, a
<<shiftCount
);
1241 if ( shiftCount
< 0 ) {
1242 shift64RightJamming( a
, - shiftCount
, &a
);
1247 return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount
, a STATUS_VAR
);
1251 /*----------------------------------------------------------------------------
1252 | Returns the result of converting the 64-bit two's complement integer `a'
1253 | to the double-precision floating-point format. The conversion is performed
1254 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1255 *----------------------------------------------------------------------------*/
1257 float64
int64_to_float64( int64 a STATUS_PARAM
)
1261 if ( a
== 0 ) return float64_zero
;
1262 if ( a
== (sbits64
) LIT64( 0x8000000000000000 ) ) {
1263 return packFloat64( 1, 0x43E, 0 );
1266 return normalizeRoundAndPackFloat64( zSign
, 0x43C, zSign
? - a
: a STATUS_VAR
);
1270 float64
uint64_to_float64( uint64 a STATUS_PARAM
)
1272 if ( a
== 0 ) return float64_zero
;
1273 return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR
);
1279 /*----------------------------------------------------------------------------
1280 | Returns the result of converting the 64-bit two's complement integer `a'
1281 | to the extended double-precision floating-point format. The conversion
1282 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1284 *----------------------------------------------------------------------------*/
1286 floatx80
int64_to_floatx80( int64 a STATUS_PARAM
)
1292 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
1294 absA
= zSign
? - a
: a
;
1295 shiftCount
= countLeadingZeros64( absA
);
1296 return packFloatx80( zSign
, 0x403E - shiftCount
, absA
<<shiftCount
);
1304 /*----------------------------------------------------------------------------
1305 | Returns the result of converting the 64-bit two's complement integer `a' to
1306 | the quadruple-precision floating-point format. The conversion is performed
1307 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1308 *----------------------------------------------------------------------------*/
1310 float128
int64_to_float128( int64 a STATUS_PARAM
)
1316 bits64 zSig0
, zSig1
;
1318 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
1320 absA
= zSign
? - a
: a
;
1321 shiftCount
= countLeadingZeros64( absA
) + 49;
1322 zExp
= 0x406E - shiftCount
;
1323 if ( 64 <= shiftCount
) {
1332 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
1333 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
1339 /*----------------------------------------------------------------------------
1340 | Returns the result of converting the single-precision floating-point value
1341 | `a' to the 32-bit two's complement integer format. The conversion is
1342 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1343 | Arithmetic---which means in particular that the conversion is rounded
1344 | according to the current rounding mode. If `a' is a NaN, the largest
1345 | positive integer is returned. Otherwise, if the conversion overflows, the
1346 | largest integer with the same sign as `a' is returned.
1347 *----------------------------------------------------------------------------*/
1349 int32
float32_to_int32( float32 a STATUS_PARAM
)
1352 int16 aExp
, shiftCount
;
1356 a
= float32_squash_input_denormal(a STATUS_VAR
);
1357 aSig
= extractFloat32Frac( a
);
1358 aExp
= extractFloat32Exp( a
);
1359 aSign
= extractFloat32Sign( a
);
1360 if ( ( aExp
== 0xFF ) && aSig
) aSign
= 0;
1361 if ( aExp
) aSig
|= 0x00800000;
1362 shiftCount
= 0xAF - aExp
;
1365 if ( 0 < shiftCount
) shift64RightJamming( aSig64
, shiftCount
, &aSig64
);
1366 return roundAndPackInt32( aSign
, aSig64 STATUS_VAR
);
1370 /*----------------------------------------------------------------------------
1371 | Returns the result of converting the single-precision floating-point value
1372 | `a' to the 32-bit two's complement integer format. The conversion is
1373 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1374 | Arithmetic, except that the conversion is always rounded toward zero.
1375 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1376 | the conversion overflows, the largest integer with the same sign as `a' is
1378 *----------------------------------------------------------------------------*/
1380 int32
float32_to_int32_round_to_zero( float32 a STATUS_PARAM
)
1383 int16 aExp
, shiftCount
;
1386 a
= float32_squash_input_denormal(a STATUS_VAR
);
1388 aSig
= extractFloat32Frac( a
);
1389 aExp
= extractFloat32Exp( a
);
1390 aSign
= extractFloat32Sign( a
);
1391 shiftCount
= aExp
- 0x9E;
1392 if ( 0 <= shiftCount
) {
1393 if ( float32_val(a
) != 0xCF000000 ) {
1394 float_raise( float_flag_invalid STATUS_VAR
);
1395 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) return 0x7FFFFFFF;
1397 return (sbits32
) 0x80000000;
1399 else if ( aExp
<= 0x7E ) {
1400 if ( aExp
| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
1403 aSig
= ( aSig
| 0x00800000 )<<8;
1404 z
= aSig
>>( - shiftCount
);
1405 if ( (bits32
) ( aSig
<<( shiftCount
& 31 ) ) ) {
1406 STATUS(float_exception_flags
) |= float_flag_inexact
;
1408 if ( aSign
) z
= - z
;
1413 /*----------------------------------------------------------------------------
1414 | Returns the result of converting the single-precision floating-point value
1415 | `a' to the 16-bit two's complement integer format. The conversion is
1416 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1417 | Arithmetic, except that the conversion is always rounded toward zero.
1418 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1419 | the conversion overflows, the largest integer with the same sign as `a' is
1421 *----------------------------------------------------------------------------*/
1423 int16
float32_to_int16_round_to_zero( float32 a STATUS_PARAM
)
1426 int16 aExp
, shiftCount
;
1430 aSig
= extractFloat32Frac( a
);
1431 aExp
= extractFloat32Exp( a
);
1432 aSign
= extractFloat32Sign( a
);
1433 shiftCount
= aExp
- 0x8E;
1434 if ( 0 <= shiftCount
) {
1435 if ( float32_val(a
) != 0xC7000000 ) {
1436 float_raise( float_flag_invalid STATUS_VAR
);
1437 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) {
1441 return (sbits32
) 0xffff8000;
1443 else if ( aExp
<= 0x7E ) {
1444 if ( aExp
| aSig
) {
1445 STATUS(float_exception_flags
) |= float_flag_inexact
;
1450 aSig
= ( aSig
| 0x00800000 )<<8;
1451 z
= aSig
>>( - shiftCount
);
1452 if ( (bits32
) ( aSig
<<( shiftCount
& 31 ) ) ) {
1453 STATUS(float_exception_flags
) |= float_flag_inexact
;
1462 /*----------------------------------------------------------------------------
1463 | Returns the result of converting the single-precision floating-point value
1464 | `a' to the 64-bit two's complement integer format. The conversion is
1465 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1466 | Arithmetic---which means in particular that the conversion is rounded
1467 | according to the current rounding mode. If `a' is a NaN, the largest
1468 | positive integer is returned. Otherwise, if the conversion overflows, the
1469 | largest integer with the same sign as `a' is returned.
1470 *----------------------------------------------------------------------------*/
1472 int64
float32_to_int64( float32 a STATUS_PARAM
)
1475 int16 aExp
, shiftCount
;
1477 bits64 aSig64
, aSigExtra
;
1478 a
= float32_squash_input_denormal(a STATUS_VAR
);
1480 aSig
= extractFloat32Frac( a
);
1481 aExp
= extractFloat32Exp( a
);
1482 aSign
= extractFloat32Sign( a
);
1483 shiftCount
= 0xBE - aExp
;
1484 if ( shiftCount
< 0 ) {
1485 float_raise( float_flag_invalid STATUS_VAR
);
1486 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) {
1487 return LIT64( 0x7FFFFFFFFFFFFFFF );
1489 return (sbits64
) LIT64( 0x8000000000000000 );
1491 if ( aExp
) aSig
|= 0x00800000;
1494 shift64ExtraRightJamming( aSig64
, 0, shiftCount
, &aSig64
, &aSigExtra
);
1495 return roundAndPackInt64( aSign
, aSig64
, aSigExtra STATUS_VAR
);
1499 /*----------------------------------------------------------------------------
1500 | Returns the result of converting the single-precision floating-point value
1501 | `a' to the 64-bit two's complement integer format. The conversion is
1502 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1503 | Arithmetic, except that the conversion is always rounded toward zero. If
1504 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1505 | conversion overflows, the largest integer with the same sign as `a' is
1507 *----------------------------------------------------------------------------*/
1509 int64
float32_to_int64_round_to_zero( float32 a STATUS_PARAM
)
1512 int16 aExp
, shiftCount
;
1516 a
= float32_squash_input_denormal(a STATUS_VAR
);
1518 aSig
= extractFloat32Frac( a
);
1519 aExp
= extractFloat32Exp( a
);
1520 aSign
= extractFloat32Sign( a
);
1521 shiftCount
= aExp
- 0xBE;
1522 if ( 0 <= shiftCount
) {
1523 if ( float32_val(a
) != 0xDF000000 ) {
1524 float_raise( float_flag_invalid STATUS_VAR
);
1525 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) {
1526 return LIT64( 0x7FFFFFFFFFFFFFFF );
1529 return (sbits64
) LIT64( 0x8000000000000000 );
1531 else if ( aExp
<= 0x7E ) {
1532 if ( aExp
| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
1535 aSig64
= aSig
| 0x00800000;
1537 z
= aSig64
>>( - shiftCount
);
1538 if ( (bits64
) ( aSig64
<<( shiftCount
& 63 ) ) ) {
1539 STATUS(float_exception_flags
) |= float_flag_inexact
;
1541 if ( aSign
) z
= - z
;
1546 /*----------------------------------------------------------------------------
1547 | Returns the result of converting the single-precision floating-point value
1548 | `a' to the double-precision floating-point format. The conversion is
1549 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1551 *----------------------------------------------------------------------------*/
1553 float64
float32_to_float64( float32 a STATUS_PARAM
)
1558 a
= float32_squash_input_denormal(a STATUS_VAR
);
1560 aSig
= extractFloat32Frac( a
);
1561 aExp
= extractFloat32Exp( a
);
1562 aSign
= extractFloat32Sign( a
);
1563 if ( aExp
== 0xFF ) {
1564 if ( aSig
) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
1565 return packFloat64( aSign
, 0x7FF, 0 );
1568 if ( aSig
== 0 ) return packFloat64( aSign
, 0, 0 );
1569 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1572 return packFloat64( aSign
, aExp
+ 0x380, ( (bits64
) aSig
)<<29 );
1578 /*----------------------------------------------------------------------------
1579 | Returns the result of converting the single-precision floating-point value
1580 | `a' to the extended double-precision floating-point format. The conversion
1581 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1583 *----------------------------------------------------------------------------*/
1585 floatx80
float32_to_floatx80( float32 a STATUS_PARAM
)
1591 a
= float32_squash_input_denormal(a STATUS_VAR
);
1592 aSig
= extractFloat32Frac( a
);
1593 aExp
= extractFloat32Exp( a
);
1594 aSign
= extractFloat32Sign( a
);
1595 if ( aExp
== 0xFF ) {
1596 if ( aSig
) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
1597 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
1600 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
1601 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1604 return packFloatx80( aSign
, aExp
+ 0x3F80, ( (bits64
) aSig
)<<40 );
1612 /*----------------------------------------------------------------------------
1613 | Returns the result of converting the single-precision floating-point value
1614 | `a' to the double-precision floating-point format. The conversion is
1615 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1617 *----------------------------------------------------------------------------*/
1619 float128
float32_to_float128( float32 a STATUS_PARAM
)
1625 a
= float32_squash_input_denormal(a STATUS_VAR
);
1626 aSig
= extractFloat32Frac( a
);
1627 aExp
= extractFloat32Exp( a
);
1628 aSign
= extractFloat32Sign( a
);
1629 if ( aExp
== 0xFF ) {
1630 if ( aSig
) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
1631 return packFloat128( aSign
, 0x7FFF, 0, 0 );
1634 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
1635 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1638 return packFloat128( aSign
, aExp
+ 0x3F80, ( (bits64
) aSig
)<<25, 0 );
1644 /*----------------------------------------------------------------------------
1645 | Rounds the single-precision floating-point value `a' to an integer, and
1646 | returns the result as a single-precision floating-point value. The
1647 | operation is performed according to the IEC/IEEE Standard for Binary
1648 | Floating-Point Arithmetic.
1649 *----------------------------------------------------------------------------*/
1651 float32
float32_round_to_int( float32 a STATUS_PARAM
)
1655 bits32 lastBitMask
, roundBitsMask
;
1658 a
= float32_squash_input_denormal(a STATUS_VAR
);
1660 aExp
= extractFloat32Exp( a
);
1661 if ( 0x96 <= aExp
) {
1662 if ( ( aExp
== 0xFF ) && extractFloat32Frac( a
) ) {
1663 return propagateFloat32NaN( a
, a STATUS_VAR
);
1667 if ( aExp
<= 0x7E ) {
1668 if ( (bits32
) ( float32_val(a
)<<1 ) == 0 ) return a
;
1669 STATUS(float_exception_flags
) |= float_flag_inexact
;
1670 aSign
= extractFloat32Sign( a
);
1671 switch ( STATUS(float_rounding_mode
) ) {
1672 case float_round_nearest_even
:
1673 if ( ( aExp
== 0x7E ) && extractFloat32Frac( a
) ) {
1674 return packFloat32( aSign
, 0x7F, 0 );
1677 case float_round_down
:
1678 return make_float32(aSign
? 0xBF800000 : 0);
1679 case float_round_up
:
1680 return make_float32(aSign
? 0x80000000 : 0x3F800000);
1682 return packFloat32( aSign
, 0, 0 );
1685 lastBitMask
<<= 0x96 - aExp
;
1686 roundBitsMask
= lastBitMask
- 1;
1688 roundingMode
= STATUS(float_rounding_mode
);
1689 if ( roundingMode
== float_round_nearest_even
) {
1690 z
+= lastBitMask
>>1;
1691 if ( ( z
& roundBitsMask
) == 0 ) z
&= ~ lastBitMask
;
1693 else if ( roundingMode
!= float_round_to_zero
) {
1694 if ( extractFloat32Sign( make_float32(z
) ) ^ ( roundingMode
== float_round_up
) ) {
1698 z
&= ~ roundBitsMask
;
1699 if ( z
!= float32_val(a
) ) STATUS(float_exception_flags
) |= float_flag_inexact
;
1700 return make_float32(z
);
1704 /*----------------------------------------------------------------------------
1705 | Returns the result of adding the absolute values of the single-precision
1706 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1707 | before being returned. `zSign' is ignored if the result is a NaN.
1708 | The addition is performed according to the IEC/IEEE Standard for Binary
1709 | Floating-Point Arithmetic.
1710 *----------------------------------------------------------------------------*/
1712 static float32
addFloat32Sigs( float32 a
, float32 b
, flag zSign STATUS_PARAM
)
1714 int16 aExp
, bExp
, zExp
;
1715 bits32 aSig
, bSig
, zSig
;
1718 aSig
= extractFloat32Frac( a
);
1719 aExp
= extractFloat32Exp( a
);
1720 bSig
= extractFloat32Frac( b
);
1721 bExp
= extractFloat32Exp( b
);
1722 expDiff
= aExp
- bExp
;
1725 if ( 0 < expDiff
) {
1726 if ( aExp
== 0xFF ) {
1727 if ( aSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1736 shift32RightJamming( bSig
, expDiff
, &bSig
);
1739 else if ( expDiff
< 0 ) {
1740 if ( bExp
== 0xFF ) {
1741 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1742 return packFloat32( zSign
, 0xFF, 0 );
1750 shift32RightJamming( aSig
, - expDiff
, &aSig
);
1754 if ( aExp
== 0xFF ) {
1755 if ( aSig
| bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1759 if ( STATUS(flush_to_zero
) ) return packFloat32( zSign
, 0, 0 );
1760 return packFloat32( zSign
, 0, ( aSig
+ bSig
)>>6 );
1762 zSig
= 0x40000000 + aSig
+ bSig
;
1767 zSig
= ( aSig
+ bSig
)<<1;
1769 if ( (sbits32
) zSig
< 0 ) {
1774 return roundAndPackFloat32( zSign
, zExp
, zSig STATUS_VAR
);
1778 /*----------------------------------------------------------------------------
1779 | Returns the result of subtracting the absolute values of the single-
1780 | precision floating-point values `a' and `b'. If `zSign' is 1, the
1781 | difference is negated before being returned. `zSign' is ignored if the
1782 | result is a NaN. The subtraction is performed according to the IEC/IEEE
1783 | Standard for Binary Floating-Point Arithmetic.
1784 *----------------------------------------------------------------------------*/
1786 static float32
subFloat32Sigs( float32 a
, float32 b
, flag zSign STATUS_PARAM
)
1788 int16 aExp
, bExp
, zExp
;
1789 bits32 aSig
, bSig
, zSig
;
1792 aSig
= extractFloat32Frac( a
);
1793 aExp
= extractFloat32Exp( a
);
1794 bSig
= extractFloat32Frac( b
);
1795 bExp
= extractFloat32Exp( b
);
1796 expDiff
= aExp
- bExp
;
1799 if ( 0 < expDiff
) goto aExpBigger
;
1800 if ( expDiff
< 0 ) goto bExpBigger
;
1801 if ( aExp
== 0xFF ) {
1802 if ( aSig
| bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1803 float_raise( float_flag_invalid STATUS_VAR
);
1804 return float32_default_nan
;
1810 if ( bSig
< aSig
) goto aBigger
;
1811 if ( aSig
< bSig
) goto bBigger
;
1812 return packFloat32( STATUS(float_rounding_mode
) == float_round_down
, 0, 0 );
1814 if ( bExp
== 0xFF ) {
1815 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1816 return packFloat32( zSign
^ 1, 0xFF, 0 );
1824 shift32RightJamming( aSig
, - expDiff
, &aSig
);
1830 goto normalizeRoundAndPack
;
1832 if ( aExp
== 0xFF ) {
1833 if ( aSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1842 shift32RightJamming( bSig
, expDiff
, &bSig
);
1847 normalizeRoundAndPack
:
1849 return normalizeRoundAndPackFloat32( zSign
, zExp
, zSig STATUS_VAR
);
1853 /*----------------------------------------------------------------------------
1854 | Returns the result of adding the single-precision floating-point values `a'
1855 | and `b'. The operation is performed according to the IEC/IEEE Standard for
1856 | Binary Floating-Point Arithmetic.
1857 *----------------------------------------------------------------------------*/
1859 float32
float32_add( float32 a
, float32 b STATUS_PARAM
)
1862 a
= float32_squash_input_denormal(a STATUS_VAR
);
1863 b
= float32_squash_input_denormal(b STATUS_VAR
);
1865 aSign
= extractFloat32Sign( a
);
1866 bSign
= extractFloat32Sign( b
);
1867 if ( aSign
== bSign
) {
1868 return addFloat32Sigs( a
, b
, aSign STATUS_VAR
);
1871 return subFloat32Sigs( a
, b
, aSign STATUS_VAR
);
1876 /*----------------------------------------------------------------------------
1877 | Returns the result of subtracting the single-precision floating-point values
1878 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1879 | for Binary Floating-Point Arithmetic.
1880 *----------------------------------------------------------------------------*/
1882 float32
float32_sub( float32 a
, float32 b STATUS_PARAM
)
1885 a
= float32_squash_input_denormal(a STATUS_VAR
);
1886 b
= float32_squash_input_denormal(b STATUS_VAR
);
1888 aSign
= extractFloat32Sign( a
);
1889 bSign
= extractFloat32Sign( b
);
1890 if ( aSign
== bSign
) {
1891 return subFloat32Sigs( a
, b
, aSign STATUS_VAR
);
1894 return addFloat32Sigs( a
, b
, aSign STATUS_VAR
);
1899 /*----------------------------------------------------------------------------
1900 | Returns the result of multiplying the single-precision floating-point values
1901 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1902 | for Binary Floating-Point Arithmetic.
1903 *----------------------------------------------------------------------------*/
1905 float32
float32_mul( float32 a
, float32 b STATUS_PARAM
)
1907 flag aSign
, bSign
, zSign
;
1908 int16 aExp
, bExp
, zExp
;
1913 a
= float32_squash_input_denormal(a STATUS_VAR
);
1914 b
= float32_squash_input_denormal(b STATUS_VAR
);
1916 aSig
= extractFloat32Frac( a
);
1917 aExp
= extractFloat32Exp( a
);
1918 aSign
= extractFloat32Sign( a
);
1919 bSig
= extractFloat32Frac( b
);
1920 bExp
= extractFloat32Exp( b
);
1921 bSign
= extractFloat32Sign( b
);
1922 zSign
= aSign
^ bSign
;
1923 if ( aExp
== 0xFF ) {
1924 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
1925 return propagateFloat32NaN( a
, b STATUS_VAR
);
1927 if ( ( bExp
| bSig
) == 0 ) {
1928 float_raise( float_flag_invalid STATUS_VAR
);
1929 return float32_default_nan
;
1931 return packFloat32( zSign
, 0xFF, 0 );
1933 if ( bExp
== 0xFF ) {
1934 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1935 if ( ( aExp
| aSig
) == 0 ) {
1936 float_raise( float_flag_invalid STATUS_VAR
);
1937 return float32_default_nan
;
1939 return packFloat32( zSign
, 0xFF, 0 );
1942 if ( aSig
== 0 ) return packFloat32( zSign
, 0, 0 );
1943 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1946 if ( bSig
== 0 ) return packFloat32( zSign
, 0, 0 );
1947 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
1949 zExp
= aExp
+ bExp
- 0x7F;
1950 aSig
= ( aSig
| 0x00800000 )<<7;
1951 bSig
= ( bSig
| 0x00800000 )<<8;
1952 shift64RightJamming( ( (bits64
) aSig
) * bSig
, 32, &zSig64
);
1954 if ( 0 <= (sbits32
) ( zSig
<<1 ) ) {
1958 return roundAndPackFloat32( zSign
, zExp
, zSig STATUS_VAR
);
1962 /*----------------------------------------------------------------------------
1963 | Returns the result of dividing the single-precision floating-point value `a'
1964 | by the corresponding value `b'. The operation is performed according to the
1965 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1966 *----------------------------------------------------------------------------*/
1968 float32
float32_div( float32 a
, float32 b STATUS_PARAM
)
1970 flag aSign
, bSign
, zSign
;
1971 int16 aExp
, bExp
, zExp
;
1972 bits32 aSig
, bSig
, zSig
;
1973 a
= float32_squash_input_denormal(a STATUS_VAR
);
1974 b
= float32_squash_input_denormal(b STATUS_VAR
);
1976 aSig
= extractFloat32Frac( a
);
1977 aExp
= extractFloat32Exp( a
);
1978 aSign
= extractFloat32Sign( a
);
1979 bSig
= extractFloat32Frac( b
);
1980 bExp
= extractFloat32Exp( b
);
1981 bSign
= extractFloat32Sign( b
);
1982 zSign
= aSign
^ bSign
;
1983 if ( aExp
== 0xFF ) {
1984 if ( aSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1985 if ( bExp
== 0xFF ) {
1986 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1987 float_raise( float_flag_invalid STATUS_VAR
);
1988 return float32_default_nan
;
1990 return packFloat32( zSign
, 0xFF, 0 );
1992 if ( bExp
== 0xFF ) {
1993 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1994 return packFloat32( zSign
, 0, 0 );
1998 if ( ( aExp
| aSig
) == 0 ) {
1999 float_raise( float_flag_invalid STATUS_VAR
);
2000 return float32_default_nan
;
2002 float_raise( float_flag_divbyzero STATUS_VAR
);
2003 return packFloat32( zSign
, 0xFF, 0 );
2005 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
2008 if ( aSig
== 0 ) return packFloat32( zSign
, 0, 0 );
2009 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
2011 zExp
= aExp
- bExp
+ 0x7D;
2012 aSig
= ( aSig
| 0x00800000 )<<7;
2013 bSig
= ( bSig
| 0x00800000 )<<8;
2014 if ( bSig
<= ( aSig
+ aSig
) ) {
2018 zSig
= ( ( (bits64
) aSig
)<<32 ) / bSig
;
2019 if ( ( zSig
& 0x3F ) == 0 ) {
2020 zSig
|= ( (bits64
) bSig
* zSig
!= ( (bits64
) aSig
)<<32 );
2022 return roundAndPackFloat32( zSign
, zExp
, zSig STATUS_VAR
);
2026 /*----------------------------------------------------------------------------
2027 | Returns the remainder of the single-precision floating-point value `a'
2028 | with respect to the corresponding value `b'. The operation is performed
2029 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2030 *----------------------------------------------------------------------------*/
2032 float32
float32_rem( float32 a
, float32 b STATUS_PARAM
)
2035 int16 aExp
, bExp
, expDiff
;
2038 bits64 aSig64
, bSig64
, q64
;
2039 bits32 alternateASig
;
2041 a
= float32_squash_input_denormal(a STATUS_VAR
);
2042 b
= float32_squash_input_denormal(b STATUS_VAR
);
2044 aSig
= extractFloat32Frac( a
);
2045 aExp
= extractFloat32Exp( a
);
2046 aSign
= extractFloat32Sign( a
);
2047 bSig
= extractFloat32Frac( b
);
2048 bExp
= extractFloat32Exp( b
);
2049 if ( aExp
== 0xFF ) {
2050 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
2051 return propagateFloat32NaN( a
, b STATUS_VAR
);
2053 float_raise( float_flag_invalid STATUS_VAR
);
2054 return float32_default_nan
;
2056 if ( bExp
== 0xFF ) {
2057 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
2062 float_raise( float_flag_invalid STATUS_VAR
);
2063 return float32_default_nan
;
2065 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
2068 if ( aSig
== 0 ) return a
;
2069 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
2071 expDiff
= aExp
- bExp
;
2074 if ( expDiff
< 32 ) {
2077 if ( expDiff
< 0 ) {
2078 if ( expDiff
< -1 ) return a
;
2081 q
= ( bSig
<= aSig
);
2082 if ( q
) aSig
-= bSig
;
2083 if ( 0 < expDiff
) {
2084 q
= ( ( (bits64
) aSig
)<<32 ) / bSig
;
2087 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
2095 if ( bSig
<= aSig
) aSig
-= bSig
;
2096 aSig64
= ( (bits64
) aSig
)<<40;
2097 bSig64
= ( (bits64
) bSig
)<<40;
2099 while ( 0 < expDiff
) {
2100 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
2101 q64
= ( 2 < q64
) ? q64
- 2 : 0;
2102 aSig64
= - ( ( bSig
* q64
)<<38 );
2106 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
2107 q64
= ( 2 < q64
) ? q64
- 2 : 0;
2108 q
= q64
>>( 64 - expDiff
);
2110 aSig
= ( ( aSig64
>>33 )<<( expDiff
- 1 ) ) - bSig
* q
;
2113 alternateASig
= aSig
;
2116 } while ( 0 <= (sbits32
) aSig
);
2117 sigMean
= aSig
+ alternateASig
;
2118 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
2119 aSig
= alternateASig
;
2121 zSign
= ( (sbits32
) aSig
< 0 );
2122 if ( zSign
) aSig
= - aSig
;
2123 return normalizeRoundAndPackFloat32( aSign
^ zSign
, bExp
, aSig STATUS_VAR
);
2127 /*----------------------------------------------------------------------------
2128 | Returns the square root of the single-precision floating-point value `a'.
2129 | The operation is performed according to the IEC/IEEE Standard for Binary
2130 | Floating-Point Arithmetic.
2131 *----------------------------------------------------------------------------*/
2133 float32
float32_sqrt( float32 a STATUS_PARAM
)
2139 a
= float32_squash_input_denormal(a STATUS_VAR
);
2141 aSig
= extractFloat32Frac( a
);
2142 aExp
= extractFloat32Exp( a
);
2143 aSign
= extractFloat32Sign( a
);
2144 if ( aExp
== 0xFF ) {
2145 if ( aSig
) return propagateFloat32NaN( a
, float32_zero STATUS_VAR
);
2146 if ( ! aSign
) return a
;
2147 float_raise( float_flag_invalid STATUS_VAR
);
2148 return float32_default_nan
;
2151 if ( ( aExp
| aSig
) == 0 ) return a
;
2152 float_raise( float_flag_invalid STATUS_VAR
);
2153 return float32_default_nan
;
2156 if ( aSig
== 0 ) return float32_zero
;
2157 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
2159 zExp
= ( ( aExp
- 0x7F )>>1 ) + 0x7E;
2160 aSig
= ( aSig
| 0x00800000 )<<8;
2161 zSig
= estimateSqrt32( aExp
, aSig
) + 2;
2162 if ( ( zSig
& 0x7F ) <= 5 ) {
2168 term
= ( (bits64
) zSig
) * zSig
;
2169 rem
= ( ( (bits64
) aSig
)<<32 ) - term
;
2170 while ( (sbits64
) rem
< 0 ) {
2172 rem
+= ( ( (bits64
) zSig
)<<1 ) | 1;
2174 zSig
|= ( rem
!= 0 );
2176 shift32RightJamming( zSig
, 1, &zSig
);
2178 return roundAndPackFloat32( 0, zExp
, zSig STATUS_VAR
);
2182 /*----------------------------------------------------------------------------
2183 | Returns the binary exponential of the single-precision floating-point value
2184 | `a'. The operation is performed according to the IEC/IEEE Standard for
2185 | Binary Floating-Point Arithmetic.
2187 | Uses the following identities:
2189 | 1. -------------------------------------------------------------------------
2193 | 2. -------------------------------------------------------------------------
2196 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2198 *----------------------------------------------------------------------------*/
2200 static const float64 float32_exp2_coefficients
[15] =
2202 const_float64( 0x3ff0000000000000ll
), /* 1 */
2203 const_float64( 0x3fe0000000000000ll
), /* 2 */
2204 const_float64( 0x3fc5555555555555ll
), /* 3 */
2205 const_float64( 0x3fa5555555555555ll
), /* 4 */
2206 const_float64( 0x3f81111111111111ll
), /* 5 */
2207 const_float64( 0x3f56c16c16c16c17ll
), /* 6 */
2208 const_float64( 0x3f2a01a01a01a01all
), /* 7 */
2209 const_float64( 0x3efa01a01a01a01all
), /* 8 */
2210 const_float64( 0x3ec71de3a556c734ll
), /* 9 */
2211 const_float64( 0x3e927e4fb7789f5cll
), /* 10 */
2212 const_float64( 0x3e5ae64567f544e4ll
), /* 11 */
2213 const_float64( 0x3e21eed8eff8d898ll
), /* 12 */
2214 const_float64( 0x3de6124613a86d09ll
), /* 13 */
2215 const_float64( 0x3da93974a8c07c9dll
), /* 14 */
2216 const_float64( 0x3d6ae7f3e733b81fll
), /* 15 */
2219 float32
float32_exp2( float32 a STATUS_PARAM
)
2226 a
= float32_squash_input_denormal(a STATUS_VAR
);
2228 aSig
= extractFloat32Frac( a
);
2229 aExp
= extractFloat32Exp( a
);
2230 aSign
= extractFloat32Sign( a
);
2232 if ( aExp
== 0xFF) {
2233 if ( aSig
) return propagateFloat32NaN( a
, float32_zero STATUS_VAR
);
2234 return (aSign
) ? float32_zero
: a
;
2237 if (aSig
== 0) return float32_one
;
2240 float_raise( float_flag_inexact STATUS_VAR
);
2242 /* ******************************* */
2243 /* using float64 for approximation */
2244 /* ******************************* */
2245 x
= float32_to_float64(a STATUS_VAR
);
2246 x
= float64_mul(x
, float64_ln2 STATUS_VAR
);
2250 for (i
= 0 ; i
< 15 ; i
++) {
2253 f
= float64_mul(xn
, float32_exp2_coefficients
[i
] STATUS_VAR
);
2254 r
= float64_add(r
, f STATUS_VAR
);
2256 xn
= float64_mul(xn
, x STATUS_VAR
);
2259 return float64_to_float32(r
, status
);
2262 /*----------------------------------------------------------------------------
2263 | Returns the binary log of the single-precision floating-point value `a'.
2264 | The operation is performed according to the IEC/IEEE Standard for Binary
2265 | Floating-Point Arithmetic.
2266 *----------------------------------------------------------------------------*/
2267 float32
float32_log2( float32 a STATUS_PARAM
)
2271 bits32 aSig
, zSig
, i
;
2273 a
= float32_squash_input_denormal(a STATUS_VAR
);
2274 aSig
= extractFloat32Frac( a
);
2275 aExp
= extractFloat32Exp( a
);
2276 aSign
= extractFloat32Sign( a
);
2279 if ( aSig
== 0 ) return packFloat32( 1, 0xFF, 0 );
2280 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
2283 float_raise( float_flag_invalid STATUS_VAR
);
2284 return float32_default_nan
;
2286 if ( aExp
== 0xFF ) {
2287 if ( aSig
) return propagateFloat32NaN( a
, float32_zero STATUS_VAR
);
2296 for (i
= 1 << 22; i
> 0; i
>>= 1) {
2297 aSig
= ( (bits64
)aSig
* aSig
) >> 23;
2298 if ( aSig
& 0x01000000 ) {
2307 return normalizeRoundAndPackFloat32( zSign
, 0x85, zSig STATUS_VAR
);
2310 /*----------------------------------------------------------------------------
2311 | Returns 1 if the single-precision floating-point value `a' is equal to
2312 | the corresponding value `b', and 0 otherwise. The comparison is performed
2313 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2314 *----------------------------------------------------------------------------*/
2316 int float32_eq( float32 a
, float32 b STATUS_PARAM
)
2318 a
= float32_squash_input_denormal(a STATUS_VAR
);
2319 b
= float32_squash_input_denormal(b STATUS_VAR
);
2321 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2322 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2324 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
2325 float_raise( float_flag_invalid STATUS_VAR
);
2329 return ( float32_val(a
) == float32_val(b
) ) ||
2330 ( (bits32
) ( ( float32_val(a
) | float32_val(b
) )<<1 ) == 0 );
2334 /*----------------------------------------------------------------------------
2335 | Returns 1 if the single-precision floating-point value `a' is less than
2336 | or equal to the corresponding value `b', and 0 otherwise. The comparison
2337 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2339 *----------------------------------------------------------------------------*/
2341 int float32_le( float32 a
, float32 b STATUS_PARAM
)
2345 a
= float32_squash_input_denormal(a STATUS_VAR
);
2346 b
= float32_squash_input_denormal(b STATUS_VAR
);
2348 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2349 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2351 float_raise( float_flag_invalid STATUS_VAR
);
2354 aSign
= extractFloat32Sign( a
);
2355 bSign
= extractFloat32Sign( b
);
2356 av
= float32_val(a
);
2357 bv
= float32_val(b
);
2358 if ( aSign
!= bSign
) return aSign
|| ( (bits32
) ( ( av
| bv
)<<1 ) == 0 );
2359 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
2363 /*----------------------------------------------------------------------------
2364 | Returns 1 if the single-precision floating-point value `a' is less than
2365 | the corresponding value `b', and 0 otherwise. The comparison is performed
2366 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2367 *----------------------------------------------------------------------------*/
2369 int float32_lt( float32 a
, float32 b STATUS_PARAM
)
2373 a
= float32_squash_input_denormal(a STATUS_VAR
);
2374 b
= float32_squash_input_denormal(b STATUS_VAR
);
2376 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2377 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2379 float_raise( float_flag_invalid STATUS_VAR
);
2382 aSign
= extractFloat32Sign( a
);
2383 bSign
= extractFloat32Sign( b
);
2384 av
= float32_val(a
);
2385 bv
= float32_val(b
);
2386 if ( aSign
!= bSign
) return aSign
&& ( (bits32
) ( ( av
| bv
)<<1 ) != 0 );
2387 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
2391 /*----------------------------------------------------------------------------
2392 | Returns 1 if the single-precision floating-point value `a' is equal to
2393 | the corresponding value `b', and 0 otherwise. The invalid exception is
2394 | raised if either operand is a NaN. Otherwise, the comparison is performed
2395 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2396 *----------------------------------------------------------------------------*/
2398 int float32_eq_signaling( float32 a
, float32 b STATUS_PARAM
)
2401 a
= float32_squash_input_denormal(a STATUS_VAR
);
2402 b
= float32_squash_input_denormal(b STATUS_VAR
);
2404 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2405 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2407 float_raise( float_flag_invalid STATUS_VAR
);
2410 av
= float32_val(a
);
2411 bv
= float32_val(b
);
2412 return ( av
== bv
) || ( (bits32
) ( ( av
| bv
)<<1 ) == 0 );
2416 /*----------------------------------------------------------------------------
2417 | Returns 1 if the single-precision floating-point value `a' is less than or
2418 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2419 | cause an exception. Otherwise, the comparison is performed according to the
2420 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2421 *----------------------------------------------------------------------------*/
2423 int float32_le_quiet( float32 a
, float32 b STATUS_PARAM
)
2427 a
= float32_squash_input_denormal(a STATUS_VAR
);
2428 b
= float32_squash_input_denormal(b STATUS_VAR
);
2430 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2431 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2433 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
2434 float_raise( float_flag_invalid STATUS_VAR
);
2438 aSign
= extractFloat32Sign( a
);
2439 bSign
= extractFloat32Sign( b
);
2440 av
= float32_val(a
);
2441 bv
= float32_val(b
);
2442 if ( aSign
!= bSign
) return aSign
|| ( (bits32
) ( ( av
| bv
)<<1 ) == 0 );
2443 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
2447 /*----------------------------------------------------------------------------
2448 | Returns 1 if the single-precision floating-point value `a' is less than
2449 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2450 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
2451 | Standard for Binary Floating-Point Arithmetic.
2452 *----------------------------------------------------------------------------*/
2454 int float32_lt_quiet( float32 a
, float32 b STATUS_PARAM
)
2458 a
= float32_squash_input_denormal(a STATUS_VAR
);
2459 b
= float32_squash_input_denormal(b STATUS_VAR
);
2461 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2462 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2464 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
2465 float_raise( float_flag_invalid STATUS_VAR
);
2469 aSign
= extractFloat32Sign( a
);
2470 bSign
= extractFloat32Sign( b
);
2471 av
= float32_val(a
);
2472 bv
= float32_val(b
);
2473 if ( aSign
!= bSign
) return aSign
&& ( (bits32
) ( ( av
| bv
)<<1 ) != 0 );
2474 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
2478 /*----------------------------------------------------------------------------
2479 | Returns the result of converting the double-precision floating-point value
2480 | `a' to the 32-bit two's complement integer format. The conversion is
2481 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2482 | Arithmetic---which means in particular that the conversion is rounded
2483 | according to the current rounding mode. If `a' is a NaN, the largest
2484 | positive integer is returned. Otherwise, if the conversion overflows, the
2485 | largest integer with the same sign as `a' is returned.
2486 *----------------------------------------------------------------------------*/
2488 int32
float64_to_int32( float64 a STATUS_PARAM
)
2491 int16 aExp
, shiftCount
;
2493 a
= float64_squash_input_denormal(a STATUS_VAR
);
2495 aSig
= extractFloat64Frac( a
);
2496 aExp
= extractFloat64Exp( a
);
2497 aSign
= extractFloat64Sign( a
);
2498 if ( ( aExp
== 0x7FF ) && aSig
) aSign
= 0;
2499 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
2500 shiftCount
= 0x42C - aExp
;
2501 if ( 0 < shiftCount
) shift64RightJamming( aSig
, shiftCount
, &aSig
);
2502 return roundAndPackInt32( aSign
, aSig STATUS_VAR
);
2506 /*----------------------------------------------------------------------------
2507 | Returns the result of converting the double-precision floating-point value
2508 | `a' to the 32-bit two's complement integer format. The conversion is
2509 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2510 | Arithmetic, except that the conversion is always rounded toward zero.
2511 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2512 | the conversion overflows, the largest integer with the same sign as `a' is
2514 *----------------------------------------------------------------------------*/
2516 int32
float64_to_int32_round_to_zero( float64 a STATUS_PARAM
)
2519 int16 aExp
, shiftCount
;
2520 bits64 aSig
, savedASig
;
2522 a
= float64_squash_input_denormal(a STATUS_VAR
);
2524 aSig
= extractFloat64Frac( a
);
2525 aExp
= extractFloat64Exp( a
);
2526 aSign
= extractFloat64Sign( a
);
2527 if ( 0x41E < aExp
) {
2528 if ( ( aExp
== 0x7FF ) && aSig
) aSign
= 0;
2531 else if ( aExp
< 0x3FF ) {
2532 if ( aExp
|| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
2535 aSig
|= LIT64( 0x0010000000000000 );
2536 shiftCount
= 0x433 - aExp
;
2538 aSig
>>= shiftCount
;
2540 if ( aSign
) z
= - z
;
2541 if ( ( z
< 0 ) ^ aSign
) {
2543 float_raise( float_flag_invalid STATUS_VAR
);
2544 return aSign
? (sbits32
) 0x80000000 : 0x7FFFFFFF;
2546 if ( ( aSig
<<shiftCount
) != savedASig
) {
2547 STATUS(float_exception_flags
) |= float_flag_inexact
;
2553 /*----------------------------------------------------------------------------
2554 | Returns the result of converting the double-precision floating-point value
2555 | `a' to the 16-bit two's complement integer format. The conversion is
2556 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2557 | Arithmetic, except that the conversion is always rounded toward zero.
2558 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2559 | the conversion overflows, the largest integer with the same sign as `a' is
2561 *----------------------------------------------------------------------------*/
2563 int16
float64_to_int16_round_to_zero( float64 a STATUS_PARAM
)
2566 int16 aExp
, shiftCount
;
2567 bits64 aSig
, savedASig
;
2570 aSig
= extractFloat64Frac( a
);
2571 aExp
= extractFloat64Exp( a
);
2572 aSign
= extractFloat64Sign( a
);
2573 if ( 0x40E < aExp
) {
2574 if ( ( aExp
== 0x7FF ) && aSig
) {
2579 else if ( aExp
< 0x3FF ) {
2580 if ( aExp
|| aSig
) {
2581 STATUS(float_exception_flags
) |= float_flag_inexact
;
2585 aSig
|= LIT64( 0x0010000000000000 );
2586 shiftCount
= 0x433 - aExp
;
2588 aSig
>>= shiftCount
;
2593 if ( ( (int16_t)z
< 0 ) ^ aSign
) {
2595 float_raise( float_flag_invalid STATUS_VAR
);
2596 return aSign
? (sbits32
) 0xffff8000 : 0x7FFF;
2598 if ( ( aSig
<<shiftCount
) != savedASig
) {
2599 STATUS(float_exception_flags
) |= float_flag_inexact
;
2604 /*----------------------------------------------------------------------------
2605 | Returns the result of converting the double-precision floating-point value
2606 | `a' to the 64-bit two's complement integer format. The conversion is
2607 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2608 | Arithmetic---which means in particular that the conversion is rounded
2609 | according to the current rounding mode. If `a' is a NaN, the largest
2610 | positive integer is returned. Otherwise, if the conversion overflows, the
2611 | largest integer with the same sign as `a' is returned.
2612 *----------------------------------------------------------------------------*/
2614 int64
float64_to_int64( float64 a STATUS_PARAM
)
2617 int16 aExp
, shiftCount
;
2618 bits64 aSig
, aSigExtra
;
2619 a
= float64_squash_input_denormal(a STATUS_VAR
);
2621 aSig
= extractFloat64Frac( a
);
2622 aExp
= extractFloat64Exp( a
);
2623 aSign
= extractFloat64Sign( a
);
2624 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
2625 shiftCount
= 0x433 - aExp
;
2626 if ( shiftCount
<= 0 ) {
2627 if ( 0x43E < aExp
) {
2628 float_raise( float_flag_invalid STATUS_VAR
);
2630 || ( ( aExp
== 0x7FF )
2631 && ( aSig
!= LIT64( 0x0010000000000000 ) ) )
2633 return LIT64( 0x7FFFFFFFFFFFFFFF );
2635 return (sbits64
) LIT64( 0x8000000000000000 );
2638 aSig
<<= - shiftCount
;
2641 shift64ExtraRightJamming( aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
2643 return roundAndPackInt64( aSign
, aSig
, aSigExtra STATUS_VAR
);
2647 /*----------------------------------------------------------------------------
2648 | Returns the result of converting the double-precision floating-point value
2649 | `a' to the 64-bit two's complement integer format. The conversion is
2650 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2651 | Arithmetic, except that the conversion is always rounded toward zero.
2652 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2653 | the conversion overflows, the largest integer with the same sign as `a' is
2655 *----------------------------------------------------------------------------*/
2657 int64
float64_to_int64_round_to_zero( float64 a STATUS_PARAM
)
2660 int16 aExp
, shiftCount
;
2663 a
= float64_squash_input_denormal(a STATUS_VAR
);
2665 aSig
= extractFloat64Frac( a
);
2666 aExp
= extractFloat64Exp( a
);
2667 aSign
= extractFloat64Sign( a
);
2668 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
2669 shiftCount
= aExp
- 0x433;
2670 if ( 0 <= shiftCount
) {
2671 if ( 0x43E <= aExp
) {
2672 if ( float64_val(a
) != LIT64( 0xC3E0000000000000 ) ) {
2673 float_raise( float_flag_invalid STATUS_VAR
);
2675 || ( ( aExp
== 0x7FF )
2676 && ( aSig
!= LIT64( 0x0010000000000000 ) ) )
2678 return LIT64( 0x7FFFFFFFFFFFFFFF );
2681 return (sbits64
) LIT64( 0x8000000000000000 );
2683 z
= aSig
<<shiftCount
;
2686 if ( aExp
< 0x3FE ) {
2687 if ( aExp
| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
2690 z
= aSig
>>( - shiftCount
);
2691 if ( (bits64
) ( aSig
<<( shiftCount
& 63 ) ) ) {
2692 STATUS(float_exception_flags
) |= float_flag_inexact
;
2695 if ( aSign
) z
= - z
;
2700 /*----------------------------------------------------------------------------
2701 | Returns the result of converting the double-precision floating-point value
2702 | `a' to the single-precision floating-point format. The conversion is
2703 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2705 *----------------------------------------------------------------------------*/
2707 float32
float64_to_float32( float64 a STATUS_PARAM
)
2713 a
= float64_squash_input_denormal(a STATUS_VAR
);
2715 aSig
= extractFloat64Frac( a
);
2716 aExp
= extractFloat64Exp( a
);
2717 aSign
= extractFloat64Sign( a
);
2718 if ( aExp
== 0x7FF ) {
2719 if ( aSig
) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
2720 return packFloat32( aSign
, 0xFF, 0 );
2722 shift64RightJamming( aSig
, 22, &aSig
);
2724 if ( aExp
|| zSig
) {
2728 return roundAndPackFloat32( aSign
, aExp
, zSig STATUS_VAR
);
2733 /*----------------------------------------------------------------------------
2734 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2735 | half-precision floating-point value, returning the result. After being
2736 | shifted into the proper positions, the three fields are simply added
2737 | together to form the result. This means that any integer portion of `zSig'
2738 | will be added into the exponent. Since a properly normalized significand
2739 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2740 | than the desired result exponent whenever `zSig' is a complete, normalized
2742 *----------------------------------------------------------------------------*/
2743 static float16
packFloat16(flag zSign
, int16 zExp
, bits16 zSig
)
2745 return make_float16(
2746 (((bits32
)zSign
) << 15) + (((bits32
)zExp
) << 10) + zSig
);
2749 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
2750 The latter gains extra exponent range by omitting the NaN/Inf encodings. */
2752 float32
float16_to_float32(float16 a
, flag ieee STATUS_PARAM
)
2758 aSign
= extractFloat16Sign(a
);
2759 aExp
= extractFloat16Exp(a
);
2760 aSig
= extractFloat16Frac(a
);
2762 if (aExp
== 0x1f && ieee
) {
2764 return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR
) STATUS_VAR
);
2766 return packFloat32(aSign
, 0xff, aSig
<< 13);
2772 return packFloat32(aSign
, 0, 0);
2775 shiftCount
= countLeadingZeros32( aSig
) - 21;
2776 aSig
= aSig
<< shiftCount
;
2779 return packFloat32( aSign
, aExp
+ 0x70, aSig
<< 13);
2782 float16
float32_to_float16(float32 a
, flag ieee STATUS_PARAM
)
2790 a
= float32_squash_input_denormal(a STATUS_VAR
);
2792 aSig
= extractFloat32Frac( a
);
2793 aExp
= extractFloat32Exp( a
);
2794 aSign
= extractFloat32Sign( a
);
2795 if ( aExp
== 0xFF ) {
2797 /* Input is a NaN */
2798 float16 r
= commonNaNToFloat16( float32ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
2800 return packFloat16(aSign
, 0, 0);
2806 float_raise(float_flag_invalid STATUS_VAR
);
2807 return packFloat16(aSign
, 0x1f, 0x3ff);
2809 return packFloat16(aSign
, 0x1f, 0);
2811 if (aExp
== 0 && aSig
== 0) {
2812 return packFloat16(aSign
, 0, 0);
2814 /* Decimal point between bits 22 and 23. */
2826 float_raise( float_flag_underflow STATUS_VAR
);
2827 roundingMode
= STATUS(float_rounding_mode
);
2828 switch (roundingMode
) {
2829 case float_round_nearest_even
:
2830 increment
= (mask
+ 1) >> 1;
2831 if ((aSig
& mask
) == increment
) {
2832 increment
= aSig
& (increment
<< 1);
2835 case float_round_up
:
2836 increment
= aSign
? 0 : mask
;
2838 case float_round_down
:
2839 increment
= aSign
? mask
: 0;
2841 default: /* round_to_zero */
2846 if (aSig
>= 0x01000000) {
2850 } else if (aExp
< -14
2851 && STATUS(float_detect_tininess
) == float_tininess_before_rounding
) {
2852 float_raise( float_flag_underflow STATUS_VAR
);
2857 float_raise( float_flag_overflow
| float_flag_inexact STATUS_VAR
);
2858 return packFloat16(aSign
, 0x1f, 0);
2862 float_raise(float_flag_invalid
| float_flag_inexact STATUS_VAR
);
2863 return packFloat16(aSign
, 0x1f, 0x3ff);
2867 return packFloat16(aSign
, 0, 0);
2870 aSig
>>= -14 - aExp
;
2873 return packFloat16(aSign
, aExp
+ 14, aSig
>> 13);
2878 /*----------------------------------------------------------------------------
2879 | Returns the result of converting the double-precision floating-point value
2880 | `a' to the extended double-precision floating-point format. The conversion
2881 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2883 *----------------------------------------------------------------------------*/
2885 floatx80
float64_to_floatx80( float64 a STATUS_PARAM
)
2891 a
= float64_squash_input_denormal(a STATUS_VAR
);
2892 aSig
= extractFloat64Frac( a
);
2893 aExp
= extractFloat64Exp( a
);
2894 aSign
= extractFloat64Sign( a
);
2895 if ( aExp
== 0x7FF ) {
2896 if ( aSig
) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
2897 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
2900 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
2901 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2905 aSign
, aExp
+ 0x3C00, ( aSig
| LIT64( 0x0010000000000000 ) )<<11 );
2913 /*----------------------------------------------------------------------------
2914 | Returns the result of converting the double-precision floating-point value
2915 | `a' to the quadruple-precision floating-point format. The conversion is
2916 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2918 *----------------------------------------------------------------------------*/
2920 float128
float64_to_float128( float64 a STATUS_PARAM
)
2924 bits64 aSig
, zSig0
, zSig1
;
2926 a
= float64_squash_input_denormal(a STATUS_VAR
);
2927 aSig
= extractFloat64Frac( a
);
2928 aExp
= extractFloat64Exp( a
);
2929 aSign
= extractFloat64Sign( a
);
2930 if ( aExp
== 0x7FF ) {
2931 if ( aSig
) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
2932 return packFloat128( aSign
, 0x7FFF, 0, 0 );
2935 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
2936 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2939 shift128Right( aSig
, 0, 4, &zSig0
, &zSig1
);
2940 return packFloat128( aSign
, aExp
+ 0x3C00, zSig0
, zSig1
);
2946 /*----------------------------------------------------------------------------
2947 | Rounds the double-precision floating-point value `a' to an integer, and
2948 | returns the result as a double-precision floating-point value. The
2949 | operation is performed according to the IEC/IEEE Standard for Binary
2950 | Floating-Point Arithmetic.
2951 *----------------------------------------------------------------------------*/
2953 float64
float64_round_to_int( float64 a STATUS_PARAM
)
2957 bits64 lastBitMask
, roundBitsMask
;
2960 a
= float64_squash_input_denormal(a STATUS_VAR
);
2962 aExp
= extractFloat64Exp( a
);
2963 if ( 0x433 <= aExp
) {
2964 if ( ( aExp
== 0x7FF ) && extractFloat64Frac( a
) ) {
2965 return propagateFloat64NaN( a
, a STATUS_VAR
);
2969 if ( aExp
< 0x3FF ) {
2970 if ( (bits64
) ( float64_val(a
)<<1 ) == 0 ) return a
;
2971 STATUS(float_exception_flags
) |= float_flag_inexact
;
2972 aSign
= extractFloat64Sign( a
);
2973 switch ( STATUS(float_rounding_mode
) ) {
2974 case float_round_nearest_even
:
2975 if ( ( aExp
== 0x3FE ) && extractFloat64Frac( a
) ) {
2976 return packFloat64( aSign
, 0x3FF, 0 );
2979 case float_round_down
:
2980 return make_float64(aSign
? LIT64( 0xBFF0000000000000 ) : 0);
2981 case float_round_up
:
2982 return make_float64(
2983 aSign
? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
2985 return packFloat64( aSign
, 0, 0 );
2988 lastBitMask
<<= 0x433 - aExp
;
2989 roundBitsMask
= lastBitMask
- 1;
2991 roundingMode
= STATUS(float_rounding_mode
);
2992 if ( roundingMode
== float_round_nearest_even
) {
2993 z
+= lastBitMask
>>1;
2994 if ( ( z
& roundBitsMask
) == 0 ) z
&= ~ lastBitMask
;
2996 else if ( roundingMode
!= float_round_to_zero
) {
2997 if ( extractFloat64Sign( make_float64(z
) ) ^ ( roundingMode
== float_round_up
) ) {
3001 z
&= ~ roundBitsMask
;
3002 if ( z
!= float64_val(a
) )
3003 STATUS(float_exception_flags
) |= float_flag_inexact
;
3004 return make_float64(z
);
3008 float64
float64_trunc_to_int( float64 a STATUS_PARAM
)
3012 oldmode
= STATUS(float_rounding_mode
);
3013 STATUS(float_rounding_mode
) = float_round_to_zero
;
3014 res
= float64_round_to_int(a STATUS_VAR
);
3015 STATUS(float_rounding_mode
) = oldmode
;
3019 /*----------------------------------------------------------------------------
3020 | Returns the result of adding the absolute values of the double-precision
3021 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
3022 | before being returned. `zSign' is ignored if the result is a NaN.
3023 | The addition is performed according to the IEC/IEEE Standard for Binary
3024 | Floating-Point Arithmetic.
3025 *----------------------------------------------------------------------------*/
3027 static float64
addFloat64Sigs( float64 a
, float64 b
, flag zSign STATUS_PARAM
)
3029 int16 aExp
, bExp
, zExp
;
3030 bits64 aSig
, bSig
, zSig
;
3033 aSig
= extractFloat64Frac( a
);
3034 aExp
= extractFloat64Exp( a
);
3035 bSig
= extractFloat64Frac( b
);
3036 bExp
= extractFloat64Exp( b
);
3037 expDiff
= aExp
- bExp
;
3040 if ( 0 < expDiff
) {
3041 if ( aExp
== 0x7FF ) {
3042 if ( aSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3049 bSig
|= LIT64( 0x2000000000000000 );
3051 shift64RightJamming( bSig
, expDiff
, &bSig
);
3054 else if ( expDiff
< 0 ) {
3055 if ( bExp
== 0x7FF ) {
3056 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3057 return packFloat64( zSign
, 0x7FF, 0 );
3063 aSig
|= LIT64( 0x2000000000000000 );
3065 shift64RightJamming( aSig
, - expDiff
, &aSig
);
3069 if ( aExp
== 0x7FF ) {
3070 if ( aSig
| bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3074 if ( STATUS(flush_to_zero
) ) return packFloat64( zSign
, 0, 0 );
3075 return packFloat64( zSign
, 0, ( aSig
+ bSig
)>>9 );
3077 zSig
= LIT64( 0x4000000000000000 ) + aSig
+ bSig
;
3081 aSig
|= LIT64( 0x2000000000000000 );
3082 zSig
= ( aSig
+ bSig
)<<1;
3084 if ( (sbits64
) zSig
< 0 ) {
3089 return roundAndPackFloat64( zSign
, zExp
, zSig STATUS_VAR
);
3093 /*----------------------------------------------------------------------------
3094 | Returns the result of subtracting the absolute values of the double-
3095 | precision floating-point values `a' and `b'. If `zSign' is 1, the
3096 | difference is negated before being returned. `zSign' is ignored if the
3097 | result is a NaN. The subtraction is performed according to the IEC/IEEE
3098 | Standard for Binary Floating-Point Arithmetic.
3099 *----------------------------------------------------------------------------*/
3101 static float64
subFloat64Sigs( float64 a
, float64 b
, flag zSign STATUS_PARAM
)
3103 int16 aExp
, bExp
, zExp
;
3104 bits64 aSig
, bSig
, zSig
;
3107 aSig
= extractFloat64Frac( a
);
3108 aExp
= extractFloat64Exp( a
);
3109 bSig
= extractFloat64Frac( b
);
3110 bExp
= extractFloat64Exp( b
);
3111 expDiff
= aExp
- bExp
;
3114 if ( 0 < expDiff
) goto aExpBigger
;
3115 if ( expDiff
< 0 ) goto bExpBigger
;
3116 if ( aExp
== 0x7FF ) {
3117 if ( aSig
| bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3118 float_raise( float_flag_invalid STATUS_VAR
);
3119 return float64_default_nan
;
3125 if ( bSig
< aSig
) goto aBigger
;
3126 if ( aSig
< bSig
) goto bBigger
;
3127 return packFloat64( STATUS(float_rounding_mode
) == float_round_down
, 0, 0 );
3129 if ( bExp
== 0x7FF ) {
3130 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3131 return packFloat64( zSign
^ 1, 0x7FF, 0 );
3137 aSig
|= LIT64( 0x4000000000000000 );
3139 shift64RightJamming( aSig
, - expDiff
, &aSig
);
3140 bSig
|= LIT64( 0x4000000000000000 );
3145 goto normalizeRoundAndPack
;
3147 if ( aExp
== 0x7FF ) {
3148 if ( aSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3155 bSig
|= LIT64( 0x4000000000000000 );
3157 shift64RightJamming( bSig
, expDiff
, &bSig
);
3158 aSig
|= LIT64( 0x4000000000000000 );
3162 normalizeRoundAndPack
:
3164 return normalizeRoundAndPackFloat64( zSign
, zExp
, zSig STATUS_VAR
);
3168 /*----------------------------------------------------------------------------
3169 | Returns the result of adding the double-precision floating-point values `a'
3170 | and `b'. The operation is performed according to the IEC/IEEE Standard for
3171 | Binary Floating-Point Arithmetic.
3172 *----------------------------------------------------------------------------*/
3174 float64
float64_add( float64 a
, float64 b STATUS_PARAM
)
3177 a
= float64_squash_input_denormal(a STATUS_VAR
);
3178 b
= float64_squash_input_denormal(b STATUS_VAR
);
3180 aSign
= extractFloat64Sign( a
);
3181 bSign
= extractFloat64Sign( b
);
3182 if ( aSign
== bSign
) {
3183 return addFloat64Sigs( a
, b
, aSign STATUS_VAR
);
3186 return subFloat64Sigs( a
, b
, aSign STATUS_VAR
);
3191 /*----------------------------------------------------------------------------
3192 | Returns the result of subtracting the double-precision floating-point values
3193 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3194 | for Binary Floating-Point Arithmetic.
3195 *----------------------------------------------------------------------------*/
3197 float64
float64_sub( float64 a
, float64 b STATUS_PARAM
)
3200 a
= float64_squash_input_denormal(a STATUS_VAR
);
3201 b
= float64_squash_input_denormal(b STATUS_VAR
);
3203 aSign
= extractFloat64Sign( a
);
3204 bSign
= extractFloat64Sign( b
);
3205 if ( aSign
== bSign
) {
3206 return subFloat64Sigs( a
, b
, aSign STATUS_VAR
);
3209 return addFloat64Sigs( a
, b
, aSign STATUS_VAR
);
3214 /*----------------------------------------------------------------------------
3215 | Returns the result of multiplying the double-precision floating-point values
3216 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3217 | for Binary Floating-Point Arithmetic.
3218 *----------------------------------------------------------------------------*/
3220 float64
float64_mul( float64 a
, float64 b STATUS_PARAM
)
3222 flag aSign
, bSign
, zSign
;
3223 int16 aExp
, bExp
, zExp
;
3224 bits64 aSig
, bSig
, zSig0
, zSig1
;
3226 a
= float64_squash_input_denormal(a STATUS_VAR
);
3227 b
= float64_squash_input_denormal(b STATUS_VAR
);
3229 aSig
= extractFloat64Frac( a
);
3230 aExp
= extractFloat64Exp( a
);
3231 aSign
= extractFloat64Sign( a
);
3232 bSig
= extractFloat64Frac( b
);
3233 bExp
= extractFloat64Exp( b
);
3234 bSign
= extractFloat64Sign( b
);
3235 zSign
= aSign
^ bSign
;
3236 if ( aExp
== 0x7FF ) {
3237 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
3238 return propagateFloat64NaN( a
, b STATUS_VAR
);
3240 if ( ( bExp
| bSig
) == 0 ) {
3241 float_raise( float_flag_invalid STATUS_VAR
);
3242 return float64_default_nan
;
3244 return packFloat64( zSign
, 0x7FF, 0 );
3246 if ( bExp
== 0x7FF ) {
3247 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3248 if ( ( aExp
| aSig
) == 0 ) {
3249 float_raise( float_flag_invalid STATUS_VAR
);
3250 return float64_default_nan
;
3252 return packFloat64( zSign
, 0x7FF, 0 );
3255 if ( aSig
== 0 ) return packFloat64( zSign
, 0, 0 );
3256 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
3259 if ( bSig
== 0 ) return packFloat64( zSign
, 0, 0 );
3260 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
3262 zExp
= aExp
+ bExp
- 0x3FF;
3263 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<10;
3264 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
3265 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
3266 zSig0
|= ( zSig1
!= 0 );
3267 if ( 0 <= (sbits64
) ( zSig0
<<1 ) ) {
3271 return roundAndPackFloat64( zSign
, zExp
, zSig0 STATUS_VAR
);
3275 /*----------------------------------------------------------------------------
3276 | Returns the result of dividing the double-precision floating-point value `a'
3277 | by the corresponding value `b'. The operation is performed according to
3278 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3279 *----------------------------------------------------------------------------*/
3281 float64
float64_div( float64 a
, float64 b STATUS_PARAM
)
3283 flag aSign
, bSign
, zSign
;
3284 int16 aExp
, bExp
, zExp
;
3285 bits64 aSig
, bSig
, zSig
;
3287 bits64 term0
, term1
;
3288 a
= float64_squash_input_denormal(a STATUS_VAR
);
3289 b
= float64_squash_input_denormal(b STATUS_VAR
);
3291 aSig
= extractFloat64Frac( a
);
3292 aExp
= extractFloat64Exp( a
);
3293 aSign
= extractFloat64Sign( a
);
3294 bSig
= extractFloat64Frac( b
);
3295 bExp
= extractFloat64Exp( b
);
3296 bSign
= extractFloat64Sign( b
);
3297 zSign
= aSign
^ bSign
;
3298 if ( aExp
== 0x7FF ) {
3299 if ( aSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3300 if ( bExp
== 0x7FF ) {
3301 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3302 float_raise( float_flag_invalid STATUS_VAR
);
3303 return float64_default_nan
;
3305 return packFloat64( zSign
, 0x7FF, 0 );
3307 if ( bExp
== 0x7FF ) {
3308 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3309 return packFloat64( zSign
, 0, 0 );
3313 if ( ( aExp
| aSig
) == 0 ) {
3314 float_raise( float_flag_invalid STATUS_VAR
);
3315 return float64_default_nan
;
3317 float_raise( float_flag_divbyzero STATUS_VAR
);
3318 return packFloat64( zSign
, 0x7FF, 0 );
3320 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
3323 if ( aSig
== 0 ) return packFloat64( zSign
, 0, 0 );
3324 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
3326 zExp
= aExp
- bExp
+ 0x3FD;
3327 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<10;
3328 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
3329 if ( bSig
<= ( aSig
+ aSig
) ) {
3333 zSig
= estimateDiv128To64( aSig
, 0, bSig
);
3334 if ( ( zSig
& 0x1FF ) <= 2 ) {
3335 mul64To128( bSig
, zSig
, &term0
, &term1
);
3336 sub128( aSig
, 0, term0
, term1
, &rem0
, &rem1
);
3337 while ( (sbits64
) rem0
< 0 ) {
3339 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
3341 zSig
|= ( rem1
!= 0 );
3343 return roundAndPackFloat64( zSign
, zExp
, zSig STATUS_VAR
);
3347 /*----------------------------------------------------------------------------
3348 | Returns the remainder of the double-precision floating-point value `a'
3349 | with respect to the corresponding value `b'. The operation is performed
3350 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3351 *----------------------------------------------------------------------------*/
3353 float64
float64_rem( float64 a
, float64 b STATUS_PARAM
)
3356 int16 aExp
, bExp
, expDiff
;
3358 bits64 q
, alternateASig
;
3361 a
= float64_squash_input_denormal(a STATUS_VAR
);
3362 b
= float64_squash_input_denormal(b STATUS_VAR
);
3363 aSig
= extractFloat64Frac( a
);
3364 aExp
= extractFloat64Exp( a
);
3365 aSign
= extractFloat64Sign( a
);
3366 bSig
= extractFloat64Frac( b
);
3367 bExp
= extractFloat64Exp( b
);
3368 if ( aExp
== 0x7FF ) {
3369 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
3370 return propagateFloat64NaN( a
, b STATUS_VAR
);
3372 float_raise( float_flag_invalid STATUS_VAR
);
3373 return float64_default_nan
;
3375 if ( bExp
== 0x7FF ) {
3376 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3381 float_raise( float_flag_invalid STATUS_VAR
);
3382 return float64_default_nan
;
3384 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
3387 if ( aSig
== 0 ) return a
;
3388 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
3390 expDiff
= aExp
- bExp
;
3391 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<11;
3392 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
3393 if ( expDiff
< 0 ) {
3394 if ( expDiff
< -1 ) return a
;
3397 q
= ( bSig
<= aSig
);
3398 if ( q
) aSig
-= bSig
;
3400 while ( 0 < expDiff
) {
3401 q
= estimateDiv128To64( aSig
, 0, bSig
);
3402 q
= ( 2 < q
) ? q
- 2 : 0;
3403 aSig
= - ( ( bSig
>>2 ) * q
);
3407 if ( 0 < expDiff
) {
3408 q
= estimateDiv128To64( aSig
, 0, bSig
);
3409 q
= ( 2 < q
) ? q
- 2 : 0;
3412 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
3419 alternateASig
= aSig
;
3422 } while ( 0 <= (sbits64
) aSig
);
3423 sigMean
= aSig
+ alternateASig
;
3424 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
3425 aSig
= alternateASig
;
3427 zSign
= ( (sbits64
) aSig
< 0 );
3428 if ( zSign
) aSig
= - aSig
;
3429 return normalizeRoundAndPackFloat64( aSign
^ zSign
, bExp
, aSig STATUS_VAR
);
3433 /*----------------------------------------------------------------------------
3434 | Returns the square root of the double-precision floating-point value `a'.
3435 | The operation is performed according to the IEC/IEEE Standard for Binary
3436 | Floating-Point Arithmetic.
3437 *----------------------------------------------------------------------------*/
3439 float64
float64_sqrt( float64 a STATUS_PARAM
)
3443 bits64 aSig
, zSig
, doubleZSig
;
3444 bits64 rem0
, rem1
, term0
, term1
;
3445 a
= float64_squash_input_denormal(a STATUS_VAR
);
3447 aSig
= extractFloat64Frac( a
);
3448 aExp
= extractFloat64Exp( a
);
3449 aSign
= extractFloat64Sign( a
);
3450 if ( aExp
== 0x7FF ) {
3451 if ( aSig
) return propagateFloat64NaN( a
, a STATUS_VAR
);
3452 if ( ! aSign
) return a
;
3453 float_raise( float_flag_invalid STATUS_VAR
);
3454 return float64_default_nan
;
3457 if ( ( aExp
| aSig
) == 0 ) return a
;
3458 float_raise( float_flag_invalid STATUS_VAR
);
3459 return float64_default_nan
;
3462 if ( aSig
== 0 ) return float64_zero
;
3463 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
3465 zExp
= ( ( aExp
- 0x3FF )>>1 ) + 0x3FE;
3466 aSig
|= LIT64( 0x0010000000000000 );
3467 zSig
= estimateSqrt32( aExp
, aSig
>>21 );
3468 aSig
<<= 9 - ( aExp
& 1 );
3469 zSig
= estimateDiv128To64( aSig
, 0, zSig
<<32 ) + ( zSig
<<30 );
3470 if ( ( zSig
& 0x1FF ) <= 5 ) {
3471 doubleZSig
= zSig
<<1;
3472 mul64To128( zSig
, zSig
, &term0
, &term1
);
3473 sub128( aSig
, 0, term0
, term1
, &rem0
, &rem1
);
3474 while ( (sbits64
) rem0
< 0 ) {
3477 add128( rem0
, rem1
, zSig
>>63, doubleZSig
| 1, &rem0
, &rem1
);
3479 zSig
|= ( ( rem0
| rem1
) != 0 );
3481 return roundAndPackFloat64( 0, zExp
, zSig STATUS_VAR
);
3485 /*----------------------------------------------------------------------------
3486 | Returns the binary log of the double-precision floating-point value `a'.
3487 | The operation is performed according to the IEC/IEEE Standard for Binary
3488 | Floating-Point Arithmetic.
3489 *----------------------------------------------------------------------------*/
3490 float64
float64_log2( float64 a STATUS_PARAM
)
3494 bits64 aSig
, aSig0
, aSig1
, zSig
, i
;
3495 a
= float64_squash_input_denormal(a STATUS_VAR
);
3497 aSig
= extractFloat64Frac( a
);
3498 aExp
= extractFloat64Exp( a
);
3499 aSign
= extractFloat64Sign( a
);
3502 if ( aSig
== 0 ) return packFloat64( 1, 0x7FF, 0 );
3503 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
3506 float_raise( float_flag_invalid STATUS_VAR
);
3507 return float64_default_nan
;
3509 if ( aExp
== 0x7FF ) {
3510 if ( aSig
) return propagateFloat64NaN( a
, float64_zero STATUS_VAR
);
3515 aSig
|= LIT64( 0x0010000000000000 );
3517 zSig
= (bits64
)aExp
<< 52;
3518 for (i
= 1LL << 51; i
> 0; i
>>= 1) {
3519 mul64To128( aSig
, aSig
, &aSig0
, &aSig1
);
3520 aSig
= ( aSig0
<< 12 ) | ( aSig1
>> 52 );
3521 if ( aSig
& LIT64( 0x0020000000000000 ) ) {
3529 return normalizeRoundAndPackFloat64( zSign
, 0x408, zSig STATUS_VAR
);
3532 /*----------------------------------------------------------------------------
3533 | Returns 1 if the double-precision floating-point value `a' is equal to the
3534 | corresponding value `b', and 0 otherwise. The comparison is performed
3535 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3536 *----------------------------------------------------------------------------*/
3538 int float64_eq( float64 a
, float64 b STATUS_PARAM
)
3541 a
= float64_squash_input_denormal(a STATUS_VAR
);
3542 b
= float64_squash_input_denormal(b STATUS_VAR
);
3544 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3545 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3547 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
3548 float_raise( float_flag_invalid STATUS_VAR
);
3552 av
= float64_val(a
);
3553 bv
= float64_val(b
);
3554 return ( av
== bv
) || ( (bits64
) ( ( av
| bv
)<<1 ) == 0 );
3558 /*----------------------------------------------------------------------------
3559 | Returns 1 if the double-precision floating-point value `a' is less than or
3560 | equal to the corresponding value `b', and 0 otherwise. The comparison is
3561 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3563 *----------------------------------------------------------------------------*/
3565 int float64_le( float64 a
, float64 b STATUS_PARAM
)
3569 a
= float64_squash_input_denormal(a STATUS_VAR
);
3570 b
= float64_squash_input_denormal(b STATUS_VAR
);
3572 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3573 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3575 float_raise( float_flag_invalid STATUS_VAR
);
3578 aSign
= extractFloat64Sign( a
);
3579 bSign
= extractFloat64Sign( b
);
3580 av
= float64_val(a
);
3581 bv
= float64_val(b
);
3582 if ( aSign
!= bSign
) return aSign
|| ( (bits64
) ( ( av
| bv
)<<1 ) == 0 );
3583 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
3587 /*----------------------------------------------------------------------------
3588 | Returns 1 if the double-precision floating-point value `a' is less than
3589 | the corresponding value `b', and 0 otherwise. The comparison is performed
3590 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3591 *----------------------------------------------------------------------------*/
3593 int float64_lt( float64 a
, float64 b STATUS_PARAM
)
3598 a
= float64_squash_input_denormal(a STATUS_VAR
);
3599 b
= float64_squash_input_denormal(b STATUS_VAR
);
3600 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3601 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3603 float_raise( float_flag_invalid STATUS_VAR
);
3606 aSign
= extractFloat64Sign( a
);
3607 bSign
= extractFloat64Sign( b
);
3608 av
= float64_val(a
);
3609 bv
= float64_val(b
);
3610 if ( aSign
!= bSign
) return aSign
&& ( (bits64
) ( ( av
| bv
)<<1 ) != 0 );
3611 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
3615 /*----------------------------------------------------------------------------
3616 | Returns 1 if the double-precision floating-point value `a' is equal to the
3617 | corresponding value `b', and 0 otherwise. The invalid exception is raised
3618 | if either operand is a NaN. Otherwise, the comparison is performed
3619 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3620 *----------------------------------------------------------------------------*/
3622 int float64_eq_signaling( float64 a
, float64 b STATUS_PARAM
)
3625 a
= float64_squash_input_denormal(a STATUS_VAR
);
3626 b
= float64_squash_input_denormal(b STATUS_VAR
);
3628 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3629 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3631 float_raise( float_flag_invalid STATUS_VAR
);
3634 av
= float64_val(a
);
3635 bv
= float64_val(b
);
3636 return ( av
== bv
) || ( (bits64
) ( ( av
| bv
)<<1 ) == 0 );
3640 /*----------------------------------------------------------------------------
3641 | Returns 1 if the double-precision floating-point value `a' is less than or
3642 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3643 | cause an exception. Otherwise, the comparison is performed according to the
3644 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3645 *----------------------------------------------------------------------------*/
3647 int float64_le_quiet( float64 a
, float64 b STATUS_PARAM
)
3651 a
= float64_squash_input_denormal(a STATUS_VAR
);
3652 b
= float64_squash_input_denormal(b STATUS_VAR
);
3654 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3655 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3657 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
3658 float_raise( float_flag_invalid STATUS_VAR
);
3662 aSign
= extractFloat64Sign( a
);
3663 bSign
= extractFloat64Sign( b
);
3664 av
= float64_val(a
);
3665 bv
= float64_val(b
);
3666 if ( aSign
!= bSign
) return aSign
|| ( (bits64
) ( ( av
| bv
)<<1 ) == 0 );
3667 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
3671 /*----------------------------------------------------------------------------
3672 | Returns 1 if the double-precision floating-point value `a' is less than
3673 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3674 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
3675 | Standard for Binary Floating-Point Arithmetic.
3676 *----------------------------------------------------------------------------*/
3678 int float64_lt_quiet( float64 a
, float64 b STATUS_PARAM
)
3682 a
= float64_squash_input_denormal(a STATUS_VAR
);
3683 b
= float64_squash_input_denormal(b STATUS_VAR
);
3685 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3686 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3688 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
3689 float_raise( float_flag_invalid STATUS_VAR
);
3693 aSign
= extractFloat64Sign( a
);
3694 bSign
= extractFloat64Sign( b
);
3695 av
= float64_val(a
);
3696 bv
= float64_val(b
);
3697 if ( aSign
!= bSign
) return aSign
&& ( (bits64
) ( ( av
| bv
)<<1 ) != 0 );
3698 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
3704 /*----------------------------------------------------------------------------
3705 | Returns the result of converting the extended double-precision floating-
3706 | point value `a' to the 32-bit two's complement integer format. The
3707 | conversion is performed according to the IEC/IEEE Standard for Binary
3708 | Floating-Point Arithmetic---which means in particular that the conversion
3709 | is rounded according to the current rounding mode. If `a' is a NaN, the
3710 | largest positive integer is returned. Otherwise, if the conversion
3711 | overflows, the largest integer with the same sign as `a' is returned.
3712 *----------------------------------------------------------------------------*/
3714 int32
floatx80_to_int32( floatx80 a STATUS_PARAM
)
3717 int32 aExp
, shiftCount
;
3720 aSig
= extractFloatx80Frac( a
);
3721 aExp
= extractFloatx80Exp( a
);
3722 aSign
= extractFloatx80Sign( a
);
3723 if ( ( aExp
== 0x7FFF ) && (bits64
) ( aSig
<<1 ) ) aSign
= 0;
3724 shiftCount
= 0x4037 - aExp
;
3725 if ( shiftCount
<= 0 ) shiftCount
= 1;
3726 shift64RightJamming( aSig
, shiftCount
, &aSig
);
3727 return roundAndPackInt32( aSign
, aSig STATUS_VAR
);
3731 /*----------------------------------------------------------------------------
3732 | Returns the result of converting the extended double-precision floating-
3733 | point value `a' to the 32-bit two's complement integer format. The
3734 | conversion is performed according to the IEC/IEEE Standard for Binary
3735 | Floating-Point Arithmetic, except that the conversion is always rounded
3736 | toward zero. If `a' is a NaN, the largest positive integer is returned.
3737 | Otherwise, if the conversion overflows, the largest integer with the same
3738 | sign as `a' is returned.
3739 *----------------------------------------------------------------------------*/
3741 int32
floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM
)
3744 int32 aExp
, shiftCount
;
3745 bits64 aSig
, savedASig
;
3748 aSig
= extractFloatx80Frac( a
);
3749 aExp
= extractFloatx80Exp( a
);
3750 aSign
= extractFloatx80Sign( a
);
3751 if ( 0x401E < aExp
) {
3752 if ( ( aExp
== 0x7FFF ) && (bits64
) ( aSig
<<1 ) ) aSign
= 0;
3755 else if ( aExp
< 0x3FFF ) {
3756 if ( aExp
|| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
3759 shiftCount
= 0x403E - aExp
;
3761 aSig
>>= shiftCount
;
3763 if ( aSign
) z
= - z
;
3764 if ( ( z
< 0 ) ^ aSign
) {
3766 float_raise( float_flag_invalid STATUS_VAR
);
3767 return aSign
? (sbits32
) 0x80000000 : 0x7FFFFFFF;
3769 if ( ( aSig
<<shiftCount
) != savedASig
) {
3770 STATUS(float_exception_flags
) |= float_flag_inexact
;
3776 /*----------------------------------------------------------------------------
3777 | Returns the result of converting the extended double-precision floating-
3778 | point value `a' to the 64-bit two's complement integer format. The
3779 | conversion is performed according to the IEC/IEEE Standard for Binary
3780 | Floating-Point Arithmetic---which means in particular that the conversion
3781 | is rounded according to the current rounding mode. If `a' is a NaN,
3782 | the largest positive integer is returned. Otherwise, if the conversion
3783 | overflows, the largest integer with the same sign as `a' is returned.
3784 *----------------------------------------------------------------------------*/
3786 int64
floatx80_to_int64( floatx80 a STATUS_PARAM
)
3789 int32 aExp
, shiftCount
;
3790 bits64 aSig
, aSigExtra
;
3792 aSig
= extractFloatx80Frac( a
);
3793 aExp
= extractFloatx80Exp( a
);
3794 aSign
= extractFloatx80Sign( a
);
3795 shiftCount
= 0x403E - aExp
;
3796 if ( shiftCount
<= 0 ) {
3798 float_raise( float_flag_invalid STATUS_VAR
);
3800 || ( ( aExp
== 0x7FFF )
3801 && ( aSig
!= LIT64( 0x8000000000000000 ) ) )
3803 return LIT64( 0x7FFFFFFFFFFFFFFF );
3805 return (sbits64
) LIT64( 0x8000000000000000 );
3810 shift64ExtraRightJamming( aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
3812 return roundAndPackInt64( aSign
, aSig
, aSigExtra STATUS_VAR
);
3816 /*----------------------------------------------------------------------------
3817 | Returns the result of converting the extended double-precision floating-
3818 | point value `a' to the 64-bit two's complement integer format. The
3819 | conversion is performed according to the IEC/IEEE Standard for Binary
3820 | Floating-Point Arithmetic, except that the conversion is always rounded
3821 | toward zero. If `a' is a NaN, the largest positive integer is returned.
3822 | Otherwise, if the conversion overflows, the largest integer with the same
3823 | sign as `a' is returned.
3824 *----------------------------------------------------------------------------*/
3826 int64
floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM
)
3829 int32 aExp
, shiftCount
;
3833 aSig
= extractFloatx80Frac( a
);
3834 aExp
= extractFloatx80Exp( a
);
3835 aSign
= extractFloatx80Sign( a
);
3836 shiftCount
= aExp
- 0x403E;
3837 if ( 0 <= shiftCount
) {
3838 aSig
&= LIT64( 0x7FFFFFFFFFFFFFFF );
3839 if ( ( a
.high
!= 0xC03E ) || aSig
) {
3840 float_raise( float_flag_invalid STATUS_VAR
);
3841 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && aSig
) ) {
3842 return LIT64( 0x7FFFFFFFFFFFFFFF );
3845 return (sbits64
) LIT64( 0x8000000000000000 );
3847 else if ( aExp
< 0x3FFF ) {
3848 if ( aExp
| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
3851 z
= aSig
>>( - shiftCount
);
3852 if ( (bits64
) ( aSig
<<( shiftCount
& 63 ) ) ) {
3853 STATUS(float_exception_flags
) |= float_flag_inexact
;
3855 if ( aSign
) z
= - z
;
3860 /*----------------------------------------------------------------------------
3861 | Returns the result of converting the extended double-precision floating-
3862 | point value `a' to the single-precision floating-point format. The
3863 | conversion is performed according to the IEC/IEEE Standard for Binary
3864 | Floating-Point Arithmetic.
3865 *----------------------------------------------------------------------------*/
3867 float32
floatx80_to_float32( floatx80 a STATUS_PARAM
)
3873 aSig
= extractFloatx80Frac( a
);
3874 aExp
= extractFloatx80Exp( a
);
3875 aSign
= extractFloatx80Sign( a
);
3876 if ( aExp
== 0x7FFF ) {
3877 if ( (bits64
) ( aSig
<<1 ) ) {
3878 return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
3880 return packFloat32( aSign
, 0xFF, 0 );
3882 shift64RightJamming( aSig
, 33, &aSig
);
3883 if ( aExp
|| aSig
) aExp
-= 0x3F81;
3884 return roundAndPackFloat32( aSign
, aExp
, aSig STATUS_VAR
);
3888 /*----------------------------------------------------------------------------
3889 | Returns the result of converting the extended double-precision floating-
3890 | point value `a' to the double-precision floating-point format. The
3891 | conversion is performed according to the IEC/IEEE Standard for Binary
3892 | Floating-Point Arithmetic.
3893 *----------------------------------------------------------------------------*/
3895 float64
floatx80_to_float64( floatx80 a STATUS_PARAM
)
3901 aSig
= extractFloatx80Frac( a
);
3902 aExp
= extractFloatx80Exp( a
);
3903 aSign
= extractFloatx80Sign( a
);
3904 if ( aExp
== 0x7FFF ) {
3905 if ( (bits64
) ( aSig
<<1 ) ) {
3906 return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
3908 return packFloat64( aSign
, 0x7FF, 0 );
3910 shift64RightJamming( aSig
, 1, &zSig
);
3911 if ( aExp
|| aSig
) aExp
-= 0x3C01;
3912 return roundAndPackFloat64( aSign
, aExp
, zSig STATUS_VAR
);
3918 /*----------------------------------------------------------------------------
3919 | Returns the result of converting the extended double-precision floating-
3920 | point value `a' to the quadruple-precision floating-point format. The
3921 | conversion is performed according to the IEC/IEEE Standard for Binary
3922 | Floating-Point Arithmetic.
3923 *----------------------------------------------------------------------------*/
3925 float128
floatx80_to_float128( floatx80 a STATUS_PARAM
)
3929 bits64 aSig
, zSig0
, zSig1
;
3931 aSig
= extractFloatx80Frac( a
);
3932 aExp
= extractFloatx80Exp( a
);
3933 aSign
= extractFloatx80Sign( a
);
3934 if ( ( aExp
== 0x7FFF ) && (bits64
) ( aSig
<<1 ) ) {
3935 return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
3937 shift128Right( aSig
<<1, 0, 16, &zSig0
, &zSig1
);
3938 return packFloat128( aSign
, aExp
, zSig0
, zSig1
);
3944 /*----------------------------------------------------------------------------
3945 | Rounds the extended double-precision floating-point value `a' to an integer,
3946 | and returns the result as an extended quadruple-precision floating-point
3947 | value. The operation is performed according to the IEC/IEEE Standard for
3948 | Binary Floating-Point Arithmetic.
3949 *----------------------------------------------------------------------------*/
3951 floatx80
floatx80_round_to_int( floatx80 a STATUS_PARAM
)
3955 bits64 lastBitMask
, roundBitsMask
;
3959 aExp
= extractFloatx80Exp( a
);
3960 if ( 0x403E <= aExp
) {
3961 if ( ( aExp
== 0x7FFF ) && (bits64
) ( extractFloatx80Frac( a
)<<1 ) ) {
3962 return propagateFloatx80NaN( a
, a STATUS_VAR
);
3966 if ( aExp
< 0x3FFF ) {
3968 && ( (bits64
) ( extractFloatx80Frac( a
)<<1 ) == 0 ) ) {
3971 STATUS(float_exception_flags
) |= float_flag_inexact
;
3972 aSign
= extractFloatx80Sign( a
);
3973 switch ( STATUS(float_rounding_mode
) ) {
3974 case float_round_nearest_even
:
3975 if ( ( aExp
== 0x3FFE ) && (bits64
) ( extractFloatx80Frac( a
)<<1 )
3978 packFloatx80( aSign
, 0x3FFF, LIT64( 0x8000000000000000 ) );
3981 case float_round_down
:
3984 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3985 : packFloatx80( 0, 0, 0 );
3986 case float_round_up
:
3988 aSign
? packFloatx80( 1, 0, 0 )
3989 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3991 return packFloatx80( aSign
, 0, 0 );
3994 lastBitMask
<<= 0x403E - aExp
;
3995 roundBitsMask
= lastBitMask
- 1;
3997 roundingMode
= STATUS(float_rounding_mode
);
3998 if ( roundingMode
== float_round_nearest_even
) {
3999 z
.low
+= lastBitMask
>>1;
4000 if ( ( z
.low
& roundBitsMask
) == 0 ) z
.low
&= ~ lastBitMask
;
4002 else if ( roundingMode
!= float_round_to_zero
) {
4003 if ( extractFloatx80Sign( z
) ^ ( roundingMode
== float_round_up
) ) {
4004 z
.low
+= roundBitsMask
;
4007 z
.low
&= ~ roundBitsMask
;
4010 z
.low
= LIT64( 0x8000000000000000 );
4012 if ( z
.low
!= a
.low
) STATUS(float_exception_flags
) |= float_flag_inexact
;
4017 /*----------------------------------------------------------------------------
4018 | Returns the result of adding the absolute values of the extended double-
4019 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
4020 | negated before being returned. `zSign' is ignored if the result is a NaN.
4021 | The addition is performed according to the IEC/IEEE Standard for Binary
4022 | Floating-Point Arithmetic.
4023 *----------------------------------------------------------------------------*/
4025 static floatx80
addFloatx80Sigs( floatx80 a
, floatx80 b
, flag zSign STATUS_PARAM
)
4027 int32 aExp
, bExp
, zExp
;
4028 bits64 aSig
, bSig
, zSig0
, zSig1
;
4031 aSig
= extractFloatx80Frac( a
);
4032 aExp
= extractFloatx80Exp( a
);
4033 bSig
= extractFloatx80Frac( b
);
4034 bExp
= extractFloatx80Exp( b
);
4035 expDiff
= aExp
- bExp
;
4036 if ( 0 < expDiff
) {
4037 if ( aExp
== 0x7FFF ) {
4038 if ( (bits64
) ( aSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
4041 if ( bExp
== 0 ) --expDiff
;
4042 shift64ExtraRightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
4045 else if ( expDiff
< 0 ) {
4046 if ( bExp
== 0x7FFF ) {
4047 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
4048 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
4050 if ( aExp
== 0 ) ++expDiff
;
4051 shift64ExtraRightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
4055 if ( aExp
== 0x7FFF ) {
4056 if ( (bits64
) ( ( aSig
| bSig
)<<1 ) ) {
4057 return propagateFloatx80NaN( a
, b STATUS_VAR
);
4062 zSig0
= aSig
+ bSig
;
4064 normalizeFloatx80Subnormal( zSig0
, &zExp
, &zSig0
);
4070 zSig0
= aSig
+ bSig
;
4071 if ( (sbits64
) zSig0
< 0 ) goto roundAndPack
;
4073 shift64ExtraRightJamming( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
4074 zSig0
|= LIT64( 0x8000000000000000 );
4078 roundAndPackFloatx80(
4079 STATUS(floatx80_rounding_precision
), zSign
, zExp
, zSig0
, zSig1 STATUS_VAR
);
4083 /*----------------------------------------------------------------------------
4084 | Returns the result of subtracting the absolute values of the extended
4085 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
4086 | difference is negated before being returned. `zSign' is ignored if the
4087 | result is a NaN. The subtraction is performed according to the IEC/IEEE
4088 | Standard for Binary Floating-Point Arithmetic.
4089 *----------------------------------------------------------------------------*/
4091 static floatx80
subFloatx80Sigs( floatx80 a
, floatx80 b
, flag zSign STATUS_PARAM
)
4093 int32 aExp
, bExp
, zExp
;
4094 bits64 aSig
, bSig
, zSig0
, zSig1
;
4098 aSig
= extractFloatx80Frac( a
);
4099 aExp
= extractFloatx80Exp( a
);
4100 bSig
= extractFloatx80Frac( b
);
4101 bExp
= extractFloatx80Exp( b
);
4102 expDiff
= aExp
- bExp
;
4103 if ( 0 < expDiff
) goto aExpBigger
;
4104 if ( expDiff
< 0 ) goto bExpBigger
;
4105 if ( aExp
== 0x7FFF ) {
4106 if ( (bits64
) ( ( aSig
| bSig
)<<1 ) ) {
4107 return propagateFloatx80NaN( a
, b STATUS_VAR
);
4109 float_raise( float_flag_invalid STATUS_VAR
);
4110 z
.low
= floatx80_default_nan_low
;
4111 z
.high
= floatx80_default_nan_high
;
4119 if ( bSig
< aSig
) goto aBigger
;
4120 if ( aSig
< bSig
) goto bBigger
;
4121 return packFloatx80( STATUS(float_rounding_mode
) == float_round_down
, 0, 0 );
4123 if ( bExp
== 0x7FFF ) {
4124 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
4125 return packFloatx80( zSign
^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
4127 if ( aExp
== 0 ) ++expDiff
;
4128 shift128RightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
4130 sub128( bSig
, 0, aSig
, zSig1
, &zSig0
, &zSig1
);
4133 goto normalizeRoundAndPack
;
4135 if ( aExp
== 0x7FFF ) {
4136 if ( (bits64
) ( aSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
4139 if ( bExp
== 0 ) --expDiff
;
4140 shift128RightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
4142 sub128( aSig
, 0, bSig
, zSig1
, &zSig0
, &zSig1
);
4144 normalizeRoundAndPack
:
4146 normalizeRoundAndPackFloatx80(
4147 STATUS(floatx80_rounding_precision
), zSign
, zExp
, zSig0
, zSig1 STATUS_VAR
);
4151 /*----------------------------------------------------------------------------
4152 | Returns the result of adding the extended double-precision floating-point
4153 | values `a' and `b'. The operation is performed according to the IEC/IEEE
4154 | Standard for Binary Floating-Point Arithmetic.
4155 *----------------------------------------------------------------------------*/
4157 floatx80
floatx80_add( floatx80 a
, floatx80 b STATUS_PARAM
)
4161 aSign
= extractFloatx80Sign( a
);
4162 bSign
= extractFloatx80Sign( b
);
4163 if ( aSign
== bSign
) {
4164 return addFloatx80Sigs( a
, b
, aSign STATUS_VAR
);
4167 return subFloatx80Sigs( a
, b
, aSign STATUS_VAR
);
4172 /*----------------------------------------------------------------------------
4173 | Returns the result of subtracting the extended double-precision floating-
4174 | point values `a' and `b'. The operation is performed according to the
4175 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4176 *----------------------------------------------------------------------------*/
4178 floatx80
floatx80_sub( floatx80 a
, floatx80 b STATUS_PARAM
)
4182 aSign
= extractFloatx80Sign( a
);
4183 bSign
= extractFloatx80Sign( b
);
4184 if ( aSign
== bSign
) {
4185 return subFloatx80Sigs( a
, b
, aSign STATUS_VAR
);
4188 return addFloatx80Sigs( a
, b
, aSign STATUS_VAR
);
4193 /*----------------------------------------------------------------------------
4194 | Returns the result of multiplying the extended double-precision floating-
4195 | point values `a' and `b'. The operation is performed according to the
4196 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4197 *----------------------------------------------------------------------------*/
4199 floatx80
floatx80_mul( floatx80 a
, floatx80 b STATUS_PARAM
)
4201 flag aSign
, bSign
, zSign
;
4202 int32 aExp
, bExp
, zExp
;
4203 bits64 aSig
, bSig
, zSig0
, zSig1
;
4206 aSig
= extractFloatx80Frac( a
);
4207 aExp
= extractFloatx80Exp( a
);
4208 aSign
= extractFloatx80Sign( a
);
4209 bSig
= extractFloatx80Frac( b
);
4210 bExp
= extractFloatx80Exp( b
);
4211 bSign
= extractFloatx80Sign( b
);
4212 zSign
= aSign
^ bSign
;
4213 if ( aExp
== 0x7FFF ) {
4214 if ( (bits64
) ( aSig
<<1 )
4215 || ( ( bExp
== 0x7FFF ) && (bits64
) ( bSig
<<1 ) ) ) {
4216 return propagateFloatx80NaN( a
, b STATUS_VAR
);
4218 if ( ( bExp
| bSig
) == 0 ) goto invalid
;
4219 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
4221 if ( bExp
== 0x7FFF ) {
4222 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
4223 if ( ( aExp
| aSig
) == 0 ) {
4225 float_raise( float_flag_invalid STATUS_VAR
);
4226 z
.low
= floatx80_default_nan_low
;
4227 z
.high
= floatx80_default_nan_high
;
4230 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
4233 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
4234 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
4237 if ( bSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
4238 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
4240 zExp
= aExp
+ bExp
- 0x3FFE;
4241 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
4242 if ( 0 < (sbits64
) zSig0
) {
4243 shortShift128Left( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
4247 roundAndPackFloatx80(
4248 STATUS(floatx80_rounding_precision
), zSign
, zExp
, zSig0
, zSig1 STATUS_VAR
);
4252 /*----------------------------------------------------------------------------
4253 | Returns the result of dividing the extended double-precision floating-point
4254 | value `a' by the corresponding value `b'. The operation is performed
4255 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4256 *----------------------------------------------------------------------------*/
4258 floatx80
floatx80_div( floatx80 a
, floatx80 b STATUS_PARAM
)
4260 flag aSign
, bSign
, zSign
;
4261 int32 aExp
, bExp
, zExp
;
4262 bits64 aSig
, bSig
, zSig0
, zSig1
;
4263 bits64 rem0
, rem1
, rem2
, term0
, term1
, term2
;
4266 aSig
= extractFloatx80Frac( a
);
4267 aExp
= extractFloatx80Exp( a
);
4268 aSign
= extractFloatx80Sign( a
);
4269 bSig
= extractFloatx80Frac( b
);
4270 bExp
= extractFloatx80Exp( b
);
4271 bSign
= extractFloatx80Sign( b
);
4272 zSign
= aSign
^ bSign
;
4273 if ( aExp
== 0x7FFF ) {
4274 if ( (bits64
) ( aSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
4275 if ( bExp
== 0x7FFF ) {
4276 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
4279 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
4281 if ( bExp
== 0x7FFF ) {
4282 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
4283 return packFloatx80( zSign
, 0, 0 );
4287 if ( ( aExp
| aSig
) == 0 ) {
4289 float_raise( float_flag_invalid STATUS_VAR
);
4290 z
.low
= floatx80_default_nan_low
;
4291 z
.high
= floatx80_default_nan_high
;
4294 float_raise( float_flag_divbyzero STATUS_VAR
);
4295 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
4297 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
4300 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
4301 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
4303 zExp
= aExp
- bExp
+ 0x3FFE;
4305 if ( bSig
<= aSig
) {
4306 shift128Right( aSig
, 0, 1, &aSig
, &rem1
);
4309 zSig0
= estimateDiv128To64( aSig
, rem1
, bSig
);
4310 mul64To128( bSig
, zSig0
, &term0
, &term1
);
4311 sub128( aSig
, rem1
, term0
, term1
, &rem0
, &rem1
);
4312 while ( (sbits64
) rem0
< 0 ) {
4314 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
4316 zSig1
= estimateDiv128To64( rem1
, 0, bSig
);
4317 if ( (bits64
) ( zSig1
<<1 ) <= 8 ) {
4318 mul64To128( bSig
, zSig1
, &term1
, &term2
);
4319 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
4320 while ( (sbits64
) rem1
< 0 ) {
4322 add128( rem1
, rem2
, 0, bSig
, &rem1
, &rem2
);
4324 zSig1
|= ( ( rem1
| rem2
) != 0 );
4327 roundAndPackFloatx80(
4328 STATUS(floatx80_rounding_precision
), zSign
, zExp
, zSig0
, zSig1 STATUS_VAR
);
4332 /*----------------------------------------------------------------------------
4333 | Returns the remainder of the extended double-precision floating-point value
4334 | `a' with respect to the corresponding value `b'. The operation is performed
4335 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4336 *----------------------------------------------------------------------------*/
4338 floatx80
floatx80_rem( floatx80 a
, floatx80 b STATUS_PARAM
)
4341 int32 aExp
, bExp
, expDiff
;
4342 bits64 aSig0
, aSig1
, bSig
;
4343 bits64 q
, term0
, term1
, alternateASig0
, alternateASig1
;
4346 aSig0
= extractFloatx80Frac( a
);
4347 aExp
= extractFloatx80Exp( a
);
4348 aSign
= extractFloatx80Sign( a
);
4349 bSig
= extractFloatx80Frac( b
);
4350 bExp
= extractFloatx80Exp( b
);
4351 if ( aExp
== 0x7FFF ) {
4352 if ( (bits64
) ( aSig0
<<1 )
4353 || ( ( bExp
== 0x7FFF ) && (bits64
) ( bSig
<<1 ) ) ) {
4354 return propagateFloatx80NaN( a
, b STATUS_VAR
);
4358 if ( bExp
== 0x7FFF ) {
4359 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
4365 float_raise( float_flag_invalid STATUS_VAR
);
4366 z
.low
= floatx80_default_nan_low
;
4367 z
.high
= floatx80_default_nan_high
;
4370 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
4373 if ( (bits64
) ( aSig0
<<1 ) == 0 ) return a
;
4374 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
4376 bSig
|= LIT64( 0x8000000000000000 );
4378 expDiff
= aExp
- bExp
;
4380 if ( expDiff
< 0 ) {
4381 if ( expDiff
< -1 ) return a
;
4382 shift128Right( aSig0
, 0, 1, &aSig0
, &aSig1
);
4385 q
= ( bSig
<= aSig0
);
4386 if ( q
) aSig0
-= bSig
;
4388 while ( 0 < expDiff
) {
4389 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
4390 q
= ( 2 < q
) ? q
- 2 : 0;
4391 mul64To128( bSig
, q
, &term0
, &term1
);
4392 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
4393 shortShift128Left( aSig0
, aSig1
, 62, &aSig0
, &aSig1
);
4397 if ( 0 < expDiff
) {
4398 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
4399 q
= ( 2 < q
) ? q
- 2 : 0;
4401 mul64To128( bSig
, q
<<( 64 - expDiff
), &term0
, &term1
);
4402 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
4403 shortShift128Left( 0, bSig
, 64 - expDiff
, &term0
, &term1
);
4404 while ( le128( term0
, term1
, aSig0
, aSig1
) ) {
4406 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
4413 sub128( term0
, term1
, aSig0
, aSig1
, &alternateASig0
, &alternateASig1
);
4414 if ( lt128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
4415 || ( eq128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
4418 aSig0
= alternateASig0
;
4419 aSig1
= alternateASig1
;
4423 normalizeRoundAndPackFloatx80(
4424 80, zSign
, bExp
+ expDiff
, aSig0
, aSig1 STATUS_VAR
);
4428 /*----------------------------------------------------------------------------
4429 | Returns the square root of the extended double-precision floating-point
4430 | value `a'. The operation is performed according to the IEC/IEEE Standard
4431 | for Binary Floating-Point Arithmetic.
4432 *----------------------------------------------------------------------------*/
4434 floatx80
floatx80_sqrt( floatx80 a STATUS_PARAM
)
4438 bits64 aSig0
, aSig1
, zSig0
, zSig1
, doubleZSig0
;
4439 bits64 rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
4442 aSig0
= extractFloatx80Frac( a
);
4443 aExp
= extractFloatx80Exp( a
);
4444 aSign
= extractFloatx80Sign( a
);
4445 if ( aExp
== 0x7FFF ) {
4446 if ( (bits64
) ( aSig0
<<1 ) ) return propagateFloatx80NaN( a
, a STATUS_VAR
);
4447 if ( ! aSign
) return a
;
4451 if ( ( aExp
| aSig0
) == 0 ) return a
;
4453 float_raise( float_flag_invalid STATUS_VAR
);
4454 z
.low
= floatx80_default_nan_low
;
4455 z
.high
= floatx80_default_nan_high
;
4459 if ( aSig0
== 0 ) return packFloatx80( 0, 0, 0 );
4460 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
4462 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFF;
4463 zSig0
= estimateSqrt32( aExp
, aSig0
>>32 );
4464 shift128Right( aSig0
, 0, 2 + ( aExp
& 1 ), &aSig0
, &aSig1
);
4465 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
4466 doubleZSig0
= zSig0
<<1;
4467 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
4468 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
4469 while ( (sbits64
) rem0
< 0 ) {
4472 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
4474 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
4475 if ( ( zSig1
& LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4476 if ( zSig1
== 0 ) zSig1
= 1;
4477 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
4478 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
4479 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
4480 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
4481 while ( (sbits64
) rem1
< 0 ) {
4483 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
4485 term2
|= doubleZSig0
;
4486 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
4488 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
4490 shortShift128Left( 0, zSig1
, 1, &zSig0
, &zSig1
);
4491 zSig0
|= doubleZSig0
;
4493 roundAndPackFloatx80(
4494 STATUS(floatx80_rounding_precision
), 0, zExp
, zSig0
, zSig1 STATUS_VAR
);
4498 /*----------------------------------------------------------------------------
4499 | Returns 1 if the extended double-precision floating-point value `a' is
4500 | equal to the corresponding value `b', and 0 otherwise. The comparison is
4501 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4503 *----------------------------------------------------------------------------*/
4505 int floatx80_eq( floatx80 a
, floatx80 b STATUS_PARAM
)
4508 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4509 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
4510 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4511 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
4513 if ( floatx80_is_signaling_nan( a
)
4514 || floatx80_is_signaling_nan( b
) ) {
4515 float_raise( float_flag_invalid STATUS_VAR
);
4521 && ( ( a
.high
== b
.high
)
4523 && ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
4528 /*----------------------------------------------------------------------------
4529 | Returns 1 if the extended double-precision floating-point value `a' is
4530 | less than or equal to the corresponding value `b', and 0 otherwise. The
4531 | comparison is performed according to the IEC/IEEE Standard for Binary
4532 | Floating-Point Arithmetic.
4533 *----------------------------------------------------------------------------*/
4535 int floatx80_le( floatx80 a
, floatx80 b STATUS_PARAM
)
4539 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4540 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
4541 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4542 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
4544 float_raise( float_flag_invalid STATUS_VAR
);
4547 aSign
= extractFloatx80Sign( a
);
4548 bSign
= extractFloatx80Sign( b
);
4549 if ( aSign
!= bSign
) {
4552 || ( ( ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
4556 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
4557 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
4561 /*----------------------------------------------------------------------------
4562 | Returns 1 if the extended double-precision floating-point value `a' is
4563 | less than the corresponding value `b', and 0 otherwise. The comparison
4564 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4566 *----------------------------------------------------------------------------*/
4568 int floatx80_lt( floatx80 a
, floatx80 b STATUS_PARAM
)
4572 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4573 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
4574 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4575 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
4577 float_raise( float_flag_invalid STATUS_VAR
);
4580 aSign
= extractFloatx80Sign( a
);
4581 bSign
= extractFloatx80Sign( b
);
4582 if ( aSign
!= bSign
) {
4585 && ( ( ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
4589 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
4590 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
4594 /*----------------------------------------------------------------------------
4595 | Returns 1 if the extended double-precision floating-point value `a' is equal
4596 | to the corresponding value `b', and 0 otherwise. The invalid exception is
4597 | raised if either operand is a NaN. Otherwise, the comparison is performed
4598 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4599 *----------------------------------------------------------------------------*/
4601 int floatx80_eq_signaling( floatx80 a
, floatx80 b STATUS_PARAM
)
4604 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4605 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
4606 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4607 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
4609 float_raise( float_flag_invalid STATUS_VAR
);
4614 && ( ( a
.high
== b
.high
)
4616 && ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
4621 /*----------------------------------------------------------------------------
4622 | Returns 1 if the extended double-precision floating-point value `a' is less
4623 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
4624 | do not cause an exception. Otherwise, the comparison is performed according
4625 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4626 *----------------------------------------------------------------------------*/
4628 int floatx80_le_quiet( floatx80 a
, floatx80 b STATUS_PARAM
)
4632 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4633 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
4634 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4635 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
4637 if ( floatx80_is_signaling_nan( a
)
4638 || floatx80_is_signaling_nan( b
) ) {
4639 float_raise( float_flag_invalid STATUS_VAR
);
4643 aSign
= extractFloatx80Sign( a
);
4644 bSign
= extractFloatx80Sign( b
);
4645 if ( aSign
!= bSign
) {
4648 || ( ( ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
4652 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
4653 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
4657 /*----------------------------------------------------------------------------
4658 | Returns 1 if the extended double-precision floating-point value `a' is less
4659 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
4660 | an exception. Otherwise, the comparison is performed according to the
4661 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4662 *----------------------------------------------------------------------------*/
4664 int floatx80_lt_quiet( floatx80 a
, floatx80 b STATUS_PARAM
)
4668 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4669 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
4670 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4671 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
4673 if ( floatx80_is_signaling_nan( a
)
4674 || floatx80_is_signaling_nan( b
) ) {
4675 float_raise( float_flag_invalid STATUS_VAR
);
4679 aSign
= extractFloatx80Sign( a
);
4680 bSign
= extractFloatx80Sign( b
);
4681 if ( aSign
!= bSign
) {
4684 && ( ( ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
4688 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
4689 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
4697 /*----------------------------------------------------------------------------
4698 | Returns the result of converting the quadruple-precision floating-point
4699 | value `a' to the 32-bit two's complement integer format. The conversion
4700 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4701 | Arithmetic---which means in particular that the conversion is rounded
4702 | according to the current rounding mode. If `a' is a NaN, the largest
4703 | positive integer is returned. Otherwise, if the conversion overflows, the
4704 | largest integer with the same sign as `a' is returned.
4705 *----------------------------------------------------------------------------*/
4707 int32
float128_to_int32( float128 a STATUS_PARAM
)
4710 int32 aExp
, shiftCount
;
4711 bits64 aSig0
, aSig1
;
4713 aSig1
= extractFloat128Frac1( a
);
4714 aSig0
= extractFloat128Frac0( a
);
4715 aExp
= extractFloat128Exp( a
);
4716 aSign
= extractFloat128Sign( a
);
4717 if ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) aSign
= 0;
4718 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
4719 aSig0
|= ( aSig1
!= 0 );
4720 shiftCount
= 0x4028 - aExp
;
4721 if ( 0 < shiftCount
) shift64RightJamming( aSig0
, shiftCount
, &aSig0
);
4722 return roundAndPackInt32( aSign
, aSig0 STATUS_VAR
);
4726 /*----------------------------------------------------------------------------
4727 | Returns the result of converting the quadruple-precision floating-point
4728 | value `a' to the 32-bit two's complement integer format. The conversion
4729 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4730 | Arithmetic, except that the conversion is always rounded toward zero. If
4731 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
4732 | conversion overflows, the largest integer with the same sign as `a' is
4734 *----------------------------------------------------------------------------*/
4736 int32
float128_to_int32_round_to_zero( float128 a STATUS_PARAM
)
4739 int32 aExp
, shiftCount
;
4740 bits64 aSig0
, aSig1
, savedASig
;
4743 aSig1
= extractFloat128Frac1( a
);
4744 aSig0
= extractFloat128Frac0( a
);
4745 aExp
= extractFloat128Exp( a
);
4746 aSign
= extractFloat128Sign( a
);
4747 aSig0
|= ( aSig1
!= 0 );
4748 if ( 0x401E < aExp
) {
4749 if ( ( aExp
== 0x7FFF ) && aSig0
) aSign
= 0;
4752 else if ( aExp
< 0x3FFF ) {
4753 if ( aExp
|| aSig0
) STATUS(float_exception_flags
) |= float_flag_inexact
;
4756 aSig0
|= LIT64( 0x0001000000000000 );
4757 shiftCount
= 0x402F - aExp
;
4759 aSig0
>>= shiftCount
;
4761 if ( aSign
) z
= - z
;
4762 if ( ( z
< 0 ) ^ aSign
) {
4764 float_raise( float_flag_invalid STATUS_VAR
);
4765 return aSign
? (sbits32
) 0x80000000 : 0x7FFFFFFF;
4767 if ( ( aSig0
<<shiftCount
) != savedASig
) {
4768 STATUS(float_exception_flags
) |= float_flag_inexact
;
4774 /*----------------------------------------------------------------------------
4775 | Returns the result of converting the quadruple-precision floating-point
4776 | value `a' to the 64-bit two's complement integer format. The conversion
4777 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4778 | Arithmetic---which means in particular that the conversion is rounded
4779 | according to the current rounding mode. If `a' is a NaN, the largest
4780 | positive integer is returned. Otherwise, if the conversion overflows, the
4781 | largest integer with the same sign as `a' is returned.
4782 *----------------------------------------------------------------------------*/
4784 int64
float128_to_int64( float128 a STATUS_PARAM
)
4787 int32 aExp
, shiftCount
;
4788 bits64 aSig0
, aSig1
;
4790 aSig1
= extractFloat128Frac1( a
);
4791 aSig0
= extractFloat128Frac0( a
);
4792 aExp
= extractFloat128Exp( a
);
4793 aSign
= extractFloat128Sign( a
);
4794 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
4795 shiftCount
= 0x402F - aExp
;
4796 if ( shiftCount
<= 0 ) {
4797 if ( 0x403E < aExp
) {
4798 float_raise( float_flag_invalid STATUS_VAR
);
4800 || ( ( aExp
== 0x7FFF )
4801 && ( aSig1
|| ( aSig0
!= LIT64( 0x0001000000000000 ) ) )
4804 return LIT64( 0x7FFFFFFFFFFFFFFF );
4806 return (sbits64
) LIT64( 0x8000000000000000 );
4808 shortShift128Left( aSig0
, aSig1
, - shiftCount
, &aSig0
, &aSig1
);
4811 shift64ExtraRightJamming( aSig0
, aSig1
, shiftCount
, &aSig0
, &aSig1
);
4813 return roundAndPackInt64( aSign
, aSig0
, aSig1 STATUS_VAR
);
4817 /*----------------------------------------------------------------------------
4818 | Returns the result of converting the quadruple-precision floating-point
4819 | value `a' to the 64-bit two's complement integer format. The conversion
4820 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4821 | Arithmetic, except that the conversion is always rounded toward zero.
4822 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
4823 | the conversion overflows, the largest integer with the same sign as `a' is
4825 *----------------------------------------------------------------------------*/
4827 int64
float128_to_int64_round_to_zero( float128 a STATUS_PARAM
)
4830 int32 aExp
, shiftCount
;
4831 bits64 aSig0
, aSig1
;
4834 aSig1
= extractFloat128Frac1( a
);
4835 aSig0
= extractFloat128Frac0( a
);
4836 aExp
= extractFloat128Exp( a
);
4837 aSign
= extractFloat128Sign( a
);
4838 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
4839 shiftCount
= aExp
- 0x402F;
4840 if ( 0 < shiftCount
) {
4841 if ( 0x403E <= aExp
) {
4842 aSig0
&= LIT64( 0x0000FFFFFFFFFFFF );
4843 if ( ( a
.high
== LIT64( 0xC03E000000000000 ) )
4844 && ( aSig1
< LIT64( 0x0002000000000000 ) ) ) {
4845 if ( aSig1
) STATUS(float_exception_flags
) |= float_flag_inexact
;
4848 float_raise( float_flag_invalid STATUS_VAR
);
4849 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) ) {
4850 return LIT64( 0x7FFFFFFFFFFFFFFF );
4853 return (sbits64
) LIT64( 0x8000000000000000 );
4855 z
= ( aSig0
<<shiftCount
) | ( aSig1
>>( ( - shiftCount
) & 63 ) );
4856 if ( (bits64
) ( aSig1
<<shiftCount
) ) {
4857 STATUS(float_exception_flags
) |= float_flag_inexact
;
4861 if ( aExp
< 0x3FFF ) {
4862 if ( aExp
| aSig0
| aSig1
) {
4863 STATUS(float_exception_flags
) |= float_flag_inexact
;
4867 z
= aSig0
>>( - shiftCount
);
4869 || ( shiftCount
&& (bits64
) ( aSig0
<<( shiftCount
& 63 ) ) ) ) {
4870 STATUS(float_exception_flags
) |= float_flag_inexact
;
4873 if ( aSign
) z
= - z
;
4878 /*----------------------------------------------------------------------------
4879 | Returns the result of converting the quadruple-precision floating-point
4880 | value `a' to the single-precision floating-point format. The conversion
4881 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4883 *----------------------------------------------------------------------------*/
4885 float32
float128_to_float32( float128 a STATUS_PARAM
)
4889 bits64 aSig0
, aSig1
;
4892 aSig1
= extractFloat128Frac1( a
);
4893 aSig0
= extractFloat128Frac0( a
);
4894 aExp
= extractFloat128Exp( a
);
4895 aSign
= extractFloat128Sign( a
);
4896 if ( aExp
== 0x7FFF ) {
4897 if ( aSig0
| aSig1
) {
4898 return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
4900 return packFloat32( aSign
, 0xFF, 0 );
4902 aSig0
|= ( aSig1
!= 0 );
4903 shift64RightJamming( aSig0
, 18, &aSig0
);
4905 if ( aExp
|| zSig
) {
4909 return roundAndPackFloat32( aSign
, aExp
, zSig STATUS_VAR
);
4913 /*----------------------------------------------------------------------------
4914 | Returns the result of converting the quadruple-precision floating-point
4915 | value `a' to the double-precision floating-point format. The conversion
4916 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4918 *----------------------------------------------------------------------------*/
4920 float64
float128_to_float64( float128 a STATUS_PARAM
)
4924 bits64 aSig0
, aSig1
;
4926 aSig1
= extractFloat128Frac1( a
);
4927 aSig0
= extractFloat128Frac0( a
);
4928 aExp
= extractFloat128Exp( a
);
4929 aSign
= extractFloat128Sign( a
);
4930 if ( aExp
== 0x7FFF ) {
4931 if ( aSig0
| aSig1
) {
4932 return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
4934 return packFloat64( aSign
, 0x7FF, 0 );
4936 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
4937 aSig0
|= ( aSig1
!= 0 );
4938 if ( aExp
|| aSig0
) {
4939 aSig0
|= LIT64( 0x4000000000000000 );
4942 return roundAndPackFloat64( aSign
, aExp
, aSig0 STATUS_VAR
);
4948 /*----------------------------------------------------------------------------
4949 | Returns the result of converting the quadruple-precision floating-point
4950 | value `a' to the extended double-precision floating-point format. The
4951 | conversion is performed according to the IEC/IEEE Standard for Binary
4952 | Floating-Point Arithmetic.
4953 *----------------------------------------------------------------------------*/
4955 floatx80
float128_to_floatx80( float128 a STATUS_PARAM
)
4959 bits64 aSig0
, aSig1
;
4961 aSig1
= extractFloat128Frac1( a
);
4962 aSig0
= extractFloat128Frac0( a
);
4963 aExp
= extractFloat128Exp( a
);
4964 aSign
= extractFloat128Sign( a
);
4965 if ( aExp
== 0x7FFF ) {
4966 if ( aSig0
| aSig1
) {
4967 return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
4969 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
4972 if ( ( aSig0
| aSig1
) == 0 ) return packFloatx80( aSign
, 0, 0 );
4973 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
4976 aSig0
|= LIT64( 0x0001000000000000 );
4978 shortShift128Left( aSig0
, aSig1
, 15, &aSig0
, &aSig1
);
4979 return roundAndPackFloatx80( 80, aSign
, aExp
, aSig0
, aSig1 STATUS_VAR
);
4985 /*----------------------------------------------------------------------------
4986 | Rounds the quadruple-precision floating-point value `a' to an integer, and
4987 | returns the result as a quadruple-precision floating-point value. The
4988 | operation is performed according to the IEC/IEEE Standard for Binary
4989 | Floating-Point Arithmetic.
4990 *----------------------------------------------------------------------------*/
4992 float128
float128_round_to_int( float128 a STATUS_PARAM
)
4996 bits64 lastBitMask
, roundBitsMask
;
5000 aExp
= extractFloat128Exp( a
);
5001 if ( 0x402F <= aExp
) {
5002 if ( 0x406F <= aExp
) {
5003 if ( ( aExp
== 0x7FFF )
5004 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) )
5006 return propagateFloat128NaN( a
, a STATUS_VAR
);
5011 lastBitMask
= ( lastBitMask
<<( 0x406E - aExp
) )<<1;
5012 roundBitsMask
= lastBitMask
- 1;
5014 roundingMode
= STATUS(float_rounding_mode
);
5015 if ( roundingMode
== float_round_nearest_even
) {
5016 if ( lastBitMask
) {
5017 add128( z
.high
, z
.low
, 0, lastBitMask
>>1, &z
.high
, &z
.low
);
5018 if ( ( z
.low
& roundBitsMask
) == 0 ) z
.low
&= ~ lastBitMask
;
5021 if ( (sbits64
) z
.low
< 0 ) {
5023 if ( (bits64
) ( z
.low
<<1 ) == 0 ) z
.high
&= ~1;
5027 else if ( roundingMode
!= float_round_to_zero
) {
5028 if ( extractFloat128Sign( z
)
5029 ^ ( roundingMode
== float_round_up
) ) {
5030 add128( z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
5033 z
.low
&= ~ roundBitsMask
;
5036 if ( aExp
< 0x3FFF ) {
5037 if ( ( ( (bits64
) ( a
.high
<<1 ) ) | a
.low
) == 0 ) return a
;
5038 STATUS(float_exception_flags
) |= float_flag_inexact
;
5039 aSign
= extractFloat128Sign( a
);
5040 switch ( STATUS(float_rounding_mode
) ) {
5041 case float_round_nearest_even
:
5042 if ( ( aExp
== 0x3FFE )
5043 && ( extractFloat128Frac0( a
)
5044 | extractFloat128Frac1( a
) )
5046 return packFloat128( aSign
, 0x3FFF, 0, 0 );
5049 case float_round_down
:
5051 aSign
? packFloat128( 1, 0x3FFF, 0, 0 )
5052 : packFloat128( 0, 0, 0, 0 );
5053 case float_round_up
:
5055 aSign
? packFloat128( 1, 0, 0, 0 )
5056 : packFloat128( 0, 0x3FFF, 0, 0 );
5058 return packFloat128( aSign
, 0, 0, 0 );
5061 lastBitMask
<<= 0x402F - aExp
;
5062 roundBitsMask
= lastBitMask
- 1;
5065 roundingMode
= STATUS(float_rounding_mode
);
5066 if ( roundingMode
== float_round_nearest_even
) {
5067 z
.high
+= lastBitMask
>>1;
5068 if ( ( ( z
.high
& roundBitsMask
) | a
.low
) == 0 ) {
5069 z
.high
&= ~ lastBitMask
;
5072 else if ( roundingMode
!= float_round_to_zero
) {
5073 if ( extractFloat128Sign( z
)
5074 ^ ( roundingMode
== float_round_up
) ) {
5075 z
.high
|= ( a
.low
!= 0 );
5076 z
.high
+= roundBitsMask
;
5079 z
.high
&= ~ roundBitsMask
;
5081 if ( ( z
.low
!= a
.low
) || ( z
.high
!= a
.high
) ) {
5082 STATUS(float_exception_flags
) |= float_flag_inexact
;
5088 /*----------------------------------------------------------------------------
5089 | Returns the result of adding the absolute values of the quadruple-precision
5090 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
5091 | before being returned. `zSign' is ignored if the result is a NaN.
5092 | The addition is performed according to the IEC/IEEE Standard for Binary
5093 | Floating-Point Arithmetic.
5094 *----------------------------------------------------------------------------*/
5096 static float128
addFloat128Sigs( float128 a
, float128 b
, flag zSign STATUS_PARAM
)
5098 int32 aExp
, bExp
, zExp
;
5099 bits64 aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
5102 aSig1
= extractFloat128Frac1( a
);
5103 aSig0
= extractFloat128Frac0( a
);
5104 aExp
= extractFloat128Exp( a
);
5105 bSig1
= extractFloat128Frac1( b
);
5106 bSig0
= extractFloat128Frac0( b
);
5107 bExp
= extractFloat128Exp( b
);
5108 expDiff
= aExp
- bExp
;
5109 if ( 0 < expDiff
) {
5110 if ( aExp
== 0x7FFF ) {
5111 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
5118 bSig0
|= LIT64( 0x0001000000000000 );
5120 shift128ExtraRightJamming(
5121 bSig0
, bSig1
, 0, expDiff
, &bSig0
, &bSig1
, &zSig2
);
5124 else if ( expDiff
< 0 ) {
5125 if ( bExp
== 0x7FFF ) {
5126 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
5127 return packFloat128( zSign
, 0x7FFF, 0, 0 );
5133 aSig0
|= LIT64( 0x0001000000000000 );
5135 shift128ExtraRightJamming(
5136 aSig0
, aSig1
, 0, - expDiff
, &aSig0
, &aSig1
, &zSig2
);
5140 if ( aExp
== 0x7FFF ) {
5141 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
5142 return propagateFloat128NaN( a
, b STATUS_VAR
);
5146 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
5148 if ( STATUS(flush_to_zero
) ) return packFloat128( zSign
, 0, 0, 0 );
5149 return packFloat128( zSign
, 0, zSig0
, zSig1
);
5152 zSig0
|= LIT64( 0x0002000000000000 );
5156 aSig0
|= LIT64( 0x0001000000000000 );
5157 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
5159 if ( zSig0
< LIT64( 0x0002000000000000 ) ) goto roundAndPack
;
5162 shift128ExtraRightJamming(
5163 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
5165 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2 STATUS_VAR
);
5169 /*----------------------------------------------------------------------------
5170 | Returns the result of subtracting the absolute values of the quadruple-
5171 | precision floating-point values `a' and `b'. If `zSign' is 1, the
5172 | difference is negated before being returned. `zSign' is ignored if the
5173 | result is a NaN. The subtraction is performed according to the IEC/IEEE
5174 | Standard for Binary Floating-Point Arithmetic.
5175 *----------------------------------------------------------------------------*/
5177 static float128
subFloat128Sigs( float128 a
, float128 b
, flag zSign STATUS_PARAM
)
5179 int32 aExp
, bExp
, zExp
;
5180 bits64 aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
;
5184 aSig1
= extractFloat128Frac1( a
);
5185 aSig0
= extractFloat128Frac0( a
);
5186 aExp
= extractFloat128Exp( a
);
5187 bSig1
= extractFloat128Frac1( b
);
5188 bSig0
= extractFloat128Frac0( b
);
5189 bExp
= extractFloat128Exp( b
);
5190 expDiff
= aExp
- bExp
;
5191 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
5192 shortShift128Left( bSig0
, bSig1
, 14, &bSig0
, &bSig1
);
5193 if ( 0 < expDiff
) goto aExpBigger
;
5194 if ( expDiff
< 0 ) goto bExpBigger
;
5195 if ( aExp
== 0x7FFF ) {
5196 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
5197 return propagateFloat128NaN( a
, b STATUS_VAR
);
5199 float_raise( float_flag_invalid STATUS_VAR
);
5200 z
.low
= float128_default_nan_low
;
5201 z
.high
= float128_default_nan_high
;
5208 if ( bSig0
< aSig0
) goto aBigger
;
5209 if ( aSig0
< bSig0
) goto bBigger
;
5210 if ( bSig1
< aSig1
) goto aBigger
;
5211 if ( aSig1
< bSig1
) goto bBigger
;
5212 return packFloat128( STATUS(float_rounding_mode
) == float_round_down
, 0, 0, 0 );
5214 if ( bExp
== 0x7FFF ) {
5215 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
5216 return packFloat128( zSign
^ 1, 0x7FFF, 0, 0 );
5222 aSig0
|= LIT64( 0x4000000000000000 );
5224 shift128RightJamming( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
5225 bSig0
|= LIT64( 0x4000000000000000 );
5227 sub128( bSig0
, bSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
5230 goto normalizeRoundAndPack
;
5232 if ( aExp
== 0x7FFF ) {
5233 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
5240 bSig0
|= LIT64( 0x4000000000000000 );
5242 shift128RightJamming( bSig0
, bSig1
, expDiff
, &bSig0
, &bSig1
);
5243 aSig0
|= LIT64( 0x4000000000000000 );
5245 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
5247 normalizeRoundAndPack
:
5249 return normalizeRoundAndPackFloat128( zSign
, zExp
- 14, zSig0
, zSig1 STATUS_VAR
);
5253 /*----------------------------------------------------------------------------
5254 | Returns the result of adding the quadruple-precision floating-point values
5255 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
5256 | for Binary Floating-Point Arithmetic.
5257 *----------------------------------------------------------------------------*/
5259 float128
float128_add( float128 a
, float128 b STATUS_PARAM
)
5263 aSign
= extractFloat128Sign( a
);
5264 bSign
= extractFloat128Sign( b
);
5265 if ( aSign
== bSign
) {
5266 return addFloat128Sigs( a
, b
, aSign STATUS_VAR
);
5269 return subFloat128Sigs( a
, b
, aSign STATUS_VAR
);
5274 /*----------------------------------------------------------------------------
5275 | Returns the result of subtracting the quadruple-precision floating-point
5276 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5277 | Standard for Binary Floating-Point Arithmetic.
5278 *----------------------------------------------------------------------------*/
5280 float128
float128_sub( float128 a
, float128 b STATUS_PARAM
)
5284 aSign
= extractFloat128Sign( a
);
5285 bSign
= extractFloat128Sign( b
);
5286 if ( aSign
== bSign
) {
5287 return subFloat128Sigs( a
, b
, aSign STATUS_VAR
);
5290 return addFloat128Sigs( a
, b
, aSign STATUS_VAR
);
5295 /*----------------------------------------------------------------------------
5296 | Returns the result of multiplying the quadruple-precision floating-point
5297 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5298 | Standard for Binary Floating-Point Arithmetic.
5299 *----------------------------------------------------------------------------*/
5301 float128
float128_mul( float128 a
, float128 b STATUS_PARAM
)
5303 flag aSign
, bSign
, zSign
;
5304 int32 aExp
, bExp
, zExp
;
5305 bits64 aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
, zSig3
;
5308 aSig1
= extractFloat128Frac1( a
);
5309 aSig0
= extractFloat128Frac0( a
);
5310 aExp
= extractFloat128Exp( a
);
5311 aSign
= extractFloat128Sign( a
);
5312 bSig1
= extractFloat128Frac1( b
);
5313 bSig0
= extractFloat128Frac0( b
);
5314 bExp
= extractFloat128Exp( b
);
5315 bSign
= extractFloat128Sign( b
);
5316 zSign
= aSign
^ bSign
;
5317 if ( aExp
== 0x7FFF ) {
5318 if ( ( aSig0
| aSig1
)
5319 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
5320 return propagateFloat128NaN( a
, b STATUS_VAR
);
5322 if ( ( bExp
| bSig0
| bSig1
) == 0 ) goto invalid
;
5323 return packFloat128( zSign
, 0x7FFF, 0, 0 );
5325 if ( bExp
== 0x7FFF ) {
5326 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
5327 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
5329 float_raise( float_flag_invalid STATUS_VAR
);
5330 z
.low
= float128_default_nan_low
;
5331 z
.high
= float128_default_nan_high
;
5334 return packFloat128( zSign
, 0x7FFF, 0, 0 );
5337 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
5338 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
5341 if ( ( bSig0
| bSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
5342 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
5344 zExp
= aExp
+ bExp
- 0x4000;
5345 aSig0
|= LIT64( 0x0001000000000000 );
5346 shortShift128Left( bSig0
, bSig1
, 16, &bSig0
, &bSig1
);
5347 mul128To256( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
, &zSig2
, &zSig3
);
5348 add128( zSig0
, zSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
5349 zSig2
|= ( zSig3
!= 0 );
5350 if ( LIT64( 0x0002000000000000 ) <= zSig0
) {
5351 shift128ExtraRightJamming(
5352 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
5355 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2 STATUS_VAR
);
5359 /*----------------------------------------------------------------------------
5360 | Returns the result of dividing the quadruple-precision floating-point value
5361 | `a' by the corresponding value `b'. The operation is performed according to
5362 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5363 *----------------------------------------------------------------------------*/
5365 float128
float128_div( float128 a
, float128 b STATUS_PARAM
)
5367 flag aSign
, bSign
, zSign
;
5368 int32 aExp
, bExp
, zExp
;
5369 bits64 aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
5370 bits64 rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
5373 aSig1
= extractFloat128Frac1( a
);
5374 aSig0
= extractFloat128Frac0( a
);
5375 aExp
= extractFloat128Exp( a
);
5376 aSign
= extractFloat128Sign( a
);
5377 bSig1
= extractFloat128Frac1( b
);
5378 bSig0
= extractFloat128Frac0( b
);
5379 bExp
= extractFloat128Exp( b
);
5380 bSign
= extractFloat128Sign( b
);
5381 zSign
= aSign
^ bSign
;
5382 if ( aExp
== 0x7FFF ) {
5383 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
5384 if ( bExp
== 0x7FFF ) {
5385 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
5388 return packFloat128( zSign
, 0x7FFF, 0, 0 );
5390 if ( bExp
== 0x7FFF ) {
5391 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
5392 return packFloat128( zSign
, 0, 0, 0 );
5395 if ( ( bSig0
| bSig1
) == 0 ) {
5396 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
5398 float_raise( float_flag_invalid STATUS_VAR
);
5399 z
.low
= float128_default_nan_low
;
5400 z
.high
= float128_default_nan_high
;
5403 float_raise( float_flag_divbyzero STATUS_VAR
);
5404 return packFloat128( zSign
, 0x7FFF, 0, 0 );
5406 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
5409 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
5410 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
5412 zExp
= aExp
- bExp
+ 0x3FFD;
5414 aSig0
| LIT64( 0x0001000000000000 ), aSig1
, 15, &aSig0
, &aSig1
);
5416 bSig0
| LIT64( 0x0001000000000000 ), bSig1
, 15, &bSig0
, &bSig1
);
5417 if ( le128( bSig0
, bSig1
, aSig0
, aSig1
) ) {
5418 shift128Right( aSig0
, aSig1
, 1, &aSig0
, &aSig1
);
5421 zSig0
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
5422 mul128By64To192( bSig0
, bSig1
, zSig0
, &term0
, &term1
, &term2
);
5423 sub192( aSig0
, aSig1
, 0, term0
, term1
, term2
, &rem0
, &rem1
, &rem2
);
5424 while ( (sbits64
) rem0
< 0 ) {
5426 add192( rem0
, rem1
, rem2
, 0, bSig0
, bSig1
, &rem0
, &rem1
, &rem2
);
5428 zSig1
= estimateDiv128To64( rem1
, rem2
, bSig0
);
5429 if ( ( zSig1
& 0x3FFF ) <= 4 ) {
5430 mul128By64To192( bSig0
, bSig1
, zSig1
, &term1
, &term2
, &term3
);
5431 sub192( rem1
, rem2
, 0, term1
, term2
, term3
, &rem1
, &rem2
, &rem3
);
5432 while ( (sbits64
) rem1
< 0 ) {
5434 add192( rem1
, rem2
, rem3
, 0, bSig0
, bSig1
, &rem1
, &rem2
, &rem3
);
5436 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
5438 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 15, &zSig0
, &zSig1
, &zSig2
);
5439 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2 STATUS_VAR
);
5443 /*----------------------------------------------------------------------------
5444 | Returns the remainder of the quadruple-precision floating-point value `a'
5445 | with respect to the corresponding value `b'. The operation is performed
5446 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5447 *----------------------------------------------------------------------------*/
5449 float128
float128_rem( float128 a
, float128 b STATUS_PARAM
)
5452 int32 aExp
, bExp
, expDiff
;
5453 bits64 aSig0
, aSig1
, bSig0
, bSig1
, q
, term0
, term1
, term2
;
5454 bits64 allZero
, alternateASig0
, alternateASig1
, sigMean1
;
5458 aSig1
= extractFloat128Frac1( a
);
5459 aSig0
= extractFloat128Frac0( a
);
5460 aExp
= extractFloat128Exp( a
);
5461 aSign
= extractFloat128Sign( a
);
5462 bSig1
= extractFloat128Frac1( b
);
5463 bSig0
= extractFloat128Frac0( b
);
5464 bExp
= extractFloat128Exp( b
);
5465 if ( aExp
== 0x7FFF ) {
5466 if ( ( aSig0
| aSig1
)
5467 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
5468 return propagateFloat128NaN( a
, b STATUS_VAR
);
5472 if ( bExp
== 0x7FFF ) {
5473 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
5477 if ( ( bSig0
| bSig1
) == 0 ) {
5479 float_raise( float_flag_invalid STATUS_VAR
);
5480 z
.low
= float128_default_nan_low
;
5481 z
.high
= float128_default_nan_high
;
5484 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
5487 if ( ( aSig0
| aSig1
) == 0 ) return a
;
5488 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
5490 expDiff
= aExp
- bExp
;
5491 if ( expDiff
< -1 ) return a
;
5493 aSig0
| LIT64( 0x0001000000000000 ),
5495 15 - ( expDiff
< 0 ),
5500 bSig0
| LIT64( 0x0001000000000000 ), bSig1
, 15, &bSig0
, &bSig1
);
5501 q
= le128( bSig0
, bSig1
, aSig0
, aSig1
);
5502 if ( q
) sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
5504 while ( 0 < expDiff
) {
5505 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
5506 q
= ( 4 < q
) ? q
- 4 : 0;
5507 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
5508 shortShift192Left( term0
, term1
, term2
, 61, &term1
, &term2
, &allZero
);
5509 shortShift128Left( aSig0
, aSig1
, 61, &aSig0
, &allZero
);
5510 sub128( aSig0
, 0, term1
, term2
, &aSig0
, &aSig1
);
5513 if ( -64 < expDiff
) {
5514 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
5515 q
= ( 4 < q
) ? q
- 4 : 0;
5517 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
5519 if ( expDiff
< 0 ) {
5520 shift128Right( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
5523 shortShift128Left( aSig0
, aSig1
, expDiff
, &aSig0
, &aSig1
);
5525 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
5526 sub128( aSig0
, aSig1
, term1
, term2
, &aSig0
, &aSig1
);
5529 shift128Right( aSig0
, aSig1
, 12, &aSig0
, &aSig1
);
5530 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
5533 alternateASig0
= aSig0
;
5534 alternateASig1
= aSig1
;
5536 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
5537 } while ( 0 <= (sbits64
) aSig0
);
5539 aSig0
, aSig1
, alternateASig0
, alternateASig1
, (bits64
*)&sigMean0
, &sigMean1
);
5540 if ( ( sigMean0
< 0 )
5541 || ( ( ( sigMean0
| sigMean1
) == 0 ) && ( q
& 1 ) ) ) {
5542 aSig0
= alternateASig0
;
5543 aSig1
= alternateASig1
;
5545 zSign
= ( (sbits64
) aSig0
< 0 );
5546 if ( zSign
) sub128( 0, 0, aSig0
, aSig1
, &aSig0
, &aSig1
);
5548 normalizeRoundAndPackFloat128( aSign
^ zSign
, bExp
- 4, aSig0
, aSig1 STATUS_VAR
);
5552 /*----------------------------------------------------------------------------
5553 | Returns the square root of the quadruple-precision floating-point value `a'.
5554 | The operation is performed according to the IEC/IEEE Standard for Binary
5555 | Floating-Point Arithmetic.
5556 *----------------------------------------------------------------------------*/
5558 float128
float128_sqrt( float128 a STATUS_PARAM
)
5562 bits64 aSig0
, aSig1
, zSig0
, zSig1
, zSig2
, doubleZSig0
;
5563 bits64 rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
5566 aSig1
= extractFloat128Frac1( a
);
5567 aSig0
= extractFloat128Frac0( a
);
5568 aExp
= extractFloat128Exp( a
);
5569 aSign
= extractFloat128Sign( a
);
5570 if ( aExp
== 0x7FFF ) {
5571 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, a STATUS_VAR
);
5572 if ( ! aSign
) return a
;
5576 if ( ( aExp
| aSig0
| aSig1
) == 0 ) return a
;
5578 float_raise( float_flag_invalid STATUS_VAR
);
5579 z
.low
= float128_default_nan_low
;
5580 z
.high
= float128_default_nan_high
;
5584 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( 0, 0, 0, 0 );
5585 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
5587 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFE;
5588 aSig0
|= LIT64( 0x0001000000000000 );
5589 zSig0
= estimateSqrt32( aExp
, aSig0
>>17 );
5590 shortShift128Left( aSig0
, aSig1
, 13 - ( aExp
& 1 ), &aSig0
, &aSig1
);
5591 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
5592 doubleZSig0
= zSig0
<<1;
5593 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
5594 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
5595 while ( (sbits64
) rem0
< 0 ) {
5598 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
5600 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
5601 if ( ( zSig1
& 0x1FFF ) <= 5 ) {
5602 if ( zSig1
== 0 ) zSig1
= 1;
5603 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
5604 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
5605 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
5606 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
5607 while ( (sbits64
) rem1
< 0 ) {
5609 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
5611 term2
|= doubleZSig0
;
5612 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
5614 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
5616 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 14, &zSig0
, &zSig1
, &zSig2
);
5617 return roundAndPackFloat128( 0, zExp
, zSig0
, zSig1
, zSig2 STATUS_VAR
);
5621 /*----------------------------------------------------------------------------
5622 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5623 | the corresponding value `b', and 0 otherwise. The comparison is performed
5624 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5625 *----------------------------------------------------------------------------*/
5627 int float128_eq( float128 a
, float128 b STATUS_PARAM
)
5630 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5631 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5632 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5633 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5635 if ( float128_is_signaling_nan( a
)
5636 || float128_is_signaling_nan( b
) ) {
5637 float_raise( float_flag_invalid STATUS_VAR
);
5643 && ( ( a
.high
== b
.high
)
5645 && ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
5650 /*----------------------------------------------------------------------------
5651 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5652 | or equal to the corresponding value `b', and 0 otherwise. The comparison
5653 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5655 *----------------------------------------------------------------------------*/
5657 int float128_le( float128 a
, float128 b STATUS_PARAM
)
5661 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5662 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5663 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5664 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5666 float_raise( float_flag_invalid STATUS_VAR
);
5669 aSign
= extractFloat128Sign( a
);
5670 bSign
= extractFloat128Sign( b
);
5671 if ( aSign
!= bSign
) {
5674 || ( ( ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5678 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
5679 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
5683 /*----------------------------------------------------------------------------
5684 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5685 | the corresponding value `b', and 0 otherwise. The comparison is performed
5686 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5687 *----------------------------------------------------------------------------*/
5689 int float128_lt( float128 a
, float128 b STATUS_PARAM
)
5693 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5694 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5695 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5696 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5698 float_raise( float_flag_invalid STATUS_VAR
);
5701 aSign
= extractFloat128Sign( a
);
5702 bSign
= extractFloat128Sign( b
);
5703 if ( aSign
!= bSign
) {
5706 && ( ( ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5710 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
5711 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
5715 /*----------------------------------------------------------------------------
5716 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5717 | the corresponding value `b', and 0 otherwise. The invalid exception is
5718 | raised if either operand is a NaN. Otherwise, the comparison is performed
5719 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5720 *----------------------------------------------------------------------------*/
5722 int float128_eq_signaling( float128 a
, float128 b STATUS_PARAM
)
5725 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5726 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5727 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5728 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5730 float_raise( float_flag_invalid STATUS_VAR
);
5735 && ( ( a
.high
== b
.high
)
5737 && ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
5742 /*----------------------------------------------------------------------------
5743 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5744 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5745 | cause an exception. Otherwise, the comparison is performed according to the
5746 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5747 *----------------------------------------------------------------------------*/
5749 int float128_le_quiet( float128 a
, float128 b STATUS_PARAM
)
5753 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5754 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5755 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5756 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5758 if ( float128_is_signaling_nan( a
)
5759 || float128_is_signaling_nan( b
) ) {
5760 float_raise( float_flag_invalid STATUS_VAR
);
5764 aSign
= extractFloat128Sign( a
);
5765 bSign
= extractFloat128Sign( b
);
5766 if ( aSign
!= bSign
) {
5769 || ( ( ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5773 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
5774 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
5778 /*----------------------------------------------------------------------------
5779 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5780 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5781 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
5782 | Standard for Binary Floating-Point Arithmetic.
5783 *----------------------------------------------------------------------------*/
5785 int float128_lt_quiet( float128 a
, float128 b STATUS_PARAM
)
5789 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5790 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5791 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5792 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5794 if ( float128_is_signaling_nan( a
)
5795 || float128_is_signaling_nan( b
) ) {
5796 float_raise( float_flag_invalid STATUS_VAR
);
5800 aSign
= extractFloat128Sign( a
);
5801 bSign
= extractFloat128Sign( b
);
5802 if ( aSign
!= bSign
) {
5805 && ( ( ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5809 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
5810 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
5816 /* misc functions */
5817 float32
uint32_to_float32( unsigned int a STATUS_PARAM
)
5819 return int64_to_float32(a STATUS_VAR
);
5822 float64
uint32_to_float64( unsigned int a STATUS_PARAM
)
5824 return int64_to_float64(a STATUS_VAR
);
5827 unsigned int float32_to_uint32( float32 a STATUS_PARAM
)
5832 v
= float32_to_int64(a STATUS_VAR
);
5835 float_raise( float_flag_invalid STATUS_VAR
);
5836 } else if (v
> 0xffffffff) {
5838 float_raise( float_flag_invalid STATUS_VAR
);
5845 unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM
)
5850 v
= float32_to_int64_round_to_zero(a STATUS_VAR
);
5853 float_raise( float_flag_invalid STATUS_VAR
);
5854 } else if (v
> 0xffffffff) {
5856 float_raise( float_flag_invalid STATUS_VAR
);
5863 unsigned int float32_to_uint16_round_to_zero( float32 a STATUS_PARAM
)
5868 v
= float32_to_int64_round_to_zero(a STATUS_VAR
);
5871 float_raise( float_flag_invalid STATUS_VAR
);
5872 } else if (v
> 0xffff) {
5874 float_raise( float_flag_invalid STATUS_VAR
);
5881 unsigned int float64_to_uint32( float64 a STATUS_PARAM
)
5886 v
= float64_to_int64(a STATUS_VAR
);
5889 float_raise( float_flag_invalid STATUS_VAR
);
5890 } else if (v
> 0xffffffff) {
5892 float_raise( float_flag_invalid STATUS_VAR
);
5899 unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM
)
5904 v
= float64_to_int64_round_to_zero(a STATUS_VAR
);
5907 float_raise( float_flag_invalid STATUS_VAR
);
5908 } else if (v
> 0xffffffff) {
5910 float_raise( float_flag_invalid STATUS_VAR
);
5917 unsigned int float64_to_uint16_round_to_zero( float64 a STATUS_PARAM
)
5922 v
= float64_to_int64_round_to_zero(a STATUS_VAR
);
5925 float_raise( float_flag_invalid STATUS_VAR
);
5926 } else if (v
> 0xffff) {
5928 float_raise( float_flag_invalid STATUS_VAR
);
5935 /* FIXME: This looks broken. */
5936 uint64_t float64_to_uint64 (float64 a STATUS_PARAM
)
5940 v
= float64_val(int64_to_float64(INT64_MIN STATUS_VAR
));
5941 v
+= float64_val(a
);
5942 v
= float64_to_int64(make_float64(v
) STATUS_VAR
);
5944 return v
- INT64_MIN
;
5947 uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM
)
5951 v
= float64_val(int64_to_float64(INT64_MIN STATUS_VAR
));
5952 v
+= float64_val(a
);
5953 v
= float64_to_int64_round_to_zero(make_float64(v
) STATUS_VAR
);
5955 return v
- INT64_MIN
;
5958 #define COMPARE(s, nan_exp) \
5959 INLINE int float ## s ## _compare_internal( float ## s a, float ## s b, \
5960 int is_quiet STATUS_PARAM ) \
5962 flag aSign, bSign; \
5964 a = float ## s ## _squash_input_denormal(a STATUS_VAR); \
5965 b = float ## s ## _squash_input_denormal(b STATUS_VAR); \
5967 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \
5968 extractFloat ## s ## Frac( a ) ) || \
5969 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \
5970 extractFloat ## s ## Frac( b ) )) { \
5972 float ## s ## _is_signaling_nan( a ) || \
5973 float ## s ## _is_signaling_nan( b ) ) { \
5974 float_raise( float_flag_invalid STATUS_VAR); \
5976 return float_relation_unordered; \
5978 aSign = extractFloat ## s ## Sign( a ); \
5979 bSign = extractFloat ## s ## Sign( b ); \
5980 av = float ## s ## _val(a); \
5981 bv = float ## s ## _val(b); \
5982 if ( aSign != bSign ) { \
5983 if ( (bits ## s) ( ( av | bv )<<1 ) == 0 ) { \
5985 return float_relation_equal; \
5987 return 1 - (2 * aSign); \
5991 return float_relation_equal; \
5993 return 1 - 2 * (aSign ^ ( av < bv )); \
5998 int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM ) \
6000 return float ## s ## _compare_internal(a, b, 0 STATUS_VAR); \
6003 int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM ) \
6005 return float ## s ## _compare_internal(a, b, 1 STATUS_VAR); \
6011 INLINE
int float128_compare_internal( float128 a
, float128 b
,
6012 int is_quiet STATUS_PARAM
)
6016 if (( ( extractFloat128Exp( a
) == 0x7fff ) &&
6017 ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) ) ||
6018 ( ( extractFloat128Exp( b
) == 0x7fff ) &&
6019 ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )) {
6021 float128_is_signaling_nan( a
) ||
6022 float128_is_signaling_nan( b
) ) {
6023 float_raise( float_flag_invalid STATUS_VAR
);
6025 return float_relation_unordered
;
6027 aSign
= extractFloat128Sign( a
);
6028 bSign
= extractFloat128Sign( b
);
6029 if ( aSign
!= bSign
) {
6030 if ( ( ( ( a
.high
| b
.high
)<<1 ) | a
.low
| b
.low
) == 0 ) {
6032 return float_relation_equal
;
6034 return 1 - (2 * aSign
);
6037 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
6038 return float_relation_equal
;
6040 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
6045 int float128_compare( float128 a
, float128 b STATUS_PARAM
)
6047 return float128_compare_internal(a
, b
, 0 STATUS_VAR
);
6050 int float128_compare_quiet( float128 a
, float128 b STATUS_PARAM
)
6052 return float128_compare_internal(a
, b
, 1 STATUS_VAR
);
6055 /* Multiply A by 2 raised to the power N. */
6056 float32
float32_scalbn( float32 a
, int n STATUS_PARAM
)
6062 a
= float32_squash_input_denormal(a STATUS_VAR
);
6063 aSig
= extractFloat32Frac( a
);
6064 aExp
= extractFloat32Exp( a
);
6065 aSign
= extractFloat32Sign( a
);
6067 if ( aExp
== 0xFF ) {
6072 else if ( aSig
== 0 )
6077 return normalizeRoundAndPackFloat32( aSign
, aExp
, aSig STATUS_VAR
);
6080 float64
float64_scalbn( float64 a
, int n STATUS_PARAM
)
6086 a
= float64_squash_input_denormal(a STATUS_VAR
);
6087 aSig
= extractFloat64Frac( a
);
6088 aExp
= extractFloat64Exp( a
);
6089 aSign
= extractFloat64Sign( a
);
6091 if ( aExp
== 0x7FF ) {
6095 aSig
|= LIT64( 0x0010000000000000 );
6096 else if ( aSig
== 0 )
6101 return normalizeRoundAndPackFloat64( aSign
, aExp
, aSig STATUS_VAR
);
6105 floatx80
floatx80_scalbn( floatx80 a
, int n STATUS_PARAM
)
6111 aSig
= extractFloatx80Frac( a
);
6112 aExp
= extractFloatx80Exp( a
);
6113 aSign
= extractFloatx80Sign( a
);
6115 if ( aExp
== 0x7FF ) {
6118 if (aExp
== 0 && aSig
== 0)
6122 return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision
),
6123 aSign
, aExp
, aSig
, 0 STATUS_VAR
);
6128 float128
float128_scalbn( float128 a
, int n STATUS_PARAM
)
6132 bits64 aSig0
, aSig1
;
6134 aSig1
= extractFloat128Frac1( a
);
6135 aSig0
= extractFloat128Frac0( a
);
6136 aExp
= extractFloat128Exp( a
);
6137 aSign
= extractFloat128Sign( a
);
6138 if ( aExp
== 0x7FFF ) {
6142 aSig0
|= LIT64( 0x0001000000000000 );
6143 else if ( aSig0
== 0 && aSig1
== 0 )
6147 return normalizeRoundAndPackFloat128( aSign
, aExp
, aSig0
, aSig1