2 /*============================================================================
4 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
7 Written by John R. Hauser. This work was made possible in part by the
8 International Computer Science Institute, located at Suite 600, 1947 Center
9 Street, Berkeley, California 94704. Funding was partially provided by the
10 National Science Foundation under grant MIP-9311980. The original version
11 of this code was written as part of a project to build a fixed-point vector
12 processor in collaboration with the University of California at Berkeley,
13 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
14 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
15 arithmetic/SoftFloat.html'.
17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
18 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
19 RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
20 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
21 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
22 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
23 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
24 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
26 Derivative works are acceptable, even for commercial purposes, so long as
27 (1) the source code for the derivative work includes prominent notice that
28 the work is derivative, and (2) the source code includes prominent notice with
29 these four paragraphs for those parts of this code that are retained.
31 =============================================================================*/
33 /* FIXME: Flush-To-Zero only effects results. Denormal inputs should also
34 be flushed to zero. */
35 #include "softfloat.h"
37 /*----------------------------------------------------------------------------
38 | Primitive arithmetic functions, including multi-word arithmetic, and
39 | division and square root approximations. (Can be specialized to target if
41 *----------------------------------------------------------------------------*/
42 #include "softfloat-macros.h"
44 /*----------------------------------------------------------------------------
45 | Functions and definitions to determine: (1) whether tininess for underflow
46 | is detected before or after rounding by default, (2) what (if anything)
47 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
48 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
49 | are propagated from function inputs to output. These details are target-
51 *----------------------------------------------------------------------------*/
52 #include "softfloat-specialize.h"
54 void set_float_rounding_mode(int val STATUS_PARAM
)
56 STATUS(float_rounding_mode
) = val
;
59 void set_float_exception_flags(int val STATUS_PARAM
)
61 STATUS(float_exception_flags
) = val
;
65 void set_floatx80_rounding_precision(int val STATUS_PARAM
)
67 STATUS(floatx80_rounding_precision
) = val
;
71 /*----------------------------------------------------------------------------
72 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
73 | and 7, and returns the properly rounded 32-bit integer corresponding to the
74 | input. If `zSign' is 1, the input is negated before being converted to an
75 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
76 | is simply rounded to an integer, with the inexact exception raised if the
77 | input cannot be represented exactly as an integer. However, if the fixed-
78 | point input is too large, the invalid exception is raised and the largest
79 | positive or negative integer is returned.
80 *----------------------------------------------------------------------------*/
82 static int32
roundAndPackInt32( flag zSign
, bits64 absZ STATUS_PARAM
)
85 flag roundNearestEven
;
86 int8 roundIncrement
, roundBits
;
89 roundingMode
= STATUS(float_rounding_mode
);
90 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
91 roundIncrement
= 0x40;
92 if ( ! roundNearestEven
) {
93 if ( roundingMode
== float_round_to_zero
) {
97 roundIncrement
= 0x7F;
99 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
102 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
106 roundBits
= absZ
& 0x7F;
107 absZ
= ( absZ
+ roundIncrement
)>>7;
108 absZ
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
110 if ( zSign
) z
= - z
;
111 if ( ( absZ
>>32 ) || ( z
&& ( ( z
< 0 ) ^ zSign
) ) ) {
112 float_raise( float_flag_invalid STATUS_VAR
);
113 return zSign
? (sbits32
) 0x80000000 : 0x7FFFFFFF;
115 if ( roundBits
) STATUS(float_exception_flags
) |= float_flag_inexact
;
120 /*----------------------------------------------------------------------------
121 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
122 | `absZ1', with binary point between bits 63 and 64 (between the input words),
123 | and returns the properly rounded 64-bit integer corresponding to the input.
124 | If `zSign' is 1, the input is negated before being converted to an integer.
125 | Ordinarily, the fixed-point input is simply rounded to an integer, with
126 | the inexact exception raised if the input cannot be represented exactly as
127 | an integer. However, if the fixed-point input is too large, the invalid
128 | exception is raised and the largest positive or negative integer is
130 *----------------------------------------------------------------------------*/
132 static int64
roundAndPackInt64( flag zSign
, bits64 absZ0
, bits64 absZ1 STATUS_PARAM
)
135 flag roundNearestEven
, increment
;
138 roundingMode
= STATUS(float_rounding_mode
);
139 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
140 increment
= ( (sbits64
) absZ1
< 0 );
141 if ( ! roundNearestEven
) {
142 if ( roundingMode
== float_round_to_zero
) {
147 increment
= ( roundingMode
== float_round_down
) && absZ1
;
150 increment
= ( roundingMode
== float_round_up
) && absZ1
;
156 if ( absZ0
== 0 ) goto overflow
;
157 absZ0
&= ~ ( ( (bits64
) ( absZ1
<<1 ) == 0 ) & roundNearestEven
);
160 if ( zSign
) z
= - z
;
161 if ( z
&& ( ( z
< 0 ) ^ zSign
) ) {
163 float_raise( float_flag_invalid STATUS_VAR
);
165 zSign
? (sbits64
) LIT64( 0x8000000000000000 )
166 : LIT64( 0x7FFFFFFFFFFFFFFF );
168 if ( absZ1
) STATUS(float_exception_flags
) |= float_flag_inexact
;
173 /*----------------------------------------------------------------------------
174 | Returns the fraction bits of the single-precision floating-point value `a'.
175 *----------------------------------------------------------------------------*/
177 INLINE bits32
extractFloat32Frac( float32 a
)
180 return float32_val(a
) & 0x007FFFFF;
184 /*----------------------------------------------------------------------------
185 | Returns the exponent bits of the single-precision floating-point value `a'.
186 *----------------------------------------------------------------------------*/
188 INLINE int16
extractFloat32Exp( float32 a
)
191 return ( float32_val(a
)>>23 ) & 0xFF;
195 /*----------------------------------------------------------------------------
196 | Returns the sign bit of the single-precision floating-point value `a'.
197 *----------------------------------------------------------------------------*/
199 INLINE flag
extractFloat32Sign( float32 a
)
202 return float32_val(a
)>>31;
206 /*----------------------------------------------------------------------------
207 | Normalizes the subnormal single-precision floating-point value represented
208 | by the denormalized significand `aSig'. The normalized exponent and
209 | significand are stored at the locations pointed to by `zExpPtr' and
210 | `zSigPtr', respectively.
211 *----------------------------------------------------------------------------*/
214 normalizeFloat32Subnormal( bits32 aSig
, int16
*zExpPtr
, bits32
*zSigPtr
)
218 shiftCount
= countLeadingZeros32( aSig
) - 8;
219 *zSigPtr
= aSig
<<shiftCount
;
220 *zExpPtr
= 1 - shiftCount
;
224 /*----------------------------------------------------------------------------
225 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
226 | single-precision floating-point value, returning the result. After being
227 | shifted into the proper positions, the three fields are simply added
228 | together to form the result. This means that any integer portion of `zSig'
229 | will be added into the exponent. Since a properly normalized significand
230 | will have an integer portion equal to 1, the `zExp' input should be 1 less
231 | than the desired result exponent whenever `zSig' is a complete, normalized
233 *----------------------------------------------------------------------------*/
235 INLINE float32
packFloat32( flag zSign
, int16 zExp
, bits32 zSig
)
239 ( ( (bits32
) zSign
)<<31 ) + ( ( (bits32
) zExp
)<<23 ) + zSig
);
243 /*----------------------------------------------------------------------------
244 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
245 | and significand `zSig', and returns the proper single-precision floating-
246 | point value corresponding to the abstract input. Ordinarily, the abstract
247 | value is simply rounded and packed into the single-precision format, with
248 | the inexact exception raised if the abstract input cannot be represented
249 | exactly. However, if the abstract value is too large, the overflow and
250 | inexact exceptions are raised and an infinity or maximal finite value is
251 | returned. If the abstract value is too small, the input value is rounded to
252 | a subnormal number, and the underflow and inexact exceptions are raised if
253 | the abstract input cannot be represented exactly as a subnormal single-
254 | precision floating-point number.
255 | The input significand `zSig' has its binary point between bits 30
256 | and 29, which is 7 bits to the left of the usual location. This shifted
257 | significand must be normalized or smaller. If `zSig' is not normalized,
258 | `zExp' must be 0; in that case, the result returned is a subnormal number,
259 | and it must not require rounding. In the usual case that `zSig' is
260 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
261 | The handling of underflow and overflow follows the IEC/IEEE Standard for
262 | Binary Floating-Point Arithmetic.
263 *----------------------------------------------------------------------------*/
265 static float32
roundAndPackFloat32( flag zSign
, int16 zExp
, bits32 zSig STATUS_PARAM
)
268 flag roundNearestEven
;
269 int8 roundIncrement
, roundBits
;
272 roundingMode
= STATUS(float_rounding_mode
);
273 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
274 roundIncrement
= 0x40;
275 if ( ! roundNearestEven
) {
276 if ( roundingMode
== float_round_to_zero
) {
280 roundIncrement
= 0x7F;
282 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
285 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
289 roundBits
= zSig
& 0x7F;
290 if ( 0xFD <= (bits16
) zExp
) {
292 || ( ( zExp
== 0xFD )
293 && ( (sbits32
) ( zSig
+ roundIncrement
) < 0 ) )
295 float_raise( float_flag_overflow
| float_flag_inexact STATUS_VAR
);
296 return packFloat32( zSign
, 0xFF, - ( roundIncrement
== 0 ));
299 if ( STATUS(flush_to_zero
) ) return packFloat32( zSign
, 0, 0 );
301 ( STATUS(float_detect_tininess
) == float_tininess_before_rounding
)
303 || ( zSig
+ roundIncrement
< 0x80000000 );
304 shift32RightJamming( zSig
, - zExp
, &zSig
);
306 roundBits
= zSig
& 0x7F;
307 if ( isTiny
&& roundBits
) float_raise( float_flag_underflow STATUS_VAR
);
310 if ( roundBits
) STATUS(float_exception_flags
) |= float_flag_inexact
;
311 zSig
= ( zSig
+ roundIncrement
)>>7;
312 zSig
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
313 if ( zSig
== 0 ) zExp
= 0;
314 return packFloat32( zSign
, zExp
, zSig
);
318 /*----------------------------------------------------------------------------
319 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
320 | and significand `zSig', and returns the proper single-precision floating-
321 | point value corresponding to the abstract input. This routine is just like
322 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
323 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
324 | floating-point exponent.
325 *----------------------------------------------------------------------------*/
328 normalizeRoundAndPackFloat32( flag zSign
, int16 zExp
, bits32 zSig STATUS_PARAM
)
332 shiftCount
= countLeadingZeros32( zSig
) - 1;
333 return roundAndPackFloat32( zSign
, zExp
- shiftCount
, zSig
<<shiftCount STATUS_VAR
);
337 /*----------------------------------------------------------------------------
338 | Returns the fraction bits of the double-precision floating-point value `a'.
339 *----------------------------------------------------------------------------*/
341 INLINE bits64
extractFloat64Frac( float64 a
)
344 return float64_val(a
) & LIT64( 0x000FFFFFFFFFFFFF );
348 /*----------------------------------------------------------------------------
349 | Returns the exponent bits of the double-precision floating-point value `a'.
350 *----------------------------------------------------------------------------*/
352 INLINE int16
extractFloat64Exp( float64 a
)
355 return ( float64_val(a
)>>52 ) & 0x7FF;
359 /*----------------------------------------------------------------------------
360 | Returns the sign bit of the double-precision floating-point value `a'.
361 *----------------------------------------------------------------------------*/
363 INLINE flag
extractFloat64Sign( float64 a
)
366 return float64_val(a
)>>63;
370 /*----------------------------------------------------------------------------
371 | Normalizes the subnormal double-precision floating-point value represented
372 | by the denormalized significand `aSig'. The normalized exponent and
373 | significand are stored at the locations pointed to by `zExpPtr' and
374 | `zSigPtr', respectively.
375 *----------------------------------------------------------------------------*/
378 normalizeFloat64Subnormal( bits64 aSig
, int16
*zExpPtr
, bits64
*zSigPtr
)
382 shiftCount
= countLeadingZeros64( aSig
) - 11;
383 *zSigPtr
= aSig
<<shiftCount
;
384 *zExpPtr
= 1 - shiftCount
;
388 /*----------------------------------------------------------------------------
389 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
390 | double-precision floating-point value, returning the result. After being
391 | shifted into the proper positions, the three fields are simply added
392 | together to form the result. This means that any integer portion of `zSig'
393 | will be added into the exponent. Since a properly normalized significand
394 | will have an integer portion equal to 1, the `zExp' input should be 1 less
395 | than the desired result exponent whenever `zSig' is a complete, normalized
397 *----------------------------------------------------------------------------*/
399 INLINE float64
packFloat64( flag zSign
, int16 zExp
, bits64 zSig
)
403 ( ( (bits64
) zSign
)<<63 ) + ( ( (bits64
) zExp
)<<52 ) + zSig
);
407 /*----------------------------------------------------------------------------
408 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
409 | and significand `zSig', and returns the proper double-precision floating-
410 | point value corresponding to the abstract input. Ordinarily, the abstract
411 | value is simply rounded and packed into the double-precision format, with
412 | the inexact exception raised if the abstract input cannot be represented
413 | exactly. However, if the abstract value is too large, the overflow and
414 | inexact exceptions are raised and an infinity or maximal finite value is
415 | returned. If the abstract value is too small, the input value is rounded
416 | to a subnormal number, and the underflow and inexact exceptions are raised
417 | if the abstract input cannot be represented exactly as a subnormal double-
418 | precision floating-point number.
419 | The input significand `zSig' has its binary point between bits 62
420 | and 61, which is 10 bits to the left of the usual location. This shifted
421 | significand must be normalized or smaller. If `zSig' is not normalized,
422 | `zExp' must be 0; in that case, the result returned is a subnormal number,
423 | and it must not require rounding. In the usual case that `zSig' is
424 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
425 | The handling of underflow and overflow follows the IEC/IEEE Standard for
426 | Binary Floating-Point Arithmetic.
427 *----------------------------------------------------------------------------*/
429 static float64
roundAndPackFloat64( flag zSign
, int16 zExp
, bits64 zSig STATUS_PARAM
)
432 flag roundNearestEven
;
433 int16 roundIncrement
, roundBits
;
436 roundingMode
= STATUS(float_rounding_mode
);
437 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
438 roundIncrement
= 0x200;
439 if ( ! roundNearestEven
) {
440 if ( roundingMode
== float_round_to_zero
) {
444 roundIncrement
= 0x3FF;
446 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
449 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
453 roundBits
= zSig
& 0x3FF;
454 if ( 0x7FD <= (bits16
) zExp
) {
455 if ( ( 0x7FD < zExp
)
456 || ( ( zExp
== 0x7FD )
457 && ( (sbits64
) ( zSig
+ roundIncrement
) < 0 ) )
459 float_raise( float_flag_overflow
| float_flag_inexact STATUS_VAR
);
460 return packFloat64( zSign
, 0x7FF, - ( roundIncrement
== 0 ));
463 if ( STATUS(flush_to_zero
) ) return packFloat64( zSign
, 0, 0 );
465 ( STATUS(float_detect_tininess
) == float_tininess_before_rounding
)
467 || ( zSig
+ roundIncrement
< LIT64( 0x8000000000000000 ) );
468 shift64RightJamming( zSig
, - zExp
, &zSig
);
470 roundBits
= zSig
& 0x3FF;
471 if ( isTiny
&& roundBits
) float_raise( float_flag_underflow STATUS_VAR
);
474 if ( roundBits
) STATUS(float_exception_flags
) |= float_flag_inexact
;
475 zSig
= ( zSig
+ roundIncrement
)>>10;
476 zSig
&= ~ ( ( ( roundBits
^ 0x200 ) == 0 ) & roundNearestEven
);
477 if ( zSig
== 0 ) zExp
= 0;
478 return packFloat64( zSign
, zExp
, zSig
);
482 /*----------------------------------------------------------------------------
483 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
484 | and significand `zSig', and returns the proper double-precision floating-
485 | point value corresponding to the abstract input. This routine is just like
486 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
487 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
488 | floating-point exponent.
489 *----------------------------------------------------------------------------*/
492 normalizeRoundAndPackFloat64( flag zSign
, int16 zExp
, bits64 zSig STATUS_PARAM
)
496 shiftCount
= countLeadingZeros64( zSig
) - 1;
497 return roundAndPackFloat64( zSign
, zExp
- shiftCount
, zSig
<<shiftCount STATUS_VAR
);
503 /*----------------------------------------------------------------------------
504 | Returns the fraction bits of the extended double-precision floating-point
506 *----------------------------------------------------------------------------*/
508 INLINE bits64
extractFloatx80Frac( floatx80 a
)
515 /*----------------------------------------------------------------------------
516 | Returns the exponent bits of the extended double-precision floating-point
518 *----------------------------------------------------------------------------*/
520 INLINE int32
extractFloatx80Exp( floatx80 a
)
523 return a
.high
& 0x7FFF;
527 /*----------------------------------------------------------------------------
528 | Returns the sign bit of the extended double-precision floating-point value
530 *----------------------------------------------------------------------------*/
532 INLINE flag
extractFloatx80Sign( floatx80 a
)
539 /*----------------------------------------------------------------------------
540 | Normalizes the subnormal extended double-precision floating-point value
541 | represented by the denormalized significand `aSig'. The normalized exponent
542 | and significand are stored at the locations pointed to by `zExpPtr' and
543 | `zSigPtr', respectively.
544 *----------------------------------------------------------------------------*/
547 normalizeFloatx80Subnormal( bits64 aSig
, int32
*zExpPtr
, bits64
*zSigPtr
)
551 shiftCount
= countLeadingZeros64( aSig
);
552 *zSigPtr
= aSig
<<shiftCount
;
553 *zExpPtr
= 1 - shiftCount
;
557 /*----------------------------------------------------------------------------
558 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
559 | extended double-precision floating-point value, returning the result.
560 *----------------------------------------------------------------------------*/
562 INLINE floatx80
packFloatx80( flag zSign
, int32 zExp
, bits64 zSig
)
567 z
.high
= ( ( (bits16
) zSign
)<<15 ) + zExp
;
572 /*----------------------------------------------------------------------------
573 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
574 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
575 | and returns the proper extended double-precision floating-point value
576 | corresponding to the abstract input. Ordinarily, the abstract value is
577 | rounded and packed into the extended double-precision format, with the
578 | inexact exception raised if the abstract input cannot be represented
579 | exactly. However, if the abstract value is too large, the overflow and
580 | inexact exceptions are raised and an infinity or maximal finite value is
581 | returned. If the abstract value is too small, the input value is rounded to
582 | a subnormal number, and the underflow and inexact exceptions are raised if
583 | the abstract input cannot be represented exactly as a subnormal extended
584 | double-precision floating-point number.
585 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
586 | number of bits as single or double precision, respectively. Otherwise, the
587 | result is rounded to the full precision of the extended double-precision
589 | The input significand must be normalized or smaller. If the input
590 | significand is not normalized, `zExp' must be 0; in that case, the result
591 | returned is a subnormal number, and it must not require rounding. The
592 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
593 | Floating-Point Arithmetic.
594 *----------------------------------------------------------------------------*/
597 roundAndPackFloatx80(
598 int8 roundingPrecision
, flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1
602 flag roundNearestEven
, increment
, isTiny
;
603 int64 roundIncrement
, roundMask
, roundBits
;
605 roundingMode
= STATUS(float_rounding_mode
);
606 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
607 if ( roundingPrecision
== 80 ) goto precision80
;
608 if ( roundingPrecision
== 64 ) {
609 roundIncrement
= LIT64( 0x0000000000000400 );
610 roundMask
= LIT64( 0x00000000000007FF );
612 else if ( roundingPrecision
== 32 ) {
613 roundIncrement
= LIT64( 0x0000008000000000 );
614 roundMask
= LIT64( 0x000000FFFFFFFFFF );
619 zSig0
|= ( zSig1
!= 0 );
620 if ( ! roundNearestEven
) {
621 if ( roundingMode
== float_round_to_zero
) {
625 roundIncrement
= roundMask
;
627 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
630 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
634 roundBits
= zSig0
& roundMask
;
635 if ( 0x7FFD <= (bits32
) ( zExp
- 1 ) ) {
636 if ( ( 0x7FFE < zExp
)
637 || ( ( zExp
== 0x7FFE ) && ( zSig0
+ roundIncrement
< zSig0
) )
642 if ( STATUS(flush_to_zero
) ) return packFloatx80( zSign
, 0, 0 );
644 ( STATUS(float_detect_tininess
) == float_tininess_before_rounding
)
646 || ( zSig0
<= zSig0
+ roundIncrement
);
647 shift64RightJamming( zSig0
, 1 - zExp
, &zSig0
);
649 roundBits
= zSig0
& roundMask
;
650 if ( isTiny
&& roundBits
) float_raise( float_flag_underflow STATUS_VAR
);
651 if ( roundBits
) STATUS(float_exception_flags
) |= float_flag_inexact
;
652 zSig0
+= roundIncrement
;
653 if ( (sbits64
) zSig0
< 0 ) zExp
= 1;
654 roundIncrement
= roundMask
+ 1;
655 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
656 roundMask
|= roundIncrement
;
658 zSig0
&= ~ roundMask
;
659 return packFloatx80( zSign
, zExp
, zSig0
);
662 if ( roundBits
) STATUS(float_exception_flags
) |= float_flag_inexact
;
663 zSig0
+= roundIncrement
;
664 if ( zSig0
< roundIncrement
) {
666 zSig0
= LIT64( 0x8000000000000000 );
668 roundIncrement
= roundMask
+ 1;
669 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
670 roundMask
|= roundIncrement
;
672 zSig0
&= ~ roundMask
;
673 if ( zSig0
== 0 ) zExp
= 0;
674 return packFloatx80( zSign
, zExp
, zSig0
);
676 increment
= ( (sbits64
) zSig1
< 0 );
677 if ( ! roundNearestEven
) {
678 if ( roundingMode
== float_round_to_zero
) {
683 increment
= ( roundingMode
== float_round_down
) && zSig1
;
686 increment
= ( roundingMode
== float_round_up
) && zSig1
;
690 if ( 0x7FFD <= (bits32
) ( zExp
- 1 ) ) {
691 if ( ( 0x7FFE < zExp
)
692 || ( ( zExp
== 0x7FFE )
693 && ( zSig0
== LIT64( 0xFFFFFFFFFFFFFFFF ) )
699 float_raise( float_flag_overflow
| float_flag_inexact STATUS_VAR
);
700 if ( ( roundingMode
== float_round_to_zero
)
701 || ( zSign
&& ( roundingMode
== float_round_up
) )
702 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
704 return packFloatx80( zSign
, 0x7FFE, ~ roundMask
);
706 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
710 ( STATUS(float_detect_tininess
) == float_tininess_before_rounding
)
713 || ( zSig0
< LIT64( 0xFFFFFFFFFFFFFFFF ) );
714 shift64ExtraRightJamming( zSig0
, zSig1
, 1 - zExp
, &zSig0
, &zSig1
);
716 if ( isTiny
&& zSig1
) float_raise( float_flag_underflow STATUS_VAR
);
717 if ( zSig1
) STATUS(float_exception_flags
) |= float_flag_inexact
;
718 if ( roundNearestEven
) {
719 increment
= ( (sbits64
) zSig1
< 0 );
723 increment
= ( roundingMode
== float_round_down
) && zSig1
;
726 increment
= ( roundingMode
== float_round_up
) && zSig1
;
732 ~ ( ( (bits64
) ( zSig1
<<1 ) == 0 ) & roundNearestEven
);
733 if ( (sbits64
) zSig0
< 0 ) zExp
= 1;
735 return packFloatx80( zSign
, zExp
, zSig0
);
738 if ( zSig1
) STATUS(float_exception_flags
) |= float_flag_inexact
;
743 zSig0
= LIT64( 0x8000000000000000 );
746 zSig0
&= ~ ( ( (bits64
) ( zSig1
<<1 ) == 0 ) & roundNearestEven
);
750 if ( zSig0
== 0 ) zExp
= 0;
752 return packFloatx80( zSign
, zExp
, zSig0
);
756 /*----------------------------------------------------------------------------
757 | Takes an abstract floating-point value having sign `zSign', exponent
758 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
759 | and returns the proper extended double-precision floating-point value
760 | corresponding to the abstract input. This routine is just like
761 | `roundAndPackFloatx80' except that the input significand does not have to be
763 *----------------------------------------------------------------------------*/
766 normalizeRoundAndPackFloatx80(
767 int8 roundingPrecision
, flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1
777 shiftCount
= countLeadingZeros64( zSig0
);
778 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
781 roundAndPackFloatx80( roundingPrecision
, zSign
, zExp
, zSig0
, zSig1 STATUS_VAR
);
789 /*----------------------------------------------------------------------------
790 | Returns the least-significant 64 fraction bits of the quadruple-precision
791 | floating-point value `a'.
792 *----------------------------------------------------------------------------*/
794 INLINE bits64
extractFloat128Frac1( float128 a
)
801 /*----------------------------------------------------------------------------
802 | Returns the most-significant 48 fraction bits of the quadruple-precision
803 | floating-point value `a'.
804 *----------------------------------------------------------------------------*/
806 INLINE bits64
extractFloat128Frac0( float128 a
)
809 return a
.high
& LIT64( 0x0000FFFFFFFFFFFF );
813 /*----------------------------------------------------------------------------
814 | Returns the exponent bits of the quadruple-precision floating-point value
816 *----------------------------------------------------------------------------*/
818 INLINE int32
extractFloat128Exp( float128 a
)
821 return ( a
.high
>>48 ) & 0x7FFF;
825 /*----------------------------------------------------------------------------
826 | Returns the sign bit of the quadruple-precision floating-point value `a'.
827 *----------------------------------------------------------------------------*/
829 INLINE flag
extractFloat128Sign( float128 a
)
836 /*----------------------------------------------------------------------------
837 | Normalizes the subnormal quadruple-precision floating-point value
838 | represented by the denormalized significand formed by the concatenation of
839 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
840 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
841 | significand are stored at the location pointed to by `zSig0Ptr', and the
842 | least significant 64 bits of the normalized significand are stored at the
843 | location pointed to by `zSig1Ptr'.
844 *----------------------------------------------------------------------------*/
847 normalizeFloat128Subnormal(
858 shiftCount
= countLeadingZeros64( aSig1
) - 15;
859 if ( shiftCount
< 0 ) {
860 *zSig0Ptr
= aSig1
>>( - shiftCount
);
861 *zSig1Ptr
= aSig1
<<( shiftCount
& 63 );
864 *zSig0Ptr
= aSig1
<<shiftCount
;
867 *zExpPtr
= - shiftCount
- 63;
870 shiftCount
= countLeadingZeros64( aSig0
) - 15;
871 shortShift128Left( aSig0
, aSig1
, shiftCount
, zSig0Ptr
, zSig1Ptr
);
872 *zExpPtr
= 1 - shiftCount
;
877 /*----------------------------------------------------------------------------
878 | Packs the sign `zSign', the exponent `zExp', and the significand formed
879 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
880 | floating-point value, returning the result. After being shifted into the
881 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
882 | added together to form the most significant 32 bits of the result. This
883 | means that any integer portion of `zSig0' will be added into the exponent.
884 | Since a properly normalized significand will have an integer portion equal
885 | to 1, the `zExp' input should be 1 less than the desired result exponent
886 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
888 *----------------------------------------------------------------------------*/
891 packFloat128( flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1
)
896 z
.high
= ( ( (bits64
) zSign
)<<63 ) + ( ( (bits64
) zExp
)<<48 ) + zSig0
;
901 /*----------------------------------------------------------------------------
902 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
903 | and extended significand formed by the concatenation of `zSig0', `zSig1',
904 | and `zSig2', and returns the proper quadruple-precision floating-point value
905 | corresponding to the abstract input. Ordinarily, the abstract value is
906 | simply rounded and packed into the quadruple-precision format, with the
907 | inexact exception raised if the abstract input cannot be represented
908 | exactly. However, if the abstract value is too large, the overflow and
909 | inexact exceptions are raised and an infinity or maximal finite value is
910 | returned. If the abstract value is too small, the input value is rounded to
911 | a subnormal number, and the underflow and inexact exceptions are raised if
912 | the abstract input cannot be represented exactly as a subnormal quadruple-
913 | precision floating-point number.
914 | The input significand must be normalized or smaller. If the input
915 | significand is not normalized, `zExp' must be 0; in that case, the result
916 | returned is a subnormal number, and it must not require rounding. In the
917 | usual case that the input significand is normalized, `zExp' must be 1 less
918 | than the ``true'' floating-point exponent. The handling of underflow and
919 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
920 *----------------------------------------------------------------------------*/
923 roundAndPackFloat128(
924 flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1
, bits64 zSig2 STATUS_PARAM
)
927 flag roundNearestEven
, increment
, isTiny
;
929 roundingMode
= STATUS(float_rounding_mode
);
930 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
931 increment
= ( (sbits64
) zSig2
< 0 );
932 if ( ! roundNearestEven
) {
933 if ( roundingMode
== float_round_to_zero
) {
938 increment
= ( roundingMode
== float_round_down
) && zSig2
;
941 increment
= ( roundingMode
== float_round_up
) && zSig2
;
945 if ( 0x7FFD <= (bits32
) zExp
) {
946 if ( ( 0x7FFD < zExp
)
947 || ( ( zExp
== 0x7FFD )
949 LIT64( 0x0001FFFFFFFFFFFF ),
950 LIT64( 0xFFFFFFFFFFFFFFFF ),
957 float_raise( float_flag_overflow
| float_flag_inexact STATUS_VAR
);
958 if ( ( roundingMode
== float_round_to_zero
)
959 || ( zSign
&& ( roundingMode
== float_round_up
) )
960 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
966 LIT64( 0x0000FFFFFFFFFFFF ),
967 LIT64( 0xFFFFFFFFFFFFFFFF )
970 return packFloat128( zSign
, 0x7FFF, 0, 0 );
973 if ( STATUS(flush_to_zero
) ) return packFloat128( zSign
, 0, 0, 0 );
975 ( STATUS(float_detect_tininess
) == float_tininess_before_rounding
)
981 LIT64( 0x0001FFFFFFFFFFFF ),
982 LIT64( 0xFFFFFFFFFFFFFFFF )
984 shift128ExtraRightJamming(
985 zSig0
, zSig1
, zSig2
, - zExp
, &zSig0
, &zSig1
, &zSig2
);
987 if ( isTiny
&& zSig2
) float_raise( float_flag_underflow STATUS_VAR
);
988 if ( roundNearestEven
) {
989 increment
= ( (sbits64
) zSig2
< 0 );
993 increment
= ( roundingMode
== float_round_down
) && zSig2
;
996 increment
= ( roundingMode
== float_round_up
) && zSig2
;
1001 if ( zSig2
) STATUS(float_exception_flags
) |= float_flag_inexact
;
1003 add128( zSig0
, zSig1
, 0, 1, &zSig0
, &zSig1
);
1004 zSig1
&= ~ ( ( zSig2
+ zSig2
== 0 ) & roundNearestEven
);
1007 if ( ( zSig0
| zSig1
) == 0 ) zExp
= 0;
1009 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
1013 /*----------------------------------------------------------------------------
1014 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1015 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1016 | returns the proper quadruple-precision floating-point value corresponding
1017 | to the abstract input. This routine is just like `roundAndPackFloat128'
1018 | except that the input significand has fewer bits and does not have to be
1019 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1021 *----------------------------------------------------------------------------*/
1024 normalizeRoundAndPackFloat128(
1025 flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1 STATUS_PARAM
)
1035 shiftCount
= countLeadingZeros64( zSig0
) - 15;
1036 if ( 0 <= shiftCount
) {
1038 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
1041 shift128ExtraRightJamming(
1042 zSig0
, zSig1
, 0, - shiftCount
, &zSig0
, &zSig1
, &zSig2
);
1045 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2 STATUS_VAR
);
1051 /*----------------------------------------------------------------------------
1052 | Returns the result of converting the 32-bit two's complement integer `a'
1053 | to the single-precision floating-point format. The conversion is performed
1054 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1055 *----------------------------------------------------------------------------*/
1057 float32
int32_to_float32( int32 a STATUS_PARAM
)
1061 if ( a
== 0 ) return float32_zero
;
1062 if ( a
== (sbits32
) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1064 return normalizeRoundAndPackFloat32( zSign
, 0x9C, zSign
? - a
: a STATUS_VAR
);
1068 /*----------------------------------------------------------------------------
1069 | Returns the result of converting the 32-bit two's complement integer `a'
1070 | to the double-precision floating-point format. The conversion is performed
1071 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1072 *----------------------------------------------------------------------------*/
1074 float64
int32_to_float64( int32 a STATUS_PARAM
)
1081 if ( a
== 0 ) return float64_zero
;
1083 absA
= zSign
? - a
: a
;
1084 shiftCount
= countLeadingZeros32( absA
) + 21;
1086 return packFloat64( zSign
, 0x432 - shiftCount
, zSig
<<shiftCount
);
1092 /*----------------------------------------------------------------------------
1093 | Returns the result of converting the 32-bit two's complement integer `a'
1094 | to the extended double-precision floating-point format. The conversion
1095 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1097 *----------------------------------------------------------------------------*/
1099 floatx80
int32_to_floatx80( int32 a STATUS_PARAM
)
1106 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
1108 absA
= zSign
? - a
: a
;
1109 shiftCount
= countLeadingZeros32( absA
) + 32;
1111 return packFloatx80( zSign
, 0x403E - shiftCount
, zSig
<<shiftCount
);
1119 /*----------------------------------------------------------------------------
1120 | Returns the result of converting the 32-bit two's complement integer `a' to
1121 | the quadruple-precision floating-point format. The conversion is performed
1122 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1123 *----------------------------------------------------------------------------*/
1125 float128
int32_to_float128( int32 a STATUS_PARAM
)
1132 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
1134 absA
= zSign
? - a
: a
;
1135 shiftCount
= countLeadingZeros32( absA
) + 17;
1137 return packFloat128( zSign
, 0x402E - shiftCount
, zSig0
<<shiftCount
, 0 );
1143 /*----------------------------------------------------------------------------
1144 | Returns the result of converting the 64-bit two's complement integer `a'
1145 | to the single-precision floating-point format. The conversion is performed
1146 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1147 *----------------------------------------------------------------------------*/
1149 float32
int64_to_float32( int64 a STATUS_PARAM
)
1155 if ( a
== 0 ) return float32_zero
;
1157 absA
= zSign
? - a
: a
;
1158 shiftCount
= countLeadingZeros64( absA
) - 40;
1159 if ( 0 <= shiftCount
) {
1160 return packFloat32( zSign
, 0x95 - shiftCount
, absA
<<shiftCount
);
1164 if ( shiftCount
< 0 ) {
1165 shift64RightJamming( absA
, - shiftCount
, &absA
);
1168 absA
<<= shiftCount
;
1170 return roundAndPackFloat32( zSign
, 0x9C - shiftCount
, absA STATUS_VAR
);
1175 float32
uint64_to_float32( uint64 a STATUS_PARAM
)
1179 if ( a
== 0 ) return float32_zero
;
1180 shiftCount
= countLeadingZeros64( a
) - 40;
1181 if ( 0 <= shiftCount
) {
1182 return packFloat32( 1 > 0, 0x95 - shiftCount
, a
<<shiftCount
);
1186 if ( shiftCount
< 0 ) {
1187 shift64RightJamming( a
, - shiftCount
, &a
);
1192 return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount
, a STATUS_VAR
);
1196 /*----------------------------------------------------------------------------
1197 | Returns the result of converting the 64-bit two's complement integer `a'
1198 | to the double-precision floating-point format. The conversion is performed
1199 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1200 *----------------------------------------------------------------------------*/
1202 float64
int64_to_float64( int64 a STATUS_PARAM
)
1206 if ( a
== 0 ) return float64_zero
;
1207 if ( a
== (sbits64
) LIT64( 0x8000000000000000 ) ) {
1208 return packFloat64( 1, 0x43E, 0 );
1211 return normalizeRoundAndPackFloat64( zSign
, 0x43C, zSign
? - a
: a STATUS_VAR
);
1215 float64
uint64_to_float64( uint64 a STATUS_PARAM
)
1217 if ( a
== 0 ) return float64_zero
;
1218 return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR
);
1224 /*----------------------------------------------------------------------------
1225 | Returns the result of converting the 64-bit two's complement integer `a'
1226 | to the extended double-precision floating-point format. The conversion
1227 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1229 *----------------------------------------------------------------------------*/
1231 floatx80
int64_to_floatx80( int64 a STATUS_PARAM
)
1237 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
1239 absA
= zSign
? - a
: a
;
1240 shiftCount
= countLeadingZeros64( absA
);
1241 return packFloatx80( zSign
, 0x403E - shiftCount
, absA
<<shiftCount
);
1249 /*----------------------------------------------------------------------------
1250 | Returns the result of converting the 64-bit two's complement integer `a' to
1251 | the quadruple-precision floating-point format. The conversion is performed
1252 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1253 *----------------------------------------------------------------------------*/
1255 float128
int64_to_float128( int64 a STATUS_PARAM
)
1261 bits64 zSig0
, zSig1
;
1263 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
1265 absA
= zSign
? - a
: a
;
1266 shiftCount
= countLeadingZeros64( absA
) + 49;
1267 zExp
= 0x406E - shiftCount
;
1268 if ( 64 <= shiftCount
) {
1277 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
1278 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
1284 /*----------------------------------------------------------------------------
1285 | Returns the result of converting the single-precision floating-point value
1286 | `a' to the 32-bit two's complement integer format. The conversion is
1287 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1288 | Arithmetic---which means in particular that the conversion is rounded
1289 | according to the current rounding mode. If `a' is a NaN, the largest
1290 | positive integer is returned. Otherwise, if the conversion overflows, the
1291 | largest integer with the same sign as `a' is returned.
1292 *----------------------------------------------------------------------------*/
1294 int32
float32_to_int32( float32 a STATUS_PARAM
)
1297 int16 aExp
, shiftCount
;
1301 aSig
= extractFloat32Frac( a
);
1302 aExp
= extractFloat32Exp( a
);
1303 aSign
= extractFloat32Sign( a
);
1304 if ( ( aExp
== 0xFF ) && aSig
) aSign
= 0;
1305 if ( aExp
) aSig
|= 0x00800000;
1306 shiftCount
= 0xAF - aExp
;
1309 if ( 0 < shiftCount
) shift64RightJamming( aSig64
, shiftCount
, &aSig64
);
1310 return roundAndPackInt32( aSign
, aSig64 STATUS_VAR
);
1314 /*----------------------------------------------------------------------------
1315 | Returns the result of converting the single-precision floating-point value
1316 | `a' to the 32-bit two's complement integer format. The conversion is
1317 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1318 | Arithmetic, except that the conversion is always rounded toward zero.
1319 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1320 | the conversion overflows, the largest integer with the same sign as `a' is
1322 *----------------------------------------------------------------------------*/
1324 int32
float32_to_int32_round_to_zero( float32 a STATUS_PARAM
)
1327 int16 aExp
, shiftCount
;
1331 aSig
= extractFloat32Frac( a
);
1332 aExp
= extractFloat32Exp( a
);
1333 aSign
= extractFloat32Sign( a
);
1334 shiftCount
= aExp
- 0x9E;
1335 if ( 0 <= shiftCount
) {
1336 if ( float32_val(a
) != 0xCF000000 ) {
1337 float_raise( float_flag_invalid STATUS_VAR
);
1338 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) return 0x7FFFFFFF;
1340 return (sbits32
) 0x80000000;
1342 else if ( aExp
<= 0x7E ) {
1343 if ( aExp
| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
1346 aSig
= ( aSig
| 0x00800000 )<<8;
1347 z
= aSig
>>( - shiftCount
);
1348 if ( (bits32
) ( aSig
<<( shiftCount
& 31 ) ) ) {
1349 STATUS(float_exception_flags
) |= float_flag_inexact
;
1351 if ( aSign
) z
= - z
;
1356 /*----------------------------------------------------------------------------
1357 | Returns the result of converting the single-precision floating-point value
1358 | `a' to the 64-bit two's complement integer format. The conversion is
1359 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1360 | Arithmetic---which means in particular that the conversion is rounded
1361 | according to the current rounding mode. If `a' is a NaN, the largest
1362 | positive integer is returned. Otherwise, if the conversion overflows, the
1363 | largest integer with the same sign as `a' is returned.
1364 *----------------------------------------------------------------------------*/
1366 int64
float32_to_int64( float32 a STATUS_PARAM
)
1369 int16 aExp
, shiftCount
;
1371 bits64 aSig64
, aSigExtra
;
1373 aSig
= extractFloat32Frac( a
);
1374 aExp
= extractFloat32Exp( a
);
1375 aSign
= extractFloat32Sign( a
);
1376 shiftCount
= 0xBE - aExp
;
1377 if ( shiftCount
< 0 ) {
1378 float_raise( float_flag_invalid STATUS_VAR
);
1379 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) {
1380 return LIT64( 0x7FFFFFFFFFFFFFFF );
1382 return (sbits64
) LIT64( 0x8000000000000000 );
1384 if ( aExp
) aSig
|= 0x00800000;
1387 shift64ExtraRightJamming( aSig64
, 0, shiftCount
, &aSig64
, &aSigExtra
);
1388 return roundAndPackInt64( aSign
, aSig64
, aSigExtra STATUS_VAR
);
1392 /*----------------------------------------------------------------------------
1393 | Returns the result of converting the single-precision floating-point value
1394 | `a' to the 64-bit two's complement integer format. The conversion is
1395 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1396 | Arithmetic, except that the conversion is always rounded toward zero. If
1397 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1398 | conversion overflows, the largest integer with the same sign as `a' is
1400 *----------------------------------------------------------------------------*/
1402 int64
float32_to_int64_round_to_zero( float32 a STATUS_PARAM
)
1405 int16 aExp
, shiftCount
;
1410 aSig
= extractFloat32Frac( a
);
1411 aExp
= extractFloat32Exp( a
);
1412 aSign
= extractFloat32Sign( a
);
1413 shiftCount
= aExp
- 0xBE;
1414 if ( 0 <= shiftCount
) {
1415 if ( float32_val(a
) != 0xDF000000 ) {
1416 float_raise( float_flag_invalid STATUS_VAR
);
1417 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) {
1418 return LIT64( 0x7FFFFFFFFFFFFFFF );
1421 return (sbits64
) LIT64( 0x8000000000000000 );
1423 else if ( aExp
<= 0x7E ) {
1424 if ( aExp
| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
1427 aSig64
= aSig
| 0x00800000;
1429 z
= aSig64
>>( - shiftCount
);
1430 if ( (bits64
) ( aSig64
<<( shiftCount
& 63 ) ) ) {
1431 STATUS(float_exception_flags
) |= float_flag_inexact
;
1433 if ( aSign
) z
= - z
;
1438 /*----------------------------------------------------------------------------
1439 | Returns the result of converting the single-precision floating-point value
1440 | `a' to the double-precision floating-point format. The conversion is
1441 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1443 *----------------------------------------------------------------------------*/
1445 float64
float32_to_float64( float32 a STATUS_PARAM
)
1451 aSig
= extractFloat32Frac( a
);
1452 aExp
= extractFloat32Exp( a
);
1453 aSign
= extractFloat32Sign( a
);
1454 if ( aExp
== 0xFF ) {
1455 if ( aSig
) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR
));
1456 return packFloat64( aSign
, 0x7FF, 0 );
1459 if ( aSig
== 0 ) return packFloat64( aSign
, 0, 0 );
1460 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1463 return packFloat64( aSign
, aExp
+ 0x380, ( (bits64
) aSig
)<<29 );
1469 /*----------------------------------------------------------------------------
1470 | Returns the result of converting the single-precision floating-point value
1471 | `a' to the extended double-precision floating-point format. The conversion
1472 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1474 *----------------------------------------------------------------------------*/
1476 floatx80
float32_to_floatx80( float32 a STATUS_PARAM
)
1482 aSig
= extractFloat32Frac( a
);
1483 aExp
= extractFloat32Exp( a
);
1484 aSign
= extractFloat32Sign( a
);
1485 if ( aExp
== 0xFF ) {
1486 if ( aSig
) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR
) );
1487 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
1490 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
1491 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1494 return packFloatx80( aSign
, aExp
+ 0x3F80, ( (bits64
) aSig
)<<40 );
1502 /*----------------------------------------------------------------------------
1503 | Returns the result of converting the single-precision floating-point value
1504 | `a' to the double-precision floating-point format. The conversion is
1505 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1507 *----------------------------------------------------------------------------*/
1509 float128
float32_to_float128( float32 a STATUS_PARAM
)
1515 aSig
= extractFloat32Frac( a
);
1516 aExp
= extractFloat32Exp( a
);
1517 aSign
= extractFloat32Sign( a
);
1518 if ( aExp
== 0xFF ) {
1519 if ( aSig
) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR
) );
1520 return packFloat128( aSign
, 0x7FFF, 0, 0 );
1523 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
1524 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1527 return packFloat128( aSign
, aExp
+ 0x3F80, ( (bits64
) aSig
)<<25, 0 );
1533 /*----------------------------------------------------------------------------
1534 | Rounds the single-precision floating-point value `a' to an integer, and
1535 | returns the result as a single-precision floating-point value. The
1536 | operation is performed according to the IEC/IEEE Standard for Binary
1537 | Floating-Point Arithmetic.
1538 *----------------------------------------------------------------------------*/
1540 float32
float32_round_to_int( float32 a STATUS_PARAM
)
1544 bits32 lastBitMask
, roundBitsMask
;
1548 aExp
= extractFloat32Exp( a
);
1549 if ( 0x96 <= aExp
) {
1550 if ( ( aExp
== 0xFF ) && extractFloat32Frac( a
) ) {
1551 return propagateFloat32NaN( a
, a STATUS_VAR
);
1555 if ( aExp
<= 0x7E ) {
1556 if ( (bits32
) ( float32_val(a
)<<1 ) == 0 ) return a
;
1557 STATUS(float_exception_flags
) |= float_flag_inexact
;
1558 aSign
= extractFloat32Sign( a
);
1559 switch ( STATUS(float_rounding_mode
) ) {
1560 case float_round_nearest_even
:
1561 if ( ( aExp
== 0x7E ) && extractFloat32Frac( a
) ) {
1562 return packFloat32( aSign
, 0x7F, 0 );
1565 case float_round_down
:
1566 return make_float32(aSign
? 0xBF800000 : 0);
1567 case float_round_up
:
1568 return make_float32(aSign
? 0x80000000 : 0x3F800000);
1570 return packFloat32( aSign
, 0, 0 );
1573 lastBitMask
<<= 0x96 - aExp
;
1574 roundBitsMask
= lastBitMask
- 1;
1576 roundingMode
= STATUS(float_rounding_mode
);
1577 if ( roundingMode
== float_round_nearest_even
) {
1578 z
+= lastBitMask
>>1;
1579 if ( ( z
& roundBitsMask
) == 0 ) z
&= ~ lastBitMask
;
1581 else if ( roundingMode
!= float_round_to_zero
) {
1582 if ( extractFloat32Sign( make_float32(z
) ) ^ ( roundingMode
== float_round_up
) ) {
1586 z
&= ~ roundBitsMask
;
1587 if ( z
!= float32_val(a
) ) STATUS(float_exception_flags
) |= float_flag_inexact
;
1588 return make_float32(z
);
1592 /*----------------------------------------------------------------------------
1593 | Returns the result of adding the absolute values of the single-precision
1594 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1595 | before being returned. `zSign' is ignored if the result is a NaN.
1596 | The addition is performed according to the IEC/IEEE Standard for Binary
1597 | Floating-Point Arithmetic.
1598 *----------------------------------------------------------------------------*/
1600 static float32
addFloat32Sigs( float32 a
, float32 b
, flag zSign STATUS_PARAM
)
1602 int16 aExp
, bExp
, zExp
;
1603 bits32 aSig
, bSig
, zSig
;
1606 aSig
= extractFloat32Frac( a
);
1607 aExp
= extractFloat32Exp( a
);
1608 bSig
= extractFloat32Frac( b
);
1609 bExp
= extractFloat32Exp( b
);
1610 expDiff
= aExp
- bExp
;
1613 if ( 0 < expDiff
) {
1614 if ( aExp
== 0xFF ) {
1615 if ( aSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1624 shift32RightJamming( bSig
, expDiff
, &bSig
);
1627 else if ( expDiff
< 0 ) {
1628 if ( bExp
== 0xFF ) {
1629 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1630 return packFloat32( zSign
, 0xFF, 0 );
1638 shift32RightJamming( aSig
, - expDiff
, &aSig
);
1642 if ( aExp
== 0xFF ) {
1643 if ( aSig
| bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1647 if ( STATUS(flush_to_zero
) ) return packFloat32( zSign
, 0, 0 );
1648 return packFloat32( zSign
, 0, ( aSig
+ bSig
)>>6 );
1650 zSig
= 0x40000000 + aSig
+ bSig
;
1655 zSig
= ( aSig
+ bSig
)<<1;
1657 if ( (sbits32
) zSig
< 0 ) {
1662 return roundAndPackFloat32( zSign
, zExp
, zSig STATUS_VAR
);
1666 /*----------------------------------------------------------------------------
1667 | Returns the result of subtracting the absolute values of the single-
1668 | precision floating-point values `a' and `b'. If `zSign' is 1, the
1669 | difference is negated before being returned. `zSign' is ignored if the
1670 | result is a NaN. The subtraction is performed according to the IEC/IEEE
1671 | Standard for Binary Floating-Point Arithmetic.
1672 *----------------------------------------------------------------------------*/
1674 static float32
subFloat32Sigs( float32 a
, float32 b
, flag zSign STATUS_PARAM
)
1676 int16 aExp
, bExp
, zExp
;
1677 bits32 aSig
, bSig
, zSig
;
1680 aSig
= extractFloat32Frac( a
);
1681 aExp
= extractFloat32Exp( a
);
1682 bSig
= extractFloat32Frac( b
);
1683 bExp
= extractFloat32Exp( b
);
1684 expDiff
= aExp
- bExp
;
1687 if ( 0 < expDiff
) goto aExpBigger
;
1688 if ( expDiff
< 0 ) goto bExpBigger
;
1689 if ( aExp
== 0xFF ) {
1690 if ( aSig
| bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1691 float_raise( float_flag_invalid STATUS_VAR
);
1692 return float32_default_nan
;
1698 if ( bSig
< aSig
) goto aBigger
;
1699 if ( aSig
< bSig
) goto bBigger
;
1700 return packFloat32( STATUS(float_rounding_mode
) == float_round_down
, 0, 0 );
1702 if ( bExp
== 0xFF ) {
1703 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1704 return packFloat32( zSign
^ 1, 0xFF, 0 );
1712 shift32RightJamming( aSig
, - expDiff
, &aSig
);
1718 goto normalizeRoundAndPack
;
1720 if ( aExp
== 0xFF ) {
1721 if ( aSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1730 shift32RightJamming( bSig
, expDiff
, &bSig
);
1735 normalizeRoundAndPack
:
1737 return normalizeRoundAndPackFloat32( zSign
, zExp
, zSig STATUS_VAR
);
1741 /*----------------------------------------------------------------------------
1742 | Returns the result of adding the single-precision floating-point values `a'
1743 | and `b'. The operation is performed according to the IEC/IEEE Standard for
1744 | Binary Floating-Point Arithmetic.
1745 *----------------------------------------------------------------------------*/
1747 float32
float32_add( float32 a
, float32 b STATUS_PARAM
)
1751 aSign
= extractFloat32Sign( a
);
1752 bSign
= extractFloat32Sign( b
);
1753 if ( aSign
== bSign
) {
1754 return addFloat32Sigs( a
, b
, aSign STATUS_VAR
);
1757 return subFloat32Sigs( a
, b
, aSign STATUS_VAR
);
1762 /*----------------------------------------------------------------------------
1763 | Returns the result of subtracting the single-precision floating-point values
1764 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1765 | for Binary Floating-Point Arithmetic.
1766 *----------------------------------------------------------------------------*/
1768 float32
float32_sub( float32 a
, float32 b STATUS_PARAM
)
1772 aSign
= extractFloat32Sign( a
);
1773 bSign
= extractFloat32Sign( b
);
1774 if ( aSign
== bSign
) {
1775 return subFloat32Sigs( a
, b
, aSign STATUS_VAR
);
1778 return addFloat32Sigs( a
, b
, aSign STATUS_VAR
);
1783 /*----------------------------------------------------------------------------
1784 | Returns the result of multiplying the single-precision floating-point values
1785 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1786 | for Binary Floating-Point Arithmetic.
1787 *----------------------------------------------------------------------------*/
1789 float32
float32_mul( float32 a
, float32 b STATUS_PARAM
)
1791 flag aSign
, bSign
, zSign
;
1792 int16 aExp
, bExp
, zExp
;
1797 aSig
= extractFloat32Frac( a
);
1798 aExp
= extractFloat32Exp( a
);
1799 aSign
= extractFloat32Sign( a
);
1800 bSig
= extractFloat32Frac( b
);
1801 bExp
= extractFloat32Exp( b
);
1802 bSign
= extractFloat32Sign( b
);
1803 zSign
= aSign
^ bSign
;
1804 if ( aExp
== 0xFF ) {
1805 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
1806 return propagateFloat32NaN( a
, b STATUS_VAR
);
1808 if ( ( bExp
| bSig
) == 0 ) {
1809 float_raise( float_flag_invalid STATUS_VAR
);
1810 return float32_default_nan
;
1812 return packFloat32( zSign
, 0xFF, 0 );
1814 if ( bExp
== 0xFF ) {
1815 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1816 if ( ( aExp
| aSig
) == 0 ) {
1817 float_raise( float_flag_invalid STATUS_VAR
);
1818 return float32_default_nan
;
1820 return packFloat32( zSign
, 0xFF, 0 );
1823 if ( aSig
== 0 ) return packFloat32( zSign
, 0, 0 );
1824 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1827 if ( bSig
== 0 ) return packFloat32( zSign
, 0, 0 );
1828 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
1830 zExp
= aExp
+ bExp
- 0x7F;
1831 aSig
= ( aSig
| 0x00800000 )<<7;
1832 bSig
= ( bSig
| 0x00800000 )<<8;
1833 shift64RightJamming( ( (bits64
) aSig
) * bSig
, 32, &zSig64
);
1835 if ( 0 <= (sbits32
) ( zSig
<<1 ) ) {
1839 return roundAndPackFloat32( zSign
, zExp
, zSig STATUS_VAR
);
1843 /*----------------------------------------------------------------------------
1844 | Returns the result of dividing the single-precision floating-point value `a'
1845 | by the corresponding value `b'. The operation is performed according to the
1846 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1847 *----------------------------------------------------------------------------*/
1849 float32
float32_div( float32 a
, float32 b STATUS_PARAM
)
1851 flag aSign
, bSign
, zSign
;
1852 int16 aExp
, bExp
, zExp
;
1853 bits32 aSig
, bSig
, zSig
;
1855 aSig
= extractFloat32Frac( a
);
1856 aExp
= extractFloat32Exp( a
);
1857 aSign
= extractFloat32Sign( a
);
1858 bSig
= extractFloat32Frac( b
);
1859 bExp
= extractFloat32Exp( b
);
1860 bSign
= extractFloat32Sign( b
);
1861 zSign
= aSign
^ bSign
;
1862 if ( aExp
== 0xFF ) {
1863 if ( aSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1864 if ( bExp
== 0xFF ) {
1865 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1866 float_raise( float_flag_invalid STATUS_VAR
);
1867 return float32_default_nan
;
1869 return packFloat32( zSign
, 0xFF, 0 );
1871 if ( bExp
== 0xFF ) {
1872 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1873 return packFloat32( zSign
, 0, 0 );
1877 if ( ( aExp
| aSig
) == 0 ) {
1878 float_raise( float_flag_invalid STATUS_VAR
);
1879 return float32_default_nan
;
1881 float_raise( float_flag_divbyzero STATUS_VAR
);
1882 return packFloat32( zSign
, 0xFF, 0 );
1884 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
1887 if ( aSig
== 0 ) return packFloat32( zSign
, 0, 0 );
1888 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1890 zExp
= aExp
- bExp
+ 0x7D;
1891 aSig
= ( aSig
| 0x00800000 )<<7;
1892 bSig
= ( bSig
| 0x00800000 )<<8;
1893 if ( bSig
<= ( aSig
+ aSig
) ) {
1897 zSig
= ( ( (bits64
) aSig
)<<32 ) / bSig
;
1898 if ( ( zSig
& 0x3F ) == 0 ) {
1899 zSig
|= ( (bits64
) bSig
* zSig
!= ( (bits64
) aSig
)<<32 );
1901 return roundAndPackFloat32( zSign
, zExp
, zSig STATUS_VAR
);
1905 /*----------------------------------------------------------------------------
1906 | Returns the remainder of the single-precision floating-point value `a'
1907 | with respect to the corresponding value `b'. The operation is performed
1908 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1909 *----------------------------------------------------------------------------*/
1911 float32
float32_rem( float32 a
, float32 b STATUS_PARAM
)
1914 int16 aExp
, bExp
, expDiff
;
1917 bits64 aSig64
, bSig64
, q64
;
1918 bits32 alternateASig
;
1921 aSig
= extractFloat32Frac( a
);
1922 aExp
= extractFloat32Exp( a
);
1923 aSign
= extractFloat32Sign( a
);
1924 bSig
= extractFloat32Frac( b
);
1925 bExp
= extractFloat32Exp( b
);
1926 if ( aExp
== 0xFF ) {
1927 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
1928 return propagateFloat32NaN( a
, b STATUS_VAR
);
1930 float_raise( float_flag_invalid STATUS_VAR
);
1931 return float32_default_nan
;
1933 if ( bExp
== 0xFF ) {
1934 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1939 float_raise( float_flag_invalid STATUS_VAR
);
1940 return float32_default_nan
;
1942 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
1945 if ( aSig
== 0 ) return a
;
1946 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1948 expDiff
= aExp
- bExp
;
1951 if ( expDiff
< 32 ) {
1954 if ( expDiff
< 0 ) {
1955 if ( expDiff
< -1 ) return a
;
1958 q
= ( bSig
<= aSig
);
1959 if ( q
) aSig
-= bSig
;
1960 if ( 0 < expDiff
) {
1961 q
= ( ( (bits64
) aSig
)<<32 ) / bSig
;
1964 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
1972 if ( bSig
<= aSig
) aSig
-= bSig
;
1973 aSig64
= ( (bits64
) aSig
)<<40;
1974 bSig64
= ( (bits64
) bSig
)<<40;
1976 while ( 0 < expDiff
) {
1977 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
1978 q64
= ( 2 < q64
) ? q64
- 2 : 0;
1979 aSig64
= - ( ( bSig
* q64
)<<38 );
1983 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
1984 q64
= ( 2 < q64
) ? q64
- 2 : 0;
1985 q
= q64
>>( 64 - expDiff
);
1987 aSig
= ( ( aSig64
>>33 )<<( expDiff
- 1 ) ) - bSig
* q
;
1990 alternateASig
= aSig
;
1993 } while ( 0 <= (sbits32
) aSig
);
1994 sigMean
= aSig
+ alternateASig
;
1995 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
1996 aSig
= alternateASig
;
1998 zSign
= ( (sbits32
) aSig
< 0 );
1999 if ( zSign
) aSig
= - aSig
;
2000 return normalizeRoundAndPackFloat32( aSign
^ zSign
, bExp
, aSig STATUS_VAR
);
2004 /*----------------------------------------------------------------------------
2005 | Returns the square root of the single-precision floating-point value `a'.
2006 | The operation is performed according to the IEC/IEEE Standard for Binary
2007 | Floating-Point Arithmetic.
2008 *----------------------------------------------------------------------------*/
2010 float32
float32_sqrt( float32 a STATUS_PARAM
)
2017 aSig
= extractFloat32Frac( a
);
2018 aExp
= extractFloat32Exp( a
);
2019 aSign
= extractFloat32Sign( a
);
2020 if ( aExp
== 0xFF ) {
2021 if ( aSig
) return propagateFloat32NaN( a
, float32_zero STATUS_VAR
);
2022 if ( ! aSign
) return a
;
2023 float_raise( float_flag_invalid STATUS_VAR
);
2024 return float32_default_nan
;
2027 if ( ( aExp
| aSig
) == 0 ) return a
;
2028 float_raise( float_flag_invalid STATUS_VAR
);
2029 return float32_default_nan
;
2032 if ( aSig
== 0 ) return float32_zero
;
2033 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
2035 zExp
= ( ( aExp
- 0x7F )>>1 ) + 0x7E;
2036 aSig
= ( aSig
| 0x00800000 )<<8;
2037 zSig
= estimateSqrt32( aExp
, aSig
) + 2;
2038 if ( ( zSig
& 0x7F ) <= 5 ) {
2044 term
= ( (bits64
) zSig
) * zSig
;
2045 rem
= ( ( (bits64
) aSig
)<<32 ) - term
;
2046 while ( (sbits64
) rem
< 0 ) {
2048 rem
+= ( ( (bits64
) zSig
)<<1 ) | 1;
2050 zSig
|= ( rem
!= 0 );
2052 shift32RightJamming( zSig
, 1, &zSig
);
2054 return roundAndPackFloat32( 0, zExp
, zSig STATUS_VAR
);
2058 /*----------------------------------------------------------------------------
2059 | Returns the binary log of the single-precision floating-point value `a'.
2060 | The operation is performed according to the IEC/IEEE Standard for Binary
2061 | Floating-Point Arithmetic.
2062 *----------------------------------------------------------------------------*/
2063 float32
float32_log2( float32 a STATUS_PARAM
)
2067 bits32 aSig
, zSig
, i
;
2069 aSig
= extractFloat32Frac( a
);
2070 aExp
= extractFloat32Exp( a
);
2071 aSign
= extractFloat32Sign( a
);
2074 if ( aSig
== 0 ) return packFloat32( 1, 0xFF, 0 );
2075 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
2078 float_raise( float_flag_invalid STATUS_VAR
);
2079 return float32_default_nan
;
2081 if ( aExp
== 0xFF ) {
2082 if ( aSig
) return propagateFloat32NaN( a
, float32_zero STATUS_VAR
);
2091 for (i
= 1 << 22; i
> 0; i
>>= 1) {
2092 aSig
= ( (bits64
)aSig
* aSig
) >> 23;
2093 if ( aSig
& 0x01000000 ) {
2102 return normalizeRoundAndPackFloat32( zSign
, 0x85, zSig STATUS_VAR
);
2105 /*----------------------------------------------------------------------------
2106 | Returns 1 if the single-precision floating-point value `a' is equal to
2107 | the corresponding value `b', and 0 otherwise. The comparison is performed
2108 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2109 *----------------------------------------------------------------------------*/
2111 int float32_eq( float32 a
, float32 b STATUS_PARAM
)
2114 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2115 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2117 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
2118 float_raise( float_flag_invalid STATUS_VAR
);
2122 return ( float32_val(a
) == float32_val(b
) ) ||
2123 ( (bits32
) ( ( float32_val(a
) | float32_val(b
) )<<1 ) == 0 );
2127 /*----------------------------------------------------------------------------
2128 | Returns 1 if the single-precision floating-point value `a' is less than
2129 | or equal to the corresponding value `b', and 0 otherwise. The comparison
2130 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2132 *----------------------------------------------------------------------------*/
2134 int float32_le( float32 a
, float32 b STATUS_PARAM
)
2139 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2140 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2142 float_raise( float_flag_invalid STATUS_VAR
);
2145 aSign
= extractFloat32Sign( a
);
2146 bSign
= extractFloat32Sign( b
);
2147 av
= float32_val(a
);
2148 bv
= float32_val(b
);
2149 if ( aSign
!= bSign
) return aSign
|| ( (bits32
) ( ( av
| bv
)<<1 ) == 0 );
2150 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
2154 /*----------------------------------------------------------------------------
2155 | Returns 1 if the single-precision floating-point value `a' is less than
2156 | the corresponding value `b', and 0 otherwise. The comparison is performed
2157 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2158 *----------------------------------------------------------------------------*/
2160 int float32_lt( float32 a
, float32 b STATUS_PARAM
)
2165 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2166 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2168 float_raise( float_flag_invalid STATUS_VAR
);
2171 aSign
= extractFloat32Sign( a
);
2172 bSign
= extractFloat32Sign( b
);
2173 av
= float32_val(a
);
2174 bv
= float32_val(b
);
2175 if ( aSign
!= bSign
) return aSign
&& ( (bits32
) ( ( av
| bv
)<<1 ) != 0 );
2176 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
2180 /*----------------------------------------------------------------------------
2181 | Returns 1 if the single-precision floating-point value `a' is equal to
2182 | the corresponding value `b', and 0 otherwise. The invalid exception is
2183 | raised if either operand is a NaN. Otherwise, the comparison is performed
2184 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2185 *----------------------------------------------------------------------------*/
2187 int float32_eq_signaling( float32 a
, float32 b STATUS_PARAM
)
2191 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2192 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2194 float_raise( float_flag_invalid STATUS_VAR
);
2197 av
= float32_val(a
);
2198 bv
= float32_val(b
);
2199 return ( av
== bv
) || ( (bits32
) ( ( av
| bv
)<<1 ) == 0 );
2203 /*----------------------------------------------------------------------------
2204 | Returns 1 if the single-precision floating-point value `a' is less than or
2205 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2206 | cause an exception. Otherwise, the comparison is performed according to the
2207 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2208 *----------------------------------------------------------------------------*/
2210 int float32_le_quiet( float32 a
, float32 b STATUS_PARAM
)
2215 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2216 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2218 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
2219 float_raise( float_flag_invalid STATUS_VAR
);
2223 aSign
= extractFloat32Sign( a
);
2224 bSign
= extractFloat32Sign( b
);
2225 av
= float32_val(a
);
2226 bv
= float32_val(b
);
2227 if ( aSign
!= bSign
) return aSign
|| ( (bits32
) ( ( av
| bv
)<<1 ) == 0 );
2228 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
2232 /*----------------------------------------------------------------------------
2233 | Returns 1 if the single-precision floating-point value `a' is less than
2234 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2235 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
2236 | Standard for Binary Floating-Point Arithmetic.
2237 *----------------------------------------------------------------------------*/
2239 int float32_lt_quiet( float32 a
, float32 b STATUS_PARAM
)
2244 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2245 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2247 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
2248 float_raise( float_flag_invalid STATUS_VAR
);
2252 aSign
= extractFloat32Sign( a
);
2253 bSign
= extractFloat32Sign( b
);
2254 av
= float32_val(a
);
2255 bv
= float32_val(b
);
2256 if ( aSign
!= bSign
) return aSign
&& ( (bits32
) ( ( av
| bv
)<<1 ) != 0 );
2257 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
2261 /*----------------------------------------------------------------------------
2262 | Returns the result of converting the double-precision floating-point value
2263 | `a' to the 32-bit two's complement integer format. The conversion is
2264 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2265 | Arithmetic---which means in particular that the conversion is rounded
2266 | according to the current rounding mode. If `a' is a NaN, the largest
2267 | positive integer is returned. Otherwise, if the conversion overflows, the
2268 | largest integer with the same sign as `a' is returned.
2269 *----------------------------------------------------------------------------*/
2271 int32
float64_to_int32( float64 a STATUS_PARAM
)
2274 int16 aExp
, shiftCount
;
2277 aSig
= extractFloat64Frac( a
);
2278 aExp
= extractFloat64Exp( a
);
2279 aSign
= extractFloat64Sign( a
);
2280 if ( ( aExp
== 0x7FF ) && aSig
) aSign
= 0;
2281 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
2282 shiftCount
= 0x42C - aExp
;
2283 if ( 0 < shiftCount
) shift64RightJamming( aSig
, shiftCount
, &aSig
);
2284 return roundAndPackInt32( aSign
, aSig STATUS_VAR
);
2288 /*----------------------------------------------------------------------------
2289 | Returns the result of converting the double-precision floating-point value
2290 | `a' to the 32-bit two's complement integer format. The conversion is
2291 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2292 | Arithmetic, except that the conversion is always rounded toward zero.
2293 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2294 | the conversion overflows, the largest integer with the same sign as `a' is
2296 *----------------------------------------------------------------------------*/
2298 int32
float64_to_int32_round_to_zero( float64 a STATUS_PARAM
)
2301 int16 aExp
, shiftCount
;
2302 bits64 aSig
, savedASig
;
2305 aSig
= extractFloat64Frac( a
);
2306 aExp
= extractFloat64Exp( a
);
2307 aSign
= extractFloat64Sign( a
);
2308 if ( 0x41E < aExp
) {
2309 if ( ( aExp
== 0x7FF ) && aSig
) aSign
= 0;
2312 else if ( aExp
< 0x3FF ) {
2313 if ( aExp
|| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
2316 aSig
|= LIT64( 0x0010000000000000 );
2317 shiftCount
= 0x433 - aExp
;
2319 aSig
>>= shiftCount
;
2321 if ( aSign
) z
= - z
;
2322 if ( ( z
< 0 ) ^ aSign
) {
2324 float_raise( float_flag_invalid STATUS_VAR
);
2325 return aSign
? (sbits32
) 0x80000000 : 0x7FFFFFFF;
2327 if ( ( aSig
<<shiftCount
) != savedASig
) {
2328 STATUS(float_exception_flags
) |= float_flag_inexact
;
2334 /*----------------------------------------------------------------------------
2335 | Returns the result of converting the double-precision floating-point value
2336 | `a' to the 64-bit two's complement integer format. The conversion is
2337 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2338 | Arithmetic---which means in particular that the conversion is rounded
2339 | according to the current rounding mode. If `a' is a NaN, the largest
2340 | positive integer is returned. Otherwise, if the conversion overflows, the
2341 | largest integer with the same sign as `a' is returned.
2342 *----------------------------------------------------------------------------*/
2344 int64
float64_to_int64( float64 a STATUS_PARAM
)
2347 int16 aExp
, shiftCount
;
2348 bits64 aSig
, aSigExtra
;
2350 aSig
= extractFloat64Frac( a
);
2351 aExp
= extractFloat64Exp( a
);
2352 aSign
= extractFloat64Sign( a
);
2353 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
2354 shiftCount
= 0x433 - aExp
;
2355 if ( shiftCount
<= 0 ) {
2356 if ( 0x43E < aExp
) {
2357 float_raise( float_flag_invalid STATUS_VAR
);
2359 || ( ( aExp
== 0x7FF )
2360 && ( aSig
!= LIT64( 0x0010000000000000 ) ) )
2362 return LIT64( 0x7FFFFFFFFFFFFFFF );
2364 return (sbits64
) LIT64( 0x8000000000000000 );
2367 aSig
<<= - shiftCount
;
2370 shift64ExtraRightJamming( aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
2372 return roundAndPackInt64( aSign
, aSig
, aSigExtra STATUS_VAR
);
2376 /*----------------------------------------------------------------------------
2377 | Returns the result of converting the double-precision floating-point value
2378 | `a' to the 64-bit two's complement integer format. The conversion is
2379 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2380 | Arithmetic, except that the conversion is always rounded toward zero.
2381 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2382 | the conversion overflows, the largest integer with the same sign as `a' is
2384 *----------------------------------------------------------------------------*/
2386 int64
float64_to_int64_round_to_zero( float64 a STATUS_PARAM
)
2389 int16 aExp
, shiftCount
;
2393 aSig
= extractFloat64Frac( a
);
2394 aExp
= extractFloat64Exp( a
);
2395 aSign
= extractFloat64Sign( a
);
2396 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
2397 shiftCount
= aExp
- 0x433;
2398 if ( 0 <= shiftCount
) {
2399 if ( 0x43E <= aExp
) {
2400 if ( float64_val(a
) != LIT64( 0xC3E0000000000000 ) ) {
2401 float_raise( float_flag_invalid STATUS_VAR
);
2403 || ( ( aExp
== 0x7FF )
2404 && ( aSig
!= LIT64( 0x0010000000000000 ) ) )
2406 return LIT64( 0x7FFFFFFFFFFFFFFF );
2409 return (sbits64
) LIT64( 0x8000000000000000 );
2411 z
= aSig
<<shiftCount
;
2414 if ( aExp
< 0x3FE ) {
2415 if ( aExp
| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
2418 z
= aSig
>>( - shiftCount
);
2419 if ( (bits64
) ( aSig
<<( shiftCount
& 63 ) ) ) {
2420 STATUS(float_exception_flags
) |= float_flag_inexact
;
2423 if ( aSign
) z
= - z
;
2428 /*----------------------------------------------------------------------------
2429 | Returns the result of converting the double-precision floating-point value
2430 | `a' to the single-precision floating-point format. The conversion is
2431 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2433 *----------------------------------------------------------------------------*/
2435 float32
float64_to_float32( float64 a STATUS_PARAM
)
2442 aSig
= extractFloat64Frac( a
);
2443 aExp
= extractFloat64Exp( a
);
2444 aSign
= extractFloat64Sign( a
);
2445 if ( aExp
== 0x7FF ) {
2446 if ( aSig
) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR
) );
2447 return packFloat32( aSign
, 0xFF, 0 );
2449 shift64RightJamming( aSig
, 22, &aSig
);
2451 if ( aExp
|| zSig
) {
2455 return roundAndPackFloat32( aSign
, aExp
, zSig STATUS_VAR
);
2460 /*----------------------------------------------------------------------------
2461 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2462 | half-precision floating-point value, returning the result. After being
2463 | shifted into the proper positions, the three fields are simply added
2464 | together to form the result. This means that any integer portion of `zSig'
2465 | will be added into the exponent. Since a properly normalized significand
2466 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2467 | than the desired result exponent whenever `zSig' is a complete, normalized
2469 *----------------------------------------------------------------------------*/
2470 static bits16
packFloat16(flag zSign
, int16 zExp
, bits16 zSig
)
2472 return (((bits32
)zSign
) << 15) + (((bits32
)zExp
) << 10) + zSig
;
2475 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
2476 The latter gains extra exponent range by omitting the NaN/Inf encodings. */
2478 float32
float16_to_float32( bits16 a
, flag ieee STATUS_PARAM
)
2485 aExp
= (a
>> 10) & 0x1f;
2488 if (aExp
== 0x1f && ieee
) {
2490 /* Make sure correct exceptions are raised. */
2491 float32ToCommonNaN(a STATUS_VAR
);
2494 return packFloat32(aSign
, 0xff, aSig
<< 13);
2500 return packFloat32(aSign
, 0, 0);
2503 shiftCount
= countLeadingZeros32( aSig
) - 21;
2504 aSig
= aSig
<< shiftCount
;
2507 return packFloat32( aSign
, aExp
+ 0x70, aSig
<< 13);
2510 bits16
float32_to_float16( float32 a
, flag ieee STATUS_PARAM
)
2519 aSig
= extractFloat32Frac( a
);
2520 aExp
= extractFloat32Exp( a
);
2521 aSign
= extractFloat32Sign( a
);
2522 if ( aExp
== 0xFF ) {
2524 /* Make sure correct exceptions are raised. */
2525 float32ToCommonNaN(a STATUS_VAR
);
2528 return packFloat16(aSign
, 0x1f, aSig
>> 13);
2530 if (aExp
== 0 && aSign
== 0) {
2531 return packFloat16(aSign
, 0, 0);
2533 /* Decimal point between bits 22 and 23. */
2547 float_raise( float_flag_underflow STATUS_VAR
);
2548 roundingMode
= STATUS(float_rounding_mode
);
2549 switch (roundingMode
) {
2550 case float_round_nearest_even
:
2551 increment
= (mask
+ 1) >> 1;
2552 if ((aSig
& mask
) == increment
) {
2553 increment
= aSig
& (increment
<< 1);
2556 case float_round_up
:
2557 increment
= aSign
? 0 : mask
;
2559 case float_round_down
:
2560 increment
= aSign
? mask
: 0;
2562 default: /* round_to_zero */
2567 if (aSig
>= 0x01000000) {
2571 } else if (aExp
< -14
2572 && STATUS(float_detect_tininess
) == float_tininess_before_rounding
) {
2573 float_raise( float_flag_underflow STATUS_VAR
);
2578 float_raise( float_flag_overflow
| float_flag_inexact STATUS_VAR
);
2579 return packFloat16(aSign
, 0x1f, 0);
2583 float_raise( float_flag_overflow
| float_flag_inexact STATUS_VAR
);
2584 return packFloat16(aSign
, 0x1f, 0x3ff);
2588 return packFloat16(aSign
, 0, 0);
2591 aSig
>>= -14 - aExp
;
2594 return packFloat16(aSign
, aExp
+ 14, aSig
>> 13);
2599 /*----------------------------------------------------------------------------
2600 | Returns the result of converting the double-precision floating-point value
2601 | `a' to the extended double-precision floating-point format. The conversion
2602 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2604 *----------------------------------------------------------------------------*/
2606 floatx80
float64_to_floatx80( float64 a STATUS_PARAM
)
2612 aSig
= extractFloat64Frac( a
);
2613 aExp
= extractFloat64Exp( a
);
2614 aSign
= extractFloat64Sign( a
);
2615 if ( aExp
== 0x7FF ) {
2616 if ( aSig
) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR
) );
2617 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
2620 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
2621 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2625 aSign
, aExp
+ 0x3C00, ( aSig
| LIT64( 0x0010000000000000 ) )<<11 );
2633 /*----------------------------------------------------------------------------
2634 | Returns the result of converting the double-precision floating-point value
2635 | `a' to the quadruple-precision floating-point format. The conversion is
2636 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2638 *----------------------------------------------------------------------------*/
2640 float128
float64_to_float128( float64 a STATUS_PARAM
)
2644 bits64 aSig
, zSig0
, zSig1
;
2646 aSig
= extractFloat64Frac( a
);
2647 aExp
= extractFloat64Exp( a
);
2648 aSign
= extractFloat64Sign( a
);
2649 if ( aExp
== 0x7FF ) {
2650 if ( aSig
) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR
) );
2651 return packFloat128( aSign
, 0x7FFF, 0, 0 );
2654 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
2655 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2658 shift128Right( aSig
, 0, 4, &zSig0
, &zSig1
);
2659 return packFloat128( aSign
, aExp
+ 0x3C00, zSig0
, zSig1
);
2665 /*----------------------------------------------------------------------------
2666 | Rounds the double-precision floating-point value `a' to an integer, and
2667 | returns the result as a double-precision floating-point value. The
2668 | operation is performed according to the IEC/IEEE Standard for Binary
2669 | Floating-Point Arithmetic.
2670 *----------------------------------------------------------------------------*/
2672 float64
float64_round_to_int( float64 a STATUS_PARAM
)
2676 bits64 lastBitMask
, roundBitsMask
;
2680 aExp
= extractFloat64Exp( a
);
2681 if ( 0x433 <= aExp
) {
2682 if ( ( aExp
== 0x7FF ) && extractFloat64Frac( a
) ) {
2683 return propagateFloat64NaN( a
, a STATUS_VAR
);
2687 if ( aExp
< 0x3FF ) {
2688 if ( (bits64
) ( float64_val(a
)<<1 ) == 0 ) return a
;
2689 STATUS(float_exception_flags
) |= float_flag_inexact
;
2690 aSign
= extractFloat64Sign( a
);
2691 switch ( STATUS(float_rounding_mode
) ) {
2692 case float_round_nearest_even
:
2693 if ( ( aExp
== 0x3FE ) && extractFloat64Frac( a
) ) {
2694 return packFloat64( aSign
, 0x3FF, 0 );
2697 case float_round_down
:
2698 return make_float64(aSign
? LIT64( 0xBFF0000000000000 ) : 0);
2699 case float_round_up
:
2700 return make_float64(
2701 aSign
? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
2703 return packFloat64( aSign
, 0, 0 );
2706 lastBitMask
<<= 0x433 - aExp
;
2707 roundBitsMask
= lastBitMask
- 1;
2709 roundingMode
= STATUS(float_rounding_mode
);
2710 if ( roundingMode
== float_round_nearest_even
) {
2711 z
+= lastBitMask
>>1;
2712 if ( ( z
& roundBitsMask
) == 0 ) z
&= ~ lastBitMask
;
2714 else if ( roundingMode
!= float_round_to_zero
) {
2715 if ( extractFloat64Sign( make_float64(z
) ) ^ ( roundingMode
== float_round_up
) ) {
2719 z
&= ~ roundBitsMask
;
2720 if ( z
!= float64_val(a
) )
2721 STATUS(float_exception_flags
) |= float_flag_inexact
;
2722 return make_float64(z
);
2726 float64
float64_trunc_to_int( float64 a STATUS_PARAM
)
2730 oldmode
= STATUS(float_rounding_mode
);
2731 STATUS(float_rounding_mode
) = float_round_to_zero
;
2732 res
= float64_round_to_int(a STATUS_VAR
);
2733 STATUS(float_rounding_mode
) = oldmode
;
2737 /*----------------------------------------------------------------------------
2738 | Returns the result of adding the absolute values of the double-precision
2739 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
2740 | before being returned. `zSign' is ignored if the result is a NaN.
2741 | The addition is performed according to the IEC/IEEE Standard for Binary
2742 | Floating-Point Arithmetic.
2743 *----------------------------------------------------------------------------*/
2745 static float64
addFloat64Sigs( float64 a
, float64 b
, flag zSign STATUS_PARAM
)
2747 int16 aExp
, bExp
, zExp
;
2748 bits64 aSig
, bSig
, zSig
;
2751 aSig
= extractFloat64Frac( a
);
2752 aExp
= extractFloat64Exp( a
);
2753 bSig
= extractFloat64Frac( b
);
2754 bExp
= extractFloat64Exp( b
);
2755 expDiff
= aExp
- bExp
;
2758 if ( 0 < expDiff
) {
2759 if ( aExp
== 0x7FF ) {
2760 if ( aSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
2767 bSig
|= LIT64( 0x2000000000000000 );
2769 shift64RightJamming( bSig
, expDiff
, &bSig
);
2772 else if ( expDiff
< 0 ) {
2773 if ( bExp
== 0x7FF ) {
2774 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
2775 return packFloat64( zSign
, 0x7FF, 0 );
2781 aSig
|= LIT64( 0x2000000000000000 );
2783 shift64RightJamming( aSig
, - expDiff
, &aSig
);
2787 if ( aExp
== 0x7FF ) {
2788 if ( aSig
| bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
2792 if ( STATUS(flush_to_zero
) ) return packFloat64( zSign
, 0, 0 );
2793 return packFloat64( zSign
, 0, ( aSig
+ bSig
)>>9 );
2795 zSig
= LIT64( 0x4000000000000000 ) + aSig
+ bSig
;
2799 aSig
|= LIT64( 0x2000000000000000 );
2800 zSig
= ( aSig
+ bSig
)<<1;
2802 if ( (sbits64
) zSig
< 0 ) {
2807 return roundAndPackFloat64( zSign
, zExp
, zSig STATUS_VAR
);
2811 /*----------------------------------------------------------------------------
2812 | Returns the result of subtracting the absolute values of the double-
2813 | precision floating-point values `a' and `b'. If `zSign' is 1, the
2814 | difference is negated before being returned. `zSign' is ignored if the
2815 | result is a NaN. The subtraction is performed according to the IEC/IEEE
2816 | Standard for Binary Floating-Point Arithmetic.
2817 *----------------------------------------------------------------------------*/
2819 static float64
subFloat64Sigs( float64 a
, float64 b
, flag zSign STATUS_PARAM
)
2821 int16 aExp
, bExp
, zExp
;
2822 bits64 aSig
, bSig
, zSig
;
2825 aSig
= extractFloat64Frac( a
);
2826 aExp
= extractFloat64Exp( a
);
2827 bSig
= extractFloat64Frac( b
);
2828 bExp
= extractFloat64Exp( b
);
2829 expDiff
= aExp
- bExp
;
2832 if ( 0 < expDiff
) goto aExpBigger
;
2833 if ( expDiff
< 0 ) goto bExpBigger
;
2834 if ( aExp
== 0x7FF ) {
2835 if ( aSig
| bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
2836 float_raise( float_flag_invalid STATUS_VAR
);
2837 return float64_default_nan
;
2843 if ( bSig
< aSig
) goto aBigger
;
2844 if ( aSig
< bSig
) goto bBigger
;
2845 return packFloat64( STATUS(float_rounding_mode
) == float_round_down
, 0, 0 );
2847 if ( bExp
== 0x7FF ) {
2848 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
2849 return packFloat64( zSign
^ 1, 0x7FF, 0 );
2855 aSig
|= LIT64( 0x4000000000000000 );
2857 shift64RightJamming( aSig
, - expDiff
, &aSig
);
2858 bSig
|= LIT64( 0x4000000000000000 );
2863 goto normalizeRoundAndPack
;
2865 if ( aExp
== 0x7FF ) {
2866 if ( aSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
2873 bSig
|= LIT64( 0x4000000000000000 );
2875 shift64RightJamming( bSig
, expDiff
, &bSig
);
2876 aSig
|= LIT64( 0x4000000000000000 );
2880 normalizeRoundAndPack
:
2882 return normalizeRoundAndPackFloat64( zSign
, zExp
, zSig STATUS_VAR
);
2886 /*----------------------------------------------------------------------------
2887 | Returns the result of adding the double-precision floating-point values `a'
2888 | and `b'. The operation is performed according to the IEC/IEEE Standard for
2889 | Binary Floating-Point Arithmetic.
2890 *----------------------------------------------------------------------------*/
2892 float64
float64_add( float64 a
, float64 b STATUS_PARAM
)
2896 aSign
= extractFloat64Sign( a
);
2897 bSign
= extractFloat64Sign( b
);
2898 if ( aSign
== bSign
) {
2899 return addFloat64Sigs( a
, b
, aSign STATUS_VAR
);
2902 return subFloat64Sigs( a
, b
, aSign STATUS_VAR
);
2907 /*----------------------------------------------------------------------------
2908 | Returns the result of subtracting the double-precision floating-point values
2909 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2910 | for Binary Floating-Point Arithmetic.
2911 *----------------------------------------------------------------------------*/
2913 float64
float64_sub( float64 a
, float64 b STATUS_PARAM
)
2917 aSign
= extractFloat64Sign( a
);
2918 bSign
= extractFloat64Sign( b
);
2919 if ( aSign
== bSign
) {
2920 return subFloat64Sigs( a
, b
, aSign STATUS_VAR
);
2923 return addFloat64Sigs( a
, b
, aSign STATUS_VAR
);
2928 /*----------------------------------------------------------------------------
2929 | Returns the result of multiplying the double-precision floating-point values
2930 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2931 | for Binary Floating-Point Arithmetic.
2932 *----------------------------------------------------------------------------*/
2934 float64
float64_mul( float64 a
, float64 b STATUS_PARAM
)
2936 flag aSign
, bSign
, zSign
;
2937 int16 aExp
, bExp
, zExp
;
2938 bits64 aSig
, bSig
, zSig0
, zSig1
;
2940 aSig
= extractFloat64Frac( a
);
2941 aExp
= extractFloat64Exp( a
);
2942 aSign
= extractFloat64Sign( a
);
2943 bSig
= extractFloat64Frac( b
);
2944 bExp
= extractFloat64Exp( b
);
2945 bSign
= extractFloat64Sign( b
);
2946 zSign
= aSign
^ bSign
;
2947 if ( aExp
== 0x7FF ) {
2948 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
2949 return propagateFloat64NaN( a
, b STATUS_VAR
);
2951 if ( ( bExp
| bSig
) == 0 ) {
2952 float_raise( float_flag_invalid STATUS_VAR
);
2953 return float64_default_nan
;
2955 return packFloat64( zSign
, 0x7FF, 0 );
2957 if ( bExp
== 0x7FF ) {
2958 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
2959 if ( ( aExp
| aSig
) == 0 ) {
2960 float_raise( float_flag_invalid STATUS_VAR
);
2961 return float64_default_nan
;
2963 return packFloat64( zSign
, 0x7FF, 0 );
2966 if ( aSig
== 0 ) return packFloat64( zSign
, 0, 0 );
2967 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2970 if ( bSig
== 0 ) return packFloat64( zSign
, 0, 0 );
2971 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
2973 zExp
= aExp
+ bExp
- 0x3FF;
2974 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<10;
2975 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
2976 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
2977 zSig0
|= ( zSig1
!= 0 );
2978 if ( 0 <= (sbits64
) ( zSig0
<<1 ) ) {
2982 return roundAndPackFloat64( zSign
, zExp
, zSig0 STATUS_VAR
);
2986 /*----------------------------------------------------------------------------
2987 | Returns the result of dividing the double-precision floating-point value `a'
2988 | by the corresponding value `b'. The operation is performed according to
2989 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2990 *----------------------------------------------------------------------------*/
2992 float64
float64_div( float64 a
, float64 b STATUS_PARAM
)
2994 flag aSign
, bSign
, zSign
;
2995 int16 aExp
, bExp
, zExp
;
2996 bits64 aSig
, bSig
, zSig
;
2998 bits64 term0
, term1
;
3000 aSig
= extractFloat64Frac( a
);
3001 aExp
= extractFloat64Exp( a
);
3002 aSign
= extractFloat64Sign( a
);
3003 bSig
= extractFloat64Frac( b
);
3004 bExp
= extractFloat64Exp( b
);
3005 bSign
= extractFloat64Sign( b
);
3006 zSign
= aSign
^ bSign
;
3007 if ( aExp
== 0x7FF ) {
3008 if ( aSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3009 if ( bExp
== 0x7FF ) {
3010 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3011 float_raise( float_flag_invalid STATUS_VAR
);
3012 return float64_default_nan
;
3014 return packFloat64( zSign
, 0x7FF, 0 );
3016 if ( bExp
== 0x7FF ) {
3017 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3018 return packFloat64( zSign
, 0, 0 );
3022 if ( ( aExp
| aSig
) == 0 ) {
3023 float_raise( float_flag_invalid STATUS_VAR
);
3024 return float64_default_nan
;
3026 float_raise( float_flag_divbyzero STATUS_VAR
);
3027 return packFloat64( zSign
, 0x7FF, 0 );
3029 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
3032 if ( aSig
== 0 ) return packFloat64( zSign
, 0, 0 );
3033 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
3035 zExp
= aExp
- bExp
+ 0x3FD;
3036 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<10;
3037 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
3038 if ( bSig
<= ( aSig
+ aSig
) ) {
3042 zSig
= estimateDiv128To64( aSig
, 0, bSig
);
3043 if ( ( zSig
& 0x1FF ) <= 2 ) {
3044 mul64To128( bSig
, zSig
, &term0
, &term1
);
3045 sub128( aSig
, 0, term0
, term1
, &rem0
, &rem1
);
3046 while ( (sbits64
) rem0
< 0 ) {
3048 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
3050 zSig
|= ( rem1
!= 0 );
3052 return roundAndPackFloat64( zSign
, zExp
, zSig STATUS_VAR
);
3056 /*----------------------------------------------------------------------------
3057 | Returns the remainder of the double-precision floating-point value `a'
3058 | with respect to the corresponding value `b'. The operation is performed
3059 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3060 *----------------------------------------------------------------------------*/
3062 float64
float64_rem( float64 a
, float64 b STATUS_PARAM
)
3065 int16 aExp
, bExp
, expDiff
;
3067 bits64 q
, alternateASig
;
3070 aSig
= extractFloat64Frac( a
);
3071 aExp
= extractFloat64Exp( a
);
3072 aSign
= extractFloat64Sign( a
);
3073 bSig
= extractFloat64Frac( b
);
3074 bExp
= extractFloat64Exp( b
);
3075 if ( aExp
== 0x7FF ) {
3076 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
3077 return propagateFloat64NaN( a
, b STATUS_VAR
);
3079 float_raise( float_flag_invalid STATUS_VAR
);
3080 return float64_default_nan
;
3082 if ( bExp
== 0x7FF ) {
3083 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3088 float_raise( float_flag_invalid STATUS_VAR
);
3089 return float64_default_nan
;
3091 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
3094 if ( aSig
== 0 ) return a
;
3095 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
3097 expDiff
= aExp
- bExp
;
3098 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<11;
3099 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
3100 if ( expDiff
< 0 ) {
3101 if ( expDiff
< -1 ) return a
;
3104 q
= ( bSig
<= aSig
);
3105 if ( q
) aSig
-= bSig
;
3107 while ( 0 < expDiff
) {
3108 q
= estimateDiv128To64( aSig
, 0, bSig
);
3109 q
= ( 2 < q
) ? q
- 2 : 0;
3110 aSig
= - ( ( bSig
>>2 ) * q
);
3114 if ( 0 < expDiff
) {
3115 q
= estimateDiv128To64( aSig
, 0, bSig
);
3116 q
= ( 2 < q
) ? q
- 2 : 0;
3119 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
3126 alternateASig
= aSig
;
3129 } while ( 0 <= (sbits64
) aSig
);
3130 sigMean
= aSig
+ alternateASig
;
3131 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
3132 aSig
= alternateASig
;
3134 zSign
= ( (sbits64
) aSig
< 0 );
3135 if ( zSign
) aSig
= - aSig
;
3136 return normalizeRoundAndPackFloat64( aSign
^ zSign
, bExp
, aSig STATUS_VAR
);
3140 /*----------------------------------------------------------------------------
3141 | Returns the square root of the double-precision floating-point value `a'.
3142 | The operation is performed according to the IEC/IEEE Standard for Binary
3143 | Floating-Point Arithmetic.
3144 *----------------------------------------------------------------------------*/
3146 float64
float64_sqrt( float64 a STATUS_PARAM
)
3150 bits64 aSig
, zSig
, doubleZSig
;
3151 bits64 rem0
, rem1
, term0
, term1
;
3153 aSig
= extractFloat64Frac( a
);
3154 aExp
= extractFloat64Exp( a
);
3155 aSign
= extractFloat64Sign( a
);
3156 if ( aExp
== 0x7FF ) {
3157 if ( aSig
) return propagateFloat64NaN( a
, a STATUS_VAR
);
3158 if ( ! aSign
) return a
;
3159 float_raise( float_flag_invalid STATUS_VAR
);
3160 return float64_default_nan
;
3163 if ( ( aExp
| aSig
) == 0 ) return a
;
3164 float_raise( float_flag_invalid STATUS_VAR
);
3165 return float64_default_nan
;
3168 if ( aSig
== 0 ) return float64_zero
;
3169 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
3171 zExp
= ( ( aExp
- 0x3FF )>>1 ) + 0x3FE;
3172 aSig
|= LIT64( 0x0010000000000000 );
3173 zSig
= estimateSqrt32( aExp
, aSig
>>21 );
3174 aSig
<<= 9 - ( aExp
& 1 );
3175 zSig
= estimateDiv128To64( aSig
, 0, zSig
<<32 ) + ( zSig
<<30 );
3176 if ( ( zSig
& 0x1FF ) <= 5 ) {
3177 doubleZSig
= zSig
<<1;
3178 mul64To128( zSig
, zSig
, &term0
, &term1
);
3179 sub128( aSig
, 0, term0
, term1
, &rem0
, &rem1
);
3180 while ( (sbits64
) rem0
< 0 ) {
3183 add128( rem0
, rem1
, zSig
>>63, doubleZSig
| 1, &rem0
, &rem1
);
3185 zSig
|= ( ( rem0
| rem1
) != 0 );
3187 return roundAndPackFloat64( 0, zExp
, zSig STATUS_VAR
);
3191 /*----------------------------------------------------------------------------
3192 | Returns the binary log of the double-precision floating-point value `a'.
3193 | The operation is performed according to the IEC/IEEE Standard for Binary
3194 | Floating-Point Arithmetic.
3195 *----------------------------------------------------------------------------*/
3196 float64
float64_log2( float64 a STATUS_PARAM
)
3200 bits64 aSig
, aSig0
, aSig1
, zSig
, i
;
3202 aSig
= extractFloat64Frac( a
);
3203 aExp
= extractFloat64Exp( a
);
3204 aSign
= extractFloat64Sign( a
);
3207 if ( aSig
== 0 ) return packFloat64( 1, 0x7FF, 0 );
3208 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
3211 float_raise( float_flag_invalid STATUS_VAR
);
3212 return float64_default_nan
;
3214 if ( aExp
== 0x7FF ) {
3215 if ( aSig
) return propagateFloat64NaN( a
, float64_zero STATUS_VAR
);
3220 aSig
|= LIT64( 0x0010000000000000 );
3222 zSig
= (bits64
)aExp
<< 52;
3223 for (i
= 1LL << 51; i
> 0; i
>>= 1) {
3224 mul64To128( aSig
, aSig
, &aSig0
, &aSig1
);
3225 aSig
= ( aSig0
<< 12 ) | ( aSig1
>> 52 );
3226 if ( aSig
& LIT64( 0x0020000000000000 ) ) {
3234 return normalizeRoundAndPackFloat64( zSign
, 0x408, zSig STATUS_VAR
);
3237 /*----------------------------------------------------------------------------
3238 | Returns 1 if the double-precision floating-point value `a' is equal to the
3239 | corresponding value `b', and 0 otherwise. The comparison is performed
3240 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3241 *----------------------------------------------------------------------------*/
3243 int float64_eq( float64 a
, float64 b STATUS_PARAM
)
3247 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3248 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3250 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
3251 float_raise( float_flag_invalid STATUS_VAR
);
3255 av
= float64_val(a
);
3256 bv
= float64_val(b
);
3257 return ( av
== bv
) || ( (bits64
) ( ( av
| bv
)<<1 ) == 0 );
3261 /*----------------------------------------------------------------------------
3262 | Returns 1 if the double-precision floating-point value `a' is less than or
3263 | equal to the corresponding value `b', and 0 otherwise. The comparison is
3264 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3266 *----------------------------------------------------------------------------*/
3268 int float64_le( float64 a
, float64 b STATUS_PARAM
)
3273 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3274 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3276 float_raise( float_flag_invalid STATUS_VAR
);
3279 aSign
= extractFloat64Sign( a
);
3280 bSign
= extractFloat64Sign( b
);
3281 av
= float64_val(a
);
3282 bv
= float64_val(b
);
3283 if ( aSign
!= bSign
) return aSign
|| ( (bits64
) ( ( av
| bv
)<<1 ) == 0 );
3284 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
3288 /*----------------------------------------------------------------------------
3289 | Returns 1 if the double-precision floating-point value `a' is less than
3290 | the corresponding value `b', and 0 otherwise. The comparison is performed
3291 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3292 *----------------------------------------------------------------------------*/
3294 int float64_lt( float64 a
, float64 b STATUS_PARAM
)
3299 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3300 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3302 float_raise( float_flag_invalid STATUS_VAR
);
3305 aSign
= extractFloat64Sign( a
);
3306 bSign
= extractFloat64Sign( b
);
3307 av
= float64_val(a
);
3308 bv
= float64_val(b
);
3309 if ( aSign
!= bSign
) return aSign
&& ( (bits64
) ( ( av
| bv
)<<1 ) != 0 );
3310 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
3314 /*----------------------------------------------------------------------------
3315 | Returns 1 if the double-precision floating-point value `a' is equal to the
3316 | corresponding value `b', and 0 otherwise. The invalid exception is raised
3317 | if either operand is a NaN. Otherwise, the comparison is performed
3318 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3319 *----------------------------------------------------------------------------*/
3321 int float64_eq_signaling( float64 a
, float64 b STATUS_PARAM
)
3325 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3326 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3328 float_raise( float_flag_invalid STATUS_VAR
);
3331 av
= float64_val(a
);
3332 bv
= float64_val(b
);
3333 return ( av
== bv
) || ( (bits64
) ( ( av
| bv
)<<1 ) == 0 );
3337 /*----------------------------------------------------------------------------
3338 | Returns 1 if the double-precision floating-point value `a' is less than or
3339 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3340 | cause an exception. Otherwise, the comparison is performed according to the
3341 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3342 *----------------------------------------------------------------------------*/
3344 int float64_le_quiet( float64 a
, float64 b STATUS_PARAM
)
3349 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3350 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3352 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
3353 float_raise( float_flag_invalid STATUS_VAR
);
3357 aSign
= extractFloat64Sign( a
);
3358 bSign
= extractFloat64Sign( b
);
3359 av
= float64_val(a
);
3360 bv
= float64_val(b
);
3361 if ( aSign
!= bSign
) return aSign
|| ( (bits64
) ( ( av
| bv
)<<1 ) == 0 );
3362 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
3366 /*----------------------------------------------------------------------------
3367 | Returns 1 if the double-precision floating-point value `a' is less than
3368 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3369 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
3370 | Standard for Binary Floating-Point Arithmetic.
3371 *----------------------------------------------------------------------------*/
3373 int float64_lt_quiet( float64 a
, float64 b STATUS_PARAM
)
3378 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3379 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3381 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
3382 float_raise( float_flag_invalid STATUS_VAR
);
3386 aSign
= extractFloat64Sign( a
);
3387 bSign
= extractFloat64Sign( b
);
3388 av
= float64_val(a
);
3389 bv
= float64_val(b
);
3390 if ( aSign
!= bSign
) return aSign
&& ( (bits64
) ( ( av
| bv
)<<1 ) != 0 );
3391 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
3397 /*----------------------------------------------------------------------------
3398 | Returns the result of converting the extended double-precision floating-
3399 | point value `a' to the 32-bit two's complement integer format. The
3400 | conversion is performed according to the IEC/IEEE Standard for Binary
3401 | Floating-Point Arithmetic---which means in particular that the conversion
3402 | is rounded according to the current rounding mode. If `a' is a NaN, the
3403 | largest positive integer is returned. Otherwise, if the conversion
3404 | overflows, the largest integer with the same sign as `a' is returned.
3405 *----------------------------------------------------------------------------*/
3407 int32
floatx80_to_int32( floatx80 a STATUS_PARAM
)
3410 int32 aExp
, shiftCount
;
3413 aSig
= extractFloatx80Frac( a
);
3414 aExp
= extractFloatx80Exp( a
);
3415 aSign
= extractFloatx80Sign( a
);
3416 if ( ( aExp
== 0x7FFF ) && (bits64
) ( aSig
<<1 ) ) aSign
= 0;
3417 shiftCount
= 0x4037 - aExp
;
3418 if ( shiftCount
<= 0 ) shiftCount
= 1;
3419 shift64RightJamming( aSig
, shiftCount
, &aSig
);
3420 return roundAndPackInt32( aSign
, aSig STATUS_VAR
);
3424 /*----------------------------------------------------------------------------
3425 | Returns the result of converting the extended double-precision floating-
3426 | point value `a' to the 32-bit two's complement integer format. The
3427 | conversion is performed according to the IEC/IEEE Standard for Binary
3428 | Floating-Point Arithmetic, except that the conversion is always rounded
3429 | toward zero. If `a' is a NaN, the largest positive integer is returned.
3430 | Otherwise, if the conversion overflows, the largest integer with the same
3431 | sign as `a' is returned.
3432 *----------------------------------------------------------------------------*/
3434 int32
floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM
)
3437 int32 aExp
, shiftCount
;
3438 bits64 aSig
, savedASig
;
3441 aSig
= extractFloatx80Frac( a
);
3442 aExp
= extractFloatx80Exp( a
);
3443 aSign
= extractFloatx80Sign( a
);
3444 if ( 0x401E < aExp
) {
3445 if ( ( aExp
== 0x7FFF ) && (bits64
) ( aSig
<<1 ) ) aSign
= 0;
3448 else if ( aExp
< 0x3FFF ) {
3449 if ( aExp
|| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
3452 shiftCount
= 0x403E - aExp
;
3454 aSig
>>= shiftCount
;
3456 if ( aSign
) z
= - z
;
3457 if ( ( z
< 0 ) ^ aSign
) {
3459 float_raise( float_flag_invalid STATUS_VAR
);
3460 return aSign
? (sbits32
) 0x80000000 : 0x7FFFFFFF;
3462 if ( ( aSig
<<shiftCount
) != savedASig
) {
3463 STATUS(float_exception_flags
) |= float_flag_inexact
;
3469 /*----------------------------------------------------------------------------
3470 | Returns the result of converting the extended double-precision floating-
3471 | point value `a' to the 64-bit two's complement integer format. The
3472 | conversion is performed according to the IEC/IEEE Standard for Binary
3473 | Floating-Point Arithmetic---which means in particular that the conversion
3474 | is rounded according to the current rounding mode. If `a' is a NaN,
3475 | the largest positive integer is returned. Otherwise, if the conversion
3476 | overflows, the largest integer with the same sign as `a' is returned.
3477 *----------------------------------------------------------------------------*/
3479 int64
floatx80_to_int64( floatx80 a STATUS_PARAM
)
3482 int32 aExp
, shiftCount
;
3483 bits64 aSig
, aSigExtra
;
3485 aSig
= extractFloatx80Frac( a
);
3486 aExp
= extractFloatx80Exp( a
);
3487 aSign
= extractFloatx80Sign( a
);
3488 shiftCount
= 0x403E - aExp
;
3489 if ( shiftCount
<= 0 ) {
3491 float_raise( float_flag_invalid STATUS_VAR
);
3493 || ( ( aExp
== 0x7FFF )
3494 && ( aSig
!= LIT64( 0x8000000000000000 ) ) )
3496 return LIT64( 0x7FFFFFFFFFFFFFFF );
3498 return (sbits64
) LIT64( 0x8000000000000000 );
3503 shift64ExtraRightJamming( aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
3505 return roundAndPackInt64( aSign
, aSig
, aSigExtra STATUS_VAR
);
3509 /*----------------------------------------------------------------------------
3510 | Returns the result of converting the extended double-precision floating-
3511 | point value `a' to the 64-bit two's complement integer format. The
3512 | conversion is performed according to the IEC/IEEE Standard for Binary
3513 | Floating-Point Arithmetic, except that the conversion is always rounded
3514 | toward zero. If `a' is a NaN, the largest positive integer is returned.
3515 | Otherwise, if the conversion overflows, the largest integer with the same
3516 | sign as `a' is returned.
3517 *----------------------------------------------------------------------------*/
3519 int64
floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM
)
3522 int32 aExp
, shiftCount
;
3526 aSig
= extractFloatx80Frac( a
);
3527 aExp
= extractFloatx80Exp( a
);
3528 aSign
= extractFloatx80Sign( a
);
3529 shiftCount
= aExp
- 0x403E;
3530 if ( 0 <= shiftCount
) {
3531 aSig
&= LIT64( 0x7FFFFFFFFFFFFFFF );
3532 if ( ( a
.high
!= 0xC03E ) || aSig
) {
3533 float_raise( float_flag_invalid STATUS_VAR
);
3534 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && aSig
) ) {
3535 return LIT64( 0x7FFFFFFFFFFFFFFF );
3538 return (sbits64
) LIT64( 0x8000000000000000 );
3540 else if ( aExp
< 0x3FFF ) {
3541 if ( aExp
| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
3544 z
= aSig
>>( - shiftCount
);
3545 if ( (bits64
) ( aSig
<<( shiftCount
& 63 ) ) ) {
3546 STATUS(float_exception_flags
) |= float_flag_inexact
;
3548 if ( aSign
) z
= - z
;
3553 /*----------------------------------------------------------------------------
3554 | Returns the result of converting the extended double-precision floating-
3555 | point value `a' to the single-precision floating-point format. The
3556 | conversion is performed according to the IEC/IEEE Standard for Binary
3557 | Floating-Point Arithmetic.
3558 *----------------------------------------------------------------------------*/
3560 float32
floatx80_to_float32( floatx80 a STATUS_PARAM
)
3566 aSig
= extractFloatx80Frac( a
);
3567 aExp
= extractFloatx80Exp( a
);
3568 aSign
= extractFloatx80Sign( a
);
3569 if ( aExp
== 0x7FFF ) {
3570 if ( (bits64
) ( aSig
<<1 ) ) {
3571 return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR
) );
3573 return packFloat32( aSign
, 0xFF, 0 );
3575 shift64RightJamming( aSig
, 33, &aSig
);
3576 if ( aExp
|| aSig
) aExp
-= 0x3F81;
3577 return roundAndPackFloat32( aSign
, aExp
, aSig STATUS_VAR
);
3581 /*----------------------------------------------------------------------------
3582 | Returns the result of converting the extended double-precision floating-
3583 | point value `a' to the double-precision floating-point format. The
3584 | conversion is performed according to the IEC/IEEE Standard for Binary
3585 | Floating-Point Arithmetic.
3586 *----------------------------------------------------------------------------*/
3588 float64
floatx80_to_float64( floatx80 a STATUS_PARAM
)
3594 aSig
= extractFloatx80Frac( a
);
3595 aExp
= extractFloatx80Exp( a
);
3596 aSign
= extractFloatx80Sign( a
);
3597 if ( aExp
== 0x7FFF ) {
3598 if ( (bits64
) ( aSig
<<1 ) ) {
3599 return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR
) );
3601 return packFloat64( aSign
, 0x7FF, 0 );
3603 shift64RightJamming( aSig
, 1, &zSig
);
3604 if ( aExp
|| aSig
) aExp
-= 0x3C01;
3605 return roundAndPackFloat64( aSign
, aExp
, zSig STATUS_VAR
);
3611 /*----------------------------------------------------------------------------
3612 | Returns the result of converting the extended double-precision floating-
3613 | point value `a' to the quadruple-precision floating-point format. The
3614 | conversion is performed according to the IEC/IEEE Standard for Binary
3615 | Floating-Point Arithmetic.
3616 *----------------------------------------------------------------------------*/
3618 float128
floatx80_to_float128( floatx80 a STATUS_PARAM
)
3622 bits64 aSig
, zSig0
, zSig1
;
3624 aSig
= extractFloatx80Frac( a
);
3625 aExp
= extractFloatx80Exp( a
);
3626 aSign
= extractFloatx80Sign( a
);
3627 if ( ( aExp
== 0x7FFF ) && (bits64
) ( aSig
<<1 ) ) {
3628 return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR
) );
3630 shift128Right( aSig
<<1, 0, 16, &zSig0
, &zSig1
);
3631 return packFloat128( aSign
, aExp
, zSig0
, zSig1
);
3637 /*----------------------------------------------------------------------------
3638 | Rounds the extended double-precision floating-point value `a' to an integer,
3639 | and returns the result as an extended quadruple-precision floating-point
3640 | value. The operation is performed according to the IEC/IEEE Standard for
3641 | Binary Floating-Point Arithmetic.
3642 *----------------------------------------------------------------------------*/
3644 floatx80
floatx80_round_to_int( floatx80 a STATUS_PARAM
)
3648 bits64 lastBitMask
, roundBitsMask
;
3652 aExp
= extractFloatx80Exp( a
);
3653 if ( 0x403E <= aExp
) {
3654 if ( ( aExp
== 0x7FFF ) && (bits64
) ( extractFloatx80Frac( a
)<<1 ) ) {
3655 return propagateFloatx80NaN( a
, a STATUS_VAR
);
3659 if ( aExp
< 0x3FFF ) {
3661 && ( (bits64
) ( extractFloatx80Frac( a
)<<1 ) == 0 ) ) {
3664 STATUS(float_exception_flags
) |= float_flag_inexact
;
3665 aSign
= extractFloatx80Sign( a
);
3666 switch ( STATUS(float_rounding_mode
) ) {
3667 case float_round_nearest_even
:
3668 if ( ( aExp
== 0x3FFE ) && (bits64
) ( extractFloatx80Frac( a
)<<1 )
3671 packFloatx80( aSign
, 0x3FFF, LIT64( 0x8000000000000000 ) );
3674 case float_round_down
:
3677 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3678 : packFloatx80( 0, 0, 0 );
3679 case float_round_up
:
3681 aSign
? packFloatx80( 1, 0, 0 )
3682 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3684 return packFloatx80( aSign
, 0, 0 );
3687 lastBitMask
<<= 0x403E - aExp
;
3688 roundBitsMask
= lastBitMask
- 1;
3690 roundingMode
= STATUS(float_rounding_mode
);
3691 if ( roundingMode
== float_round_nearest_even
) {
3692 z
.low
+= lastBitMask
>>1;
3693 if ( ( z
.low
& roundBitsMask
) == 0 ) z
.low
&= ~ lastBitMask
;
3695 else if ( roundingMode
!= float_round_to_zero
) {
3696 if ( extractFloatx80Sign( z
) ^ ( roundingMode
== float_round_up
) ) {
3697 z
.low
+= roundBitsMask
;
3700 z
.low
&= ~ roundBitsMask
;
3703 z
.low
= LIT64( 0x8000000000000000 );
3705 if ( z
.low
!= a
.low
) STATUS(float_exception_flags
) |= float_flag_inexact
;
3710 /*----------------------------------------------------------------------------
3711 | Returns the result of adding the absolute values of the extended double-
3712 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
3713 | negated before being returned. `zSign' is ignored if the result is a NaN.
3714 | The addition is performed according to the IEC/IEEE Standard for Binary
3715 | Floating-Point Arithmetic.
3716 *----------------------------------------------------------------------------*/
3718 static floatx80
addFloatx80Sigs( floatx80 a
, floatx80 b
, flag zSign STATUS_PARAM
)
3720 int32 aExp
, bExp
, zExp
;
3721 bits64 aSig
, bSig
, zSig0
, zSig1
;
3724 aSig
= extractFloatx80Frac( a
);
3725 aExp
= extractFloatx80Exp( a
);
3726 bSig
= extractFloatx80Frac( b
);
3727 bExp
= extractFloatx80Exp( b
);
3728 expDiff
= aExp
- bExp
;
3729 if ( 0 < expDiff
) {
3730 if ( aExp
== 0x7FFF ) {
3731 if ( (bits64
) ( aSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
3734 if ( bExp
== 0 ) --expDiff
;
3735 shift64ExtraRightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
3738 else if ( expDiff
< 0 ) {
3739 if ( bExp
== 0x7FFF ) {
3740 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
3741 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
3743 if ( aExp
== 0 ) ++expDiff
;
3744 shift64ExtraRightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
3748 if ( aExp
== 0x7FFF ) {
3749 if ( (bits64
) ( ( aSig
| bSig
)<<1 ) ) {
3750 return propagateFloatx80NaN( a
, b STATUS_VAR
);
3755 zSig0
= aSig
+ bSig
;
3757 normalizeFloatx80Subnormal( zSig0
, &zExp
, &zSig0
);
3763 zSig0
= aSig
+ bSig
;
3764 if ( (sbits64
) zSig0
< 0 ) goto roundAndPack
;
3766 shift64ExtraRightJamming( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
3767 zSig0
|= LIT64( 0x8000000000000000 );
3771 roundAndPackFloatx80(
3772 STATUS(floatx80_rounding_precision
), zSign
, zExp
, zSig0
, zSig1 STATUS_VAR
);
3776 /*----------------------------------------------------------------------------
3777 | Returns the result of subtracting the absolute values of the extended
3778 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
3779 | difference is negated before being returned. `zSign' is ignored if the
3780 | result is a NaN. The subtraction is performed according to the IEC/IEEE
3781 | Standard for Binary Floating-Point Arithmetic.
3782 *----------------------------------------------------------------------------*/
3784 static floatx80
subFloatx80Sigs( floatx80 a
, floatx80 b
, flag zSign STATUS_PARAM
)
3786 int32 aExp
, bExp
, zExp
;
3787 bits64 aSig
, bSig
, zSig0
, zSig1
;
3791 aSig
= extractFloatx80Frac( a
);
3792 aExp
= extractFloatx80Exp( a
);
3793 bSig
= extractFloatx80Frac( b
);
3794 bExp
= extractFloatx80Exp( b
);
3795 expDiff
= aExp
- bExp
;
3796 if ( 0 < expDiff
) goto aExpBigger
;
3797 if ( expDiff
< 0 ) goto bExpBigger
;
3798 if ( aExp
== 0x7FFF ) {
3799 if ( (bits64
) ( ( aSig
| bSig
)<<1 ) ) {
3800 return propagateFloatx80NaN( a
, b STATUS_VAR
);
3802 float_raise( float_flag_invalid STATUS_VAR
);
3803 z
.low
= floatx80_default_nan_low
;
3804 z
.high
= floatx80_default_nan_high
;
3812 if ( bSig
< aSig
) goto aBigger
;
3813 if ( aSig
< bSig
) goto bBigger
;
3814 return packFloatx80( STATUS(float_rounding_mode
) == float_round_down
, 0, 0 );
3816 if ( bExp
== 0x7FFF ) {
3817 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
3818 return packFloatx80( zSign
^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3820 if ( aExp
== 0 ) ++expDiff
;
3821 shift128RightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
3823 sub128( bSig
, 0, aSig
, zSig1
, &zSig0
, &zSig1
);
3826 goto normalizeRoundAndPack
;
3828 if ( aExp
== 0x7FFF ) {
3829 if ( (bits64
) ( aSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
3832 if ( bExp
== 0 ) --expDiff
;
3833 shift128RightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
3835 sub128( aSig
, 0, bSig
, zSig1
, &zSig0
, &zSig1
);
3837 normalizeRoundAndPack
:
3839 normalizeRoundAndPackFloatx80(
3840 STATUS(floatx80_rounding_precision
), zSign
, zExp
, zSig0
, zSig1 STATUS_VAR
);
3844 /*----------------------------------------------------------------------------
3845 | Returns the result of adding the extended double-precision floating-point
3846 | values `a' and `b'. The operation is performed according to the IEC/IEEE
3847 | Standard for Binary Floating-Point Arithmetic.
3848 *----------------------------------------------------------------------------*/
3850 floatx80
floatx80_add( floatx80 a
, floatx80 b STATUS_PARAM
)
3854 aSign
= extractFloatx80Sign( a
);
3855 bSign
= extractFloatx80Sign( b
);
3856 if ( aSign
== bSign
) {
3857 return addFloatx80Sigs( a
, b
, aSign STATUS_VAR
);
3860 return subFloatx80Sigs( a
, b
, aSign STATUS_VAR
);
3865 /*----------------------------------------------------------------------------
3866 | Returns the result of subtracting the extended double-precision floating-
3867 | point values `a' and `b'. The operation is performed according to the
3868 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3869 *----------------------------------------------------------------------------*/
3871 floatx80
floatx80_sub( floatx80 a
, floatx80 b STATUS_PARAM
)
3875 aSign
= extractFloatx80Sign( a
);
3876 bSign
= extractFloatx80Sign( b
);
3877 if ( aSign
== bSign
) {
3878 return subFloatx80Sigs( a
, b
, aSign STATUS_VAR
);
3881 return addFloatx80Sigs( a
, b
, aSign STATUS_VAR
);
3886 /*----------------------------------------------------------------------------
3887 | Returns the result of multiplying the extended double-precision floating-
3888 | point values `a' and `b'. The operation is performed according to the
3889 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3890 *----------------------------------------------------------------------------*/
3892 floatx80
floatx80_mul( floatx80 a
, floatx80 b STATUS_PARAM
)
3894 flag aSign
, bSign
, zSign
;
3895 int32 aExp
, bExp
, zExp
;
3896 bits64 aSig
, bSig
, zSig0
, zSig1
;
3899 aSig
= extractFloatx80Frac( a
);
3900 aExp
= extractFloatx80Exp( a
);
3901 aSign
= extractFloatx80Sign( a
);
3902 bSig
= extractFloatx80Frac( b
);
3903 bExp
= extractFloatx80Exp( b
);
3904 bSign
= extractFloatx80Sign( b
);
3905 zSign
= aSign
^ bSign
;
3906 if ( aExp
== 0x7FFF ) {
3907 if ( (bits64
) ( aSig
<<1 )
3908 || ( ( bExp
== 0x7FFF ) && (bits64
) ( bSig
<<1 ) ) ) {
3909 return propagateFloatx80NaN( a
, b STATUS_VAR
);
3911 if ( ( bExp
| bSig
) == 0 ) goto invalid
;
3912 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
3914 if ( bExp
== 0x7FFF ) {
3915 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
3916 if ( ( aExp
| aSig
) == 0 ) {
3918 float_raise( float_flag_invalid STATUS_VAR
);
3919 z
.low
= floatx80_default_nan_low
;
3920 z
.high
= floatx80_default_nan_high
;
3923 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
3926 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
3927 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
3930 if ( bSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
3931 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
3933 zExp
= aExp
+ bExp
- 0x3FFE;
3934 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
3935 if ( 0 < (sbits64
) zSig0
) {
3936 shortShift128Left( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
3940 roundAndPackFloatx80(
3941 STATUS(floatx80_rounding_precision
), zSign
, zExp
, zSig0
, zSig1 STATUS_VAR
);
3945 /*----------------------------------------------------------------------------
3946 | Returns the result of dividing the extended double-precision floating-point
3947 | value `a' by the corresponding value `b'. The operation is performed
3948 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3949 *----------------------------------------------------------------------------*/
3951 floatx80
floatx80_div( floatx80 a
, floatx80 b STATUS_PARAM
)
3953 flag aSign
, bSign
, zSign
;
3954 int32 aExp
, bExp
, zExp
;
3955 bits64 aSig
, bSig
, zSig0
, zSig1
;
3956 bits64 rem0
, rem1
, rem2
, term0
, term1
, term2
;
3959 aSig
= extractFloatx80Frac( a
);
3960 aExp
= extractFloatx80Exp( a
);
3961 aSign
= extractFloatx80Sign( a
);
3962 bSig
= extractFloatx80Frac( b
);
3963 bExp
= extractFloatx80Exp( b
);
3964 bSign
= extractFloatx80Sign( b
);
3965 zSign
= aSign
^ bSign
;
3966 if ( aExp
== 0x7FFF ) {
3967 if ( (bits64
) ( aSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
3968 if ( bExp
== 0x7FFF ) {
3969 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
3972 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
3974 if ( bExp
== 0x7FFF ) {
3975 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
3976 return packFloatx80( zSign
, 0, 0 );
3980 if ( ( aExp
| aSig
) == 0 ) {
3982 float_raise( float_flag_invalid STATUS_VAR
);
3983 z
.low
= floatx80_default_nan_low
;
3984 z
.high
= floatx80_default_nan_high
;
3987 float_raise( float_flag_divbyzero STATUS_VAR
);
3988 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
3990 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
3993 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
3994 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
3996 zExp
= aExp
- bExp
+ 0x3FFE;
3998 if ( bSig
<= aSig
) {
3999 shift128Right( aSig
, 0, 1, &aSig
, &rem1
);
4002 zSig0
= estimateDiv128To64( aSig
, rem1
, bSig
);
4003 mul64To128( bSig
, zSig0
, &term0
, &term1
);
4004 sub128( aSig
, rem1
, term0
, term1
, &rem0
, &rem1
);
4005 while ( (sbits64
) rem0
< 0 ) {
4007 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
4009 zSig1
= estimateDiv128To64( rem1
, 0, bSig
);
4010 if ( (bits64
) ( zSig1
<<1 ) <= 8 ) {
4011 mul64To128( bSig
, zSig1
, &term1
, &term2
);
4012 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
4013 while ( (sbits64
) rem1
< 0 ) {
4015 add128( rem1
, rem2
, 0, bSig
, &rem1
, &rem2
);
4017 zSig1
|= ( ( rem1
| rem2
) != 0 );
4020 roundAndPackFloatx80(
4021 STATUS(floatx80_rounding_precision
), zSign
, zExp
, zSig0
, zSig1 STATUS_VAR
);
4025 /*----------------------------------------------------------------------------
4026 | Returns the remainder of the extended double-precision floating-point value
4027 | `a' with respect to the corresponding value `b'. The operation is performed
4028 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4029 *----------------------------------------------------------------------------*/
4031 floatx80
floatx80_rem( floatx80 a
, floatx80 b STATUS_PARAM
)
4034 int32 aExp
, bExp
, expDiff
;
4035 bits64 aSig0
, aSig1
, bSig
;
4036 bits64 q
, term0
, term1
, alternateASig0
, alternateASig1
;
4039 aSig0
= extractFloatx80Frac( a
);
4040 aExp
= extractFloatx80Exp( a
);
4041 aSign
= extractFloatx80Sign( a
);
4042 bSig
= extractFloatx80Frac( b
);
4043 bExp
= extractFloatx80Exp( b
);
4044 if ( aExp
== 0x7FFF ) {
4045 if ( (bits64
) ( aSig0
<<1 )
4046 || ( ( bExp
== 0x7FFF ) && (bits64
) ( bSig
<<1 ) ) ) {
4047 return propagateFloatx80NaN( a
, b STATUS_VAR
);
4051 if ( bExp
== 0x7FFF ) {
4052 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
4058 float_raise( float_flag_invalid STATUS_VAR
);
4059 z
.low
= floatx80_default_nan_low
;
4060 z
.high
= floatx80_default_nan_high
;
4063 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
4066 if ( (bits64
) ( aSig0
<<1 ) == 0 ) return a
;
4067 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
4069 bSig
|= LIT64( 0x8000000000000000 );
4071 expDiff
= aExp
- bExp
;
4073 if ( expDiff
< 0 ) {
4074 if ( expDiff
< -1 ) return a
;
4075 shift128Right( aSig0
, 0, 1, &aSig0
, &aSig1
);
4078 q
= ( bSig
<= aSig0
);
4079 if ( q
) aSig0
-= bSig
;
4081 while ( 0 < expDiff
) {
4082 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
4083 q
= ( 2 < q
) ? q
- 2 : 0;
4084 mul64To128( bSig
, q
, &term0
, &term1
);
4085 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
4086 shortShift128Left( aSig0
, aSig1
, 62, &aSig0
, &aSig1
);
4090 if ( 0 < expDiff
) {
4091 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
4092 q
= ( 2 < q
) ? q
- 2 : 0;
4094 mul64To128( bSig
, q
<<( 64 - expDiff
), &term0
, &term1
);
4095 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
4096 shortShift128Left( 0, bSig
, 64 - expDiff
, &term0
, &term1
);
4097 while ( le128( term0
, term1
, aSig0
, aSig1
) ) {
4099 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
4106 sub128( term0
, term1
, aSig0
, aSig1
, &alternateASig0
, &alternateASig1
);
4107 if ( lt128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
4108 || ( eq128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
4111 aSig0
= alternateASig0
;
4112 aSig1
= alternateASig1
;
4116 normalizeRoundAndPackFloatx80(
4117 80, zSign
, bExp
+ expDiff
, aSig0
, aSig1 STATUS_VAR
);
4121 /*----------------------------------------------------------------------------
4122 | Returns the square root of the extended double-precision floating-point
4123 | value `a'. The operation is performed according to the IEC/IEEE Standard
4124 | for Binary Floating-Point Arithmetic.
4125 *----------------------------------------------------------------------------*/
4127 floatx80
floatx80_sqrt( floatx80 a STATUS_PARAM
)
4131 bits64 aSig0
, aSig1
, zSig0
, zSig1
, doubleZSig0
;
4132 bits64 rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
4135 aSig0
= extractFloatx80Frac( a
);
4136 aExp
= extractFloatx80Exp( a
);
4137 aSign
= extractFloatx80Sign( a
);
4138 if ( aExp
== 0x7FFF ) {
4139 if ( (bits64
) ( aSig0
<<1 ) ) return propagateFloatx80NaN( a
, a STATUS_VAR
);
4140 if ( ! aSign
) return a
;
4144 if ( ( aExp
| aSig0
) == 0 ) return a
;
4146 float_raise( float_flag_invalid STATUS_VAR
);
4147 z
.low
= floatx80_default_nan_low
;
4148 z
.high
= floatx80_default_nan_high
;
4152 if ( aSig0
== 0 ) return packFloatx80( 0, 0, 0 );
4153 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
4155 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFF;
4156 zSig0
= estimateSqrt32( aExp
, aSig0
>>32 );
4157 shift128Right( aSig0
, 0, 2 + ( aExp
& 1 ), &aSig0
, &aSig1
);
4158 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
4159 doubleZSig0
= zSig0
<<1;
4160 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
4161 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
4162 while ( (sbits64
) rem0
< 0 ) {
4165 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
4167 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
4168 if ( ( zSig1
& LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4169 if ( zSig1
== 0 ) zSig1
= 1;
4170 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
4171 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
4172 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
4173 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
4174 while ( (sbits64
) rem1
< 0 ) {
4176 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
4178 term2
|= doubleZSig0
;
4179 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
4181 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
4183 shortShift128Left( 0, zSig1
, 1, &zSig0
, &zSig1
);
4184 zSig0
|= doubleZSig0
;
4186 roundAndPackFloatx80(
4187 STATUS(floatx80_rounding_precision
), 0, zExp
, zSig0
, zSig1 STATUS_VAR
);
4191 /*----------------------------------------------------------------------------
4192 | Returns 1 if the extended double-precision floating-point value `a' is
4193 | equal to the corresponding value `b', and 0 otherwise. The comparison is
4194 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4196 *----------------------------------------------------------------------------*/
4198 int floatx80_eq( floatx80 a
, floatx80 b STATUS_PARAM
)
4201 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4202 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
4203 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4204 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
4206 if ( floatx80_is_signaling_nan( a
)
4207 || floatx80_is_signaling_nan( b
) ) {
4208 float_raise( float_flag_invalid STATUS_VAR
);
4214 && ( ( a
.high
== b
.high
)
4216 && ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
4221 /*----------------------------------------------------------------------------
4222 | Returns 1 if the extended double-precision floating-point value `a' is
4223 | less than or equal to the corresponding value `b', and 0 otherwise. The
4224 | comparison is performed according to the IEC/IEEE Standard for Binary
4225 | Floating-Point Arithmetic.
4226 *----------------------------------------------------------------------------*/
4228 int floatx80_le( floatx80 a
, floatx80 b STATUS_PARAM
)
4232 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4233 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
4234 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4235 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
4237 float_raise( float_flag_invalid STATUS_VAR
);
4240 aSign
= extractFloatx80Sign( a
);
4241 bSign
= extractFloatx80Sign( b
);
4242 if ( aSign
!= bSign
) {
4245 || ( ( ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
4249 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
4250 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
4254 /*----------------------------------------------------------------------------
4255 | Returns 1 if the extended double-precision floating-point value `a' is
4256 | less than the corresponding value `b', and 0 otherwise. The comparison
4257 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4259 *----------------------------------------------------------------------------*/
4261 int floatx80_lt( floatx80 a
, floatx80 b STATUS_PARAM
)
4265 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4266 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
4267 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4268 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
4270 float_raise( float_flag_invalid STATUS_VAR
);
4273 aSign
= extractFloatx80Sign( a
);
4274 bSign
= extractFloatx80Sign( b
);
4275 if ( aSign
!= bSign
) {
4278 && ( ( ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
4282 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
4283 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
4287 /*----------------------------------------------------------------------------
4288 | Returns 1 if the extended double-precision floating-point value `a' is equal
4289 | to the corresponding value `b', and 0 otherwise. The invalid exception is
4290 | raised if either operand is a NaN. Otherwise, the comparison is performed
4291 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4292 *----------------------------------------------------------------------------*/
4294 int floatx80_eq_signaling( floatx80 a
, floatx80 b STATUS_PARAM
)
4297 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4298 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
4299 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4300 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
4302 float_raise( float_flag_invalid STATUS_VAR
);
4307 && ( ( a
.high
== b
.high
)
4309 && ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
4314 /*----------------------------------------------------------------------------
4315 | Returns 1 if the extended double-precision floating-point value `a' is less
4316 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
4317 | do not cause an exception. Otherwise, the comparison is performed according
4318 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4319 *----------------------------------------------------------------------------*/
4321 int floatx80_le_quiet( floatx80 a
, floatx80 b STATUS_PARAM
)
4325 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4326 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
4327 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4328 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
4330 if ( floatx80_is_signaling_nan( a
)
4331 || floatx80_is_signaling_nan( b
) ) {
4332 float_raise( float_flag_invalid STATUS_VAR
);
4336 aSign
= extractFloatx80Sign( a
);
4337 bSign
= extractFloatx80Sign( b
);
4338 if ( aSign
!= bSign
) {
4341 || ( ( ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
4345 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
4346 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
4350 /*----------------------------------------------------------------------------
4351 | Returns 1 if the extended double-precision floating-point value `a' is less
4352 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
4353 | an exception. Otherwise, the comparison is performed according to the
4354 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4355 *----------------------------------------------------------------------------*/
4357 int floatx80_lt_quiet( floatx80 a
, floatx80 b STATUS_PARAM
)
4361 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4362 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
4363 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4364 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
4366 if ( floatx80_is_signaling_nan( a
)
4367 || floatx80_is_signaling_nan( b
) ) {
4368 float_raise( float_flag_invalid STATUS_VAR
);
4372 aSign
= extractFloatx80Sign( a
);
4373 bSign
= extractFloatx80Sign( b
);
4374 if ( aSign
!= bSign
) {
4377 && ( ( ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
4381 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
4382 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
4390 /*----------------------------------------------------------------------------
4391 | Returns the result of converting the quadruple-precision floating-point
4392 | value `a' to the 32-bit two's complement integer format. The conversion
4393 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4394 | Arithmetic---which means in particular that the conversion is rounded
4395 | according to the current rounding mode. If `a' is a NaN, the largest
4396 | positive integer is returned. Otherwise, if the conversion overflows, the
4397 | largest integer with the same sign as `a' is returned.
4398 *----------------------------------------------------------------------------*/
4400 int32
float128_to_int32( float128 a STATUS_PARAM
)
4403 int32 aExp
, shiftCount
;
4404 bits64 aSig0
, aSig1
;
4406 aSig1
= extractFloat128Frac1( a
);
4407 aSig0
= extractFloat128Frac0( a
);
4408 aExp
= extractFloat128Exp( a
);
4409 aSign
= extractFloat128Sign( a
);
4410 if ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) aSign
= 0;
4411 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
4412 aSig0
|= ( aSig1
!= 0 );
4413 shiftCount
= 0x4028 - aExp
;
4414 if ( 0 < shiftCount
) shift64RightJamming( aSig0
, shiftCount
, &aSig0
);
4415 return roundAndPackInt32( aSign
, aSig0 STATUS_VAR
);
4419 /*----------------------------------------------------------------------------
4420 | Returns the result of converting the quadruple-precision floating-point
4421 | value `a' to the 32-bit two's complement integer format. The conversion
4422 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4423 | Arithmetic, except that the conversion is always rounded toward zero. If
4424 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
4425 | conversion overflows, the largest integer with the same sign as `a' is
4427 *----------------------------------------------------------------------------*/
4429 int32
float128_to_int32_round_to_zero( float128 a STATUS_PARAM
)
4432 int32 aExp
, shiftCount
;
4433 bits64 aSig0
, aSig1
, savedASig
;
4436 aSig1
= extractFloat128Frac1( a
);
4437 aSig0
= extractFloat128Frac0( a
);
4438 aExp
= extractFloat128Exp( a
);
4439 aSign
= extractFloat128Sign( a
);
4440 aSig0
|= ( aSig1
!= 0 );
4441 if ( 0x401E < aExp
) {
4442 if ( ( aExp
== 0x7FFF ) && aSig0
) aSign
= 0;
4445 else if ( aExp
< 0x3FFF ) {
4446 if ( aExp
|| aSig0
) STATUS(float_exception_flags
) |= float_flag_inexact
;
4449 aSig0
|= LIT64( 0x0001000000000000 );
4450 shiftCount
= 0x402F - aExp
;
4452 aSig0
>>= shiftCount
;
4454 if ( aSign
) z
= - z
;
4455 if ( ( z
< 0 ) ^ aSign
) {
4457 float_raise( float_flag_invalid STATUS_VAR
);
4458 return aSign
? (sbits32
) 0x80000000 : 0x7FFFFFFF;
4460 if ( ( aSig0
<<shiftCount
) != savedASig
) {
4461 STATUS(float_exception_flags
) |= float_flag_inexact
;
4467 /*----------------------------------------------------------------------------
4468 | Returns the result of converting the quadruple-precision floating-point
4469 | value `a' to the 64-bit two's complement integer format. The conversion
4470 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4471 | Arithmetic---which means in particular that the conversion is rounded
4472 | according to the current rounding mode. If `a' is a NaN, the largest
4473 | positive integer is returned. Otherwise, if the conversion overflows, the
4474 | largest integer with the same sign as `a' is returned.
4475 *----------------------------------------------------------------------------*/
4477 int64
float128_to_int64( float128 a STATUS_PARAM
)
4480 int32 aExp
, shiftCount
;
4481 bits64 aSig0
, aSig1
;
4483 aSig1
= extractFloat128Frac1( a
);
4484 aSig0
= extractFloat128Frac0( a
);
4485 aExp
= extractFloat128Exp( a
);
4486 aSign
= extractFloat128Sign( a
);
4487 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
4488 shiftCount
= 0x402F - aExp
;
4489 if ( shiftCount
<= 0 ) {
4490 if ( 0x403E < aExp
) {
4491 float_raise( float_flag_invalid STATUS_VAR
);
4493 || ( ( aExp
== 0x7FFF )
4494 && ( aSig1
|| ( aSig0
!= LIT64( 0x0001000000000000 ) ) )
4497 return LIT64( 0x7FFFFFFFFFFFFFFF );
4499 return (sbits64
) LIT64( 0x8000000000000000 );
4501 shortShift128Left( aSig0
, aSig1
, - shiftCount
, &aSig0
, &aSig1
);
4504 shift64ExtraRightJamming( aSig0
, aSig1
, shiftCount
, &aSig0
, &aSig1
);
4506 return roundAndPackInt64( aSign
, aSig0
, aSig1 STATUS_VAR
);
4510 /*----------------------------------------------------------------------------
4511 | Returns the result of converting the quadruple-precision floating-point
4512 | value `a' to the 64-bit two's complement integer format. The conversion
4513 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4514 | Arithmetic, except that the conversion is always rounded toward zero.
4515 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
4516 | the conversion overflows, the largest integer with the same sign as `a' is
4518 *----------------------------------------------------------------------------*/
4520 int64
float128_to_int64_round_to_zero( float128 a STATUS_PARAM
)
4523 int32 aExp
, shiftCount
;
4524 bits64 aSig0
, aSig1
;
4527 aSig1
= extractFloat128Frac1( a
);
4528 aSig0
= extractFloat128Frac0( a
);
4529 aExp
= extractFloat128Exp( a
);
4530 aSign
= extractFloat128Sign( a
);
4531 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
4532 shiftCount
= aExp
- 0x402F;
4533 if ( 0 < shiftCount
) {
4534 if ( 0x403E <= aExp
) {
4535 aSig0
&= LIT64( 0x0000FFFFFFFFFFFF );
4536 if ( ( a
.high
== LIT64( 0xC03E000000000000 ) )
4537 && ( aSig1
< LIT64( 0x0002000000000000 ) ) ) {
4538 if ( aSig1
) STATUS(float_exception_flags
) |= float_flag_inexact
;
4541 float_raise( float_flag_invalid STATUS_VAR
);
4542 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) ) {
4543 return LIT64( 0x7FFFFFFFFFFFFFFF );
4546 return (sbits64
) LIT64( 0x8000000000000000 );
4548 z
= ( aSig0
<<shiftCount
) | ( aSig1
>>( ( - shiftCount
) & 63 ) );
4549 if ( (bits64
) ( aSig1
<<shiftCount
) ) {
4550 STATUS(float_exception_flags
) |= float_flag_inexact
;
4554 if ( aExp
< 0x3FFF ) {
4555 if ( aExp
| aSig0
| aSig1
) {
4556 STATUS(float_exception_flags
) |= float_flag_inexact
;
4560 z
= aSig0
>>( - shiftCount
);
4562 || ( shiftCount
&& (bits64
) ( aSig0
<<( shiftCount
& 63 ) ) ) ) {
4563 STATUS(float_exception_flags
) |= float_flag_inexact
;
4566 if ( aSign
) z
= - z
;
4571 /*----------------------------------------------------------------------------
4572 | Returns the result of converting the quadruple-precision floating-point
4573 | value `a' to the single-precision floating-point format. The conversion
4574 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4576 *----------------------------------------------------------------------------*/
4578 float32
float128_to_float32( float128 a STATUS_PARAM
)
4582 bits64 aSig0
, aSig1
;
4585 aSig1
= extractFloat128Frac1( a
);
4586 aSig0
= extractFloat128Frac0( a
);
4587 aExp
= extractFloat128Exp( a
);
4588 aSign
= extractFloat128Sign( a
);
4589 if ( aExp
== 0x7FFF ) {
4590 if ( aSig0
| aSig1
) {
4591 return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR
) );
4593 return packFloat32( aSign
, 0xFF, 0 );
4595 aSig0
|= ( aSig1
!= 0 );
4596 shift64RightJamming( aSig0
, 18, &aSig0
);
4598 if ( aExp
|| zSig
) {
4602 return roundAndPackFloat32( aSign
, aExp
, zSig STATUS_VAR
);
4606 /*----------------------------------------------------------------------------
4607 | Returns the result of converting the quadruple-precision floating-point
4608 | value `a' to the double-precision floating-point format. The conversion
4609 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4611 *----------------------------------------------------------------------------*/
4613 float64
float128_to_float64( float128 a STATUS_PARAM
)
4617 bits64 aSig0
, aSig1
;
4619 aSig1
= extractFloat128Frac1( a
);
4620 aSig0
= extractFloat128Frac0( a
);
4621 aExp
= extractFloat128Exp( a
);
4622 aSign
= extractFloat128Sign( a
);
4623 if ( aExp
== 0x7FFF ) {
4624 if ( aSig0
| aSig1
) {
4625 return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR
) );
4627 return packFloat64( aSign
, 0x7FF, 0 );
4629 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
4630 aSig0
|= ( aSig1
!= 0 );
4631 if ( aExp
|| aSig0
) {
4632 aSig0
|= LIT64( 0x4000000000000000 );
4635 return roundAndPackFloat64( aSign
, aExp
, aSig0 STATUS_VAR
);
4641 /*----------------------------------------------------------------------------
4642 | Returns the result of converting the quadruple-precision floating-point
4643 | value `a' to the extended double-precision floating-point format. The
4644 | conversion is performed according to the IEC/IEEE Standard for Binary
4645 | Floating-Point Arithmetic.
4646 *----------------------------------------------------------------------------*/
4648 floatx80
float128_to_floatx80( float128 a STATUS_PARAM
)
4652 bits64 aSig0
, aSig1
;
4654 aSig1
= extractFloat128Frac1( a
);
4655 aSig0
= extractFloat128Frac0( a
);
4656 aExp
= extractFloat128Exp( a
);
4657 aSign
= extractFloat128Sign( a
);
4658 if ( aExp
== 0x7FFF ) {
4659 if ( aSig0
| aSig1
) {
4660 return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR
) );
4662 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
4665 if ( ( aSig0
| aSig1
) == 0 ) return packFloatx80( aSign
, 0, 0 );
4666 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
4669 aSig0
|= LIT64( 0x0001000000000000 );
4671 shortShift128Left( aSig0
, aSig1
, 15, &aSig0
, &aSig1
);
4672 return roundAndPackFloatx80( 80, aSign
, aExp
, aSig0
, aSig1 STATUS_VAR
);
4678 /*----------------------------------------------------------------------------
4679 | Rounds the quadruple-precision floating-point value `a' to an integer, and
4680 | returns the result as a quadruple-precision floating-point value. The
4681 | operation is performed according to the IEC/IEEE Standard for Binary
4682 | Floating-Point Arithmetic.
4683 *----------------------------------------------------------------------------*/
4685 float128
float128_round_to_int( float128 a STATUS_PARAM
)
4689 bits64 lastBitMask
, roundBitsMask
;
4693 aExp
= extractFloat128Exp( a
);
4694 if ( 0x402F <= aExp
) {
4695 if ( 0x406F <= aExp
) {
4696 if ( ( aExp
== 0x7FFF )
4697 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) )
4699 return propagateFloat128NaN( a
, a STATUS_VAR
);
4704 lastBitMask
= ( lastBitMask
<<( 0x406E - aExp
) )<<1;
4705 roundBitsMask
= lastBitMask
- 1;
4707 roundingMode
= STATUS(float_rounding_mode
);
4708 if ( roundingMode
== float_round_nearest_even
) {
4709 if ( lastBitMask
) {
4710 add128( z
.high
, z
.low
, 0, lastBitMask
>>1, &z
.high
, &z
.low
);
4711 if ( ( z
.low
& roundBitsMask
) == 0 ) z
.low
&= ~ lastBitMask
;
4714 if ( (sbits64
) z
.low
< 0 ) {
4716 if ( (bits64
) ( z
.low
<<1 ) == 0 ) z
.high
&= ~1;
4720 else if ( roundingMode
!= float_round_to_zero
) {
4721 if ( extractFloat128Sign( z
)
4722 ^ ( roundingMode
== float_round_up
) ) {
4723 add128( z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
4726 z
.low
&= ~ roundBitsMask
;
4729 if ( aExp
< 0x3FFF ) {
4730 if ( ( ( (bits64
) ( a
.high
<<1 ) ) | a
.low
) == 0 ) return a
;
4731 STATUS(float_exception_flags
) |= float_flag_inexact
;
4732 aSign
= extractFloat128Sign( a
);
4733 switch ( STATUS(float_rounding_mode
) ) {
4734 case float_round_nearest_even
:
4735 if ( ( aExp
== 0x3FFE )
4736 && ( extractFloat128Frac0( a
)
4737 | extractFloat128Frac1( a
) )
4739 return packFloat128( aSign
, 0x3FFF, 0, 0 );
4742 case float_round_down
:
4744 aSign
? packFloat128( 1, 0x3FFF, 0, 0 )
4745 : packFloat128( 0, 0, 0, 0 );
4746 case float_round_up
:
4748 aSign
? packFloat128( 1, 0, 0, 0 )
4749 : packFloat128( 0, 0x3FFF, 0, 0 );
4751 return packFloat128( aSign
, 0, 0, 0 );
4754 lastBitMask
<<= 0x402F - aExp
;
4755 roundBitsMask
= lastBitMask
- 1;
4758 roundingMode
= STATUS(float_rounding_mode
);
4759 if ( roundingMode
== float_round_nearest_even
) {
4760 z
.high
+= lastBitMask
>>1;
4761 if ( ( ( z
.high
& roundBitsMask
) | a
.low
) == 0 ) {
4762 z
.high
&= ~ lastBitMask
;
4765 else if ( roundingMode
!= float_round_to_zero
) {
4766 if ( extractFloat128Sign( z
)
4767 ^ ( roundingMode
== float_round_up
) ) {
4768 z
.high
|= ( a
.low
!= 0 );
4769 z
.high
+= roundBitsMask
;
4772 z
.high
&= ~ roundBitsMask
;
4774 if ( ( z
.low
!= a
.low
) || ( z
.high
!= a
.high
) ) {
4775 STATUS(float_exception_flags
) |= float_flag_inexact
;
4781 /*----------------------------------------------------------------------------
4782 | Returns the result of adding the absolute values of the quadruple-precision
4783 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
4784 | before being returned. `zSign' is ignored if the result is a NaN.
4785 | The addition is performed according to the IEC/IEEE Standard for Binary
4786 | Floating-Point Arithmetic.
4787 *----------------------------------------------------------------------------*/
4789 static float128
addFloat128Sigs( float128 a
, float128 b
, flag zSign STATUS_PARAM
)
4791 int32 aExp
, bExp
, zExp
;
4792 bits64 aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
4795 aSig1
= extractFloat128Frac1( a
);
4796 aSig0
= extractFloat128Frac0( a
);
4797 aExp
= extractFloat128Exp( a
);
4798 bSig1
= extractFloat128Frac1( b
);
4799 bSig0
= extractFloat128Frac0( b
);
4800 bExp
= extractFloat128Exp( b
);
4801 expDiff
= aExp
- bExp
;
4802 if ( 0 < expDiff
) {
4803 if ( aExp
== 0x7FFF ) {
4804 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
4811 bSig0
|= LIT64( 0x0001000000000000 );
4813 shift128ExtraRightJamming(
4814 bSig0
, bSig1
, 0, expDiff
, &bSig0
, &bSig1
, &zSig2
);
4817 else if ( expDiff
< 0 ) {
4818 if ( bExp
== 0x7FFF ) {
4819 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
4820 return packFloat128( zSign
, 0x7FFF, 0, 0 );
4826 aSig0
|= LIT64( 0x0001000000000000 );
4828 shift128ExtraRightJamming(
4829 aSig0
, aSig1
, 0, - expDiff
, &aSig0
, &aSig1
, &zSig2
);
4833 if ( aExp
== 0x7FFF ) {
4834 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
4835 return propagateFloat128NaN( a
, b STATUS_VAR
);
4839 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
4841 if ( STATUS(flush_to_zero
) ) return packFloat128( zSign
, 0, 0, 0 );
4842 return packFloat128( zSign
, 0, zSig0
, zSig1
);
4845 zSig0
|= LIT64( 0x0002000000000000 );
4849 aSig0
|= LIT64( 0x0001000000000000 );
4850 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
4852 if ( zSig0
< LIT64( 0x0002000000000000 ) ) goto roundAndPack
;
4855 shift128ExtraRightJamming(
4856 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
4858 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2 STATUS_VAR
);
4862 /*----------------------------------------------------------------------------
4863 | Returns the result of subtracting the absolute values of the quadruple-
4864 | precision floating-point values `a' and `b'. If `zSign' is 1, the
4865 | difference is negated before being returned. `zSign' is ignored if the
4866 | result is a NaN. The subtraction is performed according to the IEC/IEEE
4867 | Standard for Binary Floating-Point Arithmetic.
4868 *----------------------------------------------------------------------------*/
4870 static float128
subFloat128Sigs( float128 a
, float128 b
, flag zSign STATUS_PARAM
)
4872 int32 aExp
, bExp
, zExp
;
4873 bits64 aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
;
4877 aSig1
= extractFloat128Frac1( a
);
4878 aSig0
= extractFloat128Frac0( a
);
4879 aExp
= extractFloat128Exp( a
);
4880 bSig1
= extractFloat128Frac1( b
);
4881 bSig0
= extractFloat128Frac0( b
);
4882 bExp
= extractFloat128Exp( b
);
4883 expDiff
= aExp
- bExp
;
4884 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
4885 shortShift128Left( bSig0
, bSig1
, 14, &bSig0
, &bSig1
);
4886 if ( 0 < expDiff
) goto aExpBigger
;
4887 if ( expDiff
< 0 ) goto bExpBigger
;
4888 if ( aExp
== 0x7FFF ) {
4889 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
4890 return propagateFloat128NaN( a
, b STATUS_VAR
);
4892 float_raise( float_flag_invalid STATUS_VAR
);
4893 z
.low
= float128_default_nan_low
;
4894 z
.high
= float128_default_nan_high
;
4901 if ( bSig0
< aSig0
) goto aBigger
;
4902 if ( aSig0
< bSig0
) goto bBigger
;
4903 if ( bSig1
< aSig1
) goto aBigger
;
4904 if ( aSig1
< bSig1
) goto bBigger
;
4905 return packFloat128( STATUS(float_rounding_mode
) == float_round_down
, 0, 0, 0 );
4907 if ( bExp
== 0x7FFF ) {
4908 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
4909 return packFloat128( zSign
^ 1, 0x7FFF, 0, 0 );
4915 aSig0
|= LIT64( 0x4000000000000000 );
4917 shift128RightJamming( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
4918 bSig0
|= LIT64( 0x4000000000000000 );
4920 sub128( bSig0
, bSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
4923 goto normalizeRoundAndPack
;
4925 if ( aExp
== 0x7FFF ) {
4926 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
4933 bSig0
|= LIT64( 0x4000000000000000 );
4935 shift128RightJamming( bSig0
, bSig1
, expDiff
, &bSig0
, &bSig1
);
4936 aSig0
|= LIT64( 0x4000000000000000 );
4938 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
4940 normalizeRoundAndPack
:
4942 return normalizeRoundAndPackFloat128( zSign
, zExp
- 14, zSig0
, zSig1 STATUS_VAR
);
4946 /*----------------------------------------------------------------------------
4947 | Returns the result of adding the quadruple-precision floating-point values
4948 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
4949 | for Binary Floating-Point Arithmetic.
4950 *----------------------------------------------------------------------------*/
4952 float128
float128_add( float128 a
, float128 b STATUS_PARAM
)
4956 aSign
= extractFloat128Sign( a
);
4957 bSign
= extractFloat128Sign( b
);
4958 if ( aSign
== bSign
) {
4959 return addFloat128Sigs( a
, b
, aSign STATUS_VAR
);
4962 return subFloat128Sigs( a
, b
, aSign STATUS_VAR
);
4967 /*----------------------------------------------------------------------------
4968 | Returns the result of subtracting the quadruple-precision floating-point
4969 | values `a' and `b'. The operation is performed according to the IEC/IEEE
4970 | Standard for Binary Floating-Point Arithmetic.
4971 *----------------------------------------------------------------------------*/
4973 float128
float128_sub( float128 a
, float128 b STATUS_PARAM
)
4977 aSign
= extractFloat128Sign( a
);
4978 bSign
= extractFloat128Sign( b
);
4979 if ( aSign
== bSign
) {
4980 return subFloat128Sigs( a
, b
, aSign STATUS_VAR
);
4983 return addFloat128Sigs( a
, b
, aSign STATUS_VAR
);
4988 /*----------------------------------------------------------------------------
4989 | Returns the result of multiplying the quadruple-precision floating-point
4990 | values `a' and `b'. The operation is performed according to the IEC/IEEE
4991 | Standard for Binary Floating-Point Arithmetic.
4992 *----------------------------------------------------------------------------*/
4994 float128
float128_mul( float128 a
, float128 b STATUS_PARAM
)
4996 flag aSign
, bSign
, zSign
;
4997 int32 aExp
, bExp
, zExp
;
4998 bits64 aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
, zSig3
;
5001 aSig1
= extractFloat128Frac1( a
);
5002 aSig0
= extractFloat128Frac0( a
);
5003 aExp
= extractFloat128Exp( a
);
5004 aSign
= extractFloat128Sign( a
);
5005 bSig1
= extractFloat128Frac1( b
);
5006 bSig0
= extractFloat128Frac0( b
);
5007 bExp
= extractFloat128Exp( b
);
5008 bSign
= extractFloat128Sign( b
);
5009 zSign
= aSign
^ bSign
;
5010 if ( aExp
== 0x7FFF ) {
5011 if ( ( aSig0
| aSig1
)
5012 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
5013 return propagateFloat128NaN( a
, b STATUS_VAR
);
5015 if ( ( bExp
| bSig0
| bSig1
) == 0 ) goto invalid
;
5016 return packFloat128( zSign
, 0x7FFF, 0, 0 );
5018 if ( bExp
== 0x7FFF ) {
5019 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
5020 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
5022 float_raise( float_flag_invalid STATUS_VAR
);
5023 z
.low
= float128_default_nan_low
;
5024 z
.high
= float128_default_nan_high
;
5027 return packFloat128( zSign
, 0x7FFF, 0, 0 );
5030 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
5031 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
5034 if ( ( bSig0
| bSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
5035 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
5037 zExp
= aExp
+ bExp
- 0x4000;
5038 aSig0
|= LIT64( 0x0001000000000000 );
5039 shortShift128Left( bSig0
, bSig1
, 16, &bSig0
, &bSig1
);
5040 mul128To256( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
, &zSig2
, &zSig3
);
5041 add128( zSig0
, zSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
5042 zSig2
|= ( zSig3
!= 0 );
5043 if ( LIT64( 0x0002000000000000 ) <= zSig0
) {
5044 shift128ExtraRightJamming(
5045 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
5048 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2 STATUS_VAR
);
5052 /*----------------------------------------------------------------------------
5053 | Returns the result of dividing the quadruple-precision floating-point value
5054 | `a' by the corresponding value `b'. The operation is performed according to
5055 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5056 *----------------------------------------------------------------------------*/
5058 float128
float128_div( float128 a
, float128 b STATUS_PARAM
)
5060 flag aSign
, bSign
, zSign
;
5061 int32 aExp
, bExp
, zExp
;
5062 bits64 aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
5063 bits64 rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
5066 aSig1
= extractFloat128Frac1( a
);
5067 aSig0
= extractFloat128Frac0( a
);
5068 aExp
= extractFloat128Exp( a
);
5069 aSign
= extractFloat128Sign( a
);
5070 bSig1
= extractFloat128Frac1( b
);
5071 bSig0
= extractFloat128Frac0( b
);
5072 bExp
= extractFloat128Exp( b
);
5073 bSign
= extractFloat128Sign( b
);
5074 zSign
= aSign
^ bSign
;
5075 if ( aExp
== 0x7FFF ) {
5076 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
5077 if ( bExp
== 0x7FFF ) {
5078 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
5081 return packFloat128( zSign
, 0x7FFF, 0, 0 );
5083 if ( bExp
== 0x7FFF ) {
5084 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
5085 return packFloat128( zSign
, 0, 0, 0 );
5088 if ( ( bSig0
| bSig1
) == 0 ) {
5089 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
5091 float_raise( float_flag_invalid STATUS_VAR
);
5092 z
.low
= float128_default_nan_low
;
5093 z
.high
= float128_default_nan_high
;
5096 float_raise( float_flag_divbyzero STATUS_VAR
);
5097 return packFloat128( zSign
, 0x7FFF, 0, 0 );
5099 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
5102 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
5103 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
5105 zExp
= aExp
- bExp
+ 0x3FFD;
5107 aSig0
| LIT64( 0x0001000000000000 ), aSig1
, 15, &aSig0
, &aSig1
);
5109 bSig0
| LIT64( 0x0001000000000000 ), bSig1
, 15, &bSig0
, &bSig1
);
5110 if ( le128( bSig0
, bSig1
, aSig0
, aSig1
) ) {
5111 shift128Right( aSig0
, aSig1
, 1, &aSig0
, &aSig1
);
5114 zSig0
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
5115 mul128By64To192( bSig0
, bSig1
, zSig0
, &term0
, &term1
, &term2
);
5116 sub192( aSig0
, aSig1
, 0, term0
, term1
, term2
, &rem0
, &rem1
, &rem2
);
5117 while ( (sbits64
) rem0
< 0 ) {
5119 add192( rem0
, rem1
, rem2
, 0, bSig0
, bSig1
, &rem0
, &rem1
, &rem2
);
5121 zSig1
= estimateDiv128To64( rem1
, rem2
, bSig0
);
5122 if ( ( zSig1
& 0x3FFF ) <= 4 ) {
5123 mul128By64To192( bSig0
, bSig1
, zSig1
, &term1
, &term2
, &term3
);
5124 sub192( rem1
, rem2
, 0, term1
, term2
, term3
, &rem1
, &rem2
, &rem3
);
5125 while ( (sbits64
) rem1
< 0 ) {
5127 add192( rem1
, rem2
, rem3
, 0, bSig0
, bSig1
, &rem1
, &rem2
, &rem3
);
5129 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
5131 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 15, &zSig0
, &zSig1
, &zSig2
);
5132 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2 STATUS_VAR
);
5136 /*----------------------------------------------------------------------------
5137 | Returns the remainder of the quadruple-precision floating-point value `a'
5138 | with respect to the corresponding value `b'. The operation is performed
5139 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5140 *----------------------------------------------------------------------------*/
5142 float128
float128_rem( float128 a
, float128 b STATUS_PARAM
)
5145 int32 aExp
, bExp
, expDiff
;
5146 bits64 aSig0
, aSig1
, bSig0
, bSig1
, q
, term0
, term1
, term2
;
5147 bits64 allZero
, alternateASig0
, alternateASig1
, sigMean1
;
5151 aSig1
= extractFloat128Frac1( a
);
5152 aSig0
= extractFloat128Frac0( a
);
5153 aExp
= extractFloat128Exp( a
);
5154 aSign
= extractFloat128Sign( a
);
5155 bSig1
= extractFloat128Frac1( b
);
5156 bSig0
= extractFloat128Frac0( b
);
5157 bExp
= extractFloat128Exp( b
);
5158 if ( aExp
== 0x7FFF ) {
5159 if ( ( aSig0
| aSig1
)
5160 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
5161 return propagateFloat128NaN( a
, b STATUS_VAR
);
5165 if ( bExp
== 0x7FFF ) {
5166 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
5170 if ( ( bSig0
| bSig1
) == 0 ) {
5172 float_raise( float_flag_invalid STATUS_VAR
);
5173 z
.low
= float128_default_nan_low
;
5174 z
.high
= float128_default_nan_high
;
5177 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
5180 if ( ( aSig0
| aSig1
) == 0 ) return a
;
5181 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
5183 expDiff
= aExp
- bExp
;
5184 if ( expDiff
< -1 ) return a
;
5186 aSig0
| LIT64( 0x0001000000000000 ),
5188 15 - ( expDiff
< 0 ),
5193 bSig0
| LIT64( 0x0001000000000000 ), bSig1
, 15, &bSig0
, &bSig1
);
5194 q
= le128( bSig0
, bSig1
, aSig0
, aSig1
);
5195 if ( q
) sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
5197 while ( 0 < expDiff
) {
5198 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
5199 q
= ( 4 < q
) ? q
- 4 : 0;
5200 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
5201 shortShift192Left( term0
, term1
, term2
, 61, &term1
, &term2
, &allZero
);
5202 shortShift128Left( aSig0
, aSig1
, 61, &aSig0
, &allZero
);
5203 sub128( aSig0
, 0, term1
, term2
, &aSig0
, &aSig1
);
5206 if ( -64 < expDiff
) {
5207 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
5208 q
= ( 4 < q
) ? q
- 4 : 0;
5210 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
5212 if ( expDiff
< 0 ) {
5213 shift128Right( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
5216 shortShift128Left( aSig0
, aSig1
, expDiff
, &aSig0
, &aSig1
);
5218 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
5219 sub128( aSig0
, aSig1
, term1
, term2
, &aSig0
, &aSig1
);
5222 shift128Right( aSig0
, aSig1
, 12, &aSig0
, &aSig1
);
5223 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
5226 alternateASig0
= aSig0
;
5227 alternateASig1
= aSig1
;
5229 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
5230 } while ( 0 <= (sbits64
) aSig0
);
5232 aSig0
, aSig1
, alternateASig0
, alternateASig1
, (bits64
*)&sigMean0
, &sigMean1
);
5233 if ( ( sigMean0
< 0 )
5234 || ( ( ( sigMean0
| sigMean1
) == 0 ) && ( q
& 1 ) ) ) {
5235 aSig0
= alternateASig0
;
5236 aSig1
= alternateASig1
;
5238 zSign
= ( (sbits64
) aSig0
< 0 );
5239 if ( zSign
) sub128( 0, 0, aSig0
, aSig1
, &aSig0
, &aSig1
);
5241 normalizeRoundAndPackFloat128( aSign
^ zSign
, bExp
- 4, aSig0
, aSig1 STATUS_VAR
);
5245 /*----------------------------------------------------------------------------
5246 | Returns the square root of the quadruple-precision floating-point value `a'.
5247 | The operation is performed according to the IEC/IEEE Standard for Binary
5248 | Floating-Point Arithmetic.
5249 *----------------------------------------------------------------------------*/
5251 float128
float128_sqrt( float128 a STATUS_PARAM
)
5255 bits64 aSig0
, aSig1
, zSig0
, zSig1
, zSig2
, doubleZSig0
;
5256 bits64 rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
5259 aSig1
= extractFloat128Frac1( a
);
5260 aSig0
= extractFloat128Frac0( a
);
5261 aExp
= extractFloat128Exp( a
);
5262 aSign
= extractFloat128Sign( a
);
5263 if ( aExp
== 0x7FFF ) {
5264 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, a STATUS_VAR
);
5265 if ( ! aSign
) return a
;
5269 if ( ( aExp
| aSig0
| aSig1
) == 0 ) return a
;
5271 float_raise( float_flag_invalid STATUS_VAR
);
5272 z
.low
= float128_default_nan_low
;
5273 z
.high
= float128_default_nan_high
;
5277 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( 0, 0, 0, 0 );
5278 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
5280 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFE;
5281 aSig0
|= LIT64( 0x0001000000000000 );
5282 zSig0
= estimateSqrt32( aExp
, aSig0
>>17 );
5283 shortShift128Left( aSig0
, aSig1
, 13 - ( aExp
& 1 ), &aSig0
, &aSig1
);
5284 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
5285 doubleZSig0
= zSig0
<<1;
5286 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
5287 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
5288 while ( (sbits64
) rem0
< 0 ) {
5291 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
5293 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
5294 if ( ( zSig1
& 0x1FFF ) <= 5 ) {
5295 if ( zSig1
== 0 ) zSig1
= 1;
5296 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
5297 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
5298 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
5299 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
5300 while ( (sbits64
) rem1
< 0 ) {
5302 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
5304 term2
|= doubleZSig0
;
5305 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
5307 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
5309 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 14, &zSig0
, &zSig1
, &zSig2
);
5310 return roundAndPackFloat128( 0, zExp
, zSig0
, zSig1
, zSig2 STATUS_VAR
);
5314 /*----------------------------------------------------------------------------
5315 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5316 | the corresponding value `b', and 0 otherwise. The comparison is performed
5317 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5318 *----------------------------------------------------------------------------*/
5320 int float128_eq( float128 a
, float128 b STATUS_PARAM
)
5323 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5324 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5325 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5326 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5328 if ( float128_is_signaling_nan( a
)
5329 || float128_is_signaling_nan( b
) ) {
5330 float_raise( float_flag_invalid STATUS_VAR
);
5336 && ( ( a
.high
== b
.high
)
5338 && ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
5343 /*----------------------------------------------------------------------------
5344 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5345 | or equal to the corresponding value `b', and 0 otherwise. The comparison
5346 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5348 *----------------------------------------------------------------------------*/
5350 int float128_le( float128 a
, float128 b STATUS_PARAM
)
5354 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5355 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5356 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5357 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5359 float_raise( float_flag_invalid STATUS_VAR
);
5362 aSign
= extractFloat128Sign( a
);
5363 bSign
= extractFloat128Sign( b
);
5364 if ( aSign
!= bSign
) {
5367 || ( ( ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5371 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
5372 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
5376 /*----------------------------------------------------------------------------
5377 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5378 | the corresponding value `b', and 0 otherwise. The comparison is performed
5379 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5380 *----------------------------------------------------------------------------*/
5382 int float128_lt( float128 a
, float128 b STATUS_PARAM
)
5386 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5387 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5388 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5389 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5391 float_raise( float_flag_invalid STATUS_VAR
);
5394 aSign
= extractFloat128Sign( a
);
5395 bSign
= extractFloat128Sign( b
);
5396 if ( aSign
!= bSign
) {
5399 && ( ( ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5403 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
5404 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
5408 /*----------------------------------------------------------------------------
5409 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5410 | the corresponding value `b', and 0 otherwise. The invalid exception is
5411 | raised if either operand is a NaN. Otherwise, the comparison is performed
5412 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5413 *----------------------------------------------------------------------------*/
5415 int float128_eq_signaling( float128 a
, float128 b STATUS_PARAM
)
5418 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5419 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5420 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5421 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5423 float_raise( float_flag_invalid STATUS_VAR
);
5428 && ( ( a
.high
== b
.high
)
5430 && ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
5435 /*----------------------------------------------------------------------------
5436 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5437 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5438 | cause an exception. Otherwise, the comparison is performed according to the
5439 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5440 *----------------------------------------------------------------------------*/
5442 int float128_le_quiet( float128 a
, float128 b STATUS_PARAM
)
5446 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5447 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5448 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5449 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5451 if ( float128_is_signaling_nan( a
)
5452 || float128_is_signaling_nan( b
) ) {
5453 float_raise( float_flag_invalid STATUS_VAR
);
5457 aSign
= extractFloat128Sign( a
);
5458 bSign
= extractFloat128Sign( b
);
5459 if ( aSign
!= bSign
) {
5462 || ( ( ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5466 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
5467 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
5471 /*----------------------------------------------------------------------------
5472 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5473 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5474 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
5475 | Standard for Binary Floating-Point Arithmetic.
5476 *----------------------------------------------------------------------------*/
5478 int float128_lt_quiet( float128 a
, float128 b STATUS_PARAM
)
5482 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5483 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5484 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5485 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5487 if ( float128_is_signaling_nan( a
)
5488 || float128_is_signaling_nan( b
) ) {
5489 float_raise( float_flag_invalid STATUS_VAR
);
5493 aSign
= extractFloat128Sign( a
);
5494 bSign
= extractFloat128Sign( b
);
5495 if ( aSign
!= bSign
) {
5498 && ( ( ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5502 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
5503 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
5509 /* misc functions */
5510 float32
uint32_to_float32( unsigned int a STATUS_PARAM
)
5512 return int64_to_float32(a STATUS_VAR
);
5515 float64
uint32_to_float64( unsigned int a STATUS_PARAM
)
5517 return int64_to_float64(a STATUS_VAR
);
5520 unsigned int float32_to_uint32( float32 a STATUS_PARAM
)
5525 v
= float32_to_int64(a STATUS_VAR
);
5528 float_raise( float_flag_invalid STATUS_VAR
);
5529 } else if (v
> 0xffffffff) {
5531 float_raise( float_flag_invalid STATUS_VAR
);
5538 unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM
)
5543 v
= float32_to_int64_round_to_zero(a STATUS_VAR
);
5546 float_raise( float_flag_invalid STATUS_VAR
);
5547 } else if (v
> 0xffffffff) {
5549 float_raise( float_flag_invalid STATUS_VAR
);
5556 unsigned int float64_to_uint32( float64 a STATUS_PARAM
)
5561 v
= float64_to_int64(a STATUS_VAR
);
5564 float_raise( float_flag_invalid STATUS_VAR
);
5565 } else if (v
> 0xffffffff) {
5567 float_raise( float_flag_invalid STATUS_VAR
);
5574 unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM
)
5579 v
= float64_to_int64_round_to_zero(a STATUS_VAR
);
5582 float_raise( float_flag_invalid STATUS_VAR
);
5583 } else if (v
> 0xffffffff) {
5585 float_raise( float_flag_invalid STATUS_VAR
);
5592 /* FIXME: This looks broken. */
5593 uint64_t float64_to_uint64 (float64 a STATUS_PARAM
)
5597 v
= float64_val(int64_to_float64(INT64_MIN STATUS_VAR
));
5598 v
+= float64_val(a
);
5599 v
= float64_to_int64(make_float64(v
) STATUS_VAR
);
5601 return v
- INT64_MIN
;
5604 uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM
)
5608 v
= float64_val(int64_to_float64(INT64_MIN STATUS_VAR
));
5609 v
+= float64_val(a
);
5610 v
= float64_to_int64_round_to_zero(make_float64(v
) STATUS_VAR
);
5612 return v
- INT64_MIN
;
5615 #define COMPARE(s, nan_exp) \
5616 INLINE int float ## s ## _compare_internal( float ## s a, float ## s b, \
5617 int is_quiet STATUS_PARAM ) \
5619 flag aSign, bSign; \
5622 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \
5623 extractFloat ## s ## Frac( a ) ) || \
5624 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \
5625 extractFloat ## s ## Frac( b ) )) { \
5627 float ## s ## _is_signaling_nan( a ) || \
5628 float ## s ## _is_signaling_nan( b ) ) { \
5629 float_raise( float_flag_invalid STATUS_VAR); \
5631 return float_relation_unordered; \
5633 aSign = extractFloat ## s ## Sign( a ); \
5634 bSign = extractFloat ## s ## Sign( b ); \
5635 av = float ## s ## _val(a); \
5636 bv = float ## s ## _val(b); \
5637 if ( aSign != bSign ) { \
5638 if ( (bits ## s) ( ( av | bv )<<1 ) == 0 ) { \
5640 return float_relation_equal; \
5642 return 1 - (2 * aSign); \
5646 return float_relation_equal; \
5648 return 1 - 2 * (aSign ^ ( av < bv )); \
5653 int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM ) \
5655 return float ## s ## _compare_internal(a, b, 0 STATUS_VAR); \
5658 int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM ) \
5660 return float ## s ## _compare_internal(a, b, 1 STATUS_VAR); \
5666 INLINE
int float128_compare_internal( float128 a
, float128 b
,
5667 int is_quiet STATUS_PARAM
)
5671 if (( ( extractFloat128Exp( a
) == 0x7fff ) &&
5672 ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) ) ||
5673 ( ( extractFloat128Exp( b
) == 0x7fff ) &&
5674 ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )) {
5676 float128_is_signaling_nan( a
) ||
5677 float128_is_signaling_nan( b
) ) {
5678 float_raise( float_flag_invalid STATUS_VAR
);
5680 return float_relation_unordered
;
5682 aSign
= extractFloat128Sign( a
);
5683 bSign
= extractFloat128Sign( b
);
5684 if ( aSign
!= bSign
) {
5685 if ( ( ( ( a
.high
| b
.high
)<<1 ) | a
.low
| b
.low
) == 0 ) {
5687 return float_relation_equal
;
5689 return 1 - (2 * aSign
);
5692 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
5693 return float_relation_equal
;
5695 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
5700 int float128_compare( float128 a
, float128 b STATUS_PARAM
)
5702 return float128_compare_internal(a
, b
, 0 STATUS_VAR
);
5705 int float128_compare_quiet( float128 a
, float128 b STATUS_PARAM
)
5707 return float128_compare_internal(a
, b
, 1 STATUS_VAR
);
5710 /* Multiply A by 2 raised to the power N. */
5711 float32
float32_scalbn( float32 a
, int n STATUS_PARAM
)
5717 aSig
= extractFloat32Frac( a
);
5718 aExp
= extractFloat32Exp( a
);
5719 aSign
= extractFloat32Sign( a
);
5721 if ( aExp
== 0xFF ) {
5726 else if ( aSig
== 0 )
5731 return normalizeRoundAndPackFloat32( aSign
, aExp
, aSig STATUS_VAR
);
5734 float64
float64_scalbn( float64 a
, int n STATUS_PARAM
)
5740 aSig
= extractFloat64Frac( a
);
5741 aExp
= extractFloat64Exp( a
);
5742 aSign
= extractFloat64Sign( a
);
5744 if ( aExp
== 0x7FF ) {
5748 aSig
|= LIT64( 0x0010000000000000 );
5749 else if ( aSig
== 0 )
5754 return normalizeRoundAndPackFloat64( aSign
, aExp
, aSig STATUS_VAR
);
5758 floatx80
floatx80_scalbn( floatx80 a
, int n STATUS_PARAM
)
5764 aSig
= extractFloatx80Frac( a
);
5765 aExp
= extractFloatx80Exp( a
);
5766 aSign
= extractFloatx80Sign( a
);
5768 if ( aExp
== 0x7FF ) {
5771 if (aExp
== 0 && aSig
== 0)
5775 return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision
),
5776 aSign
, aExp
, aSig
, 0 STATUS_VAR
);
5781 float128
float128_scalbn( float128 a
, int n STATUS_PARAM
)
5785 bits64 aSig0
, aSig1
;
5787 aSig1
= extractFloat128Frac1( a
);
5788 aSig0
= extractFloat128Frac0( a
);
5789 aExp
= extractFloat128Exp( a
);
5790 aSign
= extractFloat128Sign( a
);
5791 if ( aExp
== 0x7FFF ) {
5795 aSig0
|= LIT64( 0x0001000000000000 );
5796 else if ( aSig0
== 0 && aSig1
== 0 )
5800 return normalizeRoundAndPackFloat128( aSign
, aExp
, aSig0
, aSig1