Add lost semicolons
[qemu/mini2440.git] / fpu / softfloat.c
blob3988bd10990df8fcd1af268c8555500f9d7e6718
2 /*============================================================================
4 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
5 Package, Release 2b.
7 Written by John R. Hauser. This work was made possible in part by the
8 International Computer Science Institute, located at Suite 600, 1947 Center
9 Street, Berkeley, California 94704. Funding was partially provided by the
10 National Science Foundation under grant MIP-9311980. The original version
11 of this code was written as part of a project to build a fixed-point vector
12 processor in collaboration with the University of California at Berkeley,
13 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
14 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
15 arithmetic/SoftFloat.html'.
17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
18 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
19 RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
20 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
21 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
22 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
23 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
24 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
26 Derivative works are acceptable, even for commercial purposes, so long as
27 (1) the source code for the derivative work includes prominent notice that
28 the work is derivative, and (2) the source code includes prominent notice with
29 these four paragraphs for those parts of this code that are retained.
31 =============================================================================*/
33 #include "softfloat.h"
35 /*----------------------------------------------------------------------------
36 | Primitive arithmetic functions, including multi-word arithmetic, and
37 | division and square root approximations. (Can be specialized to target if
38 | desired.)
39 *----------------------------------------------------------------------------*/
40 #include "softfloat-macros.h"
42 /*----------------------------------------------------------------------------
43 | Functions and definitions to determine: (1) whether tininess for underflow
44 | is detected before or after rounding by default, (2) what (if anything)
45 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
46 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
47 | are propagated from function inputs to output. These details are target-
48 | specific.
49 *----------------------------------------------------------------------------*/
50 #include "softfloat-specialize.h"
52 void set_float_rounding_mode(int val STATUS_PARAM)
54 STATUS(float_rounding_mode) = val;
57 void set_float_exception_flags(int val STATUS_PARAM)
59 STATUS(float_exception_flags) = val;
62 #ifdef FLOATX80
63 void set_floatx80_rounding_precision(int val STATUS_PARAM)
65 STATUS(floatx80_rounding_precision) = val;
67 #endif
69 /*----------------------------------------------------------------------------
70 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
71 | and 7, and returns the properly rounded 32-bit integer corresponding to the
72 | input. If `zSign' is 1, the input is negated before being converted to an
73 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
74 | is simply rounded to an integer, with the inexact exception raised if the
75 | input cannot be represented exactly as an integer. However, if the fixed-
76 | point input is too large, the invalid exception is raised and the largest
77 | positive or negative integer is returned.
78 *----------------------------------------------------------------------------*/
80 static int32 roundAndPackInt32( flag zSign, bits64 absZ STATUS_PARAM)
82 int8 roundingMode;
83 flag roundNearestEven;
84 int8 roundIncrement, roundBits;
85 int32 z;
87 roundingMode = STATUS(float_rounding_mode);
88 roundNearestEven = ( roundingMode == float_round_nearest_even );
89 roundIncrement = 0x40;
90 if ( ! roundNearestEven ) {
91 if ( roundingMode == float_round_to_zero ) {
92 roundIncrement = 0;
94 else {
95 roundIncrement = 0x7F;
96 if ( zSign ) {
97 if ( roundingMode == float_round_up ) roundIncrement = 0;
99 else {
100 if ( roundingMode == float_round_down ) roundIncrement = 0;
104 roundBits = absZ & 0x7F;
105 absZ = ( absZ + roundIncrement )>>7;
106 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
107 z = absZ;
108 if ( zSign ) z = - z;
109 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
110 float_raise( float_flag_invalid STATUS_VAR);
111 return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
113 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
114 return z;
118 /*----------------------------------------------------------------------------
119 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
120 | `absZ1', with binary point between bits 63 and 64 (between the input words),
121 | and returns the properly rounded 64-bit integer corresponding to the input.
122 | If `zSign' is 1, the input is negated before being converted to an integer.
123 | Ordinarily, the fixed-point input is simply rounded to an integer, with
124 | the inexact exception raised if the input cannot be represented exactly as
125 | an integer. However, if the fixed-point input is too large, the invalid
126 | exception is raised and the largest positive or negative integer is
127 | returned.
128 *----------------------------------------------------------------------------*/
130 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 STATUS_PARAM)
132 int8 roundingMode;
133 flag roundNearestEven, increment;
134 int64 z;
136 roundingMode = STATUS(float_rounding_mode);
137 roundNearestEven = ( roundingMode == float_round_nearest_even );
138 increment = ( (sbits64) absZ1 < 0 );
139 if ( ! roundNearestEven ) {
140 if ( roundingMode == float_round_to_zero ) {
141 increment = 0;
143 else {
144 if ( zSign ) {
145 increment = ( roundingMode == float_round_down ) && absZ1;
147 else {
148 increment = ( roundingMode == float_round_up ) && absZ1;
152 if ( increment ) {
153 ++absZ0;
154 if ( absZ0 == 0 ) goto overflow;
155 absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
157 z = absZ0;
158 if ( zSign ) z = - z;
159 if ( z && ( ( z < 0 ) ^ zSign ) ) {
160 overflow:
161 float_raise( float_flag_invalid STATUS_VAR);
162 return
163 zSign ? (sbits64) LIT64( 0x8000000000000000 )
164 : LIT64( 0x7FFFFFFFFFFFFFFF );
166 if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
167 return z;
171 /*----------------------------------------------------------------------------
172 | Returns the fraction bits of the single-precision floating-point value `a'.
173 *----------------------------------------------------------------------------*/
175 INLINE bits32 extractFloat32Frac( float32 a )
178 return float32_val(a) & 0x007FFFFF;
182 /*----------------------------------------------------------------------------
183 | Returns the exponent bits of the single-precision floating-point value `a'.
184 *----------------------------------------------------------------------------*/
186 INLINE int16 extractFloat32Exp( float32 a )
189 return ( float32_val(a)>>23 ) & 0xFF;
193 /*----------------------------------------------------------------------------
194 | Returns the sign bit of the single-precision floating-point value `a'.
195 *----------------------------------------------------------------------------*/
197 INLINE flag extractFloat32Sign( float32 a )
200 return float32_val(a)>>31;
204 /*----------------------------------------------------------------------------
205 | Normalizes the subnormal single-precision floating-point value represented
206 | by the denormalized significand `aSig'. The normalized exponent and
207 | significand are stored at the locations pointed to by `zExpPtr' and
208 | `zSigPtr', respectively.
209 *----------------------------------------------------------------------------*/
211 static void
212 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
214 int8 shiftCount;
216 shiftCount = countLeadingZeros32( aSig ) - 8;
217 *zSigPtr = aSig<<shiftCount;
218 *zExpPtr = 1 - shiftCount;
222 /*----------------------------------------------------------------------------
223 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
224 | single-precision floating-point value, returning the result. After being
225 | shifted into the proper positions, the three fields are simply added
226 | together to form the result. This means that any integer portion of `zSig'
227 | will be added into the exponent. Since a properly normalized significand
228 | will have an integer portion equal to 1, the `zExp' input should be 1 less
229 | than the desired result exponent whenever `zSig' is a complete, normalized
230 | significand.
231 *----------------------------------------------------------------------------*/
233 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
236 return make_float32(
237 ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig);
241 /*----------------------------------------------------------------------------
242 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
243 | and significand `zSig', and returns the proper single-precision floating-
244 | point value corresponding to the abstract input. Ordinarily, the abstract
245 | value is simply rounded and packed into the single-precision format, with
246 | the inexact exception raised if the abstract input cannot be represented
247 | exactly. However, if the abstract value is too large, the overflow and
248 | inexact exceptions are raised and an infinity or maximal finite value is
249 | returned. If the abstract value is too small, the input value is rounded to
250 | a subnormal number, and the underflow and inexact exceptions are raised if
251 | the abstract input cannot be represented exactly as a subnormal single-
252 | precision floating-point number.
253 | The input significand `zSig' has its binary point between bits 30
254 | and 29, which is 7 bits to the left of the usual location. This shifted
255 | significand must be normalized or smaller. If `zSig' is not normalized,
256 | `zExp' must be 0; in that case, the result returned is a subnormal number,
257 | and it must not require rounding. In the usual case that `zSig' is
258 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
259 | The handling of underflow and overflow follows the IEC/IEEE Standard for
260 | Binary Floating-Point Arithmetic.
261 *----------------------------------------------------------------------------*/
263 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
265 int8 roundingMode;
266 flag roundNearestEven;
267 int8 roundIncrement, roundBits;
268 flag isTiny;
270 roundingMode = STATUS(float_rounding_mode);
271 roundNearestEven = ( roundingMode == float_round_nearest_even );
272 roundIncrement = 0x40;
273 if ( ! roundNearestEven ) {
274 if ( roundingMode == float_round_to_zero ) {
275 roundIncrement = 0;
277 else {
278 roundIncrement = 0x7F;
279 if ( zSign ) {
280 if ( roundingMode == float_round_up ) roundIncrement = 0;
282 else {
283 if ( roundingMode == float_round_down ) roundIncrement = 0;
287 roundBits = zSig & 0x7F;
288 if ( 0xFD <= (bits16) zExp ) {
289 if ( ( 0xFD < zExp )
290 || ( ( zExp == 0xFD )
291 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
293 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
294 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
296 if ( zExp < 0 ) {
297 isTiny =
298 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
299 || ( zExp < -1 )
300 || ( zSig + roundIncrement < 0x80000000 );
301 shift32RightJamming( zSig, - zExp, &zSig );
302 zExp = 0;
303 roundBits = zSig & 0x7F;
304 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
307 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
308 zSig = ( zSig + roundIncrement )>>7;
309 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
310 if ( zSig == 0 ) zExp = 0;
311 return packFloat32( zSign, zExp, zSig );
315 /*----------------------------------------------------------------------------
316 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
317 | and significand `zSig', and returns the proper single-precision floating-
318 | point value corresponding to the abstract input. This routine is just like
319 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
320 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
321 | floating-point exponent.
322 *----------------------------------------------------------------------------*/
324 static float32
325 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
327 int8 shiftCount;
329 shiftCount = countLeadingZeros32( zSig ) - 1;
330 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
334 /*----------------------------------------------------------------------------
335 | Returns the fraction bits of the double-precision floating-point value `a'.
336 *----------------------------------------------------------------------------*/
338 INLINE bits64 extractFloat64Frac( float64 a )
341 return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
345 /*----------------------------------------------------------------------------
346 | Returns the exponent bits of the double-precision floating-point value `a'.
347 *----------------------------------------------------------------------------*/
349 INLINE int16 extractFloat64Exp( float64 a )
352 return ( float64_val(a)>>52 ) & 0x7FF;
356 /*----------------------------------------------------------------------------
357 | Returns the sign bit of the double-precision floating-point value `a'.
358 *----------------------------------------------------------------------------*/
360 INLINE flag extractFloat64Sign( float64 a )
363 return float64_val(a)>>63;
367 /*----------------------------------------------------------------------------
368 | Normalizes the subnormal double-precision floating-point value represented
369 | by the denormalized significand `aSig'. The normalized exponent and
370 | significand are stored at the locations pointed to by `zExpPtr' and
371 | `zSigPtr', respectively.
372 *----------------------------------------------------------------------------*/
374 static void
375 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
377 int8 shiftCount;
379 shiftCount = countLeadingZeros64( aSig ) - 11;
380 *zSigPtr = aSig<<shiftCount;
381 *zExpPtr = 1 - shiftCount;
385 /*----------------------------------------------------------------------------
386 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
387 | double-precision floating-point value, returning the result. After being
388 | shifted into the proper positions, the three fields are simply added
389 | together to form the result. This means that any integer portion of `zSig'
390 | will be added into the exponent. Since a properly normalized significand
391 | will have an integer portion equal to 1, the `zExp' input should be 1 less
392 | than the desired result exponent whenever `zSig' is a complete, normalized
393 | significand.
394 *----------------------------------------------------------------------------*/
396 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
399 return make_float64(
400 ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig);
404 /*----------------------------------------------------------------------------
405 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
406 | and significand `zSig', and returns the proper double-precision floating-
407 | point value corresponding to the abstract input. Ordinarily, the abstract
408 | value is simply rounded and packed into the double-precision format, with
409 | the inexact exception raised if the abstract input cannot be represented
410 | exactly. However, if the abstract value is too large, the overflow and
411 | inexact exceptions are raised and an infinity or maximal finite value is
412 | returned. If the abstract value is too small, the input value is rounded
413 | to a subnormal number, and the underflow and inexact exceptions are raised
414 | if the abstract input cannot be represented exactly as a subnormal double-
415 | precision floating-point number.
416 | The input significand `zSig' has its binary point between bits 62
417 | and 61, which is 10 bits to the left of the usual location. This shifted
418 | significand must be normalized or smaller. If `zSig' is not normalized,
419 | `zExp' must be 0; in that case, the result returned is a subnormal number,
420 | and it must not require rounding. In the usual case that `zSig' is
421 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
422 | The handling of underflow and overflow follows the IEC/IEEE Standard for
423 | Binary Floating-Point Arithmetic.
424 *----------------------------------------------------------------------------*/
426 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
428 int8 roundingMode;
429 flag roundNearestEven;
430 int16 roundIncrement, roundBits;
431 flag isTiny;
433 roundingMode = STATUS(float_rounding_mode);
434 roundNearestEven = ( roundingMode == float_round_nearest_even );
435 roundIncrement = 0x200;
436 if ( ! roundNearestEven ) {
437 if ( roundingMode == float_round_to_zero ) {
438 roundIncrement = 0;
440 else {
441 roundIncrement = 0x3FF;
442 if ( zSign ) {
443 if ( roundingMode == float_round_up ) roundIncrement = 0;
445 else {
446 if ( roundingMode == float_round_down ) roundIncrement = 0;
450 roundBits = zSig & 0x3FF;
451 if ( 0x7FD <= (bits16) zExp ) {
452 if ( ( 0x7FD < zExp )
453 || ( ( zExp == 0x7FD )
454 && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
456 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
457 return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
459 if ( zExp < 0 ) {
460 isTiny =
461 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
462 || ( zExp < -1 )
463 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
464 shift64RightJamming( zSig, - zExp, &zSig );
465 zExp = 0;
466 roundBits = zSig & 0x3FF;
467 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
470 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
471 zSig = ( zSig + roundIncrement )>>10;
472 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
473 if ( zSig == 0 ) zExp = 0;
474 return packFloat64( zSign, zExp, zSig );
478 /*----------------------------------------------------------------------------
479 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
480 | and significand `zSig', and returns the proper double-precision floating-
481 | point value corresponding to the abstract input. This routine is just like
482 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
483 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
484 | floating-point exponent.
485 *----------------------------------------------------------------------------*/
487 static float64
488 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
490 int8 shiftCount;
492 shiftCount = countLeadingZeros64( zSig ) - 1;
493 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
497 #ifdef FLOATX80
499 /*----------------------------------------------------------------------------
500 | Returns the fraction bits of the extended double-precision floating-point
501 | value `a'.
502 *----------------------------------------------------------------------------*/
504 INLINE bits64 extractFloatx80Frac( floatx80 a )
507 return a.low;
511 /*----------------------------------------------------------------------------
512 | Returns the exponent bits of the extended double-precision floating-point
513 | value `a'.
514 *----------------------------------------------------------------------------*/
516 INLINE int32 extractFloatx80Exp( floatx80 a )
519 return a.high & 0x7FFF;
523 /*----------------------------------------------------------------------------
524 | Returns the sign bit of the extended double-precision floating-point value
525 | `a'.
526 *----------------------------------------------------------------------------*/
528 INLINE flag extractFloatx80Sign( floatx80 a )
531 return a.high>>15;
535 /*----------------------------------------------------------------------------
536 | Normalizes the subnormal extended double-precision floating-point value
537 | represented by the denormalized significand `aSig'. The normalized exponent
538 | and significand are stored at the locations pointed to by `zExpPtr' and
539 | `zSigPtr', respectively.
540 *----------------------------------------------------------------------------*/
542 static void
543 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
545 int8 shiftCount;
547 shiftCount = countLeadingZeros64( aSig );
548 *zSigPtr = aSig<<shiftCount;
549 *zExpPtr = 1 - shiftCount;
553 /*----------------------------------------------------------------------------
554 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
555 | extended double-precision floating-point value, returning the result.
556 *----------------------------------------------------------------------------*/
558 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
560 floatx80 z;
562 z.low = zSig;
563 z.high = ( ( (bits16) zSign )<<15 ) + zExp;
564 return z;
568 /*----------------------------------------------------------------------------
569 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
570 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
571 | and returns the proper extended double-precision floating-point value
572 | corresponding to the abstract input. Ordinarily, the abstract value is
573 | rounded and packed into the extended double-precision format, with the
574 | inexact exception raised if the abstract input cannot be represented
575 | exactly. However, if the abstract value is too large, the overflow and
576 | inexact exceptions are raised and an infinity or maximal finite value is
577 | returned. If the abstract value is too small, the input value is rounded to
578 | a subnormal number, and the underflow and inexact exceptions are raised if
579 | the abstract input cannot be represented exactly as a subnormal extended
580 | double-precision floating-point number.
581 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
582 | number of bits as single or double precision, respectively. Otherwise, the
583 | result is rounded to the full precision of the extended double-precision
584 | format.
585 | The input significand must be normalized or smaller. If the input
586 | significand is not normalized, `zExp' must be 0; in that case, the result
587 | returned is a subnormal number, and it must not require rounding. The
588 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
589 | Floating-Point Arithmetic.
590 *----------------------------------------------------------------------------*/
592 static floatx80
593 roundAndPackFloatx80(
594 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
595 STATUS_PARAM)
597 int8 roundingMode;
598 flag roundNearestEven, increment, isTiny;
599 int64 roundIncrement, roundMask, roundBits;
601 roundingMode = STATUS(float_rounding_mode);
602 roundNearestEven = ( roundingMode == float_round_nearest_even );
603 if ( roundingPrecision == 80 ) goto precision80;
604 if ( roundingPrecision == 64 ) {
605 roundIncrement = LIT64( 0x0000000000000400 );
606 roundMask = LIT64( 0x00000000000007FF );
608 else if ( roundingPrecision == 32 ) {
609 roundIncrement = LIT64( 0x0000008000000000 );
610 roundMask = LIT64( 0x000000FFFFFFFFFF );
612 else {
613 goto precision80;
615 zSig0 |= ( zSig1 != 0 );
616 if ( ! roundNearestEven ) {
617 if ( roundingMode == float_round_to_zero ) {
618 roundIncrement = 0;
620 else {
621 roundIncrement = roundMask;
622 if ( zSign ) {
623 if ( roundingMode == float_round_up ) roundIncrement = 0;
625 else {
626 if ( roundingMode == float_round_down ) roundIncrement = 0;
630 roundBits = zSig0 & roundMask;
631 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
632 if ( ( 0x7FFE < zExp )
633 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
635 goto overflow;
637 if ( zExp <= 0 ) {
638 isTiny =
639 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
640 || ( zExp < 0 )
641 || ( zSig0 <= zSig0 + roundIncrement );
642 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
643 zExp = 0;
644 roundBits = zSig0 & roundMask;
645 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
646 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
647 zSig0 += roundIncrement;
648 if ( (sbits64) zSig0 < 0 ) zExp = 1;
649 roundIncrement = roundMask + 1;
650 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
651 roundMask |= roundIncrement;
653 zSig0 &= ~ roundMask;
654 return packFloatx80( zSign, zExp, zSig0 );
657 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
658 zSig0 += roundIncrement;
659 if ( zSig0 < roundIncrement ) {
660 ++zExp;
661 zSig0 = LIT64( 0x8000000000000000 );
663 roundIncrement = roundMask + 1;
664 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
665 roundMask |= roundIncrement;
667 zSig0 &= ~ roundMask;
668 if ( zSig0 == 0 ) zExp = 0;
669 return packFloatx80( zSign, zExp, zSig0 );
670 precision80:
671 increment = ( (sbits64) zSig1 < 0 );
672 if ( ! roundNearestEven ) {
673 if ( roundingMode == float_round_to_zero ) {
674 increment = 0;
676 else {
677 if ( zSign ) {
678 increment = ( roundingMode == float_round_down ) && zSig1;
680 else {
681 increment = ( roundingMode == float_round_up ) && zSig1;
685 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
686 if ( ( 0x7FFE < zExp )
687 || ( ( zExp == 0x7FFE )
688 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
689 && increment
692 roundMask = 0;
693 overflow:
694 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
695 if ( ( roundingMode == float_round_to_zero )
696 || ( zSign && ( roundingMode == float_round_up ) )
697 || ( ! zSign && ( roundingMode == float_round_down ) )
699 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
701 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
703 if ( zExp <= 0 ) {
704 isTiny =
705 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
706 || ( zExp < 0 )
707 || ! increment
708 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
709 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
710 zExp = 0;
711 if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
712 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
713 if ( roundNearestEven ) {
714 increment = ( (sbits64) zSig1 < 0 );
716 else {
717 if ( zSign ) {
718 increment = ( roundingMode == float_round_down ) && zSig1;
720 else {
721 increment = ( roundingMode == float_round_up ) && zSig1;
724 if ( increment ) {
725 ++zSig0;
726 zSig0 &=
727 ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
728 if ( (sbits64) zSig0 < 0 ) zExp = 1;
730 return packFloatx80( zSign, zExp, zSig0 );
733 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
734 if ( increment ) {
735 ++zSig0;
736 if ( zSig0 == 0 ) {
737 ++zExp;
738 zSig0 = LIT64( 0x8000000000000000 );
740 else {
741 zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
744 else {
745 if ( zSig0 == 0 ) zExp = 0;
747 return packFloatx80( zSign, zExp, zSig0 );
751 /*----------------------------------------------------------------------------
752 | Takes an abstract floating-point value having sign `zSign', exponent
753 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
754 | and returns the proper extended double-precision floating-point value
755 | corresponding to the abstract input. This routine is just like
756 | `roundAndPackFloatx80' except that the input significand does not have to be
757 | normalized.
758 *----------------------------------------------------------------------------*/
760 static floatx80
761 normalizeRoundAndPackFloatx80(
762 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
763 STATUS_PARAM)
765 int8 shiftCount;
767 if ( zSig0 == 0 ) {
768 zSig0 = zSig1;
769 zSig1 = 0;
770 zExp -= 64;
772 shiftCount = countLeadingZeros64( zSig0 );
773 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
774 zExp -= shiftCount;
775 return
776 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
780 #endif
782 #ifdef FLOAT128
784 /*----------------------------------------------------------------------------
785 | Returns the least-significant 64 fraction bits of the quadruple-precision
786 | floating-point value `a'.
787 *----------------------------------------------------------------------------*/
789 INLINE bits64 extractFloat128Frac1( float128 a )
792 return a.low;
796 /*----------------------------------------------------------------------------
797 | Returns the most-significant 48 fraction bits of the quadruple-precision
798 | floating-point value `a'.
799 *----------------------------------------------------------------------------*/
801 INLINE bits64 extractFloat128Frac0( float128 a )
804 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
808 /*----------------------------------------------------------------------------
809 | Returns the exponent bits of the quadruple-precision floating-point value
810 | `a'.
811 *----------------------------------------------------------------------------*/
813 INLINE int32 extractFloat128Exp( float128 a )
816 return ( a.high>>48 ) & 0x7FFF;
820 /*----------------------------------------------------------------------------
821 | Returns the sign bit of the quadruple-precision floating-point value `a'.
822 *----------------------------------------------------------------------------*/
824 INLINE flag extractFloat128Sign( float128 a )
827 return a.high>>63;
831 /*----------------------------------------------------------------------------
832 | Normalizes the subnormal quadruple-precision floating-point value
833 | represented by the denormalized significand formed by the concatenation of
834 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
835 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
836 | significand are stored at the location pointed to by `zSig0Ptr', and the
837 | least significant 64 bits of the normalized significand are stored at the
838 | location pointed to by `zSig1Ptr'.
839 *----------------------------------------------------------------------------*/
841 static void
842 normalizeFloat128Subnormal(
843 bits64 aSig0,
844 bits64 aSig1,
845 int32 *zExpPtr,
846 bits64 *zSig0Ptr,
847 bits64 *zSig1Ptr
850 int8 shiftCount;
852 if ( aSig0 == 0 ) {
853 shiftCount = countLeadingZeros64( aSig1 ) - 15;
854 if ( shiftCount < 0 ) {
855 *zSig0Ptr = aSig1>>( - shiftCount );
856 *zSig1Ptr = aSig1<<( shiftCount & 63 );
858 else {
859 *zSig0Ptr = aSig1<<shiftCount;
860 *zSig1Ptr = 0;
862 *zExpPtr = - shiftCount - 63;
864 else {
865 shiftCount = countLeadingZeros64( aSig0 ) - 15;
866 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
867 *zExpPtr = 1 - shiftCount;
872 /*----------------------------------------------------------------------------
873 | Packs the sign `zSign', the exponent `zExp', and the significand formed
874 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
875 | floating-point value, returning the result. After being shifted into the
876 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
877 | added together to form the most significant 32 bits of the result. This
878 | means that any integer portion of `zSig0' will be added into the exponent.
879 | Since a properly normalized significand will have an integer portion equal
880 | to 1, the `zExp' input should be 1 less than the desired result exponent
881 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
882 | significand.
883 *----------------------------------------------------------------------------*/
885 INLINE float128
886 packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
888 float128 z;
890 z.low = zSig1;
891 z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
892 return z;
896 /*----------------------------------------------------------------------------
897 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
898 | and extended significand formed by the concatenation of `zSig0', `zSig1',
899 | and `zSig2', and returns the proper quadruple-precision floating-point value
900 | corresponding to the abstract input. Ordinarily, the abstract value is
901 | simply rounded and packed into the quadruple-precision format, with the
902 | inexact exception raised if the abstract input cannot be represented
903 | exactly. However, if the abstract value is too large, the overflow and
904 | inexact exceptions are raised and an infinity or maximal finite value is
905 | returned. If the abstract value is too small, the input value is rounded to
906 | a subnormal number, and the underflow and inexact exceptions are raised if
907 | the abstract input cannot be represented exactly as a subnormal quadruple-
908 | precision floating-point number.
909 | The input significand must be normalized or smaller. If the input
910 | significand is not normalized, `zExp' must be 0; in that case, the result
911 | returned is a subnormal number, and it must not require rounding. In the
912 | usual case that the input significand is normalized, `zExp' must be 1 less
913 | than the ``true'' floating-point exponent. The handling of underflow and
914 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
915 *----------------------------------------------------------------------------*/
917 static float128
918 roundAndPackFloat128(
919 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 STATUS_PARAM)
921 int8 roundingMode;
922 flag roundNearestEven, increment, isTiny;
924 roundingMode = STATUS(float_rounding_mode);
925 roundNearestEven = ( roundingMode == float_round_nearest_even );
926 increment = ( (sbits64) zSig2 < 0 );
927 if ( ! roundNearestEven ) {
928 if ( roundingMode == float_round_to_zero ) {
929 increment = 0;
931 else {
932 if ( zSign ) {
933 increment = ( roundingMode == float_round_down ) && zSig2;
935 else {
936 increment = ( roundingMode == float_round_up ) && zSig2;
940 if ( 0x7FFD <= (bits32) zExp ) {
941 if ( ( 0x7FFD < zExp )
942 || ( ( zExp == 0x7FFD )
943 && eq128(
944 LIT64( 0x0001FFFFFFFFFFFF ),
945 LIT64( 0xFFFFFFFFFFFFFFFF ),
946 zSig0,
947 zSig1
949 && increment
952 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
953 if ( ( roundingMode == float_round_to_zero )
954 || ( zSign && ( roundingMode == float_round_up ) )
955 || ( ! zSign && ( roundingMode == float_round_down ) )
957 return
958 packFloat128(
959 zSign,
960 0x7FFE,
961 LIT64( 0x0000FFFFFFFFFFFF ),
962 LIT64( 0xFFFFFFFFFFFFFFFF )
965 return packFloat128( zSign, 0x7FFF, 0, 0 );
967 if ( zExp < 0 ) {
968 isTiny =
969 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
970 || ( zExp < -1 )
971 || ! increment
972 || lt128(
973 zSig0,
974 zSig1,
975 LIT64( 0x0001FFFFFFFFFFFF ),
976 LIT64( 0xFFFFFFFFFFFFFFFF )
978 shift128ExtraRightJamming(
979 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
980 zExp = 0;
981 if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
982 if ( roundNearestEven ) {
983 increment = ( (sbits64) zSig2 < 0 );
985 else {
986 if ( zSign ) {
987 increment = ( roundingMode == float_round_down ) && zSig2;
989 else {
990 increment = ( roundingMode == float_round_up ) && zSig2;
995 if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
996 if ( increment ) {
997 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
998 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1000 else {
1001 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1003 return packFloat128( zSign, zExp, zSig0, zSig1 );
1007 /*----------------------------------------------------------------------------
1008 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1009 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1010 | returns the proper quadruple-precision floating-point value corresponding
1011 | to the abstract input. This routine is just like `roundAndPackFloat128'
1012 | except that the input significand has fewer bits and does not have to be
1013 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1014 | point exponent.
1015 *----------------------------------------------------------------------------*/
1017 static float128
1018 normalizeRoundAndPackFloat128(
1019 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 STATUS_PARAM)
1021 int8 shiftCount;
1022 bits64 zSig2;
1024 if ( zSig0 == 0 ) {
1025 zSig0 = zSig1;
1026 zSig1 = 0;
1027 zExp -= 64;
1029 shiftCount = countLeadingZeros64( zSig0 ) - 15;
1030 if ( 0 <= shiftCount ) {
1031 zSig2 = 0;
1032 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1034 else {
1035 shift128ExtraRightJamming(
1036 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1038 zExp -= shiftCount;
1039 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1043 #endif
1045 /*----------------------------------------------------------------------------
1046 | Returns the result of converting the 32-bit two's complement integer `a'
1047 | to the single-precision floating-point format. The conversion is performed
1048 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1049 *----------------------------------------------------------------------------*/
1051 float32 int32_to_float32( int32 a STATUS_PARAM )
1053 flag zSign;
1055 if ( a == 0 ) return float32_zero;
1056 if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1057 zSign = ( a < 0 );
1058 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1062 /*----------------------------------------------------------------------------
1063 | Returns the result of converting the 32-bit two's complement integer `a'
1064 | to the double-precision floating-point format. The conversion is performed
1065 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1066 *----------------------------------------------------------------------------*/
1068 float64 int32_to_float64( int32 a STATUS_PARAM )
1070 flag zSign;
1071 uint32 absA;
1072 int8 shiftCount;
1073 bits64 zSig;
1075 if ( a == 0 ) return float64_zero;
1076 zSign = ( a < 0 );
1077 absA = zSign ? - a : a;
1078 shiftCount = countLeadingZeros32( absA ) + 21;
1079 zSig = absA;
1080 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1084 #ifdef FLOATX80
1086 /*----------------------------------------------------------------------------
1087 | Returns the result of converting the 32-bit two's complement integer `a'
1088 | to the extended double-precision floating-point format. The conversion
1089 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1090 | Arithmetic.
1091 *----------------------------------------------------------------------------*/
1093 floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1095 flag zSign;
1096 uint32 absA;
1097 int8 shiftCount;
1098 bits64 zSig;
1100 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1101 zSign = ( a < 0 );
1102 absA = zSign ? - a : a;
1103 shiftCount = countLeadingZeros32( absA ) + 32;
1104 zSig = absA;
1105 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1109 #endif
1111 #ifdef FLOAT128
1113 /*----------------------------------------------------------------------------
1114 | Returns the result of converting the 32-bit two's complement integer `a' to
1115 | the quadruple-precision floating-point format. The conversion is performed
1116 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1117 *----------------------------------------------------------------------------*/
1119 float128 int32_to_float128( int32 a STATUS_PARAM )
1121 flag zSign;
1122 uint32 absA;
1123 int8 shiftCount;
1124 bits64 zSig0;
1126 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1127 zSign = ( a < 0 );
1128 absA = zSign ? - a : a;
1129 shiftCount = countLeadingZeros32( absA ) + 17;
1130 zSig0 = absA;
1131 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1135 #endif
1137 /*----------------------------------------------------------------------------
1138 | Returns the result of converting the 64-bit two's complement integer `a'
1139 | to the single-precision floating-point format. The conversion is performed
1140 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1141 *----------------------------------------------------------------------------*/
1143 float32 int64_to_float32( int64 a STATUS_PARAM )
1145 flag zSign;
1146 uint64 absA;
1147 int8 shiftCount;
1149 if ( a == 0 ) return float32_zero;
1150 zSign = ( a < 0 );
1151 absA = zSign ? - a : a;
1152 shiftCount = countLeadingZeros64( absA ) - 40;
1153 if ( 0 <= shiftCount ) {
1154 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1156 else {
1157 shiftCount += 7;
1158 if ( shiftCount < 0 ) {
1159 shift64RightJamming( absA, - shiftCount, &absA );
1161 else {
1162 absA <<= shiftCount;
1164 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1169 float32 uint64_to_float32( uint64 a STATUS_PARAM )
1171 int8 shiftCount;
1173 if ( a == 0 ) return float32_zero;
1174 shiftCount = countLeadingZeros64( a ) - 40;
1175 if ( 0 <= shiftCount ) {
1176 return packFloat32( 1 > 0, 0x95 - shiftCount, a<<shiftCount );
1178 else {
1179 shiftCount += 7;
1180 if ( shiftCount < 0 ) {
1181 shift64RightJamming( a, - shiftCount, &a );
1183 else {
1184 a <<= shiftCount;
1186 return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount, a STATUS_VAR );
1190 /*----------------------------------------------------------------------------
1191 | Returns the result of converting the 64-bit two's complement integer `a'
1192 | to the double-precision floating-point format. The conversion is performed
1193 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1194 *----------------------------------------------------------------------------*/
1196 float64 int64_to_float64( int64 a STATUS_PARAM )
1198 flag zSign;
1200 if ( a == 0 ) return float64_zero;
1201 if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1202 return packFloat64( 1, 0x43E, 0 );
1204 zSign = ( a < 0 );
1205 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1209 float64 uint64_to_float64( uint64 a STATUS_PARAM )
1211 if ( a == 0 ) return float64_zero;
1212 return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR );
1216 #ifdef FLOATX80
1218 /*----------------------------------------------------------------------------
1219 | Returns the result of converting the 64-bit two's complement integer `a'
1220 | to the extended double-precision floating-point format. The conversion
1221 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1222 | Arithmetic.
1223 *----------------------------------------------------------------------------*/
1225 floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1227 flag zSign;
1228 uint64 absA;
1229 int8 shiftCount;
1231 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1232 zSign = ( a < 0 );
1233 absA = zSign ? - a : a;
1234 shiftCount = countLeadingZeros64( absA );
1235 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1239 #endif
1241 #ifdef FLOAT128
1243 /*----------------------------------------------------------------------------
1244 | Returns the result of converting the 64-bit two's complement integer `a' to
1245 | the quadruple-precision floating-point format. The conversion is performed
1246 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1247 *----------------------------------------------------------------------------*/
1249 float128 int64_to_float128( int64 a STATUS_PARAM )
1251 flag zSign;
1252 uint64 absA;
1253 int8 shiftCount;
1254 int32 zExp;
1255 bits64 zSig0, zSig1;
1257 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1258 zSign = ( a < 0 );
1259 absA = zSign ? - a : a;
1260 shiftCount = countLeadingZeros64( absA ) + 49;
1261 zExp = 0x406E - shiftCount;
1262 if ( 64 <= shiftCount ) {
1263 zSig1 = 0;
1264 zSig0 = absA;
1265 shiftCount -= 64;
1267 else {
1268 zSig1 = absA;
1269 zSig0 = 0;
1271 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1272 return packFloat128( zSign, zExp, zSig0, zSig1 );
1276 #endif
1278 /*----------------------------------------------------------------------------
1279 | Returns the result of converting the single-precision floating-point value
1280 | `a' to the 32-bit two's complement integer format. The conversion is
1281 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1282 | Arithmetic---which means in particular that the conversion is rounded
1283 | according to the current rounding mode. If `a' is a NaN, the largest
1284 | positive integer is returned. Otherwise, if the conversion overflows, the
1285 | largest integer with the same sign as `a' is returned.
1286 *----------------------------------------------------------------------------*/
1288 int32 float32_to_int32( float32 a STATUS_PARAM )
1290 flag aSign;
1291 int16 aExp, shiftCount;
1292 bits32 aSig;
1293 bits64 aSig64;
1295 aSig = extractFloat32Frac( a );
1296 aExp = extractFloat32Exp( a );
1297 aSign = extractFloat32Sign( a );
1298 if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1299 if ( aExp ) aSig |= 0x00800000;
1300 shiftCount = 0xAF - aExp;
1301 aSig64 = aSig;
1302 aSig64 <<= 32;
1303 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1304 return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1308 /*----------------------------------------------------------------------------
1309 | Returns the result of converting the single-precision floating-point value
1310 | `a' to the 32-bit two's complement integer format. The conversion is
1311 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1312 | Arithmetic, except that the conversion is always rounded toward zero.
1313 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1314 | the conversion overflows, the largest integer with the same sign as `a' is
1315 | returned.
1316 *----------------------------------------------------------------------------*/
1318 int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1320 flag aSign;
1321 int16 aExp, shiftCount;
1322 bits32 aSig;
1323 int32 z;
1325 aSig = extractFloat32Frac( a );
1326 aExp = extractFloat32Exp( a );
1327 aSign = extractFloat32Sign( a );
1328 shiftCount = aExp - 0x9E;
1329 if ( 0 <= shiftCount ) {
1330 if ( float32_val(a) != 0xCF000000 ) {
1331 float_raise( float_flag_invalid STATUS_VAR);
1332 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1334 return (sbits32) 0x80000000;
1336 else if ( aExp <= 0x7E ) {
1337 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1338 return 0;
1340 aSig = ( aSig | 0x00800000 )<<8;
1341 z = aSig>>( - shiftCount );
1342 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1343 STATUS(float_exception_flags) |= float_flag_inexact;
1345 if ( aSign ) z = - z;
1346 return z;
1350 /*----------------------------------------------------------------------------
1351 | Returns the result of converting the single-precision floating-point value
1352 | `a' to the 64-bit two's complement integer format. The conversion is
1353 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1354 | Arithmetic---which means in particular that the conversion is rounded
1355 | according to the current rounding mode. If `a' is a NaN, the largest
1356 | positive integer is returned. Otherwise, if the conversion overflows, the
1357 | largest integer with the same sign as `a' is returned.
1358 *----------------------------------------------------------------------------*/
1360 int64 float32_to_int64( float32 a STATUS_PARAM )
1362 flag aSign;
1363 int16 aExp, shiftCount;
1364 bits32 aSig;
1365 bits64 aSig64, aSigExtra;
1367 aSig = extractFloat32Frac( a );
1368 aExp = extractFloat32Exp( a );
1369 aSign = extractFloat32Sign( a );
1370 shiftCount = 0xBE - aExp;
1371 if ( shiftCount < 0 ) {
1372 float_raise( float_flag_invalid STATUS_VAR);
1373 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1374 return LIT64( 0x7FFFFFFFFFFFFFFF );
1376 return (sbits64) LIT64( 0x8000000000000000 );
1378 if ( aExp ) aSig |= 0x00800000;
1379 aSig64 = aSig;
1380 aSig64 <<= 40;
1381 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1382 return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1386 /*----------------------------------------------------------------------------
1387 | Returns the result of converting the single-precision floating-point value
1388 | `a' to the 64-bit two's complement integer format. The conversion is
1389 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1390 | Arithmetic, except that the conversion is always rounded toward zero. If
1391 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1392 | conversion overflows, the largest integer with the same sign as `a' is
1393 | returned.
1394 *----------------------------------------------------------------------------*/
1396 int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1398 flag aSign;
1399 int16 aExp, shiftCount;
1400 bits32 aSig;
1401 bits64 aSig64;
1402 int64 z;
1404 aSig = extractFloat32Frac( a );
1405 aExp = extractFloat32Exp( a );
1406 aSign = extractFloat32Sign( a );
1407 shiftCount = aExp - 0xBE;
1408 if ( 0 <= shiftCount ) {
1409 if ( float32_val(a) != 0xDF000000 ) {
1410 float_raise( float_flag_invalid STATUS_VAR);
1411 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1412 return LIT64( 0x7FFFFFFFFFFFFFFF );
1415 return (sbits64) LIT64( 0x8000000000000000 );
1417 else if ( aExp <= 0x7E ) {
1418 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1419 return 0;
1421 aSig64 = aSig | 0x00800000;
1422 aSig64 <<= 40;
1423 z = aSig64>>( - shiftCount );
1424 if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1425 STATUS(float_exception_flags) |= float_flag_inexact;
1427 if ( aSign ) z = - z;
1428 return z;
1432 /*----------------------------------------------------------------------------
1433 | Returns the result of converting the single-precision floating-point value
1434 | `a' to the double-precision floating-point format. The conversion is
1435 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1436 | Arithmetic.
1437 *----------------------------------------------------------------------------*/
1439 float64 float32_to_float64( float32 a STATUS_PARAM )
1441 flag aSign;
1442 int16 aExp;
1443 bits32 aSig;
1445 aSig = extractFloat32Frac( a );
1446 aExp = extractFloat32Exp( a );
1447 aSign = extractFloat32Sign( a );
1448 if ( aExp == 0xFF ) {
1449 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ));
1450 return packFloat64( aSign, 0x7FF, 0 );
1452 if ( aExp == 0 ) {
1453 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1454 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1455 --aExp;
1457 return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1461 #ifdef FLOATX80
1463 /*----------------------------------------------------------------------------
1464 | Returns the result of converting the single-precision floating-point value
1465 | `a' to the extended double-precision floating-point format. The conversion
1466 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1467 | Arithmetic.
1468 *----------------------------------------------------------------------------*/
1470 floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1472 flag aSign;
1473 int16 aExp;
1474 bits32 aSig;
1476 aSig = extractFloat32Frac( a );
1477 aExp = extractFloat32Exp( a );
1478 aSign = extractFloat32Sign( a );
1479 if ( aExp == 0xFF ) {
1480 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) );
1481 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1483 if ( aExp == 0 ) {
1484 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1485 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1487 aSig |= 0x00800000;
1488 return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1492 #endif
1494 #ifdef FLOAT128
1496 /*----------------------------------------------------------------------------
1497 | Returns the result of converting the single-precision floating-point value
1498 | `a' to the double-precision floating-point format. The conversion is
1499 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1500 | Arithmetic.
1501 *----------------------------------------------------------------------------*/
1503 float128 float32_to_float128( float32 a STATUS_PARAM )
1505 flag aSign;
1506 int16 aExp;
1507 bits32 aSig;
1509 aSig = extractFloat32Frac( a );
1510 aExp = extractFloat32Exp( a );
1511 aSign = extractFloat32Sign( a );
1512 if ( aExp == 0xFF ) {
1513 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) );
1514 return packFloat128( aSign, 0x7FFF, 0, 0 );
1516 if ( aExp == 0 ) {
1517 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1518 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1519 --aExp;
1521 return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1525 #endif
1527 /*----------------------------------------------------------------------------
1528 | Rounds the single-precision floating-point value `a' to an integer, and
1529 | returns the result as a single-precision floating-point value. The
1530 | operation is performed according to the IEC/IEEE Standard for Binary
1531 | Floating-Point Arithmetic.
1532 *----------------------------------------------------------------------------*/
1534 float32 float32_round_to_int( float32 a STATUS_PARAM)
1536 flag aSign;
1537 int16 aExp;
1538 bits32 lastBitMask, roundBitsMask;
1539 int8 roundingMode;
1540 bits32 z;
1542 aExp = extractFloat32Exp( a );
1543 if ( 0x96 <= aExp ) {
1544 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1545 return propagateFloat32NaN( a, a STATUS_VAR );
1547 return a;
1549 if ( aExp <= 0x7E ) {
1550 if ( (bits32) ( float32_val(a)<<1 ) == 0 ) return a;
1551 STATUS(float_exception_flags) |= float_flag_inexact;
1552 aSign = extractFloat32Sign( a );
1553 switch ( STATUS(float_rounding_mode) ) {
1554 case float_round_nearest_even:
1555 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1556 return packFloat32( aSign, 0x7F, 0 );
1558 break;
1559 case float_round_down:
1560 return make_float32(aSign ? 0xBF800000 : 0);
1561 case float_round_up:
1562 return make_float32(aSign ? 0x80000000 : 0x3F800000);
1564 return packFloat32( aSign, 0, 0 );
1566 lastBitMask = 1;
1567 lastBitMask <<= 0x96 - aExp;
1568 roundBitsMask = lastBitMask - 1;
1569 z = float32_val(a);
1570 roundingMode = STATUS(float_rounding_mode);
1571 if ( roundingMode == float_round_nearest_even ) {
1572 z += lastBitMask>>1;
1573 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1575 else if ( roundingMode != float_round_to_zero ) {
1576 if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
1577 z += roundBitsMask;
1580 z &= ~ roundBitsMask;
1581 if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1582 return make_float32(z);
1586 /*----------------------------------------------------------------------------
1587 | Returns the result of adding the absolute values of the single-precision
1588 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1589 | before being returned. `zSign' is ignored if the result is a NaN.
1590 | The addition is performed according to the IEC/IEEE Standard for Binary
1591 | Floating-Point Arithmetic.
1592 *----------------------------------------------------------------------------*/
1594 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1596 int16 aExp, bExp, zExp;
1597 bits32 aSig, bSig, zSig;
1598 int16 expDiff;
1600 aSig = extractFloat32Frac( a );
1601 aExp = extractFloat32Exp( a );
1602 bSig = extractFloat32Frac( b );
1603 bExp = extractFloat32Exp( b );
1604 expDiff = aExp - bExp;
1605 aSig <<= 6;
1606 bSig <<= 6;
1607 if ( 0 < expDiff ) {
1608 if ( aExp == 0xFF ) {
1609 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1610 return a;
1612 if ( bExp == 0 ) {
1613 --expDiff;
1615 else {
1616 bSig |= 0x20000000;
1618 shift32RightJamming( bSig, expDiff, &bSig );
1619 zExp = aExp;
1621 else if ( expDiff < 0 ) {
1622 if ( bExp == 0xFF ) {
1623 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1624 return packFloat32( zSign, 0xFF, 0 );
1626 if ( aExp == 0 ) {
1627 ++expDiff;
1629 else {
1630 aSig |= 0x20000000;
1632 shift32RightJamming( aSig, - expDiff, &aSig );
1633 zExp = bExp;
1635 else {
1636 if ( aExp == 0xFF ) {
1637 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1638 return a;
1640 if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1641 zSig = 0x40000000 + aSig + bSig;
1642 zExp = aExp;
1643 goto roundAndPack;
1645 aSig |= 0x20000000;
1646 zSig = ( aSig + bSig )<<1;
1647 --zExp;
1648 if ( (sbits32) zSig < 0 ) {
1649 zSig = aSig + bSig;
1650 ++zExp;
1652 roundAndPack:
1653 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1657 /*----------------------------------------------------------------------------
1658 | Returns the result of subtracting the absolute values of the single-
1659 | precision floating-point values `a' and `b'. If `zSign' is 1, the
1660 | difference is negated before being returned. `zSign' is ignored if the
1661 | result is a NaN. The subtraction is performed according to the IEC/IEEE
1662 | Standard for Binary Floating-Point Arithmetic.
1663 *----------------------------------------------------------------------------*/
1665 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1667 int16 aExp, bExp, zExp;
1668 bits32 aSig, bSig, zSig;
1669 int16 expDiff;
1671 aSig = extractFloat32Frac( a );
1672 aExp = extractFloat32Exp( a );
1673 bSig = extractFloat32Frac( b );
1674 bExp = extractFloat32Exp( b );
1675 expDiff = aExp - bExp;
1676 aSig <<= 7;
1677 bSig <<= 7;
1678 if ( 0 < expDiff ) goto aExpBigger;
1679 if ( expDiff < 0 ) goto bExpBigger;
1680 if ( aExp == 0xFF ) {
1681 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1682 float_raise( float_flag_invalid STATUS_VAR);
1683 return float32_default_nan;
1685 if ( aExp == 0 ) {
1686 aExp = 1;
1687 bExp = 1;
1689 if ( bSig < aSig ) goto aBigger;
1690 if ( aSig < bSig ) goto bBigger;
1691 return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1692 bExpBigger:
1693 if ( bExp == 0xFF ) {
1694 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1695 return packFloat32( zSign ^ 1, 0xFF, 0 );
1697 if ( aExp == 0 ) {
1698 ++expDiff;
1700 else {
1701 aSig |= 0x40000000;
1703 shift32RightJamming( aSig, - expDiff, &aSig );
1704 bSig |= 0x40000000;
1705 bBigger:
1706 zSig = bSig - aSig;
1707 zExp = bExp;
1708 zSign ^= 1;
1709 goto normalizeRoundAndPack;
1710 aExpBigger:
1711 if ( aExp == 0xFF ) {
1712 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1713 return a;
1715 if ( bExp == 0 ) {
1716 --expDiff;
1718 else {
1719 bSig |= 0x40000000;
1721 shift32RightJamming( bSig, expDiff, &bSig );
1722 aSig |= 0x40000000;
1723 aBigger:
1724 zSig = aSig - bSig;
1725 zExp = aExp;
1726 normalizeRoundAndPack:
1727 --zExp;
1728 return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1732 /*----------------------------------------------------------------------------
1733 | Returns the result of adding the single-precision floating-point values `a'
1734 | and `b'. The operation is performed according to the IEC/IEEE Standard for
1735 | Binary Floating-Point Arithmetic.
1736 *----------------------------------------------------------------------------*/
1738 float32 float32_add( float32 a, float32 b STATUS_PARAM )
1740 flag aSign, bSign;
1742 aSign = extractFloat32Sign( a );
1743 bSign = extractFloat32Sign( b );
1744 if ( aSign == bSign ) {
1745 return addFloat32Sigs( a, b, aSign STATUS_VAR);
1747 else {
1748 return subFloat32Sigs( a, b, aSign STATUS_VAR );
1753 /*----------------------------------------------------------------------------
1754 | Returns the result of subtracting the single-precision floating-point values
1755 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1756 | for Binary Floating-Point Arithmetic.
1757 *----------------------------------------------------------------------------*/
1759 float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1761 flag aSign, bSign;
1763 aSign = extractFloat32Sign( a );
1764 bSign = extractFloat32Sign( b );
1765 if ( aSign == bSign ) {
1766 return subFloat32Sigs( a, b, aSign STATUS_VAR );
1768 else {
1769 return addFloat32Sigs( a, b, aSign STATUS_VAR );
1774 /*----------------------------------------------------------------------------
1775 | Returns the result of multiplying the single-precision floating-point values
1776 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1777 | for Binary Floating-Point Arithmetic.
1778 *----------------------------------------------------------------------------*/
1780 float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1782 flag aSign, bSign, zSign;
1783 int16 aExp, bExp, zExp;
1784 bits32 aSig, bSig;
1785 bits64 zSig64;
1786 bits32 zSig;
1788 aSig = extractFloat32Frac( a );
1789 aExp = extractFloat32Exp( a );
1790 aSign = extractFloat32Sign( a );
1791 bSig = extractFloat32Frac( b );
1792 bExp = extractFloat32Exp( b );
1793 bSign = extractFloat32Sign( b );
1794 zSign = aSign ^ bSign;
1795 if ( aExp == 0xFF ) {
1796 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1797 return propagateFloat32NaN( a, b STATUS_VAR );
1799 if ( ( bExp | bSig ) == 0 ) {
1800 float_raise( float_flag_invalid STATUS_VAR);
1801 return float32_default_nan;
1803 return packFloat32( zSign, 0xFF, 0 );
1805 if ( bExp == 0xFF ) {
1806 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1807 if ( ( aExp | aSig ) == 0 ) {
1808 float_raise( float_flag_invalid STATUS_VAR);
1809 return float32_default_nan;
1811 return packFloat32( zSign, 0xFF, 0 );
1813 if ( aExp == 0 ) {
1814 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1815 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1817 if ( bExp == 0 ) {
1818 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1819 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1821 zExp = aExp + bExp - 0x7F;
1822 aSig = ( aSig | 0x00800000 )<<7;
1823 bSig = ( bSig | 0x00800000 )<<8;
1824 shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1825 zSig = zSig64;
1826 if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1827 zSig <<= 1;
1828 --zExp;
1830 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1834 /*----------------------------------------------------------------------------
1835 | Returns the result of dividing the single-precision floating-point value `a'
1836 | by the corresponding value `b'. The operation is performed according to the
1837 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1838 *----------------------------------------------------------------------------*/
1840 float32 float32_div( float32 a, float32 b STATUS_PARAM )
1842 flag aSign, bSign, zSign;
1843 int16 aExp, bExp, zExp;
1844 bits32 aSig, bSig, zSig;
1846 aSig = extractFloat32Frac( a );
1847 aExp = extractFloat32Exp( a );
1848 aSign = extractFloat32Sign( a );
1849 bSig = extractFloat32Frac( b );
1850 bExp = extractFloat32Exp( b );
1851 bSign = extractFloat32Sign( b );
1852 zSign = aSign ^ bSign;
1853 if ( aExp == 0xFF ) {
1854 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1855 if ( bExp == 0xFF ) {
1856 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1857 float_raise( float_flag_invalid STATUS_VAR);
1858 return float32_default_nan;
1860 return packFloat32( zSign, 0xFF, 0 );
1862 if ( bExp == 0xFF ) {
1863 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1864 return packFloat32( zSign, 0, 0 );
1866 if ( bExp == 0 ) {
1867 if ( bSig == 0 ) {
1868 if ( ( aExp | aSig ) == 0 ) {
1869 float_raise( float_flag_invalid STATUS_VAR);
1870 return float32_default_nan;
1872 float_raise( float_flag_divbyzero STATUS_VAR);
1873 return packFloat32( zSign, 0xFF, 0 );
1875 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1877 if ( aExp == 0 ) {
1878 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1879 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1881 zExp = aExp - bExp + 0x7D;
1882 aSig = ( aSig | 0x00800000 )<<7;
1883 bSig = ( bSig | 0x00800000 )<<8;
1884 if ( bSig <= ( aSig + aSig ) ) {
1885 aSig >>= 1;
1886 ++zExp;
1888 zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1889 if ( ( zSig & 0x3F ) == 0 ) {
1890 zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
1892 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1896 /*----------------------------------------------------------------------------
1897 | Returns the remainder of the single-precision floating-point value `a'
1898 | with respect to the corresponding value `b'. The operation is performed
1899 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1900 *----------------------------------------------------------------------------*/
1902 float32 float32_rem( float32 a, float32 b STATUS_PARAM )
1904 flag aSign, bSign, zSign;
1905 int16 aExp, bExp, expDiff;
1906 bits32 aSig, bSig;
1907 bits32 q;
1908 bits64 aSig64, bSig64, q64;
1909 bits32 alternateASig;
1910 sbits32 sigMean;
1912 aSig = extractFloat32Frac( a );
1913 aExp = extractFloat32Exp( a );
1914 aSign = extractFloat32Sign( a );
1915 bSig = extractFloat32Frac( b );
1916 bExp = extractFloat32Exp( b );
1917 bSign = extractFloat32Sign( b );
1918 if ( aExp == 0xFF ) {
1919 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1920 return propagateFloat32NaN( a, b STATUS_VAR );
1922 float_raise( float_flag_invalid STATUS_VAR);
1923 return float32_default_nan;
1925 if ( bExp == 0xFF ) {
1926 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1927 return a;
1929 if ( bExp == 0 ) {
1930 if ( bSig == 0 ) {
1931 float_raise( float_flag_invalid STATUS_VAR);
1932 return float32_default_nan;
1934 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1936 if ( aExp == 0 ) {
1937 if ( aSig == 0 ) return a;
1938 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1940 expDiff = aExp - bExp;
1941 aSig |= 0x00800000;
1942 bSig |= 0x00800000;
1943 if ( expDiff < 32 ) {
1944 aSig <<= 8;
1945 bSig <<= 8;
1946 if ( expDiff < 0 ) {
1947 if ( expDiff < -1 ) return a;
1948 aSig >>= 1;
1950 q = ( bSig <= aSig );
1951 if ( q ) aSig -= bSig;
1952 if ( 0 < expDiff ) {
1953 q = ( ( (bits64) aSig )<<32 ) / bSig;
1954 q >>= 32 - expDiff;
1955 bSig >>= 2;
1956 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1958 else {
1959 aSig >>= 2;
1960 bSig >>= 2;
1963 else {
1964 if ( bSig <= aSig ) aSig -= bSig;
1965 aSig64 = ( (bits64) aSig )<<40;
1966 bSig64 = ( (bits64) bSig )<<40;
1967 expDiff -= 64;
1968 while ( 0 < expDiff ) {
1969 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1970 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1971 aSig64 = - ( ( bSig * q64 )<<38 );
1972 expDiff -= 62;
1974 expDiff += 64;
1975 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1976 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1977 q = q64>>( 64 - expDiff );
1978 bSig <<= 6;
1979 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
1981 do {
1982 alternateASig = aSig;
1983 ++q;
1984 aSig -= bSig;
1985 } while ( 0 <= (sbits32) aSig );
1986 sigMean = aSig + alternateASig;
1987 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
1988 aSig = alternateASig;
1990 zSign = ( (sbits32) aSig < 0 );
1991 if ( zSign ) aSig = - aSig;
1992 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
1996 /*----------------------------------------------------------------------------
1997 | Returns the square root of the single-precision floating-point value `a'.
1998 | The operation is performed according to the IEC/IEEE Standard for Binary
1999 | Floating-Point Arithmetic.
2000 *----------------------------------------------------------------------------*/
2002 float32 float32_sqrt( float32 a STATUS_PARAM )
2004 flag aSign;
2005 int16 aExp, zExp;
2006 bits32 aSig, zSig;
2007 bits64 rem, term;
2009 aSig = extractFloat32Frac( a );
2010 aExp = extractFloat32Exp( a );
2011 aSign = extractFloat32Sign( a );
2012 if ( aExp == 0xFF ) {
2013 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2014 if ( ! aSign ) return a;
2015 float_raise( float_flag_invalid STATUS_VAR);
2016 return float32_default_nan;
2018 if ( aSign ) {
2019 if ( ( aExp | aSig ) == 0 ) return a;
2020 float_raise( float_flag_invalid STATUS_VAR);
2021 return float32_default_nan;
2023 if ( aExp == 0 ) {
2024 if ( aSig == 0 ) return float32_zero;
2025 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2027 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2028 aSig = ( aSig | 0x00800000 )<<8;
2029 zSig = estimateSqrt32( aExp, aSig ) + 2;
2030 if ( ( zSig & 0x7F ) <= 5 ) {
2031 if ( zSig < 2 ) {
2032 zSig = 0x7FFFFFFF;
2033 goto roundAndPack;
2035 aSig >>= aExp & 1;
2036 term = ( (bits64) zSig ) * zSig;
2037 rem = ( ( (bits64) aSig )<<32 ) - term;
2038 while ( (sbits64) rem < 0 ) {
2039 --zSig;
2040 rem += ( ( (bits64) zSig )<<1 ) | 1;
2042 zSig |= ( rem != 0 );
2044 shift32RightJamming( zSig, 1, &zSig );
2045 roundAndPack:
2046 return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2050 /*----------------------------------------------------------------------------
2051 | Returns 1 if the single-precision floating-point value `a' is equal to
2052 | the corresponding value `b', and 0 otherwise. The comparison is performed
2053 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2054 *----------------------------------------------------------------------------*/
2056 int float32_eq( float32 a, float32 b STATUS_PARAM )
2059 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2060 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2062 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2063 float_raise( float_flag_invalid STATUS_VAR);
2065 return 0;
2067 return ( float32_val(a) == float32_val(b) ) ||
2068 ( (bits32) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2072 /*----------------------------------------------------------------------------
2073 | Returns 1 if the single-precision floating-point value `a' is less than
2074 | or equal to the corresponding value `b', and 0 otherwise. The comparison
2075 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2076 | Arithmetic.
2077 *----------------------------------------------------------------------------*/
2079 int float32_le( float32 a, float32 b STATUS_PARAM )
2081 flag aSign, bSign;
2082 bits32 av, bv;
2084 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2085 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2087 float_raise( float_flag_invalid STATUS_VAR);
2088 return 0;
2090 aSign = extractFloat32Sign( a );
2091 bSign = extractFloat32Sign( b );
2092 av = float32_val(a);
2093 bv = float32_val(b);
2094 if ( aSign != bSign ) return aSign || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2095 return ( av == bv ) || ( aSign ^ ( av < bv ) );
2099 /*----------------------------------------------------------------------------
2100 | Returns 1 if the single-precision floating-point value `a' is less than
2101 | the corresponding value `b', and 0 otherwise. The comparison is performed
2102 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2103 *----------------------------------------------------------------------------*/
2105 int float32_lt( float32 a, float32 b STATUS_PARAM )
2107 flag aSign, bSign;
2108 bits32 av, bv;
2110 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2111 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2113 float_raise( float_flag_invalid STATUS_VAR);
2114 return 0;
2116 aSign = extractFloat32Sign( a );
2117 bSign = extractFloat32Sign( b );
2118 av = float32_val(a);
2119 bv = float32_val(b);
2120 if ( aSign != bSign ) return aSign && ( (bits32) ( ( av | bv )<<1 ) != 0 );
2121 return ( av != bv ) && ( aSign ^ ( av < bv ) );
2125 /*----------------------------------------------------------------------------
2126 | Returns 1 if the single-precision floating-point value `a' is equal to
2127 | the corresponding value `b', and 0 otherwise. The invalid exception is
2128 | raised if either operand is a NaN. Otherwise, the comparison is performed
2129 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2130 *----------------------------------------------------------------------------*/
2132 int float32_eq_signaling( float32 a, float32 b STATUS_PARAM )
2134 bits32 av, bv;
2136 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2137 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2139 float_raise( float_flag_invalid STATUS_VAR);
2140 return 0;
2142 av = float32_val(a);
2143 bv = float32_val(b);
2144 return ( av == bv ) || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2148 /*----------------------------------------------------------------------------
2149 | Returns 1 if the single-precision floating-point value `a' is less than or
2150 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2151 | cause an exception. Otherwise, the comparison is performed according to the
2152 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2153 *----------------------------------------------------------------------------*/
2155 int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2157 flag aSign, bSign;
2158 bits32 av, bv;
2160 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2161 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2163 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2164 float_raise( float_flag_invalid STATUS_VAR);
2166 return 0;
2168 aSign = extractFloat32Sign( a );
2169 bSign = extractFloat32Sign( b );
2170 av = float32_val(a);
2171 bv = float32_val(b);
2172 if ( aSign != bSign ) return aSign || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2173 return ( av == bv ) || ( aSign ^ ( av < bv ) );
2177 /*----------------------------------------------------------------------------
2178 | Returns 1 if the single-precision floating-point value `a' is less than
2179 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2180 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
2181 | Standard for Binary Floating-Point Arithmetic.
2182 *----------------------------------------------------------------------------*/
2184 int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2186 flag aSign, bSign;
2187 bits32 av, bv;
2189 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2190 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2192 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2193 float_raise( float_flag_invalid STATUS_VAR);
2195 return 0;
2197 aSign = extractFloat32Sign( a );
2198 bSign = extractFloat32Sign( b );
2199 av = float32_val(a);
2200 bv = float32_val(b);
2201 if ( aSign != bSign ) return aSign && ( (bits32) ( ( av | bv )<<1 ) != 0 );
2202 return ( av != bv ) && ( aSign ^ ( av < bv ) );
2206 /*----------------------------------------------------------------------------
2207 | Returns the result of converting the double-precision floating-point value
2208 | `a' to the 32-bit two's complement integer format. The conversion is
2209 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2210 | Arithmetic---which means in particular that the conversion is rounded
2211 | according to the current rounding mode. If `a' is a NaN, the largest
2212 | positive integer is returned. Otherwise, if the conversion overflows, the
2213 | largest integer with the same sign as `a' is returned.
2214 *----------------------------------------------------------------------------*/
2216 int32 float64_to_int32( float64 a STATUS_PARAM )
2218 flag aSign;
2219 int16 aExp, shiftCount;
2220 bits64 aSig;
2222 aSig = extractFloat64Frac( a );
2223 aExp = extractFloat64Exp( a );
2224 aSign = extractFloat64Sign( a );
2225 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2226 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2227 shiftCount = 0x42C - aExp;
2228 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2229 return roundAndPackInt32( aSign, aSig STATUS_VAR );
2233 /*----------------------------------------------------------------------------
2234 | Returns the result of converting the double-precision floating-point value
2235 | `a' to the 32-bit two's complement integer format. The conversion is
2236 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2237 | Arithmetic, except that the conversion is always rounded toward zero.
2238 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2239 | the conversion overflows, the largest integer with the same sign as `a' is
2240 | returned.
2241 *----------------------------------------------------------------------------*/
2243 int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2245 flag aSign;
2246 int16 aExp, shiftCount;
2247 bits64 aSig, savedASig;
2248 int32 z;
2250 aSig = extractFloat64Frac( a );
2251 aExp = extractFloat64Exp( a );
2252 aSign = extractFloat64Sign( a );
2253 if ( 0x41E < aExp ) {
2254 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2255 goto invalid;
2257 else if ( aExp < 0x3FF ) {
2258 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2259 return 0;
2261 aSig |= LIT64( 0x0010000000000000 );
2262 shiftCount = 0x433 - aExp;
2263 savedASig = aSig;
2264 aSig >>= shiftCount;
2265 z = aSig;
2266 if ( aSign ) z = - z;
2267 if ( ( z < 0 ) ^ aSign ) {
2268 invalid:
2269 float_raise( float_flag_invalid STATUS_VAR);
2270 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2272 if ( ( aSig<<shiftCount ) != savedASig ) {
2273 STATUS(float_exception_flags) |= float_flag_inexact;
2275 return z;
2279 /*----------------------------------------------------------------------------
2280 | Returns the result of converting the double-precision floating-point value
2281 | `a' to the 64-bit two's complement integer format. The conversion is
2282 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2283 | Arithmetic---which means in particular that the conversion is rounded
2284 | according to the current rounding mode. If `a' is a NaN, the largest
2285 | positive integer is returned. Otherwise, if the conversion overflows, the
2286 | largest integer with the same sign as `a' is returned.
2287 *----------------------------------------------------------------------------*/
2289 int64 float64_to_int64( float64 a STATUS_PARAM )
2291 flag aSign;
2292 int16 aExp, shiftCount;
2293 bits64 aSig, aSigExtra;
2295 aSig = extractFloat64Frac( a );
2296 aExp = extractFloat64Exp( a );
2297 aSign = extractFloat64Sign( a );
2298 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2299 shiftCount = 0x433 - aExp;
2300 if ( shiftCount <= 0 ) {
2301 if ( 0x43E < aExp ) {
2302 float_raise( float_flag_invalid STATUS_VAR);
2303 if ( ! aSign
2304 || ( ( aExp == 0x7FF )
2305 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2307 return LIT64( 0x7FFFFFFFFFFFFFFF );
2309 return (sbits64) LIT64( 0x8000000000000000 );
2311 aSigExtra = 0;
2312 aSig <<= - shiftCount;
2314 else {
2315 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2317 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2321 /*----------------------------------------------------------------------------
2322 | Returns the result of converting the double-precision floating-point value
2323 | `a' to the 64-bit two's complement integer format. The conversion is
2324 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2325 | Arithmetic, except that the conversion is always rounded toward zero.
2326 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2327 | the conversion overflows, the largest integer with the same sign as `a' is
2328 | returned.
2329 *----------------------------------------------------------------------------*/
2331 int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2333 flag aSign;
2334 int16 aExp, shiftCount;
2335 bits64 aSig;
2336 int64 z;
2338 aSig = extractFloat64Frac( a );
2339 aExp = extractFloat64Exp( a );
2340 aSign = extractFloat64Sign( a );
2341 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2342 shiftCount = aExp - 0x433;
2343 if ( 0 <= shiftCount ) {
2344 if ( 0x43E <= aExp ) {
2345 if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
2346 float_raise( float_flag_invalid STATUS_VAR);
2347 if ( ! aSign
2348 || ( ( aExp == 0x7FF )
2349 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2351 return LIT64( 0x7FFFFFFFFFFFFFFF );
2354 return (sbits64) LIT64( 0x8000000000000000 );
2356 z = aSig<<shiftCount;
2358 else {
2359 if ( aExp < 0x3FE ) {
2360 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2361 return 0;
2363 z = aSig>>( - shiftCount );
2364 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2365 STATUS(float_exception_flags) |= float_flag_inexact;
2368 if ( aSign ) z = - z;
2369 return z;
2373 /*----------------------------------------------------------------------------
2374 | Returns the result of converting the double-precision floating-point value
2375 | `a' to the single-precision floating-point format. The conversion is
2376 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2377 | Arithmetic.
2378 *----------------------------------------------------------------------------*/
2380 float32 float64_to_float32( float64 a STATUS_PARAM )
2382 flag aSign;
2383 int16 aExp;
2384 bits64 aSig;
2385 bits32 zSig;
2387 aSig = extractFloat64Frac( a );
2388 aExp = extractFloat64Exp( a );
2389 aSign = extractFloat64Sign( a );
2390 if ( aExp == 0x7FF ) {
2391 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) );
2392 return packFloat32( aSign, 0xFF, 0 );
2394 shift64RightJamming( aSig, 22, &aSig );
2395 zSig = aSig;
2396 if ( aExp || zSig ) {
2397 zSig |= 0x40000000;
2398 aExp -= 0x381;
2400 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2404 #ifdef FLOATX80
2406 /*----------------------------------------------------------------------------
2407 | Returns the result of converting the double-precision floating-point value
2408 | `a' to the extended double-precision floating-point format. The conversion
2409 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2410 | Arithmetic.
2411 *----------------------------------------------------------------------------*/
2413 floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
2415 flag aSign;
2416 int16 aExp;
2417 bits64 aSig;
2419 aSig = extractFloat64Frac( a );
2420 aExp = extractFloat64Exp( a );
2421 aSign = extractFloat64Sign( a );
2422 if ( aExp == 0x7FF ) {
2423 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) );
2424 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2426 if ( aExp == 0 ) {
2427 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2428 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2430 return
2431 packFloatx80(
2432 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2436 #endif
2438 #ifdef FLOAT128
2440 /*----------------------------------------------------------------------------
2441 | Returns the result of converting the double-precision floating-point value
2442 | `a' to the quadruple-precision floating-point format. The conversion is
2443 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2444 | Arithmetic.
2445 *----------------------------------------------------------------------------*/
2447 float128 float64_to_float128( float64 a STATUS_PARAM )
2449 flag aSign;
2450 int16 aExp;
2451 bits64 aSig, zSig0, zSig1;
2453 aSig = extractFloat64Frac( a );
2454 aExp = extractFloat64Exp( a );
2455 aSign = extractFloat64Sign( a );
2456 if ( aExp == 0x7FF ) {
2457 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) );
2458 return packFloat128( aSign, 0x7FFF, 0, 0 );
2460 if ( aExp == 0 ) {
2461 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2462 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2463 --aExp;
2465 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2466 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2470 #endif
2472 /*----------------------------------------------------------------------------
2473 | Rounds the double-precision floating-point value `a' to an integer, and
2474 | returns the result as a double-precision floating-point value. The
2475 | operation is performed according to the IEC/IEEE Standard for Binary
2476 | Floating-Point Arithmetic.
2477 *----------------------------------------------------------------------------*/
2479 float64 float64_round_to_int( float64 a STATUS_PARAM )
2481 flag aSign;
2482 int16 aExp;
2483 bits64 lastBitMask, roundBitsMask;
2484 int8 roundingMode;
2485 bits64 z;
2487 aExp = extractFloat64Exp( a );
2488 if ( 0x433 <= aExp ) {
2489 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2490 return propagateFloat64NaN( a, a STATUS_VAR );
2492 return a;
2494 if ( aExp < 0x3FF ) {
2495 if ( (bits64) ( float64_val(a)<<1 ) == 0 ) return a;
2496 STATUS(float_exception_flags) |= float_flag_inexact;
2497 aSign = extractFloat64Sign( a );
2498 switch ( STATUS(float_rounding_mode) ) {
2499 case float_round_nearest_even:
2500 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2501 return packFloat64( aSign, 0x3FF, 0 );
2503 break;
2504 case float_round_down:
2505 return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
2506 case float_round_up:
2507 return make_float64(
2508 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
2510 return packFloat64( aSign, 0, 0 );
2512 lastBitMask = 1;
2513 lastBitMask <<= 0x433 - aExp;
2514 roundBitsMask = lastBitMask - 1;
2515 z = float64_val(a);
2516 roundingMode = STATUS(float_rounding_mode);
2517 if ( roundingMode == float_round_nearest_even ) {
2518 z += lastBitMask>>1;
2519 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2521 else if ( roundingMode != float_round_to_zero ) {
2522 if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
2523 z += roundBitsMask;
2526 z &= ~ roundBitsMask;
2527 if ( z != float64_val(a) )
2528 STATUS(float_exception_flags) |= float_flag_inexact;
2529 return make_float64(z);
2533 float64 float64_trunc_to_int( float64 a STATUS_PARAM)
2535 int oldmode;
2536 float64 res;
2537 oldmode = STATUS(float_rounding_mode);
2538 STATUS(float_rounding_mode) = float_round_to_zero;
2539 res = float64_round_to_int(a STATUS_VAR);
2540 STATUS(float_rounding_mode) = oldmode;
2541 return res;
2544 /*----------------------------------------------------------------------------
2545 | Returns the result of adding the absolute values of the double-precision
2546 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
2547 | before being returned. `zSign' is ignored if the result is a NaN.
2548 | The addition is performed according to the IEC/IEEE Standard for Binary
2549 | Floating-Point Arithmetic.
2550 *----------------------------------------------------------------------------*/
2552 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
2554 int16 aExp, bExp, zExp;
2555 bits64 aSig, bSig, zSig;
2556 int16 expDiff;
2558 aSig = extractFloat64Frac( a );
2559 aExp = extractFloat64Exp( a );
2560 bSig = extractFloat64Frac( b );
2561 bExp = extractFloat64Exp( b );
2562 expDiff = aExp - bExp;
2563 aSig <<= 9;
2564 bSig <<= 9;
2565 if ( 0 < expDiff ) {
2566 if ( aExp == 0x7FF ) {
2567 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2568 return a;
2570 if ( bExp == 0 ) {
2571 --expDiff;
2573 else {
2574 bSig |= LIT64( 0x2000000000000000 );
2576 shift64RightJamming( bSig, expDiff, &bSig );
2577 zExp = aExp;
2579 else if ( expDiff < 0 ) {
2580 if ( bExp == 0x7FF ) {
2581 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2582 return packFloat64( zSign, 0x7FF, 0 );
2584 if ( aExp == 0 ) {
2585 ++expDiff;
2587 else {
2588 aSig |= LIT64( 0x2000000000000000 );
2590 shift64RightJamming( aSig, - expDiff, &aSig );
2591 zExp = bExp;
2593 else {
2594 if ( aExp == 0x7FF ) {
2595 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2596 return a;
2598 if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2599 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2600 zExp = aExp;
2601 goto roundAndPack;
2603 aSig |= LIT64( 0x2000000000000000 );
2604 zSig = ( aSig + bSig )<<1;
2605 --zExp;
2606 if ( (sbits64) zSig < 0 ) {
2607 zSig = aSig + bSig;
2608 ++zExp;
2610 roundAndPack:
2611 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2615 /*----------------------------------------------------------------------------
2616 | Returns the result of subtracting the absolute values of the double-
2617 | precision floating-point values `a' and `b'. If `zSign' is 1, the
2618 | difference is negated before being returned. `zSign' is ignored if the
2619 | result is a NaN. The subtraction is performed according to the IEC/IEEE
2620 | Standard for Binary Floating-Point Arithmetic.
2621 *----------------------------------------------------------------------------*/
2623 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
2625 int16 aExp, bExp, zExp;
2626 bits64 aSig, bSig, zSig;
2627 int16 expDiff;
2629 aSig = extractFloat64Frac( a );
2630 aExp = extractFloat64Exp( a );
2631 bSig = extractFloat64Frac( b );
2632 bExp = extractFloat64Exp( b );
2633 expDiff = aExp - bExp;
2634 aSig <<= 10;
2635 bSig <<= 10;
2636 if ( 0 < expDiff ) goto aExpBigger;
2637 if ( expDiff < 0 ) goto bExpBigger;
2638 if ( aExp == 0x7FF ) {
2639 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2640 float_raise( float_flag_invalid STATUS_VAR);
2641 return float64_default_nan;
2643 if ( aExp == 0 ) {
2644 aExp = 1;
2645 bExp = 1;
2647 if ( bSig < aSig ) goto aBigger;
2648 if ( aSig < bSig ) goto bBigger;
2649 return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
2650 bExpBigger:
2651 if ( bExp == 0x7FF ) {
2652 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2653 return packFloat64( zSign ^ 1, 0x7FF, 0 );
2655 if ( aExp == 0 ) {
2656 ++expDiff;
2658 else {
2659 aSig |= LIT64( 0x4000000000000000 );
2661 shift64RightJamming( aSig, - expDiff, &aSig );
2662 bSig |= LIT64( 0x4000000000000000 );
2663 bBigger:
2664 zSig = bSig - aSig;
2665 zExp = bExp;
2666 zSign ^= 1;
2667 goto normalizeRoundAndPack;
2668 aExpBigger:
2669 if ( aExp == 0x7FF ) {
2670 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2671 return a;
2673 if ( bExp == 0 ) {
2674 --expDiff;
2676 else {
2677 bSig |= LIT64( 0x4000000000000000 );
2679 shift64RightJamming( bSig, expDiff, &bSig );
2680 aSig |= LIT64( 0x4000000000000000 );
2681 aBigger:
2682 zSig = aSig - bSig;
2683 zExp = aExp;
2684 normalizeRoundAndPack:
2685 --zExp;
2686 return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2690 /*----------------------------------------------------------------------------
2691 | Returns the result of adding the double-precision floating-point values `a'
2692 | and `b'. The operation is performed according to the IEC/IEEE Standard for
2693 | Binary Floating-Point Arithmetic.
2694 *----------------------------------------------------------------------------*/
2696 float64 float64_add( float64 a, float64 b STATUS_PARAM )
2698 flag aSign, bSign;
2700 aSign = extractFloat64Sign( a );
2701 bSign = extractFloat64Sign( b );
2702 if ( aSign == bSign ) {
2703 return addFloat64Sigs( a, b, aSign STATUS_VAR );
2705 else {
2706 return subFloat64Sigs( a, b, aSign STATUS_VAR );
2711 /*----------------------------------------------------------------------------
2712 | Returns the result of subtracting the double-precision floating-point values
2713 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2714 | for Binary Floating-Point Arithmetic.
2715 *----------------------------------------------------------------------------*/
2717 float64 float64_sub( float64 a, float64 b STATUS_PARAM )
2719 flag aSign, bSign;
2721 aSign = extractFloat64Sign( a );
2722 bSign = extractFloat64Sign( b );
2723 if ( aSign == bSign ) {
2724 return subFloat64Sigs( a, b, aSign STATUS_VAR );
2726 else {
2727 return addFloat64Sigs( a, b, aSign STATUS_VAR );
2732 /*----------------------------------------------------------------------------
2733 | Returns the result of multiplying the double-precision floating-point values
2734 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2735 | for Binary Floating-Point Arithmetic.
2736 *----------------------------------------------------------------------------*/
2738 float64 float64_mul( float64 a, float64 b STATUS_PARAM )
2740 flag aSign, bSign, zSign;
2741 int16 aExp, bExp, zExp;
2742 bits64 aSig, bSig, zSig0, zSig1;
2744 aSig = extractFloat64Frac( a );
2745 aExp = extractFloat64Exp( a );
2746 aSign = extractFloat64Sign( a );
2747 bSig = extractFloat64Frac( b );
2748 bExp = extractFloat64Exp( b );
2749 bSign = extractFloat64Sign( b );
2750 zSign = aSign ^ bSign;
2751 if ( aExp == 0x7FF ) {
2752 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2753 return propagateFloat64NaN( a, b STATUS_VAR );
2755 if ( ( bExp | bSig ) == 0 ) {
2756 float_raise( float_flag_invalid STATUS_VAR);
2757 return float64_default_nan;
2759 return packFloat64( zSign, 0x7FF, 0 );
2761 if ( bExp == 0x7FF ) {
2762 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2763 if ( ( aExp | aSig ) == 0 ) {
2764 float_raise( float_flag_invalid STATUS_VAR);
2765 return float64_default_nan;
2767 return packFloat64( zSign, 0x7FF, 0 );
2769 if ( aExp == 0 ) {
2770 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2771 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2773 if ( bExp == 0 ) {
2774 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2775 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2777 zExp = aExp + bExp - 0x3FF;
2778 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2779 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2780 mul64To128( aSig, bSig, &zSig0, &zSig1 );
2781 zSig0 |= ( zSig1 != 0 );
2782 if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2783 zSig0 <<= 1;
2784 --zExp;
2786 return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
2790 /*----------------------------------------------------------------------------
2791 | Returns the result of dividing the double-precision floating-point value `a'
2792 | by the corresponding value `b'. The operation is performed according to
2793 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2794 *----------------------------------------------------------------------------*/
2796 float64 float64_div( float64 a, float64 b STATUS_PARAM )
2798 flag aSign, bSign, zSign;
2799 int16 aExp, bExp, zExp;
2800 bits64 aSig, bSig, zSig;
2801 bits64 rem0, rem1;
2802 bits64 term0, term1;
2804 aSig = extractFloat64Frac( a );
2805 aExp = extractFloat64Exp( a );
2806 aSign = extractFloat64Sign( a );
2807 bSig = extractFloat64Frac( b );
2808 bExp = extractFloat64Exp( b );
2809 bSign = extractFloat64Sign( b );
2810 zSign = aSign ^ bSign;
2811 if ( aExp == 0x7FF ) {
2812 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2813 if ( bExp == 0x7FF ) {
2814 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2815 float_raise( float_flag_invalid STATUS_VAR);
2816 return float64_default_nan;
2818 return packFloat64( zSign, 0x7FF, 0 );
2820 if ( bExp == 0x7FF ) {
2821 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2822 return packFloat64( zSign, 0, 0 );
2824 if ( bExp == 0 ) {
2825 if ( bSig == 0 ) {
2826 if ( ( aExp | aSig ) == 0 ) {
2827 float_raise( float_flag_invalid STATUS_VAR);
2828 return float64_default_nan;
2830 float_raise( float_flag_divbyzero STATUS_VAR);
2831 return packFloat64( zSign, 0x7FF, 0 );
2833 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2835 if ( aExp == 0 ) {
2836 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2837 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2839 zExp = aExp - bExp + 0x3FD;
2840 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2841 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2842 if ( bSig <= ( aSig + aSig ) ) {
2843 aSig >>= 1;
2844 ++zExp;
2846 zSig = estimateDiv128To64( aSig, 0, bSig );
2847 if ( ( zSig & 0x1FF ) <= 2 ) {
2848 mul64To128( bSig, zSig, &term0, &term1 );
2849 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2850 while ( (sbits64) rem0 < 0 ) {
2851 --zSig;
2852 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2854 zSig |= ( rem1 != 0 );
2856 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2860 /*----------------------------------------------------------------------------
2861 | Returns the remainder of the double-precision floating-point value `a'
2862 | with respect to the corresponding value `b'. The operation is performed
2863 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2864 *----------------------------------------------------------------------------*/
2866 float64 float64_rem( float64 a, float64 b STATUS_PARAM )
2868 flag aSign, bSign, zSign;
2869 int16 aExp, bExp, expDiff;
2870 bits64 aSig, bSig;
2871 bits64 q, alternateASig;
2872 sbits64 sigMean;
2874 aSig = extractFloat64Frac( a );
2875 aExp = extractFloat64Exp( a );
2876 aSign = extractFloat64Sign( a );
2877 bSig = extractFloat64Frac( b );
2878 bExp = extractFloat64Exp( b );
2879 bSign = extractFloat64Sign( b );
2880 if ( aExp == 0x7FF ) {
2881 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2882 return propagateFloat64NaN( a, b STATUS_VAR );
2884 float_raise( float_flag_invalid STATUS_VAR);
2885 return float64_default_nan;
2887 if ( bExp == 0x7FF ) {
2888 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2889 return a;
2891 if ( bExp == 0 ) {
2892 if ( bSig == 0 ) {
2893 float_raise( float_flag_invalid STATUS_VAR);
2894 return float64_default_nan;
2896 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2898 if ( aExp == 0 ) {
2899 if ( aSig == 0 ) return a;
2900 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2902 expDiff = aExp - bExp;
2903 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
2904 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2905 if ( expDiff < 0 ) {
2906 if ( expDiff < -1 ) return a;
2907 aSig >>= 1;
2909 q = ( bSig <= aSig );
2910 if ( q ) aSig -= bSig;
2911 expDiff -= 64;
2912 while ( 0 < expDiff ) {
2913 q = estimateDiv128To64( aSig, 0, bSig );
2914 q = ( 2 < q ) ? q - 2 : 0;
2915 aSig = - ( ( bSig>>2 ) * q );
2916 expDiff -= 62;
2918 expDiff += 64;
2919 if ( 0 < expDiff ) {
2920 q = estimateDiv128To64( aSig, 0, bSig );
2921 q = ( 2 < q ) ? q - 2 : 0;
2922 q >>= 64 - expDiff;
2923 bSig >>= 2;
2924 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2926 else {
2927 aSig >>= 2;
2928 bSig >>= 2;
2930 do {
2931 alternateASig = aSig;
2932 ++q;
2933 aSig -= bSig;
2934 } while ( 0 <= (sbits64) aSig );
2935 sigMean = aSig + alternateASig;
2936 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2937 aSig = alternateASig;
2939 zSign = ( (sbits64) aSig < 0 );
2940 if ( zSign ) aSig = - aSig;
2941 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
2945 /*----------------------------------------------------------------------------
2946 | Returns the square root of the double-precision floating-point value `a'.
2947 | The operation is performed according to the IEC/IEEE Standard for Binary
2948 | Floating-Point Arithmetic.
2949 *----------------------------------------------------------------------------*/
2951 float64 float64_sqrt( float64 a STATUS_PARAM )
2953 flag aSign;
2954 int16 aExp, zExp;
2955 bits64 aSig, zSig, doubleZSig;
2956 bits64 rem0, rem1, term0, term1;
2958 aSig = extractFloat64Frac( a );
2959 aExp = extractFloat64Exp( a );
2960 aSign = extractFloat64Sign( a );
2961 if ( aExp == 0x7FF ) {
2962 if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
2963 if ( ! aSign ) return a;
2964 float_raise( float_flag_invalid STATUS_VAR);
2965 return float64_default_nan;
2967 if ( aSign ) {
2968 if ( ( aExp | aSig ) == 0 ) return a;
2969 float_raise( float_flag_invalid STATUS_VAR);
2970 return float64_default_nan;
2972 if ( aExp == 0 ) {
2973 if ( aSig == 0 ) return float64_zero;
2974 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2976 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
2977 aSig |= LIT64( 0x0010000000000000 );
2978 zSig = estimateSqrt32( aExp, aSig>>21 );
2979 aSig <<= 9 - ( aExp & 1 );
2980 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
2981 if ( ( zSig & 0x1FF ) <= 5 ) {
2982 doubleZSig = zSig<<1;
2983 mul64To128( zSig, zSig, &term0, &term1 );
2984 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2985 while ( (sbits64) rem0 < 0 ) {
2986 --zSig;
2987 doubleZSig -= 2;
2988 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
2990 zSig |= ( ( rem0 | rem1 ) != 0 );
2992 return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
2996 /*----------------------------------------------------------------------------
2997 | Returns 1 if the double-precision floating-point value `a' is equal to the
2998 | corresponding value `b', and 0 otherwise. The comparison is performed
2999 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3000 *----------------------------------------------------------------------------*/
3002 int float64_eq( float64 a, float64 b STATUS_PARAM )
3004 bits64 av, bv;
3006 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3007 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3009 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3010 float_raise( float_flag_invalid STATUS_VAR);
3012 return 0;
3014 av = float64_val(a);
3015 bv = float64_val(b);
3016 return ( av == bv ) || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3020 /*----------------------------------------------------------------------------
3021 | Returns 1 if the double-precision floating-point value `a' is less than or
3022 | equal to the corresponding value `b', and 0 otherwise. The comparison is
3023 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3024 | Arithmetic.
3025 *----------------------------------------------------------------------------*/
3027 int float64_le( float64 a, float64 b STATUS_PARAM )
3029 flag aSign, bSign;
3030 bits64 av, bv;
3032 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3033 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3035 float_raise( float_flag_invalid STATUS_VAR);
3036 return 0;
3038 aSign = extractFloat64Sign( a );
3039 bSign = extractFloat64Sign( b );
3040 av = float64_val(a);
3041 bv = float64_val(b);
3042 if ( aSign != bSign ) return aSign || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3043 return ( av == bv ) || ( aSign ^ ( av < bv ) );
3047 /*----------------------------------------------------------------------------
3048 | Returns 1 if the double-precision floating-point value `a' is less than
3049 | the corresponding value `b', and 0 otherwise. The comparison is performed
3050 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3051 *----------------------------------------------------------------------------*/
3053 int float64_lt( float64 a, float64 b STATUS_PARAM )
3055 flag aSign, bSign;
3056 bits64 av, bv;
3058 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3059 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3061 float_raise( float_flag_invalid STATUS_VAR);
3062 return 0;
3064 aSign = extractFloat64Sign( a );
3065 bSign = extractFloat64Sign( b );
3066 av = float64_val(a);
3067 bv = float64_val(b);
3068 if ( aSign != bSign ) return aSign && ( (bits64) ( ( av | bv )<<1 ) != 0 );
3069 return ( av != bv ) && ( aSign ^ ( av < bv ) );
3073 /*----------------------------------------------------------------------------
3074 | Returns 1 if the double-precision floating-point value `a' is equal to the
3075 | corresponding value `b', and 0 otherwise. The invalid exception is raised
3076 | if either operand is a NaN. Otherwise, the comparison is performed
3077 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3078 *----------------------------------------------------------------------------*/
3080 int float64_eq_signaling( float64 a, float64 b STATUS_PARAM )
3082 bits64 av, bv;
3084 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3085 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3087 float_raise( float_flag_invalid STATUS_VAR);
3088 return 0;
3090 av = float64_val(a);
3091 bv = float64_val(b);
3092 return ( av == bv ) || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3096 /*----------------------------------------------------------------------------
3097 | Returns 1 if the double-precision floating-point value `a' is less than or
3098 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3099 | cause an exception. Otherwise, the comparison is performed according to the
3100 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3101 *----------------------------------------------------------------------------*/
3103 int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
3105 flag aSign, bSign;
3106 bits64 av, bv;
3108 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3109 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3111 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3112 float_raise( float_flag_invalid STATUS_VAR);
3114 return 0;
3116 aSign = extractFloat64Sign( a );
3117 bSign = extractFloat64Sign( b );
3118 av = float64_val(a);
3119 bv = float64_val(b);
3120 if ( aSign != bSign ) return aSign || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3121 return ( av == bv ) || ( aSign ^ ( av < bv ) );
3125 /*----------------------------------------------------------------------------
3126 | Returns 1 if the double-precision floating-point value `a' is less than
3127 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3128 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
3129 | Standard for Binary Floating-Point Arithmetic.
3130 *----------------------------------------------------------------------------*/
3132 int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
3134 flag aSign, bSign;
3135 bits64 av, bv;
3137 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3138 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3140 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3141 float_raise( float_flag_invalid STATUS_VAR);
3143 return 0;
3145 aSign = extractFloat64Sign( a );
3146 bSign = extractFloat64Sign( b );
3147 av = float64_val(a);
3148 bv = float64_val(b);
3149 if ( aSign != bSign ) return aSign && ( (bits64) ( ( av | bv )<<1 ) != 0 );
3150 return ( av != bv ) && ( aSign ^ ( av < bv ) );
3154 #ifdef FLOATX80
3156 /*----------------------------------------------------------------------------
3157 | Returns the result of converting the extended double-precision floating-
3158 | point value `a' to the 32-bit two's complement integer format. The
3159 | conversion is performed according to the IEC/IEEE Standard for Binary
3160 | Floating-Point Arithmetic---which means in particular that the conversion
3161 | is rounded according to the current rounding mode. If `a' is a NaN, the
3162 | largest positive integer is returned. Otherwise, if the conversion
3163 | overflows, the largest integer with the same sign as `a' is returned.
3164 *----------------------------------------------------------------------------*/
3166 int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
3168 flag aSign;
3169 int32 aExp, shiftCount;
3170 bits64 aSig;
3172 aSig = extractFloatx80Frac( a );
3173 aExp = extractFloatx80Exp( a );
3174 aSign = extractFloatx80Sign( a );
3175 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3176 shiftCount = 0x4037 - aExp;
3177 if ( shiftCount <= 0 ) shiftCount = 1;
3178 shift64RightJamming( aSig, shiftCount, &aSig );
3179 return roundAndPackInt32( aSign, aSig STATUS_VAR );
3183 /*----------------------------------------------------------------------------
3184 | Returns the result of converting the extended double-precision floating-
3185 | point value `a' to the 32-bit two's complement integer format. The
3186 | conversion is performed according to the IEC/IEEE Standard for Binary
3187 | Floating-Point Arithmetic, except that the conversion is always rounded
3188 | toward zero. If `a' is a NaN, the largest positive integer is returned.
3189 | Otherwise, if the conversion overflows, the largest integer with the same
3190 | sign as `a' is returned.
3191 *----------------------------------------------------------------------------*/
3193 int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
3195 flag aSign;
3196 int32 aExp, shiftCount;
3197 bits64 aSig, savedASig;
3198 int32 z;
3200 aSig = extractFloatx80Frac( a );
3201 aExp = extractFloatx80Exp( a );
3202 aSign = extractFloatx80Sign( a );
3203 if ( 0x401E < aExp ) {
3204 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3205 goto invalid;
3207 else if ( aExp < 0x3FFF ) {
3208 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3209 return 0;
3211 shiftCount = 0x403E - aExp;
3212 savedASig = aSig;
3213 aSig >>= shiftCount;
3214 z = aSig;
3215 if ( aSign ) z = - z;
3216 if ( ( z < 0 ) ^ aSign ) {
3217 invalid:
3218 float_raise( float_flag_invalid STATUS_VAR);
3219 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3221 if ( ( aSig<<shiftCount ) != savedASig ) {
3222 STATUS(float_exception_flags) |= float_flag_inexact;
3224 return z;
3228 /*----------------------------------------------------------------------------
3229 | Returns the result of converting the extended double-precision floating-
3230 | point value `a' to the 64-bit two's complement integer format. The
3231 | conversion is performed according to the IEC/IEEE Standard for Binary
3232 | Floating-Point Arithmetic---which means in particular that the conversion
3233 | is rounded according to the current rounding mode. If `a' is a NaN,
3234 | the largest positive integer is returned. Otherwise, if the conversion
3235 | overflows, the largest integer with the same sign as `a' is returned.
3236 *----------------------------------------------------------------------------*/
3238 int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
3240 flag aSign;
3241 int32 aExp, shiftCount;
3242 bits64 aSig, aSigExtra;
3244 aSig = extractFloatx80Frac( a );
3245 aExp = extractFloatx80Exp( a );
3246 aSign = extractFloatx80Sign( a );
3247 shiftCount = 0x403E - aExp;
3248 if ( shiftCount <= 0 ) {
3249 if ( shiftCount ) {
3250 float_raise( float_flag_invalid STATUS_VAR);
3251 if ( ! aSign
3252 || ( ( aExp == 0x7FFF )
3253 && ( aSig != LIT64( 0x8000000000000000 ) ) )
3255 return LIT64( 0x7FFFFFFFFFFFFFFF );
3257 return (sbits64) LIT64( 0x8000000000000000 );
3259 aSigExtra = 0;
3261 else {
3262 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3264 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3268 /*----------------------------------------------------------------------------
3269 | Returns the result of converting the extended double-precision floating-
3270 | point value `a' to the 64-bit two's complement integer format. The
3271 | conversion is performed according to the IEC/IEEE Standard for Binary
3272 | Floating-Point Arithmetic, except that the conversion is always rounded
3273 | toward zero. If `a' is a NaN, the largest positive integer is returned.
3274 | Otherwise, if the conversion overflows, the largest integer with the same
3275 | sign as `a' is returned.
3276 *----------------------------------------------------------------------------*/
3278 int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
3280 flag aSign;
3281 int32 aExp, shiftCount;
3282 bits64 aSig;
3283 int64 z;
3285 aSig = extractFloatx80Frac( a );
3286 aExp = extractFloatx80Exp( a );
3287 aSign = extractFloatx80Sign( a );
3288 shiftCount = aExp - 0x403E;
3289 if ( 0 <= shiftCount ) {
3290 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3291 if ( ( a.high != 0xC03E ) || aSig ) {
3292 float_raise( float_flag_invalid STATUS_VAR);
3293 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3294 return LIT64( 0x7FFFFFFFFFFFFFFF );
3297 return (sbits64) LIT64( 0x8000000000000000 );
3299 else if ( aExp < 0x3FFF ) {
3300 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3301 return 0;
3303 z = aSig>>( - shiftCount );
3304 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3305 STATUS(float_exception_flags) |= float_flag_inexact;
3307 if ( aSign ) z = - z;
3308 return z;
3312 /*----------------------------------------------------------------------------
3313 | Returns the result of converting the extended double-precision floating-
3314 | point value `a' to the single-precision floating-point format. The
3315 | conversion is performed according to the IEC/IEEE Standard for Binary
3316 | Floating-Point Arithmetic.
3317 *----------------------------------------------------------------------------*/
3319 float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
3321 flag aSign;
3322 int32 aExp;
3323 bits64 aSig;
3325 aSig = extractFloatx80Frac( a );
3326 aExp = extractFloatx80Exp( a );
3327 aSign = extractFloatx80Sign( a );
3328 if ( aExp == 0x7FFF ) {
3329 if ( (bits64) ( aSig<<1 ) ) {
3330 return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) );
3332 return packFloat32( aSign, 0xFF, 0 );
3334 shift64RightJamming( aSig, 33, &aSig );
3335 if ( aExp || aSig ) aExp -= 0x3F81;
3336 return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
3340 /*----------------------------------------------------------------------------
3341 | Returns the result of converting the extended double-precision floating-
3342 | point value `a' to the double-precision floating-point format. The
3343 | conversion is performed according to the IEC/IEEE Standard for Binary
3344 | Floating-Point Arithmetic.
3345 *----------------------------------------------------------------------------*/
3347 float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
3349 flag aSign;
3350 int32 aExp;
3351 bits64 aSig, zSig;
3353 aSig = extractFloatx80Frac( a );
3354 aExp = extractFloatx80Exp( a );
3355 aSign = extractFloatx80Sign( a );
3356 if ( aExp == 0x7FFF ) {
3357 if ( (bits64) ( aSig<<1 ) ) {
3358 return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) );
3360 return packFloat64( aSign, 0x7FF, 0 );
3362 shift64RightJamming( aSig, 1, &zSig );
3363 if ( aExp || aSig ) aExp -= 0x3C01;
3364 return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
3368 #ifdef FLOAT128
3370 /*----------------------------------------------------------------------------
3371 | Returns the result of converting the extended double-precision floating-
3372 | point value `a' to the quadruple-precision floating-point format. The
3373 | conversion is performed according to the IEC/IEEE Standard for Binary
3374 | Floating-Point Arithmetic.
3375 *----------------------------------------------------------------------------*/
3377 float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
3379 flag aSign;
3380 int16 aExp;
3381 bits64 aSig, zSig0, zSig1;
3383 aSig = extractFloatx80Frac( a );
3384 aExp = extractFloatx80Exp( a );
3385 aSign = extractFloatx80Sign( a );
3386 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3387 return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) );
3389 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3390 return packFloat128( aSign, aExp, zSig0, zSig1 );
3394 #endif
3396 /*----------------------------------------------------------------------------
3397 | Rounds the extended double-precision floating-point value `a' to an integer,
3398 | and returns the result as an extended quadruple-precision floating-point
3399 | value. The operation is performed according to the IEC/IEEE Standard for
3400 | Binary Floating-Point Arithmetic.
3401 *----------------------------------------------------------------------------*/
3403 floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
3405 flag aSign;
3406 int32 aExp;
3407 bits64 lastBitMask, roundBitsMask;
3408 int8 roundingMode;
3409 floatx80 z;
3411 aExp = extractFloatx80Exp( a );
3412 if ( 0x403E <= aExp ) {
3413 if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3414 return propagateFloatx80NaN( a, a STATUS_VAR );
3416 return a;
3418 if ( aExp < 0x3FFF ) {
3419 if ( ( aExp == 0 )
3420 && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3421 return a;
3423 STATUS(float_exception_flags) |= float_flag_inexact;
3424 aSign = extractFloatx80Sign( a );
3425 switch ( STATUS(float_rounding_mode) ) {
3426 case float_round_nearest_even:
3427 if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3429 return
3430 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3432 break;
3433 case float_round_down:
3434 return
3435 aSign ?
3436 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3437 : packFloatx80( 0, 0, 0 );
3438 case float_round_up:
3439 return
3440 aSign ? packFloatx80( 1, 0, 0 )
3441 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3443 return packFloatx80( aSign, 0, 0 );
3445 lastBitMask = 1;
3446 lastBitMask <<= 0x403E - aExp;
3447 roundBitsMask = lastBitMask - 1;
3448 z = a;
3449 roundingMode = STATUS(float_rounding_mode);
3450 if ( roundingMode == float_round_nearest_even ) {
3451 z.low += lastBitMask>>1;
3452 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3454 else if ( roundingMode != float_round_to_zero ) {
3455 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3456 z.low += roundBitsMask;
3459 z.low &= ~ roundBitsMask;
3460 if ( z.low == 0 ) {
3461 ++z.high;
3462 z.low = LIT64( 0x8000000000000000 );
3464 if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
3465 return z;
3469 /*----------------------------------------------------------------------------
3470 | Returns the result of adding the absolute values of the extended double-
3471 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
3472 | negated before being returned. `zSign' is ignored if the result is a NaN.
3473 | The addition is performed according to the IEC/IEEE Standard for Binary
3474 | Floating-Point Arithmetic.
3475 *----------------------------------------------------------------------------*/
3477 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
3479 int32 aExp, bExp, zExp;
3480 bits64 aSig, bSig, zSig0, zSig1;
3481 int32 expDiff;
3483 aSig = extractFloatx80Frac( a );
3484 aExp = extractFloatx80Exp( a );
3485 bSig = extractFloatx80Frac( b );
3486 bExp = extractFloatx80Exp( b );
3487 expDiff = aExp - bExp;
3488 if ( 0 < expDiff ) {
3489 if ( aExp == 0x7FFF ) {
3490 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3491 return a;
3493 if ( bExp == 0 ) --expDiff;
3494 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3495 zExp = aExp;
3497 else if ( expDiff < 0 ) {
3498 if ( bExp == 0x7FFF ) {
3499 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3500 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3502 if ( aExp == 0 ) ++expDiff;
3503 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3504 zExp = bExp;
3506 else {
3507 if ( aExp == 0x7FFF ) {
3508 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3509 return propagateFloatx80NaN( a, b STATUS_VAR );
3511 return a;
3513 zSig1 = 0;
3514 zSig0 = aSig + bSig;
3515 if ( aExp == 0 ) {
3516 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3517 goto roundAndPack;
3519 zExp = aExp;
3520 goto shiftRight1;
3522 zSig0 = aSig + bSig;
3523 if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3524 shiftRight1:
3525 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3526 zSig0 |= LIT64( 0x8000000000000000 );
3527 ++zExp;
3528 roundAndPack:
3529 return
3530 roundAndPackFloatx80(
3531 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3535 /*----------------------------------------------------------------------------
3536 | Returns the result of subtracting the absolute values of the extended
3537 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
3538 | difference is negated before being returned. `zSign' is ignored if the
3539 | result is a NaN. The subtraction is performed according to the IEC/IEEE
3540 | Standard for Binary Floating-Point Arithmetic.
3541 *----------------------------------------------------------------------------*/
3543 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
3545 int32 aExp, bExp, zExp;
3546 bits64 aSig, bSig, zSig0, zSig1;
3547 int32 expDiff;
3548 floatx80 z;
3550 aSig = extractFloatx80Frac( a );
3551 aExp = extractFloatx80Exp( a );
3552 bSig = extractFloatx80Frac( b );
3553 bExp = extractFloatx80Exp( b );
3554 expDiff = aExp - bExp;
3555 if ( 0 < expDiff ) goto aExpBigger;
3556 if ( expDiff < 0 ) goto bExpBigger;
3557 if ( aExp == 0x7FFF ) {
3558 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3559 return propagateFloatx80NaN( a, b STATUS_VAR );
3561 float_raise( float_flag_invalid STATUS_VAR);
3562 z.low = floatx80_default_nan_low;
3563 z.high = floatx80_default_nan_high;
3564 return z;
3566 if ( aExp == 0 ) {
3567 aExp = 1;
3568 bExp = 1;
3570 zSig1 = 0;
3571 if ( bSig < aSig ) goto aBigger;
3572 if ( aSig < bSig ) goto bBigger;
3573 return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3574 bExpBigger:
3575 if ( bExp == 0x7FFF ) {
3576 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3577 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3579 if ( aExp == 0 ) ++expDiff;
3580 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3581 bBigger:
3582 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3583 zExp = bExp;
3584 zSign ^= 1;
3585 goto normalizeRoundAndPack;
3586 aExpBigger:
3587 if ( aExp == 0x7FFF ) {
3588 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3589 return a;
3591 if ( bExp == 0 ) --expDiff;
3592 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3593 aBigger:
3594 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3595 zExp = aExp;
3596 normalizeRoundAndPack:
3597 return
3598 normalizeRoundAndPackFloatx80(
3599 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3603 /*----------------------------------------------------------------------------
3604 | Returns the result of adding the extended double-precision floating-point
3605 | values `a' and `b'. The operation is performed according to the IEC/IEEE
3606 | Standard for Binary Floating-Point Arithmetic.
3607 *----------------------------------------------------------------------------*/
3609 floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
3611 flag aSign, bSign;
3613 aSign = extractFloatx80Sign( a );
3614 bSign = extractFloatx80Sign( b );
3615 if ( aSign == bSign ) {
3616 return addFloatx80Sigs( a, b, aSign STATUS_VAR );
3618 else {
3619 return subFloatx80Sigs( a, b, aSign STATUS_VAR );
3624 /*----------------------------------------------------------------------------
3625 | Returns the result of subtracting the extended double-precision floating-
3626 | point values `a' and `b'. The operation is performed according to the
3627 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3628 *----------------------------------------------------------------------------*/
3630 floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
3632 flag aSign, bSign;
3634 aSign = extractFloatx80Sign( a );
3635 bSign = extractFloatx80Sign( b );
3636 if ( aSign == bSign ) {
3637 return subFloatx80Sigs( a, b, aSign STATUS_VAR );
3639 else {
3640 return addFloatx80Sigs( a, b, aSign STATUS_VAR );
3645 /*----------------------------------------------------------------------------
3646 | Returns the result of multiplying the extended double-precision floating-
3647 | point values `a' and `b'. The operation is performed according to the
3648 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3649 *----------------------------------------------------------------------------*/
3651 floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
3653 flag aSign, bSign, zSign;
3654 int32 aExp, bExp, zExp;
3655 bits64 aSig, bSig, zSig0, zSig1;
3656 floatx80 z;
3658 aSig = extractFloatx80Frac( a );
3659 aExp = extractFloatx80Exp( a );
3660 aSign = extractFloatx80Sign( a );
3661 bSig = extractFloatx80Frac( b );
3662 bExp = extractFloatx80Exp( b );
3663 bSign = extractFloatx80Sign( b );
3664 zSign = aSign ^ bSign;
3665 if ( aExp == 0x7FFF ) {
3666 if ( (bits64) ( aSig<<1 )
3667 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3668 return propagateFloatx80NaN( a, b STATUS_VAR );
3670 if ( ( bExp | bSig ) == 0 ) goto invalid;
3671 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3673 if ( bExp == 0x7FFF ) {
3674 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3675 if ( ( aExp | aSig ) == 0 ) {
3676 invalid:
3677 float_raise( float_flag_invalid STATUS_VAR);
3678 z.low = floatx80_default_nan_low;
3679 z.high = floatx80_default_nan_high;
3680 return z;
3682 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3684 if ( aExp == 0 ) {
3685 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3686 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3688 if ( bExp == 0 ) {
3689 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3690 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3692 zExp = aExp + bExp - 0x3FFE;
3693 mul64To128( aSig, bSig, &zSig0, &zSig1 );
3694 if ( 0 < (sbits64) zSig0 ) {
3695 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3696 --zExp;
3698 return
3699 roundAndPackFloatx80(
3700 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3704 /*----------------------------------------------------------------------------
3705 | Returns the result of dividing the extended double-precision floating-point
3706 | value `a' by the corresponding value `b'. The operation is performed
3707 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3708 *----------------------------------------------------------------------------*/
3710 floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
3712 flag aSign, bSign, zSign;
3713 int32 aExp, bExp, zExp;
3714 bits64 aSig, bSig, zSig0, zSig1;
3715 bits64 rem0, rem1, rem2, term0, term1, term2;
3716 floatx80 z;
3718 aSig = extractFloatx80Frac( a );
3719 aExp = extractFloatx80Exp( a );
3720 aSign = extractFloatx80Sign( a );
3721 bSig = extractFloatx80Frac( b );
3722 bExp = extractFloatx80Exp( b );
3723 bSign = extractFloatx80Sign( b );
3724 zSign = aSign ^ bSign;
3725 if ( aExp == 0x7FFF ) {
3726 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3727 if ( bExp == 0x7FFF ) {
3728 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3729 goto invalid;
3731 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3733 if ( bExp == 0x7FFF ) {
3734 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3735 return packFloatx80( zSign, 0, 0 );
3737 if ( bExp == 0 ) {
3738 if ( bSig == 0 ) {
3739 if ( ( aExp | aSig ) == 0 ) {
3740 invalid:
3741 float_raise( float_flag_invalid STATUS_VAR);
3742 z.low = floatx80_default_nan_low;
3743 z.high = floatx80_default_nan_high;
3744 return z;
3746 float_raise( float_flag_divbyzero STATUS_VAR);
3747 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3749 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3751 if ( aExp == 0 ) {
3752 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3753 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3755 zExp = aExp - bExp + 0x3FFE;
3756 rem1 = 0;
3757 if ( bSig <= aSig ) {
3758 shift128Right( aSig, 0, 1, &aSig, &rem1 );
3759 ++zExp;
3761 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3762 mul64To128( bSig, zSig0, &term0, &term1 );
3763 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3764 while ( (sbits64) rem0 < 0 ) {
3765 --zSig0;
3766 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3768 zSig1 = estimateDiv128To64( rem1, 0, bSig );
3769 if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3770 mul64To128( bSig, zSig1, &term1, &term2 );
3771 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3772 while ( (sbits64) rem1 < 0 ) {
3773 --zSig1;
3774 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3776 zSig1 |= ( ( rem1 | rem2 ) != 0 );
3778 return
3779 roundAndPackFloatx80(
3780 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3784 /*----------------------------------------------------------------------------
3785 | Returns the remainder of the extended double-precision floating-point value
3786 | `a' with respect to the corresponding value `b'. The operation is performed
3787 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3788 *----------------------------------------------------------------------------*/
3790 floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
3792 flag aSign, bSign, zSign;
3793 int32 aExp, bExp, expDiff;
3794 bits64 aSig0, aSig1, bSig;
3795 bits64 q, term0, term1, alternateASig0, alternateASig1;
3796 floatx80 z;
3798 aSig0 = extractFloatx80Frac( a );
3799 aExp = extractFloatx80Exp( a );
3800 aSign = extractFloatx80Sign( a );
3801 bSig = extractFloatx80Frac( b );
3802 bExp = extractFloatx80Exp( b );
3803 bSign = extractFloatx80Sign( b );
3804 if ( aExp == 0x7FFF ) {
3805 if ( (bits64) ( aSig0<<1 )
3806 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3807 return propagateFloatx80NaN( a, b STATUS_VAR );
3809 goto invalid;
3811 if ( bExp == 0x7FFF ) {
3812 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3813 return a;
3815 if ( bExp == 0 ) {
3816 if ( bSig == 0 ) {
3817 invalid:
3818 float_raise( float_flag_invalid STATUS_VAR);
3819 z.low = floatx80_default_nan_low;
3820 z.high = floatx80_default_nan_high;
3821 return z;
3823 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3825 if ( aExp == 0 ) {
3826 if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3827 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3829 bSig |= LIT64( 0x8000000000000000 );
3830 zSign = aSign;
3831 expDiff = aExp - bExp;
3832 aSig1 = 0;
3833 if ( expDiff < 0 ) {
3834 if ( expDiff < -1 ) return a;
3835 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3836 expDiff = 0;
3838 q = ( bSig <= aSig0 );
3839 if ( q ) aSig0 -= bSig;
3840 expDiff -= 64;
3841 while ( 0 < expDiff ) {
3842 q = estimateDiv128To64( aSig0, aSig1, bSig );
3843 q = ( 2 < q ) ? q - 2 : 0;
3844 mul64To128( bSig, q, &term0, &term1 );
3845 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3846 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3847 expDiff -= 62;
3849 expDiff += 64;
3850 if ( 0 < expDiff ) {
3851 q = estimateDiv128To64( aSig0, aSig1, bSig );
3852 q = ( 2 < q ) ? q - 2 : 0;
3853 q >>= 64 - expDiff;
3854 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3855 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3856 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3857 while ( le128( term0, term1, aSig0, aSig1 ) ) {
3858 ++q;
3859 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3862 else {
3863 term1 = 0;
3864 term0 = bSig;
3866 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3867 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3868 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3869 && ( q & 1 ) )
3871 aSig0 = alternateASig0;
3872 aSig1 = alternateASig1;
3873 zSign = ! zSign;
3875 return
3876 normalizeRoundAndPackFloatx80(
3877 80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
3881 /*----------------------------------------------------------------------------
3882 | Returns the square root of the extended double-precision floating-point
3883 | value `a'. The operation is performed according to the IEC/IEEE Standard
3884 | for Binary Floating-Point Arithmetic.
3885 *----------------------------------------------------------------------------*/
3887 floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
3889 flag aSign;
3890 int32 aExp, zExp;
3891 bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
3892 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3893 floatx80 z;
3895 aSig0 = extractFloatx80Frac( a );
3896 aExp = extractFloatx80Exp( a );
3897 aSign = extractFloatx80Sign( a );
3898 if ( aExp == 0x7FFF ) {
3899 if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
3900 if ( ! aSign ) return a;
3901 goto invalid;
3903 if ( aSign ) {
3904 if ( ( aExp | aSig0 ) == 0 ) return a;
3905 invalid:
3906 float_raise( float_flag_invalid STATUS_VAR);
3907 z.low = floatx80_default_nan_low;
3908 z.high = floatx80_default_nan_high;
3909 return z;
3911 if ( aExp == 0 ) {
3912 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
3913 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3915 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
3916 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
3917 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
3918 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
3919 doubleZSig0 = zSig0<<1;
3920 mul64To128( zSig0, zSig0, &term0, &term1 );
3921 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
3922 while ( (sbits64) rem0 < 0 ) {
3923 --zSig0;
3924 doubleZSig0 -= 2;
3925 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
3927 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
3928 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
3929 if ( zSig1 == 0 ) zSig1 = 1;
3930 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
3931 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3932 mul64To128( zSig1, zSig1, &term2, &term3 );
3933 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
3934 while ( (sbits64) rem1 < 0 ) {
3935 --zSig1;
3936 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
3937 term3 |= 1;
3938 term2 |= doubleZSig0;
3939 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
3941 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
3943 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
3944 zSig0 |= doubleZSig0;
3945 return
3946 roundAndPackFloatx80(
3947 STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
3951 /*----------------------------------------------------------------------------
3952 | Returns 1 if the extended double-precision floating-point value `a' is
3953 | equal to the corresponding value `b', and 0 otherwise. The comparison is
3954 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3955 | Arithmetic.
3956 *----------------------------------------------------------------------------*/
3958 int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
3961 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3962 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3963 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3964 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3966 if ( floatx80_is_signaling_nan( a )
3967 || floatx80_is_signaling_nan( b ) ) {
3968 float_raise( float_flag_invalid STATUS_VAR);
3970 return 0;
3972 return
3973 ( a.low == b.low )
3974 && ( ( a.high == b.high )
3975 || ( ( a.low == 0 )
3976 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3981 /*----------------------------------------------------------------------------
3982 | Returns 1 if the extended double-precision floating-point value `a' is
3983 | less than or equal to the corresponding value `b', and 0 otherwise. The
3984 | comparison is performed according to the IEC/IEEE Standard for Binary
3985 | Floating-Point Arithmetic.
3986 *----------------------------------------------------------------------------*/
3988 int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
3990 flag aSign, bSign;
3992 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3993 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3994 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3995 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3997 float_raise( float_flag_invalid STATUS_VAR);
3998 return 0;
4000 aSign = extractFloatx80Sign( a );
4001 bSign = extractFloatx80Sign( b );
4002 if ( aSign != bSign ) {
4003 return
4004 aSign
4005 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4006 == 0 );
4008 return
4009 aSign ? le128( b.high, b.low, a.high, a.low )
4010 : le128( a.high, a.low, b.high, b.low );
4014 /*----------------------------------------------------------------------------
4015 | Returns 1 if the extended double-precision floating-point value `a' is
4016 | less than the corresponding value `b', and 0 otherwise. The comparison
4017 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4018 | Arithmetic.
4019 *----------------------------------------------------------------------------*/
4021 int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
4023 flag aSign, bSign;
4025 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4026 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4027 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4028 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4030 float_raise( float_flag_invalid STATUS_VAR);
4031 return 0;
4033 aSign = extractFloatx80Sign( a );
4034 bSign = extractFloatx80Sign( b );
4035 if ( aSign != bSign ) {
4036 return
4037 aSign
4038 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4039 != 0 );
4041 return
4042 aSign ? lt128( b.high, b.low, a.high, a.low )
4043 : lt128( a.high, a.low, b.high, b.low );
4047 /*----------------------------------------------------------------------------
4048 | Returns 1 if the extended double-precision floating-point value `a' is equal
4049 | to the corresponding value `b', and 0 otherwise. The invalid exception is
4050 | raised if either operand is a NaN. Otherwise, the comparison is performed
4051 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4052 *----------------------------------------------------------------------------*/
4054 int floatx80_eq_signaling( floatx80 a, floatx80 b STATUS_PARAM )
4057 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4058 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4059 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4060 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4062 float_raise( float_flag_invalid STATUS_VAR);
4063 return 0;
4065 return
4066 ( a.low == b.low )
4067 && ( ( a.high == b.high )
4068 || ( ( a.low == 0 )
4069 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4074 /*----------------------------------------------------------------------------
4075 | Returns 1 if the extended double-precision floating-point value `a' is less
4076 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
4077 | do not cause an exception. Otherwise, the comparison is performed according
4078 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4079 *----------------------------------------------------------------------------*/
4081 int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4083 flag aSign, bSign;
4085 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4086 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4087 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4088 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4090 if ( floatx80_is_signaling_nan( a )
4091 || floatx80_is_signaling_nan( b ) ) {
4092 float_raise( float_flag_invalid STATUS_VAR);
4094 return 0;
4096 aSign = extractFloatx80Sign( a );
4097 bSign = extractFloatx80Sign( b );
4098 if ( aSign != bSign ) {
4099 return
4100 aSign
4101 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4102 == 0 );
4104 return
4105 aSign ? le128( b.high, b.low, a.high, a.low )
4106 : le128( a.high, a.low, b.high, b.low );
4110 /*----------------------------------------------------------------------------
4111 | Returns 1 if the extended double-precision floating-point value `a' is less
4112 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
4113 | an exception. Otherwise, the comparison is performed according to the
4114 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4115 *----------------------------------------------------------------------------*/
4117 int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4119 flag aSign, bSign;
4121 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4122 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4123 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4124 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4126 if ( floatx80_is_signaling_nan( a )
4127 || floatx80_is_signaling_nan( b ) ) {
4128 float_raise( float_flag_invalid STATUS_VAR);
4130 return 0;
4132 aSign = extractFloatx80Sign( a );
4133 bSign = extractFloatx80Sign( b );
4134 if ( aSign != bSign ) {
4135 return
4136 aSign
4137 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4138 != 0 );
4140 return
4141 aSign ? lt128( b.high, b.low, a.high, a.low )
4142 : lt128( a.high, a.low, b.high, b.low );
4146 #endif
4148 #ifdef FLOAT128
4150 /*----------------------------------------------------------------------------
4151 | Returns the result of converting the quadruple-precision floating-point
4152 | value `a' to the 32-bit two's complement integer format. The conversion
4153 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4154 | Arithmetic---which means in particular that the conversion is rounded
4155 | according to the current rounding mode. If `a' is a NaN, the largest
4156 | positive integer is returned. Otherwise, if the conversion overflows, the
4157 | largest integer with the same sign as `a' is returned.
4158 *----------------------------------------------------------------------------*/
4160 int32 float128_to_int32( float128 a STATUS_PARAM )
4162 flag aSign;
4163 int32 aExp, shiftCount;
4164 bits64 aSig0, aSig1;
4166 aSig1 = extractFloat128Frac1( a );
4167 aSig0 = extractFloat128Frac0( a );
4168 aExp = extractFloat128Exp( a );
4169 aSign = extractFloat128Sign( a );
4170 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4171 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4172 aSig0 |= ( aSig1 != 0 );
4173 shiftCount = 0x4028 - aExp;
4174 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4175 return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
4179 /*----------------------------------------------------------------------------
4180 | Returns the result of converting the quadruple-precision floating-point
4181 | value `a' to the 32-bit two's complement integer format. The conversion
4182 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4183 | Arithmetic, except that the conversion is always rounded toward zero. If
4184 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
4185 | conversion overflows, the largest integer with the same sign as `a' is
4186 | returned.
4187 *----------------------------------------------------------------------------*/
4189 int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
4191 flag aSign;
4192 int32 aExp, shiftCount;
4193 bits64 aSig0, aSig1, savedASig;
4194 int32 z;
4196 aSig1 = extractFloat128Frac1( a );
4197 aSig0 = extractFloat128Frac0( a );
4198 aExp = extractFloat128Exp( a );
4199 aSign = extractFloat128Sign( a );
4200 aSig0 |= ( aSig1 != 0 );
4201 if ( 0x401E < aExp ) {
4202 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4203 goto invalid;
4205 else if ( aExp < 0x3FFF ) {
4206 if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
4207 return 0;
4209 aSig0 |= LIT64( 0x0001000000000000 );
4210 shiftCount = 0x402F - aExp;
4211 savedASig = aSig0;
4212 aSig0 >>= shiftCount;
4213 z = aSig0;
4214 if ( aSign ) z = - z;
4215 if ( ( z < 0 ) ^ aSign ) {
4216 invalid:
4217 float_raise( float_flag_invalid STATUS_VAR);
4218 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4220 if ( ( aSig0<<shiftCount ) != savedASig ) {
4221 STATUS(float_exception_flags) |= float_flag_inexact;
4223 return z;
4227 /*----------------------------------------------------------------------------
4228 | Returns the result of converting the quadruple-precision floating-point
4229 | value `a' to the 64-bit two's complement integer format. The conversion
4230 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4231 | Arithmetic---which means in particular that the conversion is rounded
4232 | according to the current rounding mode. If `a' is a NaN, the largest
4233 | positive integer is returned. Otherwise, if the conversion overflows, the
4234 | largest integer with the same sign as `a' is returned.
4235 *----------------------------------------------------------------------------*/
4237 int64 float128_to_int64( float128 a STATUS_PARAM )
4239 flag aSign;
4240 int32 aExp, shiftCount;
4241 bits64 aSig0, aSig1;
4243 aSig1 = extractFloat128Frac1( a );
4244 aSig0 = extractFloat128Frac0( a );
4245 aExp = extractFloat128Exp( a );
4246 aSign = extractFloat128Sign( a );
4247 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4248 shiftCount = 0x402F - aExp;
4249 if ( shiftCount <= 0 ) {
4250 if ( 0x403E < aExp ) {
4251 float_raise( float_flag_invalid STATUS_VAR);
4252 if ( ! aSign
4253 || ( ( aExp == 0x7FFF )
4254 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4257 return LIT64( 0x7FFFFFFFFFFFFFFF );
4259 return (sbits64) LIT64( 0x8000000000000000 );
4261 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4263 else {
4264 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4266 return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
4270 /*----------------------------------------------------------------------------
4271 | Returns the result of converting the quadruple-precision floating-point
4272 | value `a' to the 64-bit two's complement integer format. The conversion
4273 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4274 | Arithmetic, except that the conversion is always rounded toward zero.
4275 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
4276 | the conversion overflows, the largest integer with the same sign as `a' is
4277 | returned.
4278 *----------------------------------------------------------------------------*/
4280 int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
4282 flag aSign;
4283 int32 aExp, shiftCount;
4284 bits64 aSig0, aSig1;
4285 int64 z;
4287 aSig1 = extractFloat128Frac1( a );
4288 aSig0 = extractFloat128Frac0( a );
4289 aExp = extractFloat128Exp( a );
4290 aSign = extractFloat128Sign( a );
4291 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4292 shiftCount = aExp - 0x402F;
4293 if ( 0 < shiftCount ) {
4294 if ( 0x403E <= aExp ) {
4295 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4296 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
4297 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4298 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
4300 else {
4301 float_raise( float_flag_invalid STATUS_VAR);
4302 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4303 return LIT64( 0x7FFFFFFFFFFFFFFF );
4306 return (sbits64) LIT64( 0x8000000000000000 );
4308 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4309 if ( (bits64) ( aSig1<<shiftCount ) ) {
4310 STATUS(float_exception_flags) |= float_flag_inexact;
4313 else {
4314 if ( aExp < 0x3FFF ) {
4315 if ( aExp | aSig0 | aSig1 ) {
4316 STATUS(float_exception_flags) |= float_flag_inexact;
4318 return 0;
4320 z = aSig0>>( - shiftCount );
4321 if ( aSig1
4322 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4323 STATUS(float_exception_flags) |= float_flag_inexact;
4326 if ( aSign ) z = - z;
4327 return z;
4331 /*----------------------------------------------------------------------------
4332 | Returns the result of converting the quadruple-precision floating-point
4333 | value `a' to the single-precision floating-point format. The conversion
4334 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4335 | Arithmetic.
4336 *----------------------------------------------------------------------------*/
4338 float32 float128_to_float32( float128 a STATUS_PARAM )
4340 flag aSign;
4341 int32 aExp;
4342 bits64 aSig0, aSig1;
4343 bits32 zSig;
4345 aSig1 = extractFloat128Frac1( a );
4346 aSig0 = extractFloat128Frac0( a );
4347 aExp = extractFloat128Exp( a );
4348 aSign = extractFloat128Sign( a );
4349 if ( aExp == 0x7FFF ) {
4350 if ( aSig0 | aSig1 ) {
4351 return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) );
4353 return packFloat32( aSign, 0xFF, 0 );
4355 aSig0 |= ( aSig1 != 0 );
4356 shift64RightJamming( aSig0, 18, &aSig0 );
4357 zSig = aSig0;
4358 if ( aExp || zSig ) {
4359 zSig |= 0x40000000;
4360 aExp -= 0x3F81;
4362 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
4366 /*----------------------------------------------------------------------------
4367 | Returns the result of converting the quadruple-precision floating-point
4368 | value `a' to the double-precision floating-point format. The conversion
4369 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4370 | Arithmetic.
4371 *----------------------------------------------------------------------------*/
4373 float64 float128_to_float64( float128 a STATUS_PARAM )
4375 flag aSign;
4376 int32 aExp;
4377 bits64 aSig0, aSig1;
4379 aSig1 = extractFloat128Frac1( a );
4380 aSig0 = extractFloat128Frac0( a );
4381 aExp = extractFloat128Exp( a );
4382 aSign = extractFloat128Sign( a );
4383 if ( aExp == 0x7FFF ) {
4384 if ( aSig0 | aSig1 ) {
4385 return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) );
4387 return packFloat64( aSign, 0x7FF, 0 );
4389 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4390 aSig0 |= ( aSig1 != 0 );
4391 if ( aExp || aSig0 ) {
4392 aSig0 |= LIT64( 0x4000000000000000 );
4393 aExp -= 0x3C01;
4395 return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
4399 #ifdef FLOATX80
4401 /*----------------------------------------------------------------------------
4402 | Returns the result of converting the quadruple-precision floating-point
4403 | value `a' to the extended double-precision floating-point format. The
4404 | conversion is performed according to the IEC/IEEE Standard for Binary
4405 | Floating-Point Arithmetic.
4406 *----------------------------------------------------------------------------*/
4408 floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
4410 flag aSign;
4411 int32 aExp;
4412 bits64 aSig0, aSig1;
4414 aSig1 = extractFloat128Frac1( a );
4415 aSig0 = extractFloat128Frac0( a );
4416 aExp = extractFloat128Exp( a );
4417 aSign = extractFloat128Sign( a );
4418 if ( aExp == 0x7FFF ) {
4419 if ( aSig0 | aSig1 ) {
4420 return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) );
4422 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4424 if ( aExp == 0 ) {
4425 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4426 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4428 else {
4429 aSig0 |= LIT64( 0x0001000000000000 );
4431 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4432 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
4436 #endif
4438 /*----------------------------------------------------------------------------
4439 | Rounds the quadruple-precision floating-point value `a' to an integer, and
4440 | returns the result as a quadruple-precision floating-point value. The
4441 | operation is performed according to the IEC/IEEE Standard for Binary
4442 | Floating-Point Arithmetic.
4443 *----------------------------------------------------------------------------*/
4445 float128 float128_round_to_int( float128 a STATUS_PARAM )
4447 flag aSign;
4448 int32 aExp;
4449 bits64 lastBitMask, roundBitsMask;
4450 int8 roundingMode;
4451 float128 z;
4453 aExp = extractFloat128Exp( a );
4454 if ( 0x402F <= aExp ) {
4455 if ( 0x406F <= aExp ) {
4456 if ( ( aExp == 0x7FFF )
4457 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4459 return propagateFloat128NaN( a, a STATUS_VAR );
4461 return a;
4463 lastBitMask = 1;
4464 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4465 roundBitsMask = lastBitMask - 1;
4466 z = a;
4467 roundingMode = STATUS(float_rounding_mode);
4468 if ( roundingMode == float_round_nearest_even ) {
4469 if ( lastBitMask ) {
4470 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4471 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4473 else {
4474 if ( (sbits64) z.low < 0 ) {
4475 ++z.high;
4476 if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4480 else if ( roundingMode != float_round_to_zero ) {
4481 if ( extractFloat128Sign( z )
4482 ^ ( roundingMode == float_round_up ) ) {
4483 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4486 z.low &= ~ roundBitsMask;
4488 else {
4489 if ( aExp < 0x3FFF ) {
4490 if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4491 STATUS(float_exception_flags) |= float_flag_inexact;
4492 aSign = extractFloat128Sign( a );
4493 switch ( STATUS(float_rounding_mode) ) {
4494 case float_round_nearest_even:
4495 if ( ( aExp == 0x3FFE )
4496 && ( extractFloat128Frac0( a )
4497 | extractFloat128Frac1( a ) )
4499 return packFloat128( aSign, 0x3FFF, 0, 0 );
4501 break;
4502 case float_round_down:
4503 return
4504 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4505 : packFloat128( 0, 0, 0, 0 );
4506 case float_round_up:
4507 return
4508 aSign ? packFloat128( 1, 0, 0, 0 )
4509 : packFloat128( 0, 0x3FFF, 0, 0 );
4511 return packFloat128( aSign, 0, 0, 0 );
4513 lastBitMask = 1;
4514 lastBitMask <<= 0x402F - aExp;
4515 roundBitsMask = lastBitMask - 1;
4516 z.low = 0;
4517 z.high = a.high;
4518 roundingMode = STATUS(float_rounding_mode);
4519 if ( roundingMode == float_round_nearest_even ) {
4520 z.high += lastBitMask>>1;
4521 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4522 z.high &= ~ lastBitMask;
4525 else if ( roundingMode != float_round_to_zero ) {
4526 if ( extractFloat128Sign( z )
4527 ^ ( roundingMode == float_round_up ) ) {
4528 z.high |= ( a.low != 0 );
4529 z.high += roundBitsMask;
4532 z.high &= ~ roundBitsMask;
4534 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4535 STATUS(float_exception_flags) |= float_flag_inexact;
4537 return z;
4541 /*----------------------------------------------------------------------------
4542 | Returns the result of adding the absolute values of the quadruple-precision
4543 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
4544 | before being returned. `zSign' is ignored if the result is a NaN.
4545 | The addition is performed according to the IEC/IEEE Standard for Binary
4546 | Floating-Point Arithmetic.
4547 *----------------------------------------------------------------------------*/
4549 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
4551 int32 aExp, bExp, zExp;
4552 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4553 int32 expDiff;
4555 aSig1 = extractFloat128Frac1( a );
4556 aSig0 = extractFloat128Frac0( a );
4557 aExp = extractFloat128Exp( a );
4558 bSig1 = extractFloat128Frac1( b );
4559 bSig0 = extractFloat128Frac0( b );
4560 bExp = extractFloat128Exp( b );
4561 expDiff = aExp - bExp;
4562 if ( 0 < expDiff ) {
4563 if ( aExp == 0x7FFF ) {
4564 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4565 return a;
4567 if ( bExp == 0 ) {
4568 --expDiff;
4570 else {
4571 bSig0 |= LIT64( 0x0001000000000000 );
4573 shift128ExtraRightJamming(
4574 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4575 zExp = aExp;
4577 else if ( expDiff < 0 ) {
4578 if ( bExp == 0x7FFF ) {
4579 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4580 return packFloat128( zSign, 0x7FFF, 0, 0 );
4582 if ( aExp == 0 ) {
4583 ++expDiff;
4585 else {
4586 aSig0 |= LIT64( 0x0001000000000000 );
4588 shift128ExtraRightJamming(
4589 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4590 zExp = bExp;
4592 else {
4593 if ( aExp == 0x7FFF ) {
4594 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4595 return propagateFloat128NaN( a, b STATUS_VAR );
4597 return a;
4599 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4600 if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
4601 zSig2 = 0;
4602 zSig0 |= LIT64( 0x0002000000000000 );
4603 zExp = aExp;
4604 goto shiftRight1;
4606 aSig0 |= LIT64( 0x0001000000000000 );
4607 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4608 --zExp;
4609 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4610 ++zExp;
4611 shiftRight1:
4612 shift128ExtraRightJamming(
4613 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4614 roundAndPack:
4615 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4619 /*----------------------------------------------------------------------------
4620 | Returns the result of subtracting the absolute values of the quadruple-
4621 | precision floating-point values `a' and `b'. If `zSign' is 1, the
4622 | difference is negated before being returned. `zSign' is ignored if the
4623 | result is a NaN. The subtraction is performed according to the IEC/IEEE
4624 | Standard for Binary Floating-Point Arithmetic.
4625 *----------------------------------------------------------------------------*/
4627 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
4629 int32 aExp, bExp, zExp;
4630 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4631 int32 expDiff;
4632 float128 z;
4634 aSig1 = extractFloat128Frac1( a );
4635 aSig0 = extractFloat128Frac0( a );
4636 aExp = extractFloat128Exp( a );
4637 bSig1 = extractFloat128Frac1( b );
4638 bSig0 = extractFloat128Frac0( b );
4639 bExp = extractFloat128Exp( b );
4640 expDiff = aExp - bExp;
4641 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4642 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4643 if ( 0 < expDiff ) goto aExpBigger;
4644 if ( expDiff < 0 ) goto bExpBigger;
4645 if ( aExp == 0x7FFF ) {
4646 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4647 return propagateFloat128NaN( a, b STATUS_VAR );
4649 float_raise( float_flag_invalid STATUS_VAR);
4650 z.low = float128_default_nan_low;
4651 z.high = float128_default_nan_high;
4652 return z;
4654 if ( aExp == 0 ) {
4655 aExp = 1;
4656 bExp = 1;
4658 if ( bSig0 < aSig0 ) goto aBigger;
4659 if ( aSig0 < bSig0 ) goto bBigger;
4660 if ( bSig1 < aSig1 ) goto aBigger;
4661 if ( aSig1 < bSig1 ) goto bBigger;
4662 return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
4663 bExpBigger:
4664 if ( bExp == 0x7FFF ) {
4665 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4666 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4668 if ( aExp == 0 ) {
4669 ++expDiff;
4671 else {
4672 aSig0 |= LIT64( 0x4000000000000000 );
4674 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4675 bSig0 |= LIT64( 0x4000000000000000 );
4676 bBigger:
4677 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4678 zExp = bExp;
4679 zSign ^= 1;
4680 goto normalizeRoundAndPack;
4681 aExpBigger:
4682 if ( aExp == 0x7FFF ) {
4683 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4684 return a;
4686 if ( bExp == 0 ) {
4687 --expDiff;
4689 else {
4690 bSig0 |= LIT64( 0x4000000000000000 );
4692 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4693 aSig0 |= LIT64( 0x4000000000000000 );
4694 aBigger:
4695 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4696 zExp = aExp;
4697 normalizeRoundAndPack:
4698 --zExp;
4699 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
4703 /*----------------------------------------------------------------------------
4704 | Returns the result of adding the quadruple-precision floating-point values
4705 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
4706 | for Binary Floating-Point Arithmetic.
4707 *----------------------------------------------------------------------------*/
4709 float128 float128_add( float128 a, float128 b STATUS_PARAM )
4711 flag aSign, bSign;
4713 aSign = extractFloat128Sign( a );
4714 bSign = extractFloat128Sign( b );
4715 if ( aSign == bSign ) {
4716 return addFloat128Sigs( a, b, aSign STATUS_VAR );
4718 else {
4719 return subFloat128Sigs( a, b, aSign STATUS_VAR );
4724 /*----------------------------------------------------------------------------
4725 | Returns the result of subtracting the quadruple-precision floating-point
4726 | values `a' and `b'. The operation is performed according to the IEC/IEEE
4727 | Standard for Binary Floating-Point Arithmetic.
4728 *----------------------------------------------------------------------------*/
4730 float128 float128_sub( float128 a, float128 b STATUS_PARAM )
4732 flag aSign, bSign;
4734 aSign = extractFloat128Sign( a );
4735 bSign = extractFloat128Sign( b );
4736 if ( aSign == bSign ) {
4737 return subFloat128Sigs( a, b, aSign STATUS_VAR );
4739 else {
4740 return addFloat128Sigs( a, b, aSign STATUS_VAR );
4745 /*----------------------------------------------------------------------------
4746 | Returns the result of multiplying the quadruple-precision floating-point
4747 | values `a' and `b'. The operation is performed according to the IEC/IEEE
4748 | Standard for Binary Floating-Point Arithmetic.
4749 *----------------------------------------------------------------------------*/
4751 float128 float128_mul( float128 a, float128 b STATUS_PARAM )
4753 flag aSign, bSign, zSign;
4754 int32 aExp, bExp, zExp;
4755 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4756 float128 z;
4758 aSig1 = extractFloat128Frac1( a );
4759 aSig0 = extractFloat128Frac0( a );
4760 aExp = extractFloat128Exp( a );
4761 aSign = extractFloat128Sign( a );
4762 bSig1 = extractFloat128Frac1( b );
4763 bSig0 = extractFloat128Frac0( b );
4764 bExp = extractFloat128Exp( b );
4765 bSign = extractFloat128Sign( b );
4766 zSign = aSign ^ bSign;
4767 if ( aExp == 0x7FFF ) {
4768 if ( ( aSig0 | aSig1 )
4769 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4770 return propagateFloat128NaN( a, b STATUS_VAR );
4772 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4773 return packFloat128( zSign, 0x7FFF, 0, 0 );
4775 if ( bExp == 0x7FFF ) {
4776 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4777 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4778 invalid:
4779 float_raise( float_flag_invalid STATUS_VAR);
4780 z.low = float128_default_nan_low;
4781 z.high = float128_default_nan_high;
4782 return z;
4784 return packFloat128( zSign, 0x7FFF, 0, 0 );
4786 if ( aExp == 0 ) {
4787 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4788 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4790 if ( bExp == 0 ) {
4791 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4792 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4794 zExp = aExp + bExp - 0x4000;
4795 aSig0 |= LIT64( 0x0001000000000000 );
4796 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
4797 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
4798 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
4799 zSig2 |= ( zSig3 != 0 );
4800 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
4801 shift128ExtraRightJamming(
4802 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4803 ++zExp;
4805 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4809 /*----------------------------------------------------------------------------
4810 | Returns the result of dividing the quadruple-precision floating-point value
4811 | `a' by the corresponding value `b'. The operation is performed according to
4812 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4813 *----------------------------------------------------------------------------*/
4815 float128 float128_div( float128 a, float128 b STATUS_PARAM )
4817 flag aSign, bSign, zSign;
4818 int32 aExp, bExp, zExp;
4819 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4820 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4821 float128 z;
4823 aSig1 = extractFloat128Frac1( a );
4824 aSig0 = extractFloat128Frac0( a );
4825 aExp = extractFloat128Exp( a );
4826 aSign = extractFloat128Sign( a );
4827 bSig1 = extractFloat128Frac1( b );
4828 bSig0 = extractFloat128Frac0( b );
4829 bExp = extractFloat128Exp( b );
4830 bSign = extractFloat128Sign( b );
4831 zSign = aSign ^ bSign;
4832 if ( aExp == 0x7FFF ) {
4833 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4834 if ( bExp == 0x7FFF ) {
4835 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4836 goto invalid;
4838 return packFloat128( zSign, 0x7FFF, 0, 0 );
4840 if ( bExp == 0x7FFF ) {
4841 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4842 return packFloat128( zSign, 0, 0, 0 );
4844 if ( bExp == 0 ) {
4845 if ( ( bSig0 | bSig1 ) == 0 ) {
4846 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4847 invalid:
4848 float_raise( float_flag_invalid STATUS_VAR);
4849 z.low = float128_default_nan_low;
4850 z.high = float128_default_nan_high;
4851 return z;
4853 float_raise( float_flag_divbyzero STATUS_VAR);
4854 return packFloat128( zSign, 0x7FFF, 0, 0 );
4856 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4858 if ( aExp == 0 ) {
4859 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4860 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4862 zExp = aExp - bExp + 0x3FFD;
4863 shortShift128Left(
4864 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
4865 shortShift128Left(
4866 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4867 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
4868 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
4869 ++zExp;
4871 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
4872 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
4873 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
4874 while ( (sbits64) rem0 < 0 ) {
4875 --zSig0;
4876 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
4878 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
4879 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
4880 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
4881 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
4882 while ( (sbits64) rem1 < 0 ) {
4883 --zSig1;
4884 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
4886 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4888 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
4889 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4893 /*----------------------------------------------------------------------------
4894 | Returns the remainder of the quadruple-precision floating-point value `a'
4895 | with respect to the corresponding value `b'. The operation is performed
4896 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4897 *----------------------------------------------------------------------------*/
4899 float128 float128_rem( float128 a, float128 b STATUS_PARAM )
4901 flag aSign, bSign, zSign;
4902 int32 aExp, bExp, expDiff;
4903 bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
4904 bits64 allZero, alternateASig0, alternateASig1, sigMean1;
4905 sbits64 sigMean0;
4906 float128 z;
4908 aSig1 = extractFloat128Frac1( a );
4909 aSig0 = extractFloat128Frac0( a );
4910 aExp = extractFloat128Exp( a );
4911 aSign = extractFloat128Sign( a );
4912 bSig1 = extractFloat128Frac1( b );
4913 bSig0 = extractFloat128Frac0( b );
4914 bExp = extractFloat128Exp( b );
4915 bSign = extractFloat128Sign( b );
4916 if ( aExp == 0x7FFF ) {
4917 if ( ( aSig0 | aSig1 )
4918 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4919 return propagateFloat128NaN( a, b STATUS_VAR );
4921 goto invalid;
4923 if ( bExp == 0x7FFF ) {
4924 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4925 return a;
4927 if ( bExp == 0 ) {
4928 if ( ( bSig0 | bSig1 ) == 0 ) {
4929 invalid:
4930 float_raise( float_flag_invalid STATUS_VAR);
4931 z.low = float128_default_nan_low;
4932 z.high = float128_default_nan_high;
4933 return z;
4935 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4937 if ( aExp == 0 ) {
4938 if ( ( aSig0 | aSig1 ) == 0 ) return a;
4939 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4941 expDiff = aExp - bExp;
4942 if ( expDiff < -1 ) return a;
4943 shortShift128Left(
4944 aSig0 | LIT64( 0x0001000000000000 ),
4945 aSig1,
4946 15 - ( expDiff < 0 ),
4947 &aSig0,
4948 &aSig1
4950 shortShift128Left(
4951 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4952 q = le128( bSig0, bSig1, aSig0, aSig1 );
4953 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
4954 expDiff -= 64;
4955 while ( 0 < expDiff ) {
4956 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
4957 q = ( 4 < q ) ? q - 4 : 0;
4958 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
4959 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
4960 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
4961 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
4962 expDiff -= 61;
4964 if ( -64 < expDiff ) {
4965 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
4966 q = ( 4 < q ) ? q - 4 : 0;
4967 q >>= - expDiff;
4968 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
4969 expDiff += 52;
4970 if ( expDiff < 0 ) {
4971 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4973 else {
4974 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
4976 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
4977 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
4979 else {
4980 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
4981 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
4983 do {
4984 alternateASig0 = aSig0;
4985 alternateASig1 = aSig1;
4986 ++q;
4987 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
4988 } while ( 0 <= (sbits64) aSig0 );
4989 add128(
4990 aSig0, aSig1, alternateASig0, alternateASig1, (bits64 *)&sigMean0, &sigMean1 );
4991 if ( ( sigMean0 < 0 )
4992 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
4993 aSig0 = alternateASig0;
4994 aSig1 = alternateASig1;
4996 zSign = ( (sbits64) aSig0 < 0 );
4997 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
4998 return
4999 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
5003 /*----------------------------------------------------------------------------
5004 | Returns the square root of the quadruple-precision floating-point value `a'.
5005 | The operation is performed according to the IEC/IEEE Standard for Binary
5006 | Floating-Point Arithmetic.
5007 *----------------------------------------------------------------------------*/
5009 float128 float128_sqrt( float128 a STATUS_PARAM )
5011 flag aSign;
5012 int32 aExp, zExp;
5013 bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5014 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5015 float128 z;
5017 aSig1 = extractFloat128Frac1( a );
5018 aSig0 = extractFloat128Frac0( a );
5019 aExp = extractFloat128Exp( a );
5020 aSign = extractFloat128Sign( a );
5021 if ( aExp == 0x7FFF ) {
5022 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
5023 if ( ! aSign ) return a;
5024 goto invalid;
5026 if ( aSign ) {
5027 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5028 invalid:
5029 float_raise( float_flag_invalid STATUS_VAR);
5030 z.low = float128_default_nan_low;
5031 z.high = float128_default_nan_high;
5032 return z;
5034 if ( aExp == 0 ) {
5035 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5036 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5038 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5039 aSig0 |= LIT64( 0x0001000000000000 );
5040 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5041 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5042 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5043 doubleZSig0 = zSig0<<1;
5044 mul64To128( zSig0, zSig0, &term0, &term1 );
5045 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5046 while ( (sbits64) rem0 < 0 ) {
5047 --zSig0;
5048 doubleZSig0 -= 2;
5049 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5051 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5052 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5053 if ( zSig1 == 0 ) zSig1 = 1;
5054 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5055 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5056 mul64To128( zSig1, zSig1, &term2, &term3 );
5057 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5058 while ( (sbits64) rem1 < 0 ) {
5059 --zSig1;
5060 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5061 term3 |= 1;
5062 term2 |= doubleZSig0;
5063 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5065 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5067 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5068 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5072 /*----------------------------------------------------------------------------
5073 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5074 | the corresponding value `b', and 0 otherwise. The comparison is performed
5075 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5076 *----------------------------------------------------------------------------*/
5078 int float128_eq( float128 a, float128 b STATUS_PARAM )
5081 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5082 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5083 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5084 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5086 if ( float128_is_signaling_nan( a )
5087 || float128_is_signaling_nan( b ) ) {
5088 float_raise( float_flag_invalid STATUS_VAR);
5090 return 0;
5092 return
5093 ( a.low == b.low )
5094 && ( ( a.high == b.high )
5095 || ( ( a.low == 0 )
5096 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5101 /*----------------------------------------------------------------------------
5102 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5103 | or equal to the corresponding value `b', and 0 otherwise. The comparison
5104 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5105 | Arithmetic.
5106 *----------------------------------------------------------------------------*/
5108 int float128_le( float128 a, float128 b STATUS_PARAM )
5110 flag aSign, bSign;
5112 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5113 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5114 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5115 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5117 float_raise( float_flag_invalid STATUS_VAR);
5118 return 0;
5120 aSign = extractFloat128Sign( a );
5121 bSign = extractFloat128Sign( b );
5122 if ( aSign != bSign ) {
5123 return
5124 aSign
5125 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5126 == 0 );
5128 return
5129 aSign ? le128( b.high, b.low, a.high, a.low )
5130 : le128( a.high, a.low, b.high, b.low );
5134 /*----------------------------------------------------------------------------
5135 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5136 | the corresponding value `b', and 0 otherwise. The comparison is performed
5137 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5138 *----------------------------------------------------------------------------*/
5140 int float128_lt( float128 a, float128 b STATUS_PARAM )
5142 flag aSign, bSign;
5144 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5145 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5146 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5147 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5149 float_raise( float_flag_invalid STATUS_VAR);
5150 return 0;
5152 aSign = extractFloat128Sign( a );
5153 bSign = extractFloat128Sign( b );
5154 if ( aSign != bSign ) {
5155 return
5156 aSign
5157 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5158 != 0 );
5160 return
5161 aSign ? lt128( b.high, b.low, a.high, a.low )
5162 : lt128( a.high, a.low, b.high, b.low );
5166 /*----------------------------------------------------------------------------
5167 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5168 | the corresponding value `b', and 0 otherwise. The invalid exception is
5169 | raised if either operand is a NaN. Otherwise, the comparison is performed
5170 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5171 *----------------------------------------------------------------------------*/
5173 int float128_eq_signaling( float128 a, float128 b STATUS_PARAM )
5176 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5177 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5178 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5179 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5181 float_raise( float_flag_invalid STATUS_VAR);
5182 return 0;
5184 return
5185 ( a.low == b.low )
5186 && ( ( a.high == b.high )
5187 || ( ( a.low == 0 )
5188 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5193 /*----------------------------------------------------------------------------
5194 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5195 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5196 | cause an exception. Otherwise, the comparison is performed according to the
5197 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5198 *----------------------------------------------------------------------------*/
5200 int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
5202 flag aSign, bSign;
5204 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5205 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5206 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5207 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5209 if ( float128_is_signaling_nan( a )
5210 || float128_is_signaling_nan( b ) ) {
5211 float_raise( float_flag_invalid STATUS_VAR);
5213 return 0;
5215 aSign = extractFloat128Sign( a );
5216 bSign = extractFloat128Sign( b );
5217 if ( aSign != bSign ) {
5218 return
5219 aSign
5220 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5221 == 0 );
5223 return
5224 aSign ? le128( b.high, b.low, a.high, a.low )
5225 : le128( a.high, a.low, b.high, b.low );
5229 /*----------------------------------------------------------------------------
5230 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5231 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5232 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
5233 | Standard for Binary Floating-Point Arithmetic.
5234 *----------------------------------------------------------------------------*/
5236 int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
5238 flag aSign, bSign;
5240 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5241 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5242 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5243 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5245 if ( float128_is_signaling_nan( a )
5246 || float128_is_signaling_nan( b ) ) {
5247 float_raise( float_flag_invalid STATUS_VAR);
5249 return 0;
5251 aSign = extractFloat128Sign( a );
5252 bSign = extractFloat128Sign( b );
5253 if ( aSign != bSign ) {
5254 return
5255 aSign
5256 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5257 != 0 );
5259 return
5260 aSign ? lt128( b.high, b.low, a.high, a.low )
5261 : lt128( a.high, a.low, b.high, b.low );
5265 #endif
5267 /* misc functions */
5268 float32 uint32_to_float32( unsigned int a STATUS_PARAM )
5270 return int64_to_float32(a STATUS_VAR);
5273 float64 uint32_to_float64( unsigned int a STATUS_PARAM )
5275 return int64_to_float64(a STATUS_VAR);
5278 unsigned int float32_to_uint32( float32 a STATUS_PARAM )
5280 int64_t v;
5281 unsigned int res;
5283 v = float32_to_int64(a STATUS_VAR);
5284 if (v < 0) {
5285 res = 0;
5286 float_raise( float_flag_invalid STATUS_VAR);
5287 } else if (v > 0xffffffff) {
5288 res = 0xffffffff;
5289 float_raise( float_flag_invalid STATUS_VAR);
5290 } else {
5291 res = v;
5293 return res;
5296 unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
5298 int64_t v;
5299 unsigned int res;
5301 v = float32_to_int64_round_to_zero(a STATUS_VAR);
5302 if (v < 0) {
5303 res = 0;
5304 float_raise( float_flag_invalid STATUS_VAR);
5305 } else if (v > 0xffffffff) {
5306 res = 0xffffffff;
5307 float_raise( float_flag_invalid STATUS_VAR);
5308 } else {
5309 res = v;
5311 return res;
5314 unsigned int float64_to_uint32( float64 a STATUS_PARAM )
5316 int64_t v;
5317 unsigned int res;
5319 v = float64_to_int64(a STATUS_VAR);
5320 if (v < 0) {
5321 res = 0;
5322 float_raise( float_flag_invalid STATUS_VAR);
5323 } else if (v > 0xffffffff) {
5324 res = 0xffffffff;
5325 float_raise( float_flag_invalid STATUS_VAR);
5326 } else {
5327 res = v;
5329 return res;
5332 unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
5334 int64_t v;
5335 unsigned int res;
5337 v = float64_to_int64_round_to_zero(a STATUS_VAR);
5338 if (v < 0) {
5339 res = 0;
5340 float_raise( float_flag_invalid STATUS_VAR);
5341 } else if (v > 0xffffffff) {
5342 res = 0xffffffff;
5343 float_raise( float_flag_invalid STATUS_VAR);
5344 } else {
5345 res = v;
5347 return res;
5350 /* FIXME: This looks broken. */
5351 uint64_t float64_to_uint64 (float64 a STATUS_PARAM)
5353 int64_t v;
5355 v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
5356 v += float64_val(a);
5357 v = float64_to_int64(make_float64(v) STATUS_VAR);
5359 return v - INT64_MIN;
5362 uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
5364 int64_t v;
5366 v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
5367 v += float64_val(a);
5368 v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR);
5370 return v - INT64_MIN;
5373 #define COMPARE(s, nan_exp) \
5374 INLINE int float ## s ## _compare_internal( float ## s a, float ## s b, \
5375 int is_quiet STATUS_PARAM ) \
5377 flag aSign, bSign; \
5378 bits ## s av, bv; \
5380 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \
5381 extractFloat ## s ## Frac( a ) ) || \
5382 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \
5383 extractFloat ## s ## Frac( b ) )) { \
5384 if (!is_quiet || \
5385 float ## s ## _is_signaling_nan( a ) || \
5386 float ## s ## _is_signaling_nan( b ) ) { \
5387 float_raise( float_flag_invalid STATUS_VAR); \
5389 return float_relation_unordered; \
5391 aSign = extractFloat ## s ## Sign( a ); \
5392 bSign = extractFloat ## s ## Sign( b ); \
5393 av = float ## s ## _val(a); \
5394 bv = float ## s ## _val(b); \
5395 if ( aSign != bSign ) { \
5396 if ( (bits ## s) ( ( av | bv )<<1 ) == 0 ) { \
5397 /* zero case */ \
5398 return float_relation_equal; \
5399 } else { \
5400 return 1 - (2 * aSign); \
5402 } else { \
5403 if (av == bv) { \
5404 return float_relation_equal; \
5405 } else { \
5406 return 1 - 2 * (aSign ^ ( av < bv )); \
5411 int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM ) \
5413 return float ## s ## _compare_internal(a, b, 0 STATUS_VAR); \
5416 int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM ) \
5418 return float ## s ## _compare_internal(a, b, 1 STATUS_VAR); \
5421 COMPARE(32, 0xff)
5422 COMPARE(64, 0x7ff)
5424 INLINE int float128_compare_internal( float128 a, float128 b,
5425 int is_quiet STATUS_PARAM )
5427 flag aSign, bSign;
5429 if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
5430 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
5431 ( ( extractFloat128Exp( b ) == 0x7fff ) &&
5432 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
5433 if (!is_quiet ||
5434 float128_is_signaling_nan( a ) ||
5435 float128_is_signaling_nan( b ) ) {
5436 float_raise( float_flag_invalid STATUS_VAR);
5438 return float_relation_unordered;
5440 aSign = extractFloat128Sign( a );
5441 bSign = extractFloat128Sign( b );
5442 if ( aSign != bSign ) {
5443 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
5444 /* zero case */
5445 return float_relation_equal;
5446 } else {
5447 return 1 - (2 * aSign);
5449 } else {
5450 if (a.low == b.low && a.high == b.high) {
5451 return float_relation_equal;
5452 } else {
5453 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
5458 int float128_compare( float128 a, float128 b STATUS_PARAM )
5460 return float128_compare_internal(a, b, 0 STATUS_VAR);
5463 int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
5465 return float128_compare_internal(a, b, 1 STATUS_VAR);
5468 /* Multiply A by 2 raised to the power N. */
5469 float32 float32_scalbn( float32 a, int n STATUS_PARAM )
5471 flag aSign;
5472 int16 aExp;
5473 bits32 aSig;
5475 aSig = extractFloat32Frac( a );
5476 aExp = extractFloat32Exp( a );
5477 aSign = extractFloat32Sign( a );
5479 if ( aExp == 0xFF ) {
5480 return a;
5482 aExp += n;
5483 return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
5486 float64 float64_scalbn( float64 a, int n STATUS_PARAM )
5488 flag aSign;
5489 int16 aExp;
5490 bits64 aSig;
5492 aSig = extractFloat64Frac( a );
5493 aExp = extractFloat64Exp( a );
5494 aSign = extractFloat64Sign( a );
5496 if ( aExp == 0x7FF ) {
5497 return a;
5499 aExp += n;
5500 return roundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
5503 #ifdef FLOATX80
5504 floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
5506 flag aSign;
5507 int16 aExp;
5508 bits64 aSig;
5510 aSig = extractFloatx80Frac( a );
5511 aExp = extractFloatx80Exp( a );
5512 aSign = extractFloatx80Sign( a );
5514 if ( aExp == 0x7FF ) {
5515 return a;
5517 aExp += n;
5518 return roundAndPackFloatx80( STATUS(floatx80_rounding_precision),
5519 aSign, aExp, aSig, 0 STATUS_VAR );
5521 #endif
5523 #ifdef FLOAT128
5524 float128 float128_scalbn( float128 a, int n STATUS_PARAM )
5526 flag aSign;
5527 int32 aExp;
5528 bits64 aSig0, aSig1;
5530 aSig1 = extractFloat128Frac1( a );
5531 aSig0 = extractFloat128Frac0( a );
5532 aExp = extractFloat128Exp( a );
5533 aSign = extractFloat128Sign( a );
5534 if ( aExp == 0x7FFF ) {
5535 return a;
5537 aExp += n;
5538 return roundAndPackFloat128( aSign, aExp, aSig0, aSig1, 0 STATUS_VAR );
5541 #endif