Replace tabs by 8 spaces. No code change, by Herve Poussineau.
[qemu/dscho.git] / fpu / softfloat.c
blob6db6cf1321403cda7b814daa4c2a81e1044c0a36
2 /*============================================================================
4 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
5 Package, Release 2b.
7 Written by John R. Hauser. This work was made possible in part by the
8 International Computer Science Institute, located at Suite 600, 1947 Center
9 Street, Berkeley, California 94704. Funding was partially provided by the
10 National Science Foundation under grant MIP-9311980. The original version
11 of this code was written as part of a project to build a fixed-point vector
12 processor in collaboration with the University of California at Berkeley,
13 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
14 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
15 arithmetic/SoftFloat.html'.
17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
18 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
19 RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
20 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
21 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
22 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
23 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
24 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
26 Derivative works are acceptable, even for commercial purposes, so long as
27 (1) the source code for the derivative work includes prominent notice that
28 the work is derivative, and (2) the source code includes prominent notice with
29 these four paragraphs for those parts of this code that are retained.
31 =============================================================================*/
33 #include "softfloat.h"
35 /*----------------------------------------------------------------------------
36 | Primitive arithmetic functions, including multi-word arithmetic, and
37 | division and square root approximations. (Can be specialized to target if
38 | desired.)
39 *----------------------------------------------------------------------------*/
40 #include "softfloat-macros.h"
42 /*----------------------------------------------------------------------------
43 | Functions and definitions to determine: (1) whether tininess for underflow
44 | is detected before or after rounding by default, (2) what (if anything)
45 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
46 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
47 | are propagated from function inputs to output. These details are target-
48 | specific.
49 *----------------------------------------------------------------------------*/
50 #include "softfloat-specialize.h"
52 void set_float_rounding_mode(int val STATUS_PARAM)
54 STATUS(float_rounding_mode) = val;
57 void set_float_exception_flags(int val STATUS_PARAM)
59 STATUS(float_exception_flags) = val;
62 #ifdef FLOATX80
63 void set_floatx80_rounding_precision(int val STATUS_PARAM)
65 STATUS(floatx80_rounding_precision) = val;
67 #endif
69 /*----------------------------------------------------------------------------
70 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
71 | and 7, and returns the properly rounded 32-bit integer corresponding to the
72 | input. If `zSign' is 1, the input is negated before being converted to an
73 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
74 | is simply rounded to an integer, with the inexact exception raised if the
75 | input cannot be represented exactly as an integer. However, if the fixed-
76 | point input is too large, the invalid exception is raised and the largest
77 | positive or negative integer is returned.
78 *----------------------------------------------------------------------------*/
80 static int32 roundAndPackInt32( flag zSign, bits64 absZ STATUS_PARAM)
82 int8 roundingMode;
83 flag roundNearestEven;
84 int8 roundIncrement, roundBits;
85 int32 z;
87 roundingMode = STATUS(float_rounding_mode);
88 roundNearestEven = ( roundingMode == float_round_nearest_even );
89 roundIncrement = 0x40;
90 if ( ! roundNearestEven ) {
91 if ( roundingMode == float_round_to_zero ) {
92 roundIncrement = 0;
94 else {
95 roundIncrement = 0x7F;
96 if ( zSign ) {
97 if ( roundingMode == float_round_up ) roundIncrement = 0;
99 else {
100 if ( roundingMode == float_round_down ) roundIncrement = 0;
104 roundBits = absZ & 0x7F;
105 absZ = ( absZ + roundIncrement )>>7;
106 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
107 z = absZ;
108 if ( zSign ) z = - z;
109 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
110 float_raise( float_flag_invalid STATUS_VAR);
111 return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
113 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
114 return z;
118 /*----------------------------------------------------------------------------
119 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
120 | `absZ1', with binary point between bits 63 and 64 (between the input words),
121 | and returns the properly rounded 64-bit integer corresponding to the input.
122 | If `zSign' is 1, the input is negated before being converted to an integer.
123 | Ordinarily, the fixed-point input is simply rounded to an integer, with
124 | the inexact exception raised if the input cannot be represented exactly as
125 | an integer. However, if the fixed-point input is too large, the invalid
126 | exception is raised and the largest positive or negative integer is
127 | returned.
128 *----------------------------------------------------------------------------*/
130 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 STATUS_PARAM)
132 int8 roundingMode;
133 flag roundNearestEven, increment;
134 int64 z;
136 roundingMode = STATUS(float_rounding_mode);
137 roundNearestEven = ( roundingMode == float_round_nearest_even );
138 increment = ( (sbits64) absZ1 < 0 );
139 if ( ! roundNearestEven ) {
140 if ( roundingMode == float_round_to_zero ) {
141 increment = 0;
143 else {
144 if ( zSign ) {
145 increment = ( roundingMode == float_round_down ) && absZ1;
147 else {
148 increment = ( roundingMode == float_round_up ) && absZ1;
152 if ( increment ) {
153 ++absZ0;
154 if ( absZ0 == 0 ) goto overflow;
155 absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
157 z = absZ0;
158 if ( zSign ) z = - z;
159 if ( z && ( ( z < 0 ) ^ zSign ) ) {
160 overflow:
161 float_raise( float_flag_invalid STATUS_VAR);
162 return
163 zSign ? (sbits64) LIT64( 0x8000000000000000 )
164 : LIT64( 0x7FFFFFFFFFFFFFFF );
166 if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
167 return z;
171 /*----------------------------------------------------------------------------
172 | Returns the fraction bits of the single-precision floating-point value `a'.
173 *----------------------------------------------------------------------------*/
175 INLINE bits32 extractFloat32Frac( float32 a )
178 return a & 0x007FFFFF;
182 /*----------------------------------------------------------------------------
183 | Returns the exponent bits of the single-precision floating-point value `a'.
184 *----------------------------------------------------------------------------*/
186 INLINE int16 extractFloat32Exp( float32 a )
189 return ( a>>23 ) & 0xFF;
193 /*----------------------------------------------------------------------------
194 | Returns the sign bit of the single-precision floating-point value `a'.
195 *----------------------------------------------------------------------------*/
197 INLINE flag extractFloat32Sign( float32 a )
200 return a>>31;
204 /*----------------------------------------------------------------------------
205 | Normalizes the subnormal single-precision floating-point value represented
206 | by the denormalized significand `aSig'. The normalized exponent and
207 | significand are stored at the locations pointed to by `zExpPtr' and
208 | `zSigPtr', respectively.
209 *----------------------------------------------------------------------------*/
211 static void
212 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
214 int8 shiftCount;
216 shiftCount = countLeadingZeros32( aSig ) - 8;
217 *zSigPtr = aSig<<shiftCount;
218 *zExpPtr = 1 - shiftCount;
222 /*----------------------------------------------------------------------------
223 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
224 | single-precision floating-point value, returning the result. After being
225 | shifted into the proper positions, the three fields are simply added
226 | together to form the result. This means that any integer portion of `zSig'
227 | will be added into the exponent. Since a properly normalized significand
228 | will have an integer portion equal to 1, the `zExp' input should be 1 less
229 | than the desired result exponent whenever `zSig' is a complete, normalized
230 | significand.
231 *----------------------------------------------------------------------------*/
233 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
236 return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
240 /*----------------------------------------------------------------------------
241 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
242 | and significand `zSig', and returns the proper single-precision floating-
243 | point value corresponding to the abstract input. Ordinarily, the abstract
244 | value is simply rounded and packed into the single-precision format, with
245 | the inexact exception raised if the abstract input cannot be represented
246 | exactly. However, if the abstract value is too large, the overflow and
247 | inexact exceptions are raised and an infinity or maximal finite value is
248 | returned. If the abstract value is too small, the input value is rounded to
249 | a subnormal number, and the underflow and inexact exceptions are raised if
250 | the abstract input cannot be represented exactly as a subnormal single-
251 | precision floating-point number.
252 | The input significand `zSig' has its binary point between bits 30
253 | and 29, which is 7 bits to the left of the usual location. This shifted
254 | significand must be normalized or smaller. If `zSig' is not normalized,
255 | `zExp' must be 0; in that case, the result returned is a subnormal number,
256 | and it must not require rounding. In the usual case that `zSig' is
257 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
258 | The handling of underflow and overflow follows the IEC/IEEE Standard for
259 | Binary Floating-Point Arithmetic.
260 *----------------------------------------------------------------------------*/
262 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
264 int8 roundingMode;
265 flag roundNearestEven;
266 int8 roundIncrement, roundBits;
267 flag isTiny;
269 roundingMode = STATUS(float_rounding_mode);
270 roundNearestEven = ( roundingMode == float_round_nearest_even );
271 roundIncrement = 0x40;
272 if ( ! roundNearestEven ) {
273 if ( roundingMode == float_round_to_zero ) {
274 roundIncrement = 0;
276 else {
277 roundIncrement = 0x7F;
278 if ( zSign ) {
279 if ( roundingMode == float_round_up ) roundIncrement = 0;
281 else {
282 if ( roundingMode == float_round_down ) roundIncrement = 0;
286 roundBits = zSig & 0x7F;
287 if ( 0xFD <= (bits16) zExp ) {
288 if ( ( 0xFD < zExp )
289 || ( ( zExp == 0xFD )
290 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
292 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
293 return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
295 if ( zExp < 0 ) {
296 isTiny =
297 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
298 || ( zExp < -1 )
299 || ( zSig + roundIncrement < 0x80000000 );
300 shift32RightJamming( zSig, - zExp, &zSig );
301 zExp = 0;
302 roundBits = zSig & 0x7F;
303 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
306 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
307 zSig = ( zSig + roundIncrement )>>7;
308 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
309 if ( zSig == 0 ) zExp = 0;
310 return packFloat32( zSign, zExp, zSig );
314 /*----------------------------------------------------------------------------
315 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
316 | and significand `zSig', and returns the proper single-precision floating-
317 | point value corresponding to the abstract input. This routine is just like
318 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
319 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
320 | floating-point exponent.
321 *----------------------------------------------------------------------------*/
323 static float32
324 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
326 int8 shiftCount;
328 shiftCount = countLeadingZeros32( zSig ) - 1;
329 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
333 /*----------------------------------------------------------------------------
334 | Returns the fraction bits of the double-precision floating-point value `a'.
335 *----------------------------------------------------------------------------*/
337 INLINE bits64 extractFloat64Frac( float64 a )
340 return a & LIT64( 0x000FFFFFFFFFFFFF );
344 /*----------------------------------------------------------------------------
345 | Returns the exponent bits of the double-precision floating-point value `a'.
346 *----------------------------------------------------------------------------*/
348 INLINE int16 extractFloat64Exp( float64 a )
351 return ( a>>52 ) & 0x7FF;
355 /*----------------------------------------------------------------------------
356 | Returns the sign bit of the double-precision floating-point value `a'.
357 *----------------------------------------------------------------------------*/
359 INLINE flag extractFloat64Sign( float64 a )
362 return a>>63;
366 /*----------------------------------------------------------------------------
367 | Normalizes the subnormal double-precision floating-point value represented
368 | by the denormalized significand `aSig'. The normalized exponent and
369 | significand are stored at the locations pointed to by `zExpPtr' and
370 | `zSigPtr', respectively.
371 *----------------------------------------------------------------------------*/
373 static void
374 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
376 int8 shiftCount;
378 shiftCount = countLeadingZeros64( aSig ) - 11;
379 *zSigPtr = aSig<<shiftCount;
380 *zExpPtr = 1 - shiftCount;
384 /*----------------------------------------------------------------------------
385 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
386 | double-precision floating-point value, returning the result. After being
387 | shifted into the proper positions, the three fields are simply added
388 | together to form the result. This means that any integer portion of `zSig'
389 | will be added into the exponent. Since a properly normalized significand
390 | will have an integer portion equal to 1, the `zExp' input should be 1 less
391 | than the desired result exponent whenever `zSig' is a complete, normalized
392 | significand.
393 *----------------------------------------------------------------------------*/
395 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
398 return ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig;
402 /*----------------------------------------------------------------------------
403 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
404 | and significand `zSig', and returns the proper double-precision floating-
405 | point value corresponding to the abstract input. Ordinarily, the abstract
406 | value is simply rounded and packed into the double-precision format, with
407 | the inexact exception raised if the abstract input cannot be represented
408 | exactly. However, if the abstract value is too large, the overflow and
409 | inexact exceptions are raised and an infinity or maximal finite value is
410 | returned. If the abstract value is too small, the input value is rounded
411 | to a subnormal number, and the underflow and inexact exceptions are raised
412 | if the abstract input cannot be represented exactly as a subnormal double-
413 | precision floating-point number.
414 | The input significand `zSig' has its binary point between bits 62
415 | and 61, which is 10 bits to the left of the usual location. This shifted
416 | significand must be normalized or smaller. If `zSig' is not normalized,
417 | `zExp' must be 0; in that case, the result returned is a subnormal number,
418 | and it must not require rounding. In the usual case that `zSig' is
419 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
420 | The handling of underflow and overflow follows the IEC/IEEE Standard for
421 | Binary Floating-Point Arithmetic.
422 *----------------------------------------------------------------------------*/
424 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
426 int8 roundingMode;
427 flag roundNearestEven;
428 int16 roundIncrement, roundBits;
429 flag isTiny;
431 roundingMode = STATUS(float_rounding_mode);
432 roundNearestEven = ( roundingMode == float_round_nearest_even );
433 roundIncrement = 0x200;
434 if ( ! roundNearestEven ) {
435 if ( roundingMode == float_round_to_zero ) {
436 roundIncrement = 0;
438 else {
439 roundIncrement = 0x3FF;
440 if ( zSign ) {
441 if ( roundingMode == float_round_up ) roundIncrement = 0;
443 else {
444 if ( roundingMode == float_round_down ) roundIncrement = 0;
448 roundBits = zSig & 0x3FF;
449 if ( 0x7FD <= (bits16) zExp ) {
450 if ( ( 0x7FD < zExp )
451 || ( ( zExp == 0x7FD )
452 && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
454 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
455 return packFloat64( zSign, 0x7FF, 0 ) - ( roundIncrement == 0 );
457 if ( zExp < 0 ) {
458 isTiny =
459 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
460 || ( zExp < -1 )
461 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
462 shift64RightJamming( zSig, - zExp, &zSig );
463 zExp = 0;
464 roundBits = zSig & 0x3FF;
465 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
468 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
469 zSig = ( zSig + roundIncrement )>>10;
470 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
471 if ( zSig == 0 ) zExp = 0;
472 return packFloat64( zSign, zExp, zSig );
476 /*----------------------------------------------------------------------------
477 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
478 | and significand `zSig', and returns the proper double-precision floating-
479 | point value corresponding to the abstract input. This routine is just like
480 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
481 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
482 | floating-point exponent.
483 *----------------------------------------------------------------------------*/
485 static float64
486 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
488 int8 shiftCount;
490 shiftCount = countLeadingZeros64( zSig ) - 1;
491 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
495 #ifdef FLOATX80
497 /*----------------------------------------------------------------------------
498 | Returns the fraction bits of the extended double-precision floating-point
499 | value `a'.
500 *----------------------------------------------------------------------------*/
502 INLINE bits64 extractFloatx80Frac( floatx80 a )
505 return a.low;
509 /*----------------------------------------------------------------------------
510 | Returns the exponent bits of the extended double-precision floating-point
511 | value `a'.
512 *----------------------------------------------------------------------------*/
514 INLINE int32 extractFloatx80Exp( floatx80 a )
517 return a.high & 0x7FFF;
521 /*----------------------------------------------------------------------------
522 | Returns the sign bit of the extended double-precision floating-point value
523 | `a'.
524 *----------------------------------------------------------------------------*/
526 INLINE flag extractFloatx80Sign( floatx80 a )
529 return a.high>>15;
533 /*----------------------------------------------------------------------------
534 | Normalizes the subnormal extended double-precision floating-point value
535 | represented by the denormalized significand `aSig'. The normalized exponent
536 | and significand are stored at the locations pointed to by `zExpPtr' and
537 | `zSigPtr', respectively.
538 *----------------------------------------------------------------------------*/
540 static void
541 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
543 int8 shiftCount;
545 shiftCount = countLeadingZeros64( aSig );
546 *zSigPtr = aSig<<shiftCount;
547 *zExpPtr = 1 - shiftCount;
551 /*----------------------------------------------------------------------------
552 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
553 | extended double-precision floating-point value, returning the result.
554 *----------------------------------------------------------------------------*/
556 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
558 floatx80 z;
560 z.low = zSig;
561 z.high = ( ( (bits16) zSign )<<15 ) + zExp;
562 return z;
566 /*----------------------------------------------------------------------------
567 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
568 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
569 | and returns the proper extended double-precision floating-point value
570 | corresponding to the abstract input. Ordinarily, the abstract value is
571 | rounded and packed into the extended double-precision format, with the
572 | inexact exception raised if the abstract input cannot be represented
573 | exactly. However, if the abstract value is too large, the overflow and
574 | inexact exceptions are raised and an infinity or maximal finite value is
575 | returned. If the abstract value is too small, the input value is rounded to
576 | a subnormal number, and the underflow and inexact exceptions are raised if
577 | the abstract input cannot be represented exactly as a subnormal extended
578 | double-precision floating-point number.
579 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
580 | number of bits as single or double precision, respectively. Otherwise, the
581 | result is rounded to the full precision of the extended double-precision
582 | format.
583 | The input significand must be normalized or smaller. If the input
584 | significand is not normalized, `zExp' must be 0; in that case, the result
585 | returned is a subnormal number, and it must not require rounding. The
586 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
587 | Floating-Point Arithmetic.
588 *----------------------------------------------------------------------------*/
590 static floatx80
591 roundAndPackFloatx80(
592 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
593 STATUS_PARAM)
595 int8 roundingMode;
596 flag roundNearestEven, increment, isTiny;
597 int64 roundIncrement, roundMask, roundBits;
599 roundingMode = STATUS(float_rounding_mode);
600 roundNearestEven = ( roundingMode == float_round_nearest_even );
601 if ( roundingPrecision == 80 ) goto precision80;
602 if ( roundingPrecision == 64 ) {
603 roundIncrement = LIT64( 0x0000000000000400 );
604 roundMask = LIT64( 0x00000000000007FF );
606 else if ( roundingPrecision == 32 ) {
607 roundIncrement = LIT64( 0x0000008000000000 );
608 roundMask = LIT64( 0x000000FFFFFFFFFF );
610 else {
611 goto precision80;
613 zSig0 |= ( zSig1 != 0 );
614 if ( ! roundNearestEven ) {
615 if ( roundingMode == float_round_to_zero ) {
616 roundIncrement = 0;
618 else {
619 roundIncrement = roundMask;
620 if ( zSign ) {
621 if ( roundingMode == float_round_up ) roundIncrement = 0;
623 else {
624 if ( roundingMode == float_round_down ) roundIncrement = 0;
628 roundBits = zSig0 & roundMask;
629 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
630 if ( ( 0x7FFE < zExp )
631 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
633 goto overflow;
635 if ( zExp <= 0 ) {
636 isTiny =
637 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
638 || ( zExp < 0 )
639 || ( zSig0 <= zSig0 + roundIncrement );
640 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
641 zExp = 0;
642 roundBits = zSig0 & roundMask;
643 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
644 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
645 zSig0 += roundIncrement;
646 if ( (sbits64) zSig0 < 0 ) zExp = 1;
647 roundIncrement = roundMask + 1;
648 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
649 roundMask |= roundIncrement;
651 zSig0 &= ~ roundMask;
652 return packFloatx80( zSign, zExp, zSig0 );
655 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
656 zSig0 += roundIncrement;
657 if ( zSig0 < roundIncrement ) {
658 ++zExp;
659 zSig0 = LIT64( 0x8000000000000000 );
661 roundIncrement = roundMask + 1;
662 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
663 roundMask |= roundIncrement;
665 zSig0 &= ~ roundMask;
666 if ( zSig0 == 0 ) zExp = 0;
667 return packFloatx80( zSign, zExp, zSig0 );
668 precision80:
669 increment = ( (sbits64) zSig1 < 0 );
670 if ( ! roundNearestEven ) {
671 if ( roundingMode == float_round_to_zero ) {
672 increment = 0;
674 else {
675 if ( zSign ) {
676 increment = ( roundingMode == float_round_down ) && zSig1;
678 else {
679 increment = ( roundingMode == float_round_up ) && zSig1;
683 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
684 if ( ( 0x7FFE < zExp )
685 || ( ( zExp == 0x7FFE )
686 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
687 && increment
690 roundMask = 0;
691 overflow:
692 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
693 if ( ( roundingMode == float_round_to_zero )
694 || ( zSign && ( roundingMode == float_round_up ) )
695 || ( ! zSign && ( roundingMode == float_round_down ) )
697 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
699 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
701 if ( zExp <= 0 ) {
702 isTiny =
703 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
704 || ( zExp < 0 )
705 || ! increment
706 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
707 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
708 zExp = 0;
709 if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
710 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
711 if ( roundNearestEven ) {
712 increment = ( (sbits64) zSig1 < 0 );
714 else {
715 if ( zSign ) {
716 increment = ( roundingMode == float_round_down ) && zSig1;
718 else {
719 increment = ( roundingMode == float_round_up ) && zSig1;
722 if ( increment ) {
723 ++zSig0;
724 zSig0 &=
725 ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
726 if ( (sbits64) zSig0 < 0 ) zExp = 1;
728 return packFloatx80( zSign, zExp, zSig0 );
731 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
732 if ( increment ) {
733 ++zSig0;
734 if ( zSig0 == 0 ) {
735 ++zExp;
736 zSig0 = LIT64( 0x8000000000000000 );
738 else {
739 zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
742 else {
743 if ( zSig0 == 0 ) zExp = 0;
745 return packFloatx80( zSign, zExp, zSig0 );
749 /*----------------------------------------------------------------------------
750 | Takes an abstract floating-point value having sign `zSign', exponent
751 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
752 | and returns the proper extended double-precision floating-point value
753 | corresponding to the abstract input. This routine is just like
754 | `roundAndPackFloatx80' except that the input significand does not have to be
755 | normalized.
756 *----------------------------------------------------------------------------*/
758 static floatx80
759 normalizeRoundAndPackFloatx80(
760 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
761 STATUS_PARAM)
763 int8 shiftCount;
765 if ( zSig0 == 0 ) {
766 zSig0 = zSig1;
767 zSig1 = 0;
768 zExp -= 64;
770 shiftCount = countLeadingZeros64( zSig0 );
771 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
772 zExp -= shiftCount;
773 return
774 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
778 #endif
780 #ifdef FLOAT128
782 /*----------------------------------------------------------------------------
783 | Returns the least-significant 64 fraction bits of the quadruple-precision
784 | floating-point value `a'.
785 *----------------------------------------------------------------------------*/
787 INLINE bits64 extractFloat128Frac1( float128 a )
790 return a.low;
794 /*----------------------------------------------------------------------------
795 | Returns the most-significant 48 fraction bits of the quadruple-precision
796 | floating-point value `a'.
797 *----------------------------------------------------------------------------*/
799 INLINE bits64 extractFloat128Frac0( float128 a )
802 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
806 /*----------------------------------------------------------------------------
807 | Returns the exponent bits of the quadruple-precision floating-point value
808 | `a'.
809 *----------------------------------------------------------------------------*/
811 INLINE int32 extractFloat128Exp( float128 a )
814 return ( a.high>>48 ) & 0x7FFF;
818 /*----------------------------------------------------------------------------
819 | Returns the sign bit of the quadruple-precision floating-point value `a'.
820 *----------------------------------------------------------------------------*/
822 INLINE flag extractFloat128Sign( float128 a )
825 return a.high>>63;
829 /*----------------------------------------------------------------------------
830 | Normalizes the subnormal quadruple-precision floating-point value
831 | represented by the denormalized significand formed by the concatenation of
832 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
833 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
834 | significand are stored at the location pointed to by `zSig0Ptr', and the
835 | least significant 64 bits of the normalized significand are stored at the
836 | location pointed to by `zSig1Ptr'.
837 *----------------------------------------------------------------------------*/
839 static void
840 normalizeFloat128Subnormal(
841 bits64 aSig0,
842 bits64 aSig1,
843 int32 *zExpPtr,
844 bits64 *zSig0Ptr,
845 bits64 *zSig1Ptr
848 int8 shiftCount;
850 if ( aSig0 == 0 ) {
851 shiftCount = countLeadingZeros64( aSig1 ) - 15;
852 if ( shiftCount < 0 ) {
853 *zSig0Ptr = aSig1>>( - shiftCount );
854 *zSig1Ptr = aSig1<<( shiftCount & 63 );
856 else {
857 *zSig0Ptr = aSig1<<shiftCount;
858 *zSig1Ptr = 0;
860 *zExpPtr = - shiftCount - 63;
862 else {
863 shiftCount = countLeadingZeros64( aSig0 ) - 15;
864 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
865 *zExpPtr = 1 - shiftCount;
870 /*----------------------------------------------------------------------------
871 | Packs the sign `zSign', the exponent `zExp', and the significand formed
872 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
873 | floating-point value, returning the result. After being shifted into the
874 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
875 | added together to form the most significant 32 bits of the result. This
876 | means that any integer portion of `zSig0' will be added into the exponent.
877 | Since a properly normalized significand will have an integer portion equal
878 | to 1, the `zExp' input should be 1 less than the desired result exponent
879 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
880 | significand.
881 *----------------------------------------------------------------------------*/
883 INLINE float128
884 packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
886 float128 z;
888 z.low = zSig1;
889 z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
890 return z;
894 /*----------------------------------------------------------------------------
895 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
896 | and extended significand formed by the concatenation of `zSig0', `zSig1',
897 | and `zSig2', and returns the proper quadruple-precision floating-point value
898 | corresponding to the abstract input. Ordinarily, the abstract value is
899 | simply rounded and packed into the quadruple-precision format, with the
900 | inexact exception raised if the abstract input cannot be represented
901 | exactly. However, if the abstract value is too large, the overflow and
902 | inexact exceptions are raised and an infinity or maximal finite value is
903 | returned. If the abstract value is too small, the input value is rounded to
904 | a subnormal number, and the underflow and inexact exceptions are raised if
905 | the abstract input cannot be represented exactly as a subnormal quadruple-
906 | precision floating-point number.
907 | The input significand must be normalized or smaller. If the input
908 | significand is not normalized, `zExp' must be 0; in that case, the result
909 | returned is a subnormal number, and it must not require rounding. In the
910 | usual case that the input significand is normalized, `zExp' must be 1 less
911 | than the ``true'' floating-point exponent. The handling of underflow and
912 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
913 *----------------------------------------------------------------------------*/
915 static float128
916 roundAndPackFloat128(
917 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 STATUS_PARAM)
919 int8 roundingMode;
920 flag roundNearestEven, increment, isTiny;
922 roundingMode = STATUS(float_rounding_mode);
923 roundNearestEven = ( roundingMode == float_round_nearest_even );
924 increment = ( (sbits64) zSig2 < 0 );
925 if ( ! roundNearestEven ) {
926 if ( roundingMode == float_round_to_zero ) {
927 increment = 0;
929 else {
930 if ( zSign ) {
931 increment = ( roundingMode == float_round_down ) && zSig2;
933 else {
934 increment = ( roundingMode == float_round_up ) && zSig2;
938 if ( 0x7FFD <= (bits32) zExp ) {
939 if ( ( 0x7FFD < zExp )
940 || ( ( zExp == 0x7FFD )
941 && eq128(
942 LIT64( 0x0001FFFFFFFFFFFF ),
943 LIT64( 0xFFFFFFFFFFFFFFFF ),
944 zSig0,
945 zSig1
947 && increment
950 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
951 if ( ( roundingMode == float_round_to_zero )
952 || ( zSign && ( roundingMode == float_round_up ) )
953 || ( ! zSign && ( roundingMode == float_round_down ) )
955 return
956 packFloat128(
957 zSign,
958 0x7FFE,
959 LIT64( 0x0000FFFFFFFFFFFF ),
960 LIT64( 0xFFFFFFFFFFFFFFFF )
963 return packFloat128( zSign, 0x7FFF, 0, 0 );
965 if ( zExp < 0 ) {
966 isTiny =
967 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
968 || ( zExp < -1 )
969 || ! increment
970 || lt128(
971 zSig0,
972 zSig1,
973 LIT64( 0x0001FFFFFFFFFFFF ),
974 LIT64( 0xFFFFFFFFFFFFFFFF )
976 shift128ExtraRightJamming(
977 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
978 zExp = 0;
979 if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
980 if ( roundNearestEven ) {
981 increment = ( (sbits64) zSig2 < 0 );
983 else {
984 if ( zSign ) {
985 increment = ( roundingMode == float_round_down ) && zSig2;
987 else {
988 increment = ( roundingMode == float_round_up ) && zSig2;
993 if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
994 if ( increment ) {
995 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
996 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
998 else {
999 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1001 return packFloat128( zSign, zExp, zSig0, zSig1 );
1005 /*----------------------------------------------------------------------------
1006 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1007 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1008 | returns the proper quadruple-precision floating-point value corresponding
1009 | to the abstract input. This routine is just like `roundAndPackFloat128'
1010 | except that the input significand has fewer bits and does not have to be
1011 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1012 | point exponent.
1013 *----------------------------------------------------------------------------*/
1015 static float128
1016 normalizeRoundAndPackFloat128(
1017 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 STATUS_PARAM)
1019 int8 shiftCount;
1020 bits64 zSig2;
1022 if ( zSig0 == 0 ) {
1023 zSig0 = zSig1;
1024 zSig1 = 0;
1025 zExp -= 64;
1027 shiftCount = countLeadingZeros64( zSig0 ) - 15;
1028 if ( 0 <= shiftCount ) {
1029 zSig2 = 0;
1030 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1032 else {
1033 shift128ExtraRightJamming(
1034 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1036 zExp -= shiftCount;
1037 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1041 #endif
1043 /*----------------------------------------------------------------------------
1044 | Returns the result of converting the 32-bit two's complement integer `a'
1045 | to the single-precision floating-point format. The conversion is performed
1046 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1047 *----------------------------------------------------------------------------*/
1049 float32 int32_to_float32( int32 a STATUS_PARAM )
1051 flag zSign;
1053 if ( a == 0 ) return 0;
1054 if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1055 zSign = ( a < 0 );
1056 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1060 /*----------------------------------------------------------------------------
1061 | Returns the result of converting the 32-bit two's complement integer `a'
1062 | to the double-precision floating-point format. The conversion is performed
1063 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1064 *----------------------------------------------------------------------------*/
1066 float64 int32_to_float64( int32 a STATUS_PARAM )
1068 flag zSign;
1069 uint32 absA;
1070 int8 shiftCount;
1071 bits64 zSig;
1073 if ( a == 0 ) return 0;
1074 zSign = ( a < 0 );
1075 absA = zSign ? - a : a;
1076 shiftCount = countLeadingZeros32( absA ) + 21;
1077 zSig = absA;
1078 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1082 #ifdef FLOATX80
1084 /*----------------------------------------------------------------------------
1085 | Returns the result of converting the 32-bit two's complement integer `a'
1086 | to the extended double-precision floating-point format. The conversion
1087 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1088 | Arithmetic.
1089 *----------------------------------------------------------------------------*/
1091 floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1093 flag zSign;
1094 uint32 absA;
1095 int8 shiftCount;
1096 bits64 zSig;
1098 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1099 zSign = ( a < 0 );
1100 absA = zSign ? - a : a;
1101 shiftCount = countLeadingZeros32( absA ) + 32;
1102 zSig = absA;
1103 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1107 #endif
1109 #ifdef FLOAT128
1111 /*----------------------------------------------------------------------------
1112 | Returns the result of converting the 32-bit two's complement integer `a' to
1113 | the quadruple-precision floating-point format. The conversion is performed
1114 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1115 *----------------------------------------------------------------------------*/
1117 float128 int32_to_float128( int32 a STATUS_PARAM )
1119 flag zSign;
1120 uint32 absA;
1121 int8 shiftCount;
1122 bits64 zSig0;
1124 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1125 zSign = ( a < 0 );
1126 absA = zSign ? - a : a;
1127 shiftCount = countLeadingZeros32( absA ) + 17;
1128 zSig0 = absA;
1129 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1133 #endif
1135 /*----------------------------------------------------------------------------
1136 | Returns the result of converting the 64-bit two's complement integer `a'
1137 | to the single-precision floating-point format. The conversion is performed
1138 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1139 *----------------------------------------------------------------------------*/
1141 float32 int64_to_float32( int64 a STATUS_PARAM )
1143 flag zSign;
1144 uint64 absA;
1145 int8 shiftCount;
1147 if ( a == 0 ) return 0;
1148 zSign = ( a < 0 );
1149 absA = zSign ? - a : a;
1150 shiftCount = countLeadingZeros64( absA ) - 40;
1151 if ( 0 <= shiftCount ) {
1152 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1154 else {
1155 shiftCount += 7;
1156 if ( shiftCount < 0 ) {
1157 shift64RightJamming( absA, - shiftCount, &absA );
1159 else {
1160 absA <<= shiftCount;
1162 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1167 float32 uint64_to_float32( uint64 a STATUS_PARAM )
1169 int8 shiftCount;
1171 if ( a == 0 ) return 0;
1172 shiftCount = countLeadingZeros64( a ) - 40;
1173 if ( 0 <= shiftCount ) {
1174 return packFloat32( 1 > 0, 0x95 - shiftCount, a<<shiftCount );
1176 else {
1177 shiftCount += 7;
1178 if ( shiftCount < 0 ) {
1179 shift64RightJamming( a, - shiftCount, &a );
1181 else {
1182 a <<= shiftCount;
1184 return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount, a STATUS_VAR );
1188 /*----------------------------------------------------------------------------
1189 | Returns the result of converting the 64-bit two's complement integer `a'
1190 | to the double-precision floating-point format. The conversion is performed
1191 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1192 *----------------------------------------------------------------------------*/
1194 float64 int64_to_float64( int64 a STATUS_PARAM )
1196 flag zSign;
1198 if ( a == 0 ) return 0;
1199 if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1200 return packFloat64( 1, 0x43E, 0 );
1202 zSign = ( a < 0 );
1203 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1207 float64 uint64_to_float64( uint64 a STATUS_PARAM )
1209 if ( a == 0 ) return 0;
1210 return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR );
1214 #ifdef FLOATX80
1216 /*----------------------------------------------------------------------------
1217 | Returns the result of converting the 64-bit two's complement integer `a'
1218 | to the extended double-precision floating-point format. The conversion
1219 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1220 | Arithmetic.
1221 *----------------------------------------------------------------------------*/
1223 floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1225 flag zSign;
1226 uint64 absA;
1227 int8 shiftCount;
1229 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1230 zSign = ( a < 0 );
1231 absA = zSign ? - a : a;
1232 shiftCount = countLeadingZeros64( absA );
1233 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1237 #endif
1239 #ifdef FLOAT128
1241 /*----------------------------------------------------------------------------
1242 | Returns the result of converting the 64-bit two's complement integer `a' to
1243 | the quadruple-precision floating-point format. The conversion is performed
1244 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1245 *----------------------------------------------------------------------------*/
1247 float128 int64_to_float128( int64 a STATUS_PARAM )
1249 flag zSign;
1250 uint64 absA;
1251 int8 shiftCount;
1252 int32 zExp;
1253 bits64 zSig0, zSig1;
1255 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1256 zSign = ( a < 0 );
1257 absA = zSign ? - a : a;
1258 shiftCount = countLeadingZeros64( absA ) + 49;
1259 zExp = 0x406E - shiftCount;
1260 if ( 64 <= shiftCount ) {
1261 zSig1 = 0;
1262 zSig0 = absA;
1263 shiftCount -= 64;
1265 else {
1266 zSig1 = absA;
1267 zSig0 = 0;
1269 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1270 return packFloat128( zSign, zExp, zSig0, zSig1 );
1274 #endif
1276 /*----------------------------------------------------------------------------
1277 | Returns the result of converting the single-precision floating-point value
1278 | `a' to the 32-bit two's complement integer format. The conversion is
1279 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1280 | Arithmetic---which means in particular that the conversion is rounded
1281 | according to the current rounding mode. If `a' is a NaN, the largest
1282 | positive integer is returned. Otherwise, if the conversion overflows, the
1283 | largest integer with the same sign as `a' is returned.
1284 *----------------------------------------------------------------------------*/
1286 int32 float32_to_int32( float32 a STATUS_PARAM )
1288 flag aSign;
1289 int16 aExp, shiftCount;
1290 bits32 aSig;
1291 bits64 aSig64;
1293 aSig = extractFloat32Frac( a );
1294 aExp = extractFloat32Exp( a );
1295 aSign = extractFloat32Sign( a );
1296 if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1297 if ( aExp ) aSig |= 0x00800000;
1298 shiftCount = 0xAF - aExp;
1299 aSig64 = aSig;
1300 aSig64 <<= 32;
1301 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1302 return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1306 /*----------------------------------------------------------------------------
1307 | Returns the result of converting the single-precision floating-point value
1308 | `a' to the 32-bit two's complement integer format. The conversion is
1309 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1310 | Arithmetic, except that the conversion is always rounded toward zero.
1311 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1312 | the conversion overflows, the largest integer with the same sign as `a' is
1313 | returned.
1314 *----------------------------------------------------------------------------*/
1316 int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1318 flag aSign;
1319 int16 aExp, shiftCount;
1320 bits32 aSig;
1321 int32 z;
1323 aSig = extractFloat32Frac( a );
1324 aExp = extractFloat32Exp( a );
1325 aSign = extractFloat32Sign( a );
1326 shiftCount = aExp - 0x9E;
1327 if ( 0 <= shiftCount ) {
1328 if ( a != 0xCF000000 ) {
1329 float_raise( float_flag_invalid STATUS_VAR);
1330 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1332 return (sbits32) 0x80000000;
1334 else if ( aExp <= 0x7E ) {
1335 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1336 return 0;
1338 aSig = ( aSig | 0x00800000 )<<8;
1339 z = aSig>>( - shiftCount );
1340 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1341 STATUS(float_exception_flags) |= float_flag_inexact;
1343 if ( aSign ) z = - z;
1344 return z;
1348 /*----------------------------------------------------------------------------
1349 | Returns the result of converting the single-precision floating-point value
1350 | `a' to the 64-bit two's complement integer format. The conversion is
1351 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1352 | Arithmetic---which means in particular that the conversion is rounded
1353 | according to the current rounding mode. If `a' is a NaN, the largest
1354 | positive integer is returned. Otherwise, if the conversion overflows, the
1355 | largest integer with the same sign as `a' is returned.
1356 *----------------------------------------------------------------------------*/
1358 int64 float32_to_int64( float32 a STATUS_PARAM )
1360 flag aSign;
1361 int16 aExp, shiftCount;
1362 bits32 aSig;
1363 bits64 aSig64, aSigExtra;
1365 aSig = extractFloat32Frac( a );
1366 aExp = extractFloat32Exp( a );
1367 aSign = extractFloat32Sign( a );
1368 shiftCount = 0xBE - aExp;
1369 if ( shiftCount < 0 ) {
1370 float_raise( float_flag_invalid STATUS_VAR);
1371 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1372 return LIT64( 0x7FFFFFFFFFFFFFFF );
1374 return (sbits64) LIT64( 0x8000000000000000 );
1376 if ( aExp ) aSig |= 0x00800000;
1377 aSig64 = aSig;
1378 aSig64 <<= 40;
1379 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1380 return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1384 /*----------------------------------------------------------------------------
1385 | Returns the result of converting the single-precision floating-point value
1386 | `a' to the 64-bit two's complement integer format. The conversion is
1387 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1388 | Arithmetic, except that the conversion is always rounded toward zero. If
1389 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1390 | conversion overflows, the largest integer with the same sign as `a' is
1391 | returned.
1392 *----------------------------------------------------------------------------*/
1394 int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1396 flag aSign;
1397 int16 aExp, shiftCount;
1398 bits32 aSig;
1399 bits64 aSig64;
1400 int64 z;
1402 aSig = extractFloat32Frac( a );
1403 aExp = extractFloat32Exp( a );
1404 aSign = extractFloat32Sign( a );
1405 shiftCount = aExp - 0xBE;
1406 if ( 0 <= shiftCount ) {
1407 if ( a != 0xDF000000 ) {
1408 float_raise( float_flag_invalid STATUS_VAR);
1409 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1410 return LIT64( 0x7FFFFFFFFFFFFFFF );
1413 return (sbits64) LIT64( 0x8000000000000000 );
1415 else if ( aExp <= 0x7E ) {
1416 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1417 return 0;
1419 aSig64 = aSig | 0x00800000;
1420 aSig64 <<= 40;
1421 z = aSig64>>( - shiftCount );
1422 if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1423 STATUS(float_exception_flags) |= float_flag_inexact;
1425 if ( aSign ) z = - z;
1426 return z;
1430 /*----------------------------------------------------------------------------
1431 | Returns the result of converting the single-precision floating-point value
1432 | `a' to the double-precision floating-point format. The conversion is
1433 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1434 | Arithmetic.
1435 *----------------------------------------------------------------------------*/
1437 float64 float32_to_float64( float32 a STATUS_PARAM )
1439 flag aSign;
1440 int16 aExp;
1441 bits32 aSig;
1443 aSig = extractFloat32Frac( a );
1444 aExp = extractFloat32Exp( a );
1445 aSign = extractFloat32Sign( a );
1446 if ( aExp == 0xFF ) {
1447 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ));
1448 return packFloat64( aSign, 0x7FF, 0 );
1450 if ( aExp == 0 ) {
1451 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1452 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1453 --aExp;
1455 return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1459 #ifdef FLOATX80
1461 /*----------------------------------------------------------------------------
1462 | Returns the result of converting the single-precision floating-point value
1463 | `a' to the extended double-precision floating-point format. The conversion
1464 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1465 | Arithmetic.
1466 *----------------------------------------------------------------------------*/
1468 floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1470 flag aSign;
1471 int16 aExp;
1472 bits32 aSig;
1474 aSig = extractFloat32Frac( a );
1475 aExp = extractFloat32Exp( a );
1476 aSign = extractFloat32Sign( a );
1477 if ( aExp == 0xFF ) {
1478 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) );
1479 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1481 if ( aExp == 0 ) {
1482 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1483 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1485 aSig |= 0x00800000;
1486 return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1490 #endif
1492 #ifdef FLOAT128
1494 /*----------------------------------------------------------------------------
1495 | Returns the result of converting the single-precision floating-point value
1496 | `a' to the double-precision floating-point format. The conversion is
1497 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1498 | Arithmetic.
1499 *----------------------------------------------------------------------------*/
1501 float128 float32_to_float128( float32 a STATUS_PARAM )
1503 flag aSign;
1504 int16 aExp;
1505 bits32 aSig;
1507 aSig = extractFloat32Frac( a );
1508 aExp = extractFloat32Exp( a );
1509 aSign = extractFloat32Sign( a );
1510 if ( aExp == 0xFF ) {
1511 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) );
1512 return packFloat128( aSign, 0x7FFF, 0, 0 );
1514 if ( aExp == 0 ) {
1515 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1516 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1517 --aExp;
1519 return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1523 #endif
1525 /*----------------------------------------------------------------------------
1526 | Rounds the single-precision floating-point value `a' to an integer, and
1527 | returns the result as a single-precision floating-point value. The
1528 | operation is performed according to the IEC/IEEE Standard for Binary
1529 | Floating-Point Arithmetic.
1530 *----------------------------------------------------------------------------*/
1532 float32 float32_round_to_int( float32 a STATUS_PARAM)
1534 flag aSign;
1535 int16 aExp;
1536 bits32 lastBitMask, roundBitsMask;
1537 int8 roundingMode;
1538 float32 z;
1540 aExp = extractFloat32Exp( a );
1541 if ( 0x96 <= aExp ) {
1542 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1543 return propagateFloat32NaN( a, a STATUS_VAR );
1545 return a;
1547 if ( aExp <= 0x7E ) {
1548 if ( (bits32) ( a<<1 ) == 0 ) return a;
1549 STATUS(float_exception_flags) |= float_flag_inexact;
1550 aSign = extractFloat32Sign( a );
1551 switch ( STATUS(float_rounding_mode) ) {
1552 case float_round_nearest_even:
1553 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1554 return packFloat32( aSign, 0x7F, 0 );
1556 break;
1557 case float_round_down:
1558 return aSign ? 0xBF800000 : 0;
1559 case float_round_up:
1560 return aSign ? 0x80000000 : 0x3F800000;
1562 return packFloat32( aSign, 0, 0 );
1564 lastBitMask = 1;
1565 lastBitMask <<= 0x96 - aExp;
1566 roundBitsMask = lastBitMask - 1;
1567 z = a;
1568 roundingMode = STATUS(float_rounding_mode);
1569 if ( roundingMode == float_round_nearest_even ) {
1570 z += lastBitMask>>1;
1571 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1573 else if ( roundingMode != float_round_to_zero ) {
1574 if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1575 z += roundBitsMask;
1578 z &= ~ roundBitsMask;
1579 if ( z != a ) STATUS(float_exception_flags) |= float_flag_inexact;
1580 return z;
1584 /*----------------------------------------------------------------------------
1585 | Returns the result of adding the absolute values of the single-precision
1586 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1587 | before being returned. `zSign' is ignored if the result is a NaN.
1588 | The addition is performed according to the IEC/IEEE Standard for Binary
1589 | Floating-Point Arithmetic.
1590 *----------------------------------------------------------------------------*/
1592 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1594 int16 aExp, bExp, zExp;
1595 bits32 aSig, bSig, zSig;
1596 int16 expDiff;
1598 aSig = extractFloat32Frac( a );
1599 aExp = extractFloat32Exp( a );
1600 bSig = extractFloat32Frac( b );
1601 bExp = extractFloat32Exp( b );
1602 expDiff = aExp - bExp;
1603 aSig <<= 6;
1604 bSig <<= 6;
1605 if ( 0 < expDiff ) {
1606 if ( aExp == 0xFF ) {
1607 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1608 return a;
1610 if ( bExp == 0 ) {
1611 --expDiff;
1613 else {
1614 bSig |= 0x20000000;
1616 shift32RightJamming( bSig, expDiff, &bSig );
1617 zExp = aExp;
1619 else if ( expDiff < 0 ) {
1620 if ( bExp == 0xFF ) {
1621 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1622 return packFloat32( zSign, 0xFF, 0 );
1624 if ( aExp == 0 ) {
1625 ++expDiff;
1627 else {
1628 aSig |= 0x20000000;
1630 shift32RightJamming( aSig, - expDiff, &aSig );
1631 zExp = bExp;
1633 else {
1634 if ( aExp == 0xFF ) {
1635 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1636 return a;
1638 if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1639 zSig = 0x40000000 + aSig + bSig;
1640 zExp = aExp;
1641 goto roundAndPack;
1643 aSig |= 0x20000000;
1644 zSig = ( aSig + bSig )<<1;
1645 --zExp;
1646 if ( (sbits32) zSig < 0 ) {
1647 zSig = aSig + bSig;
1648 ++zExp;
1650 roundAndPack:
1651 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1655 /*----------------------------------------------------------------------------
1656 | Returns the result of subtracting the absolute values of the single-
1657 | precision floating-point values `a' and `b'. If `zSign' is 1, the
1658 | difference is negated before being returned. `zSign' is ignored if the
1659 | result is a NaN. The subtraction is performed according to the IEC/IEEE
1660 | Standard for Binary Floating-Point Arithmetic.
1661 *----------------------------------------------------------------------------*/
1663 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1665 int16 aExp, bExp, zExp;
1666 bits32 aSig, bSig, zSig;
1667 int16 expDiff;
1669 aSig = extractFloat32Frac( a );
1670 aExp = extractFloat32Exp( a );
1671 bSig = extractFloat32Frac( b );
1672 bExp = extractFloat32Exp( b );
1673 expDiff = aExp - bExp;
1674 aSig <<= 7;
1675 bSig <<= 7;
1676 if ( 0 < expDiff ) goto aExpBigger;
1677 if ( expDiff < 0 ) goto bExpBigger;
1678 if ( aExp == 0xFF ) {
1679 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1680 float_raise( float_flag_invalid STATUS_VAR);
1681 return float32_default_nan;
1683 if ( aExp == 0 ) {
1684 aExp = 1;
1685 bExp = 1;
1687 if ( bSig < aSig ) goto aBigger;
1688 if ( aSig < bSig ) goto bBigger;
1689 return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1690 bExpBigger:
1691 if ( bExp == 0xFF ) {
1692 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1693 return packFloat32( zSign ^ 1, 0xFF, 0 );
1695 if ( aExp == 0 ) {
1696 ++expDiff;
1698 else {
1699 aSig |= 0x40000000;
1701 shift32RightJamming( aSig, - expDiff, &aSig );
1702 bSig |= 0x40000000;
1703 bBigger:
1704 zSig = bSig - aSig;
1705 zExp = bExp;
1706 zSign ^= 1;
1707 goto normalizeRoundAndPack;
1708 aExpBigger:
1709 if ( aExp == 0xFF ) {
1710 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1711 return a;
1713 if ( bExp == 0 ) {
1714 --expDiff;
1716 else {
1717 bSig |= 0x40000000;
1719 shift32RightJamming( bSig, expDiff, &bSig );
1720 aSig |= 0x40000000;
1721 aBigger:
1722 zSig = aSig - bSig;
1723 zExp = aExp;
1724 normalizeRoundAndPack:
1725 --zExp;
1726 return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1730 /*----------------------------------------------------------------------------
1731 | Returns the result of adding the single-precision floating-point values `a'
1732 | and `b'. The operation is performed according to the IEC/IEEE Standard for
1733 | Binary Floating-Point Arithmetic.
1734 *----------------------------------------------------------------------------*/
1736 float32 float32_add( float32 a, float32 b STATUS_PARAM )
1738 flag aSign, bSign;
1740 aSign = extractFloat32Sign( a );
1741 bSign = extractFloat32Sign( b );
1742 if ( aSign == bSign ) {
1743 return addFloat32Sigs( a, b, aSign STATUS_VAR);
1745 else {
1746 return subFloat32Sigs( a, b, aSign STATUS_VAR );
1751 /*----------------------------------------------------------------------------
1752 | Returns the result of subtracting the single-precision floating-point values
1753 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1754 | for Binary Floating-Point Arithmetic.
1755 *----------------------------------------------------------------------------*/
1757 float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1759 flag aSign, bSign;
1761 aSign = extractFloat32Sign( a );
1762 bSign = extractFloat32Sign( b );
1763 if ( aSign == bSign ) {
1764 return subFloat32Sigs( a, b, aSign STATUS_VAR );
1766 else {
1767 return addFloat32Sigs( a, b, aSign STATUS_VAR );
1772 /*----------------------------------------------------------------------------
1773 | Returns the result of multiplying the single-precision floating-point values
1774 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1775 | for Binary Floating-Point Arithmetic.
1776 *----------------------------------------------------------------------------*/
1778 float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1780 flag aSign, bSign, zSign;
1781 int16 aExp, bExp, zExp;
1782 bits32 aSig, bSig;
1783 bits64 zSig64;
1784 bits32 zSig;
1786 aSig = extractFloat32Frac( a );
1787 aExp = extractFloat32Exp( a );
1788 aSign = extractFloat32Sign( a );
1789 bSig = extractFloat32Frac( b );
1790 bExp = extractFloat32Exp( b );
1791 bSign = extractFloat32Sign( b );
1792 zSign = aSign ^ bSign;
1793 if ( aExp == 0xFF ) {
1794 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1795 return propagateFloat32NaN( a, b STATUS_VAR );
1797 if ( ( bExp | bSig ) == 0 ) {
1798 float_raise( float_flag_invalid STATUS_VAR);
1799 return float32_default_nan;
1801 return packFloat32( zSign, 0xFF, 0 );
1803 if ( bExp == 0xFF ) {
1804 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1805 if ( ( aExp | aSig ) == 0 ) {
1806 float_raise( float_flag_invalid STATUS_VAR);
1807 return float32_default_nan;
1809 return packFloat32( zSign, 0xFF, 0 );
1811 if ( aExp == 0 ) {
1812 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1813 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1815 if ( bExp == 0 ) {
1816 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1817 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1819 zExp = aExp + bExp - 0x7F;
1820 aSig = ( aSig | 0x00800000 )<<7;
1821 bSig = ( bSig | 0x00800000 )<<8;
1822 shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1823 zSig = zSig64;
1824 if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1825 zSig <<= 1;
1826 --zExp;
1828 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1832 /*----------------------------------------------------------------------------
1833 | Returns the result of dividing the single-precision floating-point value `a'
1834 | by the corresponding value `b'. The operation is performed according to the
1835 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1836 *----------------------------------------------------------------------------*/
1838 float32 float32_div( float32 a, float32 b STATUS_PARAM )
1840 flag aSign, bSign, zSign;
1841 int16 aExp, bExp, zExp;
1842 bits32 aSig, bSig, zSig;
1844 aSig = extractFloat32Frac( a );
1845 aExp = extractFloat32Exp( a );
1846 aSign = extractFloat32Sign( a );
1847 bSig = extractFloat32Frac( b );
1848 bExp = extractFloat32Exp( b );
1849 bSign = extractFloat32Sign( b );
1850 zSign = aSign ^ bSign;
1851 if ( aExp == 0xFF ) {
1852 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1853 if ( bExp == 0xFF ) {
1854 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1855 float_raise( float_flag_invalid STATUS_VAR);
1856 return float32_default_nan;
1858 return packFloat32( zSign, 0xFF, 0 );
1860 if ( bExp == 0xFF ) {
1861 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1862 return packFloat32( zSign, 0, 0 );
1864 if ( bExp == 0 ) {
1865 if ( bSig == 0 ) {
1866 if ( ( aExp | aSig ) == 0 ) {
1867 float_raise( float_flag_invalid STATUS_VAR);
1868 return float32_default_nan;
1870 float_raise( float_flag_divbyzero STATUS_VAR);
1871 return packFloat32( zSign, 0xFF, 0 );
1873 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1875 if ( aExp == 0 ) {
1876 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1877 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1879 zExp = aExp - bExp + 0x7D;
1880 aSig = ( aSig | 0x00800000 )<<7;
1881 bSig = ( bSig | 0x00800000 )<<8;
1882 if ( bSig <= ( aSig + aSig ) ) {
1883 aSig >>= 1;
1884 ++zExp;
1886 zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1887 if ( ( zSig & 0x3F ) == 0 ) {
1888 zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
1890 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1894 /*----------------------------------------------------------------------------
1895 | Returns the remainder of the single-precision floating-point value `a'
1896 | with respect to the corresponding value `b'. The operation is performed
1897 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1898 *----------------------------------------------------------------------------*/
1900 float32 float32_rem( float32 a, float32 b STATUS_PARAM )
1902 flag aSign, bSign, zSign;
1903 int16 aExp, bExp, expDiff;
1904 bits32 aSig, bSig;
1905 bits32 q;
1906 bits64 aSig64, bSig64, q64;
1907 bits32 alternateASig;
1908 sbits32 sigMean;
1910 aSig = extractFloat32Frac( a );
1911 aExp = extractFloat32Exp( a );
1912 aSign = extractFloat32Sign( a );
1913 bSig = extractFloat32Frac( b );
1914 bExp = extractFloat32Exp( b );
1915 bSign = extractFloat32Sign( b );
1916 if ( aExp == 0xFF ) {
1917 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1918 return propagateFloat32NaN( a, b STATUS_VAR );
1920 float_raise( float_flag_invalid STATUS_VAR);
1921 return float32_default_nan;
1923 if ( bExp == 0xFF ) {
1924 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1925 return a;
1927 if ( bExp == 0 ) {
1928 if ( bSig == 0 ) {
1929 float_raise( float_flag_invalid STATUS_VAR);
1930 return float32_default_nan;
1932 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1934 if ( aExp == 0 ) {
1935 if ( aSig == 0 ) return a;
1936 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1938 expDiff = aExp - bExp;
1939 aSig |= 0x00800000;
1940 bSig |= 0x00800000;
1941 if ( expDiff < 32 ) {
1942 aSig <<= 8;
1943 bSig <<= 8;
1944 if ( expDiff < 0 ) {
1945 if ( expDiff < -1 ) return a;
1946 aSig >>= 1;
1948 q = ( bSig <= aSig );
1949 if ( q ) aSig -= bSig;
1950 if ( 0 < expDiff ) {
1951 q = ( ( (bits64) aSig )<<32 ) / bSig;
1952 q >>= 32 - expDiff;
1953 bSig >>= 2;
1954 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1956 else {
1957 aSig >>= 2;
1958 bSig >>= 2;
1961 else {
1962 if ( bSig <= aSig ) aSig -= bSig;
1963 aSig64 = ( (bits64) aSig )<<40;
1964 bSig64 = ( (bits64) bSig )<<40;
1965 expDiff -= 64;
1966 while ( 0 < expDiff ) {
1967 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1968 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1969 aSig64 = - ( ( bSig * q64 )<<38 );
1970 expDiff -= 62;
1972 expDiff += 64;
1973 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1974 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1975 q = q64>>( 64 - expDiff );
1976 bSig <<= 6;
1977 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
1979 do {
1980 alternateASig = aSig;
1981 ++q;
1982 aSig -= bSig;
1983 } while ( 0 <= (sbits32) aSig );
1984 sigMean = aSig + alternateASig;
1985 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
1986 aSig = alternateASig;
1988 zSign = ( (sbits32) aSig < 0 );
1989 if ( zSign ) aSig = - aSig;
1990 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
1994 /*----------------------------------------------------------------------------
1995 | Returns the square root of the single-precision floating-point value `a'.
1996 | The operation is performed according to the IEC/IEEE Standard for Binary
1997 | Floating-Point Arithmetic.
1998 *----------------------------------------------------------------------------*/
2000 float32 float32_sqrt( float32 a STATUS_PARAM )
2002 flag aSign;
2003 int16 aExp, zExp;
2004 bits32 aSig, zSig;
2005 bits64 rem, term;
2007 aSig = extractFloat32Frac( a );
2008 aExp = extractFloat32Exp( a );
2009 aSign = extractFloat32Sign( a );
2010 if ( aExp == 0xFF ) {
2011 if ( aSig ) return propagateFloat32NaN( a, 0 STATUS_VAR );
2012 if ( ! aSign ) return a;
2013 float_raise( float_flag_invalid STATUS_VAR);
2014 return float32_default_nan;
2016 if ( aSign ) {
2017 if ( ( aExp | aSig ) == 0 ) return a;
2018 float_raise( float_flag_invalid STATUS_VAR);
2019 return float32_default_nan;
2021 if ( aExp == 0 ) {
2022 if ( aSig == 0 ) return 0;
2023 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2025 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2026 aSig = ( aSig | 0x00800000 )<<8;
2027 zSig = estimateSqrt32( aExp, aSig ) + 2;
2028 if ( ( zSig & 0x7F ) <= 5 ) {
2029 if ( zSig < 2 ) {
2030 zSig = 0x7FFFFFFF;
2031 goto roundAndPack;
2033 aSig >>= aExp & 1;
2034 term = ( (bits64) zSig ) * zSig;
2035 rem = ( ( (bits64) aSig )<<32 ) - term;
2036 while ( (sbits64) rem < 0 ) {
2037 --zSig;
2038 rem += ( ( (bits64) zSig )<<1 ) | 1;
2040 zSig |= ( rem != 0 );
2042 shift32RightJamming( zSig, 1, &zSig );
2043 roundAndPack:
2044 return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2048 /*----------------------------------------------------------------------------
2049 | Returns 1 if the single-precision floating-point value `a' is equal to
2050 | the corresponding value `b', and 0 otherwise. The comparison is performed
2051 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2052 *----------------------------------------------------------------------------*/
2054 int float32_eq( float32 a, float32 b STATUS_PARAM )
2057 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2058 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2060 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2061 float_raise( float_flag_invalid STATUS_VAR);
2063 return 0;
2065 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2069 /*----------------------------------------------------------------------------
2070 | Returns 1 if the single-precision floating-point value `a' is less than
2071 | or equal to the corresponding value `b', and 0 otherwise. The comparison
2072 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2073 | Arithmetic.
2074 *----------------------------------------------------------------------------*/
2076 int float32_le( float32 a, float32 b STATUS_PARAM )
2078 flag aSign, bSign;
2080 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2081 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2083 float_raise( float_flag_invalid STATUS_VAR);
2084 return 0;
2086 aSign = extractFloat32Sign( a );
2087 bSign = extractFloat32Sign( b );
2088 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2089 return ( a == b ) || ( aSign ^ ( a < b ) );
2093 /*----------------------------------------------------------------------------
2094 | Returns 1 if the single-precision floating-point value `a' is less than
2095 | the corresponding value `b', and 0 otherwise. The comparison is performed
2096 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2097 *----------------------------------------------------------------------------*/
2099 int float32_lt( float32 a, float32 b STATUS_PARAM )
2101 flag aSign, bSign;
2103 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2104 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2106 float_raise( float_flag_invalid STATUS_VAR);
2107 return 0;
2109 aSign = extractFloat32Sign( a );
2110 bSign = extractFloat32Sign( b );
2111 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2112 return ( a != b ) && ( aSign ^ ( a < b ) );
2116 /*----------------------------------------------------------------------------
2117 | Returns 1 if the single-precision floating-point value `a' is equal to
2118 | the corresponding value `b', and 0 otherwise. The invalid exception is
2119 | raised if either operand is a NaN. Otherwise, the comparison is performed
2120 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2121 *----------------------------------------------------------------------------*/
2123 int float32_eq_signaling( float32 a, float32 b STATUS_PARAM )
2126 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2127 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2129 float_raise( float_flag_invalid STATUS_VAR);
2130 return 0;
2132 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2136 /*----------------------------------------------------------------------------
2137 | Returns 1 if the single-precision floating-point value `a' is less than or
2138 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2139 | cause an exception. Otherwise, the comparison is performed according to the
2140 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2141 *----------------------------------------------------------------------------*/
2143 int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2145 flag aSign, bSign;
2147 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2148 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2150 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2151 float_raise( float_flag_invalid STATUS_VAR);
2153 return 0;
2155 aSign = extractFloat32Sign( a );
2156 bSign = extractFloat32Sign( b );
2157 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2158 return ( a == b ) || ( aSign ^ ( a < b ) );
2162 /*----------------------------------------------------------------------------
2163 | Returns 1 if the single-precision floating-point value `a' is less than
2164 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2165 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
2166 | Standard for Binary Floating-Point Arithmetic.
2167 *----------------------------------------------------------------------------*/
2169 int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2171 flag aSign, bSign;
2173 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2174 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2176 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2177 float_raise( float_flag_invalid STATUS_VAR);
2179 return 0;
2181 aSign = extractFloat32Sign( a );
2182 bSign = extractFloat32Sign( b );
2183 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2184 return ( a != b ) && ( aSign ^ ( a < b ) );
2188 /*----------------------------------------------------------------------------
2189 | Returns the result of converting the double-precision floating-point value
2190 | `a' to the 32-bit two's complement integer format. The conversion is
2191 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2192 | Arithmetic---which means in particular that the conversion is rounded
2193 | according to the current rounding mode. If `a' is a NaN, the largest
2194 | positive integer is returned. Otherwise, if the conversion overflows, the
2195 | largest integer with the same sign as `a' is returned.
2196 *----------------------------------------------------------------------------*/
2198 int32 float64_to_int32( float64 a STATUS_PARAM )
2200 flag aSign;
2201 int16 aExp, shiftCount;
2202 bits64 aSig;
2204 aSig = extractFloat64Frac( a );
2205 aExp = extractFloat64Exp( a );
2206 aSign = extractFloat64Sign( a );
2207 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2208 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2209 shiftCount = 0x42C - aExp;
2210 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2211 return roundAndPackInt32( aSign, aSig STATUS_VAR );
2215 /*----------------------------------------------------------------------------
2216 | Returns the result of converting the double-precision floating-point value
2217 | `a' to the 32-bit two's complement integer format. The conversion is
2218 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2219 | Arithmetic, except that the conversion is always rounded toward zero.
2220 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2221 | the conversion overflows, the largest integer with the same sign as `a' is
2222 | returned.
2223 *----------------------------------------------------------------------------*/
2225 int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2227 flag aSign;
2228 int16 aExp, shiftCount;
2229 bits64 aSig, savedASig;
2230 int32 z;
2232 aSig = extractFloat64Frac( a );
2233 aExp = extractFloat64Exp( a );
2234 aSign = extractFloat64Sign( a );
2235 if ( 0x41E < aExp ) {
2236 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2237 goto invalid;
2239 else if ( aExp < 0x3FF ) {
2240 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2241 return 0;
2243 aSig |= LIT64( 0x0010000000000000 );
2244 shiftCount = 0x433 - aExp;
2245 savedASig = aSig;
2246 aSig >>= shiftCount;
2247 z = aSig;
2248 if ( aSign ) z = - z;
2249 if ( ( z < 0 ) ^ aSign ) {
2250 invalid:
2251 float_raise( float_flag_invalid STATUS_VAR);
2252 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2254 if ( ( aSig<<shiftCount ) != savedASig ) {
2255 STATUS(float_exception_flags) |= float_flag_inexact;
2257 return z;
2261 /*----------------------------------------------------------------------------
2262 | Returns the result of converting the double-precision floating-point value
2263 | `a' to the 64-bit two's complement integer format. The conversion is
2264 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2265 | Arithmetic---which means in particular that the conversion is rounded
2266 | according to the current rounding mode. If `a' is a NaN, the largest
2267 | positive integer is returned. Otherwise, if the conversion overflows, the
2268 | largest integer with the same sign as `a' is returned.
2269 *----------------------------------------------------------------------------*/
2271 int64 float64_to_int64( float64 a STATUS_PARAM )
2273 flag aSign;
2274 int16 aExp, shiftCount;
2275 bits64 aSig, aSigExtra;
2277 aSig = extractFloat64Frac( a );
2278 aExp = extractFloat64Exp( a );
2279 aSign = extractFloat64Sign( a );
2280 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2281 shiftCount = 0x433 - aExp;
2282 if ( shiftCount <= 0 ) {
2283 if ( 0x43E < aExp ) {
2284 float_raise( float_flag_invalid STATUS_VAR);
2285 if ( ! aSign
2286 || ( ( aExp == 0x7FF )
2287 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2289 return LIT64( 0x7FFFFFFFFFFFFFFF );
2291 return (sbits64) LIT64( 0x8000000000000000 );
2293 aSigExtra = 0;
2294 aSig <<= - shiftCount;
2296 else {
2297 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2299 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2303 /*----------------------------------------------------------------------------
2304 | Returns the result of converting the double-precision floating-point value
2305 | `a' to the 64-bit two's complement integer format. The conversion is
2306 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2307 | Arithmetic, except that the conversion is always rounded toward zero.
2308 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2309 | the conversion overflows, the largest integer with the same sign as `a' is
2310 | returned.
2311 *----------------------------------------------------------------------------*/
2313 int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2315 flag aSign;
2316 int16 aExp, shiftCount;
2317 bits64 aSig;
2318 int64 z;
2320 aSig = extractFloat64Frac( a );
2321 aExp = extractFloat64Exp( a );
2322 aSign = extractFloat64Sign( a );
2323 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2324 shiftCount = aExp - 0x433;
2325 if ( 0 <= shiftCount ) {
2326 if ( 0x43E <= aExp ) {
2327 if ( a != LIT64( 0xC3E0000000000000 ) ) {
2328 float_raise( float_flag_invalid STATUS_VAR);
2329 if ( ! aSign
2330 || ( ( aExp == 0x7FF )
2331 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2333 return LIT64( 0x7FFFFFFFFFFFFFFF );
2336 return (sbits64) LIT64( 0x8000000000000000 );
2338 z = aSig<<shiftCount;
2340 else {
2341 if ( aExp < 0x3FE ) {
2342 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2343 return 0;
2345 z = aSig>>( - shiftCount );
2346 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2347 STATUS(float_exception_flags) |= float_flag_inexact;
2350 if ( aSign ) z = - z;
2351 return z;
2355 /*----------------------------------------------------------------------------
2356 | Returns the result of converting the double-precision floating-point value
2357 | `a' to the single-precision floating-point format. The conversion is
2358 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2359 | Arithmetic.
2360 *----------------------------------------------------------------------------*/
2362 float32 float64_to_float32( float64 a STATUS_PARAM )
2364 flag aSign;
2365 int16 aExp;
2366 bits64 aSig;
2367 bits32 zSig;
2369 aSig = extractFloat64Frac( a );
2370 aExp = extractFloat64Exp( a );
2371 aSign = extractFloat64Sign( a );
2372 if ( aExp == 0x7FF ) {
2373 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) );
2374 return packFloat32( aSign, 0xFF, 0 );
2376 shift64RightJamming( aSig, 22, &aSig );
2377 zSig = aSig;
2378 if ( aExp || zSig ) {
2379 zSig |= 0x40000000;
2380 aExp -= 0x381;
2382 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2386 #ifdef FLOATX80
2388 /*----------------------------------------------------------------------------
2389 | Returns the result of converting the double-precision floating-point value
2390 | `a' to the extended double-precision floating-point format. The conversion
2391 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2392 | Arithmetic.
2393 *----------------------------------------------------------------------------*/
2395 floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
2397 flag aSign;
2398 int16 aExp;
2399 bits64 aSig;
2401 aSig = extractFloat64Frac( a );
2402 aExp = extractFloat64Exp( a );
2403 aSign = extractFloat64Sign( a );
2404 if ( aExp == 0x7FF ) {
2405 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) );
2406 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2408 if ( aExp == 0 ) {
2409 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2410 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2412 return
2413 packFloatx80(
2414 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2418 #endif
2420 #ifdef FLOAT128
2422 /*----------------------------------------------------------------------------
2423 | Returns the result of converting the double-precision floating-point value
2424 | `a' to the quadruple-precision floating-point format. The conversion is
2425 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2426 | Arithmetic.
2427 *----------------------------------------------------------------------------*/
2429 float128 float64_to_float128( float64 a STATUS_PARAM )
2431 flag aSign;
2432 int16 aExp;
2433 bits64 aSig, zSig0, zSig1;
2435 aSig = extractFloat64Frac( a );
2436 aExp = extractFloat64Exp( a );
2437 aSign = extractFloat64Sign( a );
2438 if ( aExp == 0x7FF ) {
2439 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) );
2440 return packFloat128( aSign, 0x7FFF, 0, 0 );
2442 if ( aExp == 0 ) {
2443 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2444 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2445 --aExp;
2447 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2448 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2452 #endif
2454 /*----------------------------------------------------------------------------
2455 | Rounds the double-precision floating-point value `a' to an integer, and
2456 | returns the result as a double-precision floating-point value. The
2457 | operation is performed according to the IEC/IEEE Standard for Binary
2458 | Floating-Point Arithmetic.
2459 *----------------------------------------------------------------------------*/
2461 float64 float64_round_to_int( float64 a STATUS_PARAM )
2463 flag aSign;
2464 int16 aExp;
2465 bits64 lastBitMask, roundBitsMask;
2466 int8 roundingMode;
2467 float64 z;
2469 aExp = extractFloat64Exp( a );
2470 if ( 0x433 <= aExp ) {
2471 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2472 return propagateFloat64NaN( a, a STATUS_VAR );
2474 return a;
2476 if ( aExp < 0x3FF ) {
2477 if ( (bits64) ( a<<1 ) == 0 ) return a;
2478 STATUS(float_exception_flags) |= float_flag_inexact;
2479 aSign = extractFloat64Sign( a );
2480 switch ( STATUS(float_rounding_mode) ) {
2481 case float_round_nearest_even:
2482 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2483 return packFloat64( aSign, 0x3FF, 0 );
2485 break;
2486 case float_round_down:
2487 return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
2488 case float_round_up:
2489 return
2490 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2492 return packFloat64( aSign, 0, 0 );
2494 lastBitMask = 1;
2495 lastBitMask <<= 0x433 - aExp;
2496 roundBitsMask = lastBitMask - 1;
2497 z = a;
2498 roundingMode = STATUS(float_rounding_mode);
2499 if ( roundingMode == float_round_nearest_even ) {
2500 z += lastBitMask>>1;
2501 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2503 else if ( roundingMode != float_round_to_zero ) {
2504 if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2505 z += roundBitsMask;
2508 z &= ~ roundBitsMask;
2509 if ( z != a ) STATUS(float_exception_flags) |= float_flag_inexact;
2510 return z;
2514 float64 float64_trunc_to_int( float64 a STATUS_PARAM)
2516 int oldmode;
2517 float64 res;
2518 oldmode = STATUS(float_rounding_mode);
2519 STATUS(float_rounding_mode) = float_round_to_zero;
2520 res = float64_round_to_int(a STATUS_VAR);
2521 STATUS(float_rounding_mode) = oldmode;
2522 return res;
2525 /*----------------------------------------------------------------------------
2526 | Returns the result of adding the absolute values of the double-precision
2527 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
2528 | before being returned. `zSign' is ignored if the result is a NaN.
2529 | The addition is performed according to the IEC/IEEE Standard for Binary
2530 | Floating-Point Arithmetic.
2531 *----------------------------------------------------------------------------*/
2533 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
2535 int16 aExp, bExp, zExp;
2536 bits64 aSig, bSig, zSig;
2537 int16 expDiff;
2539 aSig = extractFloat64Frac( a );
2540 aExp = extractFloat64Exp( a );
2541 bSig = extractFloat64Frac( b );
2542 bExp = extractFloat64Exp( b );
2543 expDiff = aExp - bExp;
2544 aSig <<= 9;
2545 bSig <<= 9;
2546 if ( 0 < expDiff ) {
2547 if ( aExp == 0x7FF ) {
2548 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2549 return a;
2551 if ( bExp == 0 ) {
2552 --expDiff;
2554 else {
2555 bSig |= LIT64( 0x2000000000000000 );
2557 shift64RightJamming( bSig, expDiff, &bSig );
2558 zExp = aExp;
2560 else if ( expDiff < 0 ) {
2561 if ( bExp == 0x7FF ) {
2562 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2563 return packFloat64( zSign, 0x7FF, 0 );
2565 if ( aExp == 0 ) {
2566 ++expDiff;
2568 else {
2569 aSig |= LIT64( 0x2000000000000000 );
2571 shift64RightJamming( aSig, - expDiff, &aSig );
2572 zExp = bExp;
2574 else {
2575 if ( aExp == 0x7FF ) {
2576 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2577 return a;
2579 if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2580 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2581 zExp = aExp;
2582 goto roundAndPack;
2584 aSig |= LIT64( 0x2000000000000000 );
2585 zSig = ( aSig + bSig )<<1;
2586 --zExp;
2587 if ( (sbits64) zSig < 0 ) {
2588 zSig = aSig + bSig;
2589 ++zExp;
2591 roundAndPack:
2592 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2596 /*----------------------------------------------------------------------------
2597 | Returns the result of subtracting the absolute values of the double-
2598 | precision floating-point values `a' and `b'. If `zSign' is 1, the
2599 | difference is negated before being returned. `zSign' is ignored if the
2600 | result is a NaN. The subtraction is performed according to the IEC/IEEE
2601 | Standard for Binary Floating-Point Arithmetic.
2602 *----------------------------------------------------------------------------*/
2604 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
2606 int16 aExp, bExp, zExp;
2607 bits64 aSig, bSig, zSig;
2608 int16 expDiff;
2610 aSig = extractFloat64Frac( a );
2611 aExp = extractFloat64Exp( a );
2612 bSig = extractFloat64Frac( b );
2613 bExp = extractFloat64Exp( b );
2614 expDiff = aExp - bExp;
2615 aSig <<= 10;
2616 bSig <<= 10;
2617 if ( 0 < expDiff ) goto aExpBigger;
2618 if ( expDiff < 0 ) goto bExpBigger;
2619 if ( aExp == 0x7FF ) {
2620 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2621 float_raise( float_flag_invalid STATUS_VAR);
2622 return float64_default_nan;
2624 if ( aExp == 0 ) {
2625 aExp = 1;
2626 bExp = 1;
2628 if ( bSig < aSig ) goto aBigger;
2629 if ( aSig < bSig ) goto bBigger;
2630 return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
2631 bExpBigger:
2632 if ( bExp == 0x7FF ) {
2633 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2634 return packFloat64( zSign ^ 1, 0x7FF, 0 );
2636 if ( aExp == 0 ) {
2637 ++expDiff;
2639 else {
2640 aSig |= LIT64( 0x4000000000000000 );
2642 shift64RightJamming( aSig, - expDiff, &aSig );
2643 bSig |= LIT64( 0x4000000000000000 );
2644 bBigger:
2645 zSig = bSig - aSig;
2646 zExp = bExp;
2647 zSign ^= 1;
2648 goto normalizeRoundAndPack;
2649 aExpBigger:
2650 if ( aExp == 0x7FF ) {
2651 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2652 return a;
2654 if ( bExp == 0 ) {
2655 --expDiff;
2657 else {
2658 bSig |= LIT64( 0x4000000000000000 );
2660 shift64RightJamming( bSig, expDiff, &bSig );
2661 aSig |= LIT64( 0x4000000000000000 );
2662 aBigger:
2663 zSig = aSig - bSig;
2664 zExp = aExp;
2665 normalizeRoundAndPack:
2666 --zExp;
2667 return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2671 /*----------------------------------------------------------------------------
2672 | Returns the result of adding the double-precision floating-point values `a'
2673 | and `b'. The operation is performed according to the IEC/IEEE Standard for
2674 | Binary Floating-Point Arithmetic.
2675 *----------------------------------------------------------------------------*/
2677 float64 float64_add( float64 a, float64 b STATUS_PARAM )
2679 flag aSign, bSign;
2681 aSign = extractFloat64Sign( a );
2682 bSign = extractFloat64Sign( b );
2683 if ( aSign == bSign ) {
2684 return addFloat64Sigs( a, b, aSign STATUS_VAR );
2686 else {
2687 return subFloat64Sigs( a, b, aSign STATUS_VAR );
2692 /*----------------------------------------------------------------------------
2693 | Returns the result of subtracting the double-precision floating-point values
2694 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2695 | for Binary Floating-Point Arithmetic.
2696 *----------------------------------------------------------------------------*/
2698 float64 float64_sub( float64 a, float64 b STATUS_PARAM )
2700 flag aSign, bSign;
2702 aSign = extractFloat64Sign( a );
2703 bSign = extractFloat64Sign( b );
2704 if ( aSign == bSign ) {
2705 return subFloat64Sigs( a, b, aSign STATUS_VAR );
2707 else {
2708 return addFloat64Sigs( a, b, aSign STATUS_VAR );
2713 /*----------------------------------------------------------------------------
2714 | Returns the result of multiplying the double-precision floating-point values
2715 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2716 | for Binary Floating-Point Arithmetic.
2717 *----------------------------------------------------------------------------*/
2719 float64 float64_mul( float64 a, float64 b STATUS_PARAM )
2721 flag aSign, bSign, zSign;
2722 int16 aExp, bExp, zExp;
2723 bits64 aSig, bSig, zSig0, zSig1;
2725 aSig = extractFloat64Frac( a );
2726 aExp = extractFloat64Exp( a );
2727 aSign = extractFloat64Sign( a );
2728 bSig = extractFloat64Frac( b );
2729 bExp = extractFloat64Exp( b );
2730 bSign = extractFloat64Sign( b );
2731 zSign = aSign ^ bSign;
2732 if ( aExp == 0x7FF ) {
2733 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2734 return propagateFloat64NaN( a, b STATUS_VAR );
2736 if ( ( bExp | bSig ) == 0 ) {
2737 float_raise( float_flag_invalid STATUS_VAR);
2738 return float64_default_nan;
2740 return packFloat64( zSign, 0x7FF, 0 );
2742 if ( bExp == 0x7FF ) {
2743 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2744 if ( ( aExp | aSig ) == 0 ) {
2745 float_raise( float_flag_invalid STATUS_VAR);
2746 return float64_default_nan;
2748 return packFloat64( zSign, 0x7FF, 0 );
2750 if ( aExp == 0 ) {
2751 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2752 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2754 if ( bExp == 0 ) {
2755 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2756 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2758 zExp = aExp + bExp - 0x3FF;
2759 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2760 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2761 mul64To128( aSig, bSig, &zSig0, &zSig1 );
2762 zSig0 |= ( zSig1 != 0 );
2763 if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2764 zSig0 <<= 1;
2765 --zExp;
2767 return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
2771 /*----------------------------------------------------------------------------
2772 | Returns the result of dividing the double-precision floating-point value `a'
2773 | by the corresponding value `b'. The operation is performed according to
2774 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2775 *----------------------------------------------------------------------------*/
2777 float64 float64_div( float64 a, float64 b STATUS_PARAM )
2779 flag aSign, bSign, zSign;
2780 int16 aExp, bExp, zExp;
2781 bits64 aSig, bSig, zSig;
2782 bits64 rem0, rem1;
2783 bits64 term0, term1;
2785 aSig = extractFloat64Frac( a );
2786 aExp = extractFloat64Exp( a );
2787 aSign = extractFloat64Sign( a );
2788 bSig = extractFloat64Frac( b );
2789 bExp = extractFloat64Exp( b );
2790 bSign = extractFloat64Sign( b );
2791 zSign = aSign ^ bSign;
2792 if ( aExp == 0x7FF ) {
2793 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2794 if ( bExp == 0x7FF ) {
2795 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2796 float_raise( float_flag_invalid STATUS_VAR);
2797 return float64_default_nan;
2799 return packFloat64( zSign, 0x7FF, 0 );
2801 if ( bExp == 0x7FF ) {
2802 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2803 return packFloat64( zSign, 0, 0 );
2805 if ( bExp == 0 ) {
2806 if ( bSig == 0 ) {
2807 if ( ( aExp | aSig ) == 0 ) {
2808 float_raise( float_flag_invalid STATUS_VAR);
2809 return float64_default_nan;
2811 float_raise( float_flag_divbyzero STATUS_VAR);
2812 return packFloat64( zSign, 0x7FF, 0 );
2814 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2816 if ( aExp == 0 ) {
2817 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2818 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2820 zExp = aExp - bExp + 0x3FD;
2821 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2822 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2823 if ( bSig <= ( aSig + aSig ) ) {
2824 aSig >>= 1;
2825 ++zExp;
2827 zSig = estimateDiv128To64( aSig, 0, bSig );
2828 if ( ( zSig & 0x1FF ) <= 2 ) {
2829 mul64To128( bSig, zSig, &term0, &term1 );
2830 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2831 while ( (sbits64) rem0 < 0 ) {
2832 --zSig;
2833 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2835 zSig |= ( rem1 != 0 );
2837 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2841 /*----------------------------------------------------------------------------
2842 | Returns the remainder of the double-precision floating-point value `a'
2843 | with respect to the corresponding value `b'. The operation is performed
2844 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2845 *----------------------------------------------------------------------------*/
2847 float64 float64_rem( float64 a, float64 b STATUS_PARAM )
2849 flag aSign, bSign, zSign;
2850 int16 aExp, bExp, expDiff;
2851 bits64 aSig, bSig;
2852 bits64 q, alternateASig;
2853 sbits64 sigMean;
2855 aSig = extractFloat64Frac( a );
2856 aExp = extractFloat64Exp( a );
2857 aSign = extractFloat64Sign( a );
2858 bSig = extractFloat64Frac( b );
2859 bExp = extractFloat64Exp( b );
2860 bSign = extractFloat64Sign( b );
2861 if ( aExp == 0x7FF ) {
2862 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2863 return propagateFloat64NaN( a, b STATUS_VAR );
2865 float_raise( float_flag_invalid STATUS_VAR);
2866 return float64_default_nan;
2868 if ( bExp == 0x7FF ) {
2869 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2870 return a;
2872 if ( bExp == 0 ) {
2873 if ( bSig == 0 ) {
2874 float_raise( float_flag_invalid STATUS_VAR);
2875 return float64_default_nan;
2877 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2879 if ( aExp == 0 ) {
2880 if ( aSig == 0 ) return a;
2881 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2883 expDiff = aExp - bExp;
2884 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
2885 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2886 if ( expDiff < 0 ) {
2887 if ( expDiff < -1 ) return a;
2888 aSig >>= 1;
2890 q = ( bSig <= aSig );
2891 if ( q ) aSig -= bSig;
2892 expDiff -= 64;
2893 while ( 0 < expDiff ) {
2894 q = estimateDiv128To64( aSig, 0, bSig );
2895 q = ( 2 < q ) ? q - 2 : 0;
2896 aSig = - ( ( bSig>>2 ) * q );
2897 expDiff -= 62;
2899 expDiff += 64;
2900 if ( 0 < expDiff ) {
2901 q = estimateDiv128To64( aSig, 0, bSig );
2902 q = ( 2 < q ) ? q - 2 : 0;
2903 q >>= 64 - expDiff;
2904 bSig >>= 2;
2905 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2907 else {
2908 aSig >>= 2;
2909 bSig >>= 2;
2911 do {
2912 alternateASig = aSig;
2913 ++q;
2914 aSig -= bSig;
2915 } while ( 0 <= (sbits64) aSig );
2916 sigMean = aSig + alternateASig;
2917 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2918 aSig = alternateASig;
2920 zSign = ( (sbits64) aSig < 0 );
2921 if ( zSign ) aSig = - aSig;
2922 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
2926 /*----------------------------------------------------------------------------
2927 | Returns the square root of the double-precision floating-point value `a'.
2928 | The operation is performed according to the IEC/IEEE Standard for Binary
2929 | Floating-Point Arithmetic.
2930 *----------------------------------------------------------------------------*/
2932 float64 float64_sqrt( float64 a STATUS_PARAM )
2934 flag aSign;
2935 int16 aExp, zExp;
2936 bits64 aSig, zSig, doubleZSig;
2937 bits64 rem0, rem1, term0, term1;
2939 aSig = extractFloat64Frac( a );
2940 aExp = extractFloat64Exp( a );
2941 aSign = extractFloat64Sign( a );
2942 if ( aExp == 0x7FF ) {
2943 if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
2944 if ( ! aSign ) return a;
2945 float_raise( float_flag_invalid STATUS_VAR);
2946 return float64_default_nan;
2948 if ( aSign ) {
2949 if ( ( aExp | aSig ) == 0 ) return a;
2950 float_raise( float_flag_invalid STATUS_VAR);
2951 return float64_default_nan;
2953 if ( aExp == 0 ) {
2954 if ( aSig == 0 ) return 0;
2955 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2957 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
2958 aSig |= LIT64( 0x0010000000000000 );
2959 zSig = estimateSqrt32( aExp, aSig>>21 );
2960 aSig <<= 9 - ( aExp & 1 );
2961 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
2962 if ( ( zSig & 0x1FF ) <= 5 ) {
2963 doubleZSig = zSig<<1;
2964 mul64To128( zSig, zSig, &term0, &term1 );
2965 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2966 while ( (sbits64) rem0 < 0 ) {
2967 --zSig;
2968 doubleZSig -= 2;
2969 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
2971 zSig |= ( ( rem0 | rem1 ) != 0 );
2973 return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
2977 /*----------------------------------------------------------------------------
2978 | Returns 1 if the double-precision floating-point value `a' is equal to the
2979 | corresponding value `b', and 0 otherwise. The comparison is performed
2980 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2981 *----------------------------------------------------------------------------*/
2983 int float64_eq( float64 a, float64 b STATUS_PARAM )
2986 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2987 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2989 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2990 float_raise( float_flag_invalid STATUS_VAR);
2992 return 0;
2994 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2998 /*----------------------------------------------------------------------------
2999 | Returns 1 if the double-precision floating-point value `a' is less than or
3000 | equal to the corresponding value `b', and 0 otherwise. The comparison is
3001 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3002 | Arithmetic.
3003 *----------------------------------------------------------------------------*/
3005 int float64_le( float64 a, float64 b STATUS_PARAM )
3007 flag aSign, bSign;
3009 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3010 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3012 float_raise( float_flag_invalid STATUS_VAR);
3013 return 0;
3015 aSign = extractFloat64Sign( a );
3016 bSign = extractFloat64Sign( b );
3017 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
3018 return ( a == b ) || ( aSign ^ ( a < b ) );
3022 /*----------------------------------------------------------------------------
3023 | Returns 1 if the double-precision floating-point value `a' is less than
3024 | the corresponding value `b', and 0 otherwise. The comparison is performed
3025 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3026 *----------------------------------------------------------------------------*/
3028 int float64_lt( float64 a, float64 b STATUS_PARAM )
3030 flag aSign, bSign;
3032 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3033 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3035 float_raise( float_flag_invalid STATUS_VAR);
3036 return 0;
3038 aSign = extractFloat64Sign( a );
3039 bSign = extractFloat64Sign( b );
3040 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
3041 return ( a != b ) && ( aSign ^ ( a < b ) );
3045 /*----------------------------------------------------------------------------
3046 | Returns 1 if the double-precision floating-point value `a' is equal to the
3047 | corresponding value `b', and 0 otherwise. The invalid exception is raised
3048 | if either operand is a NaN. Otherwise, the comparison is performed
3049 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3050 *----------------------------------------------------------------------------*/
3052 int float64_eq_signaling( float64 a, float64 b STATUS_PARAM )
3055 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3056 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3058 float_raise( float_flag_invalid STATUS_VAR);
3059 return 0;
3061 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
3065 /*----------------------------------------------------------------------------
3066 | Returns 1 if the double-precision floating-point value `a' is less than or
3067 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3068 | cause an exception. Otherwise, the comparison is performed according to the
3069 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3070 *----------------------------------------------------------------------------*/
3072 int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
3074 flag aSign, bSign;
3076 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3077 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3079 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3080 float_raise( float_flag_invalid STATUS_VAR);
3082 return 0;
3084 aSign = extractFloat64Sign( a );
3085 bSign = extractFloat64Sign( b );
3086 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
3087 return ( a == b ) || ( aSign ^ ( a < b ) );
3091 /*----------------------------------------------------------------------------
3092 | Returns 1 if the double-precision floating-point value `a' is less than
3093 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3094 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
3095 | Standard for Binary Floating-Point Arithmetic.
3096 *----------------------------------------------------------------------------*/
3098 int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
3100 flag aSign, bSign;
3102 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3103 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3105 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3106 float_raise( float_flag_invalid STATUS_VAR);
3108 return 0;
3110 aSign = extractFloat64Sign( a );
3111 bSign = extractFloat64Sign( b );
3112 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
3113 return ( a != b ) && ( aSign ^ ( a < b ) );
3117 #ifdef FLOATX80
3119 /*----------------------------------------------------------------------------
3120 | Returns the result of converting the extended double-precision floating-
3121 | point value `a' to the 32-bit two's complement integer format. The
3122 | conversion is performed according to the IEC/IEEE Standard for Binary
3123 | Floating-Point Arithmetic---which means in particular that the conversion
3124 | is rounded according to the current rounding mode. If `a' is a NaN, the
3125 | largest positive integer is returned. Otherwise, if the conversion
3126 | overflows, the largest integer with the same sign as `a' is returned.
3127 *----------------------------------------------------------------------------*/
3129 int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
3131 flag aSign;
3132 int32 aExp, shiftCount;
3133 bits64 aSig;
3135 aSig = extractFloatx80Frac( a );
3136 aExp = extractFloatx80Exp( a );
3137 aSign = extractFloatx80Sign( a );
3138 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3139 shiftCount = 0x4037 - aExp;
3140 if ( shiftCount <= 0 ) shiftCount = 1;
3141 shift64RightJamming( aSig, shiftCount, &aSig );
3142 return roundAndPackInt32( aSign, aSig STATUS_VAR );
3146 /*----------------------------------------------------------------------------
3147 | Returns the result of converting the extended double-precision floating-
3148 | point value `a' to the 32-bit two's complement integer format. The
3149 | conversion is performed according to the IEC/IEEE Standard for Binary
3150 | Floating-Point Arithmetic, except that the conversion is always rounded
3151 | toward zero. If `a' is a NaN, the largest positive integer is returned.
3152 | Otherwise, if the conversion overflows, the largest integer with the same
3153 | sign as `a' is returned.
3154 *----------------------------------------------------------------------------*/
3156 int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
3158 flag aSign;
3159 int32 aExp, shiftCount;
3160 bits64 aSig, savedASig;
3161 int32 z;
3163 aSig = extractFloatx80Frac( a );
3164 aExp = extractFloatx80Exp( a );
3165 aSign = extractFloatx80Sign( a );
3166 if ( 0x401E < aExp ) {
3167 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3168 goto invalid;
3170 else if ( aExp < 0x3FFF ) {
3171 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3172 return 0;
3174 shiftCount = 0x403E - aExp;
3175 savedASig = aSig;
3176 aSig >>= shiftCount;
3177 z = aSig;
3178 if ( aSign ) z = - z;
3179 if ( ( z < 0 ) ^ aSign ) {
3180 invalid:
3181 float_raise( float_flag_invalid STATUS_VAR);
3182 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3184 if ( ( aSig<<shiftCount ) != savedASig ) {
3185 STATUS(float_exception_flags) |= float_flag_inexact;
3187 return z;
3191 /*----------------------------------------------------------------------------
3192 | Returns the result of converting the extended double-precision floating-
3193 | point value `a' to the 64-bit two's complement integer format. The
3194 | conversion is performed according to the IEC/IEEE Standard for Binary
3195 | Floating-Point Arithmetic---which means in particular that the conversion
3196 | is rounded according to the current rounding mode. If `a' is a NaN,
3197 | the largest positive integer is returned. Otherwise, if the conversion
3198 | overflows, the largest integer with the same sign as `a' is returned.
3199 *----------------------------------------------------------------------------*/
3201 int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
3203 flag aSign;
3204 int32 aExp, shiftCount;
3205 bits64 aSig, aSigExtra;
3207 aSig = extractFloatx80Frac( a );
3208 aExp = extractFloatx80Exp( a );
3209 aSign = extractFloatx80Sign( a );
3210 shiftCount = 0x403E - aExp;
3211 if ( shiftCount <= 0 ) {
3212 if ( shiftCount ) {
3213 float_raise( float_flag_invalid STATUS_VAR);
3214 if ( ! aSign
3215 || ( ( aExp == 0x7FFF )
3216 && ( aSig != LIT64( 0x8000000000000000 ) ) )
3218 return LIT64( 0x7FFFFFFFFFFFFFFF );
3220 return (sbits64) LIT64( 0x8000000000000000 );
3222 aSigExtra = 0;
3224 else {
3225 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3227 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3231 /*----------------------------------------------------------------------------
3232 | Returns the result of converting the extended double-precision floating-
3233 | point value `a' to the 64-bit two's complement integer format. The
3234 | conversion is performed according to the IEC/IEEE Standard for Binary
3235 | Floating-Point Arithmetic, except that the conversion is always rounded
3236 | toward zero. If `a' is a NaN, the largest positive integer is returned.
3237 | Otherwise, if the conversion overflows, the largest integer with the same
3238 | sign as `a' is returned.
3239 *----------------------------------------------------------------------------*/
3241 int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
3243 flag aSign;
3244 int32 aExp, shiftCount;
3245 bits64 aSig;
3246 int64 z;
3248 aSig = extractFloatx80Frac( a );
3249 aExp = extractFloatx80Exp( a );
3250 aSign = extractFloatx80Sign( a );
3251 shiftCount = aExp - 0x403E;
3252 if ( 0 <= shiftCount ) {
3253 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3254 if ( ( a.high != 0xC03E ) || aSig ) {
3255 float_raise( float_flag_invalid STATUS_VAR);
3256 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3257 return LIT64( 0x7FFFFFFFFFFFFFFF );
3260 return (sbits64) LIT64( 0x8000000000000000 );
3262 else if ( aExp < 0x3FFF ) {
3263 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3264 return 0;
3266 z = aSig>>( - shiftCount );
3267 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3268 STATUS(float_exception_flags) |= float_flag_inexact;
3270 if ( aSign ) z = - z;
3271 return z;
3275 /*----------------------------------------------------------------------------
3276 | Returns the result of converting the extended double-precision floating-
3277 | point value `a' to the single-precision floating-point format. The
3278 | conversion is performed according to the IEC/IEEE Standard for Binary
3279 | Floating-Point Arithmetic.
3280 *----------------------------------------------------------------------------*/
3282 float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
3284 flag aSign;
3285 int32 aExp;
3286 bits64 aSig;
3288 aSig = extractFloatx80Frac( a );
3289 aExp = extractFloatx80Exp( a );
3290 aSign = extractFloatx80Sign( a );
3291 if ( aExp == 0x7FFF ) {
3292 if ( (bits64) ( aSig<<1 ) ) {
3293 return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) );
3295 return packFloat32( aSign, 0xFF, 0 );
3297 shift64RightJamming( aSig, 33, &aSig );
3298 if ( aExp || aSig ) aExp -= 0x3F81;
3299 return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
3303 /*----------------------------------------------------------------------------
3304 | Returns the result of converting the extended double-precision floating-
3305 | point value `a' to the double-precision floating-point format. The
3306 | conversion is performed according to the IEC/IEEE Standard for Binary
3307 | Floating-Point Arithmetic.
3308 *----------------------------------------------------------------------------*/
3310 float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
3312 flag aSign;
3313 int32 aExp;
3314 bits64 aSig, zSig;
3316 aSig = extractFloatx80Frac( a );
3317 aExp = extractFloatx80Exp( a );
3318 aSign = extractFloatx80Sign( a );
3319 if ( aExp == 0x7FFF ) {
3320 if ( (bits64) ( aSig<<1 ) ) {
3321 return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) );
3323 return packFloat64( aSign, 0x7FF, 0 );
3325 shift64RightJamming( aSig, 1, &zSig );
3326 if ( aExp || aSig ) aExp -= 0x3C01;
3327 return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
3331 #ifdef FLOAT128
3333 /*----------------------------------------------------------------------------
3334 | Returns the result of converting the extended double-precision floating-
3335 | point value `a' to the quadruple-precision floating-point format. The
3336 | conversion is performed according to the IEC/IEEE Standard for Binary
3337 | Floating-Point Arithmetic.
3338 *----------------------------------------------------------------------------*/
3340 float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
3342 flag aSign;
3343 int16 aExp;
3344 bits64 aSig, zSig0, zSig1;
3346 aSig = extractFloatx80Frac( a );
3347 aExp = extractFloatx80Exp( a );
3348 aSign = extractFloatx80Sign( a );
3349 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3350 return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) );
3352 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3353 return packFloat128( aSign, aExp, zSig0, zSig1 );
3357 #endif
3359 /*----------------------------------------------------------------------------
3360 | Rounds the extended double-precision floating-point value `a' to an integer,
3361 | and returns the result as an extended quadruple-precision floating-point
3362 | value. The operation is performed according to the IEC/IEEE Standard for
3363 | Binary Floating-Point Arithmetic.
3364 *----------------------------------------------------------------------------*/
3366 floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
3368 flag aSign;
3369 int32 aExp;
3370 bits64 lastBitMask, roundBitsMask;
3371 int8 roundingMode;
3372 floatx80 z;
3374 aExp = extractFloatx80Exp( a );
3375 if ( 0x403E <= aExp ) {
3376 if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3377 return propagateFloatx80NaN( a, a STATUS_VAR );
3379 return a;
3381 if ( aExp < 0x3FFF ) {
3382 if ( ( aExp == 0 )
3383 && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3384 return a;
3386 STATUS(float_exception_flags) |= float_flag_inexact;
3387 aSign = extractFloatx80Sign( a );
3388 switch ( STATUS(float_rounding_mode) ) {
3389 case float_round_nearest_even:
3390 if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3392 return
3393 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3395 break;
3396 case float_round_down:
3397 return
3398 aSign ?
3399 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3400 : packFloatx80( 0, 0, 0 );
3401 case float_round_up:
3402 return
3403 aSign ? packFloatx80( 1, 0, 0 )
3404 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3406 return packFloatx80( aSign, 0, 0 );
3408 lastBitMask = 1;
3409 lastBitMask <<= 0x403E - aExp;
3410 roundBitsMask = lastBitMask - 1;
3411 z = a;
3412 roundingMode = STATUS(float_rounding_mode);
3413 if ( roundingMode == float_round_nearest_even ) {
3414 z.low += lastBitMask>>1;
3415 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3417 else if ( roundingMode != float_round_to_zero ) {
3418 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3419 z.low += roundBitsMask;
3422 z.low &= ~ roundBitsMask;
3423 if ( z.low == 0 ) {
3424 ++z.high;
3425 z.low = LIT64( 0x8000000000000000 );
3427 if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
3428 return z;
3432 /*----------------------------------------------------------------------------
3433 | Returns the result of adding the absolute values of the extended double-
3434 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
3435 | negated before being returned. `zSign' is ignored if the result is a NaN.
3436 | The addition is performed according to the IEC/IEEE Standard for Binary
3437 | Floating-Point Arithmetic.
3438 *----------------------------------------------------------------------------*/
3440 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
3442 int32 aExp, bExp, zExp;
3443 bits64 aSig, bSig, zSig0, zSig1;
3444 int32 expDiff;
3446 aSig = extractFloatx80Frac( a );
3447 aExp = extractFloatx80Exp( a );
3448 bSig = extractFloatx80Frac( b );
3449 bExp = extractFloatx80Exp( b );
3450 expDiff = aExp - bExp;
3451 if ( 0 < expDiff ) {
3452 if ( aExp == 0x7FFF ) {
3453 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3454 return a;
3456 if ( bExp == 0 ) --expDiff;
3457 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3458 zExp = aExp;
3460 else if ( expDiff < 0 ) {
3461 if ( bExp == 0x7FFF ) {
3462 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3463 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3465 if ( aExp == 0 ) ++expDiff;
3466 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3467 zExp = bExp;
3469 else {
3470 if ( aExp == 0x7FFF ) {
3471 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3472 return propagateFloatx80NaN( a, b STATUS_VAR );
3474 return a;
3476 zSig1 = 0;
3477 zSig0 = aSig + bSig;
3478 if ( aExp == 0 ) {
3479 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3480 goto roundAndPack;
3482 zExp = aExp;
3483 goto shiftRight1;
3485 zSig0 = aSig + bSig;
3486 if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3487 shiftRight1:
3488 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3489 zSig0 |= LIT64( 0x8000000000000000 );
3490 ++zExp;
3491 roundAndPack:
3492 return
3493 roundAndPackFloatx80(
3494 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3498 /*----------------------------------------------------------------------------
3499 | Returns the result of subtracting the absolute values of the extended
3500 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
3501 | difference is negated before being returned. `zSign' is ignored if the
3502 | result is a NaN. The subtraction is performed according to the IEC/IEEE
3503 | Standard for Binary Floating-Point Arithmetic.
3504 *----------------------------------------------------------------------------*/
3506 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
3508 int32 aExp, bExp, zExp;
3509 bits64 aSig, bSig, zSig0, zSig1;
3510 int32 expDiff;
3511 floatx80 z;
3513 aSig = extractFloatx80Frac( a );
3514 aExp = extractFloatx80Exp( a );
3515 bSig = extractFloatx80Frac( b );
3516 bExp = extractFloatx80Exp( b );
3517 expDiff = aExp - bExp;
3518 if ( 0 < expDiff ) goto aExpBigger;
3519 if ( expDiff < 0 ) goto bExpBigger;
3520 if ( aExp == 0x7FFF ) {
3521 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3522 return propagateFloatx80NaN( a, b STATUS_VAR );
3524 float_raise( float_flag_invalid STATUS_VAR);
3525 z.low = floatx80_default_nan_low;
3526 z.high = floatx80_default_nan_high;
3527 return z;
3529 if ( aExp == 0 ) {
3530 aExp = 1;
3531 bExp = 1;
3533 zSig1 = 0;
3534 if ( bSig < aSig ) goto aBigger;
3535 if ( aSig < bSig ) goto bBigger;
3536 return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3537 bExpBigger:
3538 if ( bExp == 0x7FFF ) {
3539 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3540 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3542 if ( aExp == 0 ) ++expDiff;
3543 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3544 bBigger:
3545 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3546 zExp = bExp;
3547 zSign ^= 1;
3548 goto normalizeRoundAndPack;
3549 aExpBigger:
3550 if ( aExp == 0x7FFF ) {
3551 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3552 return a;
3554 if ( bExp == 0 ) --expDiff;
3555 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3556 aBigger:
3557 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3558 zExp = aExp;
3559 normalizeRoundAndPack:
3560 return
3561 normalizeRoundAndPackFloatx80(
3562 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3566 /*----------------------------------------------------------------------------
3567 | Returns the result of adding the extended double-precision floating-point
3568 | values `a' and `b'. The operation is performed according to the IEC/IEEE
3569 | Standard for Binary Floating-Point Arithmetic.
3570 *----------------------------------------------------------------------------*/
3572 floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
3574 flag aSign, bSign;
3576 aSign = extractFloatx80Sign( a );
3577 bSign = extractFloatx80Sign( b );
3578 if ( aSign == bSign ) {
3579 return addFloatx80Sigs( a, b, aSign STATUS_VAR );
3581 else {
3582 return subFloatx80Sigs( a, b, aSign STATUS_VAR );
3587 /*----------------------------------------------------------------------------
3588 | Returns the result of subtracting the extended double-precision floating-
3589 | point values `a' and `b'. The operation is performed according to the
3590 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3591 *----------------------------------------------------------------------------*/
3593 floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
3595 flag aSign, bSign;
3597 aSign = extractFloatx80Sign( a );
3598 bSign = extractFloatx80Sign( b );
3599 if ( aSign == bSign ) {
3600 return subFloatx80Sigs( a, b, aSign STATUS_VAR );
3602 else {
3603 return addFloatx80Sigs( a, b, aSign STATUS_VAR );
3608 /*----------------------------------------------------------------------------
3609 | Returns the result of multiplying the extended double-precision floating-
3610 | point values `a' and `b'. The operation is performed according to the
3611 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3612 *----------------------------------------------------------------------------*/
3614 floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
3616 flag aSign, bSign, zSign;
3617 int32 aExp, bExp, zExp;
3618 bits64 aSig, bSig, zSig0, zSig1;
3619 floatx80 z;
3621 aSig = extractFloatx80Frac( a );
3622 aExp = extractFloatx80Exp( a );
3623 aSign = extractFloatx80Sign( a );
3624 bSig = extractFloatx80Frac( b );
3625 bExp = extractFloatx80Exp( b );
3626 bSign = extractFloatx80Sign( b );
3627 zSign = aSign ^ bSign;
3628 if ( aExp == 0x7FFF ) {
3629 if ( (bits64) ( aSig<<1 )
3630 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3631 return propagateFloatx80NaN( a, b STATUS_VAR );
3633 if ( ( bExp | bSig ) == 0 ) goto invalid;
3634 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3636 if ( bExp == 0x7FFF ) {
3637 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3638 if ( ( aExp | aSig ) == 0 ) {
3639 invalid:
3640 float_raise( float_flag_invalid STATUS_VAR);
3641 z.low = floatx80_default_nan_low;
3642 z.high = floatx80_default_nan_high;
3643 return z;
3645 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3647 if ( aExp == 0 ) {
3648 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3649 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3651 if ( bExp == 0 ) {
3652 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3653 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3655 zExp = aExp + bExp - 0x3FFE;
3656 mul64To128( aSig, bSig, &zSig0, &zSig1 );
3657 if ( 0 < (sbits64) zSig0 ) {
3658 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3659 --zExp;
3661 return
3662 roundAndPackFloatx80(
3663 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3667 /*----------------------------------------------------------------------------
3668 | Returns the result of dividing the extended double-precision floating-point
3669 | value `a' by the corresponding value `b'. The operation is performed
3670 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3671 *----------------------------------------------------------------------------*/
3673 floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
3675 flag aSign, bSign, zSign;
3676 int32 aExp, bExp, zExp;
3677 bits64 aSig, bSig, zSig0, zSig1;
3678 bits64 rem0, rem1, rem2, term0, term1, term2;
3679 floatx80 z;
3681 aSig = extractFloatx80Frac( a );
3682 aExp = extractFloatx80Exp( a );
3683 aSign = extractFloatx80Sign( a );
3684 bSig = extractFloatx80Frac( b );
3685 bExp = extractFloatx80Exp( b );
3686 bSign = extractFloatx80Sign( b );
3687 zSign = aSign ^ bSign;
3688 if ( aExp == 0x7FFF ) {
3689 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3690 if ( bExp == 0x7FFF ) {
3691 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3692 goto invalid;
3694 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3696 if ( bExp == 0x7FFF ) {
3697 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3698 return packFloatx80( zSign, 0, 0 );
3700 if ( bExp == 0 ) {
3701 if ( bSig == 0 ) {
3702 if ( ( aExp | aSig ) == 0 ) {
3703 invalid:
3704 float_raise( float_flag_invalid STATUS_VAR);
3705 z.low = floatx80_default_nan_low;
3706 z.high = floatx80_default_nan_high;
3707 return z;
3709 float_raise( float_flag_divbyzero STATUS_VAR);
3710 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3712 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3714 if ( aExp == 0 ) {
3715 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3716 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3718 zExp = aExp - bExp + 0x3FFE;
3719 rem1 = 0;
3720 if ( bSig <= aSig ) {
3721 shift128Right( aSig, 0, 1, &aSig, &rem1 );
3722 ++zExp;
3724 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3725 mul64To128( bSig, zSig0, &term0, &term1 );
3726 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3727 while ( (sbits64) rem0 < 0 ) {
3728 --zSig0;
3729 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3731 zSig1 = estimateDiv128To64( rem1, 0, bSig );
3732 if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3733 mul64To128( bSig, zSig1, &term1, &term2 );
3734 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3735 while ( (sbits64) rem1 < 0 ) {
3736 --zSig1;
3737 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3739 zSig1 |= ( ( rem1 | rem2 ) != 0 );
3741 return
3742 roundAndPackFloatx80(
3743 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3747 /*----------------------------------------------------------------------------
3748 | Returns the remainder of the extended double-precision floating-point value
3749 | `a' with respect to the corresponding value `b'. The operation is performed
3750 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3751 *----------------------------------------------------------------------------*/
3753 floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
3755 flag aSign, bSign, zSign;
3756 int32 aExp, bExp, expDiff;
3757 bits64 aSig0, aSig1, bSig;
3758 bits64 q, term0, term1, alternateASig0, alternateASig1;
3759 floatx80 z;
3761 aSig0 = extractFloatx80Frac( a );
3762 aExp = extractFloatx80Exp( a );
3763 aSign = extractFloatx80Sign( a );
3764 bSig = extractFloatx80Frac( b );
3765 bExp = extractFloatx80Exp( b );
3766 bSign = extractFloatx80Sign( b );
3767 if ( aExp == 0x7FFF ) {
3768 if ( (bits64) ( aSig0<<1 )
3769 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3770 return propagateFloatx80NaN( a, b STATUS_VAR );
3772 goto invalid;
3774 if ( bExp == 0x7FFF ) {
3775 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3776 return a;
3778 if ( bExp == 0 ) {
3779 if ( bSig == 0 ) {
3780 invalid:
3781 float_raise( float_flag_invalid STATUS_VAR);
3782 z.low = floatx80_default_nan_low;
3783 z.high = floatx80_default_nan_high;
3784 return z;
3786 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3788 if ( aExp == 0 ) {
3789 if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3790 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3792 bSig |= LIT64( 0x8000000000000000 );
3793 zSign = aSign;
3794 expDiff = aExp - bExp;
3795 aSig1 = 0;
3796 if ( expDiff < 0 ) {
3797 if ( expDiff < -1 ) return a;
3798 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3799 expDiff = 0;
3801 q = ( bSig <= aSig0 );
3802 if ( q ) aSig0 -= bSig;
3803 expDiff -= 64;
3804 while ( 0 < expDiff ) {
3805 q = estimateDiv128To64( aSig0, aSig1, bSig );
3806 q = ( 2 < q ) ? q - 2 : 0;
3807 mul64To128( bSig, q, &term0, &term1 );
3808 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3809 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3810 expDiff -= 62;
3812 expDiff += 64;
3813 if ( 0 < expDiff ) {
3814 q = estimateDiv128To64( aSig0, aSig1, bSig );
3815 q = ( 2 < q ) ? q - 2 : 0;
3816 q >>= 64 - expDiff;
3817 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3818 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3819 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3820 while ( le128( term0, term1, aSig0, aSig1 ) ) {
3821 ++q;
3822 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3825 else {
3826 term1 = 0;
3827 term0 = bSig;
3829 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3830 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3831 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3832 && ( q & 1 ) )
3834 aSig0 = alternateASig0;
3835 aSig1 = alternateASig1;
3836 zSign = ! zSign;
3838 return
3839 normalizeRoundAndPackFloatx80(
3840 80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
3844 /*----------------------------------------------------------------------------
3845 | Returns the square root of the extended double-precision floating-point
3846 | value `a'. The operation is performed according to the IEC/IEEE Standard
3847 | for Binary Floating-Point Arithmetic.
3848 *----------------------------------------------------------------------------*/
3850 floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
3852 flag aSign;
3853 int32 aExp, zExp;
3854 bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
3855 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3856 floatx80 z;
3858 aSig0 = extractFloatx80Frac( a );
3859 aExp = extractFloatx80Exp( a );
3860 aSign = extractFloatx80Sign( a );
3861 if ( aExp == 0x7FFF ) {
3862 if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
3863 if ( ! aSign ) return a;
3864 goto invalid;
3866 if ( aSign ) {
3867 if ( ( aExp | aSig0 ) == 0 ) return a;
3868 invalid:
3869 float_raise( float_flag_invalid STATUS_VAR);
3870 z.low = floatx80_default_nan_low;
3871 z.high = floatx80_default_nan_high;
3872 return z;
3874 if ( aExp == 0 ) {
3875 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
3876 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3878 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
3879 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
3880 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
3881 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
3882 doubleZSig0 = zSig0<<1;
3883 mul64To128( zSig0, zSig0, &term0, &term1 );
3884 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
3885 while ( (sbits64) rem0 < 0 ) {
3886 --zSig0;
3887 doubleZSig0 -= 2;
3888 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
3890 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
3891 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
3892 if ( zSig1 == 0 ) zSig1 = 1;
3893 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
3894 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3895 mul64To128( zSig1, zSig1, &term2, &term3 );
3896 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
3897 while ( (sbits64) rem1 < 0 ) {
3898 --zSig1;
3899 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
3900 term3 |= 1;
3901 term2 |= doubleZSig0;
3902 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
3904 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
3906 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
3907 zSig0 |= doubleZSig0;
3908 return
3909 roundAndPackFloatx80(
3910 STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
3914 /*----------------------------------------------------------------------------
3915 | Returns 1 if the extended double-precision floating-point value `a' is
3916 | equal to the corresponding value `b', and 0 otherwise. The comparison is
3917 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3918 | Arithmetic.
3919 *----------------------------------------------------------------------------*/
3921 int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
3924 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3925 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3926 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3927 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3929 if ( floatx80_is_signaling_nan( a )
3930 || floatx80_is_signaling_nan( b ) ) {
3931 float_raise( float_flag_invalid STATUS_VAR);
3933 return 0;
3935 return
3936 ( a.low == b.low )
3937 && ( ( a.high == b.high )
3938 || ( ( a.low == 0 )
3939 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3944 /*----------------------------------------------------------------------------
3945 | Returns 1 if the extended double-precision floating-point value `a' is
3946 | less than or equal to the corresponding value `b', and 0 otherwise. The
3947 | comparison is performed according to the IEC/IEEE Standard for Binary
3948 | Floating-Point Arithmetic.
3949 *----------------------------------------------------------------------------*/
3951 int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
3953 flag aSign, bSign;
3955 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3956 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3957 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3958 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3960 float_raise( float_flag_invalid STATUS_VAR);
3961 return 0;
3963 aSign = extractFloatx80Sign( a );
3964 bSign = extractFloatx80Sign( b );
3965 if ( aSign != bSign ) {
3966 return
3967 aSign
3968 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3969 == 0 );
3971 return
3972 aSign ? le128( b.high, b.low, a.high, a.low )
3973 : le128( a.high, a.low, b.high, b.low );
3977 /*----------------------------------------------------------------------------
3978 | Returns 1 if the extended double-precision floating-point value `a' is
3979 | less than the corresponding value `b', and 0 otherwise. The comparison
3980 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3981 | Arithmetic.
3982 *----------------------------------------------------------------------------*/
3984 int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
3986 flag aSign, bSign;
3988 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3989 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3990 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3991 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3993 float_raise( float_flag_invalid STATUS_VAR);
3994 return 0;
3996 aSign = extractFloatx80Sign( a );
3997 bSign = extractFloatx80Sign( b );
3998 if ( aSign != bSign ) {
3999 return
4000 aSign
4001 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4002 != 0 );
4004 return
4005 aSign ? lt128( b.high, b.low, a.high, a.low )
4006 : lt128( a.high, a.low, b.high, b.low );
4010 /*----------------------------------------------------------------------------
4011 | Returns 1 if the extended double-precision floating-point value `a' is equal
4012 | to the corresponding value `b', and 0 otherwise. The invalid exception is
4013 | raised if either operand is a NaN. Otherwise, the comparison is performed
4014 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4015 *----------------------------------------------------------------------------*/
4017 int floatx80_eq_signaling( floatx80 a, floatx80 b STATUS_PARAM )
4020 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4021 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4022 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4023 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4025 float_raise( float_flag_invalid STATUS_VAR);
4026 return 0;
4028 return
4029 ( a.low == b.low )
4030 && ( ( a.high == b.high )
4031 || ( ( a.low == 0 )
4032 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4037 /*----------------------------------------------------------------------------
4038 | Returns 1 if the extended double-precision floating-point value `a' is less
4039 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
4040 | do not cause an exception. Otherwise, the comparison is performed according
4041 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4042 *----------------------------------------------------------------------------*/
4044 int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4046 flag aSign, bSign;
4048 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4049 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4050 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4051 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4053 if ( floatx80_is_signaling_nan( a )
4054 || floatx80_is_signaling_nan( b ) ) {
4055 float_raise( float_flag_invalid STATUS_VAR);
4057 return 0;
4059 aSign = extractFloatx80Sign( a );
4060 bSign = extractFloatx80Sign( b );
4061 if ( aSign != bSign ) {
4062 return
4063 aSign
4064 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4065 == 0 );
4067 return
4068 aSign ? le128( b.high, b.low, a.high, a.low )
4069 : le128( a.high, a.low, b.high, b.low );
4073 /*----------------------------------------------------------------------------
4074 | Returns 1 if the extended double-precision floating-point value `a' is less
4075 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
4076 | an exception. Otherwise, the comparison is performed according to the
4077 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4078 *----------------------------------------------------------------------------*/
4080 int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4082 flag aSign, bSign;
4084 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4085 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4086 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4087 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4089 if ( floatx80_is_signaling_nan( a )
4090 || floatx80_is_signaling_nan( b ) ) {
4091 float_raise( float_flag_invalid STATUS_VAR);
4093 return 0;
4095 aSign = extractFloatx80Sign( a );
4096 bSign = extractFloatx80Sign( b );
4097 if ( aSign != bSign ) {
4098 return
4099 aSign
4100 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4101 != 0 );
4103 return
4104 aSign ? lt128( b.high, b.low, a.high, a.low )
4105 : lt128( a.high, a.low, b.high, b.low );
4109 #endif
4111 #ifdef FLOAT128
4113 /*----------------------------------------------------------------------------
4114 | Returns the result of converting the quadruple-precision floating-point
4115 | value `a' to the 32-bit two's complement integer format. The conversion
4116 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4117 | Arithmetic---which means in particular that the conversion is rounded
4118 | according to the current rounding mode. If `a' is a NaN, the largest
4119 | positive integer is returned. Otherwise, if the conversion overflows, the
4120 | largest integer with the same sign as `a' is returned.
4121 *----------------------------------------------------------------------------*/
4123 int32 float128_to_int32( float128 a STATUS_PARAM )
4125 flag aSign;
4126 int32 aExp, shiftCount;
4127 bits64 aSig0, aSig1;
4129 aSig1 = extractFloat128Frac1( a );
4130 aSig0 = extractFloat128Frac0( a );
4131 aExp = extractFloat128Exp( a );
4132 aSign = extractFloat128Sign( a );
4133 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4134 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4135 aSig0 |= ( aSig1 != 0 );
4136 shiftCount = 0x4028 - aExp;
4137 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4138 return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
4142 /*----------------------------------------------------------------------------
4143 | Returns the result of converting the quadruple-precision floating-point
4144 | value `a' to the 32-bit two's complement integer format. The conversion
4145 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4146 | Arithmetic, except that the conversion is always rounded toward zero. If
4147 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
4148 | conversion overflows, the largest integer with the same sign as `a' is
4149 | returned.
4150 *----------------------------------------------------------------------------*/
4152 int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
4154 flag aSign;
4155 int32 aExp, shiftCount;
4156 bits64 aSig0, aSig1, savedASig;
4157 int32 z;
4159 aSig1 = extractFloat128Frac1( a );
4160 aSig0 = extractFloat128Frac0( a );
4161 aExp = extractFloat128Exp( a );
4162 aSign = extractFloat128Sign( a );
4163 aSig0 |= ( aSig1 != 0 );
4164 if ( 0x401E < aExp ) {
4165 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4166 goto invalid;
4168 else if ( aExp < 0x3FFF ) {
4169 if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
4170 return 0;
4172 aSig0 |= LIT64( 0x0001000000000000 );
4173 shiftCount = 0x402F - aExp;
4174 savedASig = aSig0;
4175 aSig0 >>= shiftCount;
4176 z = aSig0;
4177 if ( aSign ) z = - z;
4178 if ( ( z < 0 ) ^ aSign ) {
4179 invalid:
4180 float_raise( float_flag_invalid STATUS_VAR);
4181 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4183 if ( ( aSig0<<shiftCount ) != savedASig ) {
4184 STATUS(float_exception_flags) |= float_flag_inexact;
4186 return z;
4190 /*----------------------------------------------------------------------------
4191 | Returns the result of converting the quadruple-precision floating-point
4192 | value `a' to the 64-bit two's complement integer format. The conversion
4193 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4194 | Arithmetic---which means in particular that the conversion is rounded
4195 | according to the current rounding mode. If `a' is a NaN, the largest
4196 | positive integer is returned. Otherwise, if the conversion overflows, the
4197 | largest integer with the same sign as `a' is returned.
4198 *----------------------------------------------------------------------------*/
4200 int64 float128_to_int64( float128 a STATUS_PARAM )
4202 flag aSign;
4203 int32 aExp, shiftCount;
4204 bits64 aSig0, aSig1;
4206 aSig1 = extractFloat128Frac1( a );
4207 aSig0 = extractFloat128Frac0( a );
4208 aExp = extractFloat128Exp( a );
4209 aSign = extractFloat128Sign( a );
4210 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4211 shiftCount = 0x402F - aExp;
4212 if ( shiftCount <= 0 ) {
4213 if ( 0x403E < aExp ) {
4214 float_raise( float_flag_invalid STATUS_VAR);
4215 if ( ! aSign
4216 || ( ( aExp == 0x7FFF )
4217 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4220 return LIT64( 0x7FFFFFFFFFFFFFFF );
4222 return (sbits64) LIT64( 0x8000000000000000 );
4224 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4226 else {
4227 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4229 return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
4233 /*----------------------------------------------------------------------------
4234 | Returns the result of converting the quadruple-precision floating-point
4235 | value `a' to the 64-bit two's complement integer format. The conversion
4236 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4237 | Arithmetic, except that the conversion is always rounded toward zero.
4238 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
4239 | the conversion overflows, the largest integer with the same sign as `a' is
4240 | returned.
4241 *----------------------------------------------------------------------------*/
4243 int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
4245 flag aSign;
4246 int32 aExp, shiftCount;
4247 bits64 aSig0, aSig1;
4248 int64 z;
4250 aSig1 = extractFloat128Frac1( a );
4251 aSig0 = extractFloat128Frac0( a );
4252 aExp = extractFloat128Exp( a );
4253 aSign = extractFloat128Sign( a );
4254 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4255 shiftCount = aExp - 0x402F;
4256 if ( 0 < shiftCount ) {
4257 if ( 0x403E <= aExp ) {
4258 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4259 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
4260 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4261 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
4263 else {
4264 float_raise( float_flag_invalid STATUS_VAR);
4265 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4266 return LIT64( 0x7FFFFFFFFFFFFFFF );
4269 return (sbits64) LIT64( 0x8000000000000000 );
4271 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4272 if ( (bits64) ( aSig1<<shiftCount ) ) {
4273 STATUS(float_exception_flags) |= float_flag_inexact;
4276 else {
4277 if ( aExp < 0x3FFF ) {
4278 if ( aExp | aSig0 | aSig1 ) {
4279 STATUS(float_exception_flags) |= float_flag_inexact;
4281 return 0;
4283 z = aSig0>>( - shiftCount );
4284 if ( aSig1
4285 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4286 STATUS(float_exception_flags) |= float_flag_inexact;
4289 if ( aSign ) z = - z;
4290 return z;
4294 /*----------------------------------------------------------------------------
4295 | Returns the result of converting the quadruple-precision floating-point
4296 | value `a' to the single-precision floating-point format. The conversion
4297 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4298 | Arithmetic.
4299 *----------------------------------------------------------------------------*/
4301 float32 float128_to_float32( float128 a STATUS_PARAM )
4303 flag aSign;
4304 int32 aExp;
4305 bits64 aSig0, aSig1;
4306 bits32 zSig;
4308 aSig1 = extractFloat128Frac1( a );
4309 aSig0 = extractFloat128Frac0( a );
4310 aExp = extractFloat128Exp( a );
4311 aSign = extractFloat128Sign( a );
4312 if ( aExp == 0x7FFF ) {
4313 if ( aSig0 | aSig1 ) {
4314 return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) );
4316 return packFloat32( aSign, 0xFF, 0 );
4318 aSig0 |= ( aSig1 != 0 );
4319 shift64RightJamming( aSig0, 18, &aSig0 );
4320 zSig = aSig0;
4321 if ( aExp || zSig ) {
4322 zSig |= 0x40000000;
4323 aExp -= 0x3F81;
4325 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
4329 /*----------------------------------------------------------------------------
4330 | Returns the result of converting the quadruple-precision floating-point
4331 | value `a' to the double-precision floating-point format. The conversion
4332 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4333 | Arithmetic.
4334 *----------------------------------------------------------------------------*/
4336 float64 float128_to_float64( float128 a STATUS_PARAM )
4338 flag aSign;
4339 int32 aExp;
4340 bits64 aSig0, aSig1;
4342 aSig1 = extractFloat128Frac1( a );
4343 aSig0 = extractFloat128Frac0( a );
4344 aExp = extractFloat128Exp( a );
4345 aSign = extractFloat128Sign( a );
4346 if ( aExp == 0x7FFF ) {
4347 if ( aSig0 | aSig1 ) {
4348 return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) );
4350 return packFloat64( aSign, 0x7FF, 0 );
4352 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4353 aSig0 |= ( aSig1 != 0 );
4354 if ( aExp || aSig0 ) {
4355 aSig0 |= LIT64( 0x4000000000000000 );
4356 aExp -= 0x3C01;
4358 return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
4362 #ifdef FLOATX80
4364 /*----------------------------------------------------------------------------
4365 | Returns the result of converting the quadruple-precision floating-point
4366 | value `a' to the extended double-precision floating-point format. The
4367 | conversion is performed according to the IEC/IEEE Standard for Binary
4368 | Floating-Point Arithmetic.
4369 *----------------------------------------------------------------------------*/
4371 floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
4373 flag aSign;
4374 int32 aExp;
4375 bits64 aSig0, aSig1;
4377 aSig1 = extractFloat128Frac1( a );
4378 aSig0 = extractFloat128Frac0( a );
4379 aExp = extractFloat128Exp( a );
4380 aSign = extractFloat128Sign( a );
4381 if ( aExp == 0x7FFF ) {
4382 if ( aSig0 | aSig1 ) {
4383 return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) );
4385 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4387 if ( aExp == 0 ) {
4388 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4389 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4391 else {
4392 aSig0 |= LIT64( 0x0001000000000000 );
4394 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4395 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
4399 #endif
4401 /*----------------------------------------------------------------------------
4402 | Rounds the quadruple-precision floating-point value `a' to an integer, and
4403 | returns the result as a quadruple-precision floating-point value. The
4404 | operation is performed according to the IEC/IEEE Standard for Binary
4405 | Floating-Point Arithmetic.
4406 *----------------------------------------------------------------------------*/
4408 float128 float128_round_to_int( float128 a STATUS_PARAM )
4410 flag aSign;
4411 int32 aExp;
4412 bits64 lastBitMask, roundBitsMask;
4413 int8 roundingMode;
4414 float128 z;
4416 aExp = extractFloat128Exp( a );
4417 if ( 0x402F <= aExp ) {
4418 if ( 0x406F <= aExp ) {
4419 if ( ( aExp == 0x7FFF )
4420 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4422 return propagateFloat128NaN( a, a STATUS_VAR );
4424 return a;
4426 lastBitMask = 1;
4427 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4428 roundBitsMask = lastBitMask - 1;
4429 z = a;
4430 roundingMode = STATUS(float_rounding_mode);
4431 if ( roundingMode == float_round_nearest_even ) {
4432 if ( lastBitMask ) {
4433 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4434 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4436 else {
4437 if ( (sbits64) z.low < 0 ) {
4438 ++z.high;
4439 if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4443 else if ( roundingMode != float_round_to_zero ) {
4444 if ( extractFloat128Sign( z )
4445 ^ ( roundingMode == float_round_up ) ) {
4446 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4449 z.low &= ~ roundBitsMask;
4451 else {
4452 if ( aExp < 0x3FFF ) {
4453 if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4454 STATUS(float_exception_flags) |= float_flag_inexact;
4455 aSign = extractFloat128Sign( a );
4456 switch ( STATUS(float_rounding_mode) ) {
4457 case float_round_nearest_even:
4458 if ( ( aExp == 0x3FFE )
4459 && ( extractFloat128Frac0( a )
4460 | extractFloat128Frac1( a ) )
4462 return packFloat128( aSign, 0x3FFF, 0, 0 );
4464 break;
4465 case float_round_down:
4466 return
4467 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4468 : packFloat128( 0, 0, 0, 0 );
4469 case float_round_up:
4470 return
4471 aSign ? packFloat128( 1, 0, 0, 0 )
4472 : packFloat128( 0, 0x3FFF, 0, 0 );
4474 return packFloat128( aSign, 0, 0, 0 );
4476 lastBitMask = 1;
4477 lastBitMask <<= 0x402F - aExp;
4478 roundBitsMask = lastBitMask - 1;
4479 z.low = 0;
4480 z.high = a.high;
4481 roundingMode = STATUS(float_rounding_mode);
4482 if ( roundingMode == float_round_nearest_even ) {
4483 z.high += lastBitMask>>1;
4484 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4485 z.high &= ~ lastBitMask;
4488 else if ( roundingMode != float_round_to_zero ) {
4489 if ( extractFloat128Sign( z )
4490 ^ ( roundingMode == float_round_up ) ) {
4491 z.high |= ( a.low != 0 );
4492 z.high += roundBitsMask;
4495 z.high &= ~ roundBitsMask;
4497 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4498 STATUS(float_exception_flags) |= float_flag_inexact;
4500 return z;
4504 /*----------------------------------------------------------------------------
4505 | Returns the result of adding the absolute values of the quadruple-precision
4506 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
4507 | before being returned. `zSign' is ignored if the result is a NaN.
4508 | The addition is performed according to the IEC/IEEE Standard for Binary
4509 | Floating-Point Arithmetic.
4510 *----------------------------------------------------------------------------*/
4512 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
4514 int32 aExp, bExp, zExp;
4515 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4516 int32 expDiff;
4518 aSig1 = extractFloat128Frac1( a );
4519 aSig0 = extractFloat128Frac0( a );
4520 aExp = extractFloat128Exp( a );
4521 bSig1 = extractFloat128Frac1( b );
4522 bSig0 = extractFloat128Frac0( b );
4523 bExp = extractFloat128Exp( b );
4524 expDiff = aExp - bExp;
4525 if ( 0 < expDiff ) {
4526 if ( aExp == 0x7FFF ) {
4527 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4528 return a;
4530 if ( bExp == 0 ) {
4531 --expDiff;
4533 else {
4534 bSig0 |= LIT64( 0x0001000000000000 );
4536 shift128ExtraRightJamming(
4537 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4538 zExp = aExp;
4540 else if ( expDiff < 0 ) {
4541 if ( bExp == 0x7FFF ) {
4542 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4543 return packFloat128( zSign, 0x7FFF, 0, 0 );
4545 if ( aExp == 0 ) {
4546 ++expDiff;
4548 else {
4549 aSig0 |= LIT64( 0x0001000000000000 );
4551 shift128ExtraRightJamming(
4552 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4553 zExp = bExp;
4555 else {
4556 if ( aExp == 0x7FFF ) {
4557 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4558 return propagateFloat128NaN( a, b STATUS_VAR );
4560 return a;
4562 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4563 if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
4564 zSig2 = 0;
4565 zSig0 |= LIT64( 0x0002000000000000 );
4566 zExp = aExp;
4567 goto shiftRight1;
4569 aSig0 |= LIT64( 0x0001000000000000 );
4570 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4571 --zExp;
4572 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4573 ++zExp;
4574 shiftRight1:
4575 shift128ExtraRightJamming(
4576 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4577 roundAndPack:
4578 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4582 /*----------------------------------------------------------------------------
4583 | Returns the result of subtracting the absolute values of the quadruple-
4584 | precision floating-point values `a' and `b'. If `zSign' is 1, the
4585 | difference is negated before being returned. `zSign' is ignored if the
4586 | result is a NaN. The subtraction is performed according to the IEC/IEEE
4587 | Standard for Binary Floating-Point Arithmetic.
4588 *----------------------------------------------------------------------------*/
4590 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
4592 int32 aExp, bExp, zExp;
4593 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4594 int32 expDiff;
4595 float128 z;
4597 aSig1 = extractFloat128Frac1( a );
4598 aSig0 = extractFloat128Frac0( a );
4599 aExp = extractFloat128Exp( a );
4600 bSig1 = extractFloat128Frac1( b );
4601 bSig0 = extractFloat128Frac0( b );
4602 bExp = extractFloat128Exp( b );
4603 expDiff = aExp - bExp;
4604 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4605 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4606 if ( 0 < expDiff ) goto aExpBigger;
4607 if ( expDiff < 0 ) goto bExpBigger;
4608 if ( aExp == 0x7FFF ) {
4609 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4610 return propagateFloat128NaN( a, b STATUS_VAR );
4612 float_raise( float_flag_invalid STATUS_VAR);
4613 z.low = float128_default_nan_low;
4614 z.high = float128_default_nan_high;
4615 return z;
4617 if ( aExp == 0 ) {
4618 aExp = 1;
4619 bExp = 1;
4621 if ( bSig0 < aSig0 ) goto aBigger;
4622 if ( aSig0 < bSig0 ) goto bBigger;
4623 if ( bSig1 < aSig1 ) goto aBigger;
4624 if ( aSig1 < bSig1 ) goto bBigger;
4625 return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
4626 bExpBigger:
4627 if ( bExp == 0x7FFF ) {
4628 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4629 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4631 if ( aExp == 0 ) {
4632 ++expDiff;
4634 else {
4635 aSig0 |= LIT64( 0x4000000000000000 );
4637 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4638 bSig0 |= LIT64( 0x4000000000000000 );
4639 bBigger:
4640 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4641 zExp = bExp;
4642 zSign ^= 1;
4643 goto normalizeRoundAndPack;
4644 aExpBigger:
4645 if ( aExp == 0x7FFF ) {
4646 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4647 return a;
4649 if ( bExp == 0 ) {
4650 --expDiff;
4652 else {
4653 bSig0 |= LIT64( 0x4000000000000000 );
4655 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4656 aSig0 |= LIT64( 0x4000000000000000 );
4657 aBigger:
4658 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4659 zExp = aExp;
4660 normalizeRoundAndPack:
4661 --zExp;
4662 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
4666 /*----------------------------------------------------------------------------
4667 | Returns the result of adding the quadruple-precision floating-point values
4668 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
4669 | for Binary Floating-Point Arithmetic.
4670 *----------------------------------------------------------------------------*/
4672 float128 float128_add( float128 a, float128 b STATUS_PARAM )
4674 flag aSign, bSign;
4676 aSign = extractFloat128Sign( a );
4677 bSign = extractFloat128Sign( b );
4678 if ( aSign == bSign ) {
4679 return addFloat128Sigs( a, b, aSign STATUS_VAR );
4681 else {
4682 return subFloat128Sigs( a, b, aSign STATUS_VAR );
4687 /*----------------------------------------------------------------------------
4688 | Returns the result of subtracting the quadruple-precision floating-point
4689 | values `a' and `b'. The operation is performed according to the IEC/IEEE
4690 | Standard for Binary Floating-Point Arithmetic.
4691 *----------------------------------------------------------------------------*/
4693 float128 float128_sub( float128 a, float128 b STATUS_PARAM )
4695 flag aSign, bSign;
4697 aSign = extractFloat128Sign( a );
4698 bSign = extractFloat128Sign( b );
4699 if ( aSign == bSign ) {
4700 return subFloat128Sigs( a, b, aSign STATUS_VAR );
4702 else {
4703 return addFloat128Sigs( a, b, aSign STATUS_VAR );
4708 /*----------------------------------------------------------------------------
4709 | Returns the result of multiplying the quadruple-precision floating-point
4710 | values `a' and `b'. The operation is performed according to the IEC/IEEE
4711 | Standard for Binary Floating-Point Arithmetic.
4712 *----------------------------------------------------------------------------*/
4714 float128 float128_mul( float128 a, float128 b STATUS_PARAM )
4716 flag aSign, bSign, zSign;
4717 int32 aExp, bExp, zExp;
4718 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4719 float128 z;
4721 aSig1 = extractFloat128Frac1( a );
4722 aSig0 = extractFloat128Frac0( a );
4723 aExp = extractFloat128Exp( a );
4724 aSign = extractFloat128Sign( a );
4725 bSig1 = extractFloat128Frac1( b );
4726 bSig0 = extractFloat128Frac0( b );
4727 bExp = extractFloat128Exp( b );
4728 bSign = extractFloat128Sign( b );
4729 zSign = aSign ^ bSign;
4730 if ( aExp == 0x7FFF ) {
4731 if ( ( aSig0 | aSig1 )
4732 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4733 return propagateFloat128NaN( a, b STATUS_VAR );
4735 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4736 return packFloat128( zSign, 0x7FFF, 0, 0 );
4738 if ( bExp == 0x7FFF ) {
4739 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4740 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4741 invalid:
4742 float_raise( float_flag_invalid STATUS_VAR);
4743 z.low = float128_default_nan_low;
4744 z.high = float128_default_nan_high;
4745 return z;
4747 return packFloat128( zSign, 0x7FFF, 0, 0 );
4749 if ( aExp == 0 ) {
4750 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4751 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4753 if ( bExp == 0 ) {
4754 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4755 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4757 zExp = aExp + bExp - 0x4000;
4758 aSig0 |= LIT64( 0x0001000000000000 );
4759 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
4760 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
4761 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
4762 zSig2 |= ( zSig3 != 0 );
4763 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
4764 shift128ExtraRightJamming(
4765 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4766 ++zExp;
4768 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4772 /*----------------------------------------------------------------------------
4773 | Returns the result of dividing the quadruple-precision floating-point value
4774 | `a' by the corresponding value `b'. The operation is performed according to
4775 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4776 *----------------------------------------------------------------------------*/
4778 float128 float128_div( float128 a, float128 b STATUS_PARAM )
4780 flag aSign, bSign, zSign;
4781 int32 aExp, bExp, zExp;
4782 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4783 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4784 float128 z;
4786 aSig1 = extractFloat128Frac1( a );
4787 aSig0 = extractFloat128Frac0( a );
4788 aExp = extractFloat128Exp( a );
4789 aSign = extractFloat128Sign( a );
4790 bSig1 = extractFloat128Frac1( b );
4791 bSig0 = extractFloat128Frac0( b );
4792 bExp = extractFloat128Exp( b );
4793 bSign = extractFloat128Sign( b );
4794 zSign = aSign ^ bSign;
4795 if ( aExp == 0x7FFF ) {
4796 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4797 if ( bExp == 0x7FFF ) {
4798 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4799 goto invalid;
4801 return packFloat128( zSign, 0x7FFF, 0, 0 );
4803 if ( bExp == 0x7FFF ) {
4804 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4805 return packFloat128( zSign, 0, 0, 0 );
4807 if ( bExp == 0 ) {
4808 if ( ( bSig0 | bSig1 ) == 0 ) {
4809 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4810 invalid:
4811 float_raise( float_flag_invalid STATUS_VAR);
4812 z.low = float128_default_nan_low;
4813 z.high = float128_default_nan_high;
4814 return z;
4816 float_raise( float_flag_divbyzero STATUS_VAR);
4817 return packFloat128( zSign, 0x7FFF, 0, 0 );
4819 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4821 if ( aExp == 0 ) {
4822 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4823 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4825 zExp = aExp - bExp + 0x3FFD;
4826 shortShift128Left(
4827 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
4828 shortShift128Left(
4829 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4830 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
4831 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
4832 ++zExp;
4834 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
4835 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
4836 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
4837 while ( (sbits64) rem0 < 0 ) {
4838 --zSig0;
4839 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
4841 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
4842 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
4843 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
4844 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
4845 while ( (sbits64) rem1 < 0 ) {
4846 --zSig1;
4847 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
4849 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4851 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
4852 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4856 /*----------------------------------------------------------------------------
4857 | Returns the remainder of the quadruple-precision floating-point value `a'
4858 | with respect to the corresponding value `b'. The operation is performed
4859 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4860 *----------------------------------------------------------------------------*/
4862 float128 float128_rem( float128 a, float128 b STATUS_PARAM )
4864 flag aSign, bSign, zSign;
4865 int32 aExp, bExp, expDiff;
4866 bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
4867 bits64 allZero, alternateASig0, alternateASig1, sigMean1;
4868 sbits64 sigMean0;
4869 float128 z;
4871 aSig1 = extractFloat128Frac1( a );
4872 aSig0 = extractFloat128Frac0( a );
4873 aExp = extractFloat128Exp( a );
4874 aSign = extractFloat128Sign( a );
4875 bSig1 = extractFloat128Frac1( b );
4876 bSig0 = extractFloat128Frac0( b );
4877 bExp = extractFloat128Exp( b );
4878 bSign = extractFloat128Sign( b );
4879 if ( aExp == 0x7FFF ) {
4880 if ( ( aSig0 | aSig1 )
4881 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4882 return propagateFloat128NaN( a, b STATUS_VAR );
4884 goto invalid;
4886 if ( bExp == 0x7FFF ) {
4887 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4888 return a;
4890 if ( bExp == 0 ) {
4891 if ( ( bSig0 | bSig1 ) == 0 ) {
4892 invalid:
4893 float_raise( float_flag_invalid STATUS_VAR);
4894 z.low = float128_default_nan_low;
4895 z.high = float128_default_nan_high;
4896 return z;
4898 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4900 if ( aExp == 0 ) {
4901 if ( ( aSig0 | aSig1 ) == 0 ) return a;
4902 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4904 expDiff = aExp - bExp;
4905 if ( expDiff < -1 ) return a;
4906 shortShift128Left(
4907 aSig0 | LIT64( 0x0001000000000000 ),
4908 aSig1,
4909 15 - ( expDiff < 0 ),
4910 &aSig0,
4911 &aSig1
4913 shortShift128Left(
4914 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4915 q = le128( bSig0, bSig1, aSig0, aSig1 );
4916 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
4917 expDiff -= 64;
4918 while ( 0 < expDiff ) {
4919 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
4920 q = ( 4 < q ) ? q - 4 : 0;
4921 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
4922 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
4923 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
4924 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
4925 expDiff -= 61;
4927 if ( -64 < expDiff ) {
4928 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
4929 q = ( 4 < q ) ? q - 4 : 0;
4930 q >>= - expDiff;
4931 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
4932 expDiff += 52;
4933 if ( expDiff < 0 ) {
4934 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4936 else {
4937 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
4939 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
4940 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
4942 else {
4943 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
4944 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
4946 do {
4947 alternateASig0 = aSig0;
4948 alternateASig1 = aSig1;
4949 ++q;
4950 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
4951 } while ( 0 <= (sbits64) aSig0 );
4952 add128(
4953 aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 );
4954 if ( ( sigMean0 < 0 )
4955 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
4956 aSig0 = alternateASig0;
4957 aSig1 = alternateASig1;
4959 zSign = ( (sbits64) aSig0 < 0 );
4960 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
4961 return
4962 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
4966 /*----------------------------------------------------------------------------
4967 | Returns the square root of the quadruple-precision floating-point value `a'.
4968 | The operation is performed according to the IEC/IEEE Standard for Binary
4969 | Floating-Point Arithmetic.
4970 *----------------------------------------------------------------------------*/
4972 float128 float128_sqrt( float128 a STATUS_PARAM )
4974 flag aSign;
4975 int32 aExp, zExp;
4976 bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
4977 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4978 float128 z;
4980 aSig1 = extractFloat128Frac1( a );
4981 aSig0 = extractFloat128Frac0( a );
4982 aExp = extractFloat128Exp( a );
4983 aSign = extractFloat128Sign( a );
4984 if ( aExp == 0x7FFF ) {
4985 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
4986 if ( ! aSign ) return a;
4987 goto invalid;
4989 if ( aSign ) {
4990 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
4991 invalid:
4992 float_raise( float_flag_invalid STATUS_VAR);
4993 z.low = float128_default_nan_low;
4994 z.high = float128_default_nan_high;
4995 return z;
4997 if ( aExp == 0 ) {
4998 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
4999 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5001 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5002 aSig0 |= LIT64( 0x0001000000000000 );
5003 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5004 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5005 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5006 doubleZSig0 = zSig0<<1;
5007 mul64To128( zSig0, zSig0, &term0, &term1 );
5008 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5009 while ( (sbits64) rem0 < 0 ) {
5010 --zSig0;
5011 doubleZSig0 -= 2;
5012 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5014 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5015 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5016 if ( zSig1 == 0 ) zSig1 = 1;
5017 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5018 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5019 mul64To128( zSig1, zSig1, &term2, &term3 );
5020 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5021 while ( (sbits64) rem1 < 0 ) {
5022 --zSig1;
5023 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5024 term3 |= 1;
5025 term2 |= doubleZSig0;
5026 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5028 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5030 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5031 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5035 /*----------------------------------------------------------------------------
5036 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5037 | the corresponding value `b', and 0 otherwise. The comparison is performed
5038 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5039 *----------------------------------------------------------------------------*/
5041 int float128_eq( float128 a, float128 b STATUS_PARAM )
5044 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5045 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5046 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5047 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5049 if ( float128_is_signaling_nan( a )
5050 || float128_is_signaling_nan( b ) ) {
5051 float_raise( float_flag_invalid STATUS_VAR);
5053 return 0;
5055 return
5056 ( a.low == b.low )
5057 && ( ( a.high == b.high )
5058 || ( ( a.low == 0 )
5059 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5064 /*----------------------------------------------------------------------------
5065 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5066 | or equal to the corresponding value `b', and 0 otherwise. The comparison
5067 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5068 | Arithmetic.
5069 *----------------------------------------------------------------------------*/
5071 int float128_le( float128 a, float128 b STATUS_PARAM )
5073 flag aSign, bSign;
5075 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5076 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5077 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5078 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5080 float_raise( float_flag_invalid STATUS_VAR);
5081 return 0;
5083 aSign = extractFloat128Sign( a );
5084 bSign = extractFloat128Sign( b );
5085 if ( aSign != bSign ) {
5086 return
5087 aSign
5088 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5089 == 0 );
5091 return
5092 aSign ? le128( b.high, b.low, a.high, a.low )
5093 : le128( a.high, a.low, b.high, b.low );
5097 /*----------------------------------------------------------------------------
5098 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5099 | the corresponding value `b', and 0 otherwise. The comparison is performed
5100 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5101 *----------------------------------------------------------------------------*/
5103 int float128_lt( float128 a, float128 b STATUS_PARAM )
5105 flag aSign, bSign;
5107 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5108 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5109 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5110 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5112 float_raise( float_flag_invalid STATUS_VAR);
5113 return 0;
5115 aSign = extractFloat128Sign( a );
5116 bSign = extractFloat128Sign( b );
5117 if ( aSign != bSign ) {
5118 return
5119 aSign
5120 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5121 != 0 );
5123 return
5124 aSign ? lt128( b.high, b.low, a.high, a.low )
5125 : lt128( a.high, a.low, b.high, b.low );
5129 /*----------------------------------------------------------------------------
5130 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5131 | the corresponding value `b', and 0 otherwise. The invalid exception is
5132 | raised if either operand is a NaN. Otherwise, the comparison is performed
5133 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5134 *----------------------------------------------------------------------------*/
5136 int float128_eq_signaling( float128 a, float128 b STATUS_PARAM )
5139 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5140 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5141 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5142 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5144 float_raise( float_flag_invalid STATUS_VAR);
5145 return 0;
5147 return
5148 ( a.low == b.low )
5149 && ( ( a.high == b.high )
5150 || ( ( a.low == 0 )
5151 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5156 /*----------------------------------------------------------------------------
5157 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5158 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5159 | cause an exception. Otherwise, the comparison is performed according to the
5160 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5161 *----------------------------------------------------------------------------*/
5163 int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
5165 flag aSign, bSign;
5167 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5168 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5169 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5170 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5172 if ( float128_is_signaling_nan( a )
5173 || float128_is_signaling_nan( b ) ) {
5174 float_raise( float_flag_invalid STATUS_VAR);
5176 return 0;
5178 aSign = extractFloat128Sign( a );
5179 bSign = extractFloat128Sign( b );
5180 if ( aSign != bSign ) {
5181 return
5182 aSign
5183 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5184 == 0 );
5186 return
5187 aSign ? le128( b.high, b.low, a.high, a.low )
5188 : le128( a.high, a.low, b.high, b.low );
5192 /*----------------------------------------------------------------------------
5193 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5194 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5195 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
5196 | Standard for Binary Floating-Point Arithmetic.
5197 *----------------------------------------------------------------------------*/
5199 int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
5201 flag aSign, bSign;
5203 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5204 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5205 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5206 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5208 if ( float128_is_signaling_nan( a )
5209 || float128_is_signaling_nan( b ) ) {
5210 float_raise( float_flag_invalid STATUS_VAR);
5212 return 0;
5214 aSign = extractFloat128Sign( a );
5215 bSign = extractFloat128Sign( b );
5216 if ( aSign != bSign ) {
5217 return
5218 aSign
5219 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5220 != 0 );
5222 return
5223 aSign ? lt128( b.high, b.low, a.high, a.low )
5224 : lt128( a.high, a.low, b.high, b.low );
5228 #endif
5230 /* misc functions */
5231 float32 uint32_to_float32( unsigned int a STATUS_PARAM )
5233 return int64_to_float32(a STATUS_VAR);
5236 float64 uint32_to_float64( unsigned int a STATUS_PARAM )
5238 return int64_to_float64(a STATUS_VAR);
5241 unsigned int float32_to_uint32( float32 a STATUS_PARAM )
5243 int64_t v;
5244 unsigned int res;
5246 v = float32_to_int64(a STATUS_VAR);
5247 if (v < 0) {
5248 res = 0;
5249 float_raise( float_flag_invalid STATUS_VAR);
5250 } else if (v > 0xffffffff) {
5251 res = 0xffffffff;
5252 float_raise( float_flag_invalid STATUS_VAR);
5253 } else {
5254 res = v;
5256 return res;
5259 unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
5261 int64_t v;
5262 unsigned int res;
5264 v = float32_to_int64_round_to_zero(a STATUS_VAR);
5265 if (v < 0) {
5266 res = 0;
5267 float_raise( float_flag_invalid STATUS_VAR);
5268 } else if (v > 0xffffffff) {
5269 res = 0xffffffff;
5270 float_raise( float_flag_invalid STATUS_VAR);
5271 } else {
5272 res = v;
5274 return res;
5277 unsigned int float64_to_uint32( float64 a STATUS_PARAM )
5279 int64_t v;
5280 unsigned int res;
5282 v = float64_to_int64(a STATUS_VAR);
5283 if (v < 0) {
5284 res = 0;
5285 float_raise( float_flag_invalid STATUS_VAR);
5286 } else if (v > 0xffffffff) {
5287 res = 0xffffffff;
5288 float_raise( float_flag_invalid STATUS_VAR);
5289 } else {
5290 res = v;
5292 return res;
5295 unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
5297 int64_t v;
5298 unsigned int res;
5300 v = float64_to_int64_round_to_zero(a STATUS_VAR);
5301 if (v < 0) {
5302 res = 0;
5303 float_raise( float_flag_invalid STATUS_VAR);
5304 } else if (v > 0xffffffff) {
5305 res = 0xffffffff;
5306 float_raise( float_flag_invalid STATUS_VAR);
5307 } else {
5308 res = v;
5310 return res;
5313 uint64_t float64_to_uint64 (float64 a STATUS_PARAM)
5315 int64_t v;
5317 v = int64_to_float64(INT64_MIN STATUS_VAR);
5318 v = float64_to_int64((a + v) STATUS_VAR);
5320 return v - INT64_MIN;
5323 uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
5325 int64_t v;
5327 v = int64_to_float64(INT64_MIN STATUS_VAR);
5328 v = float64_to_int64_round_to_zero((a + v) STATUS_VAR);
5330 return v - INT64_MIN;
5333 #define COMPARE(s, nan_exp) \
5334 INLINE int float ## s ## _compare_internal( float ## s a, float ## s b, \
5335 int is_quiet STATUS_PARAM ) \
5337 flag aSign, bSign; \
5339 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \
5340 extractFloat ## s ## Frac( a ) ) || \
5341 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \
5342 extractFloat ## s ## Frac( b ) )) { \
5343 if (!is_quiet || \
5344 float ## s ## _is_signaling_nan( a ) || \
5345 float ## s ## _is_signaling_nan( b ) ) { \
5346 float_raise( float_flag_invalid STATUS_VAR); \
5348 return float_relation_unordered; \
5350 aSign = extractFloat ## s ## Sign( a ); \
5351 bSign = extractFloat ## s ## Sign( b ); \
5352 if ( aSign != bSign ) { \
5353 if ( (bits ## s) ( ( a | b )<<1 ) == 0 ) { \
5354 /* zero case */ \
5355 return float_relation_equal; \
5356 } else { \
5357 return 1 - (2 * aSign); \
5359 } else { \
5360 if (a == b) { \
5361 return float_relation_equal; \
5362 } else { \
5363 return 1 - 2 * (aSign ^ ( a < b )); \
5368 int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM ) \
5370 return float ## s ## _compare_internal(a, b, 0 STATUS_VAR); \
5373 int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM ) \
5375 return float ## s ## _compare_internal(a, b, 1 STATUS_VAR); \
5378 COMPARE(32, 0xff)
5379 COMPARE(64, 0x7ff)