blockdev: add refcount to DriveInfo
[qemu.git] / fpu / softfloat.c
blob17842f43dac691aed848c365272e6e853738593e
2 /*============================================================================
4 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
5 Package, Release 2b.
7 Written by John R. Hauser. This work was made possible in part by the
8 International Computer Science Institute, located at Suite 600, 1947 Center
9 Street, Berkeley, California 94704. Funding was partially provided by the
10 National Science Foundation under grant MIP-9311980. The original version
11 of this code was written as part of a project to build a fixed-point vector
12 processor in collaboration with the University of California at Berkeley,
13 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
14 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
15 arithmetic/SoftFloat.html'.
17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
18 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
19 RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
20 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
21 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
22 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
23 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
24 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
26 Derivative works are acceptable, even for commercial purposes, so long as
27 (1) the source code for the derivative work includes prominent notice that
28 the work is derivative, and (2) the source code includes prominent notice with
29 these four paragraphs for those parts of this code that are retained.
31 =============================================================================*/
33 #include "softfloat.h"
35 /*----------------------------------------------------------------------------
36 | Primitive arithmetic functions, including multi-word arithmetic, and
37 | division and square root approximations. (Can be specialized to target if
38 | desired.)
39 *----------------------------------------------------------------------------*/
40 #include "softfloat-macros.h"
42 /*----------------------------------------------------------------------------
43 | Functions and definitions to determine: (1) whether tininess for underflow
44 | is detected before or after rounding by default, (2) what (if anything)
45 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
46 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
47 | are propagated from function inputs to output. These details are target-
48 | specific.
49 *----------------------------------------------------------------------------*/
50 #include "softfloat-specialize.h"
52 void set_float_rounding_mode(int val STATUS_PARAM)
54 STATUS(float_rounding_mode) = val;
57 void set_float_exception_flags(int val STATUS_PARAM)
59 STATUS(float_exception_flags) = val;
62 #ifdef FLOATX80
63 void set_floatx80_rounding_precision(int val STATUS_PARAM)
65 STATUS(floatx80_rounding_precision) = val;
67 #endif
69 /*----------------------------------------------------------------------------
70 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
71 | and 7, and returns the properly rounded 32-bit integer corresponding to the
72 | input. If `zSign' is 1, the input is negated before being converted to an
73 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
74 | is simply rounded to an integer, with the inexact exception raised if the
75 | input cannot be represented exactly as an integer. However, if the fixed-
76 | point input is too large, the invalid exception is raised and the largest
77 | positive or negative integer is returned.
78 *----------------------------------------------------------------------------*/
80 static int32 roundAndPackInt32( flag zSign, bits64 absZ STATUS_PARAM)
82 int8 roundingMode;
83 flag roundNearestEven;
84 int8 roundIncrement, roundBits;
85 int32 z;
87 roundingMode = STATUS(float_rounding_mode);
88 roundNearestEven = ( roundingMode == float_round_nearest_even );
89 roundIncrement = 0x40;
90 if ( ! roundNearestEven ) {
91 if ( roundingMode == float_round_to_zero ) {
92 roundIncrement = 0;
94 else {
95 roundIncrement = 0x7F;
96 if ( zSign ) {
97 if ( roundingMode == float_round_up ) roundIncrement = 0;
99 else {
100 if ( roundingMode == float_round_down ) roundIncrement = 0;
104 roundBits = absZ & 0x7F;
105 absZ = ( absZ + roundIncrement )>>7;
106 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
107 z = absZ;
108 if ( zSign ) z = - z;
109 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
110 float_raise( float_flag_invalid STATUS_VAR);
111 return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
113 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
114 return z;
118 /*----------------------------------------------------------------------------
119 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
120 | `absZ1', with binary point between bits 63 and 64 (between the input words),
121 | and returns the properly rounded 64-bit integer corresponding to the input.
122 | If `zSign' is 1, the input is negated before being converted to an integer.
123 | Ordinarily, the fixed-point input is simply rounded to an integer, with
124 | the inexact exception raised if the input cannot be represented exactly as
125 | an integer. However, if the fixed-point input is too large, the invalid
126 | exception is raised and the largest positive or negative integer is
127 | returned.
128 *----------------------------------------------------------------------------*/
130 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 STATUS_PARAM)
132 int8 roundingMode;
133 flag roundNearestEven, increment;
134 int64 z;
136 roundingMode = STATUS(float_rounding_mode);
137 roundNearestEven = ( roundingMode == float_round_nearest_even );
138 increment = ( (sbits64) absZ1 < 0 );
139 if ( ! roundNearestEven ) {
140 if ( roundingMode == float_round_to_zero ) {
141 increment = 0;
143 else {
144 if ( zSign ) {
145 increment = ( roundingMode == float_round_down ) && absZ1;
147 else {
148 increment = ( roundingMode == float_round_up ) && absZ1;
152 if ( increment ) {
153 ++absZ0;
154 if ( absZ0 == 0 ) goto overflow;
155 absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
157 z = absZ0;
158 if ( zSign ) z = - z;
159 if ( z && ( ( z < 0 ) ^ zSign ) ) {
160 overflow:
161 float_raise( float_flag_invalid STATUS_VAR);
162 return
163 zSign ? (sbits64) LIT64( 0x8000000000000000 )
164 : LIT64( 0x7FFFFFFFFFFFFFFF );
166 if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
167 return z;
171 /*----------------------------------------------------------------------------
172 | Returns the fraction bits of the single-precision floating-point value `a'.
173 *----------------------------------------------------------------------------*/
175 INLINE bits32 extractFloat32Frac( float32 a )
178 return float32_val(a) & 0x007FFFFF;
182 /*----------------------------------------------------------------------------
183 | Returns the exponent bits of the single-precision floating-point value `a'.
184 *----------------------------------------------------------------------------*/
186 INLINE int16 extractFloat32Exp( float32 a )
189 return ( float32_val(a)>>23 ) & 0xFF;
193 /*----------------------------------------------------------------------------
194 | Returns the sign bit of the single-precision floating-point value `a'.
195 *----------------------------------------------------------------------------*/
197 INLINE flag extractFloat32Sign( float32 a )
200 return float32_val(a)>>31;
204 /*----------------------------------------------------------------------------
205 | If `a' is denormal and we are in flush-to-zero mode then set the
206 | input-denormal exception and return zero. Otherwise just return the value.
207 *----------------------------------------------------------------------------*/
208 static float32 float32_squash_input_denormal(float32 a STATUS_PARAM)
210 if (STATUS(flush_inputs_to_zero)) {
211 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
212 float_raise(float_flag_input_denormal STATUS_VAR);
213 return make_float32(float32_val(a) & 0x80000000);
216 return a;
219 /*----------------------------------------------------------------------------
220 | Normalizes the subnormal single-precision floating-point value represented
221 | by the denormalized significand `aSig'. The normalized exponent and
222 | significand are stored at the locations pointed to by `zExpPtr' and
223 | `zSigPtr', respectively.
224 *----------------------------------------------------------------------------*/
226 static void
227 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
229 int8 shiftCount;
231 shiftCount = countLeadingZeros32( aSig ) - 8;
232 *zSigPtr = aSig<<shiftCount;
233 *zExpPtr = 1 - shiftCount;
237 /*----------------------------------------------------------------------------
238 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
239 | single-precision floating-point value, returning the result. After being
240 | shifted into the proper positions, the three fields are simply added
241 | together to form the result. This means that any integer portion of `zSig'
242 | will be added into the exponent. Since a properly normalized significand
243 | will have an integer portion equal to 1, the `zExp' input should be 1 less
244 | than the desired result exponent whenever `zSig' is a complete, normalized
245 | significand.
246 *----------------------------------------------------------------------------*/
248 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
251 return make_float32(
252 ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig);
256 /*----------------------------------------------------------------------------
257 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
258 | and significand `zSig', and returns the proper single-precision floating-
259 | point value corresponding to the abstract input. Ordinarily, the abstract
260 | value is simply rounded and packed into the single-precision format, with
261 | the inexact exception raised if the abstract input cannot be represented
262 | exactly. However, if the abstract value is too large, the overflow and
263 | inexact exceptions are raised and an infinity or maximal finite value is
264 | returned. If the abstract value is too small, the input value is rounded to
265 | a subnormal number, and the underflow and inexact exceptions are raised if
266 | the abstract input cannot be represented exactly as a subnormal single-
267 | precision floating-point number.
268 | The input significand `zSig' has its binary point between bits 30
269 | and 29, which is 7 bits to the left of the usual location. This shifted
270 | significand must be normalized or smaller. If `zSig' is not normalized,
271 | `zExp' must be 0; in that case, the result returned is a subnormal number,
272 | and it must not require rounding. In the usual case that `zSig' is
273 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
274 | The handling of underflow and overflow follows the IEC/IEEE Standard for
275 | Binary Floating-Point Arithmetic.
276 *----------------------------------------------------------------------------*/
278 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
280 int8 roundingMode;
281 flag roundNearestEven;
282 int8 roundIncrement, roundBits;
283 flag isTiny;
285 roundingMode = STATUS(float_rounding_mode);
286 roundNearestEven = ( roundingMode == float_round_nearest_even );
287 roundIncrement = 0x40;
288 if ( ! roundNearestEven ) {
289 if ( roundingMode == float_round_to_zero ) {
290 roundIncrement = 0;
292 else {
293 roundIncrement = 0x7F;
294 if ( zSign ) {
295 if ( roundingMode == float_round_up ) roundIncrement = 0;
297 else {
298 if ( roundingMode == float_round_down ) roundIncrement = 0;
302 roundBits = zSig & 0x7F;
303 if ( 0xFD <= (bits16) zExp ) {
304 if ( ( 0xFD < zExp )
305 || ( ( zExp == 0xFD )
306 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
308 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
309 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
311 if ( zExp < 0 ) {
312 if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
313 isTiny =
314 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
315 || ( zExp < -1 )
316 || ( zSig + roundIncrement < 0x80000000 );
317 shift32RightJamming( zSig, - zExp, &zSig );
318 zExp = 0;
319 roundBits = zSig & 0x7F;
320 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
323 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
324 zSig = ( zSig + roundIncrement )>>7;
325 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
326 if ( zSig == 0 ) zExp = 0;
327 return packFloat32( zSign, zExp, zSig );
331 /*----------------------------------------------------------------------------
332 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
333 | and significand `zSig', and returns the proper single-precision floating-
334 | point value corresponding to the abstract input. This routine is just like
335 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
336 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
337 | floating-point exponent.
338 *----------------------------------------------------------------------------*/
340 static float32
341 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
343 int8 shiftCount;
345 shiftCount = countLeadingZeros32( zSig ) - 1;
346 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
350 /*----------------------------------------------------------------------------
351 | Returns the fraction bits of the double-precision floating-point value `a'.
352 *----------------------------------------------------------------------------*/
354 INLINE bits64 extractFloat64Frac( float64 a )
357 return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
361 /*----------------------------------------------------------------------------
362 | Returns the exponent bits of the double-precision floating-point value `a'.
363 *----------------------------------------------------------------------------*/
365 INLINE int16 extractFloat64Exp( float64 a )
368 return ( float64_val(a)>>52 ) & 0x7FF;
372 /*----------------------------------------------------------------------------
373 | Returns the sign bit of the double-precision floating-point value `a'.
374 *----------------------------------------------------------------------------*/
376 INLINE flag extractFloat64Sign( float64 a )
379 return float64_val(a)>>63;
383 /*----------------------------------------------------------------------------
384 | If `a' is denormal and we are in flush-to-zero mode then set the
385 | input-denormal exception and return zero. Otherwise just return the value.
386 *----------------------------------------------------------------------------*/
387 static float64 float64_squash_input_denormal(float64 a STATUS_PARAM)
389 if (STATUS(flush_inputs_to_zero)) {
390 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
391 float_raise(float_flag_input_denormal STATUS_VAR);
392 return make_float64(float64_val(a) & (1ULL << 63));
395 return a;
398 /*----------------------------------------------------------------------------
399 | Normalizes the subnormal double-precision floating-point value represented
400 | by the denormalized significand `aSig'. The normalized exponent and
401 | significand are stored at the locations pointed to by `zExpPtr' and
402 | `zSigPtr', respectively.
403 *----------------------------------------------------------------------------*/
405 static void
406 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
408 int8 shiftCount;
410 shiftCount = countLeadingZeros64( aSig ) - 11;
411 *zSigPtr = aSig<<shiftCount;
412 *zExpPtr = 1 - shiftCount;
416 /*----------------------------------------------------------------------------
417 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
418 | double-precision floating-point value, returning the result. After being
419 | shifted into the proper positions, the three fields are simply added
420 | together to form the result. This means that any integer portion of `zSig'
421 | will be added into the exponent. Since a properly normalized significand
422 | will have an integer portion equal to 1, the `zExp' input should be 1 less
423 | than the desired result exponent whenever `zSig' is a complete, normalized
424 | significand.
425 *----------------------------------------------------------------------------*/
427 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
430 return make_float64(
431 ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig);
435 /*----------------------------------------------------------------------------
436 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
437 | and significand `zSig', and returns the proper double-precision floating-
438 | point value corresponding to the abstract input. Ordinarily, the abstract
439 | value is simply rounded and packed into the double-precision format, with
440 | the inexact exception raised if the abstract input cannot be represented
441 | exactly. However, if the abstract value is too large, the overflow and
442 | inexact exceptions are raised and an infinity or maximal finite value is
443 | returned. If the abstract value is too small, the input value is rounded
444 | to a subnormal number, and the underflow and inexact exceptions are raised
445 | if the abstract input cannot be represented exactly as a subnormal double-
446 | precision floating-point number.
447 | The input significand `zSig' has its binary point between bits 62
448 | and 61, which is 10 bits to the left of the usual location. This shifted
449 | significand must be normalized or smaller. If `zSig' is not normalized,
450 | `zExp' must be 0; in that case, the result returned is a subnormal number,
451 | and it must not require rounding. In the usual case that `zSig' is
452 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
453 | The handling of underflow and overflow follows the IEC/IEEE Standard for
454 | Binary Floating-Point Arithmetic.
455 *----------------------------------------------------------------------------*/
457 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
459 int8 roundingMode;
460 flag roundNearestEven;
461 int16 roundIncrement, roundBits;
462 flag isTiny;
464 roundingMode = STATUS(float_rounding_mode);
465 roundNearestEven = ( roundingMode == float_round_nearest_even );
466 roundIncrement = 0x200;
467 if ( ! roundNearestEven ) {
468 if ( roundingMode == float_round_to_zero ) {
469 roundIncrement = 0;
471 else {
472 roundIncrement = 0x3FF;
473 if ( zSign ) {
474 if ( roundingMode == float_round_up ) roundIncrement = 0;
476 else {
477 if ( roundingMode == float_round_down ) roundIncrement = 0;
481 roundBits = zSig & 0x3FF;
482 if ( 0x7FD <= (bits16) zExp ) {
483 if ( ( 0x7FD < zExp )
484 || ( ( zExp == 0x7FD )
485 && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
487 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
488 return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
490 if ( zExp < 0 ) {
491 if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
492 isTiny =
493 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
494 || ( zExp < -1 )
495 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
496 shift64RightJamming( zSig, - zExp, &zSig );
497 zExp = 0;
498 roundBits = zSig & 0x3FF;
499 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
502 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
503 zSig = ( zSig + roundIncrement )>>10;
504 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
505 if ( zSig == 0 ) zExp = 0;
506 return packFloat64( zSign, zExp, zSig );
510 /*----------------------------------------------------------------------------
511 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
512 | and significand `zSig', and returns the proper double-precision floating-
513 | point value corresponding to the abstract input. This routine is just like
514 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
515 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
516 | floating-point exponent.
517 *----------------------------------------------------------------------------*/
519 static float64
520 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
522 int8 shiftCount;
524 shiftCount = countLeadingZeros64( zSig ) - 1;
525 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
529 #ifdef FLOATX80
531 /*----------------------------------------------------------------------------
532 | Returns the fraction bits of the extended double-precision floating-point
533 | value `a'.
534 *----------------------------------------------------------------------------*/
536 INLINE bits64 extractFloatx80Frac( floatx80 a )
539 return a.low;
543 /*----------------------------------------------------------------------------
544 | Returns the exponent bits of the extended double-precision floating-point
545 | value `a'.
546 *----------------------------------------------------------------------------*/
548 INLINE int32 extractFloatx80Exp( floatx80 a )
551 return a.high & 0x7FFF;
555 /*----------------------------------------------------------------------------
556 | Returns the sign bit of the extended double-precision floating-point value
557 | `a'.
558 *----------------------------------------------------------------------------*/
560 INLINE flag extractFloatx80Sign( floatx80 a )
563 return a.high>>15;
567 /*----------------------------------------------------------------------------
568 | Normalizes the subnormal extended double-precision floating-point value
569 | represented by the denormalized significand `aSig'. The normalized exponent
570 | and significand are stored at the locations pointed to by `zExpPtr' and
571 | `zSigPtr', respectively.
572 *----------------------------------------------------------------------------*/
574 static void
575 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
577 int8 shiftCount;
579 shiftCount = countLeadingZeros64( aSig );
580 *zSigPtr = aSig<<shiftCount;
581 *zExpPtr = 1 - shiftCount;
585 /*----------------------------------------------------------------------------
586 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
587 | extended double-precision floating-point value, returning the result.
588 *----------------------------------------------------------------------------*/
590 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
592 floatx80 z;
594 z.low = zSig;
595 z.high = ( ( (bits16) zSign )<<15 ) + zExp;
596 return z;
600 /*----------------------------------------------------------------------------
601 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
602 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
603 | and returns the proper extended double-precision floating-point value
604 | corresponding to the abstract input. Ordinarily, the abstract value is
605 | rounded and packed into the extended double-precision format, with the
606 | inexact exception raised if the abstract input cannot be represented
607 | exactly. However, if the abstract value is too large, the overflow and
608 | inexact exceptions are raised and an infinity or maximal finite value is
609 | returned. If the abstract value is too small, the input value is rounded to
610 | a subnormal number, and the underflow and inexact exceptions are raised if
611 | the abstract input cannot be represented exactly as a subnormal extended
612 | double-precision floating-point number.
613 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
614 | number of bits as single or double precision, respectively. Otherwise, the
615 | result is rounded to the full precision of the extended double-precision
616 | format.
617 | The input significand must be normalized or smaller. If the input
618 | significand is not normalized, `zExp' must be 0; in that case, the result
619 | returned is a subnormal number, and it must not require rounding. The
620 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
621 | Floating-Point Arithmetic.
622 *----------------------------------------------------------------------------*/
624 static floatx80
625 roundAndPackFloatx80(
626 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
627 STATUS_PARAM)
629 int8 roundingMode;
630 flag roundNearestEven, increment, isTiny;
631 int64 roundIncrement, roundMask, roundBits;
633 roundingMode = STATUS(float_rounding_mode);
634 roundNearestEven = ( roundingMode == float_round_nearest_even );
635 if ( roundingPrecision == 80 ) goto precision80;
636 if ( roundingPrecision == 64 ) {
637 roundIncrement = LIT64( 0x0000000000000400 );
638 roundMask = LIT64( 0x00000000000007FF );
640 else if ( roundingPrecision == 32 ) {
641 roundIncrement = LIT64( 0x0000008000000000 );
642 roundMask = LIT64( 0x000000FFFFFFFFFF );
644 else {
645 goto precision80;
647 zSig0 |= ( zSig1 != 0 );
648 if ( ! roundNearestEven ) {
649 if ( roundingMode == float_round_to_zero ) {
650 roundIncrement = 0;
652 else {
653 roundIncrement = roundMask;
654 if ( zSign ) {
655 if ( roundingMode == float_round_up ) roundIncrement = 0;
657 else {
658 if ( roundingMode == float_round_down ) roundIncrement = 0;
662 roundBits = zSig0 & roundMask;
663 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
664 if ( ( 0x7FFE < zExp )
665 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
667 goto overflow;
669 if ( zExp <= 0 ) {
670 if ( STATUS(flush_to_zero) ) return packFloatx80( zSign, 0, 0 );
671 isTiny =
672 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
673 || ( zExp < 0 )
674 || ( zSig0 <= zSig0 + roundIncrement );
675 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
676 zExp = 0;
677 roundBits = zSig0 & roundMask;
678 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
679 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
680 zSig0 += roundIncrement;
681 if ( (sbits64) zSig0 < 0 ) zExp = 1;
682 roundIncrement = roundMask + 1;
683 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
684 roundMask |= roundIncrement;
686 zSig0 &= ~ roundMask;
687 return packFloatx80( zSign, zExp, zSig0 );
690 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
691 zSig0 += roundIncrement;
692 if ( zSig0 < roundIncrement ) {
693 ++zExp;
694 zSig0 = LIT64( 0x8000000000000000 );
696 roundIncrement = roundMask + 1;
697 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
698 roundMask |= roundIncrement;
700 zSig0 &= ~ roundMask;
701 if ( zSig0 == 0 ) zExp = 0;
702 return packFloatx80( zSign, zExp, zSig0 );
703 precision80:
704 increment = ( (sbits64) zSig1 < 0 );
705 if ( ! roundNearestEven ) {
706 if ( roundingMode == float_round_to_zero ) {
707 increment = 0;
709 else {
710 if ( zSign ) {
711 increment = ( roundingMode == float_round_down ) && zSig1;
713 else {
714 increment = ( roundingMode == float_round_up ) && zSig1;
718 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
719 if ( ( 0x7FFE < zExp )
720 || ( ( zExp == 0x7FFE )
721 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
722 && increment
725 roundMask = 0;
726 overflow:
727 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
728 if ( ( roundingMode == float_round_to_zero )
729 || ( zSign && ( roundingMode == float_round_up ) )
730 || ( ! zSign && ( roundingMode == float_round_down ) )
732 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
734 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
736 if ( zExp <= 0 ) {
737 isTiny =
738 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
739 || ( zExp < 0 )
740 || ! increment
741 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
742 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
743 zExp = 0;
744 if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
745 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
746 if ( roundNearestEven ) {
747 increment = ( (sbits64) zSig1 < 0 );
749 else {
750 if ( zSign ) {
751 increment = ( roundingMode == float_round_down ) && zSig1;
753 else {
754 increment = ( roundingMode == float_round_up ) && zSig1;
757 if ( increment ) {
758 ++zSig0;
759 zSig0 &=
760 ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
761 if ( (sbits64) zSig0 < 0 ) zExp = 1;
763 return packFloatx80( zSign, zExp, zSig0 );
766 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
767 if ( increment ) {
768 ++zSig0;
769 if ( zSig0 == 0 ) {
770 ++zExp;
771 zSig0 = LIT64( 0x8000000000000000 );
773 else {
774 zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
777 else {
778 if ( zSig0 == 0 ) zExp = 0;
780 return packFloatx80( zSign, zExp, zSig0 );
784 /*----------------------------------------------------------------------------
785 | Takes an abstract floating-point value having sign `zSign', exponent
786 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
787 | and returns the proper extended double-precision floating-point value
788 | corresponding to the abstract input. This routine is just like
789 | `roundAndPackFloatx80' except that the input significand does not have to be
790 | normalized.
791 *----------------------------------------------------------------------------*/
793 static floatx80
794 normalizeRoundAndPackFloatx80(
795 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
796 STATUS_PARAM)
798 int8 shiftCount;
800 if ( zSig0 == 0 ) {
801 zSig0 = zSig1;
802 zSig1 = 0;
803 zExp -= 64;
805 shiftCount = countLeadingZeros64( zSig0 );
806 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
807 zExp -= shiftCount;
808 return
809 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
813 #endif
815 #ifdef FLOAT128
817 /*----------------------------------------------------------------------------
818 | Returns the least-significant 64 fraction bits of the quadruple-precision
819 | floating-point value `a'.
820 *----------------------------------------------------------------------------*/
822 INLINE bits64 extractFloat128Frac1( float128 a )
825 return a.low;
829 /*----------------------------------------------------------------------------
830 | Returns the most-significant 48 fraction bits of the quadruple-precision
831 | floating-point value `a'.
832 *----------------------------------------------------------------------------*/
834 INLINE bits64 extractFloat128Frac0( float128 a )
837 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
841 /*----------------------------------------------------------------------------
842 | Returns the exponent bits of the quadruple-precision floating-point value
843 | `a'.
844 *----------------------------------------------------------------------------*/
846 INLINE int32 extractFloat128Exp( float128 a )
849 return ( a.high>>48 ) & 0x7FFF;
853 /*----------------------------------------------------------------------------
854 | Returns the sign bit of the quadruple-precision floating-point value `a'.
855 *----------------------------------------------------------------------------*/
857 INLINE flag extractFloat128Sign( float128 a )
860 return a.high>>63;
864 /*----------------------------------------------------------------------------
865 | Normalizes the subnormal quadruple-precision floating-point value
866 | represented by the denormalized significand formed by the concatenation of
867 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
868 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
869 | significand are stored at the location pointed to by `zSig0Ptr', and the
870 | least significant 64 bits of the normalized significand are stored at the
871 | location pointed to by `zSig1Ptr'.
872 *----------------------------------------------------------------------------*/
874 static void
875 normalizeFloat128Subnormal(
876 bits64 aSig0,
877 bits64 aSig1,
878 int32 *zExpPtr,
879 bits64 *zSig0Ptr,
880 bits64 *zSig1Ptr
883 int8 shiftCount;
885 if ( aSig0 == 0 ) {
886 shiftCount = countLeadingZeros64( aSig1 ) - 15;
887 if ( shiftCount < 0 ) {
888 *zSig0Ptr = aSig1>>( - shiftCount );
889 *zSig1Ptr = aSig1<<( shiftCount & 63 );
891 else {
892 *zSig0Ptr = aSig1<<shiftCount;
893 *zSig1Ptr = 0;
895 *zExpPtr = - shiftCount - 63;
897 else {
898 shiftCount = countLeadingZeros64( aSig0 ) - 15;
899 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
900 *zExpPtr = 1 - shiftCount;
905 /*----------------------------------------------------------------------------
906 | Packs the sign `zSign', the exponent `zExp', and the significand formed
907 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
908 | floating-point value, returning the result. After being shifted into the
909 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
910 | added together to form the most significant 32 bits of the result. This
911 | means that any integer portion of `zSig0' will be added into the exponent.
912 | Since a properly normalized significand will have an integer portion equal
913 | to 1, the `zExp' input should be 1 less than the desired result exponent
914 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
915 | significand.
916 *----------------------------------------------------------------------------*/
918 INLINE float128
919 packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
921 float128 z;
923 z.low = zSig1;
924 z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
925 return z;
929 /*----------------------------------------------------------------------------
930 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
931 | and extended significand formed by the concatenation of `zSig0', `zSig1',
932 | and `zSig2', and returns the proper quadruple-precision floating-point value
933 | corresponding to the abstract input. Ordinarily, the abstract value is
934 | simply rounded and packed into the quadruple-precision format, with the
935 | inexact exception raised if the abstract input cannot be represented
936 | exactly. However, if the abstract value is too large, the overflow and
937 | inexact exceptions are raised and an infinity or maximal finite value is
938 | returned. If the abstract value is too small, the input value is rounded to
939 | a subnormal number, and the underflow and inexact exceptions are raised if
940 | the abstract input cannot be represented exactly as a subnormal quadruple-
941 | precision floating-point number.
942 | The input significand must be normalized or smaller. If the input
943 | significand is not normalized, `zExp' must be 0; in that case, the result
944 | returned is a subnormal number, and it must not require rounding. In the
945 | usual case that the input significand is normalized, `zExp' must be 1 less
946 | than the ``true'' floating-point exponent. The handling of underflow and
947 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
948 *----------------------------------------------------------------------------*/
950 static float128
951 roundAndPackFloat128(
952 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 STATUS_PARAM)
954 int8 roundingMode;
955 flag roundNearestEven, increment, isTiny;
957 roundingMode = STATUS(float_rounding_mode);
958 roundNearestEven = ( roundingMode == float_round_nearest_even );
959 increment = ( (sbits64) zSig2 < 0 );
960 if ( ! roundNearestEven ) {
961 if ( roundingMode == float_round_to_zero ) {
962 increment = 0;
964 else {
965 if ( zSign ) {
966 increment = ( roundingMode == float_round_down ) && zSig2;
968 else {
969 increment = ( roundingMode == float_round_up ) && zSig2;
973 if ( 0x7FFD <= (bits32) zExp ) {
974 if ( ( 0x7FFD < zExp )
975 || ( ( zExp == 0x7FFD )
976 && eq128(
977 LIT64( 0x0001FFFFFFFFFFFF ),
978 LIT64( 0xFFFFFFFFFFFFFFFF ),
979 zSig0,
980 zSig1
982 && increment
985 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
986 if ( ( roundingMode == float_round_to_zero )
987 || ( zSign && ( roundingMode == float_round_up ) )
988 || ( ! zSign && ( roundingMode == float_round_down ) )
990 return
991 packFloat128(
992 zSign,
993 0x7FFE,
994 LIT64( 0x0000FFFFFFFFFFFF ),
995 LIT64( 0xFFFFFFFFFFFFFFFF )
998 return packFloat128( zSign, 0x7FFF, 0, 0 );
1000 if ( zExp < 0 ) {
1001 if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
1002 isTiny =
1003 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
1004 || ( zExp < -1 )
1005 || ! increment
1006 || lt128(
1007 zSig0,
1008 zSig1,
1009 LIT64( 0x0001FFFFFFFFFFFF ),
1010 LIT64( 0xFFFFFFFFFFFFFFFF )
1012 shift128ExtraRightJamming(
1013 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1014 zExp = 0;
1015 if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
1016 if ( roundNearestEven ) {
1017 increment = ( (sbits64) zSig2 < 0 );
1019 else {
1020 if ( zSign ) {
1021 increment = ( roundingMode == float_round_down ) && zSig2;
1023 else {
1024 increment = ( roundingMode == float_round_up ) && zSig2;
1029 if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
1030 if ( increment ) {
1031 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1032 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1034 else {
1035 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1037 return packFloat128( zSign, zExp, zSig0, zSig1 );
1041 /*----------------------------------------------------------------------------
1042 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1043 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1044 | returns the proper quadruple-precision floating-point value corresponding
1045 | to the abstract input. This routine is just like `roundAndPackFloat128'
1046 | except that the input significand has fewer bits and does not have to be
1047 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1048 | point exponent.
1049 *----------------------------------------------------------------------------*/
1051 static float128
1052 normalizeRoundAndPackFloat128(
1053 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 STATUS_PARAM)
1055 int8 shiftCount;
1056 bits64 zSig2;
1058 if ( zSig0 == 0 ) {
1059 zSig0 = zSig1;
1060 zSig1 = 0;
1061 zExp -= 64;
1063 shiftCount = countLeadingZeros64( zSig0 ) - 15;
1064 if ( 0 <= shiftCount ) {
1065 zSig2 = 0;
1066 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1068 else {
1069 shift128ExtraRightJamming(
1070 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1072 zExp -= shiftCount;
1073 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1077 #endif
1079 /*----------------------------------------------------------------------------
1080 | Returns the result of converting the 32-bit two's complement integer `a'
1081 | to the single-precision floating-point format. The conversion is performed
1082 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1083 *----------------------------------------------------------------------------*/
1085 float32 int32_to_float32( int32 a STATUS_PARAM )
1087 flag zSign;
1089 if ( a == 0 ) return float32_zero;
1090 if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1091 zSign = ( a < 0 );
1092 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1096 /*----------------------------------------------------------------------------
1097 | Returns the result of converting the 32-bit two's complement integer `a'
1098 | to the double-precision floating-point format. The conversion is performed
1099 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1100 *----------------------------------------------------------------------------*/
1102 float64 int32_to_float64( int32 a STATUS_PARAM )
1104 flag zSign;
1105 uint32 absA;
1106 int8 shiftCount;
1107 bits64 zSig;
1109 if ( a == 0 ) return float64_zero;
1110 zSign = ( a < 0 );
1111 absA = zSign ? - a : a;
1112 shiftCount = countLeadingZeros32( absA ) + 21;
1113 zSig = absA;
1114 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1118 #ifdef FLOATX80
1120 /*----------------------------------------------------------------------------
1121 | Returns the result of converting the 32-bit two's complement integer `a'
1122 | to the extended double-precision floating-point format. The conversion
1123 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1124 | Arithmetic.
1125 *----------------------------------------------------------------------------*/
1127 floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1129 flag zSign;
1130 uint32 absA;
1131 int8 shiftCount;
1132 bits64 zSig;
1134 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1135 zSign = ( a < 0 );
1136 absA = zSign ? - a : a;
1137 shiftCount = countLeadingZeros32( absA ) + 32;
1138 zSig = absA;
1139 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1143 #endif
1145 #ifdef FLOAT128
1147 /*----------------------------------------------------------------------------
1148 | Returns the result of converting the 32-bit two's complement integer `a' to
1149 | the quadruple-precision floating-point format. The conversion is performed
1150 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1151 *----------------------------------------------------------------------------*/
1153 float128 int32_to_float128( int32 a STATUS_PARAM )
1155 flag zSign;
1156 uint32 absA;
1157 int8 shiftCount;
1158 bits64 zSig0;
1160 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1161 zSign = ( a < 0 );
1162 absA = zSign ? - a : a;
1163 shiftCount = countLeadingZeros32( absA ) + 17;
1164 zSig0 = absA;
1165 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1169 #endif
1171 /*----------------------------------------------------------------------------
1172 | Returns the result of converting the 64-bit two's complement integer `a'
1173 | to the single-precision floating-point format. The conversion is performed
1174 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1175 *----------------------------------------------------------------------------*/
1177 float32 int64_to_float32( int64 a STATUS_PARAM )
1179 flag zSign;
1180 uint64 absA;
1181 int8 shiftCount;
1183 if ( a == 0 ) return float32_zero;
1184 zSign = ( a < 0 );
1185 absA = zSign ? - a : a;
1186 shiftCount = countLeadingZeros64( absA ) - 40;
1187 if ( 0 <= shiftCount ) {
1188 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1190 else {
1191 shiftCount += 7;
1192 if ( shiftCount < 0 ) {
1193 shift64RightJamming( absA, - shiftCount, &absA );
1195 else {
1196 absA <<= shiftCount;
1198 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1203 float32 uint64_to_float32( uint64 a STATUS_PARAM )
1205 int8 shiftCount;
1207 if ( a == 0 ) return float32_zero;
1208 shiftCount = countLeadingZeros64( a ) - 40;
1209 if ( 0 <= shiftCount ) {
1210 return packFloat32( 1 > 0, 0x95 - shiftCount, a<<shiftCount );
1212 else {
1213 shiftCount += 7;
1214 if ( shiftCount < 0 ) {
1215 shift64RightJamming( a, - shiftCount, &a );
1217 else {
1218 a <<= shiftCount;
1220 return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount, a STATUS_VAR );
1224 /*----------------------------------------------------------------------------
1225 | Returns the result of converting the 64-bit two's complement integer `a'
1226 | to the double-precision floating-point format. The conversion is performed
1227 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1228 *----------------------------------------------------------------------------*/
1230 float64 int64_to_float64( int64 a STATUS_PARAM )
1232 flag zSign;
1234 if ( a == 0 ) return float64_zero;
1235 if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1236 return packFloat64( 1, 0x43E, 0 );
1238 zSign = ( a < 0 );
1239 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1243 float64 uint64_to_float64( uint64 a STATUS_PARAM )
1245 if ( a == 0 ) return float64_zero;
1246 return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR );
1250 #ifdef FLOATX80
1252 /*----------------------------------------------------------------------------
1253 | Returns the result of converting the 64-bit two's complement integer `a'
1254 | to the extended double-precision floating-point format. The conversion
1255 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1256 | Arithmetic.
1257 *----------------------------------------------------------------------------*/
1259 floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1261 flag zSign;
1262 uint64 absA;
1263 int8 shiftCount;
1265 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1266 zSign = ( a < 0 );
1267 absA = zSign ? - a : a;
1268 shiftCount = countLeadingZeros64( absA );
1269 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1273 #endif
1275 #ifdef FLOAT128
1277 /*----------------------------------------------------------------------------
1278 | Returns the result of converting the 64-bit two's complement integer `a' to
1279 | the quadruple-precision floating-point format. The conversion is performed
1280 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1281 *----------------------------------------------------------------------------*/
1283 float128 int64_to_float128( int64 a STATUS_PARAM )
1285 flag zSign;
1286 uint64 absA;
1287 int8 shiftCount;
1288 int32 zExp;
1289 bits64 zSig0, zSig1;
1291 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1292 zSign = ( a < 0 );
1293 absA = zSign ? - a : a;
1294 shiftCount = countLeadingZeros64( absA ) + 49;
1295 zExp = 0x406E - shiftCount;
1296 if ( 64 <= shiftCount ) {
1297 zSig1 = 0;
1298 zSig0 = absA;
1299 shiftCount -= 64;
1301 else {
1302 zSig1 = absA;
1303 zSig0 = 0;
1305 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1306 return packFloat128( zSign, zExp, zSig0, zSig1 );
1310 #endif
1312 /*----------------------------------------------------------------------------
1313 | Returns the result of converting the single-precision floating-point value
1314 | `a' to the 32-bit two's complement integer format. The conversion is
1315 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1316 | Arithmetic---which means in particular that the conversion is rounded
1317 | according to the current rounding mode. If `a' is a NaN, the largest
1318 | positive integer is returned. Otherwise, if the conversion overflows, the
1319 | largest integer with the same sign as `a' is returned.
1320 *----------------------------------------------------------------------------*/
1322 int32 float32_to_int32( float32 a STATUS_PARAM )
1324 flag aSign;
1325 int16 aExp, shiftCount;
1326 bits32 aSig;
1327 bits64 aSig64;
1329 a = float32_squash_input_denormal(a STATUS_VAR);
1330 aSig = extractFloat32Frac( a );
1331 aExp = extractFloat32Exp( a );
1332 aSign = extractFloat32Sign( a );
1333 if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1334 if ( aExp ) aSig |= 0x00800000;
1335 shiftCount = 0xAF - aExp;
1336 aSig64 = aSig;
1337 aSig64 <<= 32;
1338 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1339 return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1343 /*----------------------------------------------------------------------------
1344 | Returns the result of converting the single-precision floating-point value
1345 | `a' to the 32-bit two's complement integer format. The conversion is
1346 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1347 | Arithmetic, except that the conversion is always rounded toward zero.
1348 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1349 | the conversion overflows, the largest integer with the same sign as `a' is
1350 | returned.
1351 *----------------------------------------------------------------------------*/
1353 int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1355 flag aSign;
1356 int16 aExp, shiftCount;
1357 bits32 aSig;
1358 int32 z;
1359 a = float32_squash_input_denormal(a STATUS_VAR);
1361 aSig = extractFloat32Frac( a );
1362 aExp = extractFloat32Exp( a );
1363 aSign = extractFloat32Sign( a );
1364 shiftCount = aExp - 0x9E;
1365 if ( 0 <= shiftCount ) {
1366 if ( float32_val(a) != 0xCF000000 ) {
1367 float_raise( float_flag_invalid STATUS_VAR);
1368 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1370 return (sbits32) 0x80000000;
1372 else if ( aExp <= 0x7E ) {
1373 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1374 return 0;
1376 aSig = ( aSig | 0x00800000 )<<8;
1377 z = aSig>>( - shiftCount );
1378 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1379 STATUS(float_exception_flags) |= float_flag_inexact;
1381 if ( aSign ) z = - z;
1382 return z;
1386 /*----------------------------------------------------------------------------
1387 | Returns the result of converting the single-precision floating-point value
1388 | `a' to the 16-bit two's complement integer format. The conversion is
1389 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1390 | Arithmetic, except that the conversion is always rounded toward zero.
1391 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1392 | the conversion overflows, the largest integer with the same sign as `a' is
1393 | returned.
1394 *----------------------------------------------------------------------------*/
1396 int16 float32_to_int16_round_to_zero( float32 a STATUS_PARAM )
1398 flag aSign;
1399 int16 aExp, shiftCount;
1400 bits32 aSig;
1401 int32 z;
1403 aSig = extractFloat32Frac( a );
1404 aExp = extractFloat32Exp( a );
1405 aSign = extractFloat32Sign( a );
1406 shiftCount = aExp - 0x8E;
1407 if ( 0 <= shiftCount ) {
1408 if ( float32_val(a) != 0xC7000000 ) {
1409 float_raise( float_flag_invalid STATUS_VAR);
1410 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1411 return 0x7FFF;
1414 return (sbits32) 0xffff8000;
1416 else if ( aExp <= 0x7E ) {
1417 if ( aExp | aSig ) {
1418 STATUS(float_exception_flags) |= float_flag_inexact;
1420 return 0;
1422 shiftCount -= 0x10;
1423 aSig = ( aSig | 0x00800000 )<<8;
1424 z = aSig>>( - shiftCount );
1425 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1426 STATUS(float_exception_flags) |= float_flag_inexact;
1428 if ( aSign ) {
1429 z = - z;
1431 return z;
1435 /*----------------------------------------------------------------------------
1436 | Returns the result of converting the single-precision floating-point value
1437 | `a' to the 64-bit two's complement integer format. The conversion is
1438 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1439 | Arithmetic---which means in particular that the conversion is rounded
1440 | according to the current rounding mode. If `a' is a NaN, the largest
1441 | positive integer is returned. Otherwise, if the conversion overflows, the
1442 | largest integer with the same sign as `a' is returned.
1443 *----------------------------------------------------------------------------*/
1445 int64 float32_to_int64( float32 a STATUS_PARAM )
1447 flag aSign;
1448 int16 aExp, shiftCount;
1449 bits32 aSig;
1450 bits64 aSig64, aSigExtra;
1451 a = float32_squash_input_denormal(a STATUS_VAR);
1453 aSig = extractFloat32Frac( a );
1454 aExp = extractFloat32Exp( a );
1455 aSign = extractFloat32Sign( a );
1456 shiftCount = 0xBE - aExp;
1457 if ( shiftCount < 0 ) {
1458 float_raise( float_flag_invalid STATUS_VAR);
1459 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1460 return LIT64( 0x7FFFFFFFFFFFFFFF );
1462 return (sbits64) LIT64( 0x8000000000000000 );
1464 if ( aExp ) aSig |= 0x00800000;
1465 aSig64 = aSig;
1466 aSig64 <<= 40;
1467 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1468 return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1472 /*----------------------------------------------------------------------------
1473 | Returns the result of converting the single-precision floating-point value
1474 | `a' to the 64-bit two's complement integer format. The conversion is
1475 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1476 | Arithmetic, except that the conversion is always rounded toward zero. If
1477 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1478 | conversion overflows, the largest integer with the same sign as `a' is
1479 | returned.
1480 *----------------------------------------------------------------------------*/
1482 int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1484 flag aSign;
1485 int16 aExp, shiftCount;
1486 bits32 aSig;
1487 bits64 aSig64;
1488 int64 z;
1489 a = float32_squash_input_denormal(a STATUS_VAR);
1491 aSig = extractFloat32Frac( a );
1492 aExp = extractFloat32Exp( a );
1493 aSign = extractFloat32Sign( a );
1494 shiftCount = aExp - 0xBE;
1495 if ( 0 <= shiftCount ) {
1496 if ( float32_val(a) != 0xDF000000 ) {
1497 float_raise( float_flag_invalid STATUS_VAR);
1498 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1499 return LIT64( 0x7FFFFFFFFFFFFFFF );
1502 return (sbits64) LIT64( 0x8000000000000000 );
1504 else if ( aExp <= 0x7E ) {
1505 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1506 return 0;
1508 aSig64 = aSig | 0x00800000;
1509 aSig64 <<= 40;
1510 z = aSig64>>( - shiftCount );
1511 if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1512 STATUS(float_exception_flags) |= float_flag_inexact;
1514 if ( aSign ) z = - z;
1515 return z;
1519 /*----------------------------------------------------------------------------
1520 | Returns the result of converting the single-precision floating-point value
1521 | `a' to the double-precision floating-point format. The conversion is
1522 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1523 | Arithmetic.
1524 *----------------------------------------------------------------------------*/
1526 float64 float32_to_float64( float32 a STATUS_PARAM )
1528 flag aSign;
1529 int16 aExp;
1530 bits32 aSig;
1531 a = float32_squash_input_denormal(a STATUS_VAR);
1533 aSig = extractFloat32Frac( a );
1534 aExp = extractFloat32Exp( a );
1535 aSign = extractFloat32Sign( a );
1536 if ( aExp == 0xFF ) {
1537 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ));
1538 return packFloat64( aSign, 0x7FF, 0 );
1540 if ( aExp == 0 ) {
1541 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1542 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1543 --aExp;
1545 return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1549 #ifdef FLOATX80
1551 /*----------------------------------------------------------------------------
1552 | Returns the result of converting the single-precision floating-point value
1553 | `a' to the extended double-precision floating-point format. The conversion
1554 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1555 | Arithmetic.
1556 *----------------------------------------------------------------------------*/
1558 floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1560 flag aSign;
1561 int16 aExp;
1562 bits32 aSig;
1564 a = float32_squash_input_denormal(a STATUS_VAR);
1565 aSig = extractFloat32Frac( a );
1566 aExp = extractFloat32Exp( a );
1567 aSign = extractFloat32Sign( a );
1568 if ( aExp == 0xFF ) {
1569 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) );
1570 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1572 if ( aExp == 0 ) {
1573 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1574 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1576 aSig |= 0x00800000;
1577 return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1581 #endif
1583 #ifdef FLOAT128
1585 /*----------------------------------------------------------------------------
1586 | Returns the result of converting the single-precision floating-point value
1587 | `a' to the double-precision floating-point format. The conversion is
1588 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1589 | Arithmetic.
1590 *----------------------------------------------------------------------------*/
1592 float128 float32_to_float128( float32 a STATUS_PARAM )
1594 flag aSign;
1595 int16 aExp;
1596 bits32 aSig;
1598 a = float32_squash_input_denormal(a STATUS_VAR);
1599 aSig = extractFloat32Frac( a );
1600 aExp = extractFloat32Exp( a );
1601 aSign = extractFloat32Sign( a );
1602 if ( aExp == 0xFF ) {
1603 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) );
1604 return packFloat128( aSign, 0x7FFF, 0, 0 );
1606 if ( aExp == 0 ) {
1607 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1608 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1609 --aExp;
1611 return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1615 #endif
1617 /*----------------------------------------------------------------------------
1618 | Rounds the single-precision floating-point value `a' to an integer, and
1619 | returns the result as a single-precision floating-point value. The
1620 | operation is performed according to the IEC/IEEE Standard for Binary
1621 | Floating-Point Arithmetic.
1622 *----------------------------------------------------------------------------*/
1624 float32 float32_round_to_int( float32 a STATUS_PARAM)
1626 flag aSign;
1627 int16 aExp;
1628 bits32 lastBitMask, roundBitsMask;
1629 int8 roundingMode;
1630 bits32 z;
1631 a = float32_squash_input_denormal(a STATUS_VAR);
1633 aExp = extractFloat32Exp( a );
1634 if ( 0x96 <= aExp ) {
1635 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1636 return propagateFloat32NaN( a, a STATUS_VAR );
1638 return a;
1640 if ( aExp <= 0x7E ) {
1641 if ( (bits32) ( float32_val(a)<<1 ) == 0 ) return a;
1642 STATUS(float_exception_flags) |= float_flag_inexact;
1643 aSign = extractFloat32Sign( a );
1644 switch ( STATUS(float_rounding_mode) ) {
1645 case float_round_nearest_even:
1646 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1647 return packFloat32( aSign, 0x7F, 0 );
1649 break;
1650 case float_round_down:
1651 return make_float32(aSign ? 0xBF800000 : 0);
1652 case float_round_up:
1653 return make_float32(aSign ? 0x80000000 : 0x3F800000);
1655 return packFloat32( aSign, 0, 0 );
1657 lastBitMask = 1;
1658 lastBitMask <<= 0x96 - aExp;
1659 roundBitsMask = lastBitMask - 1;
1660 z = float32_val(a);
1661 roundingMode = STATUS(float_rounding_mode);
1662 if ( roundingMode == float_round_nearest_even ) {
1663 z += lastBitMask>>1;
1664 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1666 else if ( roundingMode != float_round_to_zero ) {
1667 if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
1668 z += roundBitsMask;
1671 z &= ~ roundBitsMask;
1672 if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1673 return make_float32(z);
1677 /*----------------------------------------------------------------------------
1678 | Returns the result of adding the absolute values of the single-precision
1679 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1680 | before being returned. `zSign' is ignored if the result is a NaN.
1681 | The addition is performed according to the IEC/IEEE Standard for Binary
1682 | Floating-Point Arithmetic.
1683 *----------------------------------------------------------------------------*/
1685 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1687 int16 aExp, bExp, zExp;
1688 bits32 aSig, bSig, zSig;
1689 int16 expDiff;
1691 aSig = extractFloat32Frac( a );
1692 aExp = extractFloat32Exp( a );
1693 bSig = extractFloat32Frac( b );
1694 bExp = extractFloat32Exp( b );
1695 expDiff = aExp - bExp;
1696 aSig <<= 6;
1697 bSig <<= 6;
1698 if ( 0 < expDiff ) {
1699 if ( aExp == 0xFF ) {
1700 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1701 return a;
1703 if ( bExp == 0 ) {
1704 --expDiff;
1706 else {
1707 bSig |= 0x20000000;
1709 shift32RightJamming( bSig, expDiff, &bSig );
1710 zExp = aExp;
1712 else if ( expDiff < 0 ) {
1713 if ( bExp == 0xFF ) {
1714 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1715 return packFloat32( zSign, 0xFF, 0 );
1717 if ( aExp == 0 ) {
1718 ++expDiff;
1720 else {
1721 aSig |= 0x20000000;
1723 shift32RightJamming( aSig, - expDiff, &aSig );
1724 zExp = bExp;
1726 else {
1727 if ( aExp == 0xFF ) {
1728 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1729 return a;
1731 if ( aExp == 0 ) {
1732 if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
1733 return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1735 zSig = 0x40000000 + aSig + bSig;
1736 zExp = aExp;
1737 goto roundAndPack;
1739 aSig |= 0x20000000;
1740 zSig = ( aSig + bSig )<<1;
1741 --zExp;
1742 if ( (sbits32) zSig < 0 ) {
1743 zSig = aSig + bSig;
1744 ++zExp;
1746 roundAndPack:
1747 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1751 /*----------------------------------------------------------------------------
1752 | Returns the result of subtracting the absolute values of the single-
1753 | precision floating-point values `a' and `b'. If `zSign' is 1, the
1754 | difference is negated before being returned. `zSign' is ignored if the
1755 | result is a NaN. The subtraction is performed according to the IEC/IEEE
1756 | Standard for Binary Floating-Point Arithmetic.
1757 *----------------------------------------------------------------------------*/
1759 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1761 int16 aExp, bExp, zExp;
1762 bits32 aSig, bSig, zSig;
1763 int16 expDiff;
1765 aSig = extractFloat32Frac( a );
1766 aExp = extractFloat32Exp( a );
1767 bSig = extractFloat32Frac( b );
1768 bExp = extractFloat32Exp( b );
1769 expDiff = aExp - bExp;
1770 aSig <<= 7;
1771 bSig <<= 7;
1772 if ( 0 < expDiff ) goto aExpBigger;
1773 if ( expDiff < 0 ) goto bExpBigger;
1774 if ( aExp == 0xFF ) {
1775 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1776 float_raise( float_flag_invalid STATUS_VAR);
1777 return float32_default_nan;
1779 if ( aExp == 0 ) {
1780 aExp = 1;
1781 bExp = 1;
1783 if ( bSig < aSig ) goto aBigger;
1784 if ( aSig < bSig ) goto bBigger;
1785 return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1786 bExpBigger:
1787 if ( bExp == 0xFF ) {
1788 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1789 return packFloat32( zSign ^ 1, 0xFF, 0 );
1791 if ( aExp == 0 ) {
1792 ++expDiff;
1794 else {
1795 aSig |= 0x40000000;
1797 shift32RightJamming( aSig, - expDiff, &aSig );
1798 bSig |= 0x40000000;
1799 bBigger:
1800 zSig = bSig - aSig;
1801 zExp = bExp;
1802 zSign ^= 1;
1803 goto normalizeRoundAndPack;
1804 aExpBigger:
1805 if ( aExp == 0xFF ) {
1806 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1807 return a;
1809 if ( bExp == 0 ) {
1810 --expDiff;
1812 else {
1813 bSig |= 0x40000000;
1815 shift32RightJamming( bSig, expDiff, &bSig );
1816 aSig |= 0x40000000;
1817 aBigger:
1818 zSig = aSig - bSig;
1819 zExp = aExp;
1820 normalizeRoundAndPack:
1821 --zExp;
1822 return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1826 /*----------------------------------------------------------------------------
1827 | Returns the result of adding the single-precision floating-point values `a'
1828 | and `b'. The operation is performed according to the IEC/IEEE Standard for
1829 | Binary Floating-Point Arithmetic.
1830 *----------------------------------------------------------------------------*/
1832 float32 float32_add( float32 a, float32 b STATUS_PARAM )
1834 flag aSign, bSign;
1835 a = float32_squash_input_denormal(a STATUS_VAR);
1836 b = float32_squash_input_denormal(b STATUS_VAR);
1838 aSign = extractFloat32Sign( a );
1839 bSign = extractFloat32Sign( b );
1840 if ( aSign == bSign ) {
1841 return addFloat32Sigs( a, b, aSign STATUS_VAR);
1843 else {
1844 return subFloat32Sigs( a, b, aSign STATUS_VAR );
1849 /*----------------------------------------------------------------------------
1850 | Returns the result of subtracting the single-precision floating-point values
1851 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1852 | for Binary Floating-Point Arithmetic.
1853 *----------------------------------------------------------------------------*/
1855 float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1857 flag aSign, bSign;
1858 a = float32_squash_input_denormal(a STATUS_VAR);
1859 b = float32_squash_input_denormal(b STATUS_VAR);
1861 aSign = extractFloat32Sign( a );
1862 bSign = extractFloat32Sign( b );
1863 if ( aSign == bSign ) {
1864 return subFloat32Sigs( a, b, aSign STATUS_VAR );
1866 else {
1867 return addFloat32Sigs( a, b, aSign STATUS_VAR );
1872 /*----------------------------------------------------------------------------
1873 | Returns the result of multiplying the single-precision floating-point values
1874 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1875 | for Binary Floating-Point Arithmetic.
1876 *----------------------------------------------------------------------------*/
1878 float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1880 flag aSign, bSign, zSign;
1881 int16 aExp, bExp, zExp;
1882 bits32 aSig, bSig;
1883 bits64 zSig64;
1884 bits32 zSig;
1886 a = float32_squash_input_denormal(a STATUS_VAR);
1887 b = float32_squash_input_denormal(b STATUS_VAR);
1889 aSig = extractFloat32Frac( a );
1890 aExp = extractFloat32Exp( a );
1891 aSign = extractFloat32Sign( a );
1892 bSig = extractFloat32Frac( b );
1893 bExp = extractFloat32Exp( b );
1894 bSign = extractFloat32Sign( b );
1895 zSign = aSign ^ bSign;
1896 if ( aExp == 0xFF ) {
1897 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1898 return propagateFloat32NaN( a, b STATUS_VAR );
1900 if ( ( bExp | bSig ) == 0 ) {
1901 float_raise( float_flag_invalid STATUS_VAR);
1902 return float32_default_nan;
1904 return packFloat32( zSign, 0xFF, 0 );
1906 if ( bExp == 0xFF ) {
1907 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1908 if ( ( aExp | aSig ) == 0 ) {
1909 float_raise( float_flag_invalid STATUS_VAR);
1910 return float32_default_nan;
1912 return packFloat32( zSign, 0xFF, 0 );
1914 if ( aExp == 0 ) {
1915 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1916 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1918 if ( bExp == 0 ) {
1919 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1920 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1922 zExp = aExp + bExp - 0x7F;
1923 aSig = ( aSig | 0x00800000 )<<7;
1924 bSig = ( bSig | 0x00800000 )<<8;
1925 shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1926 zSig = zSig64;
1927 if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1928 zSig <<= 1;
1929 --zExp;
1931 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1935 /*----------------------------------------------------------------------------
1936 | Returns the result of dividing the single-precision floating-point value `a'
1937 | by the corresponding value `b'. The operation is performed according to the
1938 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1939 *----------------------------------------------------------------------------*/
1941 float32 float32_div( float32 a, float32 b STATUS_PARAM )
1943 flag aSign, bSign, zSign;
1944 int16 aExp, bExp, zExp;
1945 bits32 aSig, bSig, zSig;
1946 a = float32_squash_input_denormal(a STATUS_VAR);
1947 b = float32_squash_input_denormal(b STATUS_VAR);
1949 aSig = extractFloat32Frac( a );
1950 aExp = extractFloat32Exp( a );
1951 aSign = extractFloat32Sign( a );
1952 bSig = extractFloat32Frac( b );
1953 bExp = extractFloat32Exp( b );
1954 bSign = extractFloat32Sign( b );
1955 zSign = aSign ^ bSign;
1956 if ( aExp == 0xFF ) {
1957 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1958 if ( bExp == 0xFF ) {
1959 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1960 float_raise( float_flag_invalid STATUS_VAR);
1961 return float32_default_nan;
1963 return packFloat32( zSign, 0xFF, 0 );
1965 if ( bExp == 0xFF ) {
1966 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1967 return packFloat32( zSign, 0, 0 );
1969 if ( bExp == 0 ) {
1970 if ( bSig == 0 ) {
1971 if ( ( aExp | aSig ) == 0 ) {
1972 float_raise( float_flag_invalid STATUS_VAR);
1973 return float32_default_nan;
1975 float_raise( float_flag_divbyzero STATUS_VAR);
1976 return packFloat32( zSign, 0xFF, 0 );
1978 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1980 if ( aExp == 0 ) {
1981 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1982 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1984 zExp = aExp - bExp + 0x7D;
1985 aSig = ( aSig | 0x00800000 )<<7;
1986 bSig = ( bSig | 0x00800000 )<<8;
1987 if ( bSig <= ( aSig + aSig ) ) {
1988 aSig >>= 1;
1989 ++zExp;
1991 zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1992 if ( ( zSig & 0x3F ) == 0 ) {
1993 zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
1995 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1999 /*----------------------------------------------------------------------------
2000 | Returns the remainder of the single-precision floating-point value `a'
2001 | with respect to the corresponding value `b'. The operation is performed
2002 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2003 *----------------------------------------------------------------------------*/
2005 float32 float32_rem( float32 a, float32 b STATUS_PARAM )
2007 flag aSign, zSign;
2008 int16 aExp, bExp, expDiff;
2009 bits32 aSig, bSig;
2010 bits32 q;
2011 bits64 aSig64, bSig64, q64;
2012 bits32 alternateASig;
2013 sbits32 sigMean;
2014 a = float32_squash_input_denormal(a STATUS_VAR);
2015 b = float32_squash_input_denormal(b STATUS_VAR);
2017 aSig = extractFloat32Frac( a );
2018 aExp = extractFloat32Exp( a );
2019 aSign = extractFloat32Sign( a );
2020 bSig = extractFloat32Frac( b );
2021 bExp = extractFloat32Exp( b );
2022 if ( aExp == 0xFF ) {
2023 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2024 return propagateFloat32NaN( a, b STATUS_VAR );
2026 float_raise( float_flag_invalid STATUS_VAR);
2027 return float32_default_nan;
2029 if ( bExp == 0xFF ) {
2030 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2031 return a;
2033 if ( bExp == 0 ) {
2034 if ( bSig == 0 ) {
2035 float_raise( float_flag_invalid STATUS_VAR);
2036 return float32_default_nan;
2038 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2040 if ( aExp == 0 ) {
2041 if ( aSig == 0 ) return a;
2042 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2044 expDiff = aExp - bExp;
2045 aSig |= 0x00800000;
2046 bSig |= 0x00800000;
2047 if ( expDiff < 32 ) {
2048 aSig <<= 8;
2049 bSig <<= 8;
2050 if ( expDiff < 0 ) {
2051 if ( expDiff < -1 ) return a;
2052 aSig >>= 1;
2054 q = ( bSig <= aSig );
2055 if ( q ) aSig -= bSig;
2056 if ( 0 < expDiff ) {
2057 q = ( ( (bits64) aSig )<<32 ) / bSig;
2058 q >>= 32 - expDiff;
2059 bSig >>= 2;
2060 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2062 else {
2063 aSig >>= 2;
2064 bSig >>= 2;
2067 else {
2068 if ( bSig <= aSig ) aSig -= bSig;
2069 aSig64 = ( (bits64) aSig )<<40;
2070 bSig64 = ( (bits64) bSig )<<40;
2071 expDiff -= 64;
2072 while ( 0 < expDiff ) {
2073 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2074 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2075 aSig64 = - ( ( bSig * q64 )<<38 );
2076 expDiff -= 62;
2078 expDiff += 64;
2079 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2080 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2081 q = q64>>( 64 - expDiff );
2082 bSig <<= 6;
2083 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2085 do {
2086 alternateASig = aSig;
2087 ++q;
2088 aSig -= bSig;
2089 } while ( 0 <= (sbits32) aSig );
2090 sigMean = aSig + alternateASig;
2091 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2092 aSig = alternateASig;
2094 zSign = ( (sbits32) aSig < 0 );
2095 if ( zSign ) aSig = - aSig;
2096 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2100 /*----------------------------------------------------------------------------
2101 | Returns the square root of the single-precision floating-point value `a'.
2102 | The operation is performed according to the IEC/IEEE Standard for Binary
2103 | Floating-Point Arithmetic.
2104 *----------------------------------------------------------------------------*/
2106 float32 float32_sqrt( float32 a STATUS_PARAM )
2108 flag aSign;
2109 int16 aExp, zExp;
2110 bits32 aSig, zSig;
2111 bits64 rem, term;
2112 a = float32_squash_input_denormal(a STATUS_VAR);
2114 aSig = extractFloat32Frac( a );
2115 aExp = extractFloat32Exp( a );
2116 aSign = extractFloat32Sign( a );
2117 if ( aExp == 0xFF ) {
2118 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2119 if ( ! aSign ) return a;
2120 float_raise( float_flag_invalid STATUS_VAR);
2121 return float32_default_nan;
2123 if ( aSign ) {
2124 if ( ( aExp | aSig ) == 0 ) return a;
2125 float_raise( float_flag_invalid STATUS_VAR);
2126 return float32_default_nan;
2128 if ( aExp == 0 ) {
2129 if ( aSig == 0 ) return float32_zero;
2130 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2132 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2133 aSig = ( aSig | 0x00800000 )<<8;
2134 zSig = estimateSqrt32( aExp, aSig ) + 2;
2135 if ( ( zSig & 0x7F ) <= 5 ) {
2136 if ( zSig < 2 ) {
2137 zSig = 0x7FFFFFFF;
2138 goto roundAndPack;
2140 aSig >>= aExp & 1;
2141 term = ( (bits64) zSig ) * zSig;
2142 rem = ( ( (bits64) aSig )<<32 ) - term;
2143 while ( (sbits64) rem < 0 ) {
2144 --zSig;
2145 rem += ( ( (bits64) zSig )<<1 ) | 1;
2147 zSig |= ( rem != 0 );
2149 shift32RightJamming( zSig, 1, &zSig );
2150 roundAndPack:
2151 return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2155 /*----------------------------------------------------------------------------
2156 | Returns the binary exponential of the single-precision floating-point value
2157 | `a'. The operation is performed according to the IEC/IEEE Standard for
2158 | Binary Floating-Point Arithmetic.
2160 | Uses the following identities:
2162 | 1. -------------------------------------------------------------------------
2163 | x x*ln(2)
2164 | 2 = e
2166 | 2. -------------------------------------------------------------------------
2167 | 2 3 4 5 n
2168 | x x x x x x x
2169 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2170 | 1! 2! 3! 4! 5! n!
2171 *----------------------------------------------------------------------------*/
2173 static const float64 float32_exp2_coefficients[15] =
2175 make_float64( 0x3ff0000000000000ll ), /* 1 */
2176 make_float64( 0x3fe0000000000000ll ), /* 2 */
2177 make_float64( 0x3fc5555555555555ll ), /* 3 */
2178 make_float64( 0x3fa5555555555555ll ), /* 4 */
2179 make_float64( 0x3f81111111111111ll ), /* 5 */
2180 make_float64( 0x3f56c16c16c16c17ll ), /* 6 */
2181 make_float64( 0x3f2a01a01a01a01all ), /* 7 */
2182 make_float64( 0x3efa01a01a01a01all ), /* 8 */
2183 make_float64( 0x3ec71de3a556c734ll ), /* 9 */
2184 make_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2185 make_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2186 make_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2187 make_float64( 0x3de6124613a86d09ll ), /* 13 */
2188 make_float64( 0x3da93974a8c07c9dll ), /* 14 */
2189 make_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2192 float32 float32_exp2( float32 a STATUS_PARAM )
2194 flag aSign;
2195 int16 aExp;
2196 bits32 aSig;
2197 float64 r, x, xn;
2198 int i;
2199 a = float32_squash_input_denormal(a STATUS_VAR);
2201 aSig = extractFloat32Frac( a );
2202 aExp = extractFloat32Exp( a );
2203 aSign = extractFloat32Sign( a );
2205 if ( aExp == 0xFF) {
2206 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2207 return (aSign) ? float32_zero : a;
2209 if (aExp == 0) {
2210 if (aSig == 0) return float32_one;
2213 float_raise( float_flag_inexact STATUS_VAR);
2215 /* ******************************* */
2216 /* using float64 for approximation */
2217 /* ******************************* */
2218 x = float32_to_float64(a STATUS_VAR);
2219 x = float64_mul(x, float64_ln2 STATUS_VAR);
2221 xn = x;
2222 r = float64_one;
2223 for (i = 0 ; i < 15 ; i++) {
2224 float64 f;
2226 f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);
2227 r = float64_add(r, f STATUS_VAR);
2229 xn = float64_mul(xn, x STATUS_VAR);
2232 return float64_to_float32(r, status);
2235 /*----------------------------------------------------------------------------
2236 | Returns the binary log of the single-precision floating-point value `a'.
2237 | The operation is performed according to the IEC/IEEE Standard for Binary
2238 | Floating-Point Arithmetic.
2239 *----------------------------------------------------------------------------*/
2240 float32 float32_log2( float32 a STATUS_PARAM )
2242 flag aSign, zSign;
2243 int16 aExp;
2244 bits32 aSig, zSig, i;
2246 a = float32_squash_input_denormal(a STATUS_VAR);
2247 aSig = extractFloat32Frac( a );
2248 aExp = extractFloat32Exp( a );
2249 aSign = extractFloat32Sign( a );
2251 if ( aExp == 0 ) {
2252 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2253 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2255 if ( aSign ) {
2256 float_raise( float_flag_invalid STATUS_VAR);
2257 return float32_default_nan;
2259 if ( aExp == 0xFF ) {
2260 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2261 return a;
2264 aExp -= 0x7F;
2265 aSig |= 0x00800000;
2266 zSign = aExp < 0;
2267 zSig = aExp << 23;
2269 for (i = 1 << 22; i > 0; i >>= 1) {
2270 aSig = ( (bits64)aSig * aSig ) >> 23;
2271 if ( aSig & 0x01000000 ) {
2272 aSig >>= 1;
2273 zSig |= i;
2277 if ( zSign )
2278 zSig = -zSig;
2280 return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2283 /*----------------------------------------------------------------------------
2284 | Returns 1 if the single-precision floating-point value `a' is equal to
2285 | the corresponding value `b', and 0 otherwise. The comparison is performed
2286 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2287 *----------------------------------------------------------------------------*/
2289 int float32_eq( float32 a, float32 b STATUS_PARAM )
2291 a = float32_squash_input_denormal(a STATUS_VAR);
2292 b = float32_squash_input_denormal(b STATUS_VAR);
2294 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2295 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2297 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2298 float_raise( float_flag_invalid STATUS_VAR);
2300 return 0;
2302 return ( float32_val(a) == float32_val(b) ) ||
2303 ( (bits32) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2307 /*----------------------------------------------------------------------------
2308 | Returns 1 if the single-precision floating-point value `a' is less than
2309 | or equal to the corresponding value `b', and 0 otherwise. The comparison
2310 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2311 | Arithmetic.
2312 *----------------------------------------------------------------------------*/
2314 int float32_le( float32 a, float32 b STATUS_PARAM )
2316 flag aSign, bSign;
2317 bits32 av, bv;
2318 a = float32_squash_input_denormal(a STATUS_VAR);
2319 b = float32_squash_input_denormal(b STATUS_VAR);
2321 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2322 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2324 float_raise( float_flag_invalid STATUS_VAR);
2325 return 0;
2327 aSign = extractFloat32Sign( a );
2328 bSign = extractFloat32Sign( b );
2329 av = float32_val(a);
2330 bv = float32_val(b);
2331 if ( aSign != bSign ) return aSign || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2332 return ( av == bv ) || ( aSign ^ ( av < bv ) );
2336 /*----------------------------------------------------------------------------
2337 | Returns 1 if the single-precision floating-point value `a' is less than
2338 | the corresponding value `b', and 0 otherwise. The comparison is performed
2339 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2340 *----------------------------------------------------------------------------*/
2342 int float32_lt( float32 a, float32 b STATUS_PARAM )
2344 flag aSign, bSign;
2345 bits32 av, bv;
2346 a = float32_squash_input_denormal(a STATUS_VAR);
2347 b = float32_squash_input_denormal(b STATUS_VAR);
2349 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2350 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2352 float_raise( float_flag_invalid STATUS_VAR);
2353 return 0;
2355 aSign = extractFloat32Sign( a );
2356 bSign = extractFloat32Sign( b );
2357 av = float32_val(a);
2358 bv = float32_val(b);
2359 if ( aSign != bSign ) return aSign && ( (bits32) ( ( av | bv )<<1 ) != 0 );
2360 return ( av != bv ) && ( aSign ^ ( av < bv ) );
2364 /*----------------------------------------------------------------------------
2365 | Returns 1 if the single-precision floating-point value `a' is equal to
2366 | the corresponding value `b', and 0 otherwise. The invalid exception is
2367 | raised if either operand is a NaN. Otherwise, the comparison is performed
2368 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2369 *----------------------------------------------------------------------------*/
2371 int float32_eq_signaling( float32 a, float32 b STATUS_PARAM )
2373 bits32 av, bv;
2374 a = float32_squash_input_denormal(a STATUS_VAR);
2375 b = float32_squash_input_denormal(b STATUS_VAR);
2377 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2378 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2380 float_raise( float_flag_invalid STATUS_VAR);
2381 return 0;
2383 av = float32_val(a);
2384 bv = float32_val(b);
2385 return ( av == bv ) || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2389 /*----------------------------------------------------------------------------
2390 | Returns 1 if the single-precision floating-point value `a' is less than or
2391 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2392 | cause an exception. Otherwise, the comparison is performed according to the
2393 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2394 *----------------------------------------------------------------------------*/
2396 int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2398 flag aSign, bSign;
2399 bits32 av, bv;
2400 a = float32_squash_input_denormal(a STATUS_VAR);
2401 b = float32_squash_input_denormal(b STATUS_VAR);
2403 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2404 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2406 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2407 float_raise( float_flag_invalid STATUS_VAR);
2409 return 0;
2411 aSign = extractFloat32Sign( a );
2412 bSign = extractFloat32Sign( b );
2413 av = float32_val(a);
2414 bv = float32_val(b);
2415 if ( aSign != bSign ) return aSign || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2416 return ( av == bv ) || ( aSign ^ ( av < bv ) );
2420 /*----------------------------------------------------------------------------
2421 | Returns 1 if the single-precision floating-point value `a' is less than
2422 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2423 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
2424 | Standard for Binary Floating-Point Arithmetic.
2425 *----------------------------------------------------------------------------*/
2427 int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2429 flag aSign, bSign;
2430 bits32 av, bv;
2431 a = float32_squash_input_denormal(a STATUS_VAR);
2432 b = float32_squash_input_denormal(b STATUS_VAR);
2434 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2435 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2437 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2438 float_raise( float_flag_invalid STATUS_VAR);
2440 return 0;
2442 aSign = extractFloat32Sign( a );
2443 bSign = extractFloat32Sign( b );
2444 av = float32_val(a);
2445 bv = float32_val(b);
2446 if ( aSign != bSign ) return aSign && ( (bits32) ( ( av | bv )<<1 ) != 0 );
2447 return ( av != bv ) && ( aSign ^ ( av < bv ) );
2451 /*----------------------------------------------------------------------------
2452 | Returns the result of converting the double-precision floating-point value
2453 | `a' to the 32-bit two's complement integer format. The conversion is
2454 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2455 | Arithmetic---which means in particular that the conversion is rounded
2456 | according to the current rounding mode. If `a' is a NaN, the largest
2457 | positive integer is returned. Otherwise, if the conversion overflows, the
2458 | largest integer with the same sign as `a' is returned.
2459 *----------------------------------------------------------------------------*/
2461 int32 float64_to_int32( float64 a STATUS_PARAM )
2463 flag aSign;
2464 int16 aExp, shiftCount;
2465 bits64 aSig;
2466 a = float64_squash_input_denormal(a STATUS_VAR);
2468 aSig = extractFloat64Frac( a );
2469 aExp = extractFloat64Exp( a );
2470 aSign = extractFloat64Sign( a );
2471 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2472 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2473 shiftCount = 0x42C - aExp;
2474 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2475 return roundAndPackInt32( aSign, aSig STATUS_VAR );
2479 /*----------------------------------------------------------------------------
2480 | Returns the result of converting the double-precision floating-point value
2481 | `a' to the 32-bit two's complement integer format. The conversion is
2482 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2483 | Arithmetic, except that the conversion is always rounded toward zero.
2484 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2485 | the conversion overflows, the largest integer with the same sign as `a' is
2486 | returned.
2487 *----------------------------------------------------------------------------*/
2489 int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2491 flag aSign;
2492 int16 aExp, shiftCount;
2493 bits64 aSig, savedASig;
2494 int32 z;
2495 a = float64_squash_input_denormal(a STATUS_VAR);
2497 aSig = extractFloat64Frac( a );
2498 aExp = extractFloat64Exp( a );
2499 aSign = extractFloat64Sign( a );
2500 if ( 0x41E < aExp ) {
2501 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2502 goto invalid;
2504 else if ( aExp < 0x3FF ) {
2505 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2506 return 0;
2508 aSig |= LIT64( 0x0010000000000000 );
2509 shiftCount = 0x433 - aExp;
2510 savedASig = aSig;
2511 aSig >>= shiftCount;
2512 z = aSig;
2513 if ( aSign ) z = - z;
2514 if ( ( z < 0 ) ^ aSign ) {
2515 invalid:
2516 float_raise( float_flag_invalid STATUS_VAR);
2517 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2519 if ( ( aSig<<shiftCount ) != savedASig ) {
2520 STATUS(float_exception_flags) |= float_flag_inexact;
2522 return z;
2526 /*----------------------------------------------------------------------------
2527 | Returns the result of converting the double-precision floating-point value
2528 | `a' to the 16-bit two's complement integer format. The conversion is
2529 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2530 | Arithmetic, except that the conversion is always rounded toward zero.
2531 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2532 | the conversion overflows, the largest integer with the same sign as `a' is
2533 | returned.
2534 *----------------------------------------------------------------------------*/
2536 int16 float64_to_int16_round_to_zero( float64 a STATUS_PARAM )
2538 flag aSign;
2539 int16 aExp, shiftCount;
2540 bits64 aSig, savedASig;
2541 int32 z;
2543 aSig = extractFloat64Frac( a );
2544 aExp = extractFloat64Exp( a );
2545 aSign = extractFloat64Sign( a );
2546 if ( 0x40E < aExp ) {
2547 if ( ( aExp == 0x7FF ) && aSig ) {
2548 aSign = 0;
2550 goto invalid;
2552 else if ( aExp < 0x3FF ) {
2553 if ( aExp || aSig ) {
2554 STATUS(float_exception_flags) |= float_flag_inexact;
2556 return 0;
2558 aSig |= LIT64( 0x0010000000000000 );
2559 shiftCount = 0x433 - aExp;
2560 savedASig = aSig;
2561 aSig >>= shiftCount;
2562 z = aSig;
2563 if ( aSign ) {
2564 z = - z;
2566 if ( ( (int16_t)z < 0 ) ^ aSign ) {
2567 invalid:
2568 float_raise( float_flag_invalid STATUS_VAR);
2569 return aSign ? (sbits32) 0xffff8000 : 0x7FFF;
2571 if ( ( aSig<<shiftCount ) != savedASig ) {
2572 STATUS(float_exception_flags) |= float_flag_inexact;
2574 return z;
2577 /*----------------------------------------------------------------------------
2578 | Returns the result of converting the double-precision floating-point value
2579 | `a' to the 64-bit two's complement integer format. The conversion is
2580 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2581 | Arithmetic---which means in particular that the conversion is rounded
2582 | according to the current rounding mode. If `a' is a NaN, the largest
2583 | positive integer is returned. Otherwise, if the conversion overflows, the
2584 | largest integer with the same sign as `a' is returned.
2585 *----------------------------------------------------------------------------*/
2587 int64 float64_to_int64( float64 a STATUS_PARAM )
2589 flag aSign;
2590 int16 aExp, shiftCount;
2591 bits64 aSig, aSigExtra;
2592 a = float64_squash_input_denormal(a STATUS_VAR);
2594 aSig = extractFloat64Frac( a );
2595 aExp = extractFloat64Exp( a );
2596 aSign = extractFloat64Sign( a );
2597 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2598 shiftCount = 0x433 - aExp;
2599 if ( shiftCount <= 0 ) {
2600 if ( 0x43E < aExp ) {
2601 float_raise( float_flag_invalid STATUS_VAR);
2602 if ( ! aSign
2603 || ( ( aExp == 0x7FF )
2604 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2606 return LIT64( 0x7FFFFFFFFFFFFFFF );
2608 return (sbits64) LIT64( 0x8000000000000000 );
2610 aSigExtra = 0;
2611 aSig <<= - shiftCount;
2613 else {
2614 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2616 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2620 /*----------------------------------------------------------------------------
2621 | Returns the result of converting the double-precision floating-point value
2622 | `a' to the 64-bit two's complement integer format. The conversion is
2623 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2624 | Arithmetic, except that the conversion is always rounded toward zero.
2625 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2626 | the conversion overflows, the largest integer with the same sign as `a' is
2627 | returned.
2628 *----------------------------------------------------------------------------*/
2630 int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2632 flag aSign;
2633 int16 aExp, shiftCount;
2634 bits64 aSig;
2635 int64 z;
2636 a = float64_squash_input_denormal(a STATUS_VAR);
2638 aSig = extractFloat64Frac( a );
2639 aExp = extractFloat64Exp( a );
2640 aSign = extractFloat64Sign( a );
2641 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2642 shiftCount = aExp - 0x433;
2643 if ( 0 <= shiftCount ) {
2644 if ( 0x43E <= aExp ) {
2645 if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
2646 float_raise( float_flag_invalid STATUS_VAR);
2647 if ( ! aSign
2648 || ( ( aExp == 0x7FF )
2649 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2651 return LIT64( 0x7FFFFFFFFFFFFFFF );
2654 return (sbits64) LIT64( 0x8000000000000000 );
2656 z = aSig<<shiftCount;
2658 else {
2659 if ( aExp < 0x3FE ) {
2660 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2661 return 0;
2663 z = aSig>>( - shiftCount );
2664 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2665 STATUS(float_exception_flags) |= float_flag_inexact;
2668 if ( aSign ) z = - z;
2669 return z;
2673 /*----------------------------------------------------------------------------
2674 | Returns the result of converting the double-precision floating-point value
2675 | `a' to the single-precision floating-point format. The conversion is
2676 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2677 | Arithmetic.
2678 *----------------------------------------------------------------------------*/
2680 float32 float64_to_float32( float64 a STATUS_PARAM )
2682 flag aSign;
2683 int16 aExp;
2684 bits64 aSig;
2685 bits32 zSig;
2686 a = float64_squash_input_denormal(a STATUS_VAR);
2688 aSig = extractFloat64Frac( a );
2689 aExp = extractFloat64Exp( a );
2690 aSign = extractFloat64Sign( a );
2691 if ( aExp == 0x7FF ) {
2692 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) );
2693 return packFloat32( aSign, 0xFF, 0 );
2695 shift64RightJamming( aSig, 22, &aSig );
2696 zSig = aSig;
2697 if ( aExp || zSig ) {
2698 zSig |= 0x40000000;
2699 aExp -= 0x381;
2701 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2706 /*----------------------------------------------------------------------------
2707 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2708 | half-precision floating-point value, returning the result. After being
2709 | shifted into the proper positions, the three fields are simply added
2710 | together to form the result. This means that any integer portion of `zSig'
2711 | will be added into the exponent. Since a properly normalized significand
2712 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2713 | than the desired result exponent whenever `zSig' is a complete, normalized
2714 | significand.
2715 *----------------------------------------------------------------------------*/
2716 static bits16 packFloat16(flag zSign, int16 zExp, bits16 zSig)
2718 return (((bits32)zSign) << 15) + (((bits32)zExp) << 10) + zSig;
2721 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
2722 The latter gains extra exponent range by omitting the NaN/Inf encodings. */
2724 float32 float16_to_float32( bits16 a, flag ieee STATUS_PARAM )
2726 flag aSign;
2727 int16 aExp;
2728 bits32 aSig;
2730 aSign = a >> 15;
2731 aExp = (a >> 10) & 0x1f;
2732 aSig = a & 0x3ff;
2734 if (aExp == 0x1f && ieee) {
2735 if (aSig) {
2736 /* Make sure correct exceptions are raised. */
2737 float32ToCommonNaN(a STATUS_VAR);
2738 aSig |= 0x200;
2740 return packFloat32(aSign, 0xff, aSig << 13);
2742 if (aExp == 0) {
2743 int8 shiftCount;
2745 if (aSig == 0) {
2746 return packFloat32(aSign, 0, 0);
2749 shiftCount = countLeadingZeros32( aSig ) - 21;
2750 aSig = aSig << shiftCount;
2751 aExp = -shiftCount;
2753 return packFloat32( aSign, aExp + 0x70, aSig << 13);
2756 bits16 float32_to_float16( float32 a, flag ieee STATUS_PARAM)
2758 flag aSign;
2759 int16 aExp;
2760 bits32 aSig;
2761 bits32 mask;
2762 bits32 increment;
2763 int8 roundingMode;
2764 a = float32_squash_input_denormal(a STATUS_VAR);
2766 aSig = extractFloat32Frac( a );
2767 aExp = extractFloat32Exp( a );
2768 aSign = extractFloat32Sign( a );
2769 if ( aExp == 0xFF ) {
2770 if (aSig) {
2771 /* Make sure correct exceptions are raised. */
2772 float32ToCommonNaN(a STATUS_VAR);
2773 aSig |= 0x00400000;
2775 return packFloat16(aSign, 0x1f, aSig >> 13);
2777 if (aExp == 0 && aSign == 0) {
2778 return packFloat16(aSign, 0, 0);
2780 /* Decimal point between bits 22 and 23. */
2781 aSig |= 0x00800000;
2782 aExp -= 0x7f;
2783 if (aExp < -14) {
2784 mask = 0x007fffff;
2785 if (aExp < -24) {
2786 aExp = -25;
2787 } else {
2788 mask >>= 24 + aExp;
2790 } else {
2791 mask = 0x00001fff;
2793 if (aSig & mask) {
2794 float_raise( float_flag_underflow STATUS_VAR );
2795 roundingMode = STATUS(float_rounding_mode);
2796 switch (roundingMode) {
2797 case float_round_nearest_even:
2798 increment = (mask + 1) >> 1;
2799 if ((aSig & mask) == increment) {
2800 increment = aSig & (increment << 1);
2802 break;
2803 case float_round_up:
2804 increment = aSign ? 0 : mask;
2805 break;
2806 case float_round_down:
2807 increment = aSign ? mask : 0;
2808 break;
2809 default: /* round_to_zero */
2810 increment = 0;
2811 break;
2813 aSig += increment;
2814 if (aSig >= 0x01000000) {
2815 aSig >>= 1;
2816 aExp++;
2818 } else if (aExp < -14
2819 && STATUS(float_detect_tininess) == float_tininess_before_rounding) {
2820 float_raise( float_flag_underflow STATUS_VAR);
2823 if (ieee) {
2824 if (aExp > 15) {
2825 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
2826 return packFloat16(aSign, 0x1f, 0);
2828 } else {
2829 if (aExp > 16) {
2830 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
2831 return packFloat16(aSign, 0x1f, 0x3ff);
2834 if (aExp < -24) {
2835 return packFloat16(aSign, 0, 0);
2837 if (aExp < -14) {
2838 aSig >>= -14 - aExp;
2839 aExp = -14;
2841 return packFloat16(aSign, aExp + 14, aSig >> 13);
2844 #ifdef FLOATX80
2846 /*----------------------------------------------------------------------------
2847 | Returns the result of converting the double-precision floating-point value
2848 | `a' to the extended double-precision floating-point format. The conversion
2849 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2850 | Arithmetic.
2851 *----------------------------------------------------------------------------*/
2853 floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
2855 flag aSign;
2856 int16 aExp;
2857 bits64 aSig;
2859 a = float64_squash_input_denormal(a STATUS_VAR);
2860 aSig = extractFloat64Frac( a );
2861 aExp = extractFloat64Exp( a );
2862 aSign = extractFloat64Sign( a );
2863 if ( aExp == 0x7FF ) {
2864 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) );
2865 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2867 if ( aExp == 0 ) {
2868 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2869 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2871 return
2872 packFloatx80(
2873 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2877 #endif
2879 #ifdef FLOAT128
2881 /*----------------------------------------------------------------------------
2882 | Returns the result of converting the double-precision floating-point value
2883 | `a' to the quadruple-precision floating-point format. The conversion is
2884 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2885 | Arithmetic.
2886 *----------------------------------------------------------------------------*/
2888 float128 float64_to_float128( float64 a STATUS_PARAM )
2890 flag aSign;
2891 int16 aExp;
2892 bits64 aSig, zSig0, zSig1;
2894 a = float64_squash_input_denormal(a STATUS_VAR);
2895 aSig = extractFloat64Frac( a );
2896 aExp = extractFloat64Exp( a );
2897 aSign = extractFloat64Sign( a );
2898 if ( aExp == 0x7FF ) {
2899 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) );
2900 return packFloat128( aSign, 0x7FFF, 0, 0 );
2902 if ( aExp == 0 ) {
2903 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2904 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2905 --aExp;
2907 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2908 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2912 #endif
2914 /*----------------------------------------------------------------------------
2915 | Rounds the double-precision floating-point value `a' to an integer, and
2916 | returns the result as a double-precision floating-point value. The
2917 | operation is performed according to the IEC/IEEE Standard for Binary
2918 | Floating-Point Arithmetic.
2919 *----------------------------------------------------------------------------*/
2921 float64 float64_round_to_int( float64 a STATUS_PARAM )
2923 flag aSign;
2924 int16 aExp;
2925 bits64 lastBitMask, roundBitsMask;
2926 int8 roundingMode;
2927 bits64 z;
2928 a = float64_squash_input_denormal(a STATUS_VAR);
2930 aExp = extractFloat64Exp( a );
2931 if ( 0x433 <= aExp ) {
2932 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2933 return propagateFloat64NaN( a, a STATUS_VAR );
2935 return a;
2937 if ( aExp < 0x3FF ) {
2938 if ( (bits64) ( float64_val(a)<<1 ) == 0 ) return a;
2939 STATUS(float_exception_flags) |= float_flag_inexact;
2940 aSign = extractFloat64Sign( a );
2941 switch ( STATUS(float_rounding_mode) ) {
2942 case float_round_nearest_even:
2943 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2944 return packFloat64( aSign, 0x3FF, 0 );
2946 break;
2947 case float_round_down:
2948 return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
2949 case float_round_up:
2950 return make_float64(
2951 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
2953 return packFloat64( aSign, 0, 0 );
2955 lastBitMask = 1;
2956 lastBitMask <<= 0x433 - aExp;
2957 roundBitsMask = lastBitMask - 1;
2958 z = float64_val(a);
2959 roundingMode = STATUS(float_rounding_mode);
2960 if ( roundingMode == float_round_nearest_even ) {
2961 z += lastBitMask>>1;
2962 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2964 else if ( roundingMode != float_round_to_zero ) {
2965 if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
2966 z += roundBitsMask;
2969 z &= ~ roundBitsMask;
2970 if ( z != float64_val(a) )
2971 STATUS(float_exception_flags) |= float_flag_inexact;
2972 return make_float64(z);
2976 float64 float64_trunc_to_int( float64 a STATUS_PARAM)
2978 int oldmode;
2979 float64 res;
2980 oldmode = STATUS(float_rounding_mode);
2981 STATUS(float_rounding_mode) = float_round_to_zero;
2982 res = float64_round_to_int(a STATUS_VAR);
2983 STATUS(float_rounding_mode) = oldmode;
2984 return res;
2987 /*----------------------------------------------------------------------------
2988 | Returns the result of adding the absolute values of the double-precision
2989 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
2990 | before being returned. `zSign' is ignored if the result is a NaN.
2991 | The addition is performed according to the IEC/IEEE Standard for Binary
2992 | Floating-Point Arithmetic.
2993 *----------------------------------------------------------------------------*/
2995 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
2997 int16 aExp, bExp, zExp;
2998 bits64 aSig, bSig, zSig;
2999 int16 expDiff;
3001 aSig = extractFloat64Frac( a );
3002 aExp = extractFloat64Exp( a );
3003 bSig = extractFloat64Frac( b );
3004 bExp = extractFloat64Exp( b );
3005 expDiff = aExp - bExp;
3006 aSig <<= 9;
3007 bSig <<= 9;
3008 if ( 0 < expDiff ) {
3009 if ( aExp == 0x7FF ) {
3010 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3011 return a;
3013 if ( bExp == 0 ) {
3014 --expDiff;
3016 else {
3017 bSig |= LIT64( 0x2000000000000000 );
3019 shift64RightJamming( bSig, expDiff, &bSig );
3020 zExp = aExp;
3022 else if ( expDiff < 0 ) {
3023 if ( bExp == 0x7FF ) {
3024 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3025 return packFloat64( zSign, 0x7FF, 0 );
3027 if ( aExp == 0 ) {
3028 ++expDiff;
3030 else {
3031 aSig |= LIT64( 0x2000000000000000 );
3033 shift64RightJamming( aSig, - expDiff, &aSig );
3034 zExp = bExp;
3036 else {
3037 if ( aExp == 0x7FF ) {
3038 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3039 return a;
3041 if ( aExp == 0 ) {
3042 if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
3043 return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3045 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3046 zExp = aExp;
3047 goto roundAndPack;
3049 aSig |= LIT64( 0x2000000000000000 );
3050 zSig = ( aSig + bSig )<<1;
3051 --zExp;
3052 if ( (sbits64) zSig < 0 ) {
3053 zSig = aSig + bSig;
3054 ++zExp;
3056 roundAndPack:
3057 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3061 /*----------------------------------------------------------------------------
3062 | Returns the result of subtracting the absolute values of the double-
3063 | precision floating-point values `a' and `b'. If `zSign' is 1, the
3064 | difference is negated before being returned. `zSign' is ignored if the
3065 | result is a NaN. The subtraction is performed according to the IEC/IEEE
3066 | Standard for Binary Floating-Point Arithmetic.
3067 *----------------------------------------------------------------------------*/
3069 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3071 int16 aExp, bExp, zExp;
3072 bits64 aSig, bSig, zSig;
3073 int16 expDiff;
3075 aSig = extractFloat64Frac( a );
3076 aExp = extractFloat64Exp( a );
3077 bSig = extractFloat64Frac( b );
3078 bExp = extractFloat64Exp( b );
3079 expDiff = aExp - bExp;
3080 aSig <<= 10;
3081 bSig <<= 10;
3082 if ( 0 < expDiff ) goto aExpBigger;
3083 if ( expDiff < 0 ) goto bExpBigger;
3084 if ( aExp == 0x7FF ) {
3085 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3086 float_raise( float_flag_invalid STATUS_VAR);
3087 return float64_default_nan;
3089 if ( aExp == 0 ) {
3090 aExp = 1;
3091 bExp = 1;
3093 if ( bSig < aSig ) goto aBigger;
3094 if ( aSig < bSig ) goto bBigger;
3095 return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3096 bExpBigger:
3097 if ( bExp == 0x7FF ) {
3098 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3099 return packFloat64( zSign ^ 1, 0x7FF, 0 );
3101 if ( aExp == 0 ) {
3102 ++expDiff;
3104 else {
3105 aSig |= LIT64( 0x4000000000000000 );
3107 shift64RightJamming( aSig, - expDiff, &aSig );
3108 bSig |= LIT64( 0x4000000000000000 );
3109 bBigger:
3110 zSig = bSig - aSig;
3111 zExp = bExp;
3112 zSign ^= 1;
3113 goto normalizeRoundAndPack;
3114 aExpBigger:
3115 if ( aExp == 0x7FF ) {
3116 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3117 return a;
3119 if ( bExp == 0 ) {
3120 --expDiff;
3122 else {
3123 bSig |= LIT64( 0x4000000000000000 );
3125 shift64RightJamming( bSig, expDiff, &bSig );
3126 aSig |= LIT64( 0x4000000000000000 );
3127 aBigger:
3128 zSig = aSig - bSig;
3129 zExp = aExp;
3130 normalizeRoundAndPack:
3131 --zExp;
3132 return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3136 /*----------------------------------------------------------------------------
3137 | Returns the result of adding the double-precision floating-point values `a'
3138 | and `b'. The operation is performed according to the IEC/IEEE Standard for
3139 | Binary Floating-Point Arithmetic.
3140 *----------------------------------------------------------------------------*/
3142 float64 float64_add( float64 a, float64 b STATUS_PARAM )
3144 flag aSign, bSign;
3145 a = float64_squash_input_denormal(a STATUS_VAR);
3146 b = float64_squash_input_denormal(b STATUS_VAR);
3148 aSign = extractFloat64Sign( a );
3149 bSign = extractFloat64Sign( b );
3150 if ( aSign == bSign ) {
3151 return addFloat64Sigs( a, b, aSign STATUS_VAR );
3153 else {
3154 return subFloat64Sigs( a, b, aSign STATUS_VAR );
3159 /*----------------------------------------------------------------------------
3160 | Returns the result of subtracting the double-precision floating-point values
3161 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3162 | for Binary Floating-Point Arithmetic.
3163 *----------------------------------------------------------------------------*/
3165 float64 float64_sub( float64 a, float64 b STATUS_PARAM )
3167 flag aSign, bSign;
3168 a = float64_squash_input_denormal(a STATUS_VAR);
3169 b = float64_squash_input_denormal(b STATUS_VAR);
3171 aSign = extractFloat64Sign( a );
3172 bSign = extractFloat64Sign( b );
3173 if ( aSign == bSign ) {
3174 return subFloat64Sigs( a, b, aSign STATUS_VAR );
3176 else {
3177 return addFloat64Sigs( a, b, aSign STATUS_VAR );
3182 /*----------------------------------------------------------------------------
3183 | Returns the result of multiplying the double-precision floating-point values
3184 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3185 | for Binary Floating-Point Arithmetic.
3186 *----------------------------------------------------------------------------*/
3188 float64 float64_mul( float64 a, float64 b STATUS_PARAM )
3190 flag aSign, bSign, zSign;
3191 int16 aExp, bExp, zExp;
3192 bits64 aSig, bSig, zSig0, zSig1;
3194 a = float64_squash_input_denormal(a STATUS_VAR);
3195 b = float64_squash_input_denormal(b STATUS_VAR);
3197 aSig = extractFloat64Frac( a );
3198 aExp = extractFloat64Exp( a );
3199 aSign = extractFloat64Sign( a );
3200 bSig = extractFloat64Frac( b );
3201 bExp = extractFloat64Exp( b );
3202 bSign = extractFloat64Sign( b );
3203 zSign = aSign ^ bSign;
3204 if ( aExp == 0x7FF ) {
3205 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3206 return propagateFloat64NaN( a, b STATUS_VAR );
3208 if ( ( bExp | bSig ) == 0 ) {
3209 float_raise( float_flag_invalid STATUS_VAR);
3210 return float64_default_nan;
3212 return packFloat64( zSign, 0x7FF, 0 );
3214 if ( bExp == 0x7FF ) {
3215 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3216 if ( ( aExp | aSig ) == 0 ) {
3217 float_raise( float_flag_invalid STATUS_VAR);
3218 return float64_default_nan;
3220 return packFloat64( zSign, 0x7FF, 0 );
3222 if ( aExp == 0 ) {
3223 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3224 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3226 if ( bExp == 0 ) {
3227 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
3228 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3230 zExp = aExp + bExp - 0x3FF;
3231 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3232 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3233 mul64To128( aSig, bSig, &zSig0, &zSig1 );
3234 zSig0 |= ( zSig1 != 0 );
3235 if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
3236 zSig0 <<= 1;
3237 --zExp;
3239 return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
3243 /*----------------------------------------------------------------------------
3244 | Returns the result of dividing the double-precision floating-point value `a'
3245 | by the corresponding value `b'. The operation is performed according to
3246 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3247 *----------------------------------------------------------------------------*/
3249 float64 float64_div( float64 a, float64 b STATUS_PARAM )
3251 flag aSign, bSign, zSign;
3252 int16 aExp, bExp, zExp;
3253 bits64 aSig, bSig, zSig;
3254 bits64 rem0, rem1;
3255 bits64 term0, term1;
3256 a = float64_squash_input_denormal(a STATUS_VAR);
3257 b = float64_squash_input_denormal(b STATUS_VAR);
3259 aSig = extractFloat64Frac( a );
3260 aExp = extractFloat64Exp( a );
3261 aSign = extractFloat64Sign( a );
3262 bSig = extractFloat64Frac( b );
3263 bExp = extractFloat64Exp( b );
3264 bSign = extractFloat64Sign( b );
3265 zSign = aSign ^ bSign;
3266 if ( aExp == 0x7FF ) {
3267 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3268 if ( bExp == 0x7FF ) {
3269 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3270 float_raise( float_flag_invalid STATUS_VAR);
3271 return float64_default_nan;
3273 return packFloat64( zSign, 0x7FF, 0 );
3275 if ( bExp == 0x7FF ) {
3276 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3277 return packFloat64( zSign, 0, 0 );
3279 if ( bExp == 0 ) {
3280 if ( bSig == 0 ) {
3281 if ( ( aExp | aSig ) == 0 ) {
3282 float_raise( float_flag_invalid STATUS_VAR);
3283 return float64_default_nan;
3285 float_raise( float_flag_divbyzero STATUS_VAR);
3286 return packFloat64( zSign, 0x7FF, 0 );
3288 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3290 if ( aExp == 0 ) {
3291 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3292 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3294 zExp = aExp - bExp + 0x3FD;
3295 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3296 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3297 if ( bSig <= ( aSig + aSig ) ) {
3298 aSig >>= 1;
3299 ++zExp;
3301 zSig = estimateDiv128To64( aSig, 0, bSig );
3302 if ( ( zSig & 0x1FF ) <= 2 ) {
3303 mul64To128( bSig, zSig, &term0, &term1 );
3304 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3305 while ( (sbits64) rem0 < 0 ) {
3306 --zSig;
3307 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3309 zSig |= ( rem1 != 0 );
3311 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3315 /*----------------------------------------------------------------------------
3316 | Returns the remainder of the double-precision floating-point value `a'
3317 | with respect to the corresponding value `b'. The operation is performed
3318 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3319 *----------------------------------------------------------------------------*/
3321 float64 float64_rem( float64 a, float64 b STATUS_PARAM )
3323 flag aSign, zSign;
3324 int16 aExp, bExp, expDiff;
3325 bits64 aSig, bSig;
3326 bits64 q, alternateASig;
3327 sbits64 sigMean;
3329 a = float64_squash_input_denormal(a STATUS_VAR);
3330 b = float64_squash_input_denormal(b STATUS_VAR);
3331 aSig = extractFloat64Frac( a );
3332 aExp = extractFloat64Exp( a );
3333 aSign = extractFloat64Sign( a );
3334 bSig = extractFloat64Frac( b );
3335 bExp = extractFloat64Exp( b );
3336 if ( aExp == 0x7FF ) {
3337 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3338 return propagateFloat64NaN( a, b STATUS_VAR );
3340 float_raise( float_flag_invalid STATUS_VAR);
3341 return float64_default_nan;
3343 if ( bExp == 0x7FF ) {
3344 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3345 return a;
3347 if ( bExp == 0 ) {
3348 if ( bSig == 0 ) {
3349 float_raise( float_flag_invalid STATUS_VAR);
3350 return float64_default_nan;
3352 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3354 if ( aExp == 0 ) {
3355 if ( aSig == 0 ) return a;
3356 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3358 expDiff = aExp - bExp;
3359 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3360 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3361 if ( expDiff < 0 ) {
3362 if ( expDiff < -1 ) return a;
3363 aSig >>= 1;
3365 q = ( bSig <= aSig );
3366 if ( q ) aSig -= bSig;
3367 expDiff -= 64;
3368 while ( 0 < expDiff ) {
3369 q = estimateDiv128To64( aSig, 0, bSig );
3370 q = ( 2 < q ) ? q - 2 : 0;
3371 aSig = - ( ( bSig>>2 ) * q );
3372 expDiff -= 62;
3374 expDiff += 64;
3375 if ( 0 < expDiff ) {
3376 q = estimateDiv128To64( aSig, 0, bSig );
3377 q = ( 2 < q ) ? q - 2 : 0;
3378 q >>= 64 - expDiff;
3379 bSig >>= 2;
3380 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3382 else {
3383 aSig >>= 2;
3384 bSig >>= 2;
3386 do {
3387 alternateASig = aSig;
3388 ++q;
3389 aSig -= bSig;
3390 } while ( 0 <= (sbits64) aSig );
3391 sigMean = aSig + alternateASig;
3392 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3393 aSig = alternateASig;
3395 zSign = ( (sbits64) aSig < 0 );
3396 if ( zSign ) aSig = - aSig;
3397 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
3401 /*----------------------------------------------------------------------------
3402 | Returns the square root of the double-precision floating-point value `a'.
3403 | The operation is performed according to the IEC/IEEE Standard for Binary
3404 | Floating-Point Arithmetic.
3405 *----------------------------------------------------------------------------*/
3407 float64 float64_sqrt( float64 a STATUS_PARAM )
3409 flag aSign;
3410 int16 aExp, zExp;
3411 bits64 aSig, zSig, doubleZSig;
3412 bits64 rem0, rem1, term0, term1;
3413 a = float64_squash_input_denormal(a STATUS_VAR);
3415 aSig = extractFloat64Frac( a );
3416 aExp = extractFloat64Exp( a );
3417 aSign = extractFloat64Sign( a );
3418 if ( aExp == 0x7FF ) {
3419 if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
3420 if ( ! aSign ) return a;
3421 float_raise( float_flag_invalid STATUS_VAR);
3422 return float64_default_nan;
3424 if ( aSign ) {
3425 if ( ( aExp | aSig ) == 0 ) return a;
3426 float_raise( float_flag_invalid STATUS_VAR);
3427 return float64_default_nan;
3429 if ( aExp == 0 ) {
3430 if ( aSig == 0 ) return float64_zero;
3431 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3433 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3434 aSig |= LIT64( 0x0010000000000000 );
3435 zSig = estimateSqrt32( aExp, aSig>>21 );
3436 aSig <<= 9 - ( aExp & 1 );
3437 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3438 if ( ( zSig & 0x1FF ) <= 5 ) {
3439 doubleZSig = zSig<<1;
3440 mul64To128( zSig, zSig, &term0, &term1 );
3441 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3442 while ( (sbits64) rem0 < 0 ) {
3443 --zSig;
3444 doubleZSig -= 2;
3445 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3447 zSig |= ( ( rem0 | rem1 ) != 0 );
3449 return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
3453 /*----------------------------------------------------------------------------
3454 | Returns the binary log of the double-precision floating-point value `a'.
3455 | The operation is performed according to the IEC/IEEE Standard for Binary
3456 | Floating-Point Arithmetic.
3457 *----------------------------------------------------------------------------*/
3458 float64 float64_log2( float64 a STATUS_PARAM )
3460 flag aSign, zSign;
3461 int16 aExp;
3462 bits64 aSig, aSig0, aSig1, zSig, i;
3463 a = float64_squash_input_denormal(a STATUS_VAR);
3465 aSig = extractFloat64Frac( a );
3466 aExp = extractFloat64Exp( a );
3467 aSign = extractFloat64Sign( a );
3469 if ( aExp == 0 ) {
3470 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
3471 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3473 if ( aSign ) {
3474 float_raise( float_flag_invalid STATUS_VAR);
3475 return float64_default_nan;
3477 if ( aExp == 0x7FF ) {
3478 if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
3479 return a;
3482 aExp -= 0x3FF;
3483 aSig |= LIT64( 0x0010000000000000 );
3484 zSign = aExp < 0;
3485 zSig = (bits64)aExp << 52;
3486 for (i = 1LL << 51; i > 0; i >>= 1) {
3487 mul64To128( aSig, aSig, &aSig0, &aSig1 );
3488 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
3489 if ( aSig & LIT64( 0x0020000000000000 ) ) {
3490 aSig >>= 1;
3491 zSig |= i;
3495 if ( zSign )
3496 zSig = -zSig;
3497 return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
3500 /*----------------------------------------------------------------------------
3501 | Returns 1 if the double-precision floating-point value `a' is equal to the
3502 | corresponding value `b', and 0 otherwise. The comparison is performed
3503 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3504 *----------------------------------------------------------------------------*/
3506 int float64_eq( float64 a, float64 b STATUS_PARAM )
3508 bits64 av, bv;
3509 a = float64_squash_input_denormal(a STATUS_VAR);
3510 b = float64_squash_input_denormal(b STATUS_VAR);
3512 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3513 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3515 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3516 float_raise( float_flag_invalid STATUS_VAR);
3518 return 0;
3520 av = float64_val(a);
3521 bv = float64_val(b);
3522 return ( av == bv ) || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3526 /*----------------------------------------------------------------------------
3527 | Returns 1 if the double-precision floating-point value `a' is less than or
3528 | equal to the corresponding value `b', and 0 otherwise. The comparison is
3529 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3530 | Arithmetic.
3531 *----------------------------------------------------------------------------*/
3533 int float64_le( float64 a, float64 b STATUS_PARAM )
3535 flag aSign, bSign;
3536 bits64 av, bv;
3537 a = float64_squash_input_denormal(a STATUS_VAR);
3538 b = float64_squash_input_denormal(b STATUS_VAR);
3540 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3541 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3543 float_raise( float_flag_invalid STATUS_VAR);
3544 return 0;
3546 aSign = extractFloat64Sign( a );
3547 bSign = extractFloat64Sign( b );
3548 av = float64_val(a);
3549 bv = float64_val(b);
3550 if ( aSign != bSign ) return aSign || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3551 return ( av == bv ) || ( aSign ^ ( av < bv ) );
3555 /*----------------------------------------------------------------------------
3556 | Returns 1 if the double-precision floating-point value `a' is less than
3557 | the corresponding value `b', and 0 otherwise. The comparison is performed
3558 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3559 *----------------------------------------------------------------------------*/
3561 int float64_lt( float64 a, float64 b STATUS_PARAM )
3563 flag aSign, bSign;
3564 bits64 av, bv;
3566 a = float64_squash_input_denormal(a STATUS_VAR);
3567 b = float64_squash_input_denormal(b STATUS_VAR);
3568 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3569 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3571 float_raise( float_flag_invalid STATUS_VAR);
3572 return 0;
3574 aSign = extractFloat64Sign( a );
3575 bSign = extractFloat64Sign( b );
3576 av = float64_val(a);
3577 bv = float64_val(b);
3578 if ( aSign != bSign ) return aSign && ( (bits64) ( ( av | bv )<<1 ) != 0 );
3579 return ( av != bv ) && ( aSign ^ ( av < bv ) );
3583 /*----------------------------------------------------------------------------
3584 | Returns 1 if the double-precision floating-point value `a' is equal to the
3585 | corresponding value `b', and 0 otherwise. The invalid exception is raised
3586 | if either operand is a NaN. Otherwise, the comparison is performed
3587 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3588 *----------------------------------------------------------------------------*/
3590 int float64_eq_signaling( float64 a, float64 b STATUS_PARAM )
3592 bits64 av, bv;
3593 a = float64_squash_input_denormal(a STATUS_VAR);
3594 b = float64_squash_input_denormal(b STATUS_VAR);
3596 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3597 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3599 float_raise( float_flag_invalid STATUS_VAR);
3600 return 0;
3602 av = float64_val(a);
3603 bv = float64_val(b);
3604 return ( av == bv ) || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3608 /*----------------------------------------------------------------------------
3609 | Returns 1 if the double-precision floating-point value `a' is less than or
3610 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3611 | cause an exception. Otherwise, the comparison is performed according to the
3612 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3613 *----------------------------------------------------------------------------*/
3615 int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
3617 flag aSign, bSign;
3618 bits64 av, bv;
3619 a = float64_squash_input_denormal(a STATUS_VAR);
3620 b = float64_squash_input_denormal(b STATUS_VAR);
3622 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3623 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3625 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3626 float_raise( float_flag_invalid STATUS_VAR);
3628 return 0;
3630 aSign = extractFloat64Sign( a );
3631 bSign = extractFloat64Sign( b );
3632 av = float64_val(a);
3633 bv = float64_val(b);
3634 if ( aSign != bSign ) return aSign || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3635 return ( av == bv ) || ( aSign ^ ( av < bv ) );
3639 /*----------------------------------------------------------------------------
3640 | Returns 1 if the double-precision floating-point value `a' is less than
3641 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3642 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
3643 | Standard for Binary Floating-Point Arithmetic.
3644 *----------------------------------------------------------------------------*/
3646 int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
3648 flag aSign, bSign;
3649 bits64 av, bv;
3650 a = float64_squash_input_denormal(a STATUS_VAR);
3651 b = float64_squash_input_denormal(b STATUS_VAR);
3653 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3654 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3656 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3657 float_raise( float_flag_invalid STATUS_VAR);
3659 return 0;
3661 aSign = extractFloat64Sign( a );
3662 bSign = extractFloat64Sign( b );
3663 av = float64_val(a);
3664 bv = float64_val(b);
3665 if ( aSign != bSign ) return aSign && ( (bits64) ( ( av | bv )<<1 ) != 0 );
3666 return ( av != bv ) && ( aSign ^ ( av < bv ) );
3670 #ifdef FLOATX80
3672 /*----------------------------------------------------------------------------
3673 | Returns the result of converting the extended double-precision floating-
3674 | point value `a' to the 32-bit two's complement integer format. The
3675 | conversion is performed according to the IEC/IEEE Standard for Binary
3676 | Floating-Point Arithmetic---which means in particular that the conversion
3677 | is rounded according to the current rounding mode. If `a' is a NaN, the
3678 | largest positive integer is returned. Otherwise, if the conversion
3679 | overflows, the largest integer with the same sign as `a' is returned.
3680 *----------------------------------------------------------------------------*/
3682 int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
3684 flag aSign;
3685 int32 aExp, shiftCount;
3686 bits64 aSig;
3688 aSig = extractFloatx80Frac( a );
3689 aExp = extractFloatx80Exp( a );
3690 aSign = extractFloatx80Sign( a );
3691 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3692 shiftCount = 0x4037 - aExp;
3693 if ( shiftCount <= 0 ) shiftCount = 1;
3694 shift64RightJamming( aSig, shiftCount, &aSig );
3695 return roundAndPackInt32( aSign, aSig STATUS_VAR );
3699 /*----------------------------------------------------------------------------
3700 | Returns the result of converting the extended double-precision floating-
3701 | point value `a' to the 32-bit two's complement integer format. The
3702 | conversion is performed according to the IEC/IEEE Standard for Binary
3703 | Floating-Point Arithmetic, except that the conversion is always rounded
3704 | toward zero. If `a' is a NaN, the largest positive integer is returned.
3705 | Otherwise, if the conversion overflows, the largest integer with the same
3706 | sign as `a' is returned.
3707 *----------------------------------------------------------------------------*/
3709 int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
3711 flag aSign;
3712 int32 aExp, shiftCount;
3713 bits64 aSig, savedASig;
3714 int32 z;
3716 aSig = extractFloatx80Frac( a );
3717 aExp = extractFloatx80Exp( a );
3718 aSign = extractFloatx80Sign( a );
3719 if ( 0x401E < aExp ) {
3720 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3721 goto invalid;
3723 else if ( aExp < 0x3FFF ) {
3724 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3725 return 0;
3727 shiftCount = 0x403E - aExp;
3728 savedASig = aSig;
3729 aSig >>= shiftCount;
3730 z = aSig;
3731 if ( aSign ) z = - z;
3732 if ( ( z < 0 ) ^ aSign ) {
3733 invalid:
3734 float_raise( float_flag_invalid STATUS_VAR);
3735 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3737 if ( ( aSig<<shiftCount ) != savedASig ) {
3738 STATUS(float_exception_flags) |= float_flag_inexact;
3740 return z;
3744 /*----------------------------------------------------------------------------
3745 | Returns the result of converting the extended double-precision floating-
3746 | point value `a' to the 64-bit two's complement integer format. The
3747 | conversion is performed according to the IEC/IEEE Standard for Binary
3748 | Floating-Point Arithmetic---which means in particular that the conversion
3749 | is rounded according to the current rounding mode. If `a' is a NaN,
3750 | the largest positive integer is returned. Otherwise, if the conversion
3751 | overflows, the largest integer with the same sign as `a' is returned.
3752 *----------------------------------------------------------------------------*/
3754 int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
3756 flag aSign;
3757 int32 aExp, shiftCount;
3758 bits64 aSig, aSigExtra;
3760 aSig = extractFloatx80Frac( a );
3761 aExp = extractFloatx80Exp( a );
3762 aSign = extractFloatx80Sign( a );
3763 shiftCount = 0x403E - aExp;
3764 if ( shiftCount <= 0 ) {
3765 if ( shiftCount ) {
3766 float_raise( float_flag_invalid STATUS_VAR);
3767 if ( ! aSign
3768 || ( ( aExp == 0x7FFF )
3769 && ( aSig != LIT64( 0x8000000000000000 ) ) )
3771 return LIT64( 0x7FFFFFFFFFFFFFFF );
3773 return (sbits64) LIT64( 0x8000000000000000 );
3775 aSigExtra = 0;
3777 else {
3778 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3780 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3784 /*----------------------------------------------------------------------------
3785 | Returns the result of converting the extended double-precision floating-
3786 | point value `a' to the 64-bit two's complement integer format. The
3787 | conversion is performed according to the IEC/IEEE Standard for Binary
3788 | Floating-Point Arithmetic, except that the conversion is always rounded
3789 | toward zero. If `a' is a NaN, the largest positive integer is returned.
3790 | Otherwise, if the conversion overflows, the largest integer with the same
3791 | sign as `a' is returned.
3792 *----------------------------------------------------------------------------*/
3794 int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
3796 flag aSign;
3797 int32 aExp, shiftCount;
3798 bits64 aSig;
3799 int64 z;
3801 aSig = extractFloatx80Frac( a );
3802 aExp = extractFloatx80Exp( a );
3803 aSign = extractFloatx80Sign( a );
3804 shiftCount = aExp - 0x403E;
3805 if ( 0 <= shiftCount ) {
3806 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3807 if ( ( a.high != 0xC03E ) || aSig ) {
3808 float_raise( float_flag_invalid STATUS_VAR);
3809 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3810 return LIT64( 0x7FFFFFFFFFFFFFFF );
3813 return (sbits64) LIT64( 0x8000000000000000 );
3815 else if ( aExp < 0x3FFF ) {
3816 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3817 return 0;
3819 z = aSig>>( - shiftCount );
3820 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3821 STATUS(float_exception_flags) |= float_flag_inexact;
3823 if ( aSign ) z = - z;
3824 return z;
3828 /*----------------------------------------------------------------------------
3829 | Returns the result of converting the extended double-precision floating-
3830 | point value `a' to the single-precision floating-point format. The
3831 | conversion is performed according to the IEC/IEEE Standard for Binary
3832 | Floating-Point Arithmetic.
3833 *----------------------------------------------------------------------------*/
3835 float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
3837 flag aSign;
3838 int32 aExp;
3839 bits64 aSig;
3841 aSig = extractFloatx80Frac( a );
3842 aExp = extractFloatx80Exp( a );
3843 aSign = extractFloatx80Sign( a );
3844 if ( aExp == 0x7FFF ) {
3845 if ( (bits64) ( aSig<<1 ) ) {
3846 return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) );
3848 return packFloat32( aSign, 0xFF, 0 );
3850 shift64RightJamming( aSig, 33, &aSig );
3851 if ( aExp || aSig ) aExp -= 0x3F81;
3852 return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
3856 /*----------------------------------------------------------------------------
3857 | Returns the result of converting the extended double-precision floating-
3858 | point value `a' to the double-precision floating-point format. The
3859 | conversion is performed according to the IEC/IEEE Standard for Binary
3860 | Floating-Point Arithmetic.
3861 *----------------------------------------------------------------------------*/
3863 float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
3865 flag aSign;
3866 int32 aExp;
3867 bits64 aSig, zSig;
3869 aSig = extractFloatx80Frac( a );
3870 aExp = extractFloatx80Exp( a );
3871 aSign = extractFloatx80Sign( a );
3872 if ( aExp == 0x7FFF ) {
3873 if ( (bits64) ( aSig<<1 ) ) {
3874 return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) );
3876 return packFloat64( aSign, 0x7FF, 0 );
3878 shift64RightJamming( aSig, 1, &zSig );
3879 if ( aExp || aSig ) aExp -= 0x3C01;
3880 return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
3884 #ifdef FLOAT128
3886 /*----------------------------------------------------------------------------
3887 | Returns the result of converting the extended double-precision floating-
3888 | point value `a' to the quadruple-precision floating-point format. The
3889 | conversion is performed according to the IEC/IEEE Standard for Binary
3890 | Floating-Point Arithmetic.
3891 *----------------------------------------------------------------------------*/
3893 float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
3895 flag aSign;
3896 int16 aExp;
3897 bits64 aSig, zSig0, zSig1;
3899 aSig = extractFloatx80Frac( a );
3900 aExp = extractFloatx80Exp( a );
3901 aSign = extractFloatx80Sign( a );
3902 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3903 return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) );
3905 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3906 return packFloat128( aSign, aExp, zSig0, zSig1 );
3910 #endif
3912 /*----------------------------------------------------------------------------
3913 | Rounds the extended double-precision floating-point value `a' to an integer,
3914 | and returns the result as an extended quadruple-precision floating-point
3915 | value. The operation is performed according to the IEC/IEEE Standard for
3916 | Binary Floating-Point Arithmetic.
3917 *----------------------------------------------------------------------------*/
3919 floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
3921 flag aSign;
3922 int32 aExp;
3923 bits64 lastBitMask, roundBitsMask;
3924 int8 roundingMode;
3925 floatx80 z;
3927 aExp = extractFloatx80Exp( a );
3928 if ( 0x403E <= aExp ) {
3929 if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3930 return propagateFloatx80NaN( a, a STATUS_VAR );
3932 return a;
3934 if ( aExp < 0x3FFF ) {
3935 if ( ( aExp == 0 )
3936 && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3937 return a;
3939 STATUS(float_exception_flags) |= float_flag_inexact;
3940 aSign = extractFloatx80Sign( a );
3941 switch ( STATUS(float_rounding_mode) ) {
3942 case float_round_nearest_even:
3943 if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3945 return
3946 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3948 break;
3949 case float_round_down:
3950 return
3951 aSign ?
3952 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3953 : packFloatx80( 0, 0, 0 );
3954 case float_round_up:
3955 return
3956 aSign ? packFloatx80( 1, 0, 0 )
3957 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3959 return packFloatx80( aSign, 0, 0 );
3961 lastBitMask = 1;
3962 lastBitMask <<= 0x403E - aExp;
3963 roundBitsMask = lastBitMask - 1;
3964 z = a;
3965 roundingMode = STATUS(float_rounding_mode);
3966 if ( roundingMode == float_round_nearest_even ) {
3967 z.low += lastBitMask>>1;
3968 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3970 else if ( roundingMode != float_round_to_zero ) {
3971 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3972 z.low += roundBitsMask;
3975 z.low &= ~ roundBitsMask;
3976 if ( z.low == 0 ) {
3977 ++z.high;
3978 z.low = LIT64( 0x8000000000000000 );
3980 if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
3981 return z;
3985 /*----------------------------------------------------------------------------
3986 | Returns the result of adding the absolute values of the extended double-
3987 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
3988 | negated before being returned. `zSign' is ignored if the result is a NaN.
3989 | The addition is performed according to the IEC/IEEE Standard for Binary
3990 | Floating-Point Arithmetic.
3991 *----------------------------------------------------------------------------*/
3993 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
3995 int32 aExp, bExp, zExp;
3996 bits64 aSig, bSig, zSig0, zSig1;
3997 int32 expDiff;
3999 aSig = extractFloatx80Frac( a );
4000 aExp = extractFloatx80Exp( a );
4001 bSig = extractFloatx80Frac( b );
4002 bExp = extractFloatx80Exp( b );
4003 expDiff = aExp - bExp;
4004 if ( 0 < expDiff ) {
4005 if ( aExp == 0x7FFF ) {
4006 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4007 return a;
4009 if ( bExp == 0 ) --expDiff;
4010 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4011 zExp = aExp;
4013 else if ( expDiff < 0 ) {
4014 if ( bExp == 0x7FFF ) {
4015 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4016 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4018 if ( aExp == 0 ) ++expDiff;
4019 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4020 zExp = bExp;
4022 else {
4023 if ( aExp == 0x7FFF ) {
4024 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
4025 return propagateFloatx80NaN( a, b STATUS_VAR );
4027 return a;
4029 zSig1 = 0;
4030 zSig0 = aSig + bSig;
4031 if ( aExp == 0 ) {
4032 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4033 goto roundAndPack;
4035 zExp = aExp;
4036 goto shiftRight1;
4038 zSig0 = aSig + bSig;
4039 if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
4040 shiftRight1:
4041 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4042 zSig0 |= LIT64( 0x8000000000000000 );
4043 ++zExp;
4044 roundAndPack:
4045 return
4046 roundAndPackFloatx80(
4047 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4051 /*----------------------------------------------------------------------------
4052 | Returns the result of subtracting the absolute values of the extended
4053 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
4054 | difference is negated before being returned. `zSign' is ignored if the
4055 | result is a NaN. The subtraction is performed according to the IEC/IEEE
4056 | Standard for Binary Floating-Point Arithmetic.
4057 *----------------------------------------------------------------------------*/
4059 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
4061 int32 aExp, bExp, zExp;
4062 bits64 aSig, bSig, zSig0, zSig1;
4063 int32 expDiff;
4064 floatx80 z;
4066 aSig = extractFloatx80Frac( a );
4067 aExp = extractFloatx80Exp( a );
4068 bSig = extractFloatx80Frac( b );
4069 bExp = extractFloatx80Exp( b );
4070 expDiff = aExp - bExp;
4071 if ( 0 < expDiff ) goto aExpBigger;
4072 if ( expDiff < 0 ) goto bExpBigger;
4073 if ( aExp == 0x7FFF ) {
4074 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
4075 return propagateFloatx80NaN( a, b STATUS_VAR );
4077 float_raise( float_flag_invalid STATUS_VAR);
4078 z.low = floatx80_default_nan_low;
4079 z.high = floatx80_default_nan_high;
4080 return z;
4082 if ( aExp == 0 ) {
4083 aExp = 1;
4084 bExp = 1;
4086 zSig1 = 0;
4087 if ( bSig < aSig ) goto aBigger;
4088 if ( aSig < bSig ) goto bBigger;
4089 return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
4090 bExpBigger:
4091 if ( bExp == 0x7FFF ) {
4092 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4093 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
4095 if ( aExp == 0 ) ++expDiff;
4096 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4097 bBigger:
4098 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4099 zExp = bExp;
4100 zSign ^= 1;
4101 goto normalizeRoundAndPack;
4102 aExpBigger:
4103 if ( aExp == 0x7FFF ) {
4104 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4105 return a;
4107 if ( bExp == 0 ) --expDiff;
4108 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4109 aBigger:
4110 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4111 zExp = aExp;
4112 normalizeRoundAndPack:
4113 return
4114 normalizeRoundAndPackFloatx80(
4115 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4119 /*----------------------------------------------------------------------------
4120 | Returns the result of adding the extended double-precision floating-point
4121 | values `a' and `b'. The operation is performed according to the IEC/IEEE
4122 | Standard for Binary Floating-Point Arithmetic.
4123 *----------------------------------------------------------------------------*/
4125 floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
4127 flag aSign, bSign;
4129 aSign = extractFloatx80Sign( a );
4130 bSign = extractFloatx80Sign( b );
4131 if ( aSign == bSign ) {
4132 return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4134 else {
4135 return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4140 /*----------------------------------------------------------------------------
4141 | Returns the result of subtracting the extended double-precision floating-
4142 | point values `a' and `b'. The operation is performed according to the
4143 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4144 *----------------------------------------------------------------------------*/
4146 floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
4148 flag aSign, bSign;
4150 aSign = extractFloatx80Sign( a );
4151 bSign = extractFloatx80Sign( b );
4152 if ( aSign == bSign ) {
4153 return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4155 else {
4156 return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4161 /*----------------------------------------------------------------------------
4162 | Returns the result of multiplying the extended double-precision floating-
4163 | point values `a' and `b'. The operation is performed according to the
4164 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4165 *----------------------------------------------------------------------------*/
4167 floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
4169 flag aSign, bSign, zSign;
4170 int32 aExp, bExp, zExp;
4171 bits64 aSig, bSig, zSig0, zSig1;
4172 floatx80 z;
4174 aSig = extractFloatx80Frac( a );
4175 aExp = extractFloatx80Exp( a );
4176 aSign = extractFloatx80Sign( a );
4177 bSig = extractFloatx80Frac( b );
4178 bExp = extractFloatx80Exp( b );
4179 bSign = extractFloatx80Sign( b );
4180 zSign = aSign ^ bSign;
4181 if ( aExp == 0x7FFF ) {
4182 if ( (bits64) ( aSig<<1 )
4183 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
4184 return propagateFloatx80NaN( a, b STATUS_VAR );
4186 if ( ( bExp | bSig ) == 0 ) goto invalid;
4187 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4189 if ( bExp == 0x7FFF ) {
4190 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4191 if ( ( aExp | aSig ) == 0 ) {
4192 invalid:
4193 float_raise( float_flag_invalid STATUS_VAR);
4194 z.low = floatx80_default_nan_low;
4195 z.high = floatx80_default_nan_high;
4196 return z;
4198 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4200 if ( aExp == 0 ) {
4201 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4202 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4204 if ( bExp == 0 ) {
4205 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
4206 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4208 zExp = aExp + bExp - 0x3FFE;
4209 mul64To128( aSig, bSig, &zSig0, &zSig1 );
4210 if ( 0 < (sbits64) zSig0 ) {
4211 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
4212 --zExp;
4214 return
4215 roundAndPackFloatx80(
4216 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4220 /*----------------------------------------------------------------------------
4221 | Returns the result of dividing the extended double-precision floating-point
4222 | value `a' by the corresponding value `b'. The operation is performed
4223 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4224 *----------------------------------------------------------------------------*/
4226 floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
4228 flag aSign, bSign, zSign;
4229 int32 aExp, bExp, zExp;
4230 bits64 aSig, bSig, zSig0, zSig1;
4231 bits64 rem0, rem1, rem2, term0, term1, term2;
4232 floatx80 z;
4234 aSig = extractFloatx80Frac( a );
4235 aExp = extractFloatx80Exp( a );
4236 aSign = extractFloatx80Sign( a );
4237 bSig = extractFloatx80Frac( b );
4238 bExp = extractFloatx80Exp( b );
4239 bSign = extractFloatx80Sign( b );
4240 zSign = aSign ^ bSign;
4241 if ( aExp == 0x7FFF ) {
4242 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4243 if ( bExp == 0x7FFF ) {
4244 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4245 goto invalid;
4247 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4249 if ( bExp == 0x7FFF ) {
4250 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4251 return packFloatx80( zSign, 0, 0 );
4253 if ( bExp == 0 ) {
4254 if ( bSig == 0 ) {
4255 if ( ( aExp | aSig ) == 0 ) {
4256 invalid:
4257 float_raise( float_flag_invalid STATUS_VAR);
4258 z.low = floatx80_default_nan_low;
4259 z.high = floatx80_default_nan_high;
4260 return z;
4262 float_raise( float_flag_divbyzero STATUS_VAR);
4263 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4265 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4267 if ( aExp == 0 ) {
4268 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4269 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4271 zExp = aExp - bExp + 0x3FFE;
4272 rem1 = 0;
4273 if ( bSig <= aSig ) {
4274 shift128Right( aSig, 0, 1, &aSig, &rem1 );
4275 ++zExp;
4277 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
4278 mul64To128( bSig, zSig0, &term0, &term1 );
4279 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
4280 while ( (sbits64) rem0 < 0 ) {
4281 --zSig0;
4282 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4284 zSig1 = estimateDiv128To64( rem1, 0, bSig );
4285 if ( (bits64) ( zSig1<<1 ) <= 8 ) {
4286 mul64To128( bSig, zSig1, &term1, &term2 );
4287 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4288 while ( (sbits64) rem1 < 0 ) {
4289 --zSig1;
4290 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
4292 zSig1 |= ( ( rem1 | rem2 ) != 0 );
4294 return
4295 roundAndPackFloatx80(
4296 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4300 /*----------------------------------------------------------------------------
4301 | Returns the remainder of the extended double-precision floating-point value
4302 | `a' with respect to the corresponding value `b'. The operation is performed
4303 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4304 *----------------------------------------------------------------------------*/
4306 floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
4308 flag aSign, zSign;
4309 int32 aExp, bExp, expDiff;
4310 bits64 aSig0, aSig1, bSig;
4311 bits64 q, term0, term1, alternateASig0, alternateASig1;
4312 floatx80 z;
4314 aSig0 = extractFloatx80Frac( a );
4315 aExp = extractFloatx80Exp( a );
4316 aSign = extractFloatx80Sign( a );
4317 bSig = extractFloatx80Frac( b );
4318 bExp = extractFloatx80Exp( b );
4319 if ( aExp == 0x7FFF ) {
4320 if ( (bits64) ( aSig0<<1 )
4321 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
4322 return propagateFloatx80NaN( a, b STATUS_VAR );
4324 goto invalid;
4326 if ( bExp == 0x7FFF ) {
4327 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4328 return a;
4330 if ( bExp == 0 ) {
4331 if ( bSig == 0 ) {
4332 invalid:
4333 float_raise( float_flag_invalid STATUS_VAR);
4334 z.low = floatx80_default_nan_low;
4335 z.high = floatx80_default_nan_high;
4336 return z;
4338 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4340 if ( aExp == 0 ) {
4341 if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
4342 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4344 bSig |= LIT64( 0x8000000000000000 );
4345 zSign = aSign;
4346 expDiff = aExp - bExp;
4347 aSig1 = 0;
4348 if ( expDiff < 0 ) {
4349 if ( expDiff < -1 ) return a;
4350 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
4351 expDiff = 0;
4353 q = ( bSig <= aSig0 );
4354 if ( q ) aSig0 -= bSig;
4355 expDiff -= 64;
4356 while ( 0 < expDiff ) {
4357 q = estimateDiv128To64( aSig0, aSig1, bSig );
4358 q = ( 2 < q ) ? q - 2 : 0;
4359 mul64To128( bSig, q, &term0, &term1 );
4360 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4361 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
4362 expDiff -= 62;
4364 expDiff += 64;
4365 if ( 0 < expDiff ) {
4366 q = estimateDiv128To64( aSig0, aSig1, bSig );
4367 q = ( 2 < q ) ? q - 2 : 0;
4368 q >>= 64 - expDiff;
4369 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
4370 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4371 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
4372 while ( le128( term0, term1, aSig0, aSig1 ) ) {
4373 ++q;
4374 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4377 else {
4378 term1 = 0;
4379 term0 = bSig;
4381 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4382 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4383 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4384 && ( q & 1 ) )
4386 aSig0 = alternateASig0;
4387 aSig1 = alternateASig1;
4388 zSign = ! zSign;
4390 return
4391 normalizeRoundAndPackFloatx80(
4392 80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
4396 /*----------------------------------------------------------------------------
4397 | Returns the square root of the extended double-precision floating-point
4398 | value `a'. The operation is performed according to the IEC/IEEE Standard
4399 | for Binary Floating-Point Arithmetic.
4400 *----------------------------------------------------------------------------*/
4402 floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
4404 flag aSign;
4405 int32 aExp, zExp;
4406 bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4407 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4408 floatx80 z;
4410 aSig0 = extractFloatx80Frac( a );
4411 aExp = extractFloatx80Exp( a );
4412 aSign = extractFloatx80Sign( a );
4413 if ( aExp == 0x7FFF ) {
4414 if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
4415 if ( ! aSign ) return a;
4416 goto invalid;
4418 if ( aSign ) {
4419 if ( ( aExp | aSig0 ) == 0 ) return a;
4420 invalid:
4421 float_raise( float_flag_invalid STATUS_VAR);
4422 z.low = floatx80_default_nan_low;
4423 z.high = floatx80_default_nan_high;
4424 return z;
4426 if ( aExp == 0 ) {
4427 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4428 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4430 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4431 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4432 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4433 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4434 doubleZSig0 = zSig0<<1;
4435 mul64To128( zSig0, zSig0, &term0, &term1 );
4436 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4437 while ( (sbits64) rem0 < 0 ) {
4438 --zSig0;
4439 doubleZSig0 -= 2;
4440 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4442 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4443 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4444 if ( zSig1 == 0 ) zSig1 = 1;
4445 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4446 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4447 mul64To128( zSig1, zSig1, &term2, &term3 );
4448 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4449 while ( (sbits64) rem1 < 0 ) {
4450 --zSig1;
4451 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4452 term3 |= 1;
4453 term2 |= doubleZSig0;
4454 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4456 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4458 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4459 zSig0 |= doubleZSig0;
4460 return
4461 roundAndPackFloatx80(
4462 STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
4466 /*----------------------------------------------------------------------------
4467 | Returns 1 if the extended double-precision floating-point value `a' is
4468 | equal to the corresponding value `b', and 0 otherwise. The comparison is
4469 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4470 | Arithmetic.
4471 *----------------------------------------------------------------------------*/
4473 int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
4476 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4477 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4478 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4479 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4481 if ( floatx80_is_signaling_nan( a )
4482 || floatx80_is_signaling_nan( b ) ) {
4483 float_raise( float_flag_invalid STATUS_VAR);
4485 return 0;
4487 return
4488 ( a.low == b.low )
4489 && ( ( a.high == b.high )
4490 || ( ( a.low == 0 )
4491 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4496 /*----------------------------------------------------------------------------
4497 | Returns 1 if the extended double-precision floating-point value `a' is
4498 | less than or equal to the corresponding value `b', and 0 otherwise. The
4499 | comparison is performed according to the IEC/IEEE Standard for Binary
4500 | Floating-Point Arithmetic.
4501 *----------------------------------------------------------------------------*/
4503 int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
4505 flag aSign, bSign;
4507 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4508 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4509 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4510 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4512 float_raise( float_flag_invalid STATUS_VAR);
4513 return 0;
4515 aSign = extractFloatx80Sign( a );
4516 bSign = extractFloatx80Sign( b );
4517 if ( aSign != bSign ) {
4518 return
4519 aSign
4520 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4521 == 0 );
4523 return
4524 aSign ? le128( b.high, b.low, a.high, a.low )
4525 : le128( a.high, a.low, b.high, b.low );
4529 /*----------------------------------------------------------------------------
4530 | Returns 1 if the extended double-precision floating-point value `a' is
4531 | less than the corresponding value `b', and 0 otherwise. The comparison
4532 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4533 | Arithmetic.
4534 *----------------------------------------------------------------------------*/
4536 int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
4538 flag aSign, bSign;
4540 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4541 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4542 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4543 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4545 float_raise( float_flag_invalid STATUS_VAR);
4546 return 0;
4548 aSign = extractFloatx80Sign( a );
4549 bSign = extractFloatx80Sign( b );
4550 if ( aSign != bSign ) {
4551 return
4552 aSign
4553 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4554 != 0 );
4556 return
4557 aSign ? lt128( b.high, b.low, a.high, a.low )
4558 : lt128( a.high, a.low, b.high, b.low );
4562 /*----------------------------------------------------------------------------
4563 | Returns 1 if the extended double-precision floating-point value `a' is equal
4564 | to the corresponding value `b', and 0 otherwise. The invalid exception is
4565 | raised if either operand is a NaN. Otherwise, the comparison is performed
4566 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4567 *----------------------------------------------------------------------------*/
4569 int floatx80_eq_signaling( floatx80 a, floatx80 b STATUS_PARAM )
4572 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4573 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4574 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4575 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4577 float_raise( float_flag_invalid STATUS_VAR);
4578 return 0;
4580 return
4581 ( a.low == b.low )
4582 && ( ( a.high == b.high )
4583 || ( ( a.low == 0 )
4584 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4589 /*----------------------------------------------------------------------------
4590 | Returns 1 if the extended double-precision floating-point value `a' is less
4591 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
4592 | do not cause an exception. Otherwise, the comparison is performed according
4593 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4594 *----------------------------------------------------------------------------*/
4596 int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4598 flag aSign, bSign;
4600 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4601 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4602 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4603 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4605 if ( floatx80_is_signaling_nan( a )
4606 || floatx80_is_signaling_nan( b ) ) {
4607 float_raise( float_flag_invalid STATUS_VAR);
4609 return 0;
4611 aSign = extractFloatx80Sign( a );
4612 bSign = extractFloatx80Sign( b );
4613 if ( aSign != bSign ) {
4614 return
4615 aSign
4616 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4617 == 0 );
4619 return
4620 aSign ? le128( b.high, b.low, a.high, a.low )
4621 : le128( a.high, a.low, b.high, b.low );
4625 /*----------------------------------------------------------------------------
4626 | Returns 1 if the extended double-precision floating-point value `a' is less
4627 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
4628 | an exception. Otherwise, the comparison is performed according to the
4629 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4630 *----------------------------------------------------------------------------*/
4632 int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4634 flag aSign, bSign;
4636 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4637 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4638 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4639 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4641 if ( floatx80_is_signaling_nan( a )
4642 || floatx80_is_signaling_nan( b ) ) {
4643 float_raise( float_flag_invalid STATUS_VAR);
4645 return 0;
4647 aSign = extractFloatx80Sign( a );
4648 bSign = extractFloatx80Sign( b );
4649 if ( aSign != bSign ) {
4650 return
4651 aSign
4652 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4653 != 0 );
4655 return
4656 aSign ? lt128( b.high, b.low, a.high, a.low )
4657 : lt128( a.high, a.low, b.high, b.low );
4661 #endif
4663 #ifdef FLOAT128
4665 /*----------------------------------------------------------------------------
4666 | Returns the result of converting the quadruple-precision floating-point
4667 | value `a' to the 32-bit two's complement integer format. The conversion
4668 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4669 | Arithmetic---which means in particular that the conversion is rounded
4670 | according to the current rounding mode. If `a' is a NaN, the largest
4671 | positive integer is returned. Otherwise, if the conversion overflows, the
4672 | largest integer with the same sign as `a' is returned.
4673 *----------------------------------------------------------------------------*/
4675 int32 float128_to_int32( float128 a STATUS_PARAM )
4677 flag aSign;
4678 int32 aExp, shiftCount;
4679 bits64 aSig0, aSig1;
4681 aSig1 = extractFloat128Frac1( a );
4682 aSig0 = extractFloat128Frac0( a );
4683 aExp = extractFloat128Exp( a );
4684 aSign = extractFloat128Sign( a );
4685 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4686 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4687 aSig0 |= ( aSig1 != 0 );
4688 shiftCount = 0x4028 - aExp;
4689 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4690 return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
4694 /*----------------------------------------------------------------------------
4695 | Returns the result of converting the quadruple-precision floating-point
4696 | value `a' to the 32-bit two's complement integer format. The conversion
4697 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4698 | Arithmetic, except that the conversion is always rounded toward zero. If
4699 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
4700 | conversion overflows, the largest integer with the same sign as `a' is
4701 | returned.
4702 *----------------------------------------------------------------------------*/
4704 int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
4706 flag aSign;
4707 int32 aExp, shiftCount;
4708 bits64 aSig0, aSig1, savedASig;
4709 int32 z;
4711 aSig1 = extractFloat128Frac1( a );
4712 aSig0 = extractFloat128Frac0( a );
4713 aExp = extractFloat128Exp( a );
4714 aSign = extractFloat128Sign( a );
4715 aSig0 |= ( aSig1 != 0 );
4716 if ( 0x401E < aExp ) {
4717 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4718 goto invalid;
4720 else if ( aExp < 0x3FFF ) {
4721 if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
4722 return 0;
4724 aSig0 |= LIT64( 0x0001000000000000 );
4725 shiftCount = 0x402F - aExp;
4726 savedASig = aSig0;
4727 aSig0 >>= shiftCount;
4728 z = aSig0;
4729 if ( aSign ) z = - z;
4730 if ( ( z < 0 ) ^ aSign ) {
4731 invalid:
4732 float_raise( float_flag_invalid STATUS_VAR);
4733 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4735 if ( ( aSig0<<shiftCount ) != savedASig ) {
4736 STATUS(float_exception_flags) |= float_flag_inexact;
4738 return z;
4742 /*----------------------------------------------------------------------------
4743 | Returns the result of converting the quadruple-precision floating-point
4744 | value `a' to the 64-bit two's complement integer format. The conversion
4745 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4746 | Arithmetic---which means in particular that the conversion is rounded
4747 | according to the current rounding mode. If `a' is a NaN, the largest
4748 | positive integer is returned. Otherwise, if the conversion overflows, the
4749 | largest integer with the same sign as `a' is returned.
4750 *----------------------------------------------------------------------------*/
4752 int64 float128_to_int64( float128 a STATUS_PARAM )
4754 flag aSign;
4755 int32 aExp, shiftCount;
4756 bits64 aSig0, aSig1;
4758 aSig1 = extractFloat128Frac1( a );
4759 aSig0 = extractFloat128Frac0( a );
4760 aExp = extractFloat128Exp( a );
4761 aSign = extractFloat128Sign( a );
4762 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4763 shiftCount = 0x402F - aExp;
4764 if ( shiftCount <= 0 ) {
4765 if ( 0x403E < aExp ) {
4766 float_raise( float_flag_invalid STATUS_VAR);
4767 if ( ! aSign
4768 || ( ( aExp == 0x7FFF )
4769 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4772 return LIT64( 0x7FFFFFFFFFFFFFFF );
4774 return (sbits64) LIT64( 0x8000000000000000 );
4776 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4778 else {
4779 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4781 return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
4785 /*----------------------------------------------------------------------------
4786 | Returns the result of converting the quadruple-precision floating-point
4787 | value `a' to the 64-bit two's complement integer format. The conversion
4788 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4789 | Arithmetic, except that the conversion is always rounded toward zero.
4790 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
4791 | the conversion overflows, the largest integer with the same sign as `a' is
4792 | returned.
4793 *----------------------------------------------------------------------------*/
4795 int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
4797 flag aSign;
4798 int32 aExp, shiftCount;
4799 bits64 aSig0, aSig1;
4800 int64 z;
4802 aSig1 = extractFloat128Frac1( a );
4803 aSig0 = extractFloat128Frac0( a );
4804 aExp = extractFloat128Exp( a );
4805 aSign = extractFloat128Sign( a );
4806 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4807 shiftCount = aExp - 0x402F;
4808 if ( 0 < shiftCount ) {
4809 if ( 0x403E <= aExp ) {
4810 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4811 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
4812 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4813 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
4815 else {
4816 float_raise( float_flag_invalid STATUS_VAR);
4817 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4818 return LIT64( 0x7FFFFFFFFFFFFFFF );
4821 return (sbits64) LIT64( 0x8000000000000000 );
4823 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4824 if ( (bits64) ( aSig1<<shiftCount ) ) {
4825 STATUS(float_exception_flags) |= float_flag_inexact;
4828 else {
4829 if ( aExp < 0x3FFF ) {
4830 if ( aExp | aSig0 | aSig1 ) {
4831 STATUS(float_exception_flags) |= float_flag_inexact;
4833 return 0;
4835 z = aSig0>>( - shiftCount );
4836 if ( aSig1
4837 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4838 STATUS(float_exception_flags) |= float_flag_inexact;
4841 if ( aSign ) z = - z;
4842 return z;
4846 /*----------------------------------------------------------------------------
4847 | Returns the result of converting the quadruple-precision floating-point
4848 | value `a' to the single-precision floating-point format. The conversion
4849 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4850 | Arithmetic.
4851 *----------------------------------------------------------------------------*/
4853 float32 float128_to_float32( float128 a STATUS_PARAM )
4855 flag aSign;
4856 int32 aExp;
4857 bits64 aSig0, aSig1;
4858 bits32 zSig;
4860 aSig1 = extractFloat128Frac1( a );
4861 aSig0 = extractFloat128Frac0( a );
4862 aExp = extractFloat128Exp( a );
4863 aSign = extractFloat128Sign( a );
4864 if ( aExp == 0x7FFF ) {
4865 if ( aSig0 | aSig1 ) {
4866 return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) );
4868 return packFloat32( aSign, 0xFF, 0 );
4870 aSig0 |= ( aSig1 != 0 );
4871 shift64RightJamming( aSig0, 18, &aSig0 );
4872 zSig = aSig0;
4873 if ( aExp || zSig ) {
4874 zSig |= 0x40000000;
4875 aExp -= 0x3F81;
4877 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
4881 /*----------------------------------------------------------------------------
4882 | Returns the result of converting the quadruple-precision floating-point
4883 | value `a' to the double-precision floating-point format. The conversion
4884 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4885 | Arithmetic.
4886 *----------------------------------------------------------------------------*/
4888 float64 float128_to_float64( float128 a STATUS_PARAM )
4890 flag aSign;
4891 int32 aExp;
4892 bits64 aSig0, aSig1;
4894 aSig1 = extractFloat128Frac1( a );
4895 aSig0 = extractFloat128Frac0( a );
4896 aExp = extractFloat128Exp( a );
4897 aSign = extractFloat128Sign( a );
4898 if ( aExp == 0x7FFF ) {
4899 if ( aSig0 | aSig1 ) {
4900 return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) );
4902 return packFloat64( aSign, 0x7FF, 0 );
4904 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4905 aSig0 |= ( aSig1 != 0 );
4906 if ( aExp || aSig0 ) {
4907 aSig0 |= LIT64( 0x4000000000000000 );
4908 aExp -= 0x3C01;
4910 return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
4914 #ifdef FLOATX80
4916 /*----------------------------------------------------------------------------
4917 | Returns the result of converting the quadruple-precision floating-point
4918 | value `a' to the extended double-precision floating-point format. The
4919 | conversion is performed according to the IEC/IEEE Standard for Binary
4920 | Floating-Point Arithmetic.
4921 *----------------------------------------------------------------------------*/
4923 floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
4925 flag aSign;
4926 int32 aExp;
4927 bits64 aSig0, aSig1;
4929 aSig1 = extractFloat128Frac1( a );
4930 aSig0 = extractFloat128Frac0( a );
4931 aExp = extractFloat128Exp( a );
4932 aSign = extractFloat128Sign( a );
4933 if ( aExp == 0x7FFF ) {
4934 if ( aSig0 | aSig1 ) {
4935 return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) );
4937 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4939 if ( aExp == 0 ) {
4940 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4941 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4943 else {
4944 aSig0 |= LIT64( 0x0001000000000000 );
4946 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4947 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
4951 #endif
4953 /*----------------------------------------------------------------------------
4954 | Rounds the quadruple-precision floating-point value `a' to an integer, and
4955 | returns the result as a quadruple-precision floating-point value. The
4956 | operation is performed according to the IEC/IEEE Standard for Binary
4957 | Floating-Point Arithmetic.
4958 *----------------------------------------------------------------------------*/
4960 float128 float128_round_to_int( float128 a STATUS_PARAM )
4962 flag aSign;
4963 int32 aExp;
4964 bits64 lastBitMask, roundBitsMask;
4965 int8 roundingMode;
4966 float128 z;
4968 aExp = extractFloat128Exp( a );
4969 if ( 0x402F <= aExp ) {
4970 if ( 0x406F <= aExp ) {
4971 if ( ( aExp == 0x7FFF )
4972 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4974 return propagateFloat128NaN( a, a STATUS_VAR );
4976 return a;
4978 lastBitMask = 1;
4979 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4980 roundBitsMask = lastBitMask - 1;
4981 z = a;
4982 roundingMode = STATUS(float_rounding_mode);
4983 if ( roundingMode == float_round_nearest_even ) {
4984 if ( lastBitMask ) {
4985 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4986 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4988 else {
4989 if ( (sbits64) z.low < 0 ) {
4990 ++z.high;
4991 if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4995 else if ( roundingMode != float_round_to_zero ) {
4996 if ( extractFloat128Sign( z )
4997 ^ ( roundingMode == float_round_up ) ) {
4998 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
5001 z.low &= ~ roundBitsMask;
5003 else {
5004 if ( aExp < 0x3FFF ) {
5005 if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
5006 STATUS(float_exception_flags) |= float_flag_inexact;
5007 aSign = extractFloat128Sign( a );
5008 switch ( STATUS(float_rounding_mode) ) {
5009 case float_round_nearest_even:
5010 if ( ( aExp == 0x3FFE )
5011 && ( extractFloat128Frac0( a )
5012 | extractFloat128Frac1( a ) )
5014 return packFloat128( aSign, 0x3FFF, 0, 0 );
5016 break;
5017 case float_round_down:
5018 return
5019 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
5020 : packFloat128( 0, 0, 0, 0 );
5021 case float_round_up:
5022 return
5023 aSign ? packFloat128( 1, 0, 0, 0 )
5024 : packFloat128( 0, 0x3FFF, 0, 0 );
5026 return packFloat128( aSign, 0, 0, 0 );
5028 lastBitMask = 1;
5029 lastBitMask <<= 0x402F - aExp;
5030 roundBitsMask = lastBitMask - 1;
5031 z.low = 0;
5032 z.high = a.high;
5033 roundingMode = STATUS(float_rounding_mode);
5034 if ( roundingMode == float_round_nearest_even ) {
5035 z.high += lastBitMask>>1;
5036 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
5037 z.high &= ~ lastBitMask;
5040 else if ( roundingMode != float_round_to_zero ) {
5041 if ( extractFloat128Sign( z )
5042 ^ ( roundingMode == float_round_up ) ) {
5043 z.high |= ( a.low != 0 );
5044 z.high += roundBitsMask;
5047 z.high &= ~ roundBitsMask;
5049 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
5050 STATUS(float_exception_flags) |= float_flag_inexact;
5052 return z;
5056 /*----------------------------------------------------------------------------
5057 | Returns the result of adding the absolute values of the quadruple-precision
5058 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
5059 | before being returned. `zSign' is ignored if the result is a NaN.
5060 | The addition is performed according to the IEC/IEEE Standard for Binary
5061 | Floating-Point Arithmetic.
5062 *----------------------------------------------------------------------------*/
5064 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5066 int32 aExp, bExp, zExp;
5067 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5068 int32 expDiff;
5070 aSig1 = extractFloat128Frac1( a );
5071 aSig0 = extractFloat128Frac0( a );
5072 aExp = extractFloat128Exp( a );
5073 bSig1 = extractFloat128Frac1( b );
5074 bSig0 = extractFloat128Frac0( b );
5075 bExp = extractFloat128Exp( b );
5076 expDiff = aExp - bExp;
5077 if ( 0 < expDiff ) {
5078 if ( aExp == 0x7FFF ) {
5079 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5080 return a;
5082 if ( bExp == 0 ) {
5083 --expDiff;
5085 else {
5086 bSig0 |= LIT64( 0x0001000000000000 );
5088 shift128ExtraRightJamming(
5089 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
5090 zExp = aExp;
5092 else if ( expDiff < 0 ) {
5093 if ( bExp == 0x7FFF ) {
5094 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5095 return packFloat128( zSign, 0x7FFF, 0, 0 );
5097 if ( aExp == 0 ) {
5098 ++expDiff;
5100 else {
5101 aSig0 |= LIT64( 0x0001000000000000 );
5103 shift128ExtraRightJamming(
5104 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
5105 zExp = bExp;
5107 else {
5108 if ( aExp == 0x7FFF ) {
5109 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5110 return propagateFloat128NaN( a, b STATUS_VAR );
5112 return a;
5114 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5115 if ( aExp == 0 ) {
5116 if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
5117 return packFloat128( zSign, 0, zSig0, zSig1 );
5119 zSig2 = 0;
5120 zSig0 |= LIT64( 0x0002000000000000 );
5121 zExp = aExp;
5122 goto shiftRight1;
5124 aSig0 |= LIT64( 0x0001000000000000 );
5125 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5126 --zExp;
5127 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
5128 ++zExp;
5129 shiftRight1:
5130 shift128ExtraRightJamming(
5131 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5132 roundAndPack:
5133 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5137 /*----------------------------------------------------------------------------
5138 | Returns the result of subtracting the absolute values of the quadruple-
5139 | precision floating-point values `a' and `b'. If `zSign' is 1, the
5140 | difference is negated before being returned. `zSign' is ignored if the
5141 | result is a NaN. The subtraction is performed according to the IEC/IEEE
5142 | Standard for Binary Floating-Point Arithmetic.
5143 *----------------------------------------------------------------------------*/
5145 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5147 int32 aExp, bExp, zExp;
5148 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
5149 int32 expDiff;
5150 float128 z;
5152 aSig1 = extractFloat128Frac1( a );
5153 aSig0 = extractFloat128Frac0( a );
5154 aExp = extractFloat128Exp( a );
5155 bSig1 = extractFloat128Frac1( b );
5156 bSig0 = extractFloat128Frac0( b );
5157 bExp = extractFloat128Exp( b );
5158 expDiff = aExp - bExp;
5159 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5160 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
5161 if ( 0 < expDiff ) goto aExpBigger;
5162 if ( expDiff < 0 ) goto bExpBigger;
5163 if ( aExp == 0x7FFF ) {
5164 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5165 return propagateFloat128NaN( a, b STATUS_VAR );
5167 float_raise( float_flag_invalid STATUS_VAR);
5168 z.low = float128_default_nan_low;
5169 z.high = float128_default_nan_high;
5170 return z;
5172 if ( aExp == 0 ) {
5173 aExp = 1;
5174 bExp = 1;
5176 if ( bSig0 < aSig0 ) goto aBigger;
5177 if ( aSig0 < bSig0 ) goto bBigger;
5178 if ( bSig1 < aSig1 ) goto aBigger;
5179 if ( aSig1 < bSig1 ) goto bBigger;
5180 return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
5181 bExpBigger:
5182 if ( bExp == 0x7FFF ) {
5183 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5184 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
5186 if ( aExp == 0 ) {
5187 ++expDiff;
5189 else {
5190 aSig0 |= LIT64( 0x4000000000000000 );
5192 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5193 bSig0 |= LIT64( 0x4000000000000000 );
5194 bBigger:
5195 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
5196 zExp = bExp;
5197 zSign ^= 1;
5198 goto normalizeRoundAndPack;
5199 aExpBigger:
5200 if ( aExp == 0x7FFF ) {
5201 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5202 return a;
5204 if ( bExp == 0 ) {
5205 --expDiff;
5207 else {
5208 bSig0 |= LIT64( 0x4000000000000000 );
5210 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
5211 aSig0 |= LIT64( 0x4000000000000000 );
5212 aBigger:
5213 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5214 zExp = aExp;
5215 normalizeRoundAndPack:
5216 --zExp;
5217 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
5221 /*----------------------------------------------------------------------------
5222 | Returns the result of adding the quadruple-precision floating-point values
5223 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
5224 | for Binary Floating-Point Arithmetic.
5225 *----------------------------------------------------------------------------*/
5227 float128 float128_add( float128 a, float128 b STATUS_PARAM )
5229 flag aSign, bSign;
5231 aSign = extractFloat128Sign( a );
5232 bSign = extractFloat128Sign( b );
5233 if ( aSign == bSign ) {
5234 return addFloat128Sigs( a, b, aSign STATUS_VAR );
5236 else {
5237 return subFloat128Sigs( a, b, aSign STATUS_VAR );
5242 /*----------------------------------------------------------------------------
5243 | Returns the result of subtracting the quadruple-precision floating-point
5244 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5245 | Standard for Binary Floating-Point Arithmetic.
5246 *----------------------------------------------------------------------------*/
5248 float128 float128_sub( float128 a, float128 b STATUS_PARAM )
5250 flag aSign, bSign;
5252 aSign = extractFloat128Sign( a );
5253 bSign = extractFloat128Sign( b );
5254 if ( aSign == bSign ) {
5255 return subFloat128Sigs( a, b, aSign STATUS_VAR );
5257 else {
5258 return addFloat128Sigs( a, b, aSign STATUS_VAR );
5263 /*----------------------------------------------------------------------------
5264 | Returns the result of multiplying the quadruple-precision floating-point
5265 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5266 | Standard for Binary Floating-Point Arithmetic.
5267 *----------------------------------------------------------------------------*/
5269 float128 float128_mul( float128 a, float128 b STATUS_PARAM )
5271 flag aSign, bSign, zSign;
5272 int32 aExp, bExp, zExp;
5273 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
5274 float128 z;
5276 aSig1 = extractFloat128Frac1( a );
5277 aSig0 = extractFloat128Frac0( a );
5278 aExp = extractFloat128Exp( a );
5279 aSign = extractFloat128Sign( a );
5280 bSig1 = extractFloat128Frac1( b );
5281 bSig0 = extractFloat128Frac0( b );
5282 bExp = extractFloat128Exp( b );
5283 bSign = extractFloat128Sign( b );
5284 zSign = aSign ^ bSign;
5285 if ( aExp == 0x7FFF ) {
5286 if ( ( aSig0 | aSig1 )
5287 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5288 return propagateFloat128NaN( a, b STATUS_VAR );
5290 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
5291 return packFloat128( zSign, 0x7FFF, 0, 0 );
5293 if ( bExp == 0x7FFF ) {
5294 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5295 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5296 invalid:
5297 float_raise( float_flag_invalid STATUS_VAR);
5298 z.low = float128_default_nan_low;
5299 z.high = float128_default_nan_high;
5300 return z;
5302 return packFloat128( zSign, 0x7FFF, 0, 0 );
5304 if ( aExp == 0 ) {
5305 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5306 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5308 if ( bExp == 0 ) {
5309 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5310 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5312 zExp = aExp + bExp - 0x4000;
5313 aSig0 |= LIT64( 0x0001000000000000 );
5314 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5315 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5316 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5317 zSig2 |= ( zSig3 != 0 );
5318 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5319 shift128ExtraRightJamming(
5320 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5321 ++zExp;
5323 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5327 /*----------------------------------------------------------------------------
5328 | Returns the result of dividing the quadruple-precision floating-point value
5329 | `a' by the corresponding value `b'. The operation is performed according to
5330 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5331 *----------------------------------------------------------------------------*/
5333 float128 float128_div( float128 a, float128 b STATUS_PARAM )
5335 flag aSign, bSign, zSign;
5336 int32 aExp, bExp, zExp;
5337 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5338 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5339 float128 z;
5341 aSig1 = extractFloat128Frac1( a );
5342 aSig0 = extractFloat128Frac0( a );
5343 aExp = extractFloat128Exp( a );
5344 aSign = extractFloat128Sign( a );
5345 bSig1 = extractFloat128Frac1( b );
5346 bSig0 = extractFloat128Frac0( b );
5347 bExp = extractFloat128Exp( b );
5348 bSign = extractFloat128Sign( b );
5349 zSign = aSign ^ bSign;
5350 if ( aExp == 0x7FFF ) {
5351 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5352 if ( bExp == 0x7FFF ) {
5353 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5354 goto invalid;
5356 return packFloat128( zSign, 0x7FFF, 0, 0 );
5358 if ( bExp == 0x7FFF ) {
5359 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5360 return packFloat128( zSign, 0, 0, 0 );
5362 if ( bExp == 0 ) {
5363 if ( ( bSig0 | bSig1 ) == 0 ) {
5364 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5365 invalid:
5366 float_raise( float_flag_invalid STATUS_VAR);
5367 z.low = float128_default_nan_low;
5368 z.high = float128_default_nan_high;
5369 return z;
5371 float_raise( float_flag_divbyzero STATUS_VAR);
5372 return packFloat128( zSign, 0x7FFF, 0, 0 );
5374 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5376 if ( aExp == 0 ) {
5377 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5378 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5380 zExp = aExp - bExp + 0x3FFD;
5381 shortShift128Left(
5382 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5383 shortShift128Left(
5384 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5385 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5386 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5387 ++zExp;
5389 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5390 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5391 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5392 while ( (sbits64) rem0 < 0 ) {
5393 --zSig0;
5394 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5396 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5397 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5398 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5399 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5400 while ( (sbits64) rem1 < 0 ) {
5401 --zSig1;
5402 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5404 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5406 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5407 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5411 /*----------------------------------------------------------------------------
5412 | Returns the remainder of the quadruple-precision floating-point value `a'
5413 | with respect to the corresponding value `b'. The operation is performed
5414 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5415 *----------------------------------------------------------------------------*/
5417 float128 float128_rem( float128 a, float128 b STATUS_PARAM )
5419 flag aSign, zSign;
5420 int32 aExp, bExp, expDiff;
5421 bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5422 bits64 allZero, alternateASig0, alternateASig1, sigMean1;
5423 sbits64 sigMean0;
5424 float128 z;
5426 aSig1 = extractFloat128Frac1( a );
5427 aSig0 = extractFloat128Frac0( a );
5428 aExp = extractFloat128Exp( a );
5429 aSign = extractFloat128Sign( a );
5430 bSig1 = extractFloat128Frac1( b );
5431 bSig0 = extractFloat128Frac0( b );
5432 bExp = extractFloat128Exp( b );
5433 if ( aExp == 0x7FFF ) {
5434 if ( ( aSig0 | aSig1 )
5435 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5436 return propagateFloat128NaN( a, b STATUS_VAR );
5438 goto invalid;
5440 if ( bExp == 0x7FFF ) {
5441 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5442 return a;
5444 if ( bExp == 0 ) {
5445 if ( ( bSig0 | bSig1 ) == 0 ) {
5446 invalid:
5447 float_raise( float_flag_invalid STATUS_VAR);
5448 z.low = float128_default_nan_low;
5449 z.high = float128_default_nan_high;
5450 return z;
5452 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5454 if ( aExp == 0 ) {
5455 if ( ( aSig0 | aSig1 ) == 0 ) return a;
5456 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5458 expDiff = aExp - bExp;
5459 if ( expDiff < -1 ) return a;
5460 shortShift128Left(
5461 aSig0 | LIT64( 0x0001000000000000 ),
5462 aSig1,
5463 15 - ( expDiff < 0 ),
5464 &aSig0,
5465 &aSig1
5467 shortShift128Left(
5468 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5469 q = le128( bSig0, bSig1, aSig0, aSig1 );
5470 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5471 expDiff -= 64;
5472 while ( 0 < expDiff ) {
5473 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5474 q = ( 4 < q ) ? q - 4 : 0;
5475 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5476 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5477 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5478 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5479 expDiff -= 61;
5481 if ( -64 < expDiff ) {
5482 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5483 q = ( 4 < q ) ? q - 4 : 0;
5484 q >>= - expDiff;
5485 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5486 expDiff += 52;
5487 if ( expDiff < 0 ) {
5488 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5490 else {
5491 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5493 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5494 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5496 else {
5497 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5498 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5500 do {
5501 alternateASig0 = aSig0;
5502 alternateASig1 = aSig1;
5503 ++q;
5504 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5505 } while ( 0 <= (sbits64) aSig0 );
5506 add128(
5507 aSig0, aSig1, alternateASig0, alternateASig1, (bits64 *)&sigMean0, &sigMean1 );
5508 if ( ( sigMean0 < 0 )
5509 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5510 aSig0 = alternateASig0;
5511 aSig1 = alternateASig1;
5513 zSign = ( (sbits64) aSig0 < 0 );
5514 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5515 return
5516 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
5520 /*----------------------------------------------------------------------------
5521 | Returns the square root of the quadruple-precision floating-point value `a'.
5522 | The operation is performed according to the IEC/IEEE Standard for Binary
5523 | Floating-Point Arithmetic.
5524 *----------------------------------------------------------------------------*/
5526 float128 float128_sqrt( float128 a STATUS_PARAM )
5528 flag aSign;
5529 int32 aExp, zExp;
5530 bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5531 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5532 float128 z;
5534 aSig1 = extractFloat128Frac1( a );
5535 aSig0 = extractFloat128Frac0( a );
5536 aExp = extractFloat128Exp( a );
5537 aSign = extractFloat128Sign( a );
5538 if ( aExp == 0x7FFF ) {
5539 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
5540 if ( ! aSign ) return a;
5541 goto invalid;
5543 if ( aSign ) {
5544 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5545 invalid:
5546 float_raise( float_flag_invalid STATUS_VAR);
5547 z.low = float128_default_nan_low;
5548 z.high = float128_default_nan_high;
5549 return z;
5551 if ( aExp == 0 ) {
5552 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5553 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5555 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5556 aSig0 |= LIT64( 0x0001000000000000 );
5557 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5558 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5559 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5560 doubleZSig0 = zSig0<<1;
5561 mul64To128( zSig0, zSig0, &term0, &term1 );
5562 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5563 while ( (sbits64) rem0 < 0 ) {
5564 --zSig0;
5565 doubleZSig0 -= 2;
5566 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5568 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5569 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5570 if ( zSig1 == 0 ) zSig1 = 1;
5571 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5572 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5573 mul64To128( zSig1, zSig1, &term2, &term3 );
5574 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5575 while ( (sbits64) rem1 < 0 ) {
5576 --zSig1;
5577 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5578 term3 |= 1;
5579 term2 |= doubleZSig0;
5580 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5582 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5584 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5585 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5589 /*----------------------------------------------------------------------------
5590 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5591 | the corresponding value `b', and 0 otherwise. The comparison is performed
5592 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5593 *----------------------------------------------------------------------------*/
5595 int float128_eq( float128 a, float128 b STATUS_PARAM )
5598 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5599 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5600 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5601 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5603 if ( float128_is_signaling_nan( a )
5604 || float128_is_signaling_nan( b ) ) {
5605 float_raise( float_flag_invalid STATUS_VAR);
5607 return 0;
5609 return
5610 ( a.low == b.low )
5611 && ( ( a.high == b.high )
5612 || ( ( a.low == 0 )
5613 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5618 /*----------------------------------------------------------------------------
5619 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5620 | or equal to the corresponding value `b', and 0 otherwise. The comparison
5621 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5622 | Arithmetic.
5623 *----------------------------------------------------------------------------*/
5625 int float128_le( float128 a, float128 b STATUS_PARAM )
5627 flag aSign, bSign;
5629 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5630 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5631 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5632 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5634 float_raise( float_flag_invalid STATUS_VAR);
5635 return 0;
5637 aSign = extractFloat128Sign( a );
5638 bSign = extractFloat128Sign( b );
5639 if ( aSign != bSign ) {
5640 return
5641 aSign
5642 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5643 == 0 );
5645 return
5646 aSign ? le128( b.high, b.low, a.high, a.low )
5647 : le128( a.high, a.low, b.high, b.low );
5651 /*----------------------------------------------------------------------------
5652 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5653 | the corresponding value `b', and 0 otherwise. The comparison is performed
5654 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5655 *----------------------------------------------------------------------------*/
5657 int float128_lt( float128 a, float128 b STATUS_PARAM )
5659 flag aSign, bSign;
5661 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5662 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5663 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5664 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5666 float_raise( float_flag_invalid STATUS_VAR);
5667 return 0;
5669 aSign = extractFloat128Sign( a );
5670 bSign = extractFloat128Sign( b );
5671 if ( aSign != bSign ) {
5672 return
5673 aSign
5674 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5675 != 0 );
5677 return
5678 aSign ? lt128( b.high, b.low, a.high, a.low )
5679 : lt128( a.high, a.low, b.high, b.low );
5683 /*----------------------------------------------------------------------------
5684 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5685 | the corresponding value `b', and 0 otherwise. The invalid exception is
5686 | raised if either operand is a NaN. Otherwise, the comparison is performed
5687 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5688 *----------------------------------------------------------------------------*/
5690 int float128_eq_signaling( float128 a, float128 b STATUS_PARAM )
5693 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5694 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5695 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5696 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5698 float_raise( float_flag_invalid STATUS_VAR);
5699 return 0;
5701 return
5702 ( a.low == b.low )
5703 && ( ( a.high == b.high )
5704 || ( ( a.low == 0 )
5705 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5710 /*----------------------------------------------------------------------------
5711 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5712 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5713 | cause an exception. Otherwise, the comparison is performed according to the
5714 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5715 *----------------------------------------------------------------------------*/
5717 int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
5719 flag aSign, bSign;
5721 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5722 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5723 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5724 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5726 if ( float128_is_signaling_nan( a )
5727 || float128_is_signaling_nan( b ) ) {
5728 float_raise( float_flag_invalid STATUS_VAR);
5730 return 0;
5732 aSign = extractFloat128Sign( a );
5733 bSign = extractFloat128Sign( b );
5734 if ( aSign != bSign ) {
5735 return
5736 aSign
5737 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5738 == 0 );
5740 return
5741 aSign ? le128( b.high, b.low, a.high, a.low )
5742 : le128( a.high, a.low, b.high, b.low );
5746 /*----------------------------------------------------------------------------
5747 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5748 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5749 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
5750 | Standard for Binary Floating-Point Arithmetic.
5751 *----------------------------------------------------------------------------*/
5753 int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
5755 flag aSign, bSign;
5757 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5758 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5759 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5760 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5762 if ( float128_is_signaling_nan( a )
5763 || float128_is_signaling_nan( b ) ) {
5764 float_raise( float_flag_invalid STATUS_VAR);
5766 return 0;
5768 aSign = extractFloat128Sign( a );
5769 bSign = extractFloat128Sign( b );
5770 if ( aSign != bSign ) {
5771 return
5772 aSign
5773 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5774 != 0 );
5776 return
5777 aSign ? lt128( b.high, b.low, a.high, a.low )
5778 : lt128( a.high, a.low, b.high, b.low );
5782 #endif
5784 /* misc functions */
5785 float32 uint32_to_float32( unsigned int a STATUS_PARAM )
5787 return int64_to_float32(a STATUS_VAR);
5790 float64 uint32_to_float64( unsigned int a STATUS_PARAM )
5792 return int64_to_float64(a STATUS_VAR);
5795 unsigned int float32_to_uint32( float32 a STATUS_PARAM )
5797 int64_t v;
5798 unsigned int res;
5800 v = float32_to_int64(a STATUS_VAR);
5801 if (v < 0) {
5802 res = 0;
5803 float_raise( float_flag_invalid STATUS_VAR);
5804 } else if (v > 0xffffffff) {
5805 res = 0xffffffff;
5806 float_raise( float_flag_invalid STATUS_VAR);
5807 } else {
5808 res = v;
5810 return res;
5813 unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
5815 int64_t v;
5816 unsigned int res;
5818 v = float32_to_int64_round_to_zero(a STATUS_VAR);
5819 if (v < 0) {
5820 res = 0;
5821 float_raise( float_flag_invalid STATUS_VAR);
5822 } else if (v > 0xffffffff) {
5823 res = 0xffffffff;
5824 float_raise( float_flag_invalid STATUS_VAR);
5825 } else {
5826 res = v;
5828 return res;
5831 unsigned int float32_to_uint16_round_to_zero( float32 a STATUS_PARAM )
5833 int64_t v;
5834 unsigned int res;
5836 v = float32_to_int64_round_to_zero(a STATUS_VAR);
5837 if (v < 0) {
5838 res = 0;
5839 float_raise( float_flag_invalid STATUS_VAR);
5840 } else if (v > 0xffff) {
5841 res = 0xffff;
5842 float_raise( float_flag_invalid STATUS_VAR);
5843 } else {
5844 res = v;
5846 return res;
5849 unsigned int float64_to_uint32( float64 a STATUS_PARAM )
5851 int64_t v;
5852 unsigned int res;
5854 v = float64_to_int64(a STATUS_VAR);
5855 if (v < 0) {
5856 res = 0;
5857 float_raise( float_flag_invalid STATUS_VAR);
5858 } else if (v > 0xffffffff) {
5859 res = 0xffffffff;
5860 float_raise( float_flag_invalid STATUS_VAR);
5861 } else {
5862 res = v;
5864 return res;
5867 unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
5869 int64_t v;
5870 unsigned int res;
5872 v = float64_to_int64_round_to_zero(a STATUS_VAR);
5873 if (v < 0) {
5874 res = 0;
5875 float_raise( float_flag_invalid STATUS_VAR);
5876 } else if (v > 0xffffffff) {
5877 res = 0xffffffff;
5878 float_raise( float_flag_invalid STATUS_VAR);
5879 } else {
5880 res = v;
5882 return res;
5885 unsigned int float64_to_uint16_round_to_zero( float64 a STATUS_PARAM )
5887 int64_t v;
5888 unsigned int res;
5890 v = float64_to_int64_round_to_zero(a STATUS_VAR);
5891 if (v < 0) {
5892 res = 0;
5893 float_raise( float_flag_invalid STATUS_VAR);
5894 } else if (v > 0xffff) {
5895 res = 0xffff;
5896 float_raise( float_flag_invalid STATUS_VAR);
5897 } else {
5898 res = v;
5900 return res;
5903 /* FIXME: This looks broken. */
5904 uint64_t float64_to_uint64 (float64 a STATUS_PARAM)
5906 int64_t v;
5908 v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
5909 v += float64_val(a);
5910 v = float64_to_int64(make_float64(v) STATUS_VAR);
5912 return v - INT64_MIN;
5915 uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
5917 int64_t v;
5919 v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
5920 v += float64_val(a);
5921 v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR);
5923 return v - INT64_MIN;
5926 #define COMPARE(s, nan_exp) \
5927 INLINE int float ## s ## _compare_internal( float ## s a, float ## s b, \
5928 int is_quiet STATUS_PARAM ) \
5930 flag aSign, bSign; \
5931 bits ## s av, bv; \
5932 a = float ## s ## _squash_input_denormal(a STATUS_VAR); \
5933 b = float ## s ## _squash_input_denormal(b STATUS_VAR); \
5935 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \
5936 extractFloat ## s ## Frac( a ) ) || \
5937 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \
5938 extractFloat ## s ## Frac( b ) )) { \
5939 if (!is_quiet || \
5940 float ## s ## _is_signaling_nan( a ) || \
5941 float ## s ## _is_signaling_nan( b ) ) { \
5942 float_raise( float_flag_invalid STATUS_VAR); \
5944 return float_relation_unordered; \
5946 aSign = extractFloat ## s ## Sign( a ); \
5947 bSign = extractFloat ## s ## Sign( b ); \
5948 av = float ## s ## _val(a); \
5949 bv = float ## s ## _val(b); \
5950 if ( aSign != bSign ) { \
5951 if ( (bits ## s) ( ( av | bv )<<1 ) == 0 ) { \
5952 /* zero case */ \
5953 return float_relation_equal; \
5954 } else { \
5955 return 1 - (2 * aSign); \
5957 } else { \
5958 if (av == bv) { \
5959 return float_relation_equal; \
5960 } else { \
5961 return 1 - 2 * (aSign ^ ( av < bv )); \
5966 int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM ) \
5968 return float ## s ## _compare_internal(a, b, 0 STATUS_VAR); \
5971 int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM ) \
5973 return float ## s ## _compare_internal(a, b, 1 STATUS_VAR); \
5976 COMPARE(32, 0xff)
5977 COMPARE(64, 0x7ff)
5979 INLINE int float128_compare_internal( float128 a, float128 b,
5980 int is_quiet STATUS_PARAM )
5982 flag aSign, bSign;
5984 if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
5985 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
5986 ( ( extractFloat128Exp( b ) == 0x7fff ) &&
5987 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
5988 if (!is_quiet ||
5989 float128_is_signaling_nan( a ) ||
5990 float128_is_signaling_nan( b ) ) {
5991 float_raise( float_flag_invalid STATUS_VAR);
5993 return float_relation_unordered;
5995 aSign = extractFloat128Sign( a );
5996 bSign = extractFloat128Sign( b );
5997 if ( aSign != bSign ) {
5998 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
5999 /* zero case */
6000 return float_relation_equal;
6001 } else {
6002 return 1 - (2 * aSign);
6004 } else {
6005 if (a.low == b.low && a.high == b.high) {
6006 return float_relation_equal;
6007 } else {
6008 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6013 int float128_compare( float128 a, float128 b STATUS_PARAM )
6015 return float128_compare_internal(a, b, 0 STATUS_VAR);
6018 int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
6020 return float128_compare_internal(a, b, 1 STATUS_VAR);
6023 /* Multiply A by 2 raised to the power N. */
6024 float32 float32_scalbn( float32 a, int n STATUS_PARAM )
6026 flag aSign;
6027 int16 aExp;
6028 bits32 aSig;
6030 a = float32_squash_input_denormal(a STATUS_VAR);
6031 aSig = extractFloat32Frac( a );
6032 aExp = extractFloat32Exp( a );
6033 aSign = extractFloat32Sign( a );
6035 if ( aExp == 0xFF ) {
6036 return a;
6038 if ( aExp != 0 )
6039 aSig |= 0x00800000;
6040 else if ( aSig == 0 )
6041 return a;
6043 aExp += n - 1;
6044 aSig <<= 7;
6045 return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
6048 float64 float64_scalbn( float64 a, int n STATUS_PARAM )
6050 flag aSign;
6051 int16 aExp;
6052 bits64 aSig;
6054 a = float64_squash_input_denormal(a STATUS_VAR);
6055 aSig = extractFloat64Frac( a );
6056 aExp = extractFloat64Exp( a );
6057 aSign = extractFloat64Sign( a );
6059 if ( aExp == 0x7FF ) {
6060 return a;
6062 if ( aExp != 0 )
6063 aSig |= LIT64( 0x0010000000000000 );
6064 else if ( aSig == 0 )
6065 return a;
6067 aExp += n - 1;
6068 aSig <<= 10;
6069 return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
6072 #ifdef FLOATX80
6073 floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
6075 flag aSign;
6076 int16 aExp;
6077 bits64 aSig;
6079 aSig = extractFloatx80Frac( a );
6080 aExp = extractFloatx80Exp( a );
6081 aSign = extractFloatx80Sign( a );
6083 if ( aExp == 0x7FF ) {
6084 return a;
6086 if (aExp == 0 && aSig == 0)
6087 return a;
6089 aExp += n;
6090 return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
6091 aSign, aExp, aSig, 0 STATUS_VAR );
6093 #endif
6095 #ifdef FLOAT128
6096 float128 float128_scalbn( float128 a, int n STATUS_PARAM )
6098 flag aSign;
6099 int32 aExp;
6100 bits64 aSig0, aSig1;
6102 aSig1 = extractFloat128Frac1( a );
6103 aSig0 = extractFloat128Frac0( a );
6104 aExp = extractFloat128Exp( a );
6105 aSign = extractFloat128Sign( a );
6106 if ( aExp == 0x7FFF ) {
6107 return a;
6109 if ( aExp != 0 )
6110 aSig0 |= LIT64( 0x0001000000000000 );
6111 else if ( aSig0 == 0 && aSig1 == 0 )
6112 return a;
6114 aExp += n - 1;
6115 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
6116 STATUS_VAR );
6119 #endif