softfloat: Fix single-to-half precision float conversions
[qemu.git] / fpu / softfloat.c
blob80d8cc489454bab39bfeb6e57793e678946c477e
2 /*============================================================================
4 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
5 Package, Release 2b.
7 Written by John R. Hauser. This work was made possible in part by the
8 International Computer Science Institute, located at Suite 600, 1947 Center
9 Street, Berkeley, California 94704. Funding was partially provided by the
10 National Science Foundation under grant MIP-9311980. The original version
11 of this code was written as part of a project to build a fixed-point vector
12 processor in collaboration with the University of California at Berkeley,
13 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
14 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
15 arithmetic/SoftFloat.html'.
17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
18 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
19 RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
20 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
21 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
22 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
23 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
24 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
26 Derivative works are acceptable, even for commercial purposes, so long as
27 (1) the source code for the derivative work includes prominent notice that
28 the work is derivative, and (2) the source code includes prominent notice with
29 these four paragraphs for those parts of this code that are retained.
31 =============================================================================*/
33 #include "softfloat.h"
35 /*----------------------------------------------------------------------------
36 | Primitive arithmetic functions, including multi-word arithmetic, and
37 | division and square root approximations. (Can be specialized to target if
38 | desired.)
39 *----------------------------------------------------------------------------*/
40 #include "softfloat-macros.h"
42 /*----------------------------------------------------------------------------
43 | Functions and definitions to determine: (1) whether tininess for underflow
44 | is detected before or after rounding by default, (2) what (if anything)
45 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
46 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
47 | are propagated from function inputs to output. These details are target-
48 | specific.
49 *----------------------------------------------------------------------------*/
50 #include "softfloat-specialize.h"
52 void set_float_rounding_mode(int val STATUS_PARAM)
54 STATUS(float_rounding_mode) = val;
57 void set_float_exception_flags(int val STATUS_PARAM)
59 STATUS(float_exception_flags) = val;
62 #ifdef FLOATX80
63 void set_floatx80_rounding_precision(int val STATUS_PARAM)
65 STATUS(floatx80_rounding_precision) = val;
67 #endif
69 /*----------------------------------------------------------------------------
70 | Returns the fraction bits of the half-precision floating-point value `a'.
71 *----------------------------------------------------------------------------*/
73 INLINE uint32_t extractFloat16Frac(float16 a)
75 return float16_val(a) & 0x3ff;
78 /*----------------------------------------------------------------------------
79 | Returns the exponent bits of the half-precision floating-point value `a'.
80 *----------------------------------------------------------------------------*/
82 INLINE int16 extractFloat16Exp(float16 a)
84 return (float16_val(a) >> 10) & 0x1f;
87 /*----------------------------------------------------------------------------
88 | Returns the sign bit of the single-precision floating-point value `a'.
89 *----------------------------------------------------------------------------*/
91 INLINE flag extractFloat16Sign(float16 a)
93 return float16_val(a)>>15;
96 /*----------------------------------------------------------------------------
97 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
98 | and 7, and returns the properly rounded 32-bit integer corresponding to the
99 | input. If `zSign' is 1, the input is negated before being converted to an
100 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
101 | is simply rounded to an integer, with the inexact exception raised if the
102 | input cannot be represented exactly as an integer. However, if the fixed-
103 | point input is too large, the invalid exception is raised and the largest
104 | positive or negative integer is returned.
105 *----------------------------------------------------------------------------*/
107 static int32 roundAndPackInt32( flag zSign, bits64 absZ STATUS_PARAM)
109 int8 roundingMode;
110 flag roundNearestEven;
111 int8 roundIncrement, roundBits;
112 int32 z;
114 roundingMode = STATUS(float_rounding_mode);
115 roundNearestEven = ( roundingMode == float_round_nearest_even );
116 roundIncrement = 0x40;
117 if ( ! roundNearestEven ) {
118 if ( roundingMode == float_round_to_zero ) {
119 roundIncrement = 0;
121 else {
122 roundIncrement = 0x7F;
123 if ( zSign ) {
124 if ( roundingMode == float_round_up ) roundIncrement = 0;
126 else {
127 if ( roundingMode == float_round_down ) roundIncrement = 0;
131 roundBits = absZ & 0x7F;
132 absZ = ( absZ + roundIncrement )>>7;
133 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
134 z = absZ;
135 if ( zSign ) z = - z;
136 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
137 float_raise( float_flag_invalid STATUS_VAR);
138 return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
140 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
141 return z;
145 /*----------------------------------------------------------------------------
146 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
147 | `absZ1', with binary point between bits 63 and 64 (between the input words),
148 | and returns the properly rounded 64-bit integer corresponding to the input.
149 | If `zSign' is 1, the input is negated before being converted to an integer.
150 | Ordinarily, the fixed-point input is simply rounded to an integer, with
151 | the inexact exception raised if the input cannot be represented exactly as
152 | an integer. However, if the fixed-point input is too large, the invalid
153 | exception is raised and the largest positive or negative integer is
154 | returned.
155 *----------------------------------------------------------------------------*/
157 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 STATUS_PARAM)
159 int8 roundingMode;
160 flag roundNearestEven, increment;
161 int64 z;
163 roundingMode = STATUS(float_rounding_mode);
164 roundNearestEven = ( roundingMode == float_round_nearest_even );
165 increment = ( (sbits64) absZ1 < 0 );
166 if ( ! roundNearestEven ) {
167 if ( roundingMode == float_round_to_zero ) {
168 increment = 0;
170 else {
171 if ( zSign ) {
172 increment = ( roundingMode == float_round_down ) && absZ1;
174 else {
175 increment = ( roundingMode == float_round_up ) && absZ1;
179 if ( increment ) {
180 ++absZ0;
181 if ( absZ0 == 0 ) goto overflow;
182 absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
184 z = absZ0;
185 if ( zSign ) z = - z;
186 if ( z && ( ( z < 0 ) ^ zSign ) ) {
187 overflow:
188 float_raise( float_flag_invalid STATUS_VAR);
189 return
190 zSign ? (sbits64) LIT64( 0x8000000000000000 )
191 : LIT64( 0x7FFFFFFFFFFFFFFF );
193 if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
194 return z;
198 /*----------------------------------------------------------------------------
199 | Returns the fraction bits of the single-precision floating-point value `a'.
200 *----------------------------------------------------------------------------*/
202 INLINE bits32 extractFloat32Frac( float32 a )
205 return float32_val(a) & 0x007FFFFF;
209 /*----------------------------------------------------------------------------
210 | Returns the exponent bits of the single-precision floating-point value `a'.
211 *----------------------------------------------------------------------------*/
213 INLINE int16 extractFloat32Exp( float32 a )
216 return ( float32_val(a)>>23 ) & 0xFF;
220 /*----------------------------------------------------------------------------
221 | Returns the sign bit of the single-precision floating-point value `a'.
222 *----------------------------------------------------------------------------*/
224 INLINE flag extractFloat32Sign( float32 a )
227 return float32_val(a)>>31;
231 /*----------------------------------------------------------------------------
232 | If `a' is denormal and we are in flush-to-zero mode then set the
233 | input-denormal exception and return zero. Otherwise just return the value.
234 *----------------------------------------------------------------------------*/
235 static float32 float32_squash_input_denormal(float32 a STATUS_PARAM)
237 if (STATUS(flush_inputs_to_zero)) {
238 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
239 float_raise(float_flag_input_denormal STATUS_VAR);
240 return make_float32(float32_val(a) & 0x80000000);
243 return a;
246 /*----------------------------------------------------------------------------
247 | Normalizes the subnormal single-precision floating-point value represented
248 | by the denormalized significand `aSig'. The normalized exponent and
249 | significand are stored at the locations pointed to by `zExpPtr' and
250 | `zSigPtr', respectively.
251 *----------------------------------------------------------------------------*/
253 static void
254 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
256 int8 shiftCount;
258 shiftCount = countLeadingZeros32( aSig ) - 8;
259 *zSigPtr = aSig<<shiftCount;
260 *zExpPtr = 1 - shiftCount;
264 /*----------------------------------------------------------------------------
265 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
266 | single-precision floating-point value, returning the result. After being
267 | shifted into the proper positions, the three fields are simply added
268 | together to form the result. This means that any integer portion of `zSig'
269 | will be added into the exponent. Since a properly normalized significand
270 | will have an integer portion equal to 1, the `zExp' input should be 1 less
271 | than the desired result exponent whenever `zSig' is a complete, normalized
272 | significand.
273 *----------------------------------------------------------------------------*/
275 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
278 return make_float32(
279 ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig);
283 /*----------------------------------------------------------------------------
284 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
285 | and significand `zSig', and returns the proper single-precision floating-
286 | point value corresponding to the abstract input. Ordinarily, the abstract
287 | value is simply rounded and packed into the single-precision format, with
288 | the inexact exception raised if the abstract input cannot be represented
289 | exactly. However, if the abstract value is too large, the overflow and
290 | inexact exceptions are raised and an infinity or maximal finite value is
291 | returned. If the abstract value is too small, the input value is rounded to
292 | a subnormal number, and the underflow and inexact exceptions are raised if
293 | the abstract input cannot be represented exactly as a subnormal single-
294 | precision floating-point number.
295 | The input significand `zSig' has its binary point between bits 30
296 | and 29, which is 7 bits to the left of the usual location. This shifted
297 | significand must be normalized or smaller. If `zSig' is not normalized,
298 | `zExp' must be 0; in that case, the result returned is a subnormal number,
299 | and it must not require rounding. In the usual case that `zSig' is
300 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
301 | The handling of underflow and overflow follows the IEC/IEEE Standard for
302 | Binary Floating-Point Arithmetic.
303 *----------------------------------------------------------------------------*/
305 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
307 int8 roundingMode;
308 flag roundNearestEven;
309 int8 roundIncrement, roundBits;
310 flag isTiny;
312 roundingMode = STATUS(float_rounding_mode);
313 roundNearestEven = ( roundingMode == float_round_nearest_even );
314 roundIncrement = 0x40;
315 if ( ! roundNearestEven ) {
316 if ( roundingMode == float_round_to_zero ) {
317 roundIncrement = 0;
319 else {
320 roundIncrement = 0x7F;
321 if ( zSign ) {
322 if ( roundingMode == float_round_up ) roundIncrement = 0;
324 else {
325 if ( roundingMode == float_round_down ) roundIncrement = 0;
329 roundBits = zSig & 0x7F;
330 if ( 0xFD <= (bits16) zExp ) {
331 if ( ( 0xFD < zExp )
332 || ( ( zExp == 0xFD )
333 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
335 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
336 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
338 if ( zExp < 0 ) {
339 if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
340 isTiny =
341 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
342 || ( zExp < -1 )
343 || ( zSig + roundIncrement < 0x80000000 );
344 shift32RightJamming( zSig, - zExp, &zSig );
345 zExp = 0;
346 roundBits = zSig & 0x7F;
347 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
350 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
351 zSig = ( zSig + roundIncrement )>>7;
352 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
353 if ( zSig == 0 ) zExp = 0;
354 return packFloat32( zSign, zExp, zSig );
358 /*----------------------------------------------------------------------------
359 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
360 | and significand `zSig', and returns the proper single-precision floating-
361 | point value corresponding to the abstract input. This routine is just like
362 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
363 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
364 | floating-point exponent.
365 *----------------------------------------------------------------------------*/
367 static float32
368 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
370 int8 shiftCount;
372 shiftCount = countLeadingZeros32( zSig ) - 1;
373 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
377 /*----------------------------------------------------------------------------
378 | Returns the fraction bits of the double-precision floating-point value `a'.
379 *----------------------------------------------------------------------------*/
381 INLINE bits64 extractFloat64Frac( float64 a )
384 return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
388 /*----------------------------------------------------------------------------
389 | Returns the exponent bits of the double-precision floating-point value `a'.
390 *----------------------------------------------------------------------------*/
392 INLINE int16 extractFloat64Exp( float64 a )
395 return ( float64_val(a)>>52 ) & 0x7FF;
399 /*----------------------------------------------------------------------------
400 | Returns the sign bit of the double-precision floating-point value `a'.
401 *----------------------------------------------------------------------------*/
403 INLINE flag extractFloat64Sign( float64 a )
406 return float64_val(a)>>63;
410 /*----------------------------------------------------------------------------
411 | If `a' is denormal and we are in flush-to-zero mode then set the
412 | input-denormal exception and return zero. Otherwise just return the value.
413 *----------------------------------------------------------------------------*/
414 static float64 float64_squash_input_denormal(float64 a STATUS_PARAM)
416 if (STATUS(flush_inputs_to_zero)) {
417 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
418 float_raise(float_flag_input_denormal STATUS_VAR);
419 return make_float64(float64_val(a) & (1ULL << 63));
422 return a;
425 /*----------------------------------------------------------------------------
426 | Normalizes the subnormal double-precision floating-point value represented
427 | by the denormalized significand `aSig'. The normalized exponent and
428 | significand are stored at the locations pointed to by `zExpPtr' and
429 | `zSigPtr', respectively.
430 *----------------------------------------------------------------------------*/
432 static void
433 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
435 int8 shiftCount;
437 shiftCount = countLeadingZeros64( aSig ) - 11;
438 *zSigPtr = aSig<<shiftCount;
439 *zExpPtr = 1 - shiftCount;
443 /*----------------------------------------------------------------------------
444 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
445 | double-precision floating-point value, returning the result. After being
446 | shifted into the proper positions, the three fields are simply added
447 | together to form the result. This means that any integer portion of `zSig'
448 | will be added into the exponent. Since a properly normalized significand
449 | will have an integer portion equal to 1, the `zExp' input should be 1 less
450 | than the desired result exponent whenever `zSig' is a complete, normalized
451 | significand.
452 *----------------------------------------------------------------------------*/
454 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
457 return make_float64(
458 ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig);
462 /*----------------------------------------------------------------------------
463 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
464 | and significand `zSig', and returns the proper double-precision floating-
465 | point value corresponding to the abstract input. Ordinarily, the abstract
466 | value is simply rounded and packed into the double-precision format, with
467 | the inexact exception raised if the abstract input cannot be represented
468 | exactly. However, if the abstract value is too large, the overflow and
469 | inexact exceptions are raised and an infinity or maximal finite value is
470 | returned. If the abstract value is too small, the input value is rounded
471 | to a subnormal number, and the underflow and inexact exceptions are raised
472 | if the abstract input cannot be represented exactly as a subnormal double-
473 | precision floating-point number.
474 | The input significand `zSig' has its binary point between bits 62
475 | and 61, which is 10 bits to the left of the usual location. This shifted
476 | significand must be normalized or smaller. If `zSig' is not normalized,
477 | `zExp' must be 0; in that case, the result returned is a subnormal number,
478 | and it must not require rounding. In the usual case that `zSig' is
479 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
480 | The handling of underflow and overflow follows the IEC/IEEE Standard for
481 | Binary Floating-Point Arithmetic.
482 *----------------------------------------------------------------------------*/
484 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
486 int8 roundingMode;
487 flag roundNearestEven;
488 int16 roundIncrement, roundBits;
489 flag isTiny;
491 roundingMode = STATUS(float_rounding_mode);
492 roundNearestEven = ( roundingMode == float_round_nearest_even );
493 roundIncrement = 0x200;
494 if ( ! roundNearestEven ) {
495 if ( roundingMode == float_round_to_zero ) {
496 roundIncrement = 0;
498 else {
499 roundIncrement = 0x3FF;
500 if ( zSign ) {
501 if ( roundingMode == float_round_up ) roundIncrement = 0;
503 else {
504 if ( roundingMode == float_round_down ) roundIncrement = 0;
508 roundBits = zSig & 0x3FF;
509 if ( 0x7FD <= (bits16) zExp ) {
510 if ( ( 0x7FD < zExp )
511 || ( ( zExp == 0x7FD )
512 && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
514 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
515 return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
517 if ( zExp < 0 ) {
518 if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
519 isTiny =
520 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
521 || ( zExp < -1 )
522 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
523 shift64RightJamming( zSig, - zExp, &zSig );
524 zExp = 0;
525 roundBits = zSig & 0x3FF;
526 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
529 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
530 zSig = ( zSig + roundIncrement )>>10;
531 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
532 if ( zSig == 0 ) zExp = 0;
533 return packFloat64( zSign, zExp, zSig );
537 /*----------------------------------------------------------------------------
538 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
539 | and significand `zSig', and returns the proper double-precision floating-
540 | point value corresponding to the abstract input. This routine is just like
541 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
542 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
543 | floating-point exponent.
544 *----------------------------------------------------------------------------*/
546 static float64
547 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
549 int8 shiftCount;
551 shiftCount = countLeadingZeros64( zSig ) - 1;
552 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
556 #ifdef FLOATX80
558 /*----------------------------------------------------------------------------
559 | Returns the fraction bits of the extended double-precision floating-point
560 | value `a'.
561 *----------------------------------------------------------------------------*/
563 INLINE bits64 extractFloatx80Frac( floatx80 a )
566 return a.low;
570 /*----------------------------------------------------------------------------
571 | Returns the exponent bits of the extended double-precision floating-point
572 | value `a'.
573 *----------------------------------------------------------------------------*/
575 INLINE int32 extractFloatx80Exp( floatx80 a )
578 return a.high & 0x7FFF;
582 /*----------------------------------------------------------------------------
583 | Returns the sign bit of the extended double-precision floating-point value
584 | `a'.
585 *----------------------------------------------------------------------------*/
587 INLINE flag extractFloatx80Sign( floatx80 a )
590 return a.high>>15;
594 /*----------------------------------------------------------------------------
595 | Normalizes the subnormal extended double-precision floating-point value
596 | represented by the denormalized significand `aSig'. The normalized exponent
597 | and significand are stored at the locations pointed to by `zExpPtr' and
598 | `zSigPtr', respectively.
599 *----------------------------------------------------------------------------*/
601 static void
602 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
604 int8 shiftCount;
606 shiftCount = countLeadingZeros64( aSig );
607 *zSigPtr = aSig<<shiftCount;
608 *zExpPtr = 1 - shiftCount;
612 /*----------------------------------------------------------------------------
613 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
614 | extended double-precision floating-point value, returning the result.
615 *----------------------------------------------------------------------------*/
617 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
619 floatx80 z;
621 z.low = zSig;
622 z.high = ( ( (bits16) zSign )<<15 ) + zExp;
623 return z;
627 /*----------------------------------------------------------------------------
628 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
629 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
630 | and returns the proper extended double-precision floating-point value
631 | corresponding to the abstract input. Ordinarily, the abstract value is
632 | rounded and packed into the extended double-precision format, with the
633 | inexact exception raised if the abstract input cannot be represented
634 | exactly. However, if the abstract value is too large, the overflow and
635 | inexact exceptions are raised and an infinity or maximal finite value is
636 | returned. If the abstract value is too small, the input value is rounded to
637 | a subnormal number, and the underflow and inexact exceptions are raised if
638 | the abstract input cannot be represented exactly as a subnormal extended
639 | double-precision floating-point number.
640 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
641 | number of bits as single or double precision, respectively. Otherwise, the
642 | result is rounded to the full precision of the extended double-precision
643 | format.
644 | The input significand must be normalized or smaller. If the input
645 | significand is not normalized, `zExp' must be 0; in that case, the result
646 | returned is a subnormal number, and it must not require rounding. The
647 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
648 | Floating-Point Arithmetic.
649 *----------------------------------------------------------------------------*/
651 static floatx80
652 roundAndPackFloatx80(
653 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
654 STATUS_PARAM)
656 int8 roundingMode;
657 flag roundNearestEven, increment, isTiny;
658 int64 roundIncrement, roundMask, roundBits;
660 roundingMode = STATUS(float_rounding_mode);
661 roundNearestEven = ( roundingMode == float_round_nearest_even );
662 if ( roundingPrecision == 80 ) goto precision80;
663 if ( roundingPrecision == 64 ) {
664 roundIncrement = LIT64( 0x0000000000000400 );
665 roundMask = LIT64( 0x00000000000007FF );
667 else if ( roundingPrecision == 32 ) {
668 roundIncrement = LIT64( 0x0000008000000000 );
669 roundMask = LIT64( 0x000000FFFFFFFFFF );
671 else {
672 goto precision80;
674 zSig0 |= ( zSig1 != 0 );
675 if ( ! roundNearestEven ) {
676 if ( roundingMode == float_round_to_zero ) {
677 roundIncrement = 0;
679 else {
680 roundIncrement = roundMask;
681 if ( zSign ) {
682 if ( roundingMode == float_round_up ) roundIncrement = 0;
684 else {
685 if ( roundingMode == float_round_down ) roundIncrement = 0;
689 roundBits = zSig0 & roundMask;
690 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
691 if ( ( 0x7FFE < zExp )
692 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
694 goto overflow;
696 if ( zExp <= 0 ) {
697 if ( STATUS(flush_to_zero) ) return packFloatx80( zSign, 0, 0 );
698 isTiny =
699 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
700 || ( zExp < 0 )
701 || ( zSig0 <= zSig0 + roundIncrement );
702 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
703 zExp = 0;
704 roundBits = zSig0 & roundMask;
705 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
706 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
707 zSig0 += roundIncrement;
708 if ( (sbits64) zSig0 < 0 ) zExp = 1;
709 roundIncrement = roundMask + 1;
710 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
711 roundMask |= roundIncrement;
713 zSig0 &= ~ roundMask;
714 return packFloatx80( zSign, zExp, zSig0 );
717 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
718 zSig0 += roundIncrement;
719 if ( zSig0 < roundIncrement ) {
720 ++zExp;
721 zSig0 = LIT64( 0x8000000000000000 );
723 roundIncrement = roundMask + 1;
724 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
725 roundMask |= roundIncrement;
727 zSig0 &= ~ roundMask;
728 if ( zSig0 == 0 ) zExp = 0;
729 return packFloatx80( zSign, zExp, zSig0 );
730 precision80:
731 increment = ( (sbits64) zSig1 < 0 );
732 if ( ! roundNearestEven ) {
733 if ( roundingMode == float_round_to_zero ) {
734 increment = 0;
736 else {
737 if ( zSign ) {
738 increment = ( roundingMode == float_round_down ) && zSig1;
740 else {
741 increment = ( roundingMode == float_round_up ) && zSig1;
745 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
746 if ( ( 0x7FFE < zExp )
747 || ( ( zExp == 0x7FFE )
748 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
749 && increment
752 roundMask = 0;
753 overflow:
754 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
755 if ( ( roundingMode == float_round_to_zero )
756 || ( zSign && ( roundingMode == float_round_up ) )
757 || ( ! zSign && ( roundingMode == float_round_down ) )
759 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
761 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
763 if ( zExp <= 0 ) {
764 isTiny =
765 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
766 || ( zExp < 0 )
767 || ! increment
768 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
769 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
770 zExp = 0;
771 if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
772 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
773 if ( roundNearestEven ) {
774 increment = ( (sbits64) zSig1 < 0 );
776 else {
777 if ( zSign ) {
778 increment = ( roundingMode == float_round_down ) && zSig1;
780 else {
781 increment = ( roundingMode == float_round_up ) && zSig1;
784 if ( increment ) {
785 ++zSig0;
786 zSig0 &=
787 ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
788 if ( (sbits64) zSig0 < 0 ) zExp = 1;
790 return packFloatx80( zSign, zExp, zSig0 );
793 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
794 if ( increment ) {
795 ++zSig0;
796 if ( zSig0 == 0 ) {
797 ++zExp;
798 zSig0 = LIT64( 0x8000000000000000 );
800 else {
801 zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
804 else {
805 if ( zSig0 == 0 ) zExp = 0;
807 return packFloatx80( zSign, zExp, zSig0 );
811 /*----------------------------------------------------------------------------
812 | Takes an abstract floating-point value having sign `zSign', exponent
813 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
814 | and returns the proper extended double-precision floating-point value
815 | corresponding to the abstract input. This routine is just like
816 | `roundAndPackFloatx80' except that the input significand does not have to be
817 | normalized.
818 *----------------------------------------------------------------------------*/
820 static floatx80
821 normalizeRoundAndPackFloatx80(
822 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
823 STATUS_PARAM)
825 int8 shiftCount;
827 if ( zSig0 == 0 ) {
828 zSig0 = zSig1;
829 zSig1 = 0;
830 zExp -= 64;
832 shiftCount = countLeadingZeros64( zSig0 );
833 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
834 zExp -= shiftCount;
835 return
836 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
840 #endif
842 #ifdef FLOAT128
844 /*----------------------------------------------------------------------------
845 | Returns the least-significant 64 fraction bits of the quadruple-precision
846 | floating-point value `a'.
847 *----------------------------------------------------------------------------*/
849 INLINE bits64 extractFloat128Frac1( float128 a )
852 return a.low;
856 /*----------------------------------------------------------------------------
857 | Returns the most-significant 48 fraction bits of the quadruple-precision
858 | floating-point value `a'.
859 *----------------------------------------------------------------------------*/
861 INLINE bits64 extractFloat128Frac0( float128 a )
864 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
868 /*----------------------------------------------------------------------------
869 | Returns the exponent bits of the quadruple-precision floating-point value
870 | `a'.
871 *----------------------------------------------------------------------------*/
873 INLINE int32 extractFloat128Exp( float128 a )
876 return ( a.high>>48 ) & 0x7FFF;
880 /*----------------------------------------------------------------------------
881 | Returns the sign bit of the quadruple-precision floating-point value `a'.
882 *----------------------------------------------------------------------------*/
884 INLINE flag extractFloat128Sign( float128 a )
887 return a.high>>63;
891 /*----------------------------------------------------------------------------
892 | Normalizes the subnormal quadruple-precision floating-point value
893 | represented by the denormalized significand formed by the concatenation of
894 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
895 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
896 | significand are stored at the location pointed to by `zSig0Ptr', and the
897 | least significant 64 bits of the normalized significand are stored at the
898 | location pointed to by `zSig1Ptr'.
899 *----------------------------------------------------------------------------*/
901 static void
902 normalizeFloat128Subnormal(
903 bits64 aSig0,
904 bits64 aSig1,
905 int32 *zExpPtr,
906 bits64 *zSig0Ptr,
907 bits64 *zSig1Ptr
910 int8 shiftCount;
912 if ( aSig0 == 0 ) {
913 shiftCount = countLeadingZeros64( aSig1 ) - 15;
914 if ( shiftCount < 0 ) {
915 *zSig0Ptr = aSig1>>( - shiftCount );
916 *zSig1Ptr = aSig1<<( shiftCount & 63 );
918 else {
919 *zSig0Ptr = aSig1<<shiftCount;
920 *zSig1Ptr = 0;
922 *zExpPtr = - shiftCount - 63;
924 else {
925 shiftCount = countLeadingZeros64( aSig0 ) - 15;
926 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
927 *zExpPtr = 1 - shiftCount;
932 /*----------------------------------------------------------------------------
933 | Packs the sign `zSign', the exponent `zExp', and the significand formed
934 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
935 | floating-point value, returning the result. After being shifted into the
936 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
937 | added together to form the most significant 32 bits of the result. This
938 | means that any integer portion of `zSig0' will be added into the exponent.
939 | Since a properly normalized significand will have an integer portion equal
940 | to 1, the `zExp' input should be 1 less than the desired result exponent
941 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
942 | significand.
943 *----------------------------------------------------------------------------*/
945 INLINE float128
946 packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
948 float128 z;
950 z.low = zSig1;
951 z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
952 return z;
956 /*----------------------------------------------------------------------------
957 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
958 | and extended significand formed by the concatenation of `zSig0', `zSig1',
959 | and `zSig2', and returns the proper quadruple-precision floating-point value
960 | corresponding to the abstract input. Ordinarily, the abstract value is
961 | simply rounded and packed into the quadruple-precision format, with the
962 | inexact exception raised if the abstract input cannot be represented
963 | exactly. However, if the abstract value is too large, the overflow and
964 | inexact exceptions are raised and an infinity or maximal finite value is
965 | returned. If the abstract value is too small, the input value is rounded to
966 | a subnormal number, and the underflow and inexact exceptions are raised if
967 | the abstract input cannot be represented exactly as a subnormal quadruple-
968 | precision floating-point number.
969 | The input significand must be normalized or smaller. If the input
970 | significand is not normalized, `zExp' must be 0; in that case, the result
971 | returned is a subnormal number, and it must not require rounding. In the
972 | usual case that the input significand is normalized, `zExp' must be 1 less
973 | than the ``true'' floating-point exponent. The handling of underflow and
974 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
975 *----------------------------------------------------------------------------*/
977 static float128
978 roundAndPackFloat128(
979 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 STATUS_PARAM)
981 int8 roundingMode;
982 flag roundNearestEven, increment, isTiny;
984 roundingMode = STATUS(float_rounding_mode);
985 roundNearestEven = ( roundingMode == float_round_nearest_even );
986 increment = ( (sbits64) zSig2 < 0 );
987 if ( ! roundNearestEven ) {
988 if ( roundingMode == float_round_to_zero ) {
989 increment = 0;
991 else {
992 if ( zSign ) {
993 increment = ( roundingMode == float_round_down ) && zSig2;
995 else {
996 increment = ( roundingMode == float_round_up ) && zSig2;
1000 if ( 0x7FFD <= (bits32) zExp ) {
1001 if ( ( 0x7FFD < zExp )
1002 || ( ( zExp == 0x7FFD )
1003 && eq128(
1004 LIT64( 0x0001FFFFFFFFFFFF ),
1005 LIT64( 0xFFFFFFFFFFFFFFFF ),
1006 zSig0,
1007 zSig1
1009 && increment
1012 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
1013 if ( ( roundingMode == float_round_to_zero )
1014 || ( zSign && ( roundingMode == float_round_up ) )
1015 || ( ! zSign && ( roundingMode == float_round_down ) )
1017 return
1018 packFloat128(
1019 zSign,
1020 0x7FFE,
1021 LIT64( 0x0000FFFFFFFFFFFF ),
1022 LIT64( 0xFFFFFFFFFFFFFFFF )
1025 return packFloat128( zSign, 0x7FFF, 0, 0 );
1027 if ( zExp < 0 ) {
1028 if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
1029 isTiny =
1030 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
1031 || ( zExp < -1 )
1032 || ! increment
1033 || lt128(
1034 zSig0,
1035 zSig1,
1036 LIT64( 0x0001FFFFFFFFFFFF ),
1037 LIT64( 0xFFFFFFFFFFFFFFFF )
1039 shift128ExtraRightJamming(
1040 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1041 zExp = 0;
1042 if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
1043 if ( roundNearestEven ) {
1044 increment = ( (sbits64) zSig2 < 0 );
1046 else {
1047 if ( zSign ) {
1048 increment = ( roundingMode == float_round_down ) && zSig2;
1050 else {
1051 increment = ( roundingMode == float_round_up ) && zSig2;
1056 if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
1057 if ( increment ) {
1058 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1059 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1061 else {
1062 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1064 return packFloat128( zSign, zExp, zSig0, zSig1 );
1068 /*----------------------------------------------------------------------------
1069 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1070 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1071 | returns the proper quadruple-precision floating-point value corresponding
1072 | to the abstract input. This routine is just like `roundAndPackFloat128'
1073 | except that the input significand has fewer bits and does not have to be
1074 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1075 | point exponent.
1076 *----------------------------------------------------------------------------*/
1078 static float128
1079 normalizeRoundAndPackFloat128(
1080 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 STATUS_PARAM)
1082 int8 shiftCount;
1083 bits64 zSig2;
1085 if ( zSig0 == 0 ) {
1086 zSig0 = zSig1;
1087 zSig1 = 0;
1088 zExp -= 64;
1090 shiftCount = countLeadingZeros64( zSig0 ) - 15;
1091 if ( 0 <= shiftCount ) {
1092 zSig2 = 0;
1093 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1095 else {
1096 shift128ExtraRightJamming(
1097 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1099 zExp -= shiftCount;
1100 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1104 #endif
1106 /*----------------------------------------------------------------------------
1107 | Returns the result of converting the 32-bit two's complement integer `a'
1108 | to the single-precision floating-point format. The conversion is performed
1109 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1110 *----------------------------------------------------------------------------*/
1112 float32 int32_to_float32( int32 a STATUS_PARAM )
1114 flag zSign;
1116 if ( a == 0 ) return float32_zero;
1117 if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1118 zSign = ( a < 0 );
1119 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1123 /*----------------------------------------------------------------------------
1124 | Returns the result of converting the 32-bit two's complement integer `a'
1125 | to the double-precision floating-point format. The conversion is performed
1126 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1127 *----------------------------------------------------------------------------*/
1129 float64 int32_to_float64( int32 a STATUS_PARAM )
1131 flag zSign;
1132 uint32 absA;
1133 int8 shiftCount;
1134 bits64 zSig;
1136 if ( a == 0 ) return float64_zero;
1137 zSign = ( a < 0 );
1138 absA = zSign ? - a : a;
1139 shiftCount = countLeadingZeros32( absA ) + 21;
1140 zSig = absA;
1141 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1145 #ifdef FLOATX80
1147 /*----------------------------------------------------------------------------
1148 | Returns the result of converting the 32-bit two's complement integer `a'
1149 | to the extended double-precision floating-point format. The conversion
1150 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1151 | Arithmetic.
1152 *----------------------------------------------------------------------------*/
1154 floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1156 flag zSign;
1157 uint32 absA;
1158 int8 shiftCount;
1159 bits64 zSig;
1161 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1162 zSign = ( a < 0 );
1163 absA = zSign ? - a : a;
1164 shiftCount = countLeadingZeros32( absA ) + 32;
1165 zSig = absA;
1166 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1170 #endif
1172 #ifdef FLOAT128
1174 /*----------------------------------------------------------------------------
1175 | Returns the result of converting the 32-bit two's complement integer `a' to
1176 | the quadruple-precision floating-point format. The conversion is performed
1177 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1178 *----------------------------------------------------------------------------*/
1180 float128 int32_to_float128( int32 a STATUS_PARAM )
1182 flag zSign;
1183 uint32 absA;
1184 int8 shiftCount;
1185 bits64 zSig0;
1187 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1188 zSign = ( a < 0 );
1189 absA = zSign ? - a : a;
1190 shiftCount = countLeadingZeros32( absA ) + 17;
1191 zSig0 = absA;
1192 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1196 #endif
1198 /*----------------------------------------------------------------------------
1199 | Returns the result of converting the 64-bit two's complement integer `a'
1200 | to the single-precision floating-point format. The conversion is performed
1201 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1202 *----------------------------------------------------------------------------*/
1204 float32 int64_to_float32( int64 a STATUS_PARAM )
1206 flag zSign;
1207 uint64 absA;
1208 int8 shiftCount;
1210 if ( a == 0 ) return float32_zero;
1211 zSign = ( a < 0 );
1212 absA = zSign ? - a : a;
1213 shiftCount = countLeadingZeros64( absA ) - 40;
1214 if ( 0 <= shiftCount ) {
1215 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1217 else {
1218 shiftCount += 7;
1219 if ( shiftCount < 0 ) {
1220 shift64RightJamming( absA, - shiftCount, &absA );
1222 else {
1223 absA <<= shiftCount;
1225 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1230 float32 uint64_to_float32( uint64 a STATUS_PARAM )
1232 int8 shiftCount;
1234 if ( a == 0 ) return float32_zero;
1235 shiftCount = countLeadingZeros64( a ) - 40;
1236 if ( 0 <= shiftCount ) {
1237 return packFloat32( 1 > 0, 0x95 - shiftCount, a<<shiftCount );
1239 else {
1240 shiftCount += 7;
1241 if ( shiftCount < 0 ) {
1242 shift64RightJamming( a, - shiftCount, &a );
1244 else {
1245 a <<= shiftCount;
1247 return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount, a STATUS_VAR );
1251 /*----------------------------------------------------------------------------
1252 | Returns the result of converting the 64-bit two's complement integer `a'
1253 | to the double-precision floating-point format. The conversion is performed
1254 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1255 *----------------------------------------------------------------------------*/
1257 float64 int64_to_float64( int64 a STATUS_PARAM )
1259 flag zSign;
1261 if ( a == 0 ) return float64_zero;
1262 if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1263 return packFloat64( 1, 0x43E, 0 );
1265 zSign = ( a < 0 );
1266 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1270 float64 uint64_to_float64( uint64 a STATUS_PARAM )
1272 if ( a == 0 ) return float64_zero;
1273 return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR );
1277 #ifdef FLOATX80
1279 /*----------------------------------------------------------------------------
1280 | Returns the result of converting the 64-bit two's complement integer `a'
1281 | to the extended double-precision floating-point format. The conversion
1282 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1283 | Arithmetic.
1284 *----------------------------------------------------------------------------*/
1286 floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1288 flag zSign;
1289 uint64 absA;
1290 int8 shiftCount;
1292 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1293 zSign = ( a < 0 );
1294 absA = zSign ? - a : a;
1295 shiftCount = countLeadingZeros64( absA );
1296 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1300 #endif
1302 #ifdef FLOAT128
1304 /*----------------------------------------------------------------------------
1305 | Returns the result of converting the 64-bit two's complement integer `a' to
1306 | the quadruple-precision floating-point format. The conversion is performed
1307 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1308 *----------------------------------------------------------------------------*/
1310 float128 int64_to_float128( int64 a STATUS_PARAM )
1312 flag zSign;
1313 uint64 absA;
1314 int8 shiftCount;
1315 int32 zExp;
1316 bits64 zSig0, zSig1;
1318 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1319 zSign = ( a < 0 );
1320 absA = zSign ? - a : a;
1321 shiftCount = countLeadingZeros64( absA ) + 49;
1322 zExp = 0x406E - shiftCount;
1323 if ( 64 <= shiftCount ) {
1324 zSig1 = 0;
1325 zSig0 = absA;
1326 shiftCount -= 64;
1328 else {
1329 zSig1 = absA;
1330 zSig0 = 0;
1332 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1333 return packFloat128( zSign, zExp, zSig0, zSig1 );
1337 #endif
1339 /*----------------------------------------------------------------------------
1340 | Returns the result of converting the single-precision floating-point value
1341 | `a' to the 32-bit two's complement integer format. The conversion is
1342 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1343 | Arithmetic---which means in particular that the conversion is rounded
1344 | according to the current rounding mode. If `a' is a NaN, the largest
1345 | positive integer is returned. Otherwise, if the conversion overflows, the
1346 | largest integer with the same sign as `a' is returned.
1347 *----------------------------------------------------------------------------*/
1349 int32 float32_to_int32( float32 a STATUS_PARAM )
1351 flag aSign;
1352 int16 aExp, shiftCount;
1353 bits32 aSig;
1354 bits64 aSig64;
1356 a = float32_squash_input_denormal(a STATUS_VAR);
1357 aSig = extractFloat32Frac( a );
1358 aExp = extractFloat32Exp( a );
1359 aSign = extractFloat32Sign( a );
1360 if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1361 if ( aExp ) aSig |= 0x00800000;
1362 shiftCount = 0xAF - aExp;
1363 aSig64 = aSig;
1364 aSig64 <<= 32;
1365 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1366 return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1370 /*----------------------------------------------------------------------------
1371 | Returns the result of converting the single-precision floating-point value
1372 | `a' to the 32-bit two's complement integer format. The conversion is
1373 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1374 | Arithmetic, except that the conversion is always rounded toward zero.
1375 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1376 | the conversion overflows, the largest integer with the same sign as `a' is
1377 | returned.
1378 *----------------------------------------------------------------------------*/
1380 int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1382 flag aSign;
1383 int16 aExp, shiftCount;
1384 bits32 aSig;
1385 int32 z;
1386 a = float32_squash_input_denormal(a STATUS_VAR);
1388 aSig = extractFloat32Frac( a );
1389 aExp = extractFloat32Exp( a );
1390 aSign = extractFloat32Sign( a );
1391 shiftCount = aExp - 0x9E;
1392 if ( 0 <= shiftCount ) {
1393 if ( float32_val(a) != 0xCF000000 ) {
1394 float_raise( float_flag_invalid STATUS_VAR);
1395 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1397 return (sbits32) 0x80000000;
1399 else if ( aExp <= 0x7E ) {
1400 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1401 return 0;
1403 aSig = ( aSig | 0x00800000 )<<8;
1404 z = aSig>>( - shiftCount );
1405 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1406 STATUS(float_exception_flags) |= float_flag_inexact;
1408 if ( aSign ) z = - z;
1409 return z;
1413 /*----------------------------------------------------------------------------
1414 | Returns the result of converting the single-precision floating-point value
1415 | `a' to the 16-bit two's complement integer format. The conversion is
1416 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1417 | Arithmetic, except that the conversion is always rounded toward zero.
1418 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1419 | the conversion overflows, the largest integer with the same sign as `a' is
1420 | returned.
1421 *----------------------------------------------------------------------------*/
1423 int16 float32_to_int16_round_to_zero( float32 a STATUS_PARAM )
1425 flag aSign;
1426 int16 aExp, shiftCount;
1427 bits32 aSig;
1428 int32 z;
1430 aSig = extractFloat32Frac( a );
1431 aExp = extractFloat32Exp( a );
1432 aSign = extractFloat32Sign( a );
1433 shiftCount = aExp - 0x8E;
1434 if ( 0 <= shiftCount ) {
1435 if ( float32_val(a) != 0xC7000000 ) {
1436 float_raise( float_flag_invalid STATUS_VAR);
1437 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1438 return 0x7FFF;
1441 return (sbits32) 0xffff8000;
1443 else if ( aExp <= 0x7E ) {
1444 if ( aExp | aSig ) {
1445 STATUS(float_exception_flags) |= float_flag_inexact;
1447 return 0;
1449 shiftCount -= 0x10;
1450 aSig = ( aSig | 0x00800000 )<<8;
1451 z = aSig>>( - shiftCount );
1452 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1453 STATUS(float_exception_flags) |= float_flag_inexact;
1455 if ( aSign ) {
1456 z = - z;
1458 return z;
1462 /*----------------------------------------------------------------------------
1463 | Returns the result of converting the single-precision floating-point value
1464 | `a' to the 64-bit two's complement integer format. The conversion is
1465 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1466 | Arithmetic---which means in particular that the conversion is rounded
1467 | according to the current rounding mode. If `a' is a NaN, the largest
1468 | positive integer is returned. Otherwise, if the conversion overflows, the
1469 | largest integer with the same sign as `a' is returned.
1470 *----------------------------------------------------------------------------*/
1472 int64 float32_to_int64( float32 a STATUS_PARAM )
1474 flag aSign;
1475 int16 aExp, shiftCount;
1476 bits32 aSig;
1477 bits64 aSig64, aSigExtra;
1478 a = float32_squash_input_denormal(a STATUS_VAR);
1480 aSig = extractFloat32Frac( a );
1481 aExp = extractFloat32Exp( a );
1482 aSign = extractFloat32Sign( a );
1483 shiftCount = 0xBE - aExp;
1484 if ( shiftCount < 0 ) {
1485 float_raise( float_flag_invalid STATUS_VAR);
1486 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1487 return LIT64( 0x7FFFFFFFFFFFFFFF );
1489 return (sbits64) LIT64( 0x8000000000000000 );
1491 if ( aExp ) aSig |= 0x00800000;
1492 aSig64 = aSig;
1493 aSig64 <<= 40;
1494 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1495 return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1499 /*----------------------------------------------------------------------------
1500 | Returns the result of converting the single-precision floating-point value
1501 | `a' to the 64-bit two's complement integer format. The conversion is
1502 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1503 | Arithmetic, except that the conversion is always rounded toward zero. If
1504 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1505 | conversion overflows, the largest integer with the same sign as `a' is
1506 | returned.
1507 *----------------------------------------------------------------------------*/
1509 int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1511 flag aSign;
1512 int16 aExp, shiftCount;
1513 bits32 aSig;
1514 bits64 aSig64;
1515 int64 z;
1516 a = float32_squash_input_denormal(a STATUS_VAR);
1518 aSig = extractFloat32Frac( a );
1519 aExp = extractFloat32Exp( a );
1520 aSign = extractFloat32Sign( a );
1521 shiftCount = aExp - 0xBE;
1522 if ( 0 <= shiftCount ) {
1523 if ( float32_val(a) != 0xDF000000 ) {
1524 float_raise( float_flag_invalid STATUS_VAR);
1525 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1526 return LIT64( 0x7FFFFFFFFFFFFFFF );
1529 return (sbits64) LIT64( 0x8000000000000000 );
1531 else if ( aExp <= 0x7E ) {
1532 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1533 return 0;
1535 aSig64 = aSig | 0x00800000;
1536 aSig64 <<= 40;
1537 z = aSig64>>( - shiftCount );
1538 if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1539 STATUS(float_exception_flags) |= float_flag_inexact;
1541 if ( aSign ) z = - z;
1542 return z;
1546 /*----------------------------------------------------------------------------
1547 | Returns the result of converting the single-precision floating-point value
1548 | `a' to the double-precision floating-point format. The conversion is
1549 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1550 | Arithmetic.
1551 *----------------------------------------------------------------------------*/
1553 float64 float32_to_float64( float32 a STATUS_PARAM )
1555 flag aSign;
1556 int16 aExp;
1557 bits32 aSig;
1558 a = float32_squash_input_denormal(a STATUS_VAR);
1560 aSig = extractFloat32Frac( a );
1561 aExp = extractFloat32Exp( a );
1562 aSign = extractFloat32Sign( a );
1563 if ( aExp == 0xFF ) {
1564 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1565 return packFloat64( aSign, 0x7FF, 0 );
1567 if ( aExp == 0 ) {
1568 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1569 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1570 --aExp;
1572 return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1576 #ifdef FLOATX80
1578 /*----------------------------------------------------------------------------
1579 | Returns the result of converting the single-precision floating-point value
1580 | `a' to the extended double-precision floating-point format. The conversion
1581 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1582 | Arithmetic.
1583 *----------------------------------------------------------------------------*/
1585 floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1587 flag aSign;
1588 int16 aExp;
1589 bits32 aSig;
1591 a = float32_squash_input_denormal(a STATUS_VAR);
1592 aSig = extractFloat32Frac( a );
1593 aExp = extractFloat32Exp( a );
1594 aSign = extractFloat32Sign( a );
1595 if ( aExp == 0xFF ) {
1596 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1597 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1599 if ( aExp == 0 ) {
1600 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1601 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1603 aSig |= 0x00800000;
1604 return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1608 #endif
1610 #ifdef FLOAT128
1612 /*----------------------------------------------------------------------------
1613 | Returns the result of converting the single-precision floating-point value
1614 | `a' to the double-precision floating-point format. The conversion is
1615 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1616 | Arithmetic.
1617 *----------------------------------------------------------------------------*/
1619 float128 float32_to_float128( float32 a STATUS_PARAM )
1621 flag aSign;
1622 int16 aExp;
1623 bits32 aSig;
1625 a = float32_squash_input_denormal(a STATUS_VAR);
1626 aSig = extractFloat32Frac( a );
1627 aExp = extractFloat32Exp( a );
1628 aSign = extractFloat32Sign( a );
1629 if ( aExp == 0xFF ) {
1630 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1631 return packFloat128( aSign, 0x7FFF, 0, 0 );
1633 if ( aExp == 0 ) {
1634 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1635 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1636 --aExp;
1638 return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1642 #endif
1644 /*----------------------------------------------------------------------------
1645 | Rounds the single-precision floating-point value `a' to an integer, and
1646 | returns the result as a single-precision floating-point value. The
1647 | operation is performed according to the IEC/IEEE Standard for Binary
1648 | Floating-Point Arithmetic.
1649 *----------------------------------------------------------------------------*/
1651 float32 float32_round_to_int( float32 a STATUS_PARAM)
1653 flag aSign;
1654 int16 aExp;
1655 bits32 lastBitMask, roundBitsMask;
1656 int8 roundingMode;
1657 bits32 z;
1658 a = float32_squash_input_denormal(a STATUS_VAR);
1660 aExp = extractFloat32Exp( a );
1661 if ( 0x96 <= aExp ) {
1662 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1663 return propagateFloat32NaN( a, a STATUS_VAR );
1665 return a;
1667 if ( aExp <= 0x7E ) {
1668 if ( (bits32) ( float32_val(a)<<1 ) == 0 ) return a;
1669 STATUS(float_exception_flags) |= float_flag_inexact;
1670 aSign = extractFloat32Sign( a );
1671 switch ( STATUS(float_rounding_mode) ) {
1672 case float_round_nearest_even:
1673 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1674 return packFloat32( aSign, 0x7F, 0 );
1676 break;
1677 case float_round_down:
1678 return make_float32(aSign ? 0xBF800000 : 0);
1679 case float_round_up:
1680 return make_float32(aSign ? 0x80000000 : 0x3F800000);
1682 return packFloat32( aSign, 0, 0 );
1684 lastBitMask = 1;
1685 lastBitMask <<= 0x96 - aExp;
1686 roundBitsMask = lastBitMask - 1;
1687 z = float32_val(a);
1688 roundingMode = STATUS(float_rounding_mode);
1689 if ( roundingMode == float_round_nearest_even ) {
1690 z += lastBitMask>>1;
1691 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1693 else if ( roundingMode != float_round_to_zero ) {
1694 if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
1695 z += roundBitsMask;
1698 z &= ~ roundBitsMask;
1699 if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1700 return make_float32(z);
1704 /*----------------------------------------------------------------------------
1705 | Returns the result of adding the absolute values of the single-precision
1706 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1707 | before being returned. `zSign' is ignored if the result is a NaN.
1708 | The addition is performed according to the IEC/IEEE Standard for Binary
1709 | Floating-Point Arithmetic.
1710 *----------------------------------------------------------------------------*/
1712 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1714 int16 aExp, bExp, zExp;
1715 bits32 aSig, bSig, zSig;
1716 int16 expDiff;
1718 aSig = extractFloat32Frac( a );
1719 aExp = extractFloat32Exp( a );
1720 bSig = extractFloat32Frac( b );
1721 bExp = extractFloat32Exp( b );
1722 expDiff = aExp - bExp;
1723 aSig <<= 6;
1724 bSig <<= 6;
1725 if ( 0 < expDiff ) {
1726 if ( aExp == 0xFF ) {
1727 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1728 return a;
1730 if ( bExp == 0 ) {
1731 --expDiff;
1733 else {
1734 bSig |= 0x20000000;
1736 shift32RightJamming( bSig, expDiff, &bSig );
1737 zExp = aExp;
1739 else if ( expDiff < 0 ) {
1740 if ( bExp == 0xFF ) {
1741 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1742 return packFloat32( zSign, 0xFF, 0 );
1744 if ( aExp == 0 ) {
1745 ++expDiff;
1747 else {
1748 aSig |= 0x20000000;
1750 shift32RightJamming( aSig, - expDiff, &aSig );
1751 zExp = bExp;
1753 else {
1754 if ( aExp == 0xFF ) {
1755 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1756 return a;
1758 if ( aExp == 0 ) {
1759 if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
1760 return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1762 zSig = 0x40000000 + aSig + bSig;
1763 zExp = aExp;
1764 goto roundAndPack;
1766 aSig |= 0x20000000;
1767 zSig = ( aSig + bSig )<<1;
1768 --zExp;
1769 if ( (sbits32) zSig < 0 ) {
1770 zSig = aSig + bSig;
1771 ++zExp;
1773 roundAndPack:
1774 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1778 /*----------------------------------------------------------------------------
1779 | Returns the result of subtracting the absolute values of the single-
1780 | precision floating-point values `a' and `b'. If `zSign' is 1, the
1781 | difference is negated before being returned. `zSign' is ignored if the
1782 | result is a NaN. The subtraction is performed according to the IEC/IEEE
1783 | Standard for Binary Floating-Point Arithmetic.
1784 *----------------------------------------------------------------------------*/
1786 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1788 int16 aExp, bExp, zExp;
1789 bits32 aSig, bSig, zSig;
1790 int16 expDiff;
1792 aSig = extractFloat32Frac( a );
1793 aExp = extractFloat32Exp( a );
1794 bSig = extractFloat32Frac( b );
1795 bExp = extractFloat32Exp( b );
1796 expDiff = aExp - bExp;
1797 aSig <<= 7;
1798 bSig <<= 7;
1799 if ( 0 < expDiff ) goto aExpBigger;
1800 if ( expDiff < 0 ) goto bExpBigger;
1801 if ( aExp == 0xFF ) {
1802 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1803 float_raise( float_flag_invalid STATUS_VAR);
1804 return float32_default_nan;
1806 if ( aExp == 0 ) {
1807 aExp = 1;
1808 bExp = 1;
1810 if ( bSig < aSig ) goto aBigger;
1811 if ( aSig < bSig ) goto bBigger;
1812 return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1813 bExpBigger:
1814 if ( bExp == 0xFF ) {
1815 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1816 return packFloat32( zSign ^ 1, 0xFF, 0 );
1818 if ( aExp == 0 ) {
1819 ++expDiff;
1821 else {
1822 aSig |= 0x40000000;
1824 shift32RightJamming( aSig, - expDiff, &aSig );
1825 bSig |= 0x40000000;
1826 bBigger:
1827 zSig = bSig - aSig;
1828 zExp = bExp;
1829 zSign ^= 1;
1830 goto normalizeRoundAndPack;
1831 aExpBigger:
1832 if ( aExp == 0xFF ) {
1833 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1834 return a;
1836 if ( bExp == 0 ) {
1837 --expDiff;
1839 else {
1840 bSig |= 0x40000000;
1842 shift32RightJamming( bSig, expDiff, &bSig );
1843 aSig |= 0x40000000;
1844 aBigger:
1845 zSig = aSig - bSig;
1846 zExp = aExp;
1847 normalizeRoundAndPack:
1848 --zExp;
1849 return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1853 /*----------------------------------------------------------------------------
1854 | Returns the result of adding the single-precision floating-point values `a'
1855 | and `b'. The operation is performed according to the IEC/IEEE Standard for
1856 | Binary Floating-Point Arithmetic.
1857 *----------------------------------------------------------------------------*/
1859 float32 float32_add( float32 a, float32 b STATUS_PARAM )
1861 flag aSign, bSign;
1862 a = float32_squash_input_denormal(a STATUS_VAR);
1863 b = float32_squash_input_denormal(b STATUS_VAR);
1865 aSign = extractFloat32Sign( a );
1866 bSign = extractFloat32Sign( b );
1867 if ( aSign == bSign ) {
1868 return addFloat32Sigs( a, b, aSign STATUS_VAR);
1870 else {
1871 return subFloat32Sigs( a, b, aSign STATUS_VAR );
1876 /*----------------------------------------------------------------------------
1877 | Returns the result of subtracting the single-precision floating-point values
1878 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1879 | for Binary Floating-Point Arithmetic.
1880 *----------------------------------------------------------------------------*/
1882 float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1884 flag aSign, bSign;
1885 a = float32_squash_input_denormal(a STATUS_VAR);
1886 b = float32_squash_input_denormal(b STATUS_VAR);
1888 aSign = extractFloat32Sign( a );
1889 bSign = extractFloat32Sign( b );
1890 if ( aSign == bSign ) {
1891 return subFloat32Sigs( a, b, aSign STATUS_VAR );
1893 else {
1894 return addFloat32Sigs( a, b, aSign STATUS_VAR );
1899 /*----------------------------------------------------------------------------
1900 | Returns the result of multiplying the single-precision floating-point values
1901 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1902 | for Binary Floating-Point Arithmetic.
1903 *----------------------------------------------------------------------------*/
1905 float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1907 flag aSign, bSign, zSign;
1908 int16 aExp, bExp, zExp;
1909 bits32 aSig, bSig;
1910 bits64 zSig64;
1911 bits32 zSig;
1913 a = float32_squash_input_denormal(a STATUS_VAR);
1914 b = float32_squash_input_denormal(b STATUS_VAR);
1916 aSig = extractFloat32Frac( a );
1917 aExp = extractFloat32Exp( a );
1918 aSign = extractFloat32Sign( a );
1919 bSig = extractFloat32Frac( b );
1920 bExp = extractFloat32Exp( b );
1921 bSign = extractFloat32Sign( b );
1922 zSign = aSign ^ bSign;
1923 if ( aExp == 0xFF ) {
1924 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1925 return propagateFloat32NaN( a, b STATUS_VAR );
1927 if ( ( bExp | bSig ) == 0 ) {
1928 float_raise( float_flag_invalid STATUS_VAR);
1929 return float32_default_nan;
1931 return packFloat32( zSign, 0xFF, 0 );
1933 if ( bExp == 0xFF ) {
1934 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1935 if ( ( aExp | aSig ) == 0 ) {
1936 float_raise( float_flag_invalid STATUS_VAR);
1937 return float32_default_nan;
1939 return packFloat32( zSign, 0xFF, 0 );
1941 if ( aExp == 0 ) {
1942 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1943 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1945 if ( bExp == 0 ) {
1946 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1947 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1949 zExp = aExp + bExp - 0x7F;
1950 aSig = ( aSig | 0x00800000 )<<7;
1951 bSig = ( bSig | 0x00800000 )<<8;
1952 shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1953 zSig = zSig64;
1954 if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1955 zSig <<= 1;
1956 --zExp;
1958 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1962 /*----------------------------------------------------------------------------
1963 | Returns the result of dividing the single-precision floating-point value `a'
1964 | by the corresponding value `b'. The operation is performed according to the
1965 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1966 *----------------------------------------------------------------------------*/
1968 float32 float32_div( float32 a, float32 b STATUS_PARAM )
1970 flag aSign, bSign, zSign;
1971 int16 aExp, bExp, zExp;
1972 bits32 aSig, bSig, zSig;
1973 a = float32_squash_input_denormal(a STATUS_VAR);
1974 b = float32_squash_input_denormal(b STATUS_VAR);
1976 aSig = extractFloat32Frac( a );
1977 aExp = extractFloat32Exp( a );
1978 aSign = extractFloat32Sign( a );
1979 bSig = extractFloat32Frac( b );
1980 bExp = extractFloat32Exp( b );
1981 bSign = extractFloat32Sign( b );
1982 zSign = aSign ^ bSign;
1983 if ( aExp == 0xFF ) {
1984 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1985 if ( bExp == 0xFF ) {
1986 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1987 float_raise( float_flag_invalid STATUS_VAR);
1988 return float32_default_nan;
1990 return packFloat32( zSign, 0xFF, 0 );
1992 if ( bExp == 0xFF ) {
1993 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1994 return packFloat32( zSign, 0, 0 );
1996 if ( bExp == 0 ) {
1997 if ( bSig == 0 ) {
1998 if ( ( aExp | aSig ) == 0 ) {
1999 float_raise( float_flag_invalid STATUS_VAR);
2000 return float32_default_nan;
2002 float_raise( float_flag_divbyzero STATUS_VAR);
2003 return packFloat32( zSign, 0xFF, 0 );
2005 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2007 if ( aExp == 0 ) {
2008 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2009 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2011 zExp = aExp - bExp + 0x7D;
2012 aSig = ( aSig | 0x00800000 )<<7;
2013 bSig = ( bSig | 0x00800000 )<<8;
2014 if ( bSig <= ( aSig + aSig ) ) {
2015 aSig >>= 1;
2016 ++zExp;
2018 zSig = ( ( (bits64) aSig )<<32 ) / bSig;
2019 if ( ( zSig & 0x3F ) == 0 ) {
2020 zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
2022 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2026 /*----------------------------------------------------------------------------
2027 | Returns the remainder of the single-precision floating-point value `a'
2028 | with respect to the corresponding value `b'. The operation is performed
2029 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2030 *----------------------------------------------------------------------------*/
2032 float32 float32_rem( float32 a, float32 b STATUS_PARAM )
2034 flag aSign, zSign;
2035 int16 aExp, bExp, expDiff;
2036 bits32 aSig, bSig;
2037 bits32 q;
2038 bits64 aSig64, bSig64, q64;
2039 bits32 alternateASig;
2040 sbits32 sigMean;
2041 a = float32_squash_input_denormal(a STATUS_VAR);
2042 b = float32_squash_input_denormal(b STATUS_VAR);
2044 aSig = extractFloat32Frac( a );
2045 aExp = extractFloat32Exp( a );
2046 aSign = extractFloat32Sign( a );
2047 bSig = extractFloat32Frac( b );
2048 bExp = extractFloat32Exp( b );
2049 if ( aExp == 0xFF ) {
2050 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2051 return propagateFloat32NaN( a, b STATUS_VAR );
2053 float_raise( float_flag_invalid STATUS_VAR);
2054 return float32_default_nan;
2056 if ( bExp == 0xFF ) {
2057 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2058 return a;
2060 if ( bExp == 0 ) {
2061 if ( bSig == 0 ) {
2062 float_raise( float_flag_invalid STATUS_VAR);
2063 return float32_default_nan;
2065 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2067 if ( aExp == 0 ) {
2068 if ( aSig == 0 ) return a;
2069 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2071 expDiff = aExp - bExp;
2072 aSig |= 0x00800000;
2073 bSig |= 0x00800000;
2074 if ( expDiff < 32 ) {
2075 aSig <<= 8;
2076 bSig <<= 8;
2077 if ( expDiff < 0 ) {
2078 if ( expDiff < -1 ) return a;
2079 aSig >>= 1;
2081 q = ( bSig <= aSig );
2082 if ( q ) aSig -= bSig;
2083 if ( 0 < expDiff ) {
2084 q = ( ( (bits64) aSig )<<32 ) / bSig;
2085 q >>= 32 - expDiff;
2086 bSig >>= 2;
2087 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2089 else {
2090 aSig >>= 2;
2091 bSig >>= 2;
2094 else {
2095 if ( bSig <= aSig ) aSig -= bSig;
2096 aSig64 = ( (bits64) aSig )<<40;
2097 bSig64 = ( (bits64) bSig )<<40;
2098 expDiff -= 64;
2099 while ( 0 < expDiff ) {
2100 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2101 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2102 aSig64 = - ( ( bSig * q64 )<<38 );
2103 expDiff -= 62;
2105 expDiff += 64;
2106 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2107 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2108 q = q64>>( 64 - expDiff );
2109 bSig <<= 6;
2110 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2112 do {
2113 alternateASig = aSig;
2114 ++q;
2115 aSig -= bSig;
2116 } while ( 0 <= (sbits32) aSig );
2117 sigMean = aSig + alternateASig;
2118 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2119 aSig = alternateASig;
2121 zSign = ( (sbits32) aSig < 0 );
2122 if ( zSign ) aSig = - aSig;
2123 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2127 /*----------------------------------------------------------------------------
2128 | Returns the square root of the single-precision floating-point value `a'.
2129 | The operation is performed according to the IEC/IEEE Standard for Binary
2130 | Floating-Point Arithmetic.
2131 *----------------------------------------------------------------------------*/
2133 float32 float32_sqrt( float32 a STATUS_PARAM )
2135 flag aSign;
2136 int16 aExp, zExp;
2137 bits32 aSig, zSig;
2138 bits64 rem, term;
2139 a = float32_squash_input_denormal(a STATUS_VAR);
2141 aSig = extractFloat32Frac( a );
2142 aExp = extractFloat32Exp( a );
2143 aSign = extractFloat32Sign( a );
2144 if ( aExp == 0xFF ) {
2145 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2146 if ( ! aSign ) return a;
2147 float_raise( float_flag_invalid STATUS_VAR);
2148 return float32_default_nan;
2150 if ( aSign ) {
2151 if ( ( aExp | aSig ) == 0 ) return a;
2152 float_raise( float_flag_invalid STATUS_VAR);
2153 return float32_default_nan;
2155 if ( aExp == 0 ) {
2156 if ( aSig == 0 ) return float32_zero;
2157 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2159 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2160 aSig = ( aSig | 0x00800000 )<<8;
2161 zSig = estimateSqrt32( aExp, aSig ) + 2;
2162 if ( ( zSig & 0x7F ) <= 5 ) {
2163 if ( zSig < 2 ) {
2164 zSig = 0x7FFFFFFF;
2165 goto roundAndPack;
2167 aSig >>= aExp & 1;
2168 term = ( (bits64) zSig ) * zSig;
2169 rem = ( ( (bits64) aSig )<<32 ) - term;
2170 while ( (sbits64) rem < 0 ) {
2171 --zSig;
2172 rem += ( ( (bits64) zSig )<<1 ) | 1;
2174 zSig |= ( rem != 0 );
2176 shift32RightJamming( zSig, 1, &zSig );
2177 roundAndPack:
2178 return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2182 /*----------------------------------------------------------------------------
2183 | Returns the binary exponential of the single-precision floating-point value
2184 | `a'. The operation is performed according to the IEC/IEEE Standard for
2185 | Binary Floating-Point Arithmetic.
2187 | Uses the following identities:
2189 | 1. -------------------------------------------------------------------------
2190 | x x*ln(2)
2191 | 2 = e
2193 | 2. -------------------------------------------------------------------------
2194 | 2 3 4 5 n
2195 | x x x x x x x
2196 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2197 | 1! 2! 3! 4! 5! n!
2198 *----------------------------------------------------------------------------*/
2200 static const float64 float32_exp2_coefficients[15] =
2202 make_float64( 0x3ff0000000000000ll ), /* 1 */
2203 make_float64( 0x3fe0000000000000ll ), /* 2 */
2204 make_float64( 0x3fc5555555555555ll ), /* 3 */
2205 make_float64( 0x3fa5555555555555ll ), /* 4 */
2206 make_float64( 0x3f81111111111111ll ), /* 5 */
2207 make_float64( 0x3f56c16c16c16c17ll ), /* 6 */
2208 make_float64( 0x3f2a01a01a01a01all ), /* 7 */
2209 make_float64( 0x3efa01a01a01a01all ), /* 8 */
2210 make_float64( 0x3ec71de3a556c734ll ), /* 9 */
2211 make_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2212 make_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2213 make_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2214 make_float64( 0x3de6124613a86d09ll ), /* 13 */
2215 make_float64( 0x3da93974a8c07c9dll ), /* 14 */
2216 make_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2219 float32 float32_exp2( float32 a STATUS_PARAM )
2221 flag aSign;
2222 int16 aExp;
2223 bits32 aSig;
2224 float64 r, x, xn;
2225 int i;
2226 a = float32_squash_input_denormal(a STATUS_VAR);
2228 aSig = extractFloat32Frac( a );
2229 aExp = extractFloat32Exp( a );
2230 aSign = extractFloat32Sign( a );
2232 if ( aExp == 0xFF) {
2233 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2234 return (aSign) ? float32_zero : a;
2236 if (aExp == 0) {
2237 if (aSig == 0) return float32_one;
2240 float_raise( float_flag_inexact STATUS_VAR);
2242 /* ******************************* */
2243 /* using float64 for approximation */
2244 /* ******************************* */
2245 x = float32_to_float64(a STATUS_VAR);
2246 x = float64_mul(x, float64_ln2 STATUS_VAR);
2248 xn = x;
2249 r = float64_one;
2250 for (i = 0 ; i < 15 ; i++) {
2251 float64 f;
2253 f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);
2254 r = float64_add(r, f STATUS_VAR);
2256 xn = float64_mul(xn, x STATUS_VAR);
2259 return float64_to_float32(r, status);
2262 /*----------------------------------------------------------------------------
2263 | Returns the binary log of the single-precision floating-point value `a'.
2264 | The operation is performed according to the IEC/IEEE Standard for Binary
2265 | Floating-Point Arithmetic.
2266 *----------------------------------------------------------------------------*/
2267 float32 float32_log2( float32 a STATUS_PARAM )
2269 flag aSign, zSign;
2270 int16 aExp;
2271 bits32 aSig, zSig, i;
2273 a = float32_squash_input_denormal(a STATUS_VAR);
2274 aSig = extractFloat32Frac( a );
2275 aExp = extractFloat32Exp( a );
2276 aSign = extractFloat32Sign( a );
2278 if ( aExp == 0 ) {
2279 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2280 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2282 if ( aSign ) {
2283 float_raise( float_flag_invalid STATUS_VAR);
2284 return float32_default_nan;
2286 if ( aExp == 0xFF ) {
2287 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2288 return a;
2291 aExp -= 0x7F;
2292 aSig |= 0x00800000;
2293 zSign = aExp < 0;
2294 zSig = aExp << 23;
2296 for (i = 1 << 22; i > 0; i >>= 1) {
2297 aSig = ( (bits64)aSig * aSig ) >> 23;
2298 if ( aSig & 0x01000000 ) {
2299 aSig >>= 1;
2300 zSig |= i;
2304 if ( zSign )
2305 zSig = -zSig;
2307 return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2310 /*----------------------------------------------------------------------------
2311 | Returns 1 if the single-precision floating-point value `a' is equal to
2312 | the corresponding value `b', and 0 otherwise. The comparison is performed
2313 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2314 *----------------------------------------------------------------------------*/
2316 int float32_eq( float32 a, float32 b STATUS_PARAM )
2318 a = float32_squash_input_denormal(a STATUS_VAR);
2319 b = float32_squash_input_denormal(b STATUS_VAR);
2321 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2322 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2324 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2325 float_raise( float_flag_invalid STATUS_VAR);
2327 return 0;
2329 return ( float32_val(a) == float32_val(b) ) ||
2330 ( (bits32) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2334 /*----------------------------------------------------------------------------
2335 | Returns 1 if the single-precision floating-point value `a' is less than
2336 | or equal to the corresponding value `b', and 0 otherwise. The comparison
2337 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2338 | Arithmetic.
2339 *----------------------------------------------------------------------------*/
2341 int float32_le( float32 a, float32 b STATUS_PARAM )
2343 flag aSign, bSign;
2344 bits32 av, bv;
2345 a = float32_squash_input_denormal(a STATUS_VAR);
2346 b = float32_squash_input_denormal(b STATUS_VAR);
2348 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2349 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2351 float_raise( float_flag_invalid STATUS_VAR);
2352 return 0;
2354 aSign = extractFloat32Sign( a );
2355 bSign = extractFloat32Sign( b );
2356 av = float32_val(a);
2357 bv = float32_val(b);
2358 if ( aSign != bSign ) return aSign || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2359 return ( av == bv ) || ( aSign ^ ( av < bv ) );
2363 /*----------------------------------------------------------------------------
2364 | Returns 1 if the single-precision floating-point value `a' is less than
2365 | the corresponding value `b', and 0 otherwise. The comparison is performed
2366 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2367 *----------------------------------------------------------------------------*/
2369 int float32_lt( float32 a, float32 b STATUS_PARAM )
2371 flag aSign, bSign;
2372 bits32 av, bv;
2373 a = float32_squash_input_denormal(a STATUS_VAR);
2374 b = float32_squash_input_denormal(b STATUS_VAR);
2376 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2377 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2379 float_raise( float_flag_invalid STATUS_VAR);
2380 return 0;
2382 aSign = extractFloat32Sign( a );
2383 bSign = extractFloat32Sign( b );
2384 av = float32_val(a);
2385 bv = float32_val(b);
2386 if ( aSign != bSign ) return aSign && ( (bits32) ( ( av | bv )<<1 ) != 0 );
2387 return ( av != bv ) && ( aSign ^ ( av < bv ) );
2391 /*----------------------------------------------------------------------------
2392 | Returns 1 if the single-precision floating-point value `a' is equal to
2393 | the corresponding value `b', and 0 otherwise. The invalid exception is
2394 | raised if either operand is a NaN. Otherwise, the comparison is performed
2395 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2396 *----------------------------------------------------------------------------*/
2398 int float32_eq_signaling( float32 a, float32 b STATUS_PARAM )
2400 bits32 av, bv;
2401 a = float32_squash_input_denormal(a STATUS_VAR);
2402 b = float32_squash_input_denormal(b STATUS_VAR);
2404 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2405 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2407 float_raise( float_flag_invalid STATUS_VAR);
2408 return 0;
2410 av = float32_val(a);
2411 bv = float32_val(b);
2412 return ( av == bv ) || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2416 /*----------------------------------------------------------------------------
2417 | Returns 1 if the single-precision floating-point value `a' is less than or
2418 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2419 | cause an exception. Otherwise, the comparison is performed according to the
2420 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2421 *----------------------------------------------------------------------------*/
2423 int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2425 flag aSign, bSign;
2426 bits32 av, bv;
2427 a = float32_squash_input_denormal(a STATUS_VAR);
2428 b = float32_squash_input_denormal(b STATUS_VAR);
2430 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2431 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2433 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2434 float_raise( float_flag_invalid STATUS_VAR);
2436 return 0;
2438 aSign = extractFloat32Sign( a );
2439 bSign = extractFloat32Sign( b );
2440 av = float32_val(a);
2441 bv = float32_val(b);
2442 if ( aSign != bSign ) return aSign || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2443 return ( av == bv ) || ( aSign ^ ( av < bv ) );
2447 /*----------------------------------------------------------------------------
2448 | Returns 1 if the single-precision floating-point value `a' is less than
2449 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2450 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
2451 | Standard for Binary Floating-Point Arithmetic.
2452 *----------------------------------------------------------------------------*/
2454 int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2456 flag aSign, bSign;
2457 bits32 av, bv;
2458 a = float32_squash_input_denormal(a STATUS_VAR);
2459 b = float32_squash_input_denormal(b STATUS_VAR);
2461 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2462 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2464 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2465 float_raise( float_flag_invalid STATUS_VAR);
2467 return 0;
2469 aSign = extractFloat32Sign( a );
2470 bSign = extractFloat32Sign( b );
2471 av = float32_val(a);
2472 bv = float32_val(b);
2473 if ( aSign != bSign ) return aSign && ( (bits32) ( ( av | bv )<<1 ) != 0 );
2474 return ( av != bv ) && ( aSign ^ ( av < bv ) );
2478 /*----------------------------------------------------------------------------
2479 | Returns the result of converting the double-precision floating-point value
2480 | `a' to the 32-bit two's complement integer format. The conversion is
2481 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2482 | Arithmetic---which means in particular that the conversion is rounded
2483 | according to the current rounding mode. If `a' is a NaN, the largest
2484 | positive integer is returned. Otherwise, if the conversion overflows, the
2485 | largest integer with the same sign as `a' is returned.
2486 *----------------------------------------------------------------------------*/
2488 int32 float64_to_int32( float64 a STATUS_PARAM )
2490 flag aSign;
2491 int16 aExp, shiftCount;
2492 bits64 aSig;
2493 a = float64_squash_input_denormal(a STATUS_VAR);
2495 aSig = extractFloat64Frac( a );
2496 aExp = extractFloat64Exp( a );
2497 aSign = extractFloat64Sign( a );
2498 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2499 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2500 shiftCount = 0x42C - aExp;
2501 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2502 return roundAndPackInt32( aSign, aSig STATUS_VAR );
2506 /*----------------------------------------------------------------------------
2507 | Returns the result of converting the double-precision floating-point value
2508 | `a' to the 32-bit two's complement integer format. The conversion is
2509 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2510 | Arithmetic, except that the conversion is always rounded toward zero.
2511 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2512 | the conversion overflows, the largest integer with the same sign as `a' is
2513 | returned.
2514 *----------------------------------------------------------------------------*/
2516 int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2518 flag aSign;
2519 int16 aExp, shiftCount;
2520 bits64 aSig, savedASig;
2521 int32 z;
2522 a = float64_squash_input_denormal(a STATUS_VAR);
2524 aSig = extractFloat64Frac( a );
2525 aExp = extractFloat64Exp( a );
2526 aSign = extractFloat64Sign( a );
2527 if ( 0x41E < aExp ) {
2528 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2529 goto invalid;
2531 else if ( aExp < 0x3FF ) {
2532 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2533 return 0;
2535 aSig |= LIT64( 0x0010000000000000 );
2536 shiftCount = 0x433 - aExp;
2537 savedASig = aSig;
2538 aSig >>= shiftCount;
2539 z = aSig;
2540 if ( aSign ) z = - z;
2541 if ( ( z < 0 ) ^ aSign ) {
2542 invalid:
2543 float_raise( float_flag_invalid STATUS_VAR);
2544 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2546 if ( ( aSig<<shiftCount ) != savedASig ) {
2547 STATUS(float_exception_flags) |= float_flag_inexact;
2549 return z;
2553 /*----------------------------------------------------------------------------
2554 | Returns the result of converting the double-precision floating-point value
2555 | `a' to the 16-bit two's complement integer format. The conversion is
2556 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2557 | Arithmetic, except that the conversion is always rounded toward zero.
2558 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2559 | the conversion overflows, the largest integer with the same sign as `a' is
2560 | returned.
2561 *----------------------------------------------------------------------------*/
2563 int16 float64_to_int16_round_to_zero( float64 a STATUS_PARAM )
2565 flag aSign;
2566 int16 aExp, shiftCount;
2567 bits64 aSig, savedASig;
2568 int32 z;
2570 aSig = extractFloat64Frac( a );
2571 aExp = extractFloat64Exp( a );
2572 aSign = extractFloat64Sign( a );
2573 if ( 0x40E < aExp ) {
2574 if ( ( aExp == 0x7FF ) && aSig ) {
2575 aSign = 0;
2577 goto invalid;
2579 else if ( aExp < 0x3FF ) {
2580 if ( aExp || aSig ) {
2581 STATUS(float_exception_flags) |= float_flag_inexact;
2583 return 0;
2585 aSig |= LIT64( 0x0010000000000000 );
2586 shiftCount = 0x433 - aExp;
2587 savedASig = aSig;
2588 aSig >>= shiftCount;
2589 z = aSig;
2590 if ( aSign ) {
2591 z = - z;
2593 if ( ( (int16_t)z < 0 ) ^ aSign ) {
2594 invalid:
2595 float_raise( float_flag_invalid STATUS_VAR);
2596 return aSign ? (sbits32) 0xffff8000 : 0x7FFF;
2598 if ( ( aSig<<shiftCount ) != savedASig ) {
2599 STATUS(float_exception_flags) |= float_flag_inexact;
2601 return z;
2604 /*----------------------------------------------------------------------------
2605 | Returns the result of converting the double-precision floating-point value
2606 | `a' to the 64-bit two's complement integer format. The conversion is
2607 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2608 | Arithmetic---which means in particular that the conversion is rounded
2609 | according to the current rounding mode. If `a' is a NaN, the largest
2610 | positive integer is returned. Otherwise, if the conversion overflows, the
2611 | largest integer with the same sign as `a' is returned.
2612 *----------------------------------------------------------------------------*/
2614 int64 float64_to_int64( float64 a STATUS_PARAM )
2616 flag aSign;
2617 int16 aExp, shiftCount;
2618 bits64 aSig, aSigExtra;
2619 a = float64_squash_input_denormal(a STATUS_VAR);
2621 aSig = extractFloat64Frac( a );
2622 aExp = extractFloat64Exp( a );
2623 aSign = extractFloat64Sign( a );
2624 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2625 shiftCount = 0x433 - aExp;
2626 if ( shiftCount <= 0 ) {
2627 if ( 0x43E < aExp ) {
2628 float_raise( float_flag_invalid STATUS_VAR);
2629 if ( ! aSign
2630 || ( ( aExp == 0x7FF )
2631 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2633 return LIT64( 0x7FFFFFFFFFFFFFFF );
2635 return (sbits64) LIT64( 0x8000000000000000 );
2637 aSigExtra = 0;
2638 aSig <<= - shiftCount;
2640 else {
2641 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2643 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2647 /*----------------------------------------------------------------------------
2648 | Returns the result of converting the double-precision floating-point value
2649 | `a' to the 64-bit two's complement integer format. The conversion is
2650 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2651 | Arithmetic, except that the conversion is always rounded toward zero.
2652 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2653 | the conversion overflows, the largest integer with the same sign as `a' is
2654 | returned.
2655 *----------------------------------------------------------------------------*/
2657 int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2659 flag aSign;
2660 int16 aExp, shiftCount;
2661 bits64 aSig;
2662 int64 z;
2663 a = float64_squash_input_denormal(a STATUS_VAR);
2665 aSig = extractFloat64Frac( a );
2666 aExp = extractFloat64Exp( a );
2667 aSign = extractFloat64Sign( a );
2668 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2669 shiftCount = aExp - 0x433;
2670 if ( 0 <= shiftCount ) {
2671 if ( 0x43E <= aExp ) {
2672 if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
2673 float_raise( float_flag_invalid STATUS_VAR);
2674 if ( ! aSign
2675 || ( ( aExp == 0x7FF )
2676 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2678 return LIT64( 0x7FFFFFFFFFFFFFFF );
2681 return (sbits64) LIT64( 0x8000000000000000 );
2683 z = aSig<<shiftCount;
2685 else {
2686 if ( aExp < 0x3FE ) {
2687 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2688 return 0;
2690 z = aSig>>( - shiftCount );
2691 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2692 STATUS(float_exception_flags) |= float_flag_inexact;
2695 if ( aSign ) z = - z;
2696 return z;
2700 /*----------------------------------------------------------------------------
2701 | Returns the result of converting the double-precision floating-point value
2702 | `a' to the single-precision floating-point format. The conversion is
2703 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2704 | Arithmetic.
2705 *----------------------------------------------------------------------------*/
2707 float32 float64_to_float32( float64 a STATUS_PARAM )
2709 flag aSign;
2710 int16 aExp;
2711 bits64 aSig;
2712 bits32 zSig;
2713 a = float64_squash_input_denormal(a STATUS_VAR);
2715 aSig = extractFloat64Frac( a );
2716 aExp = extractFloat64Exp( a );
2717 aSign = extractFloat64Sign( a );
2718 if ( aExp == 0x7FF ) {
2719 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2720 return packFloat32( aSign, 0xFF, 0 );
2722 shift64RightJamming( aSig, 22, &aSig );
2723 zSig = aSig;
2724 if ( aExp || zSig ) {
2725 zSig |= 0x40000000;
2726 aExp -= 0x381;
2728 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2733 /*----------------------------------------------------------------------------
2734 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2735 | half-precision floating-point value, returning the result. After being
2736 | shifted into the proper positions, the three fields are simply added
2737 | together to form the result. This means that any integer portion of `zSig'
2738 | will be added into the exponent. Since a properly normalized significand
2739 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2740 | than the desired result exponent whenever `zSig' is a complete, normalized
2741 | significand.
2742 *----------------------------------------------------------------------------*/
2743 static float16 packFloat16(flag zSign, int16 zExp, bits16 zSig)
2745 return make_float16(
2746 (((bits32)zSign) << 15) + (((bits32)zExp) << 10) + zSig);
2749 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
2750 The latter gains extra exponent range by omitting the NaN/Inf encodings. */
2752 float32 float16_to_float32(float16 a, flag ieee STATUS_PARAM)
2754 flag aSign;
2755 int16 aExp;
2756 bits32 aSig;
2758 aSign = extractFloat16Sign(a);
2759 aExp = extractFloat16Exp(a);
2760 aSig = extractFloat16Frac(a);
2762 if (aExp == 0x1f && ieee) {
2763 if (aSig) {
2764 /* Make sure correct exceptions are raised. */
2765 float32ToCommonNaN(a STATUS_VAR);
2766 aSig |= 0x200;
2768 return packFloat32(aSign, 0xff, aSig << 13);
2770 if (aExp == 0) {
2771 int8 shiftCount;
2773 if (aSig == 0) {
2774 return packFloat32(aSign, 0, 0);
2777 shiftCount = countLeadingZeros32( aSig ) - 21;
2778 aSig = aSig << shiftCount;
2779 aExp = -shiftCount;
2781 return packFloat32( aSign, aExp + 0x70, aSig << 13);
2784 float16 float32_to_float16(float32 a, flag ieee STATUS_PARAM)
2786 flag aSign;
2787 int16 aExp;
2788 bits32 aSig;
2789 bits32 mask;
2790 bits32 increment;
2791 int8 roundingMode;
2792 a = float32_squash_input_denormal(a STATUS_VAR);
2794 aSig = extractFloat32Frac( a );
2795 aExp = extractFloat32Exp( a );
2796 aSign = extractFloat32Sign( a );
2797 if ( aExp == 0xFF ) {
2798 if (aSig) {
2799 /* Input is a NaN */
2800 float16 r = commonNaNToFloat16( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2801 if (!ieee) {
2802 return packFloat16(aSign, 0, 0);
2804 return r;
2806 /* Infinity */
2807 if (!ieee) {
2808 float_raise(float_flag_invalid STATUS_VAR);
2809 return packFloat16(aSign, 0x1f, 0x3ff);
2811 return packFloat16(aSign, 0x1f, 0);
2813 if (aExp == 0 && aSig == 0) {
2814 return packFloat16(aSign, 0, 0);
2816 /* Decimal point between bits 22 and 23. */
2817 aSig |= 0x00800000;
2818 aExp -= 0x7f;
2819 if (aExp < -14) {
2820 mask = 0x00ffffff;
2821 if (aExp >= -24) {
2822 mask >>= 25 + aExp;
2824 } else {
2825 mask = 0x00001fff;
2827 if (aSig & mask) {
2828 float_raise( float_flag_underflow STATUS_VAR );
2829 roundingMode = STATUS(float_rounding_mode);
2830 switch (roundingMode) {
2831 case float_round_nearest_even:
2832 increment = (mask + 1) >> 1;
2833 if ((aSig & mask) == increment) {
2834 increment = aSig & (increment << 1);
2836 break;
2837 case float_round_up:
2838 increment = aSign ? 0 : mask;
2839 break;
2840 case float_round_down:
2841 increment = aSign ? mask : 0;
2842 break;
2843 default: /* round_to_zero */
2844 increment = 0;
2845 break;
2847 aSig += increment;
2848 if (aSig >= 0x01000000) {
2849 aSig >>= 1;
2850 aExp++;
2852 } else if (aExp < -14
2853 && STATUS(float_detect_tininess) == float_tininess_before_rounding) {
2854 float_raise( float_flag_underflow STATUS_VAR);
2857 if (ieee) {
2858 if (aExp > 15) {
2859 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
2860 return packFloat16(aSign, 0x1f, 0);
2862 } else {
2863 if (aExp > 16) {
2864 float_raise(float_flag_invalid | float_flag_inexact STATUS_VAR);
2865 return packFloat16(aSign, 0x1f, 0x3ff);
2868 if (aExp < -24) {
2869 return packFloat16(aSign, 0, 0);
2871 if (aExp < -14) {
2872 aSig >>= -14 - aExp;
2873 aExp = -14;
2875 return packFloat16(aSign, aExp + 14, aSig >> 13);
2878 #ifdef FLOATX80
2880 /*----------------------------------------------------------------------------
2881 | Returns the result of converting the double-precision floating-point value
2882 | `a' to the extended double-precision floating-point format. The conversion
2883 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2884 | Arithmetic.
2885 *----------------------------------------------------------------------------*/
2887 floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
2889 flag aSign;
2890 int16 aExp;
2891 bits64 aSig;
2893 a = float64_squash_input_denormal(a STATUS_VAR);
2894 aSig = extractFloat64Frac( a );
2895 aExp = extractFloat64Exp( a );
2896 aSign = extractFloat64Sign( a );
2897 if ( aExp == 0x7FF ) {
2898 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2899 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2901 if ( aExp == 0 ) {
2902 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2903 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2905 return
2906 packFloatx80(
2907 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2911 #endif
2913 #ifdef FLOAT128
2915 /*----------------------------------------------------------------------------
2916 | Returns the result of converting the double-precision floating-point value
2917 | `a' to the quadruple-precision floating-point format. The conversion is
2918 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2919 | Arithmetic.
2920 *----------------------------------------------------------------------------*/
2922 float128 float64_to_float128( float64 a STATUS_PARAM )
2924 flag aSign;
2925 int16 aExp;
2926 bits64 aSig, zSig0, zSig1;
2928 a = float64_squash_input_denormal(a STATUS_VAR);
2929 aSig = extractFloat64Frac( a );
2930 aExp = extractFloat64Exp( a );
2931 aSign = extractFloat64Sign( a );
2932 if ( aExp == 0x7FF ) {
2933 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2934 return packFloat128( aSign, 0x7FFF, 0, 0 );
2936 if ( aExp == 0 ) {
2937 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2938 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2939 --aExp;
2941 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2942 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2946 #endif
2948 /*----------------------------------------------------------------------------
2949 | Rounds the double-precision floating-point value `a' to an integer, and
2950 | returns the result as a double-precision floating-point value. The
2951 | operation is performed according to the IEC/IEEE Standard for Binary
2952 | Floating-Point Arithmetic.
2953 *----------------------------------------------------------------------------*/
2955 float64 float64_round_to_int( float64 a STATUS_PARAM )
2957 flag aSign;
2958 int16 aExp;
2959 bits64 lastBitMask, roundBitsMask;
2960 int8 roundingMode;
2961 bits64 z;
2962 a = float64_squash_input_denormal(a STATUS_VAR);
2964 aExp = extractFloat64Exp( a );
2965 if ( 0x433 <= aExp ) {
2966 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2967 return propagateFloat64NaN( a, a STATUS_VAR );
2969 return a;
2971 if ( aExp < 0x3FF ) {
2972 if ( (bits64) ( float64_val(a)<<1 ) == 0 ) return a;
2973 STATUS(float_exception_flags) |= float_flag_inexact;
2974 aSign = extractFloat64Sign( a );
2975 switch ( STATUS(float_rounding_mode) ) {
2976 case float_round_nearest_even:
2977 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2978 return packFloat64( aSign, 0x3FF, 0 );
2980 break;
2981 case float_round_down:
2982 return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
2983 case float_round_up:
2984 return make_float64(
2985 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
2987 return packFloat64( aSign, 0, 0 );
2989 lastBitMask = 1;
2990 lastBitMask <<= 0x433 - aExp;
2991 roundBitsMask = lastBitMask - 1;
2992 z = float64_val(a);
2993 roundingMode = STATUS(float_rounding_mode);
2994 if ( roundingMode == float_round_nearest_even ) {
2995 z += lastBitMask>>1;
2996 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2998 else if ( roundingMode != float_round_to_zero ) {
2999 if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
3000 z += roundBitsMask;
3003 z &= ~ roundBitsMask;
3004 if ( z != float64_val(a) )
3005 STATUS(float_exception_flags) |= float_flag_inexact;
3006 return make_float64(z);
3010 float64 float64_trunc_to_int( float64 a STATUS_PARAM)
3012 int oldmode;
3013 float64 res;
3014 oldmode = STATUS(float_rounding_mode);
3015 STATUS(float_rounding_mode) = float_round_to_zero;
3016 res = float64_round_to_int(a STATUS_VAR);
3017 STATUS(float_rounding_mode) = oldmode;
3018 return res;
3021 /*----------------------------------------------------------------------------
3022 | Returns the result of adding the absolute values of the double-precision
3023 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
3024 | before being returned. `zSign' is ignored if the result is a NaN.
3025 | The addition is performed according to the IEC/IEEE Standard for Binary
3026 | Floating-Point Arithmetic.
3027 *----------------------------------------------------------------------------*/
3029 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3031 int16 aExp, bExp, zExp;
3032 bits64 aSig, bSig, zSig;
3033 int16 expDiff;
3035 aSig = extractFloat64Frac( a );
3036 aExp = extractFloat64Exp( a );
3037 bSig = extractFloat64Frac( b );
3038 bExp = extractFloat64Exp( b );
3039 expDiff = aExp - bExp;
3040 aSig <<= 9;
3041 bSig <<= 9;
3042 if ( 0 < expDiff ) {
3043 if ( aExp == 0x7FF ) {
3044 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3045 return a;
3047 if ( bExp == 0 ) {
3048 --expDiff;
3050 else {
3051 bSig |= LIT64( 0x2000000000000000 );
3053 shift64RightJamming( bSig, expDiff, &bSig );
3054 zExp = aExp;
3056 else if ( expDiff < 0 ) {
3057 if ( bExp == 0x7FF ) {
3058 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3059 return packFloat64( zSign, 0x7FF, 0 );
3061 if ( aExp == 0 ) {
3062 ++expDiff;
3064 else {
3065 aSig |= LIT64( 0x2000000000000000 );
3067 shift64RightJamming( aSig, - expDiff, &aSig );
3068 zExp = bExp;
3070 else {
3071 if ( aExp == 0x7FF ) {
3072 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3073 return a;
3075 if ( aExp == 0 ) {
3076 if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
3077 return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3079 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3080 zExp = aExp;
3081 goto roundAndPack;
3083 aSig |= LIT64( 0x2000000000000000 );
3084 zSig = ( aSig + bSig )<<1;
3085 --zExp;
3086 if ( (sbits64) zSig < 0 ) {
3087 zSig = aSig + bSig;
3088 ++zExp;
3090 roundAndPack:
3091 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3095 /*----------------------------------------------------------------------------
3096 | Returns the result of subtracting the absolute values of the double-
3097 | precision floating-point values `a' and `b'. If `zSign' is 1, the
3098 | difference is negated before being returned. `zSign' is ignored if the
3099 | result is a NaN. The subtraction is performed according to the IEC/IEEE
3100 | Standard for Binary Floating-Point Arithmetic.
3101 *----------------------------------------------------------------------------*/
3103 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3105 int16 aExp, bExp, zExp;
3106 bits64 aSig, bSig, zSig;
3107 int16 expDiff;
3109 aSig = extractFloat64Frac( a );
3110 aExp = extractFloat64Exp( a );
3111 bSig = extractFloat64Frac( b );
3112 bExp = extractFloat64Exp( b );
3113 expDiff = aExp - bExp;
3114 aSig <<= 10;
3115 bSig <<= 10;
3116 if ( 0 < expDiff ) goto aExpBigger;
3117 if ( expDiff < 0 ) goto bExpBigger;
3118 if ( aExp == 0x7FF ) {
3119 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3120 float_raise( float_flag_invalid STATUS_VAR);
3121 return float64_default_nan;
3123 if ( aExp == 0 ) {
3124 aExp = 1;
3125 bExp = 1;
3127 if ( bSig < aSig ) goto aBigger;
3128 if ( aSig < bSig ) goto bBigger;
3129 return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3130 bExpBigger:
3131 if ( bExp == 0x7FF ) {
3132 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3133 return packFloat64( zSign ^ 1, 0x7FF, 0 );
3135 if ( aExp == 0 ) {
3136 ++expDiff;
3138 else {
3139 aSig |= LIT64( 0x4000000000000000 );
3141 shift64RightJamming( aSig, - expDiff, &aSig );
3142 bSig |= LIT64( 0x4000000000000000 );
3143 bBigger:
3144 zSig = bSig - aSig;
3145 zExp = bExp;
3146 zSign ^= 1;
3147 goto normalizeRoundAndPack;
3148 aExpBigger:
3149 if ( aExp == 0x7FF ) {
3150 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3151 return a;
3153 if ( bExp == 0 ) {
3154 --expDiff;
3156 else {
3157 bSig |= LIT64( 0x4000000000000000 );
3159 shift64RightJamming( bSig, expDiff, &bSig );
3160 aSig |= LIT64( 0x4000000000000000 );
3161 aBigger:
3162 zSig = aSig - bSig;
3163 zExp = aExp;
3164 normalizeRoundAndPack:
3165 --zExp;
3166 return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3170 /*----------------------------------------------------------------------------
3171 | Returns the result of adding the double-precision floating-point values `a'
3172 | and `b'. The operation is performed according to the IEC/IEEE Standard for
3173 | Binary Floating-Point Arithmetic.
3174 *----------------------------------------------------------------------------*/
3176 float64 float64_add( float64 a, float64 b STATUS_PARAM )
3178 flag aSign, bSign;
3179 a = float64_squash_input_denormal(a STATUS_VAR);
3180 b = float64_squash_input_denormal(b STATUS_VAR);
3182 aSign = extractFloat64Sign( a );
3183 bSign = extractFloat64Sign( b );
3184 if ( aSign == bSign ) {
3185 return addFloat64Sigs( a, b, aSign STATUS_VAR );
3187 else {
3188 return subFloat64Sigs( a, b, aSign STATUS_VAR );
3193 /*----------------------------------------------------------------------------
3194 | Returns the result of subtracting the double-precision floating-point values
3195 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3196 | for Binary Floating-Point Arithmetic.
3197 *----------------------------------------------------------------------------*/
3199 float64 float64_sub( float64 a, float64 b STATUS_PARAM )
3201 flag aSign, bSign;
3202 a = float64_squash_input_denormal(a STATUS_VAR);
3203 b = float64_squash_input_denormal(b STATUS_VAR);
3205 aSign = extractFloat64Sign( a );
3206 bSign = extractFloat64Sign( b );
3207 if ( aSign == bSign ) {
3208 return subFloat64Sigs( a, b, aSign STATUS_VAR );
3210 else {
3211 return addFloat64Sigs( a, b, aSign STATUS_VAR );
3216 /*----------------------------------------------------------------------------
3217 | Returns the result of multiplying the double-precision floating-point values
3218 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3219 | for Binary Floating-Point Arithmetic.
3220 *----------------------------------------------------------------------------*/
3222 float64 float64_mul( float64 a, float64 b STATUS_PARAM )
3224 flag aSign, bSign, zSign;
3225 int16 aExp, bExp, zExp;
3226 bits64 aSig, bSig, zSig0, zSig1;
3228 a = float64_squash_input_denormal(a STATUS_VAR);
3229 b = float64_squash_input_denormal(b STATUS_VAR);
3231 aSig = extractFloat64Frac( a );
3232 aExp = extractFloat64Exp( a );
3233 aSign = extractFloat64Sign( a );
3234 bSig = extractFloat64Frac( b );
3235 bExp = extractFloat64Exp( b );
3236 bSign = extractFloat64Sign( b );
3237 zSign = aSign ^ bSign;
3238 if ( aExp == 0x7FF ) {
3239 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3240 return propagateFloat64NaN( a, b STATUS_VAR );
3242 if ( ( bExp | bSig ) == 0 ) {
3243 float_raise( float_flag_invalid STATUS_VAR);
3244 return float64_default_nan;
3246 return packFloat64( zSign, 0x7FF, 0 );
3248 if ( bExp == 0x7FF ) {
3249 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3250 if ( ( aExp | aSig ) == 0 ) {
3251 float_raise( float_flag_invalid STATUS_VAR);
3252 return float64_default_nan;
3254 return packFloat64( zSign, 0x7FF, 0 );
3256 if ( aExp == 0 ) {
3257 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3258 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3260 if ( bExp == 0 ) {
3261 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
3262 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3264 zExp = aExp + bExp - 0x3FF;
3265 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3266 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3267 mul64To128( aSig, bSig, &zSig0, &zSig1 );
3268 zSig0 |= ( zSig1 != 0 );
3269 if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
3270 zSig0 <<= 1;
3271 --zExp;
3273 return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
3277 /*----------------------------------------------------------------------------
3278 | Returns the result of dividing the double-precision floating-point value `a'
3279 | by the corresponding value `b'. The operation is performed according to
3280 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3281 *----------------------------------------------------------------------------*/
3283 float64 float64_div( float64 a, float64 b STATUS_PARAM )
3285 flag aSign, bSign, zSign;
3286 int16 aExp, bExp, zExp;
3287 bits64 aSig, bSig, zSig;
3288 bits64 rem0, rem1;
3289 bits64 term0, term1;
3290 a = float64_squash_input_denormal(a STATUS_VAR);
3291 b = float64_squash_input_denormal(b STATUS_VAR);
3293 aSig = extractFloat64Frac( a );
3294 aExp = extractFloat64Exp( a );
3295 aSign = extractFloat64Sign( a );
3296 bSig = extractFloat64Frac( b );
3297 bExp = extractFloat64Exp( b );
3298 bSign = extractFloat64Sign( b );
3299 zSign = aSign ^ bSign;
3300 if ( aExp == 0x7FF ) {
3301 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3302 if ( bExp == 0x7FF ) {
3303 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3304 float_raise( float_flag_invalid STATUS_VAR);
3305 return float64_default_nan;
3307 return packFloat64( zSign, 0x7FF, 0 );
3309 if ( bExp == 0x7FF ) {
3310 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3311 return packFloat64( zSign, 0, 0 );
3313 if ( bExp == 0 ) {
3314 if ( bSig == 0 ) {
3315 if ( ( aExp | aSig ) == 0 ) {
3316 float_raise( float_flag_invalid STATUS_VAR);
3317 return float64_default_nan;
3319 float_raise( float_flag_divbyzero STATUS_VAR);
3320 return packFloat64( zSign, 0x7FF, 0 );
3322 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3324 if ( aExp == 0 ) {
3325 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3326 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3328 zExp = aExp - bExp + 0x3FD;
3329 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3330 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3331 if ( bSig <= ( aSig + aSig ) ) {
3332 aSig >>= 1;
3333 ++zExp;
3335 zSig = estimateDiv128To64( aSig, 0, bSig );
3336 if ( ( zSig & 0x1FF ) <= 2 ) {
3337 mul64To128( bSig, zSig, &term0, &term1 );
3338 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3339 while ( (sbits64) rem0 < 0 ) {
3340 --zSig;
3341 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3343 zSig |= ( rem1 != 0 );
3345 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3349 /*----------------------------------------------------------------------------
3350 | Returns the remainder of the double-precision floating-point value `a'
3351 | with respect to the corresponding value `b'. The operation is performed
3352 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3353 *----------------------------------------------------------------------------*/
3355 float64 float64_rem( float64 a, float64 b STATUS_PARAM )
3357 flag aSign, zSign;
3358 int16 aExp, bExp, expDiff;
3359 bits64 aSig, bSig;
3360 bits64 q, alternateASig;
3361 sbits64 sigMean;
3363 a = float64_squash_input_denormal(a STATUS_VAR);
3364 b = float64_squash_input_denormal(b STATUS_VAR);
3365 aSig = extractFloat64Frac( a );
3366 aExp = extractFloat64Exp( a );
3367 aSign = extractFloat64Sign( a );
3368 bSig = extractFloat64Frac( b );
3369 bExp = extractFloat64Exp( b );
3370 if ( aExp == 0x7FF ) {
3371 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3372 return propagateFloat64NaN( a, b STATUS_VAR );
3374 float_raise( float_flag_invalid STATUS_VAR);
3375 return float64_default_nan;
3377 if ( bExp == 0x7FF ) {
3378 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3379 return a;
3381 if ( bExp == 0 ) {
3382 if ( bSig == 0 ) {
3383 float_raise( float_flag_invalid STATUS_VAR);
3384 return float64_default_nan;
3386 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3388 if ( aExp == 0 ) {
3389 if ( aSig == 0 ) return a;
3390 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3392 expDiff = aExp - bExp;
3393 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3394 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3395 if ( expDiff < 0 ) {
3396 if ( expDiff < -1 ) return a;
3397 aSig >>= 1;
3399 q = ( bSig <= aSig );
3400 if ( q ) aSig -= bSig;
3401 expDiff -= 64;
3402 while ( 0 < expDiff ) {
3403 q = estimateDiv128To64( aSig, 0, bSig );
3404 q = ( 2 < q ) ? q - 2 : 0;
3405 aSig = - ( ( bSig>>2 ) * q );
3406 expDiff -= 62;
3408 expDiff += 64;
3409 if ( 0 < expDiff ) {
3410 q = estimateDiv128To64( aSig, 0, bSig );
3411 q = ( 2 < q ) ? q - 2 : 0;
3412 q >>= 64 - expDiff;
3413 bSig >>= 2;
3414 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3416 else {
3417 aSig >>= 2;
3418 bSig >>= 2;
3420 do {
3421 alternateASig = aSig;
3422 ++q;
3423 aSig -= bSig;
3424 } while ( 0 <= (sbits64) aSig );
3425 sigMean = aSig + alternateASig;
3426 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3427 aSig = alternateASig;
3429 zSign = ( (sbits64) aSig < 0 );
3430 if ( zSign ) aSig = - aSig;
3431 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
3435 /*----------------------------------------------------------------------------
3436 | Returns the square root of the double-precision floating-point value `a'.
3437 | The operation is performed according to the IEC/IEEE Standard for Binary
3438 | Floating-Point Arithmetic.
3439 *----------------------------------------------------------------------------*/
3441 float64 float64_sqrt( float64 a STATUS_PARAM )
3443 flag aSign;
3444 int16 aExp, zExp;
3445 bits64 aSig, zSig, doubleZSig;
3446 bits64 rem0, rem1, term0, term1;
3447 a = float64_squash_input_denormal(a STATUS_VAR);
3449 aSig = extractFloat64Frac( a );
3450 aExp = extractFloat64Exp( a );
3451 aSign = extractFloat64Sign( a );
3452 if ( aExp == 0x7FF ) {
3453 if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
3454 if ( ! aSign ) return a;
3455 float_raise( float_flag_invalid STATUS_VAR);
3456 return float64_default_nan;
3458 if ( aSign ) {
3459 if ( ( aExp | aSig ) == 0 ) return a;
3460 float_raise( float_flag_invalid STATUS_VAR);
3461 return float64_default_nan;
3463 if ( aExp == 0 ) {
3464 if ( aSig == 0 ) return float64_zero;
3465 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3467 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3468 aSig |= LIT64( 0x0010000000000000 );
3469 zSig = estimateSqrt32( aExp, aSig>>21 );
3470 aSig <<= 9 - ( aExp & 1 );
3471 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3472 if ( ( zSig & 0x1FF ) <= 5 ) {
3473 doubleZSig = zSig<<1;
3474 mul64To128( zSig, zSig, &term0, &term1 );
3475 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3476 while ( (sbits64) rem0 < 0 ) {
3477 --zSig;
3478 doubleZSig -= 2;
3479 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3481 zSig |= ( ( rem0 | rem1 ) != 0 );
3483 return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
3487 /*----------------------------------------------------------------------------
3488 | Returns the binary log of the double-precision floating-point value `a'.
3489 | The operation is performed according to the IEC/IEEE Standard for Binary
3490 | Floating-Point Arithmetic.
3491 *----------------------------------------------------------------------------*/
3492 float64 float64_log2( float64 a STATUS_PARAM )
3494 flag aSign, zSign;
3495 int16 aExp;
3496 bits64 aSig, aSig0, aSig1, zSig, i;
3497 a = float64_squash_input_denormal(a STATUS_VAR);
3499 aSig = extractFloat64Frac( a );
3500 aExp = extractFloat64Exp( a );
3501 aSign = extractFloat64Sign( a );
3503 if ( aExp == 0 ) {
3504 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
3505 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3507 if ( aSign ) {
3508 float_raise( float_flag_invalid STATUS_VAR);
3509 return float64_default_nan;
3511 if ( aExp == 0x7FF ) {
3512 if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
3513 return a;
3516 aExp -= 0x3FF;
3517 aSig |= LIT64( 0x0010000000000000 );
3518 zSign = aExp < 0;
3519 zSig = (bits64)aExp << 52;
3520 for (i = 1LL << 51; i > 0; i >>= 1) {
3521 mul64To128( aSig, aSig, &aSig0, &aSig1 );
3522 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
3523 if ( aSig & LIT64( 0x0020000000000000 ) ) {
3524 aSig >>= 1;
3525 zSig |= i;
3529 if ( zSign )
3530 zSig = -zSig;
3531 return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
3534 /*----------------------------------------------------------------------------
3535 | Returns 1 if the double-precision floating-point value `a' is equal to the
3536 | corresponding value `b', and 0 otherwise. The comparison is performed
3537 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3538 *----------------------------------------------------------------------------*/
3540 int float64_eq( float64 a, float64 b STATUS_PARAM )
3542 bits64 av, bv;
3543 a = float64_squash_input_denormal(a STATUS_VAR);
3544 b = float64_squash_input_denormal(b STATUS_VAR);
3546 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3547 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3549 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3550 float_raise( float_flag_invalid STATUS_VAR);
3552 return 0;
3554 av = float64_val(a);
3555 bv = float64_val(b);
3556 return ( av == bv ) || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3560 /*----------------------------------------------------------------------------
3561 | Returns 1 if the double-precision floating-point value `a' is less than or
3562 | equal to the corresponding value `b', and 0 otherwise. The comparison is
3563 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3564 | Arithmetic.
3565 *----------------------------------------------------------------------------*/
3567 int float64_le( float64 a, float64 b STATUS_PARAM )
3569 flag aSign, bSign;
3570 bits64 av, bv;
3571 a = float64_squash_input_denormal(a STATUS_VAR);
3572 b = float64_squash_input_denormal(b STATUS_VAR);
3574 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3575 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3577 float_raise( float_flag_invalid STATUS_VAR);
3578 return 0;
3580 aSign = extractFloat64Sign( a );
3581 bSign = extractFloat64Sign( b );
3582 av = float64_val(a);
3583 bv = float64_val(b);
3584 if ( aSign != bSign ) return aSign || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3585 return ( av == bv ) || ( aSign ^ ( av < bv ) );
3589 /*----------------------------------------------------------------------------
3590 | Returns 1 if the double-precision floating-point value `a' is less than
3591 | the corresponding value `b', and 0 otherwise. The comparison is performed
3592 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3593 *----------------------------------------------------------------------------*/
3595 int float64_lt( float64 a, float64 b STATUS_PARAM )
3597 flag aSign, bSign;
3598 bits64 av, bv;
3600 a = float64_squash_input_denormal(a STATUS_VAR);
3601 b = float64_squash_input_denormal(b STATUS_VAR);
3602 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3603 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3605 float_raise( float_flag_invalid STATUS_VAR);
3606 return 0;
3608 aSign = extractFloat64Sign( a );
3609 bSign = extractFloat64Sign( b );
3610 av = float64_val(a);
3611 bv = float64_val(b);
3612 if ( aSign != bSign ) return aSign && ( (bits64) ( ( av | bv )<<1 ) != 0 );
3613 return ( av != bv ) && ( aSign ^ ( av < bv ) );
3617 /*----------------------------------------------------------------------------
3618 | Returns 1 if the double-precision floating-point value `a' is equal to the
3619 | corresponding value `b', and 0 otherwise. The invalid exception is raised
3620 | if either operand is a NaN. Otherwise, the comparison is performed
3621 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3622 *----------------------------------------------------------------------------*/
3624 int float64_eq_signaling( float64 a, float64 b STATUS_PARAM )
3626 bits64 av, bv;
3627 a = float64_squash_input_denormal(a STATUS_VAR);
3628 b = float64_squash_input_denormal(b STATUS_VAR);
3630 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3631 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3633 float_raise( float_flag_invalid STATUS_VAR);
3634 return 0;
3636 av = float64_val(a);
3637 bv = float64_val(b);
3638 return ( av == bv ) || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3642 /*----------------------------------------------------------------------------
3643 | Returns 1 if the double-precision floating-point value `a' is less than or
3644 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3645 | cause an exception. Otherwise, the comparison is performed according to the
3646 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3647 *----------------------------------------------------------------------------*/
3649 int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
3651 flag aSign, bSign;
3652 bits64 av, bv;
3653 a = float64_squash_input_denormal(a STATUS_VAR);
3654 b = float64_squash_input_denormal(b STATUS_VAR);
3656 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3657 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3659 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3660 float_raise( float_flag_invalid STATUS_VAR);
3662 return 0;
3664 aSign = extractFloat64Sign( a );
3665 bSign = extractFloat64Sign( b );
3666 av = float64_val(a);
3667 bv = float64_val(b);
3668 if ( aSign != bSign ) return aSign || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3669 return ( av == bv ) || ( aSign ^ ( av < bv ) );
3673 /*----------------------------------------------------------------------------
3674 | Returns 1 if the double-precision floating-point value `a' is less than
3675 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3676 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
3677 | Standard for Binary Floating-Point Arithmetic.
3678 *----------------------------------------------------------------------------*/
3680 int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
3682 flag aSign, bSign;
3683 bits64 av, bv;
3684 a = float64_squash_input_denormal(a STATUS_VAR);
3685 b = float64_squash_input_denormal(b STATUS_VAR);
3687 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3688 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3690 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3691 float_raise( float_flag_invalid STATUS_VAR);
3693 return 0;
3695 aSign = extractFloat64Sign( a );
3696 bSign = extractFloat64Sign( b );
3697 av = float64_val(a);
3698 bv = float64_val(b);
3699 if ( aSign != bSign ) return aSign && ( (bits64) ( ( av | bv )<<1 ) != 0 );
3700 return ( av != bv ) && ( aSign ^ ( av < bv ) );
3704 #ifdef FLOATX80
3706 /*----------------------------------------------------------------------------
3707 | Returns the result of converting the extended double-precision floating-
3708 | point value `a' to the 32-bit two's complement integer format. The
3709 | conversion is performed according to the IEC/IEEE Standard for Binary
3710 | Floating-Point Arithmetic---which means in particular that the conversion
3711 | is rounded according to the current rounding mode. If `a' is a NaN, the
3712 | largest positive integer is returned. Otherwise, if the conversion
3713 | overflows, the largest integer with the same sign as `a' is returned.
3714 *----------------------------------------------------------------------------*/
3716 int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
3718 flag aSign;
3719 int32 aExp, shiftCount;
3720 bits64 aSig;
3722 aSig = extractFloatx80Frac( a );
3723 aExp = extractFloatx80Exp( a );
3724 aSign = extractFloatx80Sign( a );
3725 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3726 shiftCount = 0x4037 - aExp;
3727 if ( shiftCount <= 0 ) shiftCount = 1;
3728 shift64RightJamming( aSig, shiftCount, &aSig );
3729 return roundAndPackInt32( aSign, aSig STATUS_VAR );
3733 /*----------------------------------------------------------------------------
3734 | Returns the result of converting the extended double-precision floating-
3735 | point value `a' to the 32-bit two's complement integer format. The
3736 | conversion is performed according to the IEC/IEEE Standard for Binary
3737 | Floating-Point Arithmetic, except that the conversion is always rounded
3738 | toward zero. If `a' is a NaN, the largest positive integer is returned.
3739 | Otherwise, if the conversion overflows, the largest integer with the same
3740 | sign as `a' is returned.
3741 *----------------------------------------------------------------------------*/
3743 int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
3745 flag aSign;
3746 int32 aExp, shiftCount;
3747 bits64 aSig, savedASig;
3748 int32 z;
3750 aSig = extractFloatx80Frac( a );
3751 aExp = extractFloatx80Exp( a );
3752 aSign = extractFloatx80Sign( a );
3753 if ( 0x401E < aExp ) {
3754 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3755 goto invalid;
3757 else if ( aExp < 0x3FFF ) {
3758 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3759 return 0;
3761 shiftCount = 0x403E - aExp;
3762 savedASig = aSig;
3763 aSig >>= shiftCount;
3764 z = aSig;
3765 if ( aSign ) z = - z;
3766 if ( ( z < 0 ) ^ aSign ) {
3767 invalid:
3768 float_raise( float_flag_invalid STATUS_VAR);
3769 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3771 if ( ( aSig<<shiftCount ) != savedASig ) {
3772 STATUS(float_exception_flags) |= float_flag_inexact;
3774 return z;
3778 /*----------------------------------------------------------------------------
3779 | Returns the result of converting the extended double-precision floating-
3780 | point value `a' to the 64-bit two's complement integer format. The
3781 | conversion is performed according to the IEC/IEEE Standard for Binary
3782 | Floating-Point Arithmetic---which means in particular that the conversion
3783 | is rounded according to the current rounding mode. If `a' is a NaN,
3784 | the largest positive integer is returned. Otherwise, if the conversion
3785 | overflows, the largest integer with the same sign as `a' is returned.
3786 *----------------------------------------------------------------------------*/
3788 int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
3790 flag aSign;
3791 int32 aExp, shiftCount;
3792 bits64 aSig, aSigExtra;
3794 aSig = extractFloatx80Frac( a );
3795 aExp = extractFloatx80Exp( a );
3796 aSign = extractFloatx80Sign( a );
3797 shiftCount = 0x403E - aExp;
3798 if ( shiftCount <= 0 ) {
3799 if ( shiftCount ) {
3800 float_raise( float_flag_invalid STATUS_VAR);
3801 if ( ! aSign
3802 || ( ( aExp == 0x7FFF )
3803 && ( aSig != LIT64( 0x8000000000000000 ) ) )
3805 return LIT64( 0x7FFFFFFFFFFFFFFF );
3807 return (sbits64) LIT64( 0x8000000000000000 );
3809 aSigExtra = 0;
3811 else {
3812 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3814 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3818 /*----------------------------------------------------------------------------
3819 | Returns the result of converting the extended double-precision floating-
3820 | point value `a' to the 64-bit two's complement integer format. The
3821 | conversion is performed according to the IEC/IEEE Standard for Binary
3822 | Floating-Point Arithmetic, except that the conversion is always rounded
3823 | toward zero. If `a' is a NaN, the largest positive integer is returned.
3824 | Otherwise, if the conversion overflows, the largest integer with the same
3825 | sign as `a' is returned.
3826 *----------------------------------------------------------------------------*/
3828 int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
3830 flag aSign;
3831 int32 aExp, shiftCount;
3832 bits64 aSig;
3833 int64 z;
3835 aSig = extractFloatx80Frac( a );
3836 aExp = extractFloatx80Exp( a );
3837 aSign = extractFloatx80Sign( a );
3838 shiftCount = aExp - 0x403E;
3839 if ( 0 <= shiftCount ) {
3840 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3841 if ( ( a.high != 0xC03E ) || aSig ) {
3842 float_raise( float_flag_invalid STATUS_VAR);
3843 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3844 return LIT64( 0x7FFFFFFFFFFFFFFF );
3847 return (sbits64) LIT64( 0x8000000000000000 );
3849 else if ( aExp < 0x3FFF ) {
3850 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3851 return 0;
3853 z = aSig>>( - shiftCount );
3854 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3855 STATUS(float_exception_flags) |= float_flag_inexact;
3857 if ( aSign ) z = - z;
3858 return z;
3862 /*----------------------------------------------------------------------------
3863 | Returns the result of converting the extended double-precision floating-
3864 | point value `a' to the single-precision floating-point format. The
3865 | conversion is performed according to the IEC/IEEE Standard for Binary
3866 | Floating-Point Arithmetic.
3867 *----------------------------------------------------------------------------*/
3869 float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
3871 flag aSign;
3872 int32 aExp;
3873 bits64 aSig;
3875 aSig = extractFloatx80Frac( a );
3876 aExp = extractFloatx80Exp( a );
3877 aSign = extractFloatx80Sign( a );
3878 if ( aExp == 0x7FFF ) {
3879 if ( (bits64) ( aSig<<1 ) ) {
3880 return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3882 return packFloat32( aSign, 0xFF, 0 );
3884 shift64RightJamming( aSig, 33, &aSig );
3885 if ( aExp || aSig ) aExp -= 0x3F81;
3886 return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
3890 /*----------------------------------------------------------------------------
3891 | Returns the result of converting the extended double-precision floating-
3892 | point value `a' to the double-precision floating-point format. The
3893 | conversion is performed according to the IEC/IEEE Standard for Binary
3894 | Floating-Point Arithmetic.
3895 *----------------------------------------------------------------------------*/
3897 float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
3899 flag aSign;
3900 int32 aExp;
3901 bits64 aSig, zSig;
3903 aSig = extractFloatx80Frac( a );
3904 aExp = extractFloatx80Exp( a );
3905 aSign = extractFloatx80Sign( a );
3906 if ( aExp == 0x7FFF ) {
3907 if ( (bits64) ( aSig<<1 ) ) {
3908 return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3910 return packFloat64( aSign, 0x7FF, 0 );
3912 shift64RightJamming( aSig, 1, &zSig );
3913 if ( aExp || aSig ) aExp -= 0x3C01;
3914 return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
3918 #ifdef FLOAT128
3920 /*----------------------------------------------------------------------------
3921 | Returns the result of converting the extended double-precision floating-
3922 | point value `a' to the quadruple-precision floating-point format. The
3923 | conversion is performed according to the IEC/IEEE Standard for Binary
3924 | Floating-Point Arithmetic.
3925 *----------------------------------------------------------------------------*/
3927 float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
3929 flag aSign;
3930 int16 aExp;
3931 bits64 aSig, zSig0, zSig1;
3933 aSig = extractFloatx80Frac( a );
3934 aExp = extractFloatx80Exp( a );
3935 aSign = extractFloatx80Sign( a );
3936 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3937 return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3939 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3940 return packFloat128( aSign, aExp, zSig0, zSig1 );
3944 #endif
3946 /*----------------------------------------------------------------------------
3947 | Rounds the extended double-precision floating-point value `a' to an integer,
3948 | and returns the result as an extended quadruple-precision floating-point
3949 | value. The operation is performed according to the IEC/IEEE Standard for
3950 | Binary Floating-Point Arithmetic.
3951 *----------------------------------------------------------------------------*/
3953 floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
3955 flag aSign;
3956 int32 aExp;
3957 bits64 lastBitMask, roundBitsMask;
3958 int8 roundingMode;
3959 floatx80 z;
3961 aExp = extractFloatx80Exp( a );
3962 if ( 0x403E <= aExp ) {
3963 if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3964 return propagateFloatx80NaN( a, a STATUS_VAR );
3966 return a;
3968 if ( aExp < 0x3FFF ) {
3969 if ( ( aExp == 0 )
3970 && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3971 return a;
3973 STATUS(float_exception_flags) |= float_flag_inexact;
3974 aSign = extractFloatx80Sign( a );
3975 switch ( STATUS(float_rounding_mode) ) {
3976 case float_round_nearest_even:
3977 if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3979 return
3980 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3982 break;
3983 case float_round_down:
3984 return
3985 aSign ?
3986 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3987 : packFloatx80( 0, 0, 0 );
3988 case float_round_up:
3989 return
3990 aSign ? packFloatx80( 1, 0, 0 )
3991 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3993 return packFloatx80( aSign, 0, 0 );
3995 lastBitMask = 1;
3996 lastBitMask <<= 0x403E - aExp;
3997 roundBitsMask = lastBitMask - 1;
3998 z = a;
3999 roundingMode = STATUS(float_rounding_mode);
4000 if ( roundingMode == float_round_nearest_even ) {
4001 z.low += lastBitMask>>1;
4002 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4004 else if ( roundingMode != float_round_to_zero ) {
4005 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
4006 z.low += roundBitsMask;
4009 z.low &= ~ roundBitsMask;
4010 if ( z.low == 0 ) {
4011 ++z.high;
4012 z.low = LIT64( 0x8000000000000000 );
4014 if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
4015 return z;
4019 /*----------------------------------------------------------------------------
4020 | Returns the result of adding the absolute values of the extended double-
4021 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
4022 | negated before being returned. `zSign' is ignored if the result is a NaN.
4023 | The addition is performed according to the IEC/IEEE Standard for Binary
4024 | Floating-Point Arithmetic.
4025 *----------------------------------------------------------------------------*/
4027 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
4029 int32 aExp, bExp, zExp;
4030 bits64 aSig, bSig, zSig0, zSig1;
4031 int32 expDiff;
4033 aSig = extractFloatx80Frac( a );
4034 aExp = extractFloatx80Exp( a );
4035 bSig = extractFloatx80Frac( b );
4036 bExp = extractFloatx80Exp( b );
4037 expDiff = aExp - bExp;
4038 if ( 0 < expDiff ) {
4039 if ( aExp == 0x7FFF ) {
4040 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4041 return a;
4043 if ( bExp == 0 ) --expDiff;
4044 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4045 zExp = aExp;
4047 else if ( expDiff < 0 ) {
4048 if ( bExp == 0x7FFF ) {
4049 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4050 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4052 if ( aExp == 0 ) ++expDiff;
4053 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4054 zExp = bExp;
4056 else {
4057 if ( aExp == 0x7FFF ) {
4058 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
4059 return propagateFloatx80NaN( a, b STATUS_VAR );
4061 return a;
4063 zSig1 = 0;
4064 zSig0 = aSig + bSig;
4065 if ( aExp == 0 ) {
4066 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4067 goto roundAndPack;
4069 zExp = aExp;
4070 goto shiftRight1;
4072 zSig0 = aSig + bSig;
4073 if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
4074 shiftRight1:
4075 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4076 zSig0 |= LIT64( 0x8000000000000000 );
4077 ++zExp;
4078 roundAndPack:
4079 return
4080 roundAndPackFloatx80(
4081 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4085 /*----------------------------------------------------------------------------
4086 | Returns the result of subtracting the absolute values of the extended
4087 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
4088 | difference is negated before being returned. `zSign' is ignored if the
4089 | result is a NaN. The subtraction is performed according to the IEC/IEEE
4090 | Standard for Binary Floating-Point Arithmetic.
4091 *----------------------------------------------------------------------------*/
4093 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
4095 int32 aExp, bExp, zExp;
4096 bits64 aSig, bSig, zSig0, zSig1;
4097 int32 expDiff;
4098 floatx80 z;
4100 aSig = extractFloatx80Frac( a );
4101 aExp = extractFloatx80Exp( a );
4102 bSig = extractFloatx80Frac( b );
4103 bExp = extractFloatx80Exp( b );
4104 expDiff = aExp - bExp;
4105 if ( 0 < expDiff ) goto aExpBigger;
4106 if ( expDiff < 0 ) goto bExpBigger;
4107 if ( aExp == 0x7FFF ) {
4108 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
4109 return propagateFloatx80NaN( a, b STATUS_VAR );
4111 float_raise( float_flag_invalid STATUS_VAR);
4112 z.low = floatx80_default_nan_low;
4113 z.high = floatx80_default_nan_high;
4114 return z;
4116 if ( aExp == 0 ) {
4117 aExp = 1;
4118 bExp = 1;
4120 zSig1 = 0;
4121 if ( bSig < aSig ) goto aBigger;
4122 if ( aSig < bSig ) goto bBigger;
4123 return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
4124 bExpBigger:
4125 if ( bExp == 0x7FFF ) {
4126 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4127 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
4129 if ( aExp == 0 ) ++expDiff;
4130 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4131 bBigger:
4132 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4133 zExp = bExp;
4134 zSign ^= 1;
4135 goto normalizeRoundAndPack;
4136 aExpBigger:
4137 if ( aExp == 0x7FFF ) {
4138 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4139 return a;
4141 if ( bExp == 0 ) --expDiff;
4142 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4143 aBigger:
4144 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4145 zExp = aExp;
4146 normalizeRoundAndPack:
4147 return
4148 normalizeRoundAndPackFloatx80(
4149 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4153 /*----------------------------------------------------------------------------
4154 | Returns the result of adding the extended double-precision floating-point
4155 | values `a' and `b'. The operation is performed according to the IEC/IEEE
4156 | Standard for Binary Floating-Point Arithmetic.
4157 *----------------------------------------------------------------------------*/
4159 floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
4161 flag aSign, bSign;
4163 aSign = extractFloatx80Sign( a );
4164 bSign = extractFloatx80Sign( b );
4165 if ( aSign == bSign ) {
4166 return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4168 else {
4169 return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4174 /*----------------------------------------------------------------------------
4175 | Returns the result of subtracting the extended double-precision floating-
4176 | point values `a' and `b'. The operation is performed according to the
4177 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4178 *----------------------------------------------------------------------------*/
4180 floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
4182 flag aSign, bSign;
4184 aSign = extractFloatx80Sign( a );
4185 bSign = extractFloatx80Sign( b );
4186 if ( aSign == bSign ) {
4187 return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4189 else {
4190 return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4195 /*----------------------------------------------------------------------------
4196 | Returns the result of multiplying the extended double-precision floating-
4197 | point values `a' and `b'. The operation is performed according to the
4198 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4199 *----------------------------------------------------------------------------*/
4201 floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
4203 flag aSign, bSign, zSign;
4204 int32 aExp, bExp, zExp;
4205 bits64 aSig, bSig, zSig0, zSig1;
4206 floatx80 z;
4208 aSig = extractFloatx80Frac( a );
4209 aExp = extractFloatx80Exp( a );
4210 aSign = extractFloatx80Sign( a );
4211 bSig = extractFloatx80Frac( b );
4212 bExp = extractFloatx80Exp( b );
4213 bSign = extractFloatx80Sign( b );
4214 zSign = aSign ^ bSign;
4215 if ( aExp == 0x7FFF ) {
4216 if ( (bits64) ( aSig<<1 )
4217 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
4218 return propagateFloatx80NaN( a, b STATUS_VAR );
4220 if ( ( bExp | bSig ) == 0 ) goto invalid;
4221 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4223 if ( bExp == 0x7FFF ) {
4224 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4225 if ( ( aExp | aSig ) == 0 ) {
4226 invalid:
4227 float_raise( float_flag_invalid STATUS_VAR);
4228 z.low = floatx80_default_nan_low;
4229 z.high = floatx80_default_nan_high;
4230 return z;
4232 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4234 if ( aExp == 0 ) {
4235 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4236 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4238 if ( bExp == 0 ) {
4239 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
4240 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4242 zExp = aExp + bExp - 0x3FFE;
4243 mul64To128( aSig, bSig, &zSig0, &zSig1 );
4244 if ( 0 < (sbits64) zSig0 ) {
4245 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
4246 --zExp;
4248 return
4249 roundAndPackFloatx80(
4250 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4254 /*----------------------------------------------------------------------------
4255 | Returns the result of dividing the extended double-precision floating-point
4256 | value `a' by the corresponding value `b'. The operation is performed
4257 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4258 *----------------------------------------------------------------------------*/
4260 floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
4262 flag aSign, bSign, zSign;
4263 int32 aExp, bExp, zExp;
4264 bits64 aSig, bSig, zSig0, zSig1;
4265 bits64 rem0, rem1, rem2, term0, term1, term2;
4266 floatx80 z;
4268 aSig = extractFloatx80Frac( a );
4269 aExp = extractFloatx80Exp( a );
4270 aSign = extractFloatx80Sign( a );
4271 bSig = extractFloatx80Frac( b );
4272 bExp = extractFloatx80Exp( b );
4273 bSign = extractFloatx80Sign( b );
4274 zSign = aSign ^ bSign;
4275 if ( aExp == 0x7FFF ) {
4276 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4277 if ( bExp == 0x7FFF ) {
4278 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4279 goto invalid;
4281 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4283 if ( bExp == 0x7FFF ) {
4284 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4285 return packFloatx80( zSign, 0, 0 );
4287 if ( bExp == 0 ) {
4288 if ( bSig == 0 ) {
4289 if ( ( aExp | aSig ) == 0 ) {
4290 invalid:
4291 float_raise( float_flag_invalid STATUS_VAR);
4292 z.low = floatx80_default_nan_low;
4293 z.high = floatx80_default_nan_high;
4294 return z;
4296 float_raise( float_flag_divbyzero STATUS_VAR);
4297 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4299 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4301 if ( aExp == 0 ) {
4302 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4303 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4305 zExp = aExp - bExp + 0x3FFE;
4306 rem1 = 0;
4307 if ( bSig <= aSig ) {
4308 shift128Right( aSig, 0, 1, &aSig, &rem1 );
4309 ++zExp;
4311 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
4312 mul64To128( bSig, zSig0, &term0, &term1 );
4313 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
4314 while ( (sbits64) rem0 < 0 ) {
4315 --zSig0;
4316 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4318 zSig1 = estimateDiv128To64( rem1, 0, bSig );
4319 if ( (bits64) ( zSig1<<1 ) <= 8 ) {
4320 mul64To128( bSig, zSig1, &term1, &term2 );
4321 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4322 while ( (sbits64) rem1 < 0 ) {
4323 --zSig1;
4324 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
4326 zSig1 |= ( ( rem1 | rem2 ) != 0 );
4328 return
4329 roundAndPackFloatx80(
4330 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4334 /*----------------------------------------------------------------------------
4335 | Returns the remainder of the extended double-precision floating-point value
4336 | `a' with respect to the corresponding value `b'. The operation is performed
4337 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4338 *----------------------------------------------------------------------------*/
4340 floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
4342 flag aSign, zSign;
4343 int32 aExp, bExp, expDiff;
4344 bits64 aSig0, aSig1, bSig;
4345 bits64 q, term0, term1, alternateASig0, alternateASig1;
4346 floatx80 z;
4348 aSig0 = extractFloatx80Frac( a );
4349 aExp = extractFloatx80Exp( a );
4350 aSign = extractFloatx80Sign( a );
4351 bSig = extractFloatx80Frac( b );
4352 bExp = extractFloatx80Exp( b );
4353 if ( aExp == 0x7FFF ) {
4354 if ( (bits64) ( aSig0<<1 )
4355 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
4356 return propagateFloatx80NaN( a, b STATUS_VAR );
4358 goto invalid;
4360 if ( bExp == 0x7FFF ) {
4361 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4362 return a;
4364 if ( bExp == 0 ) {
4365 if ( bSig == 0 ) {
4366 invalid:
4367 float_raise( float_flag_invalid STATUS_VAR);
4368 z.low = floatx80_default_nan_low;
4369 z.high = floatx80_default_nan_high;
4370 return z;
4372 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4374 if ( aExp == 0 ) {
4375 if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
4376 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4378 bSig |= LIT64( 0x8000000000000000 );
4379 zSign = aSign;
4380 expDiff = aExp - bExp;
4381 aSig1 = 0;
4382 if ( expDiff < 0 ) {
4383 if ( expDiff < -1 ) return a;
4384 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
4385 expDiff = 0;
4387 q = ( bSig <= aSig0 );
4388 if ( q ) aSig0 -= bSig;
4389 expDiff -= 64;
4390 while ( 0 < expDiff ) {
4391 q = estimateDiv128To64( aSig0, aSig1, bSig );
4392 q = ( 2 < q ) ? q - 2 : 0;
4393 mul64To128( bSig, q, &term0, &term1 );
4394 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4395 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
4396 expDiff -= 62;
4398 expDiff += 64;
4399 if ( 0 < expDiff ) {
4400 q = estimateDiv128To64( aSig0, aSig1, bSig );
4401 q = ( 2 < q ) ? q - 2 : 0;
4402 q >>= 64 - expDiff;
4403 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
4404 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4405 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
4406 while ( le128( term0, term1, aSig0, aSig1 ) ) {
4407 ++q;
4408 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4411 else {
4412 term1 = 0;
4413 term0 = bSig;
4415 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4416 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4417 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4418 && ( q & 1 ) )
4420 aSig0 = alternateASig0;
4421 aSig1 = alternateASig1;
4422 zSign = ! zSign;
4424 return
4425 normalizeRoundAndPackFloatx80(
4426 80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
4430 /*----------------------------------------------------------------------------
4431 | Returns the square root of the extended double-precision floating-point
4432 | value `a'. The operation is performed according to the IEC/IEEE Standard
4433 | for Binary Floating-Point Arithmetic.
4434 *----------------------------------------------------------------------------*/
4436 floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
4438 flag aSign;
4439 int32 aExp, zExp;
4440 bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4441 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4442 floatx80 z;
4444 aSig0 = extractFloatx80Frac( a );
4445 aExp = extractFloatx80Exp( a );
4446 aSign = extractFloatx80Sign( a );
4447 if ( aExp == 0x7FFF ) {
4448 if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
4449 if ( ! aSign ) return a;
4450 goto invalid;
4452 if ( aSign ) {
4453 if ( ( aExp | aSig0 ) == 0 ) return a;
4454 invalid:
4455 float_raise( float_flag_invalid STATUS_VAR);
4456 z.low = floatx80_default_nan_low;
4457 z.high = floatx80_default_nan_high;
4458 return z;
4460 if ( aExp == 0 ) {
4461 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4462 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4464 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4465 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4466 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4467 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4468 doubleZSig0 = zSig0<<1;
4469 mul64To128( zSig0, zSig0, &term0, &term1 );
4470 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4471 while ( (sbits64) rem0 < 0 ) {
4472 --zSig0;
4473 doubleZSig0 -= 2;
4474 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4476 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4477 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4478 if ( zSig1 == 0 ) zSig1 = 1;
4479 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4480 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4481 mul64To128( zSig1, zSig1, &term2, &term3 );
4482 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4483 while ( (sbits64) rem1 < 0 ) {
4484 --zSig1;
4485 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4486 term3 |= 1;
4487 term2 |= doubleZSig0;
4488 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4490 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4492 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4493 zSig0 |= doubleZSig0;
4494 return
4495 roundAndPackFloatx80(
4496 STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
4500 /*----------------------------------------------------------------------------
4501 | Returns 1 if the extended double-precision floating-point value `a' is
4502 | equal to the corresponding value `b', and 0 otherwise. The comparison is
4503 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4504 | Arithmetic.
4505 *----------------------------------------------------------------------------*/
4507 int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
4510 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4511 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4512 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4513 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4515 if ( floatx80_is_signaling_nan( a )
4516 || floatx80_is_signaling_nan( b ) ) {
4517 float_raise( float_flag_invalid STATUS_VAR);
4519 return 0;
4521 return
4522 ( a.low == b.low )
4523 && ( ( a.high == b.high )
4524 || ( ( a.low == 0 )
4525 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4530 /*----------------------------------------------------------------------------
4531 | Returns 1 if the extended double-precision floating-point value `a' is
4532 | less than or equal to the corresponding value `b', and 0 otherwise. The
4533 | comparison is performed according to the IEC/IEEE Standard for Binary
4534 | Floating-Point Arithmetic.
4535 *----------------------------------------------------------------------------*/
4537 int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
4539 flag aSign, bSign;
4541 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4542 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4543 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4544 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4546 float_raise( float_flag_invalid STATUS_VAR);
4547 return 0;
4549 aSign = extractFloatx80Sign( a );
4550 bSign = extractFloatx80Sign( b );
4551 if ( aSign != bSign ) {
4552 return
4553 aSign
4554 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4555 == 0 );
4557 return
4558 aSign ? le128( b.high, b.low, a.high, a.low )
4559 : le128( a.high, a.low, b.high, b.low );
4563 /*----------------------------------------------------------------------------
4564 | Returns 1 if the extended double-precision floating-point value `a' is
4565 | less than the corresponding value `b', and 0 otherwise. The comparison
4566 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4567 | Arithmetic.
4568 *----------------------------------------------------------------------------*/
4570 int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
4572 flag aSign, bSign;
4574 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4575 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4576 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4577 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4579 float_raise( float_flag_invalid STATUS_VAR);
4580 return 0;
4582 aSign = extractFloatx80Sign( a );
4583 bSign = extractFloatx80Sign( b );
4584 if ( aSign != bSign ) {
4585 return
4586 aSign
4587 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4588 != 0 );
4590 return
4591 aSign ? lt128( b.high, b.low, a.high, a.low )
4592 : lt128( a.high, a.low, b.high, b.low );
4596 /*----------------------------------------------------------------------------
4597 | Returns 1 if the extended double-precision floating-point value `a' is equal
4598 | to the corresponding value `b', and 0 otherwise. The invalid exception is
4599 | raised if either operand is a NaN. Otherwise, the comparison is performed
4600 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4601 *----------------------------------------------------------------------------*/
4603 int floatx80_eq_signaling( floatx80 a, floatx80 b STATUS_PARAM )
4606 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4607 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4608 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4609 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4611 float_raise( float_flag_invalid STATUS_VAR);
4612 return 0;
4614 return
4615 ( a.low == b.low )
4616 && ( ( a.high == b.high )
4617 || ( ( a.low == 0 )
4618 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4623 /*----------------------------------------------------------------------------
4624 | Returns 1 if the extended double-precision floating-point value `a' is less
4625 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
4626 | do not cause an exception. Otherwise, the comparison is performed according
4627 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4628 *----------------------------------------------------------------------------*/
4630 int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4632 flag aSign, bSign;
4634 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4635 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4636 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4637 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4639 if ( floatx80_is_signaling_nan( a )
4640 || floatx80_is_signaling_nan( b ) ) {
4641 float_raise( float_flag_invalid STATUS_VAR);
4643 return 0;
4645 aSign = extractFloatx80Sign( a );
4646 bSign = extractFloatx80Sign( b );
4647 if ( aSign != bSign ) {
4648 return
4649 aSign
4650 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4651 == 0 );
4653 return
4654 aSign ? le128( b.high, b.low, a.high, a.low )
4655 : le128( a.high, a.low, b.high, b.low );
4659 /*----------------------------------------------------------------------------
4660 | Returns 1 if the extended double-precision floating-point value `a' is less
4661 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
4662 | an exception. Otherwise, the comparison is performed according to the
4663 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4664 *----------------------------------------------------------------------------*/
4666 int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4668 flag aSign, bSign;
4670 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4671 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4672 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4673 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4675 if ( floatx80_is_signaling_nan( a )
4676 || floatx80_is_signaling_nan( b ) ) {
4677 float_raise( float_flag_invalid STATUS_VAR);
4679 return 0;
4681 aSign = extractFloatx80Sign( a );
4682 bSign = extractFloatx80Sign( b );
4683 if ( aSign != bSign ) {
4684 return
4685 aSign
4686 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4687 != 0 );
4689 return
4690 aSign ? lt128( b.high, b.low, a.high, a.low )
4691 : lt128( a.high, a.low, b.high, b.low );
4695 #endif
4697 #ifdef FLOAT128
4699 /*----------------------------------------------------------------------------
4700 | Returns the result of converting the quadruple-precision floating-point
4701 | value `a' to the 32-bit two's complement integer format. The conversion
4702 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4703 | Arithmetic---which means in particular that the conversion is rounded
4704 | according to the current rounding mode. If `a' is a NaN, the largest
4705 | positive integer is returned. Otherwise, if the conversion overflows, the
4706 | largest integer with the same sign as `a' is returned.
4707 *----------------------------------------------------------------------------*/
4709 int32 float128_to_int32( float128 a STATUS_PARAM )
4711 flag aSign;
4712 int32 aExp, shiftCount;
4713 bits64 aSig0, aSig1;
4715 aSig1 = extractFloat128Frac1( a );
4716 aSig0 = extractFloat128Frac0( a );
4717 aExp = extractFloat128Exp( a );
4718 aSign = extractFloat128Sign( a );
4719 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4720 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4721 aSig0 |= ( aSig1 != 0 );
4722 shiftCount = 0x4028 - aExp;
4723 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4724 return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
4728 /*----------------------------------------------------------------------------
4729 | Returns the result of converting the quadruple-precision floating-point
4730 | value `a' to the 32-bit two's complement integer format. The conversion
4731 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4732 | Arithmetic, except that the conversion is always rounded toward zero. If
4733 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
4734 | conversion overflows, the largest integer with the same sign as `a' is
4735 | returned.
4736 *----------------------------------------------------------------------------*/
4738 int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
4740 flag aSign;
4741 int32 aExp, shiftCount;
4742 bits64 aSig0, aSig1, savedASig;
4743 int32 z;
4745 aSig1 = extractFloat128Frac1( a );
4746 aSig0 = extractFloat128Frac0( a );
4747 aExp = extractFloat128Exp( a );
4748 aSign = extractFloat128Sign( a );
4749 aSig0 |= ( aSig1 != 0 );
4750 if ( 0x401E < aExp ) {
4751 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4752 goto invalid;
4754 else if ( aExp < 0x3FFF ) {
4755 if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
4756 return 0;
4758 aSig0 |= LIT64( 0x0001000000000000 );
4759 shiftCount = 0x402F - aExp;
4760 savedASig = aSig0;
4761 aSig0 >>= shiftCount;
4762 z = aSig0;
4763 if ( aSign ) z = - z;
4764 if ( ( z < 0 ) ^ aSign ) {
4765 invalid:
4766 float_raise( float_flag_invalid STATUS_VAR);
4767 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4769 if ( ( aSig0<<shiftCount ) != savedASig ) {
4770 STATUS(float_exception_flags) |= float_flag_inexact;
4772 return z;
4776 /*----------------------------------------------------------------------------
4777 | Returns the result of converting the quadruple-precision floating-point
4778 | value `a' to the 64-bit two's complement integer format. The conversion
4779 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4780 | Arithmetic---which means in particular that the conversion is rounded
4781 | according to the current rounding mode. If `a' is a NaN, the largest
4782 | positive integer is returned. Otherwise, if the conversion overflows, the
4783 | largest integer with the same sign as `a' is returned.
4784 *----------------------------------------------------------------------------*/
4786 int64 float128_to_int64( float128 a STATUS_PARAM )
4788 flag aSign;
4789 int32 aExp, shiftCount;
4790 bits64 aSig0, aSig1;
4792 aSig1 = extractFloat128Frac1( a );
4793 aSig0 = extractFloat128Frac0( a );
4794 aExp = extractFloat128Exp( a );
4795 aSign = extractFloat128Sign( a );
4796 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4797 shiftCount = 0x402F - aExp;
4798 if ( shiftCount <= 0 ) {
4799 if ( 0x403E < aExp ) {
4800 float_raise( float_flag_invalid STATUS_VAR);
4801 if ( ! aSign
4802 || ( ( aExp == 0x7FFF )
4803 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4806 return LIT64( 0x7FFFFFFFFFFFFFFF );
4808 return (sbits64) LIT64( 0x8000000000000000 );
4810 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4812 else {
4813 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4815 return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
4819 /*----------------------------------------------------------------------------
4820 | Returns the result of converting the quadruple-precision floating-point
4821 | value `a' to the 64-bit two's complement integer format. The conversion
4822 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4823 | Arithmetic, except that the conversion is always rounded toward zero.
4824 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
4825 | the conversion overflows, the largest integer with the same sign as `a' is
4826 | returned.
4827 *----------------------------------------------------------------------------*/
4829 int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
4831 flag aSign;
4832 int32 aExp, shiftCount;
4833 bits64 aSig0, aSig1;
4834 int64 z;
4836 aSig1 = extractFloat128Frac1( a );
4837 aSig0 = extractFloat128Frac0( a );
4838 aExp = extractFloat128Exp( a );
4839 aSign = extractFloat128Sign( a );
4840 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4841 shiftCount = aExp - 0x402F;
4842 if ( 0 < shiftCount ) {
4843 if ( 0x403E <= aExp ) {
4844 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4845 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
4846 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4847 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
4849 else {
4850 float_raise( float_flag_invalid STATUS_VAR);
4851 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4852 return LIT64( 0x7FFFFFFFFFFFFFFF );
4855 return (sbits64) LIT64( 0x8000000000000000 );
4857 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4858 if ( (bits64) ( aSig1<<shiftCount ) ) {
4859 STATUS(float_exception_flags) |= float_flag_inexact;
4862 else {
4863 if ( aExp < 0x3FFF ) {
4864 if ( aExp | aSig0 | aSig1 ) {
4865 STATUS(float_exception_flags) |= float_flag_inexact;
4867 return 0;
4869 z = aSig0>>( - shiftCount );
4870 if ( aSig1
4871 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4872 STATUS(float_exception_flags) |= float_flag_inexact;
4875 if ( aSign ) z = - z;
4876 return z;
4880 /*----------------------------------------------------------------------------
4881 | Returns the result of converting the quadruple-precision floating-point
4882 | value `a' to the single-precision floating-point format. The conversion
4883 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4884 | Arithmetic.
4885 *----------------------------------------------------------------------------*/
4887 float32 float128_to_float32( float128 a STATUS_PARAM )
4889 flag aSign;
4890 int32 aExp;
4891 bits64 aSig0, aSig1;
4892 bits32 zSig;
4894 aSig1 = extractFloat128Frac1( a );
4895 aSig0 = extractFloat128Frac0( a );
4896 aExp = extractFloat128Exp( a );
4897 aSign = extractFloat128Sign( a );
4898 if ( aExp == 0x7FFF ) {
4899 if ( aSig0 | aSig1 ) {
4900 return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4902 return packFloat32( aSign, 0xFF, 0 );
4904 aSig0 |= ( aSig1 != 0 );
4905 shift64RightJamming( aSig0, 18, &aSig0 );
4906 zSig = aSig0;
4907 if ( aExp || zSig ) {
4908 zSig |= 0x40000000;
4909 aExp -= 0x3F81;
4911 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
4915 /*----------------------------------------------------------------------------
4916 | Returns the result of converting the quadruple-precision floating-point
4917 | value `a' to the double-precision floating-point format. The conversion
4918 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4919 | Arithmetic.
4920 *----------------------------------------------------------------------------*/
4922 float64 float128_to_float64( float128 a STATUS_PARAM )
4924 flag aSign;
4925 int32 aExp;
4926 bits64 aSig0, aSig1;
4928 aSig1 = extractFloat128Frac1( a );
4929 aSig0 = extractFloat128Frac0( a );
4930 aExp = extractFloat128Exp( a );
4931 aSign = extractFloat128Sign( a );
4932 if ( aExp == 0x7FFF ) {
4933 if ( aSig0 | aSig1 ) {
4934 return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4936 return packFloat64( aSign, 0x7FF, 0 );
4938 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4939 aSig0 |= ( aSig1 != 0 );
4940 if ( aExp || aSig0 ) {
4941 aSig0 |= LIT64( 0x4000000000000000 );
4942 aExp -= 0x3C01;
4944 return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
4948 #ifdef FLOATX80
4950 /*----------------------------------------------------------------------------
4951 | Returns the result of converting the quadruple-precision floating-point
4952 | value `a' to the extended double-precision floating-point format. The
4953 | conversion is performed according to the IEC/IEEE Standard for Binary
4954 | Floating-Point Arithmetic.
4955 *----------------------------------------------------------------------------*/
4957 floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
4959 flag aSign;
4960 int32 aExp;
4961 bits64 aSig0, aSig1;
4963 aSig1 = extractFloat128Frac1( a );
4964 aSig0 = extractFloat128Frac0( a );
4965 aExp = extractFloat128Exp( a );
4966 aSign = extractFloat128Sign( a );
4967 if ( aExp == 0x7FFF ) {
4968 if ( aSig0 | aSig1 ) {
4969 return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4971 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4973 if ( aExp == 0 ) {
4974 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4975 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4977 else {
4978 aSig0 |= LIT64( 0x0001000000000000 );
4980 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4981 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
4985 #endif
4987 /*----------------------------------------------------------------------------
4988 | Rounds the quadruple-precision floating-point value `a' to an integer, and
4989 | returns the result as a quadruple-precision floating-point value. The
4990 | operation is performed according to the IEC/IEEE Standard for Binary
4991 | Floating-Point Arithmetic.
4992 *----------------------------------------------------------------------------*/
4994 float128 float128_round_to_int( float128 a STATUS_PARAM )
4996 flag aSign;
4997 int32 aExp;
4998 bits64 lastBitMask, roundBitsMask;
4999 int8 roundingMode;
5000 float128 z;
5002 aExp = extractFloat128Exp( a );
5003 if ( 0x402F <= aExp ) {
5004 if ( 0x406F <= aExp ) {
5005 if ( ( aExp == 0x7FFF )
5006 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
5008 return propagateFloat128NaN( a, a STATUS_VAR );
5010 return a;
5012 lastBitMask = 1;
5013 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
5014 roundBitsMask = lastBitMask - 1;
5015 z = a;
5016 roundingMode = STATUS(float_rounding_mode);
5017 if ( roundingMode == float_round_nearest_even ) {
5018 if ( lastBitMask ) {
5019 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
5020 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
5022 else {
5023 if ( (sbits64) z.low < 0 ) {
5024 ++z.high;
5025 if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
5029 else if ( roundingMode != float_round_to_zero ) {
5030 if ( extractFloat128Sign( z )
5031 ^ ( roundingMode == float_round_up ) ) {
5032 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
5035 z.low &= ~ roundBitsMask;
5037 else {
5038 if ( aExp < 0x3FFF ) {
5039 if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
5040 STATUS(float_exception_flags) |= float_flag_inexact;
5041 aSign = extractFloat128Sign( a );
5042 switch ( STATUS(float_rounding_mode) ) {
5043 case float_round_nearest_even:
5044 if ( ( aExp == 0x3FFE )
5045 && ( extractFloat128Frac0( a )
5046 | extractFloat128Frac1( a ) )
5048 return packFloat128( aSign, 0x3FFF, 0, 0 );
5050 break;
5051 case float_round_down:
5052 return
5053 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
5054 : packFloat128( 0, 0, 0, 0 );
5055 case float_round_up:
5056 return
5057 aSign ? packFloat128( 1, 0, 0, 0 )
5058 : packFloat128( 0, 0x3FFF, 0, 0 );
5060 return packFloat128( aSign, 0, 0, 0 );
5062 lastBitMask = 1;
5063 lastBitMask <<= 0x402F - aExp;
5064 roundBitsMask = lastBitMask - 1;
5065 z.low = 0;
5066 z.high = a.high;
5067 roundingMode = STATUS(float_rounding_mode);
5068 if ( roundingMode == float_round_nearest_even ) {
5069 z.high += lastBitMask>>1;
5070 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
5071 z.high &= ~ lastBitMask;
5074 else if ( roundingMode != float_round_to_zero ) {
5075 if ( extractFloat128Sign( z )
5076 ^ ( roundingMode == float_round_up ) ) {
5077 z.high |= ( a.low != 0 );
5078 z.high += roundBitsMask;
5081 z.high &= ~ roundBitsMask;
5083 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
5084 STATUS(float_exception_flags) |= float_flag_inexact;
5086 return z;
5090 /*----------------------------------------------------------------------------
5091 | Returns the result of adding the absolute values of the quadruple-precision
5092 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
5093 | before being returned. `zSign' is ignored if the result is a NaN.
5094 | The addition is performed according to the IEC/IEEE Standard for Binary
5095 | Floating-Point Arithmetic.
5096 *----------------------------------------------------------------------------*/
5098 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5100 int32 aExp, bExp, zExp;
5101 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5102 int32 expDiff;
5104 aSig1 = extractFloat128Frac1( a );
5105 aSig0 = extractFloat128Frac0( a );
5106 aExp = extractFloat128Exp( a );
5107 bSig1 = extractFloat128Frac1( b );
5108 bSig0 = extractFloat128Frac0( b );
5109 bExp = extractFloat128Exp( b );
5110 expDiff = aExp - bExp;
5111 if ( 0 < expDiff ) {
5112 if ( aExp == 0x7FFF ) {
5113 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5114 return a;
5116 if ( bExp == 0 ) {
5117 --expDiff;
5119 else {
5120 bSig0 |= LIT64( 0x0001000000000000 );
5122 shift128ExtraRightJamming(
5123 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
5124 zExp = aExp;
5126 else if ( expDiff < 0 ) {
5127 if ( bExp == 0x7FFF ) {
5128 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5129 return packFloat128( zSign, 0x7FFF, 0, 0 );
5131 if ( aExp == 0 ) {
5132 ++expDiff;
5134 else {
5135 aSig0 |= LIT64( 0x0001000000000000 );
5137 shift128ExtraRightJamming(
5138 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
5139 zExp = bExp;
5141 else {
5142 if ( aExp == 0x7FFF ) {
5143 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5144 return propagateFloat128NaN( a, b STATUS_VAR );
5146 return a;
5148 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5149 if ( aExp == 0 ) {
5150 if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
5151 return packFloat128( zSign, 0, zSig0, zSig1 );
5153 zSig2 = 0;
5154 zSig0 |= LIT64( 0x0002000000000000 );
5155 zExp = aExp;
5156 goto shiftRight1;
5158 aSig0 |= LIT64( 0x0001000000000000 );
5159 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5160 --zExp;
5161 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
5162 ++zExp;
5163 shiftRight1:
5164 shift128ExtraRightJamming(
5165 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5166 roundAndPack:
5167 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5171 /*----------------------------------------------------------------------------
5172 | Returns the result of subtracting the absolute values of the quadruple-
5173 | precision floating-point values `a' and `b'. If `zSign' is 1, the
5174 | difference is negated before being returned. `zSign' is ignored if the
5175 | result is a NaN. The subtraction is performed according to the IEC/IEEE
5176 | Standard for Binary Floating-Point Arithmetic.
5177 *----------------------------------------------------------------------------*/
5179 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5181 int32 aExp, bExp, zExp;
5182 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
5183 int32 expDiff;
5184 float128 z;
5186 aSig1 = extractFloat128Frac1( a );
5187 aSig0 = extractFloat128Frac0( a );
5188 aExp = extractFloat128Exp( a );
5189 bSig1 = extractFloat128Frac1( b );
5190 bSig0 = extractFloat128Frac0( b );
5191 bExp = extractFloat128Exp( b );
5192 expDiff = aExp - bExp;
5193 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5194 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
5195 if ( 0 < expDiff ) goto aExpBigger;
5196 if ( expDiff < 0 ) goto bExpBigger;
5197 if ( aExp == 0x7FFF ) {
5198 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5199 return propagateFloat128NaN( a, b STATUS_VAR );
5201 float_raise( float_flag_invalid STATUS_VAR);
5202 z.low = float128_default_nan_low;
5203 z.high = float128_default_nan_high;
5204 return z;
5206 if ( aExp == 0 ) {
5207 aExp = 1;
5208 bExp = 1;
5210 if ( bSig0 < aSig0 ) goto aBigger;
5211 if ( aSig0 < bSig0 ) goto bBigger;
5212 if ( bSig1 < aSig1 ) goto aBigger;
5213 if ( aSig1 < bSig1 ) goto bBigger;
5214 return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
5215 bExpBigger:
5216 if ( bExp == 0x7FFF ) {
5217 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5218 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
5220 if ( aExp == 0 ) {
5221 ++expDiff;
5223 else {
5224 aSig0 |= LIT64( 0x4000000000000000 );
5226 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5227 bSig0 |= LIT64( 0x4000000000000000 );
5228 bBigger:
5229 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
5230 zExp = bExp;
5231 zSign ^= 1;
5232 goto normalizeRoundAndPack;
5233 aExpBigger:
5234 if ( aExp == 0x7FFF ) {
5235 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5236 return a;
5238 if ( bExp == 0 ) {
5239 --expDiff;
5241 else {
5242 bSig0 |= LIT64( 0x4000000000000000 );
5244 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
5245 aSig0 |= LIT64( 0x4000000000000000 );
5246 aBigger:
5247 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5248 zExp = aExp;
5249 normalizeRoundAndPack:
5250 --zExp;
5251 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
5255 /*----------------------------------------------------------------------------
5256 | Returns the result of adding the quadruple-precision floating-point values
5257 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
5258 | for Binary Floating-Point Arithmetic.
5259 *----------------------------------------------------------------------------*/
5261 float128 float128_add( float128 a, float128 b STATUS_PARAM )
5263 flag aSign, bSign;
5265 aSign = extractFloat128Sign( a );
5266 bSign = extractFloat128Sign( b );
5267 if ( aSign == bSign ) {
5268 return addFloat128Sigs( a, b, aSign STATUS_VAR );
5270 else {
5271 return subFloat128Sigs( a, b, aSign STATUS_VAR );
5276 /*----------------------------------------------------------------------------
5277 | Returns the result of subtracting the quadruple-precision floating-point
5278 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5279 | Standard for Binary Floating-Point Arithmetic.
5280 *----------------------------------------------------------------------------*/
5282 float128 float128_sub( float128 a, float128 b STATUS_PARAM )
5284 flag aSign, bSign;
5286 aSign = extractFloat128Sign( a );
5287 bSign = extractFloat128Sign( b );
5288 if ( aSign == bSign ) {
5289 return subFloat128Sigs( a, b, aSign STATUS_VAR );
5291 else {
5292 return addFloat128Sigs( a, b, aSign STATUS_VAR );
5297 /*----------------------------------------------------------------------------
5298 | Returns the result of multiplying the quadruple-precision floating-point
5299 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5300 | Standard for Binary Floating-Point Arithmetic.
5301 *----------------------------------------------------------------------------*/
5303 float128 float128_mul( float128 a, float128 b STATUS_PARAM )
5305 flag aSign, bSign, zSign;
5306 int32 aExp, bExp, zExp;
5307 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
5308 float128 z;
5310 aSig1 = extractFloat128Frac1( a );
5311 aSig0 = extractFloat128Frac0( a );
5312 aExp = extractFloat128Exp( a );
5313 aSign = extractFloat128Sign( a );
5314 bSig1 = extractFloat128Frac1( b );
5315 bSig0 = extractFloat128Frac0( b );
5316 bExp = extractFloat128Exp( b );
5317 bSign = extractFloat128Sign( b );
5318 zSign = aSign ^ bSign;
5319 if ( aExp == 0x7FFF ) {
5320 if ( ( aSig0 | aSig1 )
5321 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5322 return propagateFloat128NaN( a, b STATUS_VAR );
5324 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
5325 return packFloat128( zSign, 0x7FFF, 0, 0 );
5327 if ( bExp == 0x7FFF ) {
5328 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5329 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5330 invalid:
5331 float_raise( float_flag_invalid STATUS_VAR);
5332 z.low = float128_default_nan_low;
5333 z.high = float128_default_nan_high;
5334 return z;
5336 return packFloat128( zSign, 0x7FFF, 0, 0 );
5338 if ( aExp == 0 ) {
5339 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5340 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5342 if ( bExp == 0 ) {
5343 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5344 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5346 zExp = aExp + bExp - 0x4000;
5347 aSig0 |= LIT64( 0x0001000000000000 );
5348 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5349 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5350 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5351 zSig2 |= ( zSig3 != 0 );
5352 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5353 shift128ExtraRightJamming(
5354 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5355 ++zExp;
5357 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5361 /*----------------------------------------------------------------------------
5362 | Returns the result of dividing the quadruple-precision floating-point value
5363 | `a' by the corresponding value `b'. The operation is performed according to
5364 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5365 *----------------------------------------------------------------------------*/
5367 float128 float128_div( float128 a, float128 b STATUS_PARAM )
5369 flag aSign, bSign, zSign;
5370 int32 aExp, bExp, zExp;
5371 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5372 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5373 float128 z;
5375 aSig1 = extractFloat128Frac1( a );
5376 aSig0 = extractFloat128Frac0( a );
5377 aExp = extractFloat128Exp( a );
5378 aSign = extractFloat128Sign( a );
5379 bSig1 = extractFloat128Frac1( b );
5380 bSig0 = extractFloat128Frac0( b );
5381 bExp = extractFloat128Exp( b );
5382 bSign = extractFloat128Sign( b );
5383 zSign = aSign ^ bSign;
5384 if ( aExp == 0x7FFF ) {
5385 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5386 if ( bExp == 0x7FFF ) {
5387 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5388 goto invalid;
5390 return packFloat128( zSign, 0x7FFF, 0, 0 );
5392 if ( bExp == 0x7FFF ) {
5393 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5394 return packFloat128( zSign, 0, 0, 0 );
5396 if ( bExp == 0 ) {
5397 if ( ( bSig0 | bSig1 ) == 0 ) {
5398 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5399 invalid:
5400 float_raise( float_flag_invalid STATUS_VAR);
5401 z.low = float128_default_nan_low;
5402 z.high = float128_default_nan_high;
5403 return z;
5405 float_raise( float_flag_divbyzero STATUS_VAR);
5406 return packFloat128( zSign, 0x7FFF, 0, 0 );
5408 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5410 if ( aExp == 0 ) {
5411 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5412 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5414 zExp = aExp - bExp + 0x3FFD;
5415 shortShift128Left(
5416 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5417 shortShift128Left(
5418 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5419 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5420 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5421 ++zExp;
5423 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5424 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5425 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5426 while ( (sbits64) rem0 < 0 ) {
5427 --zSig0;
5428 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5430 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5431 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5432 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5433 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5434 while ( (sbits64) rem1 < 0 ) {
5435 --zSig1;
5436 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5438 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5440 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5441 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5445 /*----------------------------------------------------------------------------
5446 | Returns the remainder of the quadruple-precision floating-point value `a'
5447 | with respect to the corresponding value `b'. The operation is performed
5448 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5449 *----------------------------------------------------------------------------*/
5451 float128 float128_rem( float128 a, float128 b STATUS_PARAM )
5453 flag aSign, zSign;
5454 int32 aExp, bExp, expDiff;
5455 bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5456 bits64 allZero, alternateASig0, alternateASig1, sigMean1;
5457 sbits64 sigMean0;
5458 float128 z;
5460 aSig1 = extractFloat128Frac1( a );
5461 aSig0 = extractFloat128Frac0( a );
5462 aExp = extractFloat128Exp( a );
5463 aSign = extractFloat128Sign( a );
5464 bSig1 = extractFloat128Frac1( b );
5465 bSig0 = extractFloat128Frac0( b );
5466 bExp = extractFloat128Exp( b );
5467 if ( aExp == 0x7FFF ) {
5468 if ( ( aSig0 | aSig1 )
5469 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5470 return propagateFloat128NaN( a, b STATUS_VAR );
5472 goto invalid;
5474 if ( bExp == 0x7FFF ) {
5475 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5476 return a;
5478 if ( bExp == 0 ) {
5479 if ( ( bSig0 | bSig1 ) == 0 ) {
5480 invalid:
5481 float_raise( float_flag_invalid STATUS_VAR);
5482 z.low = float128_default_nan_low;
5483 z.high = float128_default_nan_high;
5484 return z;
5486 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5488 if ( aExp == 0 ) {
5489 if ( ( aSig0 | aSig1 ) == 0 ) return a;
5490 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5492 expDiff = aExp - bExp;
5493 if ( expDiff < -1 ) return a;
5494 shortShift128Left(
5495 aSig0 | LIT64( 0x0001000000000000 ),
5496 aSig1,
5497 15 - ( expDiff < 0 ),
5498 &aSig0,
5499 &aSig1
5501 shortShift128Left(
5502 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5503 q = le128( bSig0, bSig1, aSig0, aSig1 );
5504 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5505 expDiff -= 64;
5506 while ( 0 < expDiff ) {
5507 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5508 q = ( 4 < q ) ? q - 4 : 0;
5509 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5510 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5511 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5512 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5513 expDiff -= 61;
5515 if ( -64 < expDiff ) {
5516 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5517 q = ( 4 < q ) ? q - 4 : 0;
5518 q >>= - expDiff;
5519 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5520 expDiff += 52;
5521 if ( expDiff < 0 ) {
5522 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5524 else {
5525 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5527 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5528 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5530 else {
5531 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5532 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5534 do {
5535 alternateASig0 = aSig0;
5536 alternateASig1 = aSig1;
5537 ++q;
5538 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5539 } while ( 0 <= (sbits64) aSig0 );
5540 add128(
5541 aSig0, aSig1, alternateASig0, alternateASig1, (bits64 *)&sigMean0, &sigMean1 );
5542 if ( ( sigMean0 < 0 )
5543 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5544 aSig0 = alternateASig0;
5545 aSig1 = alternateASig1;
5547 zSign = ( (sbits64) aSig0 < 0 );
5548 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5549 return
5550 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
5554 /*----------------------------------------------------------------------------
5555 | Returns the square root of the quadruple-precision floating-point value `a'.
5556 | The operation is performed according to the IEC/IEEE Standard for Binary
5557 | Floating-Point Arithmetic.
5558 *----------------------------------------------------------------------------*/
5560 float128 float128_sqrt( float128 a STATUS_PARAM )
5562 flag aSign;
5563 int32 aExp, zExp;
5564 bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5565 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5566 float128 z;
5568 aSig1 = extractFloat128Frac1( a );
5569 aSig0 = extractFloat128Frac0( a );
5570 aExp = extractFloat128Exp( a );
5571 aSign = extractFloat128Sign( a );
5572 if ( aExp == 0x7FFF ) {
5573 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
5574 if ( ! aSign ) return a;
5575 goto invalid;
5577 if ( aSign ) {
5578 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5579 invalid:
5580 float_raise( float_flag_invalid STATUS_VAR);
5581 z.low = float128_default_nan_low;
5582 z.high = float128_default_nan_high;
5583 return z;
5585 if ( aExp == 0 ) {
5586 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5587 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5589 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5590 aSig0 |= LIT64( 0x0001000000000000 );
5591 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5592 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5593 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5594 doubleZSig0 = zSig0<<1;
5595 mul64To128( zSig0, zSig0, &term0, &term1 );
5596 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5597 while ( (sbits64) rem0 < 0 ) {
5598 --zSig0;
5599 doubleZSig0 -= 2;
5600 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5602 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5603 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5604 if ( zSig1 == 0 ) zSig1 = 1;
5605 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5606 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5607 mul64To128( zSig1, zSig1, &term2, &term3 );
5608 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5609 while ( (sbits64) rem1 < 0 ) {
5610 --zSig1;
5611 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5612 term3 |= 1;
5613 term2 |= doubleZSig0;
5614 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5616 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5618 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5619 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5623 /*----------------------------------------------------------------------------
5624 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5625 | the corresponding value `b', and 0 otherwise. The comparison is performed
5626 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5627 *----------------------------------------------------------------------------*/
5629 int float128_eq( float128 a, float128 b STATUS_PARAM )
5632 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5633 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5634 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5635 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5637 if ( float128_is_signaling_nan( a )
5638 || float128_is_signaling_nan( b ) ) {
5639 float_raise( float_flag_invalid STATUS_VAR);
5641 return 0;
5643 return
5644 ( a.low == b.low )
5645 && ( ( a.high == b.high )
5646 || ( ( a.low == 0 )
5647 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5652 /*----------------------------------------------------------------------------
5653 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5654 | or equal to the corresponding value `b', and 0 otherwise. The comparison
5655 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5656 | Arithmetic.
5657 *----------------------------------------------------------------------------*/
5659 int float128_le( float128 a, float128 b STATUS_PARAM )
5661 flag aSign, bSign;
5663 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5664 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5665 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5666 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5668 float_raise( float_flag_invalid STATUS_VAR);
5669 return 0;
5671 aSign = extractFloat128Sign( a );
5672 bSign = extractFloat128Sign( b );
5673 if ( aSign != bSign ) {
5674 return
5675 aSign
5676 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5677 == 0 );
5679 return
5680 aSign ? le128( b.high, b.low, a.high, a.low )
5681 : le128( a.high, a.low, b.high, b.low );
5685 /*----------------------------------------------------------------------------
5686 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5687 | the corresponding value `b', and 0 otherwise. The comparison is performed
5688 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5689 *----------------------------------------------------------------------------*/
5691 int float128_lt( float128 a, float128 b STATUS_PARAM )
5693 flag aSign, bSign;
5695 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5696 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5697 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5698 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5700 float_raise( float_flag_invalid STATUS_VAR);
5701 return 0;
5703 aSign = extractFloat128Sign( a );
5704 bSign = extractFloat128Sign( b );
5705 if ( aSign != bSign ) {
5706 return
5707 aSign
5708 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5709 != 0 );
5711 return
5712 aSign ? lt128( b.high, b.low, a.high, a.low )
5713 : lt128( a.high, a.low, b.high, b.low );
5717 /*----------------------------------------------------------------------------
5718 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5719 | the corresponding value `b', and 0 otherwise. The invalid exception is
5720 | raised if either operand is a NaN. Otherwise, the comparison is performed
5721 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5722 *----------------------------------------------------------------------------*/
5724 int float128_eq_signaling( float128 a, float128 b STATUS_PARAM )
5727 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5728 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5729 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5730 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5732 float_raise( float_flag_invalid STATUS_VAR);
5733 return 0;
5735 return
5736 ( a.low == b.low )
5737 && ( ( a.high == b.high )
5738 || ( ( a.low == 0 )
5739 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5744 /*----------------------------------------------------------------------------
5745 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5746 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5747 | cause an exception. Otherwise, the comparison is performed according to the
5748 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5749 *----------------------------------------------------------------------------*/
5751 int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
5753 flag aSign, bSign;
5755 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5756 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5757 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5758 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5760 if ( float128_is_signaling_nan( a )
5761 || float128_is_signaling_nan( b ) ) {
5762 float_raise( float_flag_invalid STATUS_VAR);
5764 return 0;
5766 aSign = extractFloat128Sign( a );
5767 bSign = extractFloat128Sign( b );
5768 if ( aSign != bSign ) {
5769 return
5770 aSign
5771 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5772 == 0 );
5774 return
5775 aSign ? le128( b.high, b.low, a.high, a.low )
5776 : le128( a.high, a.low, b.high, b.low );
5780 /*----------------------------------------------------------------------------
5781 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5782 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5783 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
5784 | Standard for Binary Floating-Point Arithmetic.
5785 *----------------------------------------------------------------------------*/
5787 int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
5789 flag aSign, bSign;
5791 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5792 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5793 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5794 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5796 if ( float128_is_signaling_nan( a )
5797 || float128_is_signaling_nan( b ) ) {
5798 float_raise( float_flag_invalid STATUS_VAR);
5800 return 0;
5802 aSign = extractFloat128Sign( a );
5803 bSign = extractFloat128Sign( b );
5804 if ( aSign != bSign ) {
5805 return
5806 aSign
5807 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5808 != 0 );
5810 return
5811 aSign ? lt128( b.high, b.low, a.high, a.low )
5812 : lt128( a.high, a.low, b.high, b.low );
5816 #endif
5818 /* misc functions */
5819 float32 uint32_to_float32( unsigned int a STATUS_PARAM )
5821 return int64_to_float32(a STATUS_VAR);
5824 float64 uint32_to_float64( unsigned int a STATUS_PARAM )
5826 return int64_to_float64(a STATUS_VAR);
5829 unsigned int float32_to_uint32( float32 a STATUS_PARAM )
5831 int64_t v;
5832 unsigned int res;
5834 v = float32_to_int64(a STATUS_VAR);
5835 if (v < 0) {
5836 res = 0;
5837 float_raise( float_flag_invalid STATUS_VAR);
5838 } else if (v > 0xffffffff) {
5839 res = 0xffffffff;
5840 float_raise( float_flag_invalid STATUS_VAR);
5841 } else {
5842 res = v;
5844 return res;
5847 unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
5849 int64_t v;
5850 unsigned int res;
5852 v = float32_to_int64_round_to_zero(a STATUS_VAR);
5853 if (v < 0) {
5854 res = 0;
5855 float_raise( float_flag_invalid STATUS_VAR);
5856 } else if (v > 0xffffffff) {
5857 res = 0xffffffff;
5858 float_raise( float_flag_invalid STATUS_VAR);
5859 } else {
5860 res = v;
5862 return res;
5865 unsigned int float32_to_uint16_round_to_zero( float32 a STATUS_PARAM )
5867 int64_t v;
5868 unsigned int res;
5870 v = float32_to_int64_round_to_zero(a STATUS_VAR);
5871 if (v < 0) {
5872 res = 0;
5873 float_raise( float_flag_invalid STATUS_VAR);
5874 } else if (v > 0xffff) {
5875 res = 0xffff;
5876 float_raise( float_flag_invalid STATUS_VAR);
5877 } else {
5878 res = v;
5880 return res;
5883 unsigned int float64_to_uint32( float64 a STATUS_PARAM )
5885 int64_t v;
5886 unsigned int res;
5888 v = float64_to_int64(a STATUS_VAR);
5889 if (v < 0) {
5890 res = 0;
5891 float_raise( float_flag_invalid STATUS_VAR);
5892 } else if (v > 0xffffffff) {
5893 res = 0xffffffff;
5894 float_raise( float_flag_invalid STATUS_VAR);
5895 } else {
5896 res = v;
5898 return res;
5901 unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
5903 int64_t v;
5904 unsigned int res;
5906 v = float64_to_int64_round_to_zero(a STATUS_VAR);
5907 if (v < 0) {
5908 res = 0;
5909 float_raise( float_flag_invalid STATUS_VAR);
5910 } else if (v > 0xffffffff) {
5911 res = 0xffffffff;
5912 float_raise( float_flag_invalid STATUS_VAR);
5913 } else {
5914 res = v;
5916 return res;
5919 unsigned int float64_to_uint16_round_to_zero( float64 a STATUS_PARAM )
5921 int64_t v;
5922 unsigned int res;
5924 v = float64_to_int64_round_to_zero(a STATUS_VAR);
5925 if (v < 0) {
5926 res = 0;
5927 float_raise( float_flag_invalid STATUS_VAR);
5928 } else if (v > 0xffff) {
5929 res = 0xffff;
5930 float_raise( float_flag_invalid STATUS_VAR);
5931 } else {
5932 res = v;
5934 return res;
5937 /* FIXME: This looks broken. */
5938 uint64_t float64_to_uint64 (float64 a STATUS_PARAM)
5940 int64_t v;
5942 v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
5943 v += float64_val(a);
5944 v = float64_to_int64(make_float64(v) STATUS_VAR);
5946 return v - INT64_MIN;
5949 uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
5951 int64_t v;
5953 v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
5954 v += float64_val(a);
5955 v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR);
5957 return v - INT64_MIN;
5960 #define COMPARE(s, nan_exp) \
5961 INLINE int float ## s ## _compare_internal( float ## s a, float ## s b, \
5962 int is_quiet STATUS_PARAM ) \
5964 flag aSign, bSign; \
5965 bits ## s av, bv; \
5966 a = float ## s ## _squash_input_denormal(a STATUS_VAR); \
5967 b = float ## s ## _squash_input_denormal(b STATUS_VAR); \
5969 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \
5970 extractFloat ## s ## Frac( a ) ) || \
5971 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \
5972 extractFloat ## s ## Frac( b ) )) { \
5973 if (!is_quiet || \
5974 float ## s ## _is_signaling_nan( a ) || \
5975 float ## s ## _is_signaling_nan( b ) ) { \
5976 float_raise( float_flag_invalid STATUS_VAR); \
5978 return float_relation_unordered; \
5980 aSign = extractFloat ## s ## Sign( a ); \
5981 bSign = extractFloat ## s ## Sign( b ); \
5982 av = float ## s ## _val(a); \
5983 bv = float ## s ## _val(b); \
5984 if ( aSign != bSign ) { \
5985 if ( (bits ## s) ( ( av | bv )<<1 ) == 0 ) { \
5986 /* zero case */ \
5987 return float_relation_equal; \
5988 } else { \
5989 return 1 - (2 * aSign); \
5991 } else { \
5992 if (av == bv) { \
5993 return float_relation_equal; \
5994 } else { \
5995 return 1 - 2 * (aSign ^ ( av < bv )); \
6000 int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM ) \
6002 return float ## s ## _compare_internal(a, b, 0 STATUS_VAR); \
6005 int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM ) \
6007 return float ## s ## _compare_internal(a, b, 1 STATUS_VAR); \
6010 COMPARE(32, 0xff)
6011 COMPARE(64, 0x7ff)
6013 INLINE int float128_compare_internal( float128 a, float128 b,
6014 int is_quiet STATUS_PARAM )
6016 flag aSign, bSign;
6018 if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
6019 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
6020 ( ( extractFloat128Exp( b ) == 0x7fff ) &&
6021 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
6022 if (!is_quiet ||
6023 float128_is_signaling_nan( a ) ||
6024 float128_is_signaling_nan( b ) ) {
6025 float_raise( float_flag_invalid STATUS_VAR);
6027 return float_relation_unordered;
6029 aSign = extractFloat128Sign( a );
6030 bSign = extractFloat128Sign( b );
6031 if ( aSign != bSign ) {
6032 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
6033 /* zero case */
6034 return float_relation_equal;
6035 } else {
6036 return 1 - (2 * aSign);
6038 } else {
6039 if (a.low == b.low && a.high == b.high) {
6040 return float_relation_equal;
6041 } else {
6042 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6047 int float128_compare( float128 a, float128 b STATUS_PARAM )
6049 return float128_compare_internal(a, b, 0 STATUS_VAR);
6052 int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
6054 return float128_compare_internal(a, b, 1 STATUS_VAR);
6057 /* Multiply A by 2 raised to the power N. */
6058 float32 float32_scalbn( float32 a, int n STATUS_PARAM )
6060 flag aSign;
6061 int16 aExp;
6062 bits32 aSig;
6064 a = float32_squash_input_denormal(a STATUS_VAR);
6065 aSig = extractFloat32Frac( a );
6066 aExp = extractFloat32Exp( a );
6067 aSign = extractFloat32Sign( a );
6069 if ( aExp == 0xFF ) {
6070 return a;
6072 if ( aExp != 0 )
6073 aSig |= 0x00800000;
6074 else if ( aSig == 0 )
6075 return a;
6077 aExp += n - 1;
6078 aSig <<= 7;
6079 return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
6082 float64 float64_scalbn( float64 a, int n STATUS_PARAM )
6084 flag aSign;
6085 int16 aExp;
6086 bits64 aSig;
6088 a = float64_squash_input_denormal(a STATUS_VAR);
6089 aSig = extractFloat64Frac( a );
6090 aExp = extractFloat64Exp( a );
6091 aSign = extractFloat64Sign( a );
6093 if ( aExp == 0x7FF ) {
6094 return a;
6096 if ( aExp != 0 )
6097 aSig |= LIT64( 0x0010000000000000 );
6098 else if ( aSig == 0 )
6099 return a;
6101 aExp += n - 1;
6102 aSig <<= 10;
6103 return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
6106 #ifdef FLOATX80
6107 floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
6109 flag aSign;
6110 int16 aExp;
6111 bits64 aSig;
6113 aSig = extractFloatx80Frac( a );
6114 aExp = extractFloatx80Exp( a );
6115 aSign = extractFloatx80Sign( a );
6117 if ( aExp == 0x7FF ) {
6118 return a;
6120 if (aExp == 0 && aSig == 0)
6121 return a;
6123 aExp += n;
6124 return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
6125 aSign, aExp, aSig, 0 STATUS_VAR );
6127 #endif
6129 #ifdef FLOAT128
6130 float128 float128_scalbn( float128 a, int n STATUS_PARAM )
6132 flag aSign;
6133 int32 aExp;
6134 bits64 aSig0, aSig1;
6136 aSig1 = extractFloat128Frac1( a );
6137 aSig0 = extractFloat128Frac0( a );
6138 aExp = extractFloat128Exp( a );
6139 aSign = extractFloat128Sign( a );
6140 if ( aExp == 0x7FFF ) {
6141 return a;
6143 if ( aExp != 0 )
6144 aSig0 |= LIT64( 0x0001000000000000 );
6145 else if ( aSig0 == 0 && aSig1 == 0 )
6146 return a;
6148 aExp += n - 1;
6149 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
6150 STATUS_VAR );
6153 #endif