Fixing tap adapter for win32
[qemu/kevin.git] / fpu / softfloat.c
blob30b07e9b44b2d28910670729904aaabc37b1cf53
2 /*============================================================================
4 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
5 Package, Release 2b.
7 Written by John R. Hauser. This work was made possible in part by the
8 International Computer Science Institute, located at Suite 600, 1947 Center
9 Street, Berkeley, California 94704. Funding was partially provided by the
10 National Science Foundation under grant MIP-9311980. The original version
11 of this code was written as part of a project to build a fixed-point vector
12 processor in collaboration with the University of California at Berkeley,
13 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
14 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
15 arithmetic/SoftFloat.html'.
17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
18 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
19 RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
20 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
21 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
22 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
23 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
24 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
26 Derivative works are acceptable, even for commercial purposes, so long as
27 (1) the source code for the derivative work includes prominent notice that
28 the work is derivative, and (2) the source code includes prominent notice with
29 these four paragraphs for those parts of this code that are retained.
31 =============================================================================*/
33 #include "softfloat.h"
35 /*----------------------------------------------------------------------------
36 | Primitive arithmetic functions, including multi-word arithmetic, and
37 | division and square root approximations. (Can be specialized to target if
38 | desired.)
39 *----------------------------------------------------------------------------*/
40 #include "softfloat-macros.h"
42 /*----------------------------------------------------------------------------
43 | Functions and definitions to determine: (1) whether tininess for underflow
44 | is detected before or after rounding by default, (2) what (if anything)
45 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
46 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
47 | are propagated from function inputs to output. These details are target-
48 | specific.
49 *----------------------------------------------------------------------------*/
50 #include "softfloat-specialize.h"
52 void set_float_rounding_mode(int val STATUS_PARAM)
54 STATUS(float_rounding_mode) = val;
57 void set_float_exception_flags(int val STATUS_PARAM)
59 STATUS(float_exception_flags) = val;
62 #ifdef FLOATX80
63 void set_floatx80_rounding_precision(int val STATUS_PARAM)
65 STATUS(floatx80_rounding_precision) = val;
67 #endif
69 /*----------------------------------------------------------------------------
70 | Returns the fraction bits of the half-precision floating-point value `a'.
71 *----------------------------------------------------------------------------*/
73 INLINE uint32_t extractFloat16Frac(float16 a)
75 return float16_val(a) & 0x3ff;
78 /*----------------------------------------------------------------------------
79 | Returns the exponent bits of the half-precision floating-point value `a'.
80 *----------------------------------------------------------------------------*/
82 INLINE int16 extractFloat16Exp(float16 a)
84 return (float16_val(a) >> 10) & 0x1f;
87 /*----------------------------------------------------------------------------
88 | Returns the sign bit of the single-precision floating-point value `a'.
89 *----------------------------------------------------------------------------*/
91 INLINE flag extractFloat16Sign(float16 a)
93 return float16_val(a)>>15;
96 /*----------------------------------------------------------------------------
97 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
98 | and 7, and returns the properly rounded 32-bit integer corresponding to the
99 | input. If `zSign' is 1, the input is negated before being converted to an
100 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
101 | is simply rounded to an integer, with the inexact exception raised if the
102 | input cannot be represented exactly as an integer. However, if the fixed-
103 | point input is too large, the invalid exception is raised and the largest
104 | positive or negative integer is returned.
105 *----------------------------------------------------------------------------*/
107 static int32 roundAndPackInt32( flag zSign, bits64 absZ STATUS_PARAM)
109 int8 roundingMode;
110 flag roundNearestEven;
111 int8 roundIncrement, roundBits;
112 int32 z;
114 roundingMode = STATUS(float_rounding_mode);
115 roundNearestEven = ( roundingMode == float_round_nearest_even );
116 roundIncrement = 0x40;
117 if ( ! roundNearestEven ) {
118 if ( roundingMode == float_round_to_zero ) {
119 roundIncrement = 0;
121 else {
122 roundIncrement = 0x7F;
123 if ( zSign ) {
124 if ( roundingMode == float_round_up ) roundIncrement = 0;
126 else {
127 if ( roundingMode == float_round_down ) roundIncrement = 0;
131 roundBits = absZ & 0x7F;
132 absZ = ( absZ + roundIncrement )>>7;
133 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
134 z = absZ;
135 if ( zSign ) z = - z;
136 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
137 float_raise( float_flag_invalid STATUS_VAR);
138 return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
140 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
141 return z;
145 /*----------------------------------------------------------------------------
146 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
147 | `absZ1', with binary point between bits 63 and 64 (between the input words),
148 | and returns the properly rounded 64-bit integer corresponding to the input.
149 | If `zSign' is 1, the input is negated before being converted to an integer.
150 | Ordinarily, the fixed-point input is simply rounded to an integer, with
151 | the inexact exception raised if the input cannot be represented exactly as
152 | an integer. However, if the fixed-point input is too large, the invalid
153 | exception is raised and the largest positive or negative integer is
154 | returned.
155 *----------------------------------------------------------------------------*/
157 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 STATUS_PARAM)
159 int8 roundingMode;
160 flag roundNearestEven, increment;
161 int64 z;
163 roundingMode = STATUS(float_rounding_mode);
164 roundNearestEven = ( roundingMode == float_round_nearest_even );
165 increment = ( (sbits64) absZ1 < 0 );
166 if ( ! roundNearestEven ) {
167 if ( roundingMode == float_round_to_zero ) {
168 increment = 0;
170 else {
171 if ( zSign ) {
172 increment = ( roundingMode == float_round_down ) && absZ1;
174 else {
175 increment = ( roundingMode == float_round_up ) && absZ1;
179 if ( increment ) {
180 ++absZ0;
181 if ( absZ0 == 0 ) goto overflow;
182 absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
184 z = absZ0;
185 if ( zSign ) z = - z;
186 if ( z && ( ( z < 0 ) ^ zSign ) ) {
187 overflow:
188 float_raise( float_flag_invalid STATUS_VAR);
189 return
190 zSign ? (sbits64) LIT64( 0x8000000000000000 )
191 : LIT64( 0x7FFFFFFFFFFFFFFF );
193 if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
194 return z;
198 /*----------------------------------------------------------------------------
199 | Returns the fraction bits of the single-precision floating-point value `a'.
200 *----------------------------------------------------------------------------*/
202 INLINE bits32 extractFloat32Frac( float32 a )
205 return float32_val(a) & 0x007FFFFF;
209 /*----------------------------------------------------------------------------
210 | Returns the exponent bits of the single-precision floating-point value `a'.
211 *----------------------------------------------------------------------------*/
213 INLINE int16 extractFloat32Exp( float32 a )
216 return ( float32_val(a)>>23 ) & 0xFF;
220 /*----------------------------------------------------------------------------
221 | Returns the sign bit of the single-precision floating-point value `a'.
222 *----------------------------------------------------------------------------*/
224 INLINE flag extractFloat32Sign( float32 a )
227 return float32_val(a)>>31;
231 /*----------------------------------------------------------------------------
232 | If `a' is denormal and we are in flush-to-zero mode then set the
233 | input-denormal exception and return zero. Otherwise just return the value.
234 *----------------------------------------------------------------------------*/
235 static float32 float32_squash_input_denormal(float32 a STATUS_PARAM)
237 if (STATUS(flush_inputs_to_zero)) {
238 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
239 float_raise(float_flag_input_denormal STATUS_VAR);
240 return make_float32(float32_val(a) & 0x80000000);
243 return a;
246 /*----------------------------------------------------------------------------
247 | Normalizes the subnormal single-precision floating-point value represented
248 | by the denormalized significand `aSig'. The normalized exponent and
249 | significand are stored at the locations pointed to by `zExpPtr' and
250 | `zSigPtr', respectively.
251 *----------------------------------------------------------------------------*/
253 static void
254 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
256 int8 shiftCount;
258 shiftCount = countLeadingZeros32( aSig ) - 8;
259 *zSigPtr = aSig<<shiftCount;
260 *zExpPtr = 1 - shiftCount;
264 /*----------------------------------------------------------------------------
265 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
266 | single-precision floating-point value, returning the result. After being
267 | shifted into the proper positions, the three fields are simply added
268 | together to form the result. This means that any integer portion of `zSig'
269 | will be added into the exponent. Since a properly normalized significand
270 | will have an integer portion equal to 1, the `zExp' input should be 1 less
271 | than the desired result exponent whenever `zSig' is a complete, normalized
272 | significand.
273 *----------------------------------------------------------------------------*/
275 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
278 return make_float32(
279 ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig);
283 /*----------------------------------------------------------------------------
284 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
285 | and significand `zSig', and returns the proper single-precision floating-
286 | point value corresponding to the abstract input. Ordinarily, the abstract
287 | value is simply rounded and packed into the single-precision format, with
288 | the inexact exception raised if the abstract input cannot be represented
289 | exactly. However, if the abstract value is too large, the overflow and
290 | inexact exceptions are raised and an infinity or maximal finite value is
291 | returned. If the abstract value is too small, the input value is rounded to
292 | a subnormal number, and the underflow and inexact exceptions are raised if
293 | the abstract input cannot be represented exactly as a subnormal single-
294 | precision floating-point number.
295 | The input significand `zSig' has its binary point between bits 30
296 | and 29, which is 7 bits to the left of the usual location. This shifted
297 | significand must be normalized or smaller. If `zSig' is not normalized,
298 | `zExp' must be 0; in that case, the result returned is a subnormal number,
299 | and it must not require rounding. In the usual case that `zSig' is
300 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
301 | The handling of underflow and overflow follows the IEC/IEEE Standard for
302 | Binary Floating-Point Arithmetic.
303 *----------------------------------------------------------------------------*/
305 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
307 int8 roundingMode;
308 flag roundNearestEven;
309 int8 roundIncrement, roundBits;
310 flag isTiny;
312 roundingMode = STATUS(float_rounding_mode);
313 roundNearestEven = ( roundingMode == float_round_nearest_even );
314 roundIncrement = 0x40;
315 if ( ! roundNearestEven ) {
316 if ( roundingMode == float_round_to_zero ) {
317 roundIncrement = 0;
319 else {
320 roundIncrement = 0x7F;
321 if ( zSign ) {
322 if ( roundingMode == float_round_up ) roundIncrement = 0;
324 else {
325 if ( roundingMode == float_round_down ) roundIncrement = 0;
329 roundBits = zSig & 0x7F;
330 if ( 0xFD <= (bits16) zExp ) {
331 if ( ( 0xFD < zExp )
332 || ( ( zExp == 0xFD )
333 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
335 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
336 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
338 if ( zExp < 0 ) {
339 if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
340 isTiny =
341 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
342 || ( zExp < -1 )
343 || ( zSig + roundIncrement < 0x80000000 );
344 shift32RightJamming( zSig, - zExp, &zSig );
345 zExp = 0;
346 roundBits = zSig & 0x7F;
347 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
350 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
351 zSig = ( zSig + roundIncrement )>>7;
352 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
353 if ( zSig == 0 ) zExp = 0;
354 return packFloat32( zSign, zExp, zSig );
358 /*----------------------------------------------------------------------------
359 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
360 | and significand `zSig', and returns the proper single-precision floating-
361 | point value corresponding to the abstract input. This routine is just like
362 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
363 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
364 | floating-point exponent.
365 *----------------------------------------------------------------------------*/
367 static float32
368 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
370 int8 shiftCount;
372 shiftCount = countLeadingZeros32( zSig ) - 1;
373 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
377 /*----------------------------------------------------------------------------
378 | Returns the fraction bits of the double-precision floating-point value `a'.
379 *----------------------------------------------------------------------------*/
381 INLINE bits64 extractFloat64Frac( float64 a )
384 return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
388 /*----------------------------------------------------------------------------
389 | Returns the exponent bits of the double-precision floating-point value `a'.
390 *----------------------------------------------------------------------------*/
392 INLINE int16 extractFloat64Exp( float64 a )
395 return ( float64_val(a)>>52 ) & 0x7FF;
399 /*----------------------------------------------------------------------------
400 | Returns the sign bit of the double-precision floating-point value `a'.
401 *----------------------------------------------------------------------------*/
403 INLINE flag extractFloat64Sign( float64 a )
406 return float64_val(a)>>63;
410 /*----------------------------------------------------------------------------
411 | If `a' is denormal and we are in flush-to-zero mode then set the
412 | input-denormal exception and return zero. Otherwise just return the value.
413 *----------------------------------------------------------------------------*/
414 static float64 float64_squash_input_denormal(float64 a STATUS_PARAM)
416 if (STATUS(flush_inputs_to_zero)) {
417 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
418 float_raise(float_flag_input_denormal STATUS_VAR);
419 return make_float64(float64_val(a) & (1ULL << 63));
422 return a;
425 /*----------------------------------------------------------------------------
426 | Normalizes the subnormal double-precision floating-point value represented
427 | by the denormalized significand `aSig'. The normalized exponent and
428 | significand are stored at the locations pointed to by `zExpPtr' and
429 | `zSigPtr', respectively.
430 *----------------------------------------------------------------------------*/
432 static void
433 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
435 int8 shiftCount;
437 shiftCount = countLeadingZeros64( aSig ) - 11;
438 *zSigPtr = aSig<<shiftCount;
439 *zExpPtr = 1 - shiftCount;
443 /*----------------------------------------------------------------------------
444 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
445 | double-precision floating-point value, returning the result. After being
446 | shifted into the proper positions, the three fields are simply added
447 | together to form the result. This means that any integer portion of `zSig'
448 | will be added into the exponent. Since a properly normalized significand
449 | will have an integer portion equal to 1, the `zExp' input should be 1 less
450 | than the desired result exponent whenever `zSig' is a complete, normalized
451 | significand.
452 *----------------------------------------------------------------------------*/
454 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
457 return make_float64(
458 ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig);
462 /*----------------------------------------------------------------------------
463 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
464 | and significand `zSig', and returns the proper double-precision floating-
465 | point value corresponding to the abstract input. Ordinarily, the abstract
466 | value is simply rounded and packed into the double-precision format, with
467 | the inexact exception raised if the abstract input cannot be represented
468 | exactly. However, if the abstract value is too large, the overflow and
469 | inexact exceptions are raised and an infinity or maximal finite value is
470 | returned. If the abstract value is too small, the input value is rounded
471 | to a subnormal number, and the underflow and inexact exceptions are raised
472 | if the abstract input cannot be represented exactly as a subnormal double-
473 | precision floating-point number.
474 | The input significand `zSig' has its binary point between bits 62
475 | and 61, which is 10 bits to the left of the usual location. This shifted
476 | significand must be normalized or smaller. If `zSig' is not normalized,
477 | `zExp' must be 0; in that case, the result returned is a subnormal number,
478 | and it must not require rounding. In the usual case that `zSig' is
479 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
480 | The handling of underflow and overflow follows the IEC/IEEE Standard for
481 | Binary Floating-Point Arithmetic.
482 *----------------------------------------------------------------------------*/
484 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
486 int8 roundingMode;
487 flag roundNearestEven;
488 int16 roundIncrement, roundBits;
489 flag isTiny;
491 roundingMode = STATUS(float_rounding_mode);
492 roundNearestEven = ( roundingMode == float_round_nearest_even );
493 roundIncrement = 0x200;
494 if ( ! roundNearestEven ) {
495 if ( roundingMode == float_round_to_zero ) {
496 roundIncrement = 0;
498 else {
499 roundIncrement = 0x3FF;
500 if ( zSign ) {
501 if ( roundingMode == float_round_up ) roundIncrement = 0;
503 else {
504 if ( roundingMode == float_round_down ) roundIncrement = 0;
508 roundBits = zSig & 0x3FF;
509 if ( 0x7FD <= (bits16) zExp ) {
510 if ( ( 0x7FD < zExp )
511 || ( ( zExp == 0x7FD )
512 && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
514 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
515 return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
517 if ( zExp < 0 ) {
518 if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
519 isTiny =
520 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
521 || ( zExp < -1 )
522 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
523 shift64RightJamming( zSig, - zExp, &zSig );
524 zExp = 0;
525 roundBits = zSig & 0x3FF;
526 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
529 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
530 zSig = ( zSig + roundIncrement )>>10;
531 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
532 if ( zSig == 0 ) zExp = 0;
533 return packFloat64( zSign, zExp, zSig );
537 /*----------------------------------------------------------------------------
538 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
539 | and significand `zSig', and returns the proper double-precision floating-
540 | point value corresponding to the abstract input. This routine is just like
541 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
542 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
543 | floating-point exponent.
544 *----------------------------------------------------------------------------*/
546 static float64
547 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
549 int8 shiftCount;
551 shiftCount = countLeadingZeros64( zSig ) - 1;
552 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
556 #ifdef FLOATX80
558 /*----------------------------------------------------------------------------
559 | Returns the fraction bits of the extended double-precision floating-point
560 | value `a'.
561 *----------------------------------------------------------------------------*/
563 INLINE bits64 extractFloatx80Frac( floatx80 a )
566 return a.low;
570 /*----------------------------------------------------------------------------
571 | Returns the exponent bits of the extended double-precision floating-point
572 | value `a'.
573 *----------------------------------------------------------------------------*/
575 INLINE int32 extractFloatx80Exp( floatx80 a )
578 return a.high & 0x7FFF;
582 /*----------------------------------------------------------------------------
583 | Returns the sign bit of the extended double-precision floating-point value
584 | `a'.
585 *----------------------------------------------------------------------------*/
587 INLINE flag extractFloatx80Sign( floatx80 a )
590 return a.high>>15;
594 /*----------------------------------------------------------------------------
595 | Normalizes the subnormal extended double-precision floating-point value
596 | represented by the denormalized significand `aSig'. The normalized exponent
597 | and significand are stored at the locations pointed to by `zExpPtr' and
598 | `zSigPtr', respectively.
599 *----------------------------------------------------------------------------*/
601 static void
602 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
604 int8 shiftCount;
606 shiftCount = countLeadingZeros64( aSig );
607 *zSigPtr = aSig<<shiftCount;
608 *zExpPtr = 1 - shiftCount;
612 /*----------------------------------------------------------------------------
613 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
614 | extended double-precision floating-point value, returning the result.
615 *----------------------------------------------------------------------------*/
617 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
619 floatx80 z;
621 z.low = zSig;
622 z.high = ( ( (bits16) zSign )<<15 ) + zExp;
623 return z;
627 /*----------------------------------------------------------------------------
628 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
629 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
630 | and returns the proper extended double-precision floating-point value
631 | corresponding to the abstract input. Ordinarily, the abstract value is
632 | rounded and packed into the extended double-precision format, with the
633 | inexact exception raised if the abstract input cannot be represented
634 | exactly. However, if the abstract value is too large, the overflow and
635 | inexact exceptions are raised and an infinity or maximal finite value is
636 | returned. If the abstract value is too small, the input value is rounded to
637 | a subnormal number, and the underflow and inexact exceptions are raised if
638 | the abstract input cannot be represented exactly as a subnormal extended
639 | double-precision floating-point number.
640 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
641 | number of bits as single or double precision, respectively. Otherwise, the
642 | result is rounded to the full precision of the extended double-precision
643 | format.
644 | The input significand must be normalized or smaller. If the input
645 | significand is not normalized, `zExp' must be 0; in that case, the result
646 | returned is a subnormal number, and it must not require rounding. The
647 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
648 | Floating-Point Arithmetic.
649 *----------------------------------------------------------------------------*/
651 static floatx80
652 roundAndPackFloatx80(
653 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
654 STATUS_PARAM)
656 int8 roundingMode;
657 flag roundNearestEven, increment, isTiny;
658 int64 roundIncrement, roundMask, roundBits;
660 roundingMode = STATUS(float_rounding_mode);
661 roundNearestEven = ( roundingMode == float_round_nearest_even );
662 if ( roundingPrecision == 80 ) goto precision80;
663 if ( roundingPrecision == 64 ) {
664 roundIncrement = LIT64( 0x0000000000000400 );
665 roundMask = LIT64( 0x00000000000007FF );
667 else if ( roundingPrecision == 32 ) {
668 roundIncrement = LIT64( 0x0000008000000000 );
669 roundMask = LIT64( 0x000000FFFFFFFFFF );
671 else {
672 goto precision80;
674 zSig0 |= ( zSig1 != 0 );
675 if ( ! roundNearestEven ) {
676 if ( roundingMode == float_round_to_zero ) {
677 roundIncrement = 0;
679 else {
680 roundIncrement = roundMask;
681 if ( zSign ) {
682 if ( roundingMode == float_round_up ) roundIncrement = 0;
684 else {
685 if ( roundingMode == float_round_down ) roundIncrement = 0;
689 roundBits = zSig0 & roundMask;
690 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
691 if ( ( 0x7FFE < zExp )
692 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
694 goto overflow;
696 if ( zExp <= 0 ) {
697 if ( STATUS(flush_to_zero) ) return packFloatx80( zSign, 0, 0 );
698 isTiny =
699 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
700 || ( zExp < 0 )
701 || ( zSig0 <= zSig0 + roundIncrement );
702 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
703 zExp = 0;
704 roundBits = zSig0 & roundMask;
705 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
706 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
707 zSig0 += roundIncrement;
708 if ( (sbits64) zSig0 < 0 ) zExp = 1;
709 roundIncrement = roundMask + 1;
710 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
711 roundMask |= roundIncrement;
713 zSig0 &= ~ roundMask;
714 return packFloatx80( zSign, zExp, zSig0 );
717 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
718 zSig0 += roundIncrement;
719 if ( zSig0 < roundIncrement ) {
720 ++zExp;
721 zSig0 = LIT64( 0x8000000000000000 );
723 roundIncrement = roundMask + 1;
724 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
725 roundMask |= roundIncrement;
727 zSig0 &= ~ roundMask;
728 if ( zSig0 == 0 ) zExp = 0;
729 return packFloatx80( zSign, zExp, zSig0 );
730 precision80:
731 increment = ( (sbits64) zSig1 < 0 );
732 if ( ! roundNearestEven ) {
733 if ( roundingMode == float_round_to_zero ) {
734 increment = 0;
736 else {
737 if ( zSign ) {
738 increment = ( roundingMode == float_round_down ) && zSig1;
740 else {
741 increment = ( roundingMode == float_round_up ) && zSig1;
745 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
746 if ( ( 0x7FFE < zExp )
747 || ( ( zExp == 0x7FFE )
748 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
749 && increment
752 roundMask = 0;
753 overflow:
754 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
755 if ( ( roundingMode == float_round_to_zero )
756 || ( zSign && ( roundingMode == float_round_up ) )
757 || ( ! zSign && ( roundingMode == float_round_down ) )
759 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
761 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
763 if ( zExp <= 0 ) {
764 isTiny =
765 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
766 || ( zExp < 0 )
767 || ! increment
768 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
769 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
770 zExp = 0;
771 if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
772 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
773 if ( roundNearestEven ) {
774 increment = ( (sbits64) zSig1 < 0 );
776 else {
777 if ( zSign ) {
778 increment = ( roundingMode == float_round_down ) && zSig1;
780 else {
781 increment = ( roundingMode == float_round_up ) && zSig1;
784 if ( increment ) {
785 ++zSig0;
786 zSig0 &=
787 ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
788 if ( (sbits64) zSig0 < 0 ) zExp = 1;
790 return packFloatx80( zSign, zExp, zSig0 );
793 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
794 if ( increment ) {
795 ++zSig0;
796 if ( zSig0 == 0 ) {
797 ++zExp;
798 zSig0 = LIT64( 0x8000000000000000 );
800 else {
801 zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
804 else {
805 if ( zSig0 == 0 ) zExp = 0;
807 return packFloatx80( zSign, zExp, zSig0 );
811 /*----------------------------------------------------------------------------
812 | Takes an abstract floating-point value having sign `zSign', exponent
813 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
814 | and returns the proper extended double-precision floating-point value
815 | corresponding to the abstract input. This routine is just like
816 | `roundAndPackFloatx80' except that the input significand does not have to be
817 | normalized.
818 *----------------------------------------------------------------------------*/
820 static floatx80
821 normalizeRoundAndPackFloatx80(
822 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
823 STATUS_PARAM)
825 int8 shiftCount;
827 if ( zSig0 == 0 ) {
828 zSig0 = zSig1;
829 zSig1 = 0;
830 zExp -= 64;
832 shiftCount = countLeadingZeros64( zSig0 );
833 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
834 zExp -= shiftCount;
835 return
836 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
840 #endif
842 #ifdef FLOAT128
844 /*----------------------------------------------------------------------------
845 | Returns the least-significant 64 fraction bits of the quadruple-precision
846 | floating-point value `a'.
847 *----------------------------------------------------------------------------*/
849 INLINE bits64 extractFloat128Frac1( float128 a )
852 return a.low;
856 /*----------------------------------------------------------------------------
857 | Returns the most-significant 48 fraction bits of the quadruple-precision
858 | floating-point value `a'.
859 *----------------------------------------------------------------------------*/
861 INLINE bits64 extractFloat128Frac0( float128 a )
864 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
868 /*----------------------------------------------------------------------------
869 | Returns the exponent bits of the quadruple-precision floating-point value
870 | `a'.
871 *----------------------------------------------------------------------------*/
873 INLINE int32 extractFloat128Exp( float128 a )
876 return ( a.high>>48 ) & 0x7FFF;
880 /*----------------------------------------------------------------------------
881 | Returns the sign bit of the quadruple-precision floating-point value `a'.
882 *----------------------------------------------------------------------------*/
884 INLINE flag extractFloat128Sign( float128 a )
887 return a.high>>63;
891 /*----------------------------------------------------------------------------
892 | Normalizes the subnormal quadruple-precision floating-point value
893 | represented by the denormalized significand formed by the concatenation of
894 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
895 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
896 | significand are stored at the location pointed to by `zSig0Ptr', and the
897 | least significant 64 bits of the normalized significand are stored at the
898 | location pointed to by `zSig1Ptr'.
899 *----------------------------------------------------------------------------*/
901 static void
902 normalizeFloat128Subnormal(
903 bits64 aSig0,
904 bits64 aSig1,
905 int32 *zExpPtr,
906 bits64 *zSig0Ptr,
907 bits64 *zSig1Ptr
910 int8 shiftCount;
912 if ( aSig0 == 0 ) {
913 shiftCount = countLeadingZeros64( aSig1 ) - 15;
914 if ( shiftCount < 0 ) {
915 *zSig0Ptr = aSig1>>( - shiftCount );
916 *zSig1Ptr = aSig1<<( shiftCount & 63 );
918 else {
919 *zSig0Ptr = aSig1<<shiftCount;
920 *zSig1Ptr = 0;
922 *zExpPtr = - shiftCount - 63;
924 else {
925 shiftCount = countLeadingZeros64( aSig0 ) - 15;
926 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
927 *zExpPtr = 1 - shiftCount;
932 /*----------------------------------------------------------------------------
933 | Packs the sign `zSign', the exponent `zExp', and the significand formed
934 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
935 | floating-point value, returning the result. After being shifted into the
936 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
937 | added together to form the most significant 32 bits of the result. This
938 | means that any integer portion of `zSig0' will be added into the exponent.
939 | Since a properly normalized significand will have an integer portion equal
940 | to 1, the `zExp' input should be 1 less than the desired result exponent
941 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
942 | significand.
943 *----------------------------------------------------------------------------*/
945 INLINE float128
946 packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
948 float128 z;
950 z.low = zSig1;
951 z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
952 return z;
956 /*----------------------------------------------------------------------------
957 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
958 | and extended significand formed by the concatenation of `zSig0', `zSig1',
959 | and `zSig2', and returns the proper quadruple-precision floating-point value
960 | corresponding to the abstract input. Ordinarily, the abstract value is
961 | simply rounded and packed into the quadruple-precision format, with the
962 | inexact exception raised if the abstract input cannot be represented
963 | exactly. However, if the abstract value is too large, the overflow and
964 | inexact exceptions are raised and an infinity or maximal finite value is
965 | returned. If the abstract value is too small, the input value is rounded to
966 | a subnormal number, and the underflow and inexact exceptions are raised if
967 | the abstract input cannot be represented exactly as a subnormal quadruple-
968 | precision floating-point number.
969 | The input significand must be normalized or smaller. If the input
970 | significand is not normalized, `zExp' must be 0; in that case, the result
971 | returned is a subnormal number, and it must not require rounding. In the
972 | usual case that the input significand is normalized, `zExp' must be 1 less
973 | than the ``true'' floating-point exponent. The handling of underflow and
974 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
975 *----------------------------------------------------------------------------*/
977 static float128
978 roundAndPackFloat128(
979 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 STATUS_PARAM)
981 int8 roundingMode;
982 flag roundNearestEven, increment, isTiny;
984 roundingMode = STATUS(float_rounding_mode);
985 roundNearestEven = ( roundingMode == float_round_nearest_even );
986 increment = ( (sbits64) zSig2 < 0 );
987 if ( ! roundNearestEven ) {
988 if ( roundingMode == float_round_to_zero ) {
989 increment = 0;
991 else {
992 if ( zSign ) {
993 increment = ( roundingMode == float_round_down ) && zSig2;
995 else {
996 increment = ( roundingMode == float_round_up ) && zSig2;
1000 if ( 0x7FFD <= (bits32) zExp ) {
1001 if ( ( 0x7FFD < zExp )
1002 || ( ( zExp == 0x7FFD )
1003 && eq128(
1004 LIT64( 0x0001FFFFFFFFFFFF ),
1005 LIT64( 0xFFFFFFFFFFFFFFFF ),
1006 zSig0,
1007 zSig1
1009 && increment
1012 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
1013 if ( ( roundingMode == float_round_to_zero )
1014 || ( zSign && ( roundingMode == float_round_up ) )
1015 || ( ! zSign && ( roundingMode == float_round_down ) )
1017 return
1018 packFloat128(
1019 zSign,
1020 0x7FFE,
1021 LIT64( 0x0000FFFFFFFFFFFF ),
1022 LIT64( 0xFFFFFFFFFFFFFFFF )
1025 return packFloat128( zSign, 0x7FFF, 0, 0 );
1027 if ( zExp < 0 ) {
1028 if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
1029 isTiny =
1030 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
1031 || ( zExp < -1 )
1032 || ! increment
1033 || lt128(
1034 zSig0,
1035 zSig1,
1036 LIT64( 0x0001FFFFFFFFFFFF ),
1037 LIT64( 0xFFFFFFFFFFFFFFFF )
1039 shift128ExtraRightJamming(
1040 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1041 zExp = 0;
1042 if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
1043 if ( roundNearestEven ) {
1044 increment = ( (sbits64) zSig2 < 0 );
1046 else {
1047 if ( zSign ) {
1048 increment = ( roundingMode == float_round_down ) && zSig2;
1050 else {
1051 increment = ( roundingMode == float_round_up ) && zSig2;
1056 if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
1057 if ( increment ) {
1058 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1059 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1061 else {
1062 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1064 return packFloat128( zSign, zExp, zSig0, zSig1 );
1068 /*----------------------------------------------------------------------------
1069 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1070 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1071 | returns the proper quadruple-precision floating-point value corresponding
1072 | to the abstract input. This routine is just like `roundAndPackFloat128'
1073 | except that the input significand has fewer bits and does not have to be
1074 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1075 | point exponent.
1076 *----------------------------------------------------------------------------*/
1078 static float128
1079 normalizeRoundAndPackFloat128(
1080 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 STATUS_PARAM)
1082 int8 shiftCount;
1083 bits64 zSig2;
1085 if ( zSig0 == 0 ) {
1086 zSig0 = zSig1;
1087 zSig1 = 0;
1088 zExp -= 64;
1090 shiftCount = countLeadingZeros64( zSig0 ) - 15;
1091 if ( 0 <= shiftCount ) {
1092 zSig2 = 0;
1093 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1095 else {
1096 shift128ExtraRightJamming(
1097 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1099 zExp -= shiftCount;
1100 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1104 #endif
1106 /*----------------------------------------------------------------------------
1107 | Returns the result of converting the 32-bit two's complement integer `a'
1108 | to the single-precision floating-point format. The conversion is performed
1109 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1110 *----------------------------------------------------------------------------*/
1112 float32 int32_to_float32( int32 a STATUS_PARAM )
1114 flag zSign;
1116 if ( a == 0 ) return float32_zero;
1117 if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1118 zSign = ( a < 0 );
1119 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1123 /*----------------------------------------------------------------------------
1124 | Returns the result of converting the 32-bit two's complement integer `a'
1125 | to the double-precision floating-point format. The conversion is performed
1126 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1127 *----------------------------------------------------------------------------*/
1129 float64 int32_to_float64( int32 a STATUS_PARAM )
1131 flag zSign;
1132 uint32 absA;
1133 int8 shiftCount;
1134 bits64 zSig;
1136 if ( a == 0 ) return float64_zero;
1137 zSign = ( a < 0 );
1138 absA = zSign ? - a : a;
1139 shiftCount = countLeadingZeros32( absA ) + 21;
1140 zSig = absA;
1141 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1145 #ifdef FLOATX80
1147 /*----------------------------------------------------------------------------
1148 | Returns the result of converting the 32-bit two's complement integer `a'
1149 | to the extended double-precision floating-point format. The conversion
1150 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1151 | Arithmetic.
1152 *----------------------------------------------------------------------------*/
1154 floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1156 flag zSign;
1157 uint32 absA;
1158 int8 shiftCount;
1159 bits64 zSig;
1161 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1162 zSign = ( a < 0 );
1163 absA = zSign ? - a : a;
1164 shiftCount = countLeadingZeros32( absA ) + 32;
1165 zSig = absA;
1166 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1170 #endif
1172 #ifdef FLOAT128
1174 /*----------------------------------------------------------------------------
1175 | Returns the result of converting the 32-bit two's complement integer `a' to
1176 | the quadruple-precision floating-point format. The conversion is performed
1177 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1178 *----------------------------------------------------------------------------*/
1180 float128 int32_to_float128( int32 a STATUS_PARAM )
1182 flag zSign;
1183 uint32 absA;
1184 int8 shiftCount;
1185 bits64 zSig0;
1187 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1188 zSign = ( a < 0 );
1189 absA = zSign ? - a : a;
1190 shiftCount = countLeadingZeros32( absA ) + 17;
1191 zSig0 = absA;
1192 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1196 #endif
1198 /*----------------------------------------------------------------------------
1199 | Returns the result of converting the 64-bit two's complement integer `a'
1200 | to the single-precision floating-point format. The conversion is performed
1201 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1202 *----------------------------------------------------------------------------*/
1204 float32 int64_to_float32( int64 a STATUS_PARAM )
1206 flag zSign;
1207 uint64 absA;
1208 int8 shiftCount;
1210 if ( a == 0 ) return float32_zero;
1211 zSign = ( a < 0 );
1212 absA = zSign ? - a : a;
1213 shiftCount = countLeadingZeros64( absA ) - 40;
1214 if ( 0 <= shiftCount ) {
1215 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1217 else {
1218 shiftCount += 7;
1219 if ( shiftCount < 0 ) {
1220 shift64RightJamming( absA, - shiftCount, &absA );
1222 else {
1223 absA <<= shiftCount;
1225 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1230 float32 uint64_to_float32( uint64 a STATUS_PARAM )
1232 int8 shiftCount;
1234 if ( a == 0 ) return float32_zero;
1235 shiftCount = countLeadingZeros64( a ) - 40;
1236 if ( 0 <= shiftCount ) {
1237 return packFloat32( 1 > 0, 0x95 - shiftCount, a<<shiftCount );
1239 else {
1240 shiftCount += 7;
1241 if ( shiftCount < 0 ) {
1242 shift64RightJamming( a, - shiftCount, &a );
1244 else {
1245 a <<= shiftCount;
1247 return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount, a STATUS_VAR );
1251 /*----------------------------------------------------------------------------
1252 | Returns the result of converting the 64-bit two's complement integer `a'
1253 | to the double-precision floating-point format. The conversion is performed
1254 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1255 *----------------------------------------------------------------------------*/
1257 float64 int64_to_float64( int64 a STATUS_PARAM )
1259 flag zSign;
1261 if ( a == 0 ) return float64_zero;
1262 if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1263 return packFloat64( 1, 0x43E, 0 );
1265 zSign = ( a < 0 );
1266 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1270 float64 uint64_to_float64( uint64 a STATUS_PARAM )
1272 if ( a == 0 ) return float64_zero;
1273 return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR );
1277 #ifdef FLOATX80
1279 /*----------------------------------------------------------------------------
1280 | Returns the result of converting the 64-bit two's complement integer `a'
1281 | to the extended double-precision floating-point format. The conversion
1282 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1283 | Arithmetic.
1284 *----------------------------------------------------------------------------*/
1286 floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1288 flag zSign;
1289 uint64 absA;
1290 int8 shiftCount;
1292 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1293 zSign = ( a < 0 );
1294 absA = zSign ? - a : a;
1295 shiftCount = countLeadingZeros64( absA );
1296 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1300 #endif
1302 #ifdef FLOAT128
1304 /*----------------------------------------------------------------------------
1305 | Returns the result of converting the 64-bit two's complement integer `a' to
1306 | the quadruple-precision floating-point format. The conversion is performed
1307 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1308 *----------------------------------------------------------------------------*/
1310 float128 int64_to_float128( int64 a STATUS_PARAM )
1312 flag zSign;
1313 uint64 absA;
1314 int8 shiftCount;
1315 int32 zExp;
1316 bits64 zSig0, zSig1;
1318 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1319 zSign = ( a < 0 );
1320 absA = zSign ? - a : a;
1321 shiftCount = countLeadingZeros64( absA ) + 49;
1322 zExp = 0x406E - shiftCount;
1323 if ( 64 <= shiftCount ) {
1324 zSig1 = 0;
1325 zSig0 = absA;
1326 shiftCount -= 64;
1328 else {
1329 zSig1 = absA;
1330 zSig0 = 0;
1332 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1333 return packFloat128( zSign, zExp, zSig0, zSig1 );
1337 #endif
1339 /*----------------------------------------------------------------------------
1340 | Returns the result of converting the single-precision floating-point value
1341 | `a' to the 32-bit two's complement integer format. The conversion is
1342 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1343 | Arithmetic---which means in particular that the conversion is rounded
1344 | according to the current rounding mode. If `a' is a NaN, the largest
1345 | positive integer is returned. Otherwise, if the conversion overflows, the
1346 | largest integer with the same sign as `a' is returned.
1347 *----------------------------------------------------------------------------*/
1349 int32 float32_to_int32( float32 a STATUS_PARAM )
1351 flag aSign;
1352 int16 aExp, shiftCount;
1353 bits32 aSig;
1354 bits64 aSig64;
1356 a = float32_squash_input_denormal(a STATUS_VAR);
1357 aSig = extractFloat32Frac( a );
1358 aExp = extractFloat32Exp( a );
1359 aSign = extractFloat32Sign( a );
1360 if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1361 if ( aExp ) aSig |= 0x00800000;
1362 shiftCount = 0xAF - aExp;
1363 aSig64 = aSig;
1364 aSig64 <<= 32;
1365 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1366 return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1370 /*----------------------------------------------------------------------------
1371 | Returns the result of converting the single-precision floating-point value
1372 | `a' to the 32-bit two's complement integer format. The conversion is
1373 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1374 | Arithmetic, except that the conversion is always rounded toward zero.
1375 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1376 | the conversion overflows, the largest integer with the same sign as `a' is
1377 | returned.
1378 *----------------------------------------------------------------------------*/
1380 int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1382 flag aSign;
1383 int16 aExp, shiftCount;
1384 bits32 aSig;
1385 int32 z;
1386 a = float32_squash_input_denormal(a STATUS_VAR);
1388 aSig = extractFloat32Frac( a );
1389 aExp = extractFloat32Exp( a );
1390 aSign = extractFloat32Sign( a );
1391 shiftCount = aExp - 0x9E;
1392 if ( 0 <= shiftCount ) {
1393 if ( float32_val(a) != 0xCF000000 ) {
1394 float_raise( float_flag_invalid STATUS_VAR);
1395 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1397 return (sbits32) 0x80000000;
1399 else if ( aExp <= 0x7E ) {
1400 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1401 return 0;
1403 aSig = ( aSig | 0x00800000 )<<8;
1404 z = aSig>>( - shiftCount );
1405 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1406 STATUS(float_exception_flags) |= float_flag_inexact;
1408 if ( aSign ) z = - z;
1409 return z;
1413 /*----------------------------------------------------------------------------
1414 | Returns the result of converting the single-precision floating-point value
1415 | `a' to the 16-bit two's complement integer format. The conversion is
1416 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1417 | Arithmetic, except that the conversion is always rounded toward zero.
1418 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1419 | the conversion overflows, the largest integer with the same sign as `a' is
1420 | returned.
1421 *----------------------------------------------------------------------------*/
1423 int16 float32_to_int16_round_to_zero( float32 a STATUS_PARAM )
1425 flag aSign;
1426 int16 aExp, shiftCount;
1427 bits32 aSig;
1428 int32 z;
1430 aSig = extractFloat32Frac( a );
1431 aExp = extractFloat32Exp( a );
1432 aSign = extractFloat32Sign( a );
1433 shiftCount = aExp - 0x8E;
1434 if ( 0 <= shiftCount ) {
1435 if ( float32_val(a) != 0xC7000000 ) {
1436 float_raise( float_flag_invalid STATUS_VAR);
1437 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1438 return 0x7FFF;
1441 return (sbits32) 0xffff8000;
1443 else if ( aExp <= 0x7E ) {
1444 if ( aExp | aSig ) {
1445 STATUS(float_exception_flags) |= float_flag_inexact;
1447 return 0;
1449 shiftCount -= 0x10;
1450 aSig = ( aSig | 0x00800000 )<<8;
1451 z = aSig>>( - shiftCount );
1452 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1453 STATUS(float_exception_flags) |= float_flag_inexact;
1455 if ( aSign ) {
1456 z = - z;
1458 return z;
1462 /*----------------------------------------------------------------------------
1463 | Returns the result of converting the single-precision floating-point value
1464 | `a' to the 64-bit two's complement integer format. The conversion is
1465 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1466 | Arithmetic---which means in particular that the conversion is rounded
1467 | according to the current rounding mode. If `a' is a NaN, the largest
1468 | positive integer is returned. Otherwise, if the conversion overflows, the
1469 | largest integer with the same sign as `a' is returned.
1470 *----------------------------------------------------------------------------*/
1472 int64 float32_to_int64( float32 a STATUS_PARAM )
1474 flag aSign;
1475 int16 aExp, shiftCount;
1476 bits32 aSig;
1477 bits64 aSig64, aSigExtra;
1478 a = float32_squash_input_denormal(a STATUS_VAR);
1480 aSig = extractFloat32Frac( a );
1481 aExp = extractFloat32Exp( a );
1482 aSign = extractFloat32Sign( a );
1483 shiftCount = 0xBE - aExp;
1484 if ( shiftCount < 0 ) {
1485 float_raise( float_flag_invalid STATUS_VAR);
1486 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1487 return LIT64( 0x7FFFFFFFFFFFFFFF );
1489 return (sbits64) LIT64( 0x8000000000000000 );
1491 if ( aExp ) aSig |= 0x00800000;
1492 aSig64 = aSig;
1493 aSig64 <<= 40;
1494 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1495 return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1499 /*----------------------------------------------------------------------------
1500 | Returns the result of converting the single-precision floating-point value
1501 | `a' to the 64-bit two's complement integer format. The conversion is
1502 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1503 | Arithmetic, except that the conversion is always rounded toward zero. If
1504 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1505 | conversion overflows, the largest integer with the same sign as `a' is
1506 | returned.
1507 *----------------------------------------------------------------------------*/
1509 int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1511 flag aSign;
1512 int16 aExp, shiftCount;
1513 bits32 aSig;
1514 bits64 aSig64;
1515 int64 z;
1516 a = float32_squash_input_denormal(a STATUS_VAR);
1518 aSig = extractFloat32Frac( a );
1519 aExp = extractFloat32Exp( a );
1520 aSign = extractFloat32Sign( a );
1521 shiftCount = aExp - 0xBE;
1522 if ( 0 <= shiftCount ) {
1523 if ( float32_val(a) != 0xDF000000 ) {
1524 float_raise( float_flag_invalid STATUS_VAR);
1525 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1526 return LIT64( 0x7FFFFFFFFFFFFFFF );
1529 return (sbits64) LIT64( 0x8000000000000000 );
1531 else if ( aExp <= 0x7E ) {
1532 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1533 return 0;
1535 aSig64 = aSig | 0x00800000;
1536 aSig64 <<= 40;
1537 z = aSig64>>( - shiftCount );
1538 if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1539 STATUS(float_exception_flags) |= float_flag_inexact;
1541 if ( aSign ) z = - z;
1542 return z;
1546 /*----------------------------------------------------------------------------
1547 | Returns the result of converting the single-precision floating-point value
1548 | `a' to the double-precision floating-point format. The conversion is
1549 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1550 | Arithmetic.
1551 *----------------------------------------------------------------------------*/
1553 float64 float32_to_float64( float32 a STATUS_PARAM )
1555 flag aSign;
1556 int16 aExp;
1557 bits32 aSig;
1558 a = float32_squash_input_denormal(a STATUS_VAR);
1560 aSig = extractFloat32Frac( a );
1561 aExp = extractFloat32Exp( a );
1562 aSign = extractFloat32Sign( a );
1563 if ( aExp == 0xFF ) {
1564 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1565 return packFloat64( aSign, 0x7FF, 0 );
1567 if ( aExp == 0 ) {
1568 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1569 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1570 --aExp;
1572 return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1576 #ifdef FLOATX80
1578 /*----------------------------------------------------------------------------
1579 | Returns the result of converting the single-precision floating-point value
1580 | `a' to the extended double-precision floating-point format. The conversion
1581 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1582 | Arithmetic.
1583 *----------------------------------------------------------------------------*/
1585 floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1587 flag aSign;
1588 int16 aExp;
1589 bits32 aSig;
1591 a = float32_squash_input_denormal(a STATUS_VAR);
1592 aSig = extractFloat32Frac( a );
1593 aExp = extractFloat32Exp( a );
1594 aSign = extractFloat32Sign( a );
1595 if ( aExp == 0xFF ) {
1596 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1597 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1599 if ( aExp == 0 ) {
1600 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1601 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1603 aSig |= 0x00800000;
1604 return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1608 #endif
1610 #ifdef FLOAT128
1612 /*----------------------------------------------------------------------------
1613 | Returns the result of converting the single-precision floating-point value
1614 | `a' to the double-precision floating-point format. The conversion is
1615 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1616 | Arithmetic.
1617 *----------------------------------------------------------------------------*/
1619 float128 float32_to_float128( float32 a STATUS_PARAM )
1621 flag aSign;
1622 int16 aExp;
1623 bits32 aSig;
1625 a = float32_squash_input_denormal(a STATUS_VAR);
1626 aSig = extractFloat32Frac( a );
1627 aExp = extractFloat32Exp( a );
1628 aSign = extractFloat32Sign( a );
1629 if ( aExp == 0xFF ) {
1630 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1631 return packFloat128( aSign, 0x7FFF, 0, 0 );
1633 if ( aExp == 0 ) {
1634 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1635 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1636 --aExp;
1638 return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1642 #endif
1644 /*----------------------------------------------------------------------------
1645 | Rounds the single-precision floating-point value `a' to an integer, and
1646 | returns the result as a single-precision floating-point value. The
1647 | operation is performed according to the IEC/IEEE Standard for Binary
1648 | Floating-Point Arithmetic.
1649 *----------------------------------------------------------------------------*/
1651 float32 float32_round_to_int( float32 a STATUS_PARAM)
1653 flag aSign;
1654 int16 aExp;
1655 bits32 lastBitMask, roundBitsMask;
1656 int8 roundingMode;
1657 bits32 z;
1658 a = float32_squash_input_denormal(a STATUS_VAR);
1660 aExp = extractFloat32Exp( a );
1661 if ( 0x96 <= aExp ) {
1662 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1663 return propagateFloat32NaN( a, a STATUS_VAR );
1665 return a;
1667 if ( aExp <= 0x7E ) {
1668 if ( (bits32) ( float32_val(a)<<1 ) == 0 ) return a;
1669 STATUS(float_exception_flags) |= float_flag_inexact;
1670 aSign = extractFloat32Sign( a );
1671 switch ( STATUS(float_rounding_mode) ) {
1672 case float_round_nearest_even:
1673 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1674 return packFloat32( aSign, 0x7F, 0 );
1676 break;
1677 case float_round_down:
1678 return make_float32(aSign ? 0xBF800000 : 0);
1679 case float_round_up:
1680 return make_float32(aSign ? 0x80000000 : 0x3F800000);
1682 return packFloat32( aSign, 0, 0 );
1684 lastBitMask = 1;
1685 lastBitMask <<= 0x96 - aExp;
1686 roundBitsMask = lastBitMask - 1;
1687 z = float32_val(a);
1688 roundingMode = STATUS(float_rounding_mode);
1689 if ( roundingMode == float_round_nearest_even ) {
1690 z += lastBitMask>>1;
1691 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1693 else if ( roundingMode != float_round_to_zero ) {
1694 if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
1695 z += roundBitsMask;
1698 z &= ~ roundBitsMask;
1699 if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1700 return make_float32(z);
1704 /*----------------------------------------------------------------------------
1705 | Returns the result of adding the absolute values of the single-precision
1706 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1707 | before being returned. `zSign' is ignored if the result is a NaN.
1708 | The addition is performed according to the IEC/IEEE Standard for Binary
1709 | Floating-Point Arithmetic.
1710 *----------------------------------------------------------------------------*/
1712 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1714 int16 aExp, bExp, zExp;
1715 bits32 aSig, bSig, zSig;
1716 int16 expDiff;
1718 aSig = extractFloat32Frac( a );
1719 aExp = extractFloat32Exp( a );
1720 bSig = extractFloat32Frac( b );
1721 bExp = extractFloat32Exp( b );
1722 expDiff = aExp - bExp;
1723 aSig <<= 6;
1724 bSig <<= 6;
1725 if ( 0 < expDiff ) {
1726 if ( aExp == 0xFF ) {
1727 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1728 return a;
1730 if ( bExp == 0 ) {
1731 --expDiff;
1733 else {
1734 bSig |= 0x20000000;
1736 shift32RightJamming( bSig, expDiff, &bSig );
1737 zExp = aExp;
1739 else if ( expDiff < 0 ) {
1740 if ( bExp == 0xFF ) {
1741 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1742 return packFloat32( zSign, 0xFF, 0 );
1744 if ( aExp == 0 ) {
1745 ++expDiff;
1747 else {
1748 aSig |= 0x20000000;
1750 shift32RightJamming( aSig, - expDiff, &aSig );
1751 zExp = bExp;
1753 else {
1754 if ( aExp == 0xFF ) {
1755 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1756 return a;
1758 if ( aExp == 0 ) {
1759 if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
1760 return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1762 zSig = 0x40000000 + aSig + bSig;
1763 zExp = aExp;
1764 goto roundAndPack;
1766 aSig |= 0x20000000;
1767 zSig = ( aSig + bSig )<<1;
1768 --zExp;
1769 if ( (sbits32) zSig < 0 ) {
1770 zSig = aSig + bSig;
1771 ++zExp;
1773 roundAndPack:
1774 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1778 /*----------------------------------------------------------------------------
1779 | Returns the result of subtracting the absolute values of the single-
1780 | precision floating-point values `a' and `b'. If `zSign' is 1, the
1781 | difference is negated before being returned. `zSign' is ignored if the
1782 | result is a NaN. The subtraction is performed according to the IEC/IEEE
1783 | Standard for Binary Floating-Point Arithmetic.
1784 *----------------------------------------------------------------------------*/
1786 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1788 int16 aExp, bExp, zExp;
1789 bits32 aSig, bSig, zSig;
1790 int16 expDiff;
1792 aSig = extractFloat32Frac( a );
1793 aExp = extractFloat32Exp( a );
1794 bSig = extractFloat32Frac( b );
1795 bExp = extractFloat32Exp( b );
1796 expDiff = aExp - bExp;
1797 aSig <<= 7;
1798 bSig <<= 7;
1799 if ( 0 < expDiff ) goto aExpBigger;
1800 if ( expDiff < 0 ) goto bExpBigger;
1801 if ( aExp == 0xFF ) {
1802 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1803 float_raise( float_flag_invalid STATUS_VAR);
1804 return float32_default_nan;
1806 if ( aExp == 0 ) {
1807 aExp = 1;
1808 bExp = 1;
1810 if ( bSig < aSig ) goto aBigger;
1811 if ( aSig < bSig ) goto bBigger;
1812 return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1813 bExpBigger:
1814 if ( bExp == 0xFF ) {
1815 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1816 return packFloat32( zSign ^ 1, 0xFF, 0 );
1818 if ( aExp == 0 ) {
1819 ++expDiff;
1821 else {
1822 aSig |= 0x40000000;
1824 shift32RightJamming( aSig, - expDiff, &aSig );
1825 bSig |= 0x40000000;
1826 bBigger:
1827 zSig = bSig - aSig;
1828 zExp = bExp;
1829 zSign ^= 1;
1830 goto normalizeRoundAndPack;
1831 aExpBigger:
1832 if ( aExp == 0xFF ) {
1833 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1834 return a;
1836 if ( bExp == 0 ) {
1837 --expDiff;
1839 else {
1840 bSig |= 0x40000000;
1842 shift32RightJamming( bSig, expDiff, &bSig );
1843 aSig |= 0x40000000;
1844 aBigger:
1845 zSig = aSig - bSig;
1846 zExp = aExp;
1847 normalizeRoundAndPack:
1848 --zExp;
1849 return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1853 /*----------------------------------------------------------------------------
1854 | Returns the result of adding the single-precision floating-point values `a'
1855 | and `b'. The operation is performed according to the IEC/IEEE Standard for
1856 | Binary Floating-Point Arithmetic.
1857 *----------------------------------------------------------------------------*/
1859 float32 float32_add( float32 a, float32 b STATUS_PARAM )
1861 flag aSign, bSign;
1862 a = float32_squash_input_denormal(a STATUS_VAR);
1863 b = float32_squash_input_denormal(b STATUS_VAR);
1865 aSign = extractFloat32Sign( a );
1866 bSign = extractFloat32Sign( b );
1867 if ( aSign == bSign ) {
1868 return addFloat32Sigs( a, b, aSign STATUS_VAR);
1870 else {
1871 return subFloat32Sigs( a, b, aSign STATUS_VAR );
1876 /*----------------------------------------------------------------------------
1877 | Returns the result of subtracting the single-precision floating-point values
1878 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1879 | for Binary Floating-Point Arithmetic.
1880 *----------------------------------------------------------------------------*/
1882 float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1884 flag aSign, bSign;
1885 a = float32_squash_input_denormal(a STATUS_VAR);
1886 b = float32_squash_input_denormal(b STATUS_VAR);
1888 aSign = extractFloat32Sign( a );
1889 bSign = extractFloat32Sign( b );
1890 if ( aSign == bSign ) {
1891 return subFloat32Sigs( a, b, aSign STATUS_VAR );
1893 else {
1894 return addFloat32Sigs( a, b, aSign STATUS_VAR );
1899 /*----------------------------------------------------------------------------
1900 | Returns the result of multiplying the single-precision floating-point values
1901 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1902 | for Binary Floating-Point Arithmetic.
1903 *----------------------------------------------------------------------------*/
1905 float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1907 flag aSign, bSign, zSign;
1908 int16 aExp, bExp, zExp;
1909 bits32 aSig, bSig;
1910 bits64 zSig64;
1911 bits32 zSig;
1913 a = float32_squash_input_denormal(a STATUS_VAR);
1914 b = float32_squash_input_denormal(b STATUS_VAR);
1916 aSig = extractFloat32Frac( a );
1917 aExp = extractFloat32Exp( a );
1918 aSign = extractFloat32Sign( a );
1919 bSig = extractFloat32Frac( b );
1920 bExp = extractFloat32Exp( b );
1921 bSign = extractFloat32Sign( b );
1922 zSign = aSign ^ bSign;
1923 if ( aExp == 0xFF ) {
1924 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1925 return propagateFloat32NaN( a, b STATUS_VAR );
1927 if ( ( bExp | bSig ) == 0 ) {
1928 float_raise( float_flag_invalid STATUS_VAR);
1929 return float32_default_nan;
1931 return packFloat32( zSign, 0xFF, 0 );
1933 if ( bExp == 0xFF ) {
1934 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1935 if ( ( aExp | aSig ) == 0 ) {
1936 float_raise( float_flag_invalid STATUS_VAR);
1937 return float32_default_nan;
1939 return packFloat32( zSign, 0xFF, 0 );
1941 if ( aExp == 0 ) {
1942 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1943 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1945 if ( bExp == 0 ) {
1946 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1947 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1949 zExp = aExp + bExp - 0x7F;
1950 aSig = ( aSig | 0x00800000 )<<7;
1951 bSig = ( bSig | 0x00800000 )<<8;
1952 shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1953 zSig = zSig64;
1954 if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1955 zSig <<= 1;
1956 --zExp;
1958 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1962 /*----------------------------------------------------------------------------
1963 | Returns the result of dividing the single-precision floating-point value `a'
1964 | by the corresponding value `b'. The operation is performed according to the
1965 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1966 *----------------------------------------------------------------------------*/
1968 float32 float32_div( float32 a, float32 b STATUS_PARAM )
1970 flag aSign, bSign, zSign;
1971 int16 aExp, bExp, zExp;
1972 bits32 aSig, bSig, zSig;
1973 a = float32_squash_input_denormal(a STATUS_VAR);
1974 b = float32_squash_input_denormal(b STATUS_VAR);
1976 aSig = extractFloat32Frac( a );
1977 aExp = extractFloat32Exp( a );
1978 aSign = extractFloat32Sign( a );
1979 bSig = extractFloat32Frac( b );
1980 bExp = extractFloat32Exp( b );
1981 bSign = extractFloat32Sign( b );
1982 zSign = aSign ^ bSign;
1983 if ( aExp == 0xFF ) {
1984 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1985 if ( bExp == 0xFF ) {
1986 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1987 float_raise( float_flag_invalid STATUS_VAR);
1988 return float32_default_nan;
1990 return packFloat32( zSign, 0xFF, 0 );
1992 if ( bExp == 0xFF ) {
1993 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1994 return packFloat32( zSign, 0, 0 );
1996 if ( bExp == 0 ) {
1997 if ( bSig == 0 ) {
1998 if ( ( aExp | aSig ) == 0 ) {
1999 float_raise( float_flag_invalid STATUS_VAR);
2000 return float32_default_nan;
2002 float_raise( float_flag_divbyzero STATUS_VAR);
2003 return packFloat32( zSign, 0xFF, 0 );
2005 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2007 if ( aExp == 0 ) {
2008 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2009 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2011 zExp = aExp - bExp + 0x7D;
2012 aSig = ( aSig | 0x00800000 )<<7;
2013 bSig = ( bSig | 0x00800000 )<<8;
2014 if ( bSig <= ( aSig + aSig ) ) {
2015 aSig >>= 1;
2016 ++zExp;
2018 zSig = ( ( (bits64) aSig )<<32 ) / bSig;
2019 if ( ( zSig & 0x3F ) == 0 ) {
2020 zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
2022 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2026 /*----------------------------------------------------------------------------
2027 | Returns the remainder of the single-precision floating-point value `a'
2028 | with respect to the corresponding value `b'. The operation is performed
2029 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2030 *----------------------------------------------------------------------------*/
2032 float32 float32_rem( float32 a, float32 b STATUS_PARAM )
2034 flag aSign, zSign;
2035 int16 aExp, bExp, expDiff;
2036 bits32 aSig, bSig;
2037 bits32 q;
2038 bits64 aSig64, bSig64, q64;
2039 bits32 alternateASig;
2040 sbits32 sigMean;
2041 a = float32_squash_input_denormal(a STATUS_VAR);
2042 b = float32_squash_input_denormal(b STATUS_VAR);
2044 aSig = extractFloat32Frac( a );
2045 aExp = extractFloat32Exp( a );
2046 aSign = extractFloat32Sign( a );
2047 bSig = extractFloat32Frac( b );
2048 bExp = extractFloat32Exp( b );
2049 if ( aExp == 0xFF ) {
2050 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2051 return propagateFloat32NaN( a, b STATUS_VAR );
2053 float_raise( float_flag_invalid STATUS_VAR);
2054 return float32_default_nan;
2056 if ( bExp == 0xFF ) {
2057 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2058 return a;
2060 if ( bExp == 0 ) {
2061 if ( bSig == 0 ) {
2062 float_raise( float_flag_invalid STATUS_VAR);
2063 return float32_default_nan;
2065 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2067 if ( aExp == 0 ) {
2068 if ( aSig == 0 ) return a;
2069 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2071 expDiff = aExp - bExp;
2072 aSig |= 0x00800000;
2073 bSig |= 0x00800000;
2074 if ( expDiff < 32 ) {
2075 aSig <<= 8;
2076 bSig <<= 8;
2077 if ( expDiff < 0 ) {
2078 if ( expDiff < -1 ) return a;
2079 aSig >>= 1;
2081 q = ( bSig <= aSig );
2082 if ( q ) aSig -= bSig;
2083 if ( 0 < expDiff ) {
2084 q = ( ( (bits64) aSig )<<32 ) / bSig;
2085 q >>= 32 - expDiff;
2086 bSig >>= 2;
2087 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2089 else {
2090 aSig >>= 2;
2091 bSig >>= 2;
2094 else {
2095 if ( bSig <= aSig ) aSig -= bSig;
2096 aSig64 = ( (bits64) aSig )<<40;
2097 bSig64 = ( (bits64) bSig )<<40;
2098 expDiff -= 64;
2099 while ( 0 < expDiff ) {
2100 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2101 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2102 aSig64 = - ( ( bSig * q64 )<<38 );
2103 expDiff -= 62;
2105 expDiff += 64;
2106 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2107 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2108 q = q64>>( 64 - expDiff );
2109 bSig <<= 6;
2110 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2112 do {
2113 alternateASig = aSig;
2114 ++q;
2115 aSig -= bSig;
2116 } while ( 0 <= (sbits32) aSig );
2117 sigMean = aSig + alternateASig;
2118 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2119 aSig = alternateASig;
2121 zSign = ( (sbits32) aSig < 0 );
2122 if ( zSign ) aSig = - aSig;
2123 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2127 /*----------------------------------------------------------------------------
2128 | Returns the square root of the single-precision floating-point value `a'.
2129 | The operation is performed according to the IEC/IEEE Standard for Binary
2130 | Floating-Point Arithmetic.
2131 *----------------------------------------------------------------------------*/
2133 float32 float32_sqrt( float32 a STATUS_PARAM )
2135 flag aSign;
2136 int16 aExp, zExp;
2137 bits32 aSig, zSig;
2138 bits64 rem, term;
2139 a = float32_squash_input_denormal(a STATUS_VAR);
2141 aSig = extractFloat32Frac( a );
2142 aExp = extractFloat32Exp( a );
2143 aSign = extractFloat32Sign( a );
2144 if ( aExp == 0xFF ) {
2145 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2146 if ( ! aSign ) return a;
2147 float_raise( float_flag_invalid STATUS_VAR);
2148 return float32_default_nan;
2150 if ( aSign ) {
2151 if ( ( aExp | aSig ) == 0 ) return a;
2152 float_raise( float_flag_invalid STATUS_VAR);
2153 return float32_default_nan;
2155 if ( aExp == 0 ) {
2156 if ( aSig == 0 ) return float32_zero;
2157 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2159 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2160 aSig = ( aSig | 0x00800000 )<<8;
2161 zSig = estimateSqrt32( aExp, aSig ) + 2;
2162 if ( ( zSig & 0x7F ) <= 5 ) {
2163 if ( zSig < 2 ) {
2164 zSig = 0x7FFFFFFF;
2165 goto roundAndPack;
2167 aSig >>= aExp & 1;
2168 term = ( (bits64) zSig ) * zSig;
2169 rem = ( ( (bits64) aSig )<<32 ) - term;
2170 while ( (sbits64) rem < 0 ) {
2171 --zSig;
2172 rem += ( ( (bits64) zSig )<<1 ) | 1;
2174 zSig |= ( rem != 0 );
2176 shift32RightJamming( zSig, 1, &zSig );
2177 roundAndPack:
2178 return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2182 /*----------------------------------------------------------------------------
2183 | Returns the binary exponential of the single-precision floating-point value
2184 | `a'. The operation is performed according to the IEC/IEEE Standard for
2185 | Binary Floating-Point Arithmetic.
2187 | Uses the following identities:
2189 | 1. -------------------------------------------------------------------------
2190 | x x*ln(2)
2191 | 2 = e
2193 | 2. -------------------------------------------------------------------------
2194 | 2 3 4 5 n
2195 | x x x x x x x
2196 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2197 | 1! 2! 3! 4! 5! n!
2198 *----------------------------------------------------------------------------*/
2200 static const float64 float32_exp2_coefficients[15] =
2202 const_float64( 0x3ff0000000000000ll ), /* 1 */
2203 const_float64( 0x3fe0000000000000ll ), /* 2 */
2204 const_float64( 0x3fc5555555555555ll ), /* 3 */
2205 const_float64( 0x3fa5555555555555ll ), /* 4 */
2206 const_float64( 0x3f81111111111111ll ), /* 5 */
2207 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */
2208 const_float64( 0x3f2a01a01a01a01all ), /* 7 */
2209 const_float64( 0x3efa01a01a01a01all ), /* 8 */
2210 const_float64( 0x3ec71de3a556c734ll ), /* 9 */
2211 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2212 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2213 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2214 const_float64( 0x3de6124613a86d09ll ), /* 13 */
2215 const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2216 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2219 float32 float32_exp2( float32 a STATUS_PARAM )
2221 flag aSign;
2222 int16 aExp;
2223 bits32 aSig;
2224 float64 r, x, xn;
2225 int i;
2226 a = float32_squash_input_denormal(a STATUS_VAR);
2228 aSig = extractFloat32Frac( a );
2229 aExp = extractFloat32Exp( a );
2230 aSign = extractFloat32Sign( a );
2232 if ( aExp == 0xFF) {
2233 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2234 return (aSign) ? float32_zero : a;
2236 if (aExp == 0) {
2237 if (aSig == 0) return float32_one;
2240 float_raise( float_flag_inexact STATUS_VAR);
2242 /* ******************************* */
2243 /* using float64 for approximation */
2244 /* ******************************* */
2245 x = float32_to_float64(a STATUS_VAR);
2246 x = float64_mul(x, float64_ln2 STATUS_VAR);
2248 xn = x;
2249 r = float64_one;
2250 for (i = 0 ; i < 15 ; i++) {
2251 float64 f;
2253 f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);
2254 r = float64_add(r, f STATUS_VAR);
2256 xn = float64_mul(xn, x STATUS_VAR);
2259 return float64_to_float32(r, status);
2262 /*----------------------------------------------------------------------------
2263 | Returns the binary log of the single-precision floating-point value `a'.
2264 | The operation is performed according to the IEC/IEEE Standard for Binary
2265 | Floating-Point Arithmetic.
2266 *----------------------------------------------------------------------------*/
2267 float32 float32_log2( float32 a STATUS_PARAM )
2269 flag aSign, zSign;
2270 int16 aExp;
2271 bits32 aSig, zSig, i;
2273 a = float32_squash_input_denormal(a STATUS_VAR);
2274 aSig = extractFloat32Frac( a );
2275 aExp = extractFloat32Exp( a );
2276 aSign = extractFloat32Sign( a );
2278 if ( aExp == 0 ) {
2279 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2280 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2282 if ( aSign ) {
2283 float_raise( float_flag_invalid STATUS_VAR);
2284 return float32_default_nan;
2286 if ( aExp == 0xFF ) {
2287 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2288 return a;
2291 aExp -= 0x7F;
2292 aSig |= 0x00800000;
2293 zSign = aExp < 0;
2294 zSig = aExp << 23;
2296 for (i = 1 << 22; i > 0; i >>= 1) {
2297 aSig = ( (bits64)aSig * aSig ) >> 23;
2298 if ( aSig & 0x01000000 ) {
2299 aSig >>= 1;
2300 zSig |= i;
2304 if ( zSign )
2305 zSig = -zSig;
2307 return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2310 /*----------------------------------------------------------------------------
2311 | Returns 1 if the single-precision floating-point value `a' is equal to
2312 | the corresponding value `b', and 0 otherwise. The comparison is performed
2313 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2314 *----------------------------------------------------------------------------*/
2316 int float32_eq( float32 a, float32 b STATUS_PARAM )
2318 a = float32_squash_input_denormal(a STATUS_VAR);
2319 b = float32_squash_input_denormal(b STATUS_VAR);
2321 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2322 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2324 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2325 float_raise( float_flag_invalid STATUS_VAR);
2327 return 0;
2329 return ( float32_val(a) == float32_val(b) ) ||
2330 ( (bits32) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2334 /*----------------------------------------------------------------------------
2335 | Returns 1 if the single-precision floating-point value `a' is less than
2336 | or equal to the corresponding value `b', and 0 otherwise. The comparison
2337 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2338 | Arithmetic.
2339 *----------------------------------------------------------------------------*/
2341 int float32_le( float32 a, float32 b STATUS_PARAM )
2343 flag aSign, bSign;
2344 bits32 av, bv;
2345 a = float32_squash_input_denormal(a STATUS_VAR);
2346 b = float32_squash_input_denormal(b STATUS_VAR);
2348 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2349 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2351 float_raise( float_flag_invalid STATUS_VAR);
2352 return 0;
2354 aSign = extractFloat32Sign( a );
2355 bSign = extractFloat32Sign( b );
2356 av = float32_val(a);
2357 bv = float32_val(b);
2358 if ( aSign != bSign ) return aSign || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2359 return ( av == bv ) || ( aSign ^ ( av < bv ) );
2363 /*----------------------------------------------------------------------------
2364 | Returns 1 if the single-precision floating-point value `a' is less than
2365 | the corresponding value `b', and 0 otherwise. The comparison is performed
2366 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2367 *----------------------------------------------------------------------------*/
2369 int float32_lt( float32 a, float32 b STATUS_PARAM )
2371 flag aSign, bSign;
2372 bits32 av, bv;
2373 a = float32_squash_input_denormal(a STATUS_VAR);
2374 b = float32_squash_input_denormal(b STATUS_VAR);
2376 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2377 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2379 float_raise( float_flag_invalid STATUS_VAR);
2380 return 0;
2382 aSign = extractFloat32Sign( a );
2383 bSign = extractFloat32Sign( b );
2384 av = float32_val(a);
2385 bv = float32_val(b);
2386 if ( aSign != bSign ) return aSign && ( (bits32) ( ( av | bv )<<1 ) != 0 );
2387 return ( av != bv ) && ( aSign ^ ( av < bv ) );
2391 /*----------------------------------------------------------------------------
2392 | Returns 1 if the single-precision floating-point value `a' is equal to
2393 | the corresponding value `b', and 0 otherwise. The invalid exception is
2394 | raised if either operand is a NaN. Otherwise, the comparison is performed
2395 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2396 *----------------------------------------------------------------------------*/
2398 int float32_eq_signaling( float32 a, float32 b STATUS_PARAM )
2400 bits32 av, bv;
2401 a = float32_squash_input_denormal(a STATUS_VAR);
2402 b = float32_squash_input_denormal(b STATUS_VAR);
2404 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2405 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2407 float_raise( float_flag_invalid STATUS_VAR);
2408 return 0;
2410 av = float32_val(a);
2411 bv = float32_val(b);
2412 return ( av == bv ) || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2416 /*----------------------------------------------------------------------------
2417 | Returns 1 if the single-precision floating-point value `a' is less than or
2418 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2419 | cause an exception. Otherwise, the comparison is performed according to the
2420 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2421 *----------------------------------------------------------------------------*/
2423 int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2425 flag aSign, bSign;
2426 bits32 av, bv;
2427 a = float32_squash_input_denormal(a STATUS_VAR);
2428 b = float32_squash_input_denormal(b STATUS_VAR);
2430 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2431 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2433 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2434 float_raise( float_flag_invalid STATUS_VAR);
2436 return 0;
2438 aSign = extractFloat32Sign( a );
2439 bSign = extractFloat32Sign( b );
2440 av = float32_val(a);
2441 bv = float32_val(b);
2442 if ( aSign != bSign ) return aSign || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2443 return ( av == bv ) || ( aSign ^ ( av < bv ) );
2447 /*----------------------------------------------------------------------------
2448 | Returns 1 if the single-precision floating-point value `a' is less than
2449 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2450 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
2451 | Standard for Binary Floating-Point Arithmetic.
2452 *----------------------------------------------------------------------------*/
2454 int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2456 flag aSign, bSign;
2457 bits32 av, bv;
2458 a = float32_squash_input_denormal(a STATUS_VAR);
2459 b = float32_squash_input_denormal(b STATUS_VAR);
2461 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2462 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2464 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2465 float_raise( float_flag_invalid STATUS_VAR);
2467 return 0;
2469 aSign = extractFloat32Sign( a );
2470 bSign = extractFloat32Sign( b );
2471 av = float32_val(a);
2472 bv = float32_val(b);
2473 if ( aSign != bSign ) return aSign && ( (bits32) ( ( av | bv )<<1 ) != 0 );
2474 return ( av != bv ) && ( aSign ^ ( av < bv ) );
2478 /*----------------------------------------------------------------------------
2479 | Returns the result of converting the double-precision floating-point value
2480 | `a' to the 32-bit two's complement integer format. The conversion is
2481 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2482 | Arithmetic---which means in particular that the conversion is rounded
2483 | according to the current rounding mode. If `a' is a NaN, the largest
2484 | positive integer is returned. Otherwise, if the conversion overflows, the
2485 | largest integer with the same sign as `a' is returned.
2486 *----------------------------------------------------------------------------*/
2488 int32 float64_to_int32( float64 a STATUS_PARAM )
2490 flag aSign;
2491 int16 aExp, shiftCount;
2492 bits64 aSig;
2493 a = float64_squash_input_denormal(a STATUS_VAR);
2495 aSig = extractFloat64Frac( a );
2496 aExp = extractFloat64Exp( a );
2497 aSign = extractFloat64Sign( a );
2498 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2499 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2500 shiftCount = 0x42C - aExp;
2501 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2502 return roundAndPackInt32( aSign, aSig STATUS_VAR );
2506 /*----------------------------------------------------------------------------
2507 | Returns the result of converting the double-precision floating-point value
2508 | `a' to the 32-bit two's complement integer format. The conversion is
2509 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2510 | Arithmetic, except that the conversion is always rounded toward zero.
2511 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2512 | the conversion overflows, the largest integer with the same sign as `a' is
2513 | returned.
2514 *----------------------------------------------------------------------------*/
2516 int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2518 flag aSign;
2519 int16 aExp, shiftCount;
2520 bits64 aSig, savedASig;
2521 int32 z;
2522 a = float64_squash_input_denormal(a STATUS_VAR);
2524 aSig = extractFloat64Frac( a );
2525 aExp = extractFloat64Exp( a );
2526 aSign = extractFloat64Sign( a );
2527 if ( 0x41E < aExp ) {
2528 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2529 goto invalid;
2531 else if ( aExp < 0x3FF ) {
2532 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2533 return 0;
2535 aSig |= LIT64( 0x0010000000000000 );
2536 shiftCount = 0x433 - aExp;
2537 savedASig = aSig;
2538 aSig >>= shiftCount;
2539 z = aSig;
2540 if ( aSign ) z = - z;
2541 if ( ( z < 0 ) ^ aSign ) {
2542 invalid:
2543 float_raise( float_flag_invalid STATUS_VAR);
2544 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2546 if ( ( aSig<<shiftCount ) != savedASig ) {
2547 STATUS(float_exception_flags) |= float_flag_inexact;
2549 return z;
2553 /*----------------------------------------------------------------------------
2554 | Returns the result of converting the double-precision floating-point value
2555 | `a' to the 16-bit two's complement integer format. The conversion is
2556 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2557 | Arithmetic, except that the conversion is always rounded toward zero.
2558 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2559 | the conversion overflows, the largest integer with the same sign as `a' is
2560 | returned.
2561 *----------------------------------------------------------------------------*/
2563 int16 float64_to_int16_round_to_zero( float64 a STATUS_PARAM )
2565 flag aSign;
2566 int16 aExp, shiftCount;
2567 bits64 aSig, savedASig;
2568 int32 z;
2570 aSig = extractFloat64Frac( a );
2571 aExp = extractFloat64Exp( a );
2572 aSign = extractFloat64Sign( a );
2573 if ( 0x40E < aExp ) {
2574 if ( ( aExp == 0x7FF ) && aSig ) {
2575 aSign = 0;
2577 goto invalid;
2579 else if ( aExp < 0x3FF ) {
2580 if ( aExp || aSig ) {
2581 STATUS(float_exception_flags) |= float_flag_inexact;
2583 return 0;
2585 aSig |= LIT64( 0x0010000000000000 );
2586 shiftCount = 0x433 - aExp;
2587 savedASig = aSig;
2588 aSig >>= shiftCount;
2589 z = aSig;
2590 if ( aSign ) {
2591 z = - z;
2593 if ( ( (int16_t)z < 0 ) ^ aSign ) {
2594 invalid:
2595 float_raise( float_flag_invalid STATUS_VAR);
2596 return aSign ? (sbits32) 0xffff8000 : 0x7FFF;
2598 if ( ( aSig<<shiftCount ) != savedASig ) {
2599 STATUS(float_exception_flags) |= float_flag_inexact;
2601 return z;
2604 /*----------------------------------------------------------------------------
2605 | Returns the result of converting the double-precision floating-point value
2606 | `a' to the 64-bit two's complement integer format. The conversion is
2607 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2608 | Arithmetic---which means in particular that the conversion is rounded
2609 | according to the current rounding mode. If `a' is a NaN, the largest
2610 | positive integer is returned. Otherwise, if the conversion overflows, the
2611 | largest integer with the same sign as `a' is returned.
2612 *----------------------------------------------------------------------------*/
2614 int64 float64_to_int64( float64 a STATUS_PARAM )
2616 flag aSign;
2617 int16 aExp, shiftCount;
2618 bits64 aSig, aSigExtra;
2619 a = float64_squash_input_denormal(a STATUS_VAR);
2621 aSig = extractFloat64Frac( a );
2622 aExp = extractFloat64Exp( a );
2623 aSign = extractFloat64Sign( a );
2624 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2625 shiftCount = 0x433 - aExp;
2626 if ( shiftCount <= 0 ) {
2627 if ( 0x43E < aExp ) {
2628 float_raise( float_flag_invalid STATUS_VAR);
2629 if ( ! aSign
2630 || ( ( aExp == 0x7FF )
2631 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2633 return LIT64( 0x7FFFFFFFFFFFFFFF );
2635 return (sbits64) LIT64( 0x8000000000000000 );
2637 aSigExtra = 0;
2638 aSig <<= - shiftCount;
2640 else {
2641 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2643 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2647 /*----------------------------------------------------------------------------
2648 | Returns the result of converting the double-precision floating-point value
2649 | `a' to the 64-bit two's complement integer format. The conversion is
2650 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2651 | Arithmetic, except that the conversion is always rounded toward zero.
2652 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2653 | the conversion overflows, the largest integer with the same sign as `a' is
2654 | returned.
2655 *----------------------------------------------------------------------------*/
2657 int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2659 flag aSign;
2660 int16 aExp, shiftCount;
2661 bits64 aSig;
2662 int64 z;
2663 a = float64_squash_input_denormal(a STATUS_VAR);
2665 aSig = extractFloat64Frac( a );
2666 aExp = extractFloat64Exp( a );
2667 aSign = extractFloat64Sign( a );
2668 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2669 shiftCount = aExp - 0x433;
2670 if ( 0 <= shiftCount ) {
2671 if ( 0x43E <= aExp ) {
2672 if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
2673 float_raise( float_flag_invalid STATUS_VAR);
2674 if ( ! aSign
2675 || ( ( aExp == 0x7FF )
2676 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2678 return LIT64( 0x7FFFFFFFFFFFFFFF );
2681 return (sbits64) LIT64( 0x8000000000000000 );
2683 z = aSig<<shiftCount;
2685 else {
2686 if ( aExp < 0x3FE ) {
2687 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2688 return 0;
2690 z = aSig>>( - shiftCount );
2691 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2692 STATUS(float_exception_flags) |= float_flag_inexact;
2695 if ( aSign ) z = - z;
2696 return z;
2700 /*----------------------------------------------------------------------------
2701 | Returns the result of converting the double-precision floating-point value
2702 | `a' to the single-precision floating-point format. The conversion is
2703 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2704 | Arithmetic.
2705 *----------------------------------------------------------------------------*/
2707 float32 float64_to_float32( float64 a STATUS_PARAM )
2709 flag aSign;
2710 int16 aExp;
2711 bits64 aSig;
2712 bits32 zSig;
2713 a = float64_squash_input_denormal(a STATUS_VAR);
2715 aSig = extractFloat64Frac( a );
2716 aExp = extractFloat64Exp( a );
2717 aSign = extractFloat64Sign( a );
2718 if ( aExp == 0x7FF ) {
2719 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2720 return packFloat32( aSign, 0xFF, 0 );
2722 shift64RightJamming( aSig, 22, &aSig );
2723 zSig = aSig;
2724 if ( aExp || zSig ) {
2725 zSig |= 0x40000000;
2726 aExp -= 0x381;
2728 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2733 /*----------------------------------------------------------------------------
2734 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2735 | half-precision floating-point value, returning the result. After being
2736 | shifted into the proper positions, the three fields are simply added
2737 | together to form the result. This means that any integer portion of `zSig'
2738 | will be added into the exponent. Since a properly normalized significand
2739 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2740 | than the desired result exponent whenever `zSig' is a complete, normalized
2741 | significand.
2742 *----------------------------------------------------------------------------*/
2743 static float16 packFloat16(flag zSign, int16 zExp, bits16 zSig)
2745 return make_float16(
2746 (((bits32)zSign) << 15) + (((bits32)zExp) << 10) + zSig);
2749 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
2750 The latter gains extra exponent range by omitting the NaN/Inf encodings. */
2752 float32 float16_to_float32(float16 a, flag ieee STATUS_PARAM)
2754 flag aSign;
2755 int16 aExp;
2756 bits32 aSig;
2758 aSign = extractFloat16Sign(a);
2759 aExp = extractFloat16Exp(a);
2760 aSig = extractFloat16Frac(a);
2762 if (aExp == 0x1f && ieee) {
2763 if (aSig) {
2764 return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
2766 return packFloat32(aSign, 0xff, aSig << 13);
2768 if (aExp == 0) {
2769 int8 shiftCount;
2771 if (aSig == 0) {
2772 return packFloat32(aSign, 0, 0);
2775 shiftCount = countLeadingZeros32( aSig ) - 21;
2776 aSig = aSig << shiftCount;
2777 aExp = -shiftCount;
2779 return packFloat32( aSign, aExp + 0x70, aSig << 13);
2782 float16 float32_to_float16(float32 a, flag ieee STATUS_PARAM)
2784 flag aSign;
2785 int16 aExp;
2786 bits32 aSig;
2787 bits32 mask;
2788 bits32 increment;
2789 int8 roundingMode;
2790 a = float32_squash_input_denormal(a STATUS_VAR);
2792 aSig = extractFloat32Frac( a );
2793 aExp = extractFloat32Exp( a );
2794 aSign = extractFloat32Sign( a );
2795 if ( aExp == 0xFF ) {
2796 if (aSig) {
2797 /* Input is a NaN */
2798 float16 r = commonNaNToFloat16( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2799 if (!ieee) {
2800 return packFloat16(aSign, 0, 0);
2802 return r;
2804 /* Infinity */
2805 if (!ieee) {
2806 float_raise(float_flag_invalid STATUS_VAR);
2807 return packFloat16(aSign, 0x1f, 0x3ff);
2809 return packFloat16(aSign, 0x1f, 0);
2811 if (aExp == 0 && aSig == 0) {
2812 return packFloat16(aSign, 0, 0);
2814 /* Decimal point between bits 22 and 23. */
2815 aSig |= 0x00800000;
2816 aExp -= 0x7f;
2817 if (aExp < -14) {
2818 mask = 0x00ffffff;
2819 if (aExp >= -24) {
2820 mask >>= 25 + aExp;
2822 } else {
2823 mask = 0x00001fff;
2825 if (aSig & mask) {
2826 float_raise( float_flag_underflow STATUS_VAR );
2827 roundingMode = STATUS(float_rounding_mode);
2828 switch (roundingMode) {
2829 case float_round_nearest_even:
2830 increment = (mask + 1) >> 1;
2831 if ((aSig & mask) == increment) {
2832 increment = aSig & (increment << 1);
2834 break;
2835 case float_round_up:
2836 increment = aSign ? 0 : mask;
2837 break;
2838 case float_round_down:
2839 increment = aSign ? mask : 0;
2840 break;
2841 default: /* round_to_zero */
2842 increment = 0;
2843 break;
2845 aSig += increment;
2846 if (aSig >= 0x01000000) {
2847 aSig >>= 1;
2848 aExp++;
2850 } else if (aExp < -14
2851 && STATUS(float_detect_tininess) == float_tininess_before_rounding) {
2852 float_raise( float_flag_underflow STATUS_VAR);
2855 if (ieee) {
2856 if (aExp > 15) {
2857 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
2858 return packFloat16(aSign, 0x1f, 0);
2860 } else {
2861 if (aExp > 16) {
2862 float_raise(float_flag_invalid | float_flag_inexact STATUS_VAR);
2863 return packFloat16(aSign, 0x1f, 0x3ff);
2866 if (aExp < -24) {
2867 return packFloat16(aSign, 0, 0);
2869 if (aExp < -14) {
2870 aSig >>= -14 - aExp;
2871 aExp = -14;
2873 return packFloat16(aSign, aExp + 14, aSig >> 13);
2876 #ifdef FLOATX80
2878 /*----------------------------------------------------------------------------
2879 | Returns the result of converting the double-precision floating-point value
2880 | `a' to the extended double-precision floating-point format. The conversion
2881 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2882 | Arithmetic.
2883 *----------------------------------------------------------------------------*/
2885 floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
2887 flag aSign;
2888 int16 aExp;
2889 bits64 aSig;
2891 a = float64_squash_input_denormal(a STATUS_VAR);
2892 aSig = extractFloat64Frac( a );
2893 aExp = extractFloat64Exp( a );
2894 aSign = extractFloat64Sign( a );
2895 if ( aExp == 0x7FF ) {
2896 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2897 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2899 if ( aExp == 0 ) {
2900 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2901 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2903 return
2904 packFloatx80(
2905 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2909 #endif
2911 #ifdef FLOAT128
2913 /*----------------------------------------------------------------------------
2914 | Returns the result of converting the double-precision floating-point value
2915 | `a' to the quadruple-precision floating-point format. The conversion is
2916 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2917 | Arithmetic.
2918 *----------------------------------------------------------------------------*/
2920 float128 float64_to_float128( float64 a STATUS_PARAM )
2922 flag aSign;
2923 int16 aExp;
2924 bits64 aSig, zSig0, zSig1;
2926 a = float64_squash_input_denormal(a STATUS_VAR);
2927 aSig = extractFloat64Frac( a );
2928 aExp = extractFloat64Exp( a );
2929 aSign = extractFloat64Sign( a );
2930 if ( aExp == 0x7FF ) {
2931 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2932 return packFloat128( aSign, 0x7FFF, 0, 0 );
2934 if ( aExp == 0 ) {
2935 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2936 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2937 --aExp;
2939 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2940 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2944 #endif
2946 /*----------------------------------------------------------------------------
2947 | Rounds the double-precision floating-point value `a' to an integer, and
2948 | returns the result as a double-precision floating-point value. The
2949 | operation is performed according to the IEC/IEEE Standard for Binary
2950 | Floating-Point Arithmetic.
2951 *----------------------------------------------------------------------------*/
2953 float64 float64_round_to_int( float64 a STATUS_PARAM )
2955 flag aSign;
2956 int16 aExp;
2957 bits64 lastBitMask, roundBitsMask;
2958 int8 roundingMode;
2959 bits64 z;
2960 a = float64_squash_input_denormal(a STATUS_VAR);
2962 aExp = extractFloat64Exp( a );
2963 if ( 0x433 <= aExp ) {
2964 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2965 return propagateFloat64NaN( a, a STATUS_VAR );
2967 return a;
2969 if ( aExp < 0x3FF ) {
2970 if ( (bits64) ( float64_val(a)<<1 ) == 0 ) return a;
2971 STATUS(float_exception_flags) |= float_flag_inexact;
2972 aSign = extractFloat64Sign( a );
2973 switch ( STATUS(float_rounding_mode) ) {
2974 case float_round_nearest_even:
2975 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2976 return packFloat64( aSign, 0x3FF, 0 );
2978 break;
2979 case float_round_down:
2980 return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
2981 case float_round_up:
2982 return make_float64(
2983 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
2985 return packFloat64( aSign, 0, 0 );
2987 lastBitMask = 1;
2988 lastBitMask <<= 0x433 - aExp;
2989 roundBitsMask = lastBitMask - 1;
2990 z = float64_val(a);
2991 roundingMode = STATUS(float_rounding_mode);
2992 if ( roundingMode == float_round_nearest_even ) {
2993 z += lastBitMask>>1;
2994 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2996 else if ( roundingMode != float_round_to_zero ) {
2997 if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
2998 z += roundBitsMask;
3001 z &= ~ roundBitsMask;
3002 if ( z != float64_val(a) )
3003 STATUS(float_exception_flags) |= float_flag_inexact;
3004 return make_float64(z);
3008 float64 float64_trunc_to_int( float64 a STATUS_PARAM)
3010 int oldmode;
3011 float64 res;
3012 oldmode = STATUS(float_rounding_mode);
3013 STATUS(float_rounding_mode) = float_round_to_zero;
3014 res = float64_round_to_int(a STATUS_VAR);
3015 STATUS(float_rounding_mode) = oldmode;
3016 return res;
3019 /*----------------------------------------------------------------------------
3020 | Returns the result of adding the absolute values of the double-precision
3021 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
3022 | before being returned. `zSign' is ignored if the result is a NaN.
3023 | The addition is performed according to the IEC/IEEE Standard for Binary
3024 | Floating-Point Arithmetic.
3025 *----------------------------------------------------------------------------*/
3027 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3029 int16 aExp, bExp, zExp;
3030 bits64 aSig, bSig, zSig;
3031 int16 expDiff;
3033 aSig = extractFloat64Frac( a );
3034 aExp = extractFloat64Exp( a );
3035 bSig = extractFloat64Frac( b );
3036 bExp = extractFloat64Exp( b );
3037 expDiff = aExp - bExp;
3038 aSig <<= 9;
3039 bSig <<= 9;
3040 if ( 0 < expDiff ) {
3041 if ( aExp == 0x7FF ) {
3042 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3043 return a;
3045 if ( bExp == 0 ) {
3046 --expDiff;
3048 else {
3049 bSig |= LIT64( 0x2000000000000000 );
3051 shift64RightJamming( bSig, expDiff, &bSig );
3052 zExp = aExp;
3054 else if ( expDiff < 0 ) {
3055 if ( bExp == 0x7FF ) {
3056 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3057 return packFloat64( zSign, 0x7FF, 0 );
3059 if ( aExp == 0 ) {
3060 ++expDiff;
3062 else {
3063 aSig |= LIT64( 0x2000000000000000 );
3065 shift64RightJamming( aSig, - expDiff, &aSig );
3066 zExp = bExp;
3068 else {
3069 if ( aExp == 0x7FF ) {
3070 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3071 return a;
3073 if ( aExp == 0 ) {
3074 if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
3075 return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3077 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3078 zExp = aExp;
3079 goto roundAndPack;
3081 aSig |= LIT64( 0x2000000000000000 );
3082 zSig = ( aSig + bSig )<<1;
3083 --zExp;
3084 if ( (sbits64) zSig < 0 ) {
3085 zSig = aSig + bSig;
3086 ++zExp;
3088 roundAndPack:
3089 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3093 /*----------------------------------------------------------------------------
3094 | Returns the result of subtracting the absolute values of the double-
3095 | precision floating-point values `a' and `b'. If `zSign' is 1, the
3096 | difference is negated before being returned. `zSign' is ignored if the
3097 | result is a NaN. The subtraction is performed according to the IEC/IEEE
3098 | Standard for Binary Floating-Point Arithmetic.
3099 *----------------------------------------------------------------------------*/
3101 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3103 int16 aExp, bExp, zExp;
3104 bits64 aSig, bSig, zSig;
3105 int16 expDiff;
3107 aSig = extractFloat64Frac( a );
3108 aExp = extractFloat64Exp( a );
3109 bSig = extractFloat64Frac( b );
3110 bExp = extractFloat64Exp( b );
3111 expDiff = aExp - bExp;
3112 aSig <<= 10;
3113 bSig <<= 10;
3114 if ( 0 < expDiff ) goto aExpBigger;
3115 if ( expDiff < 0 ) goto bExpBigger;
3116 if ( aExp == 0x7FF ) {
3117 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3118 float_raise( float_flag_invalid STATUS_VAR);
3119 return float64_default_nan;
3121 if ( aExp == 0 ) {
3122 aExp = 1;
3123 bExp = 1;
3125 if ( bSig < aSig ) goto aBigger;
3126 if ( aSig < bSig ) goto bBigger;
3127 return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3128 bExpBigger:
3129 if ( bExp == 0x7FF ) {
3130 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3131 return packFloat64( zSign ^ 1, 0x7FF, 0 );
3133 if ( aExp == 0 ) {
3134 ++expDiff;
3136 else {
3137 aSig |= LIT64( 0x4000000000000000 );
3139 shift64RightJamming( aSig, - expDiff, &aSig );
3140 bSig |= LIT64( 0x4000000000000000 );
3141 bBigger:
3142 zSig = bSig - aSig;
3143 zExp = bExp;
3144 zSign ^= 1;
3145 goto normalizeRoundAndPack;
3146 aExpBigger:
3147 if ( aExp == 0x7FF ) {
3148 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3149 return a;
3151 if ( bExp == 0 ) {
3152 --expDiff;
3154 else {
3155 bSig |= LIT64( 0x4000000000000000 );
3157 shift64RightJamming( bSig, expDiff, &bSig );
3158 aSig |= LIT64( 0x4000000000000000 );
3159 aBigger:
3160 zSig = aSig - bSig;
3161 zExp = aExp;
3162 normalizeRoundAndPack:
3163 --zExp;
3164 return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3168 /*----------------------------------------------------------------------------
3169 | Returns the result of adding the double-precision floating-point values `a'
3170 | and `b'. The operation is performed according to the IEC/IEEE Standard for
3171 | Binary Floating-Point Arithmetic.
3172 *----------------------------------------------------------------------------*/
3174 float64 float64_add( float64 a, float64 b STATUS_PARAM )
3176 flag aSign, bSign;
3177 a = float64_squash_input_denormal(a STATUS_VAR);
3178 b = float64_squash_input_denormal(b STATUS_VAR);
3180 aSign = extractFloat64Sign( a );
3181 bSign = extractFloat64Sign( b );
3182 if ( aSign == bSign ) {
3183 return addFloat64Sigs( a, b, aSign STATUS_VAR );
3185 else {
3186 return subFloat64Sigs( a, b, aSign STATUS_VAR );
3191 /*----------------------------------------------------------------------------
3192 | Returns the result of subtracting the double-precision floating-point values
3193 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3194 | for Binary Floating-Point Arithmetic.
3195 *----------------------------------------------------------------------------*/
3197 float64 float64_sub( float64 a, float64 b STATUS_PARAM )
3199 flag aSign, bSign;
3200 a = float64_squash_input_denormal(a STATUS_VAR);
3201 b = float64_squash_input_denormal(b STATUS_VAR);
3203 aSign = extractFloat64Sign( a );
3204 bSign = extractFloat64Sign( b );
3205 if ( aSign == bSign ) {
3206 return subFloat64Sigs( a, b, aSign STATUS_VAR );
3208 else {
3209 return addFloat64Sigs( a, b, aSign STATUS_VAR );
3214 /*----------------------------------------------------------------------------
3215 | Returns the result of multiplying the double-precision floating-point values
3216 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3217 | for Binary Floating-Point Arithmetic.
3218 *----------------------------------------------------------------------------*/
3220 float64 float64_mul( float64 a, float64 b STATUS_PARAM )
3222 flag aSign, bSign, zSign;
3223 int16 aExp, bExp, zExp;
3224 bits64 aSig, bSig, zSig0, zSig1;
3226 a = float64_squash_input_denormal(a STATUS_VAR);
3227 b = float64_squash_input_denormal(b STATUS_VAR);
3229 aSig = extractFloat64Frac( a );
3230 aExp = extractFloat64Exp( a );
3231 aSign = extractFloat64Sign( a );
3232 bSig = extractFloat64Frac( b );
3233 bExp = extractFloat64Exp( b );
3234 bSign = extractFloat64Sign( b );
3235 zSign = aSign ^ bSign;
3236 if ( aExp == 0x7FF ) {
3237 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3238 return propagateFloat64NaN( a, b STATUS_VAR );
3240 if ( ( bExp | bSig ) == 0 ) {
3241 float_raise( float_flag_invalid STATUS_VAR);
3242 return float64_default_nan;
3244 return packFloat64( zSign, 0x7FF, 0 );
3246 if ( bExp == 0x7FF ) {
3247 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3248 if ( ( aExp | aSig ) == 0 ) {
3249 float_raise( float_flag_invalid STATUS_VAR);
3250 return float64_default_nan;
3252 return packFloat64( zSign, 0x7FF, 0 );
3254 if ( aExp == 0 ) {
3255 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3256 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3258 if ( bExp == 0 ) {
3259 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
3260 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3262 zExp = aExp + bExp - 0x3FF;
3263 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3264 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3265 mul64To128( aSig, bSig, &zSig0, &zSig1 );
3266 zSig0 |= ( zSig1 != 0 );
3267 if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
3268 zSig0 <<= 1;
3269 --zExp;
3271 return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
3275 /*----------------------------------------------------------------------------
3276 | Returns the result of dividing the double-precision floating-point value `a'
3277 | by the corresponding value `b'. The operation is performed according to
3278 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3279 *----------------------------------------------------------------------------*/
3281 float64 float64_div( float64 a, float64 b STATUS_PARAM )
3283 flag aSign, bSign, zSign;
3284 int16 aExp, bExp, zExp;
3285 bits64 aSig, bSig, zSig;
3286 bits64 rem0, rem1;
3287 bits64 term0, term1;
3288 a = float64_squash_input_denormal(a STATUS_VAR);
3289 b = float64_squash_input_denormal(b STATUS_VAR);
3291 aSig = extractFloat64Frac( a );
3292 aExp = extractFloat64Exp( a );
3293 aSign = extractFloat64Sign( a );
3294 bSig = extractFloat64Frac( b );
3295 bExp = extractFloat64Exp( b );
3296 bSign = extractFloat64Sign( b );
3297 zSign = aSign ^ bSign;
3298 if ( aExp == 0x7FF ) {
3299 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3300 if ( bExp == 0x7FF ) {
3301 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3302 float_raise( float_flag_invalid STATUS_VAR);
3303 return float64_default_nan;
3305 return packFloat64( zSign, 0x7FF, 0 );
3307 if ( bExp == 0x7FF ) {
3308 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3309 return packFloat64( zSign, 0, 0 );
3311 if ( bExp == 0 ) {
3312 if ( bSig == 0 ) {
3313 if ( ( aExp | aSig ) == 0 ) {
3314 float_raise( float_flag_invalid STATUS_VAR);
3315 return float64_default_nan;
3317 float_raise( float_flag_divbyzero STATUS_VAR);
3318 return packFloat64( zSign, 0x7FF, 0 );
3320 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3322 if ( aExp == 0 ) {
3323 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3324 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3326 zExp = aExp - bExp + 0x3FD;
3327 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3328 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3329 if ( bSig <= ( aSig + aSig ) ) {
3330 aSig >>= 1;
3331 ++zExp;
3333 zSig = estimateDiv128To64( aSig, 0, bSig );
3334 if ( ( zSig & 0x1FF ) <= 2 ) {
3335 mul64To128( bSig, zSig, &term0, &term1 );
3336 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3337 while ( (sbits64) rem0 < 0 ) {
3338 --zSig;
3339 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3341 zSig |= ( rem1 != 0 );
3343 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3347 /*----------------------------------------------------------------------------
3348 | Returns the remainder of the double-precision floating-point value `a'
3349 | with respect to the corresponding value `b'. The operation is performed
3350 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3351 *----------------------------------------------------------------------------*/
3353 float64 float64_rem( float64 a, float64 b STATUS_PARAM )
3355 flag aSign, zSign;
3356 int16 aExp, bExp, expDiff;
3357 bits64 aSig, bSig;
3358 bits64 q, alternateASig;
3359 sbits64 sigMean;
3361 a = float64_squash_input_denormal(a STATUS_VAR);
3362 b = float64_squash_input_denormal(b STATUS_VAR);
3363 aSig = extractFloat64Frac( a );
3364 aExp = extractFloat64Exp( a );
3365 aSign = extractFloat64Sign( a );
3366 bSig = extractFloat64Frac( b );
3367 bExp = extractFloat64Exp( b );
3368 if ( aExp == 0x7FF ) {
3369 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3370 return propagateFloat64NaN( a, b STATUS_VAR );
3372 float_raise( float_flag_invalid STATUS_VAR);
3373 return float64_default_nan;
3375 if ( bExp == 0x7FF ) {
3376 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3377 return a;
3379 if ( bExp == 0 ) {
3380 if ( bSig == 0 ) {
3381 float_raise( float_flag_invalid STATUS_VAR);
3382 return float64_default_nan;
3384 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3386 if ( aExp == 0 ) {
3387 if ( aSig == 0 ) return a;
3388 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3390 expDiff = aExp - bExp;
3391 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3392 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3393 if ( expDiff < 0 ) {
3394 if ( expDiff < -1 ) return a;
3395 aSig >>= 1;
3397 q = ( bSig <= aSig );
3398 if ( q ) aSig -= bSig;
3399 expDiff -= 64;
3400 while ( 0 < expDiff ) {
3401 q = estimateDiv128To64( aSig, 0, bSig );
3402 q = ( 2 < q ) ? q - 2 : 0;
3403 aSig = - ( ( bSig>>2 ) * q );
3404 expDiff -= 62;
3406 expDiff += 64;
3407 if ( 0 < expDiff ) {
3408 q = estimateDiv128To64( aSig, 0, bSig );
3409 q = ( 2 < q ) ? q - 2 : 0;
3410 q >>= 64 - expDiff;
3411 bSig >>= 2;
3412 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3414 else {
3415 aSig >>= 2;
3416 bSig >>= 2;
3418 do {
3419 alternateASig = aSig;
3420 ++q;
3421 aSig -= bSig;
3422 } while ( 0 <= (sbits64) aSig );
3423 sigMean = aSig + alternateASig;
3424 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3425 aSig = alternateASig;
3427 zSign = ( (sbits64) aSig < 0 );
3428 if ( zSign ) aSig = - aSig;
3429 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
3433 /*----------------------------------------------------------------------------
3434 | Returns the square root of the double-precision floating-point value `a'.
3435 | The operation is performed according to the IEC/IEEE Standard for Binary
3436 | Floating-Point Arithmetic.
3437 *----------------------------------------------------------------------------*/
3439 float64 float64_sqrt( float64 a STATUS_PARAM )
3441 flag aSign;
3442 int16 aExp, zExp;
3443 bits64 aSig, zSig, doubleZSig;
3444 bits64 rem0, rem1, term0, term1;
3445 a = float64_squash_input_denormal(a STATUS_VAR);
3447 aSig = extractFloat64Frac( a );
3448 aExp = extractFloat64Exp( a );
3449 aSign = extractFloat64Sign( a );
3450 if ( aExp == 0x7FF ) {
3451 if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
3452 if ( ! aSign ) return a;
3453 float_raise( float_flag_invalid STATUS_VAR);
3454 return float64_default_nan;
3456 if ( aSign ) {
3457 if ( ( aExp | aSig ) == 0 ) return a;
3458 float_raise( float_flag_invalid STATUS_VAR);
3459 return float64_default_nan;
3461 if ( aExp == 0 ) {
3462 if ( aSig == 0 ) return float64_zero;
3463 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3465 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3466 aSig |= LIT64( 0x0010000000000000 );
3467 zSig = estimateSqrt32( aExp, aSig>>21 );
3468 aSig <<= 9 - ( aExp & 1 );
3469 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3470 if ( ( zSig & 0x1FF ) <= 5 ) {
3471 doubleZSig = zSig<<1;
3472 mul64To128( zSig, zSig, &term0, &term1 );
3473 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3474 while ( (sbits64) rem0 < 0 ) {
3475 --zSig;
3476 doubleZSig -= 2;
3477 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3479 zSig |= ( ( rem0 | rem1 ) != 0 );
3481 return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
3485 /*----------------------------------------------------------------------------
3486 | Returns the binary log of the double-precision floating-point value `a'.
3487 | The operation is performed according to the IEC/IEEE Standard for Binary
3488 | Floating-Point Arithmetic.
3489 *----------------------------------------------------------------------------*/
3490 float64 float64_log2( float64 a STATUS_PARAM )
3492 flag aSign, zSign;
3493 int16 aExp;
3494 bits64 aSig, aSig0, aSig1, zSig, i;
3495 a = float64_squash_input_denormal(a STATUS_VAR);
3497 aSig = extractFloat64Frac( a );
3498 aExp = extractFloat64Exp( a );
3499 aSign = extractFloat64Sign( a );
3501 if ( aExp == 0 ) {
3502 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
3503 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3505 if ( aSign ) {
3506 float_raise( float_flag_invalid STATUS_VAR);
3507 return float64_default_nan;
3509 if ( aExp == 0x7FF ) {
3510 if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
3511 return a;
3514 aExp -= 0x3FF;
3515 aSig |= LIT64( 0x0010000000000000 );
3516 zSign = aExp < 0;
3517 zSig = (bits64)aExp << 52;
3518 for (i = 1LL << 51; i > 0; i >>= 1) {
3519 mul64To128( aSig, aSig, &aSig0, &aSig1 );
3520 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
3521 if ( aSig & LIT64( 0x0020000000000000 ) ) {
3522 aSig >>= 1;
3523 zSig |= i;
3527 if ( zSign )
3528 zSig = -zSig;
3529 return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
3532 /*----------------------------------------------------------------------------
3533 | Returns 1 if the double-precision floating-point value `a' is equal to the
3534 | corresponding value `b', and 0 otherwise. The comparison is performed
3535 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3536 *----------------------------------------------------------------------------*/
3538 int float64_eq( float64 a, float64 b STATUS_PARAM )
3540 bits64 av, bv;
3541 a = float64_squash_input_denormal(a STATUS_VAR);
3542 b = float64_squash_input_denormal(b STATUS_VAR);
3544 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3545 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3547 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3548 float_raise( float_flag_invalid STATUS_VAR);
3550 return 0;
3552 av = float64_val(a);
3553 bv = float64_val(b);
3554 return ( av == bv ) || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3558 /*----------------------------------------------------------------------------
3559 | Returns 1 if the double-precision floating-point value `a' is less than or
3560 | equal to the corresponding value `b', and 0 otherwise. The comparison is
3561 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3562 | Arithmetic.
3563 *----------------------------------------------------------------------------*/
3565 int float64_le( float64 a, float64 b STATUS_PARAM )
3567 flag aSign, bSign;
3568 bits64 av, bv;
3569 a = float64_squash_input_denormal(a STATUS_VAR);
3570 b = float64_squash_input_denormal(b STATUS_VAR);
3572 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3573 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3575 float_raise( float_flag_invalid STATUS_VAR);
3576 return 0;
3578 aSign = extractFloat64Sign( a );
3579 bSign = extractFloat64Sign( b );
3580 av = float64_val(a);
3581 bv = float64_val(b);
3582 if ( aSign != bSign ) return aSign || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3583 return ( av == bv ) || ( aSign ^ ( av < bv ) );
3587 /*----------------------------------------------------------------------------
3588 | Returns 1 if the double-precision floating-point value `a' is less than
3589 | the corresponding value `b', and 0 otherwise. The comparison is performed
3590 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3591 *----------------------------------------------------------------------------*/
3593 int float64_lt( float64 a, float64 b STATUS_PARAM )
3595 flag aSign, bSign;
3596 bits64 av, bv;
3598 a = float64_squash_input_denormal(a STATUS_VAR);
3599 b = float64_squash_input_denormal(b STATUS_VAR);
3600 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3601 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3603 float_raise( float_flag_invalid STATUS_VAR);
3604 return 0;
3606 aSign = extractFloat64Sign( a );
3607 bSign = extractFloat64Sign( b );
3608 av = float64_val(a);
3609 bv = float64_val(b);
3610 if ( aSign != bSign ) return aSign && ( (bits64) ( ( av | bv )<<1 ) != 0 );
3611 return ( av != bv ) && ( aSign ^ ( av < bv ) );
3615 /*----------------------------------------------------------------------------
3616 | Returns 1 if the double-precision floating-point value `a' is equal to the
3617 | corresponding value `b', and 0 otherwise. The invalid exception is raised
3618 | if either operand is a NaN. Otherwise, the comparison is performed
3619 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3620 *----------------------------------------------------------------------------*/
3622 int float64_eq_signaling( float64 a, float64 b STATUS_PARAM )
3624 bits64 av, bv;
3625 a = float64_squash_input_denormal(a STATUS_VAR);
3626 b = float64_squash_input_denormal(b STATUS_VAR);
3628 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3629 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3631 float_raise( float_flag_invalid STATUS_VAR);
3632 return 0;
3634 av = float64_val(a);
3635 bv = float64_val(b);
3636 return ( av == bv ) || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3640 /*----------------------------------------------------------------------------
3641 | Returns 1 if the double-precision floating-point value `a' is less than or
3642 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3643 | cause an exception. Otherwise, the comparison is performed according to the
3644 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3645 *----------------------------------------------------------------------------*/
3647 int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
3649 flag aSign, bSign;
3650 bits64 av, bv;
3651 a = float64_squash_input_denormal(a STATUS_VAR);
3652 b = float64_squash_input_denormal(b STATUS_VAR);
3654 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3655 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3657 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3658 float_raise( float_flag_invalid STATUS_VAR);
3660 return 0;
3662 aSign = extractFloat64Sign( a );
3663 bSign = extractFloat64Sign( b );
3664 av = float64_val(a);
3665 bv = float64_val(b);
3666 if ( aSign != bSign ) return aSign || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3667 return ( av == bv ) || ( aSign ^ ( av < bv ) );
3671 /*----------------------------------------------------------------------------
3672 | Returns 1 if the double-precision floating-point value `a' is less than
3673 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3674 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
3675 | Standard for Binary Floating-Point Arithmetic.
3676 *----------------------------------------------------------------------------*/
3678 int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
3680 flag aSign, bSign;
3681 bits64 av, bv;
3682 a = float64_squash_input_denormal(a STATUS_VAR);
3683 b = float64_squash_input_denormal(b STATUS_VAR);
3685 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3686 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3688 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3689 float_raise( float_flag_invalid STATUS_VAR);
3691 return 0;
3693 aSign = extractFloat64Sign( a );
3694 bSign = extractFloat64Sign( b );
3695 av = float64_val(a);
3696 bv = float64_val(b);
3697 if ( aSign != bSign ) return aSign && ( (bits64) ( ( av | bv )<<1 ) != 0 );
3698 return ( av != bv ) && ( aSign ^ ( av < bv ) );
3702 #ifdef FLOATX80
3704 /*----------------------------------------------------------------------------
3705 | Returns the result of converting the extended double-precision floating-
3706 | point value `a' to the 32-bit two's complement integer format. The
3707 | conversion is performed according to the IEC/IEEE Standard for Binary
3708 | Floating-Point Arithmetic---which means in particular that the conversion
3709 | is rounded according to the current rounding mode. If `a' is a NaN, the
3710 | largest positive integer is returned. Otherwise, if the conversion
3711 | overflows, the largest integer with the same sign as `a' is returned.
3712 *----------------------------------------------------------------------------*/
3714 int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
3716 flag aSign;
3717 int32 aExp, shiftCount;
3718 bits64 aSig;
3720 aSig = extractFloatx80Frac( a );
3721 aExp = extractFloatx80Exp( a );
3722 aSign = extractFloatx80Sign( a );
3723 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3724 shiftCount = 0x4037 - aExp;
3725 if ( shiftCount <= 0 ) shiftCount = 1;
3726 shift64RightJamming( aSig, shiftCount, &aSig );
3727 return roundAndPackInt32( aSign, aSig STATUS_VAR );
3731 /*----------------------------------------------------------------------------
3732 | Returns the result of converting the extended double-precision floating-
3733 | point value `a' to the 32-bit two's complement integer format. The
3734 | conversion is performed according to the IEC/IEEE Standard for Binary
3735 | Floating-Point Arithmetic, except that the conversion is always rounded
3736 | toward zero. If `a' is a NaN, the largest positive integer is returned.
3737 | Otherwise, if the conversion overflows, the largest integer with the same
3738 | sign as `a' is returned.
3739 *----------------------------------------------------------------------------*/
3741 int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
3743 flag aSign;
3744 int32 aExp, shiftCount;
3745 bits64 aSig, savedASig;
3746 int32 z;
3748 aSig = extractFloatx80Frac( a );
3749 aExp = extractFloatx80Exp( a );
3750 aSign = extractFloatx80Sign( a );
3751 if ( 0x401E < aExp ) {
3752 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3753 goto invalid;
3755 else if ( aExp < 0x3FFF ) {
3756 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3757 return 0;
3759 shiftCount = 0x403E - aExp;
3760 savedASig = aSig;
3761 aSig >>= shiftCount;
3762 z = aSig;
3763 if ( aSign ) z = - z;
3764 if ( ( z < 0 ) ^ aSign ) {
3765 invalid:
3766 float_raise( float_flag_invalid STATUS_VAR);
3767 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3769 if ( ( aSig<<shiftCount ) != savedASig ) {
3770 STATUS(float_exception_flags) |= float_flag_inexact;
3772 return z;
3776 /*----------------------------------------------------------------------------
3777 | Returns the result of converting the extended double-precision floating-
3778 | point value `a' to the 64-bit two's complement integer format. The
3779 | conversion is performed according to the IEC/IEEE Standard for Binary
3780 | Floating-Point Arithmetic---which means in particular that the conversion
3781 | is rounded according to the current rounding mode. If `a' is a NaN,
3782 | the largest positive integer is returned. Otherwise, if the conversion
3783 | overflows, the largest integer with the same sign as `a' is returned.
3784 *----------------------------------------------------------------------------*/
3786 int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
3788 flag aSign;
3789 int32 aExp, shiftCount;
3790 bits64 aSig, aSigExtra;
3792 aSig = extractFloatx80Frac( a );
3793 aExp = extractFloatx80Exp( a );
3794 aSign = extractFloatx80Sign( a );
3795 shiftCount = 0x403E - aExp;
3796 if ( shiftCount <= 0 ) {
3797 if ( shiftCount ) {
3798 float_raise( float_flag_invalid STATUS_VAR);
3799 if ( ! aSign
3800 || ( ( aExp == 0x7FFF )
3801 && ( aSig != LIT64( 0x8000000000000000 ) ) )
3803 return LIT64( 0x7FFFFFFFFFFFFFFF );
3805 return (sbits64) LIT64( 0x8000000000000000 );
3807 aSigExtra = 0;
3809 else {
3810 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3812 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3816 /*----------------------------------------------------------------------------
3817 | Returns the result of converting the extended double-precision floating-
3818 | point value `a' to the 64-bit two's complement integer format. The
3819 | conversion is performed according to the IEC/IEEE Standard for Binary
3820 | Floating-Point Arithmetic, except that the conversion is always rounded
3821 | toward zero. If `a' is a NaN, the largest positive integer is returned.
3822 | Otherwise, if the conversion overflows, the largest integer with the same
3823 | sign as `a' is returned.
3824 *----------------------------------------------------------------------------*/
3826 int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
3828 flag aSign;
3829 int32 aExp, shiftCount;
3830 bits64 aSig;
3831 int64 z;
3833 aSig = extractFloatx80Frac( a );
3834 aExp = extractFloatx80Exp( a );
3835 aSign = extractFloatx80Sign( a );
3836 shiftCount = aExp - 0x403E;
3837 if ( 0 <= shiftCount ) {
3838 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3839 if ( ( a.high != 0xC03E ) || aSig ) {
3840 float_raise( float_flag_invalid STATUS_VAR);
3841 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3842 return LIT64( 0x7FFFFFFFFFFFFFFF );
3845 return (sbits64) LIT64( 0x8000000000000000 );
3847 else if ( aExp < 0x3FFF ) {
3848 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3849 return 0;
3851 z = aSig>>( - shiftCount );
3852 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3853 STATUS(float_exception_flags) |= float_flag_inexact;
3855 if ( aSign ) z = - z;
3856 return z;
3860 /*----------------------------------------------------------------------------
3861 | Returns the result of converting the extended double-precision floating-
3862 | point value `a' to the single-precision floating-point format. The
3863 | conversion is performed according to the IEC/IEEE Standard for Binary
3864 | Floating-Point Arithmetic.
3865 *----------------------------------------------------------------------------*/
3867 float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
3869 flag aSign;
3870 int32 aExp;
3871 bits64 aSig;
3873 aSig = extractFloatx80Frac( a );
3874 aExp = extractFloatx80Exp( a );
3875 aSign = extractFloatx80Sign( a );
3876 if ( aExp == 0x7FFF ) {
3877 if ( (bits64) ( aSig<<1 ) ) {
3878 return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3880 return packFloat32( aSign, 0xFF, 0 );
3882 shift64RightJamming( aSig, 33, &aSig );
3883 if ( aExp || aSig ) aExp -= 0x3F81;
3884 return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
3888 /*----------------------------------------------------------------------------
3889 | Returns the result of converting the extended double-precision floating-
3890 | point value `a' to the double-precision floating-point format. The
3891 | conversion is performed according to the IEC/IEEE Standard for Binary
3892 | Floating-Point Arithmetic.
3893 *----------------------------------------------------------------------------*/
3895 float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
3897 flag aSign;
3898 int32 aExp;
3899 bits64 aSig, zSig;
3901 aSig = extractFloatx80Frac( a );
3902 aExp = extractFloatx80Exp( a );
3903 aSign = extractFloatx80Sign( a );
3904 if ( aExp == 0x7FFF ) {
3905 if ( (bits64) ( aSig<<1 ) ) {
3906 return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3908 return packFloat64( aSign, 0x7FF, 0 );
3910 shift64RightJamming( aSig, 1, &zSig );
3911 if ( aExp || aSig ) aExp -= 0x3C01;
3912 return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
3916 #ifdef FLOAT128
3918 /*----------------------------------------------------------------------------
3919 | Returns the result of converting the extended double-precision floating-
3920 | point value `a' to the quadruple-precision floating-point format. The
3921 | conversion is performed according to the IEC/IEEE Standard for Binary
3922 | Floating-Point Arithmetic.
3923 *----------------------------------------------------------------------------*/
3925 float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
3927 flag aSign;
3928 int16 aExp;
3929 bits64 aSig, zSig0, zSig1;
3931 aSig = extractFloatx80Frac( a );
3932 aExp = extractFloatx80Exp( a );
3933 aSign = extractFloatx80Sign( a );
3934 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3935 return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3937 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3938 return packFloat128( aSign, aExp, zSig0, zSig1 );
3942 #endif
3944 /*----------------------------------------------------------------------------
3945 | Rounds the extended double-precision floating-point value `a' to an integer,
3946 | and returns the result as an extended quadruple-precision floating-point
3947 | value. The operation is performed according to the IEC/IEEE Standard for
3948 | Binary Floating-Point Arithmetic.
3949 *----------------------------------------------------------------------------*/
3951 floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
3953 flag aSign;
3954 int32 aExp;
3955 bits64 lastBitMask, roundBitsMask;
3956 int8 roundingMode;
3957 floatx80 z;
3959 aExp = extractFloatx80Exp( a );
3960 if ( 0x403E <= aExp ) {
3961 if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3962 return propagateFloatx80NaN( a, a STATUS_VAR );
3964 return a;
3966 if ( aExp < 0x3FFF ) {
3967 if ( ( aExp == 0 )
3968 && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3969 return a;
3971 STATUS(float_exception_flags) |= float_flag_inexact;
3972 aSign = extractFloatx80Sign( a );
3973 switch ( STATUS(float_rounding_mode) ) {
3974 case float_round_nearest_even:
3975 if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3977 return
3978 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3980 break;
3981 case float_round_down:
3982 return
3983 aSign ?
3984 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3985 : packFloatx80( 0, 0, 0 );
3986 case float_round_up:
3987 return
3988 aSign ? packFloatx80( 1, 0, 0 )
3989 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3991 return packFloatx80( aSign, 0, 0 );
3993 lastBitMask = 1;
3994 lastBitMask <<= 0x403E - aExp;
3995 roundBitsMask = lastBitMask - 1;
3996 z = a;
3997 roundingMode = STATUS(float_rounding_mode);
3998 if ( roundingMode == float_round_nearest_even ) {
3999 z.low += lastBitMask>>1;
4000 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4002 else if ( roundingMode != float_round_to_zero ) {
4003 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
4004 z.low += roundBitsMask;
4007 z.low &= ~ roundBitsMask;
4008 if ( z.low == 0 ) {
4009 ++z.high;
4010 z.low = LIT64( 0x8000000000000000 );
4012 if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
4013 return z;
4017 /*----------------------------------------------------------------------------
4018 | Returns the result of adding the absolute values of the extended double-
4019 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
4020 | negated before being returned. `zSign' is ignored if the result is a NaN.
4021 | The addition is performed according to the IEC/IEEE Standard for Binary
4022 | Floating-Point Arithmetic.
4023 *----------------------------------------------------------------------------*/
4025 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
4027 int32 aExp, bExp, zExp;
4028 bits64 aSig, bSig, zSig0, zSig1;
4029 int32 expDiff;
4031 aSig = extractFloatx80Frac( a );
4032 aExp = extractFloatx80Exp( a );
4033 bSig = extractFloatx80Frac( b );
4034 bExp = extractFloatx80Exp( b );
4035 expDiff = aExp - bExp;
4036 if ( 0 < expDiff ) {
4037 if ( aExp == 0x7FFF ) {
4038 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4039 return a;
4041 if ( bExp == 0 ) --expDiff;
4042 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4043 zExp = aExp;
4045 else if ( expDiff < 0 ) {
4046 if ( bExp == 0x7FFF ) {
4047 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4048 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4050 if ( aExp == 0 ) ++expDiff;
4051 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4052 zExp = bExp;
4054 else {
4055 if ( aExp == 0x7FFF ) {
4056 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
4057 return propagateFloatx80NaN( a, b STATUS_VAR );
4059 return a;
4061 zSig1 = 0;
4062 zSig0 = aSig + bSig;
4063 if ( aExp == 0 ) {
4064 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4065 goto roundAndPack;
4067 zExp = aExp;
4068 goto shiftRight1;
4070 zSig0 = aSig + bSig;
4071 if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
4072 shiftRight1:
4073 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4074 zSig0 |= LIT64( 0x8000000000000000 );
4075 ++zExp;
4076 roundAndPack:
4077 return
4078 roundAndPackFloatx80(
4079 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4083 /*----------------------------------------------------------------------------
4084 | Returns the result of subtracting the absolute values of the extended
4085 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
4086 | difference is negated before being returned. `zSign' is ignored if the
4087 | result is a NaN. The subtraction is performed according to the IEC/IEEE
4088 | Standard for Binary Floating-Point Arithmetic.
4089 *----------------------------------------------------------------------------*/
4091 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
4093 int32 aExp, bExp, zExp;
4094 bits64 aSig, bSig, zSig0, zSig1;
4095 int32 expDiff;
4096 floatx80 z;
4098 aSig = extractFloatx80Frac( a );
4099 aExp = extractFloatx80Exp( a );
4100 bSig = extractFloatx80Frac( b );
4101 bExp = extractFloatx80Exp( b );
4102 expDiff = aExp - bExp;
4103 if ( 0 < expDiff ) goto aExpBigger;
4104 if ( expDiff < 0 ) goto bExpBigger;
4105 if ( aExp == 0x7FFF ) {
4106 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
4107 return propagateFloatx80NaN( a, b STATUS_VAR );
4109 float_raise( float_flag_invalid STATUS_VAR);
4110 z.low = floatx80_default_nan_low;
4111 z.high = floatx80_default_nan_high;
4112 return z;
4114 if ( aExp == 0 ) {
4115 aExp = 1;
4116 bExp = 1;
4118 zSig1 = 0;
4119 if ( bSig < aSig ) goto aBigger;
4120 if ( aSig < bSig ) goto bBigger;
4121 return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
4122 bExpBigger:
4123 if ( bExp == 0x7FFF ) {
4124 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4125 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
4127 if ( aExp == 0 ) ++expDiff;
4128 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4129 bBigger:
4130 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4131 zExp = bExp;
4132 zSign ^= 1;
4133 goto normalizeRoundAndPack;
4134 aExpBigger:
4135 if ( aExp == 0x7FFF ) {
4136 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4137 return a;
4139 if ( bExp == 0 ) --expDiff;
4140 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4141 aBigger:
4142 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4143 zExp = aExp;
4144 normalizeRoundAndPack:
4145 return
4146 normalizeRoundAndPackFloatx80(
4147 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4151 /*----------------------------------------------------------------------------
4152 | Returns the result of adding the extended double-precision floating-point
4153 | values `a' and `b'. The operation is performed according to the IEC/IEEE
4154 | Standard for Binary Floating-Point Arithmetic.
4155 *----------------------------------------------------------------------------*/
4157 floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
4159 flag aSign, bSign;
4161 aSign = extractFloatx80Sign( a );
4162 bSign = extractFloatx80Sign( b );
4163 if ( aSign == bSign ) {
4164 return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4166 else {
4167 return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4172 /*----------------------------------------------------------------------------
4173 | Returns the result of subtracting the extended double-precision floating-
4174 | point values `a' and `b'. The operation is performed according to the
4175 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4176 *----------------------------------------------------------------------------*/
4178 floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
4180 flag aSign, bSign;
4182 aSign = extractFloatx80Sign( a );
4183 bSign = extractFloatx80Sign( b );
4184 if ( aSign == bSign ) {
4185 return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4187 else {
4188 return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4193 /*----------------------------------------------------------------------------
4194 | Returns the result of multiplying the extended double-precision floating-
4195 | point values `a' and `b'. The operation is performed according to the
4196 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4197 *----------------------------------------------------------------------------*/
4199 floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
4201 flag aSign, bSign, zSign;
4202 int32 aExp, bExp, zExp;
4203 bits64 aSig, bSig, zSig0, zSig1;
4204 floatx80 z;
4206 aSig = extractFloatx80Frac( a );
4207 aExp = extractFloatx80Exp( a );
4208 aSign = extractFloatx80Sign( a );
4209 bSig = extractFloatx80Frac( b );
4210 bExp = extractFloatx80Exp( b );
4211 bSign = extractFloatx80Sign( b );
4212 zSign = aSign ^ bSign;
4213 if ( aExp == 0x7FFF ) {
4214 if ( (bits64) ( aSig<<1 )
4215 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
4216 return propagateFloatx80NaN( a, b STATUS_VAR );
4218 if ( ( bExp | bSig ) == 0 ) goto invalid;
4219 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4221 if ( bExp == 0x7FFF ) {
4222 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4223 if ( ( aExp | aSig ) == 0 ) {
4224 invalid:
4225 float_raise( float_flag_invalid STATUS_VAR);
4226 z.low = floatx80_default_nan_low;
4227 z.high = floatx80_default_nan_high;
4228 return z;
4230 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4232 if ( aExp == 0 ) {
4233 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4234 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4236 if ( bExp == 0 ) {
4237 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
4238 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4240 zExp = aExp + bExp - 0x3FFE;
4241 mul64To128( aSig, bSig, &zSig0, &zSig1 );
4242 if ( 0 < (sbits64) zSig0 ) {
4243 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
4244 --zExp;
4246 return
4247 roundAndPackFloatx80(
4248 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4252 /*----------------------------------------------------------------------------
4253 | Returns the result of dividing the extended double-precision floating-point
4254 | value `a' by the corresponding value `b'. The operation is performed
4255 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4256 *----------------------------------------------------------------------------*/
4258 floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
4260 flag aSign, bSign, zSign;
4261 int32 aExp, bExp, zExp;
4262 bits64 aSig, bSig, zSig0, zSig1;
4263 bits64 rem0, rem1, rem2, term0, term1, term2;
4264 floatx80 z;
4266 aSig = extractFloatx80Frac( a );
4267 aExp = extractFloatx80Exp( a );
4268 aSign = extractFloatx80Sign( a );
4269 bSig = extractFloatx80Frac( b );
4270 bExp = extractFloatx80Exp( b );
4271 bSign = extractFloatx80Sign( b );
4272 zSign = aSign ^ bSign;
4273 if ( aExp == 0x7FFF ) {
4274 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4275 if ( bExp == 0x7FFF ) {
4276 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4277 goto invalid;
4279 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4281 if ( bExp == 0x7FFF ) {
4282 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4283 return packFloatx80( zSign, 0, 0 );
4285 if ( bExp == 0 ) {
4286 if ( bSig == 0 ) {
4287 if ( ( aExp | aSig ) == 0 ) {
4288 invalid:
4289 float_raise( float_flag_invalid STATUS_VAR);
4290 z.low = floatx80_default_nan_low;
4291 z.high = floatx80_default_nan_high;
4292 return z;
4294 float_raise( float_flag_divbyzero STATUS_VAR);
4295 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4297 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4299 if ( aExp == 0 ) {
4300 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4301 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4303 zExp = aExp - bExp + 0x3FFE;
4304 rem1 = 0;
4305 if ( bSig <= aSig ) {
4306 shift128Right( aSig, 0, 1, &aSig, &rem1 );
4307 ++zExp;
4309 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
4310 mul64To128( bSig, zSig0, &term0, &term1 );
4311 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
4312 while ( (sbits64) rem0 < 0 ) {
4313 --zSig0;
4314 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4316 zSig1 = estimateDiv128To64( rem1, 0, bSig );
4317 if ( (bits64) ( zSig1<<1 ) <= 8 ) {
4318 mul64To128( bSig, zSig1, &term1, &term2 );
4319 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4320 while ( (sbits64) rem1 < 0 ) {
4321 --zSig1;
4322 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
4324 zSig1 |= ( ( rem1 | rem2 ) != 0 );
4326 return
4327 roundAndPackFloatx80(
4328 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4332 /*----------------------------------------------------------------------------
4333 | Returns the remainder of the extended double-precision floating-point value
4334 | `a' with respect to the corresponding value `b'. The operation is performed
4335 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4336 *----------------------------------------------------------------------------*/
4338 floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
4340 flag aSign, zSign;
4341 int32 aExp, bExp, expDiff;
4342 bits64 aSig0, aSig1, bSig;
4343 bits64 q, term0, term1, alternateASig0, alternateASig1;
4344 floatx80 z;
4346 aSig0 = extractFloatx80Frac( a );
4347 aExp = extractFloatx80Exp( a );
4348 aSign = extractFloatx80Sign( a );
4349 bSig = extractFloatx80Frac( b );
4350 bExp = extractFloatx80Exp( b );
4351 if ( aExp == 0x7FFF ) {
4352 if ( (bits64) ( aSig0<<1 )
4353 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
4354 return propagateFloatx80NaN( a, b STATUS_VAR );
4356 goto invalid;
4358 if ( bExp == 0x7FFF ) {
4359 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4360 return a;
4362 if ( bExp == 0 ) {
4363 if ( bSig == 0 ) {
4364 invalid:
4365 float_raise( float_flag_invalid STATUS_VAR);
4366 z.low = floatx80_default_nan_low;
4367 z.high = floatx80_default_nan_high;
4368 return z;
4370 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4372 if ( aExp == 0 ) {
4373 if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
4374 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4376 bSig |= LIT64( 0x8000000000000000 );
4377 zSign = aSign;
4378 expDiff = aExp - bExp;
4379 aSig1 = 0;
4380 if ( expDiff < 0 ) {
4381 if ( expDiff < -1 ) return a;
4382 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
4383 expDiff = 0;
4385 q = ( bSig <= aSig0 );
4386 if ( q ) aSig0 -= bSig;
4387 expDiff -= 64;
4388 while ( 0 < expDiff ) {
4389 q = estimateDiv128To64( aSig0, aSig1, bSig );
4390 q = ( 2 < q ) ? q - 2 : 0;
4391 mul64To128( bSig, q, &term0, &term1 );
4392 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4393 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
4394 expDiff -= 62;
4396 expDiff += 64;
4397 if ( 0 < expDiff ) {
4398 q = estimateDiv128To64( aSig0, aSig1, bSig );
4399 q = ( 2 < q ) ? q - 2 : 0;
4400 q >>= 64 - expDiff;
4401 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
4402 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4403 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
4404 while ( le128( term0, term1, aSig0, aSig1 ) ) {
4405 ++q;
4406 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4409 else {
4410 term1 = 0;
4411 term0 = bSig;
4413 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4414 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4415 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4416 && ( q & 1 ) )
4418 aSig0 = alternateASig0;
4419 aSig1 = alternateASig1;
4420 zSign = ! zSign;
4422 return
4423 normalizeRoundAndPackFloatx80(
4424 80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
4428 /*----------------------------------------------------------------------------
4429 | Returns the square root of the extended double-precision floating-point
4430 | value `a'. The operation is performed according to the IEC/IEEE Standard
4431 | for Binary Floating-Point Arithmetic.
4432 *----------------------------------------------------------------------------*/
4434 floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
4436 flag aSign;
4437 int32 aExp, zExp;
4438 bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4439 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4440 floatx80 z;
4442 aSig0 = extractFloatx80Frac( a );
4443 aExp = extractFloatx80Exp( a );
4444 aSign = extractFloatx80Sign( a );
4445 if ( aExp == 0x7FFF ) {
4446 if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
4447 if ( ! aSign ) return a;
4448 goto invalid;
4450 if ( aSign ) {
4451 if ( ( aExp | aSig0 ) == 0 ) return a;
4452 invalid:
4453 float_raise( float_flag_invalid STATUS_VAR);
4454 z.low = floatx80_default_nan_low;
4455 z.high = floatx80_default_nan_high;
4456 return z;
4458 if ( aExp == 0 ) {
4459 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4460 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4462 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4463 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4464 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4465 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4466 doubleZSig0 = zSig0<<1;
4467 mul64To128( zSig0, zSig0, &term0, &term1 );
4468 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4469 while ( (sbits64) rem0 < 0 ) {
4470 --zSig0;
4471 doubleZSig0 -= 2;
4472 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4474 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4475 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4476 if ( zSig1 == 0 ) zSig1 = 1;
4477 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4478 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4479 mul64To128( zSig1, zSig1, &term2, &term3 );
4480 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4481 while ( (sbits64) rem1 < 0 ) {
4482 --zSig1;
4483 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4484 term3 |= 1;
4485 term2 |= doubleZSig0;
4486 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4488 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4490 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4491 zSig0 |= doubleZSig0;
4492 return
4493 roundAndPackFloatx80(
4494 STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
4498 /*----------------------------------------------------------------------------
4499 | Returns 1 if the extended double-precision floating-point value `a' is
4500 | equal to the corresponding value `b', and 0 otherwise. The comparison is
4501 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4502 | Arithmetic.
4503 *----------------------------------------------------------------------------*/
4505 int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
4508 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4509 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4510 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4511 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4513 if ( floatx80_is_signaling_nan( a )
4514 || floatx80_is_signaling_nan( b ) ) {
4515 float_raise( float_flag_invalid STATUS_VAR);
4517 return 0;
4519 return
4520 ( a.low == b.low )
4521 && ( ( a.high == b.high )
4522 || ( ( a.low == 0 )
4523 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4528 /*----------------------------------------------------------------------------
4529 | Returns 1 if the extended double-precision floating-point value `a' is
4530 | less than or equal to the corresponding value `b', and 0 otherwise. The
4531 | comparison is performed according to the IEC/IEEE Standard for Binary
4532 | Floating-Point Arithmetic.
4533 *----------------------------------------------------------------------------*/
4535 int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
4537 flag aSign, bSign;
4539 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4540 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4541 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4542 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4544 float_raise( float_flag_invalid STATUS_VAR);
4545 return 0;
4547 aSign = extractFloatx80Sign( a );
4548 bSign = extractFloatx80Sign( b );
4549 if ( aSign != bSign ) {
4550 return
4551 aSign
4552 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4553 == 0 );
4555 return
4556 aSign ? le128( b.high, b.low, a.high, a.low )
4557 : le128( a.high, a.low, b.high, b.low );
4561 /*----------------------------------------------------------------------------
4562 | Returns 1 if the extended double-precision floating-point value `a' is
4563 | less than the corresponding value `b', and 0 otherwise. The comparison
4564 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4565 | Arithmetic.
4566 *----------------------------------------------------------------------------*/
4568 int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
4570 flag aSign, bSign;
4572 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4573 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4574 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4575 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4577 float_raise( float_flag_invalid STATUS_VAR);
4578 return 0;
4580 aSign = extractFloatx80Sign( a );
4581 bSign = extractFloatx80Sign( b );
4582 if ( aSign != bSign ) {
4583 return
4584 aSign
4585 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4586 != 0 );
4588 return
4589 aSign ? lt128( b.high, b.low, a.high, a.low )
4590 : lt128( a.high, a.low, b.high, b.low );
4594 /*----------------------------------------------------------------------------
4595 | Returns 1 if the extended double-precision floating-point value `a' is equal
4596 | to the corresponding value `b', and 0 otherwise. The invalid exception is
4597 | raised if either operand is a NaN. Otherwise, the comparison is performed
4598 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4599 *----------------------------------------------------------------------------*/
4601 int floatx80_eq_signaling( floatx80 a, floatx80 b STATUS_PARAM )
4604 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4605 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4606 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4607 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4609 float_raise( float_flag_invalid STATUS_VAR);
4610 return 0;
4612 return
4613 ( a.low == b.low )
4614 && ( ( a.high == b.high )
4615 || ( ( a.low == 0 )
4616 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4621 /*----------------------------------------------------------------------------
4622 | Returns 1 if the extended double-precision floating-point value `a' is less
4623 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
4624 | do not cause an exception. Otherwise, the comparison is performed according
4625 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4626 *----------------------------------------------------------------------------*/
4628 int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4630 flag aSign, bSign;
4632 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4633 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4634 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4635 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4637 if ( floatx80_is_signaling_nan( a )
4638 || floatx80_is_signaling_nan( b ) ) {
4639 float_raise( float_flag_invalid STATUS_VAR);
4641 return 0;
4643 aSign = extractFloatx80Sign( a );
4644 bSign = extractFloatx80Sign( b );
4645 if ( aSign != bSign ) {
4646 return
4647 aSign
4648 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4649 == 0 );
4651 return
4652 aSign ? le128( b.high, b.low, a.high, a.low )
4653 : le128( a.high, a.low, b.high, b.low );
4657 /*----------------------------------------------------------------------------
4658 | Returns 1 if the extended double-precision floating-point value `a' is less
4659 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
4660 | an exception. Otherwise, the comparison is performed according to the
4661 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4662 *----------------------------------------------------------------------------*/
4664 int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4666 flag aSign, bSign;
4668 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4669 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4670 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4671 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4673 if ( floatx80_is_signaling_nan( a )
4674 || floatx80_is_signaling_nan( b ) ) {
4675 float_raise( float_flag_invalid STATUS_VAR);
4677 return 0;
4679 aSign = extractFloatx80Sign( a );
4680 bSign = extractFloatx80Sign( b );
4681 if ( aSign != bSign ) {
4682 return
4683 aSign
4684 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4685 != 0 );
4687 return
4688 aSign ? lt128( b.high, b.low, a.high, a.low )
4689 : lt128( a.high, a.low, b.high, b.low );
4693 #endif
4695 #ifdef FLOAT128
4697 /*----------------------------------------------------------------------------
4698 | Returns the result of converting the quadruple-precision floating-point
4699 | value `a' to the 32-bit two's complement integer format. The conversion
4700 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4701 | Arithmetic---which means in particular that the conversion is rounded
4702 | according to the current rounding mode. If `a' is a NaN, the largest
4703 | positive integer is returned. Otherwise, if the conversion overflows, the
4704 | largest integer with the same sign as `a' is returned.
4705 *----------------------------------------------------------------------------*/
4707 int32 float128_to_int32( float128 a STATUS_PARAM )
4709 flag aSign;
4710 int32 aExp, shiftCount;
4711 bits64 aSig0, aSig1;
4713 aSig1 = extractFloat128Frac1( a );
4714 aSig0 = extractFloat128Frac0( a );
4715 aExp = extractFloat128Exp( a );
4716 aSign = extractFloat128Sign( a );
4717 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4718 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4719 aSig0 |= ( aSig1 != 0 );
4720 shiftCount = 0x4028 - aExp;
4721 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4722 return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
4726 /*----------------------------------------------------------------------------
4727 | Returns the result of converting the quadruple-precision floating-point
4728 | value `a' to the 32-bit two's complement integer format. The conversion
4729 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4730 | Arithmetic, except that the conversion is always rounded toward zero. If
4731 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
4732 | conversion overflows, the largest integer with the same sign as `a' is
4733 | returned.
4734 *----------------------------------------------------------------------------*/
4736 int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
4738 flag aSign;
4739 int32 aExp, shiftCount;
4740 bits64 aSig0, aSig1, savedASig;
4741 int32 z;
4743 aSig1 = extractFloat128Frac1( a );
4744 aSig0 = extractFloat128Frac0( a );
4745 aExp = extractFloat128Exp( a );
4746 aSign = extractFloat128Sign( a );
4747 aSig0 |= ( aSig1 != 0 );
4748 if ( 0x401E < aExp ) {
4749 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4750 goto invalid;
4752 else if ( aExp < 0x3FFF ) {
4753 if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
4754 return 0;
4756 aSig0 |= LIT64( 0x0001000000000000 );
4757 shiftCount = 0x402F - aExp;
4758 savedASig = aSig0;
4759 aSig0 >>= shiftCount;
4760 z = aSig0;
4761 if ( aSign ) z = - z;
4762 if ( ( z < 0 ) ^ aSign ) {
4763 invalid:
4764 float_raise( float_flag_invalid STATUS_VAR);
4765 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4767 if ( ( aSig0<<shiftCount ) != savedASig ) {
4768 STATUS(float_exception_flags) |= float_flag_inexact;
4770 return z;
4774 /*----------------------------------------------------------------------------
4775 | Returns the result of converting the quadruple-precision floating-point
4776 | value `a' to the 64-bit two's complement integer format. The conversion
4777 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4778 | Arithmetic---which means in particular that the conversion is rounded
4779 | according to the current rounding mode. If `a' is a NaN, the largest
4780 | positive integer is returned. Otherwise, if the conversion overflows, the
4781 | largest integer with the same sign as `a' is returned.
4782 *----------------------------------------------------------------------------*/
4784 int64 float128_to_int64( float128 a STATUS_PARAM )
4786 flag aSign;
4787 int32 aExp, shiftCount;
4788 bits64 aSig0, aSig1;
4790 aSig1 = extractFloat128Frac1( a );
4791 aSig0 = extractFloat128Frac0( a );
4792 aExp = extractFloat128Exp( a );
4793 aSign = extractFloat128Sign( a );
4794 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4795 shiftCount = 0x402F - aExp;
4796 if ( shiftCount <= 0 ) {
4797 if ( 0x403E < aExp ) {
4798 float_raise( float_flag_invalid STATUS_VAR);
4799 if ( ! aSign
4800 || ( ( aExp == 0x7FFF )
4801 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4804 return LIT64( 0x7FFFFFFFFFFFFFFF );
4806 return (sbits64) LIT64( 0x8000000000000000 );
4808 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4810 else {
4811 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4813 return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
4817 /*----------------------------------------------------------------------------
4818 | Returns the result of converting the quadruple-precision floating-point
4819 | value `a' to the 64-bit two's complement integer format. The conversion
4820 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4821 | Arithmetic, except that the conversion is always rounded toward zero.
4822 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
4823 | the conversion overflows, the largest integer with the same sign as `a' is
4824 | returned.
4825 *----------------------------------------------------------------------------*/
4827 int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
4829 flag aSign;
4830 int32 aExp, shiftCount;
4831 bits64 aSig0, aSig1;
4832 int64 z;
4834 aSig1 = extractFloat128Frac1( a );
4835 aSig0 = extractFloat128Frac0( a );
4836 aExp = extractFloat128Exp( a );
4837 aSign = extractFloat128Sign( a );
4838 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4839 shiftCount = aExp - 0x402F;
4840 if ( 0 < shiftCount ) {
4841 if ( 0x403E <= aExp ) {
4842 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4843 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
4844 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4845 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
4847 else {
4848 float_raise( float_flag_invalid STATUS_VAR);
4849 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4850 return LIT64( 0x7FFFFFFFFFFFFFFF );
4853 return (sbits64) LIT64( 0x8000000000000000 );
4855 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4856 if ( (bits64) ( aSig1<<shiftCount ) ) {
4857 STATUS(float_exception_flags) |= float_flag_inexact;
4860 else {
4861 if ( aExp < 0x3FFF ) {
4862 if ( aExp | aSig0 | aSig1 ) {
4863 STATUS(float_exception_flags) |= float_flag_inexact;
4865 return 0;
4867 z = aSig0>>( - shiftCount );
4868 if ( aSig1
4869 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4870 STATUS(float_exception_flags) |= float_flag_inexact;
4873 if ( aSign ) z = - z;
4874 return z;
4878 /*----------------------------------------------------------------------------
4879 | Returns the result of converting the quadruple-precision floating-point
4880 | value `a' to the single-precision floating-point format. The conversion
4881 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4882 | Arithmetic.
4883 *----------------------------------------------------------------------------*/
4885 float32 float128_to_float32( float128 a STATUS_PARAM )
4887 flag aSign;
4888 int32 aExp;
4889 bits64 aSig0, aSig1;
4890 bits32 zSig;
4892 aSig1 = extractFloat128Frac1( a );
4893 aSig0 = extractFloat128Frac0( a );
4894 aExp = extractFloat128Exp( a );
4895 aSign = extractFloat128Sign( a );
4896 if ( aExp == 0x7FFF ) {
4897 if ( aSig0 | aSig1 ) {
4898 return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4900 return packFloat32( aSign, 0xFF, 0 );
4902 aSig0 |= ( aSig1 != 0 );
4903 shift64RightJamming( aSig0, 18, &aSig0 );
4904 zSig = aSig0;
4905 if ( aExp || zSig ) {
4906 zSig |= 0x40000000;
4907 aExp -= 0x3F81;
4909 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
4913 /*----------------------------------------------------------------------------
4914 | Returns the result of converting the quadruple-precision floating-point
4915 | value `a' to the double-precision floating-point format. The conversion
4916 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4917 | Arithmetic.
4918 *----------------------------------------------------------------------------*/
4920 float64 float128_to_float64( float128 a STATUS_PARAM )
4922 flag aSign;
4923 int32 aExp;
4924 bits64 aSig0, aSig1;
4926 aSig1 = extractFloat128Frac1( a );
4927 aSig0 = extractFloat128Frac0( a );
4928 aExp = extractFloat128Exp( a );
4929 aSign = extractFloat128Sign( a );
4930 if ( aExp == 0x7FFF ) {
4931 if ( aSig0 | aSig1 ) {
4932 return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4934 return packFloat64( aSign, 0x7FF, 0 );
4936 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4937 aSig0 |= ( aSig1 != 0 );
4938 if ( aExp || aSig0 ) {
4939 aSig0 |= LIT64( 0x4000000000000000 );
4940 aExp -= 0x3C01;
4942 return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
4946 #ifdef FLOATX80
4948 /*----------------------------------------------------------------------------
4949 | Returns the result of converting the quadruple-precision floating-point
4950 | value `a' to the extended double-precision floating-point format. The
4951 | conversion is performed according to the IEC/IEEE Standard for Binary
4952 | Floating-Point Arithmetic.
4953 *----------------------------------------------------------------------------*/
4955 floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
4957 flag aSign;
4958 int32 aExp;
4959 bits64 aSig0, aSig1;
4961 aSig1 = extractFloat128Frac1( a );
4962 aSig0 = extractFloat128Frac0( a );
4963 aExp = extractFloat128Exp( a );
4964 aSign = extractFloat128Sign( a );
4965 if ( aExp == 0x7FFF ) {
4966 if ( aSig0 | aSig1 ) {
4967 return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4969 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4971 if ( aExp == 0 ) {
4972 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4973 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4975 else {
4976 aSig0 |= LIT64( 0x0001000000000000 );
4978 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4979 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
4983 #endif
4985 /*----------------------------------------------------------------------------
4986 | Rounds the quadruple-precision floating-point value `a' to an integer, and
4987 | returns the result as a quadruple-precision floating-point value. The
4988 | operation is performed according to the IEC/IEEE Standard for Binary
4989 | Floating-Point Arithmetic.
4990 *----------------------------------------------------------------------------*/
4992 float128 float128_round_to_int( float128 a STATUS_PARAM )
4994 flag aSign;
4995 int32 aExp;
4996 bits64 lastBitMask, roundBitsMask;
4997 int8 roundingMode;
4998 float128 z;
5000 aExp = extractFloat128Exp( a );
5001 if ( 0x402F <= aExp ) {
5002 if ( 0x406F <= aExp ) {
5003 if ( ( aExp == 0x7FFF )
5004 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
5006 return propagateFloat128NaN( a, a STATUS_VAR );
5008 return a;
5010 lastBitMask = 1;
5011 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
5012 roundBitsMask = lastBitMask - 1;
5013 z = a;
5014 roundingMode = STATUS(float_rounding_mode);
5015 if ( roundingMode == float_round_nearest_even ) {
5016 if ( lastBitMask ) {
5017 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
5018 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
5020 else {
5021 if ( (sbits64) z.low < 0 ) {
5022 ++z.high;
5023 if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
5027 else if ( roundingMode != float_round_to_zero ) {
5028 if ( extractFloat128Sign( z )
5029 ^ ( roundingMode == float_round_up ) ) {
5030 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
5033 z.low &= ~ roundBitsMask;
5035 else {
5036 if ( aExp < 0x3FFF ) {
5037 if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
5038 STATUS(float_exception_flags) |= float_flag_inexact;
5039 aSign = extractFloat128Sign( a );
5040 switch ( STATUS(float_rounding_mode) ) {
5041 case float_round_nearest_even:
5042 if ( ( aExp == 0x3FFE )
5043 && ( extractFloat128Frac0( a )
5044 | extractFloat128Frac1( a ) )
5046 return packFloat128( aSign, 0x3FFF, 0, 0 );
5048 break;
5049 case float_round_down:
5050 return
5051 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
5052 : packFloat128( 0, 0, 0, 0 );
5053 case float_round_up:
5054 return
5055 aSign ? packFloat128( 1, 0, 0, 0 )
5056 : packFloat128( 0, 0x3FFF, 0, 0 );
5058 return packFloat128( aSign, 0, 0, 0 );
5060 lastBitMask = 1;
5061 lastBitMask <<= 0x402F - aExp;
5062 roundBitsMask = lastBitMask - 1;
5063 z.low = 0;
5064 z.high = a.high;
5065 roundingMode = STATUS(float_rounding_mode);
5066 if ( roundingMode == float_round_nearest_even ) {
5067 z.high += lastBitMask>>1;
5068 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
5069 z.high &= ~ lastBitMask;
5072 else if ( roundingMode != float_round_to_zero ) {
5073 if ( extractFloat128Sign( z )
5074 ^ ( roundingMode == float_round_up ) ) {
5075 z.high |= ( a.low != 0 );
5076 z.high += roundBitsMask;
5079 z.high &= ~ roundBitsMask;
5081 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
5082 STATUS(float_exception_flags) |= float_flag_inexact;
5084 return z;
5088 /*----------------------------------------------------------------------------
5089 | Returns the result of adding the absolute values of the quadruple-precision
5090 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
5091 | before being returned. `zSign' is ignored if the result is a NaN.
5092 | The addition is performed according to the IEC/IEEE Standard for Binary
5093 | Floating-Point Arithmetic.
5094 *----------------------------------------------------------------------------*/
5096 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5098 int32 aExp, bExp, zExp;
5099 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5100 int32 expDiff;
5102 aSig1 = extractFloat128Frac1( a );
5103 aSig0 = extractFloat128Frac0( a );
5104 aExp = extractFloat128Exp( a );
5105 bSig1 = extractFloat128Frac1( b );
5106 bSig0 = extractFloat128Frac0( b );
5107 bExp = extractFloat128Exp( b );
5108 expDiff = aExp - bExp;
5109 if ( 0 < expDiff ) {
5110 if ( aExp == 0x7FFF ) {
5111 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5112 return a;
5114 if ( bExp == 0 ) {
5115 --expDiff;
5117 else {
5118 bSig0 |= LIT64( 0x0001000000000000 );
5120 shift128ExtraRightJamming(
5121 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
5122 zExp = aExp;
5124 else if ( expDiff < 0 ) {
5125 if ( bExp == 0x7FFF ) {
5126 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5127 return packFloat128( zSign, 0x7FFF, 0, 0 );
5129 if ( aExp == 0 ) {
5130 ++expDiff;
5132 else {
5133 aSig0 |= LIT64( 0x0001000000000000 );
5135 shift128ExtraRightJamming(
5136 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
5137 zExp = bExp;
5139 else {
5140 if ( aExp == 0x7FFF ) {
5141 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5142 return propagateFloat128NaN( a, b STATUS_VAR );
5144 return a;
5146 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5147 if ( aExp == 0 ) {
5148 if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
5149 return packFloat128( zSign, 0, zSig0, zSig1 );
5151 zSig2 = 0;
5152 zSig0 |= LIT64( 0x0002000000000000 );
5153 zExp = aExp;
5154 goto shiftRight1;
5156 aSig0 |= LIT64( 0x0001000000000000 );
5157 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5158 --zExp;
5159 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
5160 ++zExp;
5161 shiftRight1:
5162 shift128ExtraRightJamming(
5163 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5164 roundAndPack:
5165 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5169 /*----------------------------------------------------------------------------
5170 | Returns the result of subtracting the absolute values of the quadruple-
5171 | precision floating-point values `a' and `b'. If `zSign' is 1, the
5172 | difference is negated before being returned. `zSign' is ignored if the
5173 | result is a NaN. The subtraction is performed according to the IEC/IEEE
5174 | Standard for Binary Floating-Point Arithmetic.
5175 *----------------------------------------------------------------------------*/
5177 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5179 int32 aExp, bExp, zExp;
5180 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
5181 int32 expDiff;
5182 float128 z;
5184 aSig1 = extractFloat128Frac1( a );
5185 aSig0 = extractFloat128Frac0( a );
5186 aExp = extractFloat128Exp( a );
5187 bSig1 = extractFloat128Frac1( b );
5188 bSig0 = extractFloat128Frac0( b );
5189 bExp = extractFloat128Exp( b );
5190 expDiff = aExp - bExp;
5191 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5192 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
5193 if ( 0 < expDiff ) goto aExpBigger;
5194 if ( expDiff < 0 ) goto bExpBigger;
5195 if ( aExp == 0x7FFF ) {
5196 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5197 return propagateFloat128NaN( a, b STATUS_VAR );
5199 float_raise( float_flag_invalid STATUS_VAR);
5200 z.low = float128_default_nan_low;
5201 z.high = float128_default_nan_high;
5202 return z;
5204 if ( aExp == 0 ) {
5205 aExp = 1;
5206 bExp = 1;
5208 if ( bSig0 < aSig0 ) goto aBigger;
5209 if ( aSig0 < bSig0 ) goto bBigger;
5210 if ( bSig1 < aSig1 ) goto aBigger;
5211 if ( aSig1 < bSig1 ) goto bBigger;
5212 return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
5213 bExpBigger:
5214 if ( bExp == 0x7FFF ) {
5215 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5216 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
5218 if ( aExp == 0 ) {
5219 ++expDiff;
5221 else {
5222 aSig0 |= LIT64( 0x4000000000000000 );
5224 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5225 bSig0 |= LIT64( 0x4000000000000000 );
5226 bBigger:
5227 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
5228 zExp = bExp;
5229 zSign ^= 1;
5230 goto normalizeRoundAndPack;
5231 aExpBigger:
5232 if ( aExp == 0x7FFF ) {
5233 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5234 return a;
5236 if ( bExp == 0 ) {
5237 --expDiff;
5239 else {
5240 bSig0 |= LIT64( 0x4000000000000000 );
5242 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
5243 aSig0 |= LIT64( 0x4000000000000000 );
5244 aBigger:
5245 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5246 zExp = aExp;
5247 normalizeRoundAndPack:
5248 --zExp;
5249 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
5253 /*----------------------------------------------------------------------------
5254 | Returns the result of adding the quadruple-precision floating-point values
5255 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
5256 | for Binary Floating-Point Arithmetic.
5257 *----------------------------------------------------------------------------*/
5259 float128 float128_add( float128 a, float128 b STATUS_PARAM )
5261 flag aSign, bSign;
5263 aSign = extractFloat128Sign( a );
5264 bSign = extractFloat128Sign( b );
5265 if ( aSign == bSign ) {
5266 return addFloat128Sigs( a, b, aSign STATUS_VAR );
5268 else {
5269 return subFloat128Sigs( a, b, aSign STATUS_VAR );
5274 /*----------------------------------------------------------------------------
5275 | Returns the result of subtracting the quadruple-precision floating-point
5276 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5277 | Standard for Binary Floating-Point Arithmetic.
5278 *----------------------------------------------------------------------------*/
5280 float128 float128_sub( float128 a, float128 b STATUS_PARAM )
5282 flag aSign, bSign;
5284 aSign = extractFloat128Sign( a );
5285 bSign = extractFloat128Sign( b );
5286 if ( aSign == bSign ) {
5287 return subFloat128Sigs( a, b, aSign STATUS_VAR );
5289 else {
5290 return addFloat128Sigs( a, b, aSign STATUS_VAR );
5295 /*----------------------------------------------------------------------------
5296 | Returns the result of multiplying the quadruple-precision floating-point
5297 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5298 | Standard for Binary Floating-Point Arithmetic.
5299 *----------------------------------------------------------------------------*/
5301 float128 float128_mul( float128 a, float128 b STATUS_PARAM )
5303 flag aSign, bSign, zSign;
5304 int32 aExp, bExp, zExp;
5305 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
5306 float128 z;
5308 aSig1 = extractFloat128Frac1( a );
5309 aSig0 = extractFloat128Frac0( a );
5310 aExp = extractFloat128Exp( a );
5311 aSign = extractFloat128Sign( a );
5312 bSig1 = extractFloat128Frac1( b );
5313 bSig0 = extractFloat128Frac0( b );
5314 bExp = extractFloat128Exp( b );
5315 bSign = extractFloat128Sign( b );
5316 zSign = aSign ^ bSign;
5317 if ( aExp == 0x7FFF ) {
5318 if ( ( aSig0 | aSig1 )
5319 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5320 return propagateFloat128NaN( a, b STATUS_VAR );
5322 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
5323 return packFloat128( zSign, 0x7FFF, 0, 0 );
5325 if ( bExp == 0x7FFF ) {
5326 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5327 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5328 invalid:
5329 float_raise( float_flag_invalid STATUS_VAR);
5330 z.low = float128_default_nan_low;
5331 z.high = float128_default_nan_high;
5332 return z;
5334 return packFloat128( zSign, 0x7FFF, 0, 0 );
5336 if ( aExp == 0 ) {
5337 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5338 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5340 if ( bExp == 0 ) {
5341 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5342 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5344 zExp = aExp + bExp - 0x4000;
5345 aSig0 |= LIT64( 0x0001000000000000 );
5346 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5347 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5348 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5349 zSig2 |= ( zSig3 != 0 );
5350 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5351 shift128ExtraRightJamming(
5352 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5353 ++zExp;
5355 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5359 /*----------------------------------------------------------------------------
5360 | Returns the result of dividing the quadruple-precision floating-point value
5361 | `a' by the corresponding value `b'. The operation is performed according to
5362 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5363 *----------------------------------------------------------------------------*/
5365 float128 float128_div( float128 a, float128 b STATUS_PARAM )
5367 flag aSign, bSign, zSign;
5368 int32 aExp, bExp, zExp;
5369 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5370 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5371 float128 z;
5373 aSig1 = extractFloat128Frac1( a );
5374 aSig0 = extractFloat128Frac0( a );
5375 aExp = extractFloat128Exp( a );
5376 aSign = extractFloat128Sign( a );
5377 bSig1 = extractFloat128Frac1( b );
5378 bSig0 = extractFloat128Frac0( b );
5379 bExp = extractFloat128Exp( b );
5380 bSign = extractFloat128Sign( b );
5381 zSign = aSign ^ bSign;
5382 if ( aExp == 0x7FFF ) {
5383 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5384 if ( bExp == 0x7FFF ) {
5385 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5386 goto invalid;
5388 return packFloat128( zSign, 0x7FFF, 0, 0 );
5390 if ( bExp == 0x7FFF ) {
5391 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5392 return packFloat128( zSign, 0, 0, 0 );
5394 if ( bExp == 0 ) {
5395 if ( ( bSig0 | bSig1 ) == 0 ) {
5396 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5397 invalid:
5398 float_raise( float_flag_invalid STATUS_VAR);
5399 z.low = float128_default_nan_low;
5400 z.high = float128_default_nan_high;
5401 return z;
5403 float_raise( float_flag_divbyzero STATUS_VAR);
5404 return packFloat128( zSign, 0x7FFF, 0, 0 );
5406 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5408 if ( aExp == 0 ) {
5409 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5410 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5412 zExp = aExp - bExp + 0x3FFD;
5413 shortShift128Left(
5414 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5415 shortShift128Left(
5416 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5417 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5418 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5419 ++zExp;
5421 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5422 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5423 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5424 while ( (sbits64) rem0 < 0 ) {
5425 --zSig0;
5426 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5428 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5429 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5430 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5431 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5432 while ( (sbits64) rem1 < 0 ) {
5433 --zSig1;
5434 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5436 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5438 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5439 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5443 /*----------------------------------------------------------------------------
5444 | Returns the remainder of the quadruple-precision floating-point value `a'
5445 | with respect to the corresponding value `b'. The operation is performed
5446 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5447 *----------------------------------------------------------------------------*/
5449 float128 float128_rem( float128 a, float128 b STATUS_PARAM )
5451 flag aSign, zSign;
5452 int32 aExp, bExp, expDiff;
5453 bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5454 bits64 allZero, alternateASig0, alternateASig1, sigMean1;
5455 sbits64 sigMean0;
5456 float128 z;
5458 aSig1 = extractFloat128Frac1( a );
5459 aSig0 = extractFloat128Frac0( a );
5460 aExp = extractFloat128Exp( a );
5461 aSign = extractFloat128Sign( a );
5462 bSig1 = extractFloat128Frac1( b );
5463 bSig0 = extractFloat128Frac0( b );
5464 bExp = extractFloat128Exp( b );
5465 if ( aExp == 0x7FFF ) {
5466 if ( ( aSig0 | aSig1 )
5467 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5468 return propagateFloat128NaN( a, b STATUS_VAR );
5470 goto invalid;
5472 if ( bExp == 0x7FFF ) {
5473 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5474 return a;
5476 if ( bExp == 0 ) {
5477 if ( ( bSig0 | bSig1 ) == 0 ) {
5478 invalid:
5479 float_raise( float_flag_invalid STATUS_VAR);
5480 z.low = float128_default_nan_low;
5481 z.high = float128_default_nan_high;
5482 return z;
5484 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5486 if ( aExp == 0 ) {
5487 if ( ( aSig0 | aSig1 ) == 0 ) return a;
5488 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5490 expDiff = aExp - bExp;
5491 if ( expDiff < -1 ) return a;
5492 shortShift128Left(
5493 aSig0 | LIT64( 0x0001000000000000 ),
5494 aSig1,
5495 15 - ( expDiff < 0 ),
5496 &aSig0,
5497 &aSig1
5499 shortShift128Left(
5500 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5501 q = le128( bSig0, bSig1, aSig0, aSig1 );
5502 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5503 expDiff -= 64;
5504 while ( 0 < expDiff ) {
5505 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5506 q = ( 4 < q ) ? q - 4 : 0;
5507 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5508 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5509 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5510 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5511 expDiff -= 61;
5513 if ( -64 < expDiff ) {
5514 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5515 q = ( 4 < q ) ? q - 4 : 0;
5516 q >>= - expDiff;
5517 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5518 expDiff += 52;
5519 if ( expDiff < 0 ) {
5520 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5522 else {
5523 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5525 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5526 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5528 else {
5529 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5530 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5532 do {
5533 alternateASig0 = aSig0;
5534 alternateASig1 = aSig1;
5535 ++q;
5536 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5537 } while ( 0 <= (sbits64) aSig0 );
5538 add128(
5539 aSig0, aSig1, alternateASig0, alternateASig1, (bits64 *)&sigMean0, &sigMean1 );
5540 if ( ( sigMean0 < 0 )
5541 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5542 aSig0 = alternateASig0;
5543 aSig1 = alternateASig1;
5545 zSign = ( (sbits64) aSig0 < 0 );
5546 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5547 return
5548 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
5552 /*----------------------------------------------------------------------------
5553 | Returns the square root of the quadruple-precision floating-point value `a'.
5554 | The operation is performed according to the IEC/IEEE Standard for Binary
5555 | Floating-Point Arithmetic.
5556 *----------------------------------------------------------------------------*/
5558 float128 float128_sqrt( float128 a STATUS_PARAM )
5560 flag aSign;
5561 int32 aExp, zExp;
5562 bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5563 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5564 float128 z;
5566 aSig1 = extractFloat128Frac1( a );
5567 aSig0 = extractFloat128Frac0( a );
5568 aExp = extractFloat128Exp( a );
5569 aSign = extractFloat128Sign( a );
5570 if ( aExp == 0x7FFF ) {
5571 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
5572 if ( ! aSign ) return a;
5573 goto invalid;
5575 if ( aSign ) {
5576 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5577 invalid:
5578 float_raise( float_flag_invalid STATUS_VAR);
5579 z.low = float128_default_nan_low;
5580 z.high = float128_default_nan_high;
5581 return z;
5583 if ( aExp == 0 ) {
5584 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5585 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5587 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5588 aSig0 |= LIT64( 0x0001000000000000 );
5589 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5590 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5591 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5592 doubleZSig0 = zSig0<<1;
5593 mul64To128( zSig0, zSig0, &term0, &term1 );
5594 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5595 while ( (sbits64) rem0 < 0 ) {
5596 --zSig0;
5597 doubleZSig0 -= 2;
5598 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5600 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5601 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5602 if ( zSig1 == 0 ) zSig1 = 1;
5603 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5604 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5605 mul64To128( zSig1, zSig1, &term2, &term3 );
5606 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5607 while ( (sbits64) rem1 < 0 ) {
5608 --zSig1;
5609 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5610 term3 |= 1;
5611 term2 |= doubleZSig0;
5612 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5614 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5616 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5617 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5621 /*----------------------------------------------------------------------------
5622 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5623 | the corresponding value `b', and 0 otherwise. The comparison is performed
5624 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5625 *----------------------------------------------------------------------------*/
5627 int float128_eq( float128 a, float128 b STATUS_PARAM )
5630 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5631 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5632 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5633 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5635 if ( float128_is_signaling_nan( a )
5636 || float128_is_signaling_nan( b ) ) {
5637 float_raise( float_flag_invalid STATUS_VAR);
5639 return 0;
5641 return
5642 ( a.low == b.low )
5643 && ( ( a.high == b.high )
5644 || ( ( a.low == 0 )
5645 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5650 /*----------------------------------------------------------------------------
5651 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5652 | or equal to the corresponding value `b', and 0 otherwise. The comparison
5653 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5654 | Arithmetic.
5655 *----------------------------------------------------------------------------*/
5657 int float128_le( float128 a, float128 b STATUS_PARAM )
5659 flag aSign, bSign;
5661 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5662 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5663 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5664 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5666 float_raise( float_flag_invalid STATUS_VAR);
5667 return 0;
5669 aSign = extractFloat128Sign( a );
5670 bSign = extractFloat128Sign( b );
5671 if ( aSign != bSign ) {
5672 return
5673 aSign
5674 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5675 == 0 );
5677 return
5678 aSign ? le128( b.high, b.low, a.high, a.low )
5679 : le128( a.high, a.low, b.high, b.low );
5683 /*----------------------------------------------------------------------------
5684 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5685 | the corresponding value `b', and 0 otherwise. The comparison is performed
5686 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5687 *----------------------------------------------------------------------------*/
5689 int float128_lt( float128 a, float128 b STATUS_PARAM )
5691 flag aSign, bSign;
5693 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5694 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5695 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5696 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5698 float_raise( float_flag_invalid STATUS_VAR);
5699 return 0;
5701 aSign = extractFloat128Sign( a );
5702 bSign = extractFloat128Sign( b );
5703 if ( aSign != bSign ) {
5704 return
5705 aSign
5706 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5707 != 0 );
5709 return
5710 aSign ? lt128( b.high, b.low, a.high, a.low )
5711 : lt128( a.high, a.low, b.high, b.low );
5715 /*----------------------------------------------------------------------------
5716 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5717 | the corresponding value `b', and 0 otherwise. The invalid exception is
5718 | raised if either operand is a NaN. Otherwise, the comparison is performed
5719 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5720 *----------------------------------------------------------------------------*/
5722 int float128_eq_signaling( float128 a, float128 b STATUS_PARAM )
5725 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5726 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5727 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5728 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5730 float_raise( float_flag_invalid STATUS_VAR);
5731 return 0;
5733 return
5734 ( a.low == b.low )
5735 && ( ( a.high == b.high )
5736 || ( ( a.low == 0 )
5737 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5742 /*----------------------------------------------------------------------------
5743 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5744 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5745 | cause an exception. Otherwise, the comparison is performed according to the
5746 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5747 *----------------------------------------------------------------------------*/
5749 int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
5751 flag aSign, bSign;
5753 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5754 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5755 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5756 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5758 if ( float128_is_signaling_nan( a )
5759 || float128_is_signaling_nan( b ) ) {
5760 float_raise( float_flag_invalid STATUS_VAR);
5762 return 0;
5764 aSign = extractFloat128Sign( a );
5765 bSign = extractFloat128Sign( b );
5766 if ( aSign != bSign ) {
5767 return
5768 aSign
5769 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5770 == 0 );
5772 return
5773 aSign ? le128( b.high, b.low, a.high, a.low )
5774 : le128( a.high, a.low, b.high, b.low );
5778 /*----------------------------------------------------------------------------
5779 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5780 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5781 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
5782 | Standard for Binary Floating-Point Arithmetic.
5783 *----------------------------------------------------------------------------*/
5785 int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
5787 flag aSign, bSign;
5789 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5790 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5791 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5792 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5794 if ( float128_is_signaling_nan( a )
5795 || float128_is_signaling_nan( b ) ) {
5796 float_raise( float_flag_invalid STATUS_VAR);
5798 return 0;
5800 aSign = extractFloat128Sign( a );
5801 bSign = extractFloat128Sign( b );
5802 if ( aSign != bSign ) {
5803 return
5804 aSign
5805 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5806 != 0 );
5808 return
5809 aSign ? lt128( b.high, b.low, a.high, a.low )
5810 : lt128( a.high, a.low, b.high, b.low );
5814 #endif
5816 /* misc functions */
5817 float32 uint32_to_float32( unsigned int a STATUS_PARAM )
5819 return int64_to_float32(a STATUS_VAR);
5822 float64 uint32_to_float64( unsigned int a STATUS_PARAM )
5824 return int64_to_float64(a STATUS_VAR);
5827 unsigned int float32_to_uint32( float32 a STATUS_PARAM )
5829 int64_t v;
5830 unsigned int res;
5832 v = float32_to_int64(a STATUS_VAR);
5833 if (v < 0) {
5834 res = 0;
5835 float_raise( float_flag_invalid STATUS_VAR);
5836 } else if (v > 0xffffffff) {
5837 res = 0xffffffff;
5838 float_raise( float_flag_invalid STATUS_VAR);
5839 } else {
5840 res = v;
5842 return res;
5845 unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
5847 int64_t v;
5848 unsigned int res;
5850 v = float32_to_int64_round_to_zero(a STATUS_VAR);
5851 if (v < 0) {
5852 res = 0;
5853 float_raise( float_flag_invalid STATUS_VAR);
5854 } else if (v > 0xffffffff) {
5855 res = 0xffffffff;
5856 float_raise( float_flag_invalid STATUS_VAR);
5857 } else {
5858 res = v;
5860 return res;
5863 unsigned int float32_to_uint16_round_to_zero( float32 a STATUS_PARAM )
5865 int64_t v;
5866 unsigned int res;
5868 v = float32_to_int64_round_to_zero(a STATUS_VAR);
5869 if (v < 0) {
5870 res = 0;
5871 float_raise( float_flag_invalid STATUS_VAR);
5872 } else if (v > 0xffff) {
5873 res = 0xffff;
5874 float_raise( float_flag_invalid STATUS_VAR);
5875 } else {
5876 res = v;
5878 return res;
5881 unsigned int float64_to_uint32( float64 a STATUS_PARAM )
5883 int64_t v;
5884 unsigned int res;
5886 v = float64_to_int64(a STATUS_VAR);
5887 if (v < 0) {
5888 res = 0;
5889 float_raise( float_flag_invalid STATUS_VAR);
5890 } else if (v > 0xffffffff) {
5891 res = 0xffffffff;
5892 float_raise( float_flag_invalid STATUS_VAR);
5893 } else {
5894 res = v;
5896 return res;
5899 unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
5901 int64_t v;
5902 unsigned int res;
5904 v = float64_to_int64_round_to_zero(a STATUS_VAR);
5905 if (v < 0) {
5906 res = 0;
5907 float_raise( float_flag_invalid STATUS_VAR);
5908 } else if (v > 0xffffffff) {
5909 res = 0xffffffff;
5910 float_raise( float_flag_invalid STATUS_VAR);
5911 } else {
5912 res = v;
5914 return res;
5917 unsigned int float64_to_uint16_round_to_zero( float64 a STATUS_PARAM )
5919 int64_t v;
5920 unsigned int res;
5922 v = float64_to_int64_round_to_zero(a STATUS_VAR);
5923 if (v < 0) {
5924 res = 0;
5925 float_raise( float_flag_invalid STATUS_VAR);
5926 } else if (v > 0xffff) {
5927 res = 0xffff;
5928 float_raise( float_flag_invalid STATUS_VAR);
5929 } else {
5930 res = v;
5932 return res;
5935 /* FIXME: This looks broken. */
5936 uint64_t float64_to_uint64 (float64 a STATUS_PARAM)
5938 int64_t v;
5940 v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
5941 v += float64_val(a);
5942 v = float64_to_int64(make_float64(v) STATUS_VAR);
5944 return v - INT64_MIN;
5947 uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
5949 int64_t v;
5951 v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
5952 v += float64_val(a);
5953 v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR);
5955 return v - INT64_MIN;
5958 #define COMPARE(s, nan_exp) \
5959 INLINE int float ## s ## _compare_internal( float ## s a, float ## s b, \
5960 int is_quiet STATUS_PARAM ) \
5962 flag aSign, bSign; \
5963 bits ## s av, bv; \
5964 a = float ## s ## _squash_input_denormal(a STATUS_VAR); \
5965 b = float ## s ## _squash_input_denormal(b STATUS_VAR); \
5967 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \
5968 extractFloat ## s ## Frac( a ) ) || \
5969 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \
5970 extractFloat ## s ## Frac( b ) )) { \
5971 if (!is_quiet || \
5972 float ## s ## _is_signaling_nan( a ) || \
5973 float ## s ## _is_signaling_nan( b ) ) { \
5974 float_raise( float_flag_invalid STATUS_VAR); \
5976 return float_relation_unordered; \
5978 aSign = extractFloat ## s ## Sign( a ); \
5979 bSign = extractFloat ## s ## Sign( b ); \
5980 av = float ## s ## _val(a); \
5981 bv = float ## s ## _val(b); \
5982 if ( aSign != bSign ) { \
5983 if ( (bits ## s) ( ( av | bv )<<1 ) == 0 ) { \
5984 /* zero case */ \
5985 return float_relation_equal; \
5986 } else { \
5987 return 1 - (2 * aSign); \
5989 } else { \
5990 if (av == bv) { \
5991 return float_relation_equal; \
5992 } else { \
5993 return 1 - 2 * (aSign ^ ( av < bv )); \
5998 int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM ) \
6000 return float ## s ## _compare_internal(a, b, 0 STATUS_VAR); \
6003 int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM ) \
6005 return float ## s ## _compare_internal(a, b, 1 STATUS_VAR); \
6008 COMPARE(32, 0xff)
6009 COMPARE(64, 0x7ff)
6011 INLINE int float128_compare_internal( float128 a, float128 b,
6012 int is_quiet STATUS_PARAM )
6014 flag aSign, bSign;
6016 if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
6017 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
6018 ( ( extractFloat128Exp( b ) == 0x7fff ) &&
6019 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
6020 if (!is_quiet ||
6021 float128_is_signaling_nan( a ) ||
6022 float128_is_signaling_nan( b ) ) {
6023 float_raise( float_flag_invalid STATUS_VAR);
6025 return float_relation_unordered;
6027 aSign = extractFloat128Sign( a );
6028 bSign = extractFloat128Sign( b );
6029 if ( aSign != bSign ) {
6030 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
6031 /* zero case */
6032 return float_relation_equal;
6033 } else {
6034 return 1 - (2 * aSign);
6036 } else {
6037 if (a.low == b.low && a.high == b.high) {
6038 return float_relation_equal;
6039 } else {
6040 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6045 int float128_compare( float128 a, float128 b STATUS_PARAM )
6047 return float128_compare_internal(a, b, 0 STATUS_VAR);
6050 int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
6052 return float128_compare_internal(a, b, 1 STATUS_VAR);
6055 /* Multiply A by 2 raised to the power N. */
6056 float32 float32_scalbn( float32 a, int n STATUS_PARAM )
6058 flag aSign;
6059 int16 aExp;
6060 bits32 aSig;
6062 a = float32_squash_input_denormal(a STATUS_VAR);
6063 aSig = extractFloat32Frac( a );
6064 aExp = extractFloat32Exp( a );
6065 aSign = extractFloat32Sign( a );
6067 if ( aExp == 0xFF ) {
6068 return a;
6070 if ( aExp != 0 )
6071 aSig |= 0x00800000;
6072 else if ( aSig == 0 )
6073 return a;
6075 aExp += n - 1;
6076 aSig <<= 7;
6077 return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
6080 float64 float64_scalbn( float64 a, int n STATUS_PARAM )
6082 flag aSign;
6083 int16 aExp;
6084 bits64 aSig;
6086 a = float64_squash_input_denormal(a STATUS_VAR);
6087 aSig = extractFloat64Frac( a );
6088 aExp = extractFloat64Exp( a );
6089 aSign = extractFloat64Sign( a );
6091 if ( aExp == 0x7FF ) {
6092 return a;
6094 if ( aExp != 0 )
6095 aSig |= LIT64( 0x0010000000000000 );
6096 else if ( aSig == 0 )
6097 return a;
6099 aExp += n - 1;
6100 aSig <<= 10;
6101 return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
6104 #ifdef FLOATX80
6105 floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
6107 flag aSign;
6108 int16 aExp;
6109 bits64 aSig;
6111 aSig = extractFloatx80Frac( a );
6112 aExp = extractFloatx80Exp( a );
6113 aSign = extractFloatx80Sign( a );
6115 if ( aExp == 0x7FF ) {
6116 return a;
6118 if (aExp == 0 && aSig == 0)
6119 return a;
6121 aExp += n;
6122 return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
6123 aSign, aExp, aSig, 0 STATUS_VAR );
6125 #endif
6127 #ifdef FLOAT128
6128 float128 float128_scalbn( float128 a, int n STATUS_PARAM )
6130 flag aSign;
6131 int32 aExp;
6132 bits64 aSig0, aSig1;
6134 aSig1 = extractFloat128Frac1( a );
6135 aSig0 = extractFloat128Frac0( a );
6136 aExp = extractFloat128Exp( a );
6137 aSign = extractFloat128Sign( a );
6138 if ( aExp == 0x7FFF ) {
6139 return a;
6141 if ( aExp != 0 )
6142 aSig0 |= LIT64( 0x0001000000000000 );
6143 else if ( aSig0 == 0 && aSig1 == 0 )
6144 return a;
6146 aExp += n - 1;
6147 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
6148 STATUS_VAR );
6151 #endif