Merge remote branch 'kwolf/for-anthony' into staging
[qemu.git] / fpu / softfloat.c
blob03fb9487bd0de848dd29d12fb7f76723aee2c71d
1 /*
2 * QEMU float support
4 * Derived from SoftFloat.
5 */
7 /*============================================================================
9 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
10 Package, Release 2b.
12 Written by John R. Hauser. This work was made possible in part by the
13 International Computer Science Institute, located at Suite 600, 1947 Center
14 Street, Berkeley, California 94704. Funding was partially provided by the
15 National Science Foundation under grant MIP-9311980. The original version
16 of this code was written as part of a project to build a fixed-point vector
17 processor in collaboration with the University of California at Berkeley,
18 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
19 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
20 arithmetic/SoftFloat.html'.
22 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
23 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
24 RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
25 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
26 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
27 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
28 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
29 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
31 Derivative works are acceptable, even for commercial purposes, so long as
32 (1) the source code for the derivative work includes prominent notice that
33 the work is derivative, and (2) the source code includes prominent notice with
34 these four paragraphs for those parts of this code that are retained.
36 =============================================================================*/
38 #include "softfloat.h"
40 /*----------------------------------------------------------------------------
41 | Primitive arithmetic functions, including multi-word arithmetic, and
42 | division and square root approximations. (Can be specialized to target if
43 | desired.)
44 *----------------------------------------------------------------------------*/
45 #include "softfloat-macros.h"
47 /*----------------------------------------------------------------------------
48 | Functions and definitions to determine: (1) whether tininess for underflow
49 | is detected before or after rounding by default, (2) what (if anything)
50 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
51 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
52 | are propagated from function inputs to output. These details are target-
53 | specific.
54 *----------------------------------------------------------------------------*/
55 #include "softfloat-specialize.h"
57 void set_float_rounding_mode(int val STATUS_PARAM)
59 STATUS(float_rounding_mode) = val;
62 void set_float_exception_flags(int val STATUS_PARAM)
64 STATUS(float_exception_flags) = val;
67 #ifdef FLOATX80
68 void set_floatx80_rounding_precision(int val STATUS_PARAM)
70 STATUS(floatx80_rounding_precision) = val;
72 #endif
74 /*----------------------------------------------------------------------------
75 | Returns the fraction bits of the half-precision floating-point value `a'.
76 *----------------------------------------------------------------------------*/
78 INLINE uint32_t extractFloat16Frac(float16 a)
80 return float16_val(a) & 0x3ff;
83 /*----------------------------------------------------------------------------
84 | Returns the exponent bits of the half-precision floating-point value `a'.
85 *----------------------------------------------------------------------------*/
87 INLINE int16 extractFloat16Exp(float16 a)
89 return (float16_val(a) >> 10) & 0x1f;
92 /*----------------------------------------------------------------------------
93 | Returns the sign bit of the single-precision floating-point value `a'.
94 *----------------------------------------------------------------------------*/
96 INLINE flag extractFloat16Sign(float16 a)
98 return float16_val(a)>>15;
101 /*----------------------------------------------------------------------------
102 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
103 | and 7, and returns the properly rounded 32-bit integer corresponding to the
104 | input. If `zSign' is 1, the input is negated before being converted to an
105 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
106 | is simply rounded to an integer, with the inexact exception raised if the
107 | input cannot be represented exactly as an integer. However, if the fixed-
108 | point input is too large, the invalid exception is raised and the largest
109 | positive or negative integer is returned.
110 *----------------------------------------------------------------------------*/
112 static int32 roundAndPackInt32( flag zSign, uint64_t absZ STATUS_PARAM)
114 int8 roundingMode;
115 flag roundNearestEven;
116 int8 roundIncrement, roundBits;
117 int32 z;
119 roundingMode = STATUS(float_rounding_mode);
120 roundNearestEven = ( roundingMode == float_round_nearest_even );
121 roundIncrement = 0x40;
122 if ( ! roundNearestEven ) {
123 if ( roundingMode == float_round_to_zero ) {
124 roundIncrement = 0;
126 else {
127 roundIncrement = 0x7F;
128 if ( zSign ) {
129 if ( roundingMode == float_round_up ) roundIncrement = 0;
131 else {
132 if ( roundingMode == float_round_down ) roundIncrement = 0;
136 roundBits = absZ & 0x7F;
137 absZ = ( absZ + roundIncrement )>>7;
138 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
139 z = absZ;
140 if ( zSign ) z = - z;
141 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
142 float_raise( float_flag_invalid STATUS_VAR);
143 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
145 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
146 return z;
150 /*----------------------------------------------------------------------------
151 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
152 | `absZ1', with binary point between bits 63 and 64 (between the input words),
153 | and returns the properly rounded 64-bit integer corresponding to the input.
154 | If `zSign' is 1, the input is negated before being converted to an integer.
155 | Ordinarily, the fixed-point input is simply rounded to an integer, with
156 | the inexact exception raised if the input cannot be represented exactly as
157 | an integer. However, if the fixed-point input is too large, the invalid
158 | exception is raised and the largest positive or negative integer is
159 | returned.
160 *----------------------------------------------------------------------------*/
162 static int64 roundAndPackInt64( flag zSign, uint64_t absZ0, uint64_t absZ1 STATUS_PARAM)
164 int8 roundingMode;
165 flag roundNearestEven, increment;
166 int64 z;
168 roundingMode = STATUS(float_rounding_mode);
169 roundNearestEven = ( roundingMode == float_round_nearest_even );
170 increment = ( (int64_t) absZ1 < 0 );
171 if ( ! roundNearestEven ) {
172 if ( roundingMode == float_round_to_zero ) {
173 increment = 0;
175 else {
176 if ( zSign ) {
177 increment = ( roundingMode == float_round_down ) && absZ1;
179 else {
180 increment = ( roundingMode == float_round_up ) && absZ1;
184 if ( increment ) {
185 ++absZ0;
186 if ( absZ0 == 0 ) goto overflow;
187 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
189 z = absZ0;
190 if ( zSign ) z = - z;
191 if ( z && ( ( z < 0 ) ^ zSign ) ) {
192 overflow:
193 float_raise( float_flag_invalid STATUS_VAR);
194 return
195 zSign ? (int64_t) LIT64( 0x8000000000000000 )
196 : LIT64( 0x7FFFFFFFFFFFFFFF );
198 if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
199 return z;
203 /*----------------------------------------------------------------------------
204 | Returns the fraction bits of the single-precision floating-point value `a'.
205 *----------------------------------------------------------------------------*/
207 INLINE uint32_t extractFloat32Frac( float32 a )
210 return float32_val(a) & 0x007FFFFF;
214 /*----------------------------------------------------------------------------
215 | Returns the exponent bits of the single-precision floating-point value `a'.
216 *----------------------------------------------------------------------------*/
218 INLINE int16 extractFloat32Exp( float32 a )
221 return ( float32_val(a)>>23 ) & 0xFF;
225 /*----------------------------------------------------------------------------
226 | Returns the sign bit of the single-precision floating-point value `a'.
227 *----------------------------------------------------------------------------*/
229 INLINE flag extractFloat32Sign( float32 a )
232 return float32_val(a)>>31;
236 /*----------------------------------------------------------------------------
237 | If `a' is denormal and we are in flush-to-zero mode then set the
238 | input-denormal exception and return zero. Otherwise just return the value.
239 *----------------------------------------------------------------------------*/
240 static float32 float32_squash_input_denormal(float32 a STATUS_PARAM)
242 if (STATUS(flush_inputs_to_zero)) {
243 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
244 float_raise(float_flag_input_denormal STATUS_VAR);
245 return make_float32(float32_val(a) & 0x80000000);
248 return a;
251 /*----------------------------------------------------------------------------
252 | Normalizes the subnormal single-precision floating-point value represented
253 | by the denormalized significand `aSig'. The normalized exponent and
254 | significand are stored at the locations pointed to by `zExpPtr' and
255 | `zSigPtr', respectively.
256 *----------------------------------------------------------------------------*/
258 static void
259 normalizeFloat32Subnormal( uint32_t aSig, int16 *zExpPtr, uint32_t *zSigPtr )
261 int8 shiftCount;
263 shiftCount = countLeadingZeros32( aSig ) - 8;
264 *zSigPtr = aSig<<shiftCount;
265 *zExpPtr = 1 - shiftCount;
269 /*----------------------------------------------------------------------------
270 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
271 | single-precision floating-point value, returning the result. After being
272 | shifted into the proper positions, the three fields are simply added
273 | together to form the result. This means that any integer portion of `zSig'
274 | will be added into the exponent. Since a properly normalized significand
275 | will have an integer portion equal to 1, the `zExp' input should be 1 less
276 | than the desired result exponent whenever `zSig' is a complete, normalized
277 | significand.
278 *----------------------------------------------------------------------------*/
280 INLINE float32 packFloat32( flag zSign, int16 zExp, uint32_t zSig )
283 return make_float32(
284 ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
288 /*----------------------------------------------------------------------------
289 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
290 | and significand `zSig', and returns the proper single-precision floating-
291 | point value corresponding to the abstract input. Ordinarily, the abstract
292 | value is simply rounded and packed into the single-precision format, with
293 | the inexact exception raised if the abstract input cannot be represented
294 | exactly. However, if the abstract value is too large, the overflow and
295 | inexact exceptions are raised and an infinity or maximal finite value is
296 | returned. If the abstract value is too small, the input value is rounded to
297 | a subnormal number, and the underflow and inexact exceptions are raised if
298 | the abstract input cannot be represented exactly as a subnormal single-
299 | precision floating-point number.
300 | The input significand `zSig' has its binary point between bits 30
301 | and 29, which is 7 bits to the left of the usual location. This shifted
302 | significand must be normalized or smaller. If `zSig' is not normalized,
303 | `zExp' must be 0; in that case, the result returned is a subnormal number,
304 | and it must not require rounding. In the usual case that `zSig' is
305 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
306 | The handling of underflow and overflow follows the IEC/IEEE Standard for
307 | Binary Floating-Point Arithmetic.
308 *----------------------------------------------------------------------------*/
310 static float32 roundAndPackFloat32( flag zSign, int16 zExp, uint32_t zSig STATUS_PARAM)
312 int8 roundingMode;
313 flag roundNearestEven;
314 int8 roundIncrement, roundBits;
315 flag isTiny;
317 roundingMode = STATUS(float_rounding_mode);
318 roundNearestEven = ( roundingMode == float_round_nearest_even );
319 roundIncrement = 0x40;
320 if ( ! roundNearestEven ) {
321 if ( roundingMode == float_round_to_zero ) {
322 roundIncrement = 0;
324 else {
325 roundIncrement = 0x7F;
326 if ( zSign ) {
327 if ( roundingMode == float_round_up ) roundIncrement = 0;
329 else {
330 if ( roundingMode == float_round_down ) roundIncrement = 0;
334 roundBits = zSig & 0x7F;
335 if ( 0xFD <= (uint16_t) zExp ) {
336 if ( ( 0xFD < zExp )
337 || ( ( zExp == 0xFD )
338 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
340 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
341 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
343 if ( zExp < 0 ) {
344 if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
345 isTiny =
346 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
347 || ( zExp < -1 )
348 || ( zSig + roundIncrement < 0x80000000 );
349 shift32RightJamming( zSig, - zExp, &zSig );
350 zExp = 0;
351 roundBits = zSig & 0x7F;
352 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
355 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
356 zSig = ( zSig + roundIncrement )>>7;
357 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
358 if ( zSig == 0 ) zExp = 0;
359 return packFloat32( zSign, zExp, zSig );
363 /*----------------------------------------------------------------------------
364 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
365 | and significand `zSig', and returns the proper single-precision floating-
366 | point value corresponding to the abstract input. This routine is just like
367 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
368 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
369 | floating-point exponent.
370 *----------------------------------------------------------------------------*/
372 static float32
373 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, uint32_t zSig STATUS_PARAM)
375 int8 shiftCount;
377 shiftCount = countLeadingZeros32( zSig ) - 1;
378 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
382 /*----------------------------------------------------------------------------
383 | Returns the fraction bits of the double-precision floating-point value `a'.
384 *----------------------------------------------------------------------------*/
386 INLINE uint64_t extractFloat64Frac( float64 a )
389 return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
393 /*----------------------------------------------------------------------------
394 | Returns the exponent bits of the double-precision floating-point value `a'.
395 *----------------------------------------------------------------------------*/
397 INLINE int16 extractFloat64Exp( float64 a )
400 return ( float64_val(a)>>52 ) & 0x7FF;
404 /*----------------------------------------------------------------------------
405 | Returns the sign bit of the double-precision floating-point value `a'.
406 *----------------------------------------------------------------------------*/
408 INLINE flag extractFloat64Sign( float64 a )
411 return float64_val(a)>>63;
415 /*----------------------------------------------------------------------------
416 | If `a' is denormal and we are in flush-to-zero mode then set the
417 | input-denormal exception and return zero. Otherwise just return the value.
418 *----------------------------------------------------------------------------*/
419 static float64 float64_squash_input_denormal(float64 a STATUS_PARAM)
421 if (STATUS(flush_inputs_to_zero)) {
422 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
423 float_raise(float_flag_input_denormal STATUS_VAR);
424 return make_float64(float64_val(a) & (1ULL << 63));
427 return a;
430 /*----------------------------------------------------------------------------
431 | Normalizes the subnormal double-precision floating-point value represented
432 | by the denormalized significand `aSig'. The normalized exponent and
433 | significand are stored at the locations pointed to by `zExpPtr' and
434 | `zSigPtr', respectively.
435 *----------------------------------------------------------------------------*/
437 static void
438 normalizeFloat64Subnormal( uint64_t aSig, int16 *zExpPtr, uint64_t *zSigPtr )
440 int8 shiftCount;
442 shiftCount = countLeadingZeros64( aSig ) - 11;
443 *zSigPtr = aSig<<shiftCount;
444 *zExpPtr = 1 - shiftCount;
448 /*----------------------------------------------------------------------------
449 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
450 | double-precision floating-point value, returning the result. After being
451 | shifted into the proper positions, the three fields are simply added
452 | together to form the result. This means that any integer portion of `zSig'
453 | will be added into the exponent. Since a properly normalized significand
454 | will have an integer portion equal to 1, the `zExp' input should be 1 less
455 | than the desired result exponent whenever `zSig' is a complete, normalized
456 | significand.
457 *----------------------------------------------------------------------------*/
459 INLINE float64 packFloat64( flag zSign, int16 zExp, uint64_t zSig )
462 return make_float64(
463 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
467 /*----------------------------------------------------------------------------
468 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
469 | and significand `zSig', and returns the proper double-precision floating-
470 | point value corresponding to the abstract input. Ordinarily, the abstract
471 | value is simply rounded and packed into the double-precision format, with
472 | the inexact exception raised if the abstract input cannot be represented
473 | exactly. However, if the abstract value is too large, the overflow and
474 | inexact exceptions are raised and an infinity or maximal finite value is
475 | returned. If the abstract value is too small, the input value is rounded
476 | to a subnormal number, and the underflow and inexact exceptions are raised
477 | if the abstract input cannot be represented exactly as a subnormal double-
478 | precision floating-point number.
479 | The input significand `zSig' has its binary point between bits 62
480 | and 61, which is 10 bits to the left of the usual location. This shifted
481 | significand must be normalized or smaller. If `zSig' is not normalized,
482 | `zExp' must be 0; in that case, the result returned is a subnormal number,
483 | and it must not require rounding. In the usual case that `zSig' is
484 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
485 | The handling of underflow and overflow follows the IEC/IEEE Standard for
486 | Binary Floating-Point Arithmetic.
487 *----------------------------------------------------------------------------*/
489 static float64 roundAndPackFloat64( flag zSign, int16 zExp, uint64_t zSig STATUS_PARAM)
491 int8 roundingMode;
492 flag roundNearestEven;
493 int16 roundIncrement, roundBits;
494 flag isTiny;
496 roundingMode = STATUS(float_rounding_mode);
497 roundNearestEven = ( roundingMode == float_round_nearest_even );
498 roundIncrement = 0x200;
499 if ( ! roundNearestEven ) {
500 if ( roundingMode == float_round_to_zero ) {
501 roundIncrement = 0;
503 else {
504 roundIncrement = 0x3FF;
505 if ( zSign ) {
506 if ( roundingMode == float_round_up ) roundIncrement = 0;
508 else {
509 if ( roundingMode == float_round_down ) roundIncrement = 0;
513 roundBits = zSig & 0x3FF;
514 if ( 0x7FD <= (uint16_t) zExp ) {
515 if ( ( 0x7FD < zExp )
516 || ( ( zExp == 0x7FD )
517 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
519 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
520 return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
522 if ( zExp < 0 ) {
523 if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
524 isTiny =
525 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
526 || ( zExp < -1 )
527 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
528 shift64RightJamming( zSig, - zExp, &zSig );
529 zExp = 0;
530 roundBits = zSig & 0x3FF;
531 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
534 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
535 zSig = ( zSig + roundIncrement )>>10;
536 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
537 if ( zSig == 0 ) zExp = 0;
538 return packFloat64( zSign, zExp, zSig );
542 /*----------------------------------------------------------------------------
543 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
544 | and significand `zSig', and returns the proper double-precision floating-
545 | point value corresponding to the abstract input. This routine is just like
546 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
547 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
548 | floating-point exponent.
549 *----------------------------------------------------------------------------*/
551 static float64
552 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, uint64_t zSig STATUS_PARAM)
554 int8 shiftCount;
556 shiftCount = countLeadingZeros64( zSig ) - 1;
557 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
561 #ifdef FLOATX80
563 /*----------------------------------------------------------------------------
564 | Returns the fraction bits of the extended double-precision floating-point
565 | value `a'.
566 *----------------------------------------------------------------------------*/
568 INLINE uint64_t extractFloatx80Frac( floatx80 a )
571 return a.low;
575 /*----------------------------------------------------------------------------
576 | Returns the exponent bits of the extended double-precision floating-point
577 | value `a'.
578 *----------------------------------------------------------------------------*/
580 INLINE int32 extractFloatx80Exp( floatx80 a )
583 return a.high & 0x7FFF;
587 /*----------------------------------------------------------------------------
588 | Returns the sign bit of the extended double-precision floating-point value
589 | `a'.
590 *----------------------------------------------------------------------------*/
592 INLINE flag extractFloatx80Sign( floatx80 a )
595 return a.high>>15;
599 /*----------------------------------------------------------------------------
600 | Normalizes the subnormal extended double-precision floating-point value
601 | represented by the denormalized significand `aSig'. The normalized exponent
602 | and significand are stored at the locations pointed to by `zExpPtr' and
603 | `zSigPtr', respectively.
604 *----------------------------------------------------------------------------*/
606 static void
607 normalizeFloatx80Subnormal( uint64_t aSig, int32 *zExpPtr, uint64_t *zSigPtr )
609 int8 shiftCount;
611 shiftCount = countLeadingZeros64( aSig );
612 *zSigPtr = aSig<<shiftCount;
613 *zExpPtr = 1 - shiftCount;
617 /*----------------------------------------------------------------------------
618 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
619 | extended double-precision floating-point value, returning the result.
620 *----------------------------------------------------------------------------*/
622 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, uint64_t zSig )
624 floatx80 z;
626 z.low = zSig;
627 z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
628 return z;
632 /*----------------------------------------------------------------------------
633 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
634 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
635 | and returns the proper extended double-precision floating-point value
636 | corresponding to the abstract input. Ordinarily, the abstract value is
637 | rounded and packed into the extended double-precision format, with the
638 | inexact exception raised if the abstract input cannot be represented
639 | exactly. However, if the abstract value is too large, the overflow and
640 | inexact exceptions are raised and an infinity or maximal finite value is
641 | returned. If the abstract value is too small, the input value is rounded to
642 | a subnormal number, and the underflow and inexact exceptions are raised if
643 | the abstract input cannot be represented exactly as a subnormal extended
644 | double-precision floating-point number.
645 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
646 | number of bits as single or double precision, respectively. Otherwise, the
647 | result is rounded to the full precision of the extended double-precision
648 | format.
649 | The input significand must be normalized or smaller. If the input
650 | significand is not normalized, `zExp' must be 0; in that case, the result
651 | returned is a subnormal number, and it must not require rounding. The
652 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
653 | Floating-Point Arithmetic.
654 *----------------------------------------------------------------------------*/
656 static floatx80
657 roundAndPackFloatx80(
658 int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
659 STATUS_PARAM)
661 int8 roundingMode;
662 flag roundNearestEven, increment, isTiny;
663 int64 roundIncrement, roundMask, roundBits;
665 roundingMode = STATUS(float_rounding_mode);
666 roundNearestEven = ( roundingMode == float_round_nearest_even );
667 if ( roundingPrecision == 80 ) goto precision80;
668 if ( roundingPrecision == 64 ) {
669 roundIncrement = LIT64( 0x0000000000000400 );
670 roundMask = LIT64( 0x00000000000007FF );
672 else if ( roundingPrecision == 32 ) {
673 roundIncrement = LIT64( 0x0000008000000000 );
674 roundMask = LIT64( 0x000000FFFFFFFFFF );
676 else {
677 goto precision80;
679 zSig0 |= ( zSig1 != 0 );
680 if ( ! roundNearestEven ) {
681 if ( roundingMode == float_round_to_zero ) {
682 roundIncrement = 0;
684 else {
685 roundIncrement = roundMask;
686 if ( zSign ) {
687 if ( roundingMode == float_round_up ) roundIncrement = 0;
689 else {
690 if ( roundingMode == float_round_down ) roundIncrement = 0;
694 roundBits = zSig0 & roundMask;
695 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
696 if ( ( 0x7FFE < zExp )
697 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
699 goto overflow;
701 if ( zExp <= 0 ) {
702 if ( STATUS(flush_to_zero) ) return packFloatx80( zSign, 0, 0 );
703 isTiny =
704 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
705 || ( zExp < 0 )
706 || ( zSig0 <= zSig0 + roundIncrement );
707 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
708 zExp = 0;
709 roundBits = zSig0 & roundMask;
710 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
711 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
712 zSig0 += roundIncrement;
713 if ( (int64_t) zSig0 < 0 ) zExp = 1;
714 roundIncrement = roundMask + 1;
715 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
716 roundMask |= roundIncrement;
718 zSig0 &= ~ roundMask;
719 return packFloatx80( zSign, zExp, zSig0 );
722 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
723 zSig0 += roundIncrement;
724 if ( zSig0 < roundIncrement ) {
725 ++zExp;
726 zSig0 = LIT64( 0x8000000000000000 );
728 roundIncrement = roundMask + 1;
729 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
730 roundMask |= roundIncrement;
732 zSig0 &= ~ roundMask;
733 if ( zSig0 == 0 ) zExp = 0;
734 return packFloatx80( zSign, zExp, zSig0 );
735 precision80:
736 increment = ( (int64_t) zSig1 < 0 );
737 if ( ! roundNearestEven ) {
738 if ( roundingMode == float_round_to_zero ) {
739 increment = 0;
741 else {
742 if ( zSign ) {
743 increment = ( roundingMode == float_round_down ) && zSig1;
745 else {
746 increment = ( roundingMode == float_round_up ) && zSig1;
750 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
751 if ( ( 0x7FFE < zExp )
752 || ( ( zExp == 0x7FFE )
753 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
754 && increment
757 roundMask = 0;
758 overflow:
759 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
760 if ( ( roundingMode == float_round_to_zero )
761 || ( zSign && ( roundingMode == float_round_up ) )
762 || ( ! zSign && ( roundingMode == float_round_down ) )
764 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
766 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
768 if ( zExp <= 0 ) {
769 isTiny =
770 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
771 || ( zExp < 0 )
772 || ! increment
773 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
774 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
775 zExp = 0;
776 if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
777 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
778 if ( roundNearestEven ) {
779 increment = ( (int64_t) zSig1 < 0 );
781 else {
782 if ( zSign ) {
783 increment = ( roundingMode == float_round_down ) && zSig1;
785 else {
786 increment = ( roundingMode == float_round_up ) && zSig1;
789 if ( increment ) {
790 ++zSig0;
791 zSig0 &=
792 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
793 if ( (int64_t) zSig0 < 0 ) zExp = 1;
795 return packFloatx80( zSign, zExp, zSig0 );
798 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
799 if ( increment ) {
800 ++zSig0;
801 if ( zSig0 == 0 ) {
802 ++zExp;
803 zSig0 = LIT64( 0x8000000000000000 );
805 else {
806 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
809 else {
810 if ( zSig0 == 0 ) zExp = 0;
812 return packFloatx80( zSign, zExp, zSig0 );
816 /*----------------------------------------------------------------------------
817 | Takes an abstract floating-point value having sign `zSign', exponent
818 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
819 | and returns the proper extended double-precision floating-point value
820 | corresponding to the abstract input. This routine is just like
821 | `roundAndPackFloatx80' except that the input significand does not have to be
822 | normalized.
823 *----------------------------------------------------------------------------*/
825 static floatx80
826 normalizeRoundAndPackFloatx80(
827 int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
828 STATUS_PARAM)
830 int8 shiftCount;
832 if ( zSig0 == 0 ) {
833 zSig0 = zSig1;
834 zSig1 = 0;
835 zExp -= 64;
837 shiftCount = countLeadingZeros64( zSig0 );
838 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
839 zExp -= shiftCount;
840 return
841 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
845 #endif
847 #ifdef FLOAT128
849 /*----------------------------------------------------------------------------
850 | Returns the least-significant 64 fraction bits of the quadruple-precision
851 | floating-point value `a'.
852 *----------------------------------------------------------------------------*/
854 INLINE uint64_t extractFloat128Frac1( float128 a )
857 return a.low;
861 /*----------------------------------------------------------------------------
862 | Returns the most-significant 48 fraction bits of the quadruple-precision
863 | floating-point value `a'.
864 *----------------------------------------------------------------------------*/
866 INLINE uint64_t extractFloat128Frac0( float128 a )
869 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
873 /*----------------------------------------------------------------------------
874 | Returns the exponent bits of the quadruple-precision floating-point value
875 | `a'.
876 *----------------------------------------------------------------------------*/
878 INLINE int32 extractFloat128Exp( float128 a )
881 return ( a.high>>48 ) & 0x7FFF;
885 /*----------------------------------------------------------------------------
886 | Returns the sign bit of the quadruple-precision floating-point value `a'.
887 *----------------------------------------------------------------------------*/
889 INLINE flag extractFloat128Sign( float128 a )
892 return a.high>>63;
896 /*----------------------------------------------------------------------------
897 | Normalizes the subnormal quadruple-precision floating-point value
898 | represented by the denormalized significand formed by the concatenation of
899 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
900 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
901 | significand are stored at the location pointed to by `zSig0Ptr', and the
902 | least significant 64 bits of the normalized significand are stored at the
903 | location pointed to by `zSig1Ptr'.
904 *----------------------------------------------------------------------------*/
906 static void
907 normalizeFloat128Subnormal(
908 uint64_t aSig0,
909 uint64_t aSig1,
910 int32 *zExpPtr,
911 uint64_t *zSig0Ptr,
912 uint64_t *zSig1Ptr
915 int8 shiftCount;
917 if ( aSig0 == 0 ) {
918 shiftCount = countLeadingZeros64( aSig1 ) - 15;
919 if ( shiftCount < 0 ) {
920 *zSig0Ptr = aSig1>>( - shiftCount );
921 *zSig1Ptr = aSig1<<( shiftCount & 63 );
923 else {
924 *zSig0Ptr = aSig1<<shiftCount;
925 *zSig1Ptr = 0;
927 *zExpPtr = - shiftCount - 63;
929 else {
930 shiftCount = countLeadingZeros64( aSig0 ) - 15;
931 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
932 *zExpPtr = 1 - shiftCount;
937 /*----------------------------------------------------------------------------
938 | Packs the sign `zSign', the exponent `zExp', and the significand formed
939 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
940 | floating-point value, returning the result. After being shifted into the
941 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
942 | added together to form the most significant 32 bits of the result. This
943 | means that any integer portion of `zSig0' will be added into the exponent.
944 | Since a properly normalized significand will have an integer portion equal
945 | to 1, the `zExp' input should be 1 less than the desired result exponent
946 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
947 | significand.
948 *----------------------------------------------------------------------------*/
950 INLINE float128
951 packFloat128( flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 )
953 float128 z;
955 z.low = zSig1;
956 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
957 return z;
961 /*----------------------------------------------------------------------------
962 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
963 | and extended significand formed by the concatenation of `zSig0', `zSig1',
964 | and `zSig2', and returns the proper quadruple-precision floating-point value
965 | corresponding to the abstract input. Ordinarily, the abstract value is
966 | simply rounded and packed into the quadruple-precision format, with the
967 | inexact exception raised if the abstract input cannot be represented
968 | exactly. However, if the abstract value is too large, the overflow and
969 | inexact exceptions are raised and an infinity or maximal finite value is
970 | returned. If the abstract value is too small, the input value is rounded to
971 | a subnormal number, and the underflow and inexact exceptions are raised if
972 | the abstract input cannot be represented exactly as a subnormal quadruple-
973 | precision floating-point number.
974 | The input significand must be normalized or smaller. If the input
975 | significand is not normalized, `zExp' must be 0; in that case, the result
976 | returned is a subnormal number, and it must not require rounding. In the
977 | usual case that the input significand is normalized, `zExp' must be 1 less
978 | than the ``true'' floating-point exponent. The handling of underflow and
979 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
980 *----------------------------------------------------------------------------*/
982 static float128
983 roundAndPackFloat128(
984 flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1, uint64_t zSig2 STATUS_PARAM)
986 int8 roundingMode;
987 flag roundNearestEven, increment, isTiny;
989 roundingMode = STATUS(float_rounding_mode);
990 roundNearestEven = ( roundingMode == float_round_nearest_even );
991 increment = ( (int64_t) zSig2 < 0 );
992 if ( ! roundNearestEven ) {
993 if ( roundingMode == float_round_to_zero ) {
994 increment = 0;
996 else {
997 if ( zSign ) {
998 increment = ( roundingMode == float_round_down ) && zSig2;
1000 else {
1001 increment = ( roundingMode == float_round_up ) && zSig2;
1005 if ( 0x7FFD <= (uint32_t) zExp ) {
1006 if ( ( 0x7FFD < zExp )
1007 || ( ( zExp == 0x7FFD )
1008 && eq128(
1009 LIT64( 0x0001FFFFFFFFFFFF ),
1010 LIT64( 0xFFFFFFFFFFFFFFFF ),
1011 zSig0,
1012 zSig1
1014 && increment
1017 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
1018 if ( ( roundingMode == float_round_to_zero )
1019 || ( zSign && ( roundingMode == float_round_up ) )
1020 || ( ! zSign && ( roundingMode == float_round_down ) )
1022 return
1023 packFloat128(
1024 zSign,
1025 0x7FFE,
1026 LIT64( 0x0000FFFFFFFFFFFF ),
1027 LIT64( 0xFFFFFFFFFFFFFFFF )
1030 return packFloat128( zSign, 0x7FFF, 0, 0 );
1032 if ( zExp < 0 ) {
1033 if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
1034 isTiny =
1035 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
1036 || ( zExp < -1 )
1037 || ! increment
1038 || lt128(
1039 zSig0,
1040 zSig1,
1041 LIT64( 0x0001FFFFFFFFFFFF ),
1042 LIT64( 0xFFFFFFFFFFFFFFFF )
1044 shift128ExtraRightJamming(
1045 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1046 zExp = 0;
1047 if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
1048 if ( roundNearestEven ) {
1049 increment = ( (int64_t) zSig2 < 0 );
1051 else {
1052 if ( zSign ) {
1053 increment = ( roundingMode == float_round_down ) && zSig2;
1055 else {
1056 increment = ( roundingMode == float_round_up ) && zSig2;
1061 if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
1062 if ( increment ) {
1063 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1064 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1066 else {
1067 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1069 return packFloat128( zSign, zExp, zSig0, zSig1 );
1073 /*----------------------------------------------------------------------------
1074 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1075 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1076 | returns the proper quadruple-precision floating-point value corresponding
1077 | to the abstract input. This routine is just like `roundAndPackFloat128'
1078 | except that the input significand has fewer bits and does not have to be
1079 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1080 | point exponent.
1081 *----------------------------------------------------------------------------*/
1083 static float128
1084 normalizeRoundAndPackFloat128(
1085 flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 STATUS_PARAM)
1087 int8 shiftCount;
1088 uint64_t zSig2;
1090 if ( zSig0 == 0 ) {
1091 zSig0 = zSig1;
1092 zSig1 = 0;
1093 zExp -= 64;
1095 shiftCount = countLeadingZeros64( zSig0 ) - 15;
1096 if ( 0 <= shiftCount ) {
1097 zSig2 = 0;
1098 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1100 else {
1101 shift128ExtraRightJamming(
1102 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1104 zExp -= shiftCount;
1105 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1109 #endif
1111 /*----------------------------------------------------------------------------
1112 | Returns the result of converting the 32-bit two's complement integer `a'
1113 | to the single-precision floating-point format. The conversion is performed
1114 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1115 *----------------------------------------------------------------------------*/
1117 float32 int32_to_float32( int32 a STATUS_PARAM )
1119 flag zSign;
1121 if ( a == 0 ) return float32_zero;
1122 if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1123 zSign = ( a < 0 );
1124 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1128 /*----------------------------------------------------------------------------
1129 | Returns the result of converting the 32-bit two's complement integer `a'
1130 | to the double-precision floating-point format. The conversion is performed
1131 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1132 *----------------------------------------------------------------------------*/
1134 float64 int32_to_float64( int32 a STATUS_PARAM )
1136 flag zSign;
1137 uint32 absA;
1138 int8 shiftCount;
1139 uint64_t zSig;
1141 if ( a == 0 ) return float64_zero;
1142 zSign = ( a < 0 );
1143 absA = zSign ? - a : a;
1144 shiftCount = countLeadingZeros32( absA ) + 21;
1145 zSig = absA;
1146 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1150 #ifdef FLOATX80
1152 /*----------------------------------------------------------------------------
1153 | Returns the result of converting the 32-bit two's complement integer `a'
1154 | to the extended double-precision floating-point format. The conversion
1155 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1156 | Arithmetic.
1157 *----------------------------------------------------------------------------*/
1159 floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1161 flag zSign;
1162 uint32 absA;
1163 int8 shiftCount;
1164 uint64_t zSig;
1166 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1167 zSign = ( a < 0 );
1168 absA = zSign ? - a : a;
1169 shiftCount = countLeadingZeros32( absA ) + 32;
1170 zSig = absA;
1171 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1175 #endif
1177 #ifdef FLOAT128
1179 /*----------------------------------------------------------------------------
1180 | Returns the result of converting the 32-bit two's complement integer `a' to
1181 | the quadruple-precision floating-point format. The conversion is performed
1182 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1183 *----------------------------------------------------------------------------*/
1185 float128 int32_to_float128( int32 a STATUS_PARAM )
1187 flag zSign;
1188 uint32 absA;
1189 int8 shiftCount;
1190 uint64_t zSig0;
1192 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1193 zSign = ( a < 0 );
1194 absA = zSign ? - a : a;
1195 shiftCount = countLeadingZeros32( absA ) + 17;
1196 zSig0 = absA;
1197 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1201 #endif
1203 /*----------------------------------------------------------------------------
1204 | Returns the result of converting the 64-bit two's complement integer `a'
1205 | to the single-precision floating-point format. The conversion is performed
1206 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1207 *----------------------------------------------------------------------------*/
1209 float32 int64_to_float32( int64 a STATUS_PARAM )
1211 flag zSign;
1212 uint64 absA;
1213 int8 shiftCount;
1215 if ( a == 0 ) return float32_zero;
1216 zSign = ( a < 0 );
1217 absA = zSign ? - a : a;
1218 shiftCount = countLeadingZeros64( absA ) - 40;
1219 if ( 0 <= shiftCount ) {
1220 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1222 else {
1223 shiftCount += 7;
1224 if ( shiftCount < 0 ) {
1225 shift64RightJamming( absA, - shiftCount, &absA );
1227 else {
1228 absA <<= shiftCount;
1230 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1235 float32 uint64_to_float32( uint64 a STATUS_PARAM )
1237 int8 shiftCount;
1239 if ( a == 0 ) return float32_zero;
1240 shiftCount = countLeadingZeros64( a ) - 40;
1241 if ( 0 <= shiftCount ) {
1242 return packFloat32( 1 > 0, 0x95 - shiftCount, a<<shiftCount );
1244 else {
1245 shiftCount += 7;
1246 if ( shiftCount < 0 ) {
1247 shift64RightJamming( a, - shiftCount, &a );
1249 else {
1250 a <<= shiftCount;
1252 return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount, a STATUS_VAR );
1256 /*----------------------------------------------------------------------------
1257 | Returns the result of converting the 64-bit two's complement integer `a'
1258 | to the double-precision floating-point format. The conversion is performed
1259 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1260 *----------------------------------------------------------------------------*/
1262 float64 int64_to_float64( int64 a STATUS_PARAM )
1264 flag zSign;
1266 if ( a == 0 ) return float64_zero;
1267 if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
1268 return packFloat64( 1, 0x43E, 0 );
1270 zSign = ( a < 0 );
1271 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1275 float64 uint64_to_float64( uint64 a STATUS_PARAM )
1277 if ( a == 0 ) return float64_zero;
1278 return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR );
1282 #ifdef FLOATX80
1284 /*----------------------------------------------------------------------------
1285 | Returns the result of converting the 64-bit two's complement integer `a'
1286 | to the extended double-precision floating-point format. The conversion
1287 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1288 | Arithmetic.
1289 *----------------------------------------------------------------------------*/
1291 floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1293 flag zSign;
1294 uint64 absA;
1295 int8 shiftCount;
1297 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1298 zSign = ( a < 0 );
1299 absA = zSign ? - a : a;
1300 shiftCount = countLeadingZeros64( absA );
1301 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1305 #endif
1307 #ifdef FLOAT128
1309 /*----------------------------------------------------------------------------
1310 | Returns the result of converting the 64-bit two's complement integer `a' to
1311 | the quadruple-precision floating-point format. The conversion is performed
1312 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1313 *----------------------------------------------------------------------------*/
1315 float128 int64_to_float128( int64 a STATUS_PARAM )
1317 flag zSign;
1318 uint64 absA;
1319 int8 shiftCount;
1320 int32 zExp;
1321 uint64_t zSig0, zSig1;
1323 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1324 zSign = ( a < 0 );
1325 absA = zSign ? - a : a;
1326 shiftCount = countLeadingZeros64( absA ) + 49;
1327 zExp = 0x406E - shiftCount;
1328 if ( 64 <= shiftCount ) {
1329 zSig1 = 0;
1330 zSig0 = absA;
1331 shiftCount -= 64;
1333 else {
1334 zSig1 = absA;
1335 zSig0 = 0;
1337 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1338 return packFloat128( zSign, zExp, zSig0, zSig1 );
1342 #endif
1344 /*----------------------------------------------------------------------------
1345 | Returns the result of converting the single-precision floating-point value
1346 | `a' to the 32-bit two's complement integer format. The conversion is
1347 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1348 | Arithmetic---which means in particular that the conversion is rounded
1349 | according to the current rounding mode. If `a' is a NaN, the largest
1350 | positive integer is returned. Otherwise, if the conversion overflows, the
1351 | largest integer with the same sign as `a' is returned.
1352 *----------------------------------------------------------------------------*/
1354 int32 float32_to_int32( float32 a STATUS_PARAM )
1356 flag aSign;
1357 int16 aExp, shiftCount;
1358 uint32_t aSig;
1359 uint64_t aSig64;
1361 a = float32_squash_input_denormal(a STATUS_VAR);
1362 aSig = extractFloat32Frac( a );
1363 aExp = extractFloat32Exp( a );
1364 aSign = extractFloat32Sign( a );
1365 if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1366 if ( aExp ) aSig |= 0x00800000;
1367 shiftCount = 0xAF - aExp;
1368 aSig64 = aSig;
1369 aSig64 <<= 32;
1370 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1371 return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1375 /*----------------------------------------------------------------------------
1376 | Returns the result of converting the single-precision floating-point value
1377 | `a' to the 32-bit two's complement integer format. The conversion is
1378 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1379 | Arithmetic, except that the conversion is always rounded toward zero.
1380 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1381 | the conversion overflows, the largest integer with the same sign as `a' is
1382 | returned.
1383 *----------------------------------------------------------------------------*/
1385 int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1387 flag aSign;
1388 int16 aExp, shiftCount;
1389 uint32_t aSig;
1390 int32 z;
1391 a = float32_squash_input_denormal(a STATUS_VAR);
1393 aSig = extractFloat32Frac( a );
1394 aExp = extractFloat32Exp( a );
1395 aSign = extractFloat32Sign( a );
1396 shiftCount = aExp - 0x9E;
1397 if ( 0 <= shiftCount ) {
1398 if ( float32_val(a) != 0xCF000000 ) {
1399 float_raise( float_flag_invalid STATUS_VAR);
1400 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1402 return (int32_t) 0x80000000;
1404 else if ( aExp <= 0x7E ) {
1405 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1406 return 0;
1408 aSig = ( aSig | 0x00800000 )<<8;
1409 z = aSig>>( - shiftCount );
1410 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1411 STATUS(float_exception_flags) |= float_flag_inexact;
1413 if ( aSign ) z = - z;
1414 return z;
1418 /*----------------------------------------------------------------------------
1419 | Returns the result of converting the single-precision floating-point value
1420 | `a' to the 16-bit two's complement integer format. The conversion is
1421 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1422 | Arithmetic, except that the conversion is always rounded toward zero.
1423 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1424 | the conversion overflows, the largest integer with the same sign as `a' is
1425 | returned.
1426 *----------------------------------------------------------------------------*/
1428 int16 float32_to_int16_round_to_zero( float32 a STATUS_PARAM )
1430 flag aSign;
1431 int16 aExp, shiftCount;
1432 uint32_t aSig;
1433 int32 z;
1435 aSig = extractFloat32Frac( a );
1436 aExp = extractFloat32Exp( a );
1437 aSign = extractFloat32Sign( a );
1438 shiftCount = aExp - 0x8E;
1439 if ( 0 <= shiftCount ) {
1440 if ( float32_val(a) != 0xC7000000 ) {
1441 float_raise( float_flag_invalid STATUS_VAR);
1442 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1443 return 0x7FFF;
1446 return (int32_t) 0xffff8000;
1448 else if ( aExp <= 0x7E ) {
1449 if ( aExp | aSig ) {
1450 STATUS(float_exception_flags) |= float_flag_inexact;
1452 return 0;
1454 shiftCount -= 0x10;
1455 aSig = ( aSig | 0x00800000 )<<8;
1456 z = aSig>>( - shiftCount );
1457 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1458 STATUS(float_exception_flags) |= float_flag_inexact;
1460 if ( aSign ) {
1461 z = - z;
1463 return z;
1467 /*----------------------------------------------------------------------------
1468 | Returns the result of converting the single-precision floating-point value
1469 | `a' to the 64-bit two's complement integer format. The conversion is
1470 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1471 | Arithmetic---which means in particular that the conversion is rounded
1472 | according to the current rounding mode. If `a' is a NaN, the largest
1473 | positive integer is returned. Otherwise, if the conversion overflows, the
1474 | largest integer with the same sign as `a' is returned.
1475 *----------------------------------------------------------------------------*/
1477 int64 float32_to_int64( float32 a STATUS_PARAM )
1479 flag aSign;
1480 int16 aExp, shiftCount;
1481 uint32_t aSig;
1482 uint64_t aSig64, aSigExtra;
1483 a = float32_squash_input_denormal(a STATUS_VAR);
1485 aSig = extractFloat32Frac( a );
1486 aExp = extractFloat32Exp( a );
1487 aSign = extractFloat32Sign( a );
1488 shiftCount = 0xBE - aExp;
1489 if ( shiftCount < 0 ) {
1490 float_raise( float_flag_invalid STATUS_VAR);
1491 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1492 return LIT64( 0x7FFFFFFFFFFFFFFF );
1494 return (int64_t) LIT64( 0x8000000000000000 );
1496 if ( aExp ) aSig |= 0x00800000;
1497 aSig64 = aSig;
1498 aSig64 <<= 40;
1499 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1500 return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1504 /*----------------------------------------------------------------------------
1505 | Returns the result of converting the single-precision floating-point value
1506 | `a' to the 64-bit two's complement integer format. The conversion is
1507 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1508 | Arithmetic, except that the conversion is always rounded toward zero. If
1509 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1510 | conversion overflows, the largest integer with the same sign as `a' is
1511 | returned.
1512 *----------------------------------------------------------------------------*/
1514 int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1516 flag aSign;
1517 int16 aExp, shiftCount;
1518 uint32_t aSig;
1519 uint64_t aSig64;
1520 int64 z;
1521 a = float32_squash_input_denormal(a STATUS_VAR);
1523 aSig = extractFloat32Frac( a );
1524 aExp = extractFloat32Exp( a );
1525 aSign = extractFloat32Sign( a );
1526 shiftCount = aExp - 0xBE;
1527 if ( 0 <= shiftCount ) {
1528 if ( float32_val(a) != 0xDF000000 ) {
1529 float_raise( float_flag_invalid STATUS_VAR);
1530 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1531 return LIT64( 0x7FFFFFFFFFFFFFFF );
1534 return (int64_t) LIT64( 0x8000000000000000 );
1536 else if ( aExp <= 0x7E ) {
1537 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1538 return 0;
1540 aSig64 = aSig | 0x00800000;
1541 aSig64 <<= 40;
1542 z = aSig64>>( - shiftCount );
1543 if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
1544 STATUS(float_exception_flags) |= float_flag_inexact;
1546 if ( aSign ) z = - z;
1547 return z;
1551 /*----------------------------------------------------------------------------
1552 | Returns the result of converting the single-precision floating-point value
1553 | `a' to the double-precision floating-point format. The conversion is
1554 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1555 | Arithmetic.
1556 *----------------------------------------------------------------------------*/
1558 float64 float32_to_float64( float32 a STATUS_PARAM )
1560 flag aSign;
1561 int16 aExp;
1562 uint32_t aSig;
1563 a = float32_squash_input_denormal(a STATUS_VAR);
1565 aSig = extractFloat32Frac( a );
1566 aExp = extractFloat32Exp( a );
1567 aSign = extractFloat32Sign( a );
1568 if ( aExp == 0xFF ) {
1569 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1570 return packFloat64( aSign, 0x7FF, 0 );
1572 if ( aExp == 0 ) {
1573 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1574 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1575 --aExp;
1577 return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
1581 #ifdef FLOATX80
1583 /*----------------------------------------------------------------------------
1584 | Returns the result of converting the single-precision floating-point value
1585 | `a' to the extended double-precision floating-point format. The conversion
1586 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1587 | Arithmetic.
1588 *----------------------------------------------------------------------------*/
1590 floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1592 flag aSign;
1593 int16 aExp;
1594 uint32_t aSig;
1596 a = float32_squash_input_denormal(a STATUS_VAR);
1597 aSig = extractFloat32Frac( a );
1598 aExp = extractFloat32Exp( a );
1599 aSign = extractFloat32Sign( a );
1600 if ( aExp == 0xFF ) {
1601 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1602 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1604 if ( aExp == 0 ) {
1605 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1606 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1608 aSig |= 0x00800000;
1609 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
1613 #endif
1615 #ifdef FLOAT128
1617 /*----------------------------------------------------------------------------
1618 | Returns the result of converting the single-precision floating-point value
1619 | `a' to the double-precision floating-point format. The conversion is
1620 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1621 | Arithmetic.
1622 *----------------------------------------------------------------------------*/
1624 float128 float32_to_float128( float32 a STATUS_PARAM )
1626 flag aSign;
1627 int16 aExp;
1628 uint32_t aSig;
1630 a = float32_squash_input_denormal(a STATUS_VAR);
1631 aSig = extractFloat32Frac( a );
1632 aExp = extractFloat32Exp( a );
1633 aSign = extractFloat32Sign( a );
1634 if ( aExp == 0xFF ) {
1635 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1636 return packFloat128( aSign, 0x7FFF, 0, 0 );
1638 if ( aExp == 0 ) {
1639 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1640 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1641 --aExp;
1643 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
1647 #endif
1649 /*----------------------------------------------------------------------------
1650 | Rounds the single-precision floating-point value `a' to an integer, and
1651 | returns the result as a single-precision floating-point value. The
1652 | operation is performed according to the IEC/IEEE Standard for Binary
1653 | Floating-Point Arithmetic.
1654 *----------------------------------------------------------------------------*/
1656 float32 float32_round_to_int( float32 a STATUS_PARAM)
1658 flag aSign;
1659 int16 aExp;
1660 uint32_t lastBitMask, roundBitsMask;
1661 int8 roundingMode;
1662 uint32_t z;
1663 a = float32_squash_input_denormal(a STATUS_VAR);
1665 aExp = extractFloat32Exp( a );
1666 if ( 0x96 <= aExp ) {
1667 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1668 return propagateFloat32NaN( a, a STATUS_VAR );
1670 return a;
1672 if ( aExp <= 0x7E ) {
1673 if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
1674 STATUS(float_exception_flags) |= float_flag_inexact;
1675 aSign = extractFloat32Sign( a );
1676 switch ( STATUS(float_rounding_mode) ) {
1677 case float_round_nearest_even:
1678 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1679 return packFloat32( aSign, 0x7F, 0 );
1681 break;
1682 case float_round_down:
1683 return make_float32(aSign ? 0xBF800000 : 0);
1684 case float_round_up:
1685 return make_float32(aSign ? 0x80000000 : 0x3F800000);
1687 return packFloat32( aSign, 0, 0 );
1689 lastBitMask = 1;
1690 lastBitMask <<= 0x96 - aExp;
1691 roundBitsMask = lastBitMask - 1;
1692 z = float32_val(a);
1693 roundingMode = STATUS(float_rounding_mode);
1694 if ( roundingMode == float_round_nearest_even ) {
1695 z += lastBitMask>>1;
1696 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1698 else if ( roundingMode != float_round_to_zero ) {
1699 if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
1700 z += roundBitsMask;
1703 z &= ~ roundBitsMask;
1704 if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1705 return make_float32(z);
1709 /*----------------------------------------------------------------------------
1710 | Returns the result of adding the absolute values of the single-precision
1711 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1712 | before being returned. `zSign' is ignored if the result is a NaN.
1713 | The addition is performed according to the IEC/IEEE Standard for Binary
1714 | Floating-Point Arithmetic.
1715 *----------------------------------------------------------------------------*/
1717 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1719 int16 aExp, bExp, zExp;
1720 uint32_t aSig, bSig, zSig;
1721 int16 expDiff;
1723 aSig = extractFloat32Frac( a );
1724 aExp = extractFloat32Exp( a );
1725 bSig = extractFloat32Frac( b );
1726 bExp = extractFloat32Exp( b );
1727 expDiff = aExp - bExp;
1728 aSig <<= 6;
1729 bSig <<= 6;
1730 if ( 0 < expDiff ) {
1731 if ( aExp == 0xFF ) {
1732 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1733 return a;
1735 if ( bExp == 0 ) {
1736 --expDiff;
1738 else {
1739 bSig |= 0x20000000;
1741 shift32RightJamming( bSig, expDiff, &bSig );
1742 zExp = aExp;
1744 else if ( expDiff < 0 ) {
1745 if ( bExp == 0xFF ) {
1746 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1747 return packFloat32( zSign, 0xFF, 0 );
1749 if ( aExp == 0 ) {
1750 ++expDiff;
1752 else {
1753 aSig |= 0x20000000;
1755 shift32RightJamming( aSig, - expDiff, &aSig );
1756 zExp = bExp;
1758 else {
1759 if ( aExp == 0xFF ) {
1760 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1761 return a;
1763 if ( aExp == 0 ) {
1764 if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
1765 return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1767 zSig = 0x40000000 + aSig + bSig;
1768 zExp = aExp;
1769 goto roundAndPack;
1771 aSig |= 0x20000000;
1772 zSig = ( aSig + bSig )<<1;
1773 --zExp;
1774 if ( (int32_t) zSig < 0 ) {
1775 zSig = aSig + bSig;
1776 ++zExp;
1778 roundAndPack:
1779 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1783 /*----------------------------------------------------------------------------
1784 | Returns the result of subtracting the absolute values of the single-
1785 | precision floating-point values `a' and `b'. If `zSign' is 1, the
1786 | difference is negated before being returned. `zSign' is ignored if the
1787 | result is a NaN. The subtraction is performed according to the IEC/IEEE
1788 | Standard for Binary Floating-Point Arithmetic.
1789 *----------------------------------------------------------------------------*/
1791 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1793 int16 aExp, bExp, zExp;
1794 uint32_t aSig, bSig, zSig;
1795 int16 expDiff;
1797 aSig = extractFloat32Frac( a );
1798 aExp = extractFloat32Exp( a );
1799 bSig = extractFloat32Frac( b );
1800 bExp = extractFloat32Exp( b );
1801 expDiff = aExp - bExp;
1802 aSig <<= 7;
1803 bSig <<= 7;
1804 if ( 0 < expDiff ) goto aExpBigger;
1805 if ( expDiff < 0 ) goto bExpBigger;
1806 if ( aExp == 0xFF ) {
1807 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1808 float_raise( float_flag_invalid STATUS_VAR);
1809 return float32_default_nan;
1811 if ( aExp == 0 ) {
1812 aExp = 1;
1813 bExp = 1;
1815 if ( bSig < aSig ) goto aBigger;
1816 if ( aSig < bSig ) goto bBigger;
1817 return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1818 bExpBigger:
1819 if ( bExp == 0xFF ) {
1820 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1821 return packFloat32( zSign ^ 1, 0xFF, 0 );
1823 if ( aExp == 0 ) {
1824 ++expDiff;
1826 else {
1827 aSig |= 0x40000000;
1829 shift32RightJamming( aSig, - expDiff, &aSig );
1830 bSig |= 0x40000000;
1831 bBigger:
1832 zSig = bSig - aSig;
1833 zExp = bExp;
1834 zSign ^= 1;
1835 goto normalizeRoundAndPack;
1836 aExpBigger:
1837 if ( aExp == 0xFF ) {
1838 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1839 return a;
1841 if ( bExp == 0 ) {
1842 --expDiff;
1844 else {
1845 bSig |= 0x40000000;
1847 shift32RightJamming( bSig, expDiff, &bSig );
1848 aSig |= 0x40000000;
1849 aBigger:
1850 zSig = aSig - bSig;
1851 zExp = aExp;
1852 normalizeRoundAndPack:
1853 --zExp;
1854 return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1858 /*----------------------------------------------------------------------------
1859 | Returns the result of adding the single-precision floating-point values `a'
1860 | and `b'. The operation is performed according to the IEC/IEEE Standard for
1861 | Binary Floating-Point Arithmetic.
1862 *----------------------------------------------------------------------------*/
1864 float32 float32_add( float32 a, float32 b STATUS_PARAM )
1866 flag aSign, bSign;
1867 a = float32_squash_input_denormal(a STATUS_VAR);
1868 b = float32_squash_input_denormal(b STATUS_VAR);
1870 aSign = extractFloat32Sign( a );
1871 bSign = extractFloat32Sign( b );
1872 if ( aSign == bSign ) {
1873 return addFloat32Sigs( a, b, aSign STATUS_VAR);
1875 else {
1876 return subFloat32Sigs( a, b, aSign STATUS_VAR );
1881 /*----------------------------------------------------------------------------
1882 | Returns the result of subtracting the single-precision floating-point values
1883 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1884 | for Binary Floating-Point Arithmetic.
1885 *----------------------------------------------------------------------------*/
1887 float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1889 flag aSign, bSign;
1890 a = float32_squash_input_denormal(a STATUS_VAR);
1891 b = float32_squash_input_denormal(b STATUS_VAR);
1893 aSign = extractFloat32Sign( a );
1894 bSign = extractFloat32Sign( b );
1895 if ( aSign == bSign ) {
1896 return subFloat32Sigs( a, b, aSign STATUS_VAR );
1898 else {
1899 return addFloat32Sigs( a, b, aSign STATUS_VAR );
1904 /*----------------------------------------------------------------------------
1905 | Returns the result of multiplying the single-precision floating-point values
1906 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1907 | for Binary Floating-Point Arithmetic.
1908 *----------------------------------------------------------------------------*/
1910 float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1912 flag aSign, bSign, zSign;
1913 int16 aExp, bExp, zExp;
1914 uint32_t aSig, bSig;
1915 uint64_t zSig64;
1916 uint32_t zSig;
1918 a = float32_squash_input_denormal(a STATUS_VAR);
1919 b = float32_squash_input_denormal(b STATUS_VAR);
1921 aSig = extractFloat32Frac( a );
1922 aExp = extractFloat32Exp( a );
1923 aSign = extractFloat32Sign( a );
1924 bSig = extractFloat32Frac( b );
1925 bExp = extractFloat32Exp( b );
1926 bSign = extractFloat32Sign( b );
1927 zSign = aSign ^ bSign;
1928 if ( aExp == 0xFF ) {
1929 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1930 return propagateFloat32NaN( a, b STATUS_VAR );
1932 if ( ( bExp | bSig ) == 0 ) {
1933 float_raise( float_flag_invalid STATUS_VAR);
1934 return float32_default_nan;
1936 return packFloat32( zSign, 0xFF, 0 );
1938 if ( bExp == 0xFF ) {
1939 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1940 if ( ( aExp | aSig ) == 0 ) {
1941 float_raise( float_flag_invalid STATUS_VAR);
1942 return float32_default_nan;
1944 return packFloat32( zSign, 0xFF, 0 );
1946 if ( aExp == 0 ) {
1947 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1948 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1950 if ( bExp == 0 ) {
1951 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1952 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1954 zExp = aExp + bExp - 0x7F;
1955 aSig = ( aSig | 0x00800000 )<<7;
1956 bSig = ( bSig | 0x00800000 )<<8;
1957 shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
1958 zSig = zSig64;
1959 if ( 0 <= (int32_t) ( zSig<<1 ) ) {
1960 zSig <<= 1;
1961 --zExp;
1963 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1967 /*----------------------------------------------------------------------------
1968 | Returns the result of dividing the single-precision floating-point value `a'
1969 | by the corresponding value `b'. The operation is performed according to the
1970 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1971 *----------------------------------------------------------------------------*/
1973 float32 float32_div( float32 a, float32 b STATUS_PARAM )
1975 flag aSign, bSign, zSign;
1976 int16 aExp, bExp, zExp;
1977 uint32_t aSig, bSig, zSig;
1978 a = float32_squash_input_denormal(a STATUS_VAR);
1979 b = float32_squash_input_denormal(b STATUS_VAR);
1981 aSig = extractFloat32Frac( a );
1982 aExp = extractFloat32Exp( a );
1983 aSign = extractFloat32Sign( a );
1984 bSig = extractFloat32Frac( b );
1985 bExp = extractFloat32Exp( b );
1986 bSign = extractFloat32Sign( b );
1987 zSign = aSign ^ bSign;
1988 if ( aExp == 0xFF ) {
1989 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1990 if ( bExp == 0xFF ) {
1991 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1992 float_raise( float_flag_invalid STATUS_VAR);
1993 return float32_default_nan;
1995 return packFloat32( zSign, 0xFF, 0 );
1997 if ( bExp == 0xFF ) {
1998 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1999 return packFloat32( zSign, 0, 0 );
2001 if ( bExp == 0 ) {
2002 if ( bSig == 0 ) {
2003 if ( ( aExp | aSig ) == 0 ) {
2004 float_raise( float_flag_invalid STATUS_VAR);
2005 return float32_default_nan;
2007 float_raise( float_flag_divbyzero STATUS_VAR);
2008 return packFloat32( zSign, 0xFF, 0 );
2010 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2012 if ( aExp == 0 ) {
2013 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2014 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2016 zExp = aExp - bExp + 0x7D;
2017 aSig = ( aSig | 0x00800000 )<<7;
2018 bSig = ( bSig | 0x00800000 )<<8;
2019 if ( bSig <= ( aSig + aSig ) ) {
2020 aSig >>= 1;
2021 ++zExp;
2023 zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
2024 if ( ( zSig & 0x3F ) == 0 ) {
2025 zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
2027 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2031 /*----------------------------------------------------------------------------
2032 | Returns the remainder of the single-precision floating-point value `a'
2033 | with respect to the corresponding value `b'. The operation is performed
2034 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2035 *----------------------------------------------------------------------------*/
2037 float32 float32_rem( float32 a, float32 b STATUS_PARAM )
2039 flag aSign, zSign;
2040 int16 aExp, bExp, expDiff;
2041 uint32_t aSig, bSig;
2042 uint32_t q;
2043 uint64_t aSig64, bSig64, q64;
2044 uint32_t alternateASig;
2045 int32_t sigMean;
2046 a = float32_squash_input_denormal(a STATUS_VAR);
2047 b = float32_squash_input_denormal(b STATUS_VAR);
2049 aSig = extractFloat32Frac( a );
2050 aExp = extractFloat32Exp( a );
2051 aSign = extractFloat32Sign( a );
2052 bSig = extractFloat32Frac( b );
2053 bExp = extractFloat32Exp( b );
2054 if ( aExp == 0xFF ) {
2055 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2056 return propagateFloat32NaN( a, b STATUS_VAR );
2058 float_raise( float_flag_invalid STATUS_VAR);
2059 return float32_default_nan;
2061 if ( bExp == 0xFF ) {
2062 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2063 return a;
2065 if ( bExp == 0 ) {
2066 if ( bSig == 0 ) {
2067 float_raise( float_flag_invalid STATUS_VAR);
2068 return float32_default_nan;
2070 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2072 if ( aExp == 0 ) {
2073 if ( aSig == 0 ) return a;
2074 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2076 expDiff = aExp - bExp;
2077 aSig |= 0x00800000;
2078 bSig |= 0x00800000;
2079 if ( expDiff < 32 ) {
2080 aSig <<= 8;
2081 bSig <<= 8;
2082 if ( expDiff < 0 ) {
2083 if ( expDiff < -1 ) return a;
2084 aSig >>= 1;
2086 q = ( bSig <= aSig );
2087 if ( q ) aSig -= bSig;
2088 if ( 0 < expDiff ) {
2089 q = ( ( (uint64_t) aSig )<<32 ) / bSig;
2090 q >>= 32 - expDiff;
2091 bSig >>= 2;
2092 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2094 else {
2095 aSig >>= 2;
2096 bSig >>= 2;
2099 else {
2100 if ( bSig <= aSig ) aSig -= bSig;
2101 aSig64 = ( (uint64_t) aSig )<<40;
2102 bSig64 = ( (uint64_t) bSig )<<40;
2103 expDiff -= 64;
2104 while ( 0 < expDiff ) {
2105 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2106 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2107 aSig64 = - ( ( bSig * q64 )<<38 );
2108 expDiff -= 62;
2110 expDiff += 64;
2111 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2112 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2113 q = q64>>( 64 - expDiff );
2114 bSig <<= 6;
2115 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2117 do {
2118 alternateASig = aSig;
2119 ++q;
2120 aSig -= bSig;
2121 } while ( 0 <= (int32_t) aSig );
2122 sigMean = aSig + alternateASig;
2123 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2124 aSig = alternateASig;
2126 zSign = ( (int32_t) aSig < 0 );
2127 if ( zSign ) aSig = - aSig;
2128 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2132 /*----------------------------------------------------------------------------
2133 | Returns the square root of the single-precision floating-point value `a'.
2134 | The operation is performed according to the IEC/IEEE Standard for Binary
2135 | Floating-Point Arithmetic.
2136 *----------------------------------------------------------------------------*/
2138 float32 float32_sqrt( float32 a STATUS_PARAM )
2140 flag aSign;
2141 int16 aExp, zExp;
2142 uint32_t aSig, zSig;
2143 uint64_t rem, term;
2144 a = float32_squash_input_denormal(a STATUS_VAR);
2146 aSig = extractFloat32Frac( a );
2147 aExp = extractFloat32Exp( a );
2148 aSign = extractFloat32Sign( a );
2149 if ( aExp == 0xFF ) {
2150 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2151 if ( ! aSign ) return a;
2152 float_raise( float_flag_invalid STATUS_VAR);
2153 return float32_default_nan;
2155 if ( aSign ) {
2156 if ( ( aExp | aSig ) == 0 ) return a;
2157 float_raise( float_flag_invalid STATUS_VAR);
2158 return float32_default_nan;
2160 if ( aExp == 0 ) {
2161 if ( aSig == 0 ) return float32_zero;
2162 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2164 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2165 aSig = ( aSig | 0x00800000 )<<8;
2166 zSig = estimateSqrt32( aExp, aSig ) + 2;
2167 if ( ( zSig & 0x7F ) <= 5 ) {
2168 if ( zSig < 2 ) {
2169 zSig = 0x7FFFFFFF;
2170 goto roundAndPack;
2172 aSig >>= aExp & 1;
2173 term = ( (uint64_t) zSig ) * zSig;
2174 rem = ( ( (uint64_t) aSig )<<32 ) - term;
2175 while ( (int64_t) rem < 0 ) {
2176 --zSig;
2177 rem += ( ( (uint64_t) zSig )<<1 ) | 1;
2179 zSig |= ( rem != 0 );
2181 shift32RightJamming( zSig, 1, &zSig );
2182 roundAndPack:
2183 return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2187 /*----------------------------------------------------------------------------
2188 | Returns the binary exponential of the single-precision floating-point value
2189 | `a'. The operation is performed according to the IEC/IEEE Standard for
2190 | Binary Floating-Point Arithmetic.
2192 | Uses the following identities:
2194 | 1. -------------------------------------------------------------------------
2195 | x x*ln(2)
2196 | 2 = e
2198 | 2. -------------------------------------------------------------------------
2199 | 2 3 4 5 n
2200 | x x x x x x x
2201 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2202 | 1! 2! 3! 4! 5! n!
2203 *----------------------------------------------------------------------------*/
2205 static const float64 float32_exp2_coefficients[15] =
2207 const_float64( 0x3ff0000000000000ll ), /* 1 */
2208 const_float64( 0x3fe0000000000000ll ), /* 2 */
2209 const_float64( 0x3fc5555555555555ll ), /* 3 */
2210 const_float64( 0x3fa5555555555555ll ), /* 4 */
2211 const_float64( 0x3f81111111111111ll ), /* 5 */
2212 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */
2213 const_float64( 0x3f2a01a01a01a01all ), /* 7 */
2214 const_float64( 0x3efa01a01a01a01all ), /* 8 */
2215 const_float64( 0x3ec71de3a556c734ll ), /* 9 */
2216 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2217 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2218 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2219 const_float64( 0x3de6124613a86d09ll ), /* 13 */
2220 const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2221 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2224 float32 float32_exp2( float32 a STATUS_PARAM )
2226 flag aSign;
2227 int16 aExp;
2228 uint32_t aSig;
2229 float64 r, x, xn;
2230 int i;
2231 a = float32_squash_input_denormal(a STATUS_VAR);
2233 aSig = extractFloat32Frac( a );
2234 aExp = extractFloat32Exp( a );
2235 aSign = extractFloat32Sign( a );
2237 if ( aExp == 0xFF) {
2238 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2239 return (aSign) ? float32_zero : a;
2241 if (aExp == 0) {
2242 if (aSig == 0) return float32_one;
2245 float_raise( float_flag_inexact STATUS_VAR);
2247 /* ******************************* */
2248 /* using float64 for approximation */
2249 /* ******************************* */
2250 x = float32_to_float64(a STATUS_VAR);
2251 x = float64_mul(x, float64_ln2 STATUS_VAR);
2253 xn = x;
2254 r = float64_one;
2255 for (i = 0 ; i < 15 ; i++) {
2256 float64 f;
2258 f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);
2259 r = float64_add(r, f STATUS_VAR);
2261 xn = float64_mul(xn, x STATUS_VAR);
2264 return float64_to_float32(r, status);
2267 /*----------------------------------------------------------------------------
2268 | Returns the binary log of the single-precision floating-point value `a'.
2269 | The operation is performed according to the IEC/IEEE Standard for Binary
2270 | Floating-Point Arithmetic.
2271 *----------------------------------------------------------------------------*/
2272 float32 float32_log2( float32 a STATUS_PARAM )
2274 flag aSign, zSign;
2275 int16 aExp;
2276 uint32_t aSig, zSig, i;
2278 a = float32_squash_input_denormal(a STATUS_VAR);
2279 aSig = extractFloat32Frac( a );
2280 aExp = extractFloat32Exp( a );
2281 aSign = extractFloat32Sign( a );
2283 if ( aExp == 0 ) {
2284 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2285 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2287 if ( aSign ) {
2288 float_raise( float_flag_invalid STATUS_VAR);
2289 return float32_default_nan;
2291 if ( aExp == 0xFF ) {
2292 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2293 return a;
2296 aExp -= 0x7F;
2297 aSig |= 0x00800000;
2298 zSign = aExp < 0;
2299 zSig = aExp << 23;
2301 for (i = 1 << 22; i > 0; i >>= 1) {
2302 aSig = ( (uint64_t)aSig * aSig ) >> 23;
2303 if ( aSig & 0x01000000 ) {
2304 aSig >>= 1;
2305 zSig |= i;
2309 if ( zSign )
2310 zSig = -zSig;
2312 return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2315 /*----------------------------------------------------------------------------
2316 | Returns 1 if the single-precision floating-point value `a' is equal to
2317 | the corresponding value `b', and 0 otherwise. The comparison is performed
2318 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2319 *----------------------------------------------------------------------------*/
2321 int float32_eq( float32 a, float32 b STATUS_PARAM )
2323 a = float32_squash_input_denormal(a STATUS_VAR);
2324 b = float32_squash_input_denormal(b STATUS_VAR);
2326 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2327 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2329 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2330 float_raise( float_flag_invalid STATUS_VAR);
2332 return 0;
2334 return ( float32_val(a) == float32_val(b) ) ||
2335 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2339 /*----------------------------------------------------------------------------
2340 | Returns 1 if the single-precision floating-point value `a' is less than
2341 | or equal to the corresponding value `b', and 0 otherwise. The comparison
2342 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2343 | Arithmetic.
2344 *----------------------------------------------------------------------------*/
2346 int float32_le( float32 a, float32 b STATUS_PARAM )
2348 flag aSign, bSign;
2349 uint32_t av, bv;
2350 a = float32_squash_input_denormal(a STATUS_VAR);
2351 b = float32_squash_input_denormal(b STATUS_VAR);
2353 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2354 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2356 float_raise( float_flag_invalid STATUS_VAR);
2357 return 0;
2359 aSign = extractFloat32Sign( a );
2360 bSign = extractFloat32Sign( b );
2361 av = float32_val(a);
2362 bv = float32_val(b);
2363 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2364 return ( av == bv ) || ( aSign ^ ( av < bv ) );
2368 /*----------------------------------------------------------------------------
2369 | Returns 1 if the single-precision floating-point value `a' is less than
2370 | the corresponding value `b', and 0 otherwise. The comparison is performed
2371 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2372 *----------------------------------------------------------------------------*/
2374 int float32_lt( float32 a, float32 b STATUS_PARAM )
2376 flag aSign, bSign;
2377 uint32_t av, bv;
2378 a = float32_squash_input_denormal(a STATUS_VAR);
2379 b = float32_squash_input_denormal(b STATUS_VAR);
2381 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2382 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2384 float_raise( float_flag_invalid STATUS_VAR);
2385 return 0;
2387 aSign = extractFloat32Sign( a );
2388 bSign = extractFloat32Sign( b );
2389 av = float32_val(a);
2390 bv = float32_val(b);
2391 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2392 return ( av != bv ) && ( aSign ^ ( av < bv ) );
2396 /*----------------------------------------------------------------------------
2397 | Returns 1 if the single-precision floating-point value `a' is equal to
2398 | the corresponding value `b', and 0 otherwise. The invalid exception is
2399 | raised if either operand is a NaN. Otherwise, the comparison is performed
2400 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2401 *----------------------------------------------------------------------------*/
2403 int float32_eq_signaling( float32 a, float32 b STATUS_PARAM )
2405 uint32_t av, bv;
2406 a = float32_squash_input_denormal(a STATUS_VAR);
2407 b = float32_squash_input_denormal(b STATUS_VAR);
2409 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2410 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2412 float_raise( float_flag_invalid STATUS_VAR);
2413 return 0;
2415 av = float32_val(a);
2416 bv = float32_val(b);
2417 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2421 /*----------------------------------------------------------------------------
2422 | Returns 1 if the single-precision floating-point value `a' is less than or
2423 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2424 | cause an exception. Otherwise, the comparison is performed according to the
2425 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2426 *----------------------------------------------------------------------------*/
2428 int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2430 flag aSign, bSign;
2431 uint32_t av, bv;
2432 a = float32_squash_input_denormal(a STATUS_VAR);
2433 b = float32_squash_input_denormal(b STATUS_VAR);
2435 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2436 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2438 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2439 float_raise( float_flag_invalid STATUS_VAR);
2441 return 0;
2443 aSign = extractFloat32Sign( a );
2444 bSign = extractFloat32Sign( b );
2445 av = float32_val(a);
2446 bv = float32_val(b);
2447 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2448 return ( av == bv ) || ( aSign ^ ( av < bv ) );
2452 /*----------------------------------------------------------------------------
2453 | Returns 1 if the single-precision floating-point value `a' is less than
2454 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2455 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
2456 | Standard for Binary Floating-Point Arithmetic.
2457 *----------------------------------------------------------------------------*/
2459 int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2461 flag aSign, bSign;
2462 uint32_t av, bv;
2463 a = float32_squash_input_denormal(a STATUS_VAR);
2464 b = float32_squash_input_denormal(b STATUS_VAR);
2466 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2467 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2469 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2470 float_raise( float_flag_invalid STATUS_VAR);
2472 return 0;
2474 aSign = extractFloat32Sign( a );
2475 bSign = extractFloat32Sign( b );
2476 av = float32_val(a);
2477 bv = float32_val(b);
2478 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2479 return ( av != bv ) && ( aSign ^ ( av < bv ) );
2483 /*----------------------------------------------------------------------------
2484 | Returns the result of converting the double-precision floating-point value
2485 | `a' to the 32-bit two's complement integer format. The conversion is
2486 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2487 | Arithmetic---which means in particular that the conversion is rounded
2488 | according to the current rounding mode. If `a' is a NaN, the largest
2489 | positive integer is returned. Otherwise, if the conversion overflows, the
2490 | largest integer with the same sign as `a' is returned.
2491 *----------------------------------------------------------------------------*/
2493 int32 float64_to_int32( float64 a STATUS_PARAM )
2495 flag aSign;
2496 int16 aExp, shiftCount;
2497 uint64_t aSig;
2498 a = float64_squash_input_denormal(a STATUS_VAR);
2500 aSig = extractFloat64Frac( a );
2501 aExp = extractFloat64Exp( a );
2502 aSign = extractFloat64Sign( a );
2503 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2504 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2505 shiftCount = 0x42C - aExp;
2506 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2507 return roundAndPackInt32( aSign, aSig STATUS_VAR );
2511 /*----------------------------------------------------------------------------
2512 | Returns the result of converting the double-precision floating-point value
2513 | `a' to the 32-bit two's complement integer format. The conversion is
2514 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2515 | Arithmetic, except that the conversion is always rounded toward zero.
2516 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2517 | the conversion overflows, the largest integer with the same sign as `a' is
2518 | returned.
2519 *----------------------------------------------------------------------------*/
2521 int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2523 flag aSign;
2524 int16 aExp, shiftCount;
2525 uint64_t aSig, savedASig;
2526 int32 z;
2527 a = float64_squash_input_denormal(a STATUS_VAR);
2529 aSig = extractFloat64Frac( a );
2530 aExp = extractFloat64Exp( a );
2531 aSign = extractFloat64Sign( a );
2532 if ( 0x41E < aExp ) {
2533 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2534 goto invalid;
2536 else if ( aExp < 0x3FF ) {
2537 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2538 return 0;
2540 aSig |= LIT64( 0x0010000000000000 );
2541 shiftCount = 0x433 - aExp;
2542 savedASig = aSig;
2543 aSig >>= shiftCount;
2544 z = aSig;
2545 if ( aSign ) z = - z;
2546 if ( ( z < 0 ) ^ aSign ) {
2547 invalid:
2548 float_raise( float_flag_invalid STATUS_VAR);
2549 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2551 if ( ( aSig<<shiftCount ) != savedASig ) {
2552 STATUS(float_exception_flags) |= float_flag_inexact;
2554 return z;
2558 /*----------------------------------------------------------------------------
2559 | Returns the result of converting the double-precision floating-point value
2560 | `a' to the 16-bit two's complement integer format. The conversion is
2561 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2562 | Arithmetic, except that the conversion is always rounded toward zero.
2563 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2564 | the conversion overflows, the largest integer with the same sign as `a' is
2565 | returned.
2566 *----------------------------------------------------------------------------*/
2568 int16 float64_to_int16_round_to_zero( float64 a STATUS_PARAM )
2570 flag aSign;
2571 int16 aExp, shiftCount;
2572 uint64_t aSig, savedASig;
2573 int32 z;
2575 aSig = extractFloat64Frac( a );
2576 aExp = extractFloat64Exp( a );
2577 aSign = extractFloat64Sign( a );
2578 if ( 0x40E < aExp ) {
2579 if ( ( aExp == 0x7FF ) && aSig ) {
2580 aSign = 0;
2582 goto invalid;
2584 else if ( aExp < 0x3FF ) {
2585 if ( aExp || aSig ) {
2586 STATUS(float_exception_flags) |= float_flag_inexact;
2588 return 0;
2590 aSig |= LIT64( 0x0010000000000000 );
2591 shiftCount = 0x433 - aExp;
2592 savedASig = aSig;
2593 aSig >>= shiftCount;
2594 z = aSig;
2595 if ( aSign ) {
2596 z = - z;
2598 if ( ( (int16_t)z < 0 ) ^ aSign ) {
2599 invalid:
2600 float_raise( float_flag_invalid STATUS_VAR);
2601 return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
2603 if ( ( aSig<<shiftCount ) != savedASig ) {
2604 STATUS(float_exception_flags) |= float_flag_inexact;
2606 return z;
2609 /*----------------------------------------------------------------------------
2610 | Returns the result of converting the double-precision floating-point value
2611 | `a' to the 64-bit two's complement integer format. The conversion is
2612 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2613 | Arithmetic---which means in particular that the conversion is rounded
2614 | according to the current rounding mode. If `a' is a NaN, the largest
2615 | positive integer is returned. Otherwise, if the conversion overflows, the
2616 | largest integer with the same sign as `a' is returned.
2617 *----------------------------------------------------------------------------*/
2619 int64 float64_to_int64( float64 a STATUS_PARAM )
2621 flag aSign;
2622 int16 aExp, shiftCount;
2623 uint64_t aSig, aSigExtra;
2624 a = float64_squash_input_denormal(a STATUS_VAR);
2626 aSig = extractFloat64Frac( a );
2627 aExp = extractFloat64Exp( a );
2628 aSign = extractFloat64Sign( a );
2629 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2630 shiftCount = 0x433 - aExp;
2631 if ( shiftCount <= 0 ) {
2632 if ( 0x43E < aExp ) {
2633 float_raise( float_flag_invalid STATUS_VAR);
2634 if ( ! aSign
2635 || ( ( aExp == 0x7FF )
2636 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2638 return LIT64( 0x7FFFFFFFFFFFFFFF );
2640 return (int64_t) LIT64( 0x8000000000000000 );
2642 aSigExtra = 0;
2643 aSig <<= - shiftCount;
2645 else {
2646 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2648 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2652 /*----------------------------------------------------------------------------
2653 | Returns the result of converting the double-precision floating-point value
2654 | `a' to the 64-bit two's complement integer format. The conversion is
2655 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2656 | Arithmetic, except that the conversion is always rounded toward zero.
2657 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2658 | the conversion overflows, the largest integer with the same sign as `a' is
2659 | returned.
2660 *----------------------------------------------------------------------------*/
2662 int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2664 flag aSign;
2665 int16 aExp, shiftCount;
2666 uint64_t aSig;
2667 int64 z;
2668 a = float64_squash_input_denormal(a STATUS_VAR);
2670 aSig = extractFloat64Frac( a );
2671 aExp = extractFloat64Exp( a );
2672 aSign = extractFloat64Sign( a );
2673 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2674 shiftCount = aExp - 0x433;
2675 if ( 0 <= shiftCount ) {
2676 if ( 0x43E <= aExp ) {
2677 if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
2678 float_raise( float_flag_invalid STATUS_VAR);
2679 if ( ! aSign
2680 || ( ( aExp == 0x7FF )
2681 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2683 return LIT64( 0x7FFFFFFFFFFFFFFF );
2686 return (int64_t) LIT64( 0x8000000000000000 );
2688 z = aSig<<shiftCount;
2690 else {
2691 if ( aExp < 0x3FE ) {
2692 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2693 return 0;
2695 z = aSig>>( - shiftCount );
2696 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
2697 STATUS(float_exception_flags) |= float_flag_inexact;
2700 if ( aSign ) z = - z;
2701 return z;
2705 /*----------------------------------------------------------------------------
2706 | Returns the result of converting the double-precision floating-point value
2707 | `a' to the single-precision floating-point format. The conversion is
2708 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2709 | Arithmetic.
2710 *----------------------------------------------------------------------------*/
2712 float32 float64_to_float32( float64 a STATUS_PARAM )
2714 flag aSign;
2715 int16 aExp;
2716 uint64_t aSig;
2717 uint32_t zSig;
2718 a = float64_squash_input_denormal(a STATUS_VAR);
2720 aSig = extractFloat64Frac( a );
2721 aExp = extractFloat64Exp( a );
2722 aSign = extractFloat64Sign( a );
2723 if ( aExp == 0x7FF ) {
2724 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2725 return packFloat32( aSign, 0xFF, 0 );
2727 shift64RightJamming( aSig, 22, &aSig );
2728 zSig = aSig;
2729 if ( aExp || zSig ) {
2730 zSig |= 0x40000000;
2731 aExp -= 0x381;
2733 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2738 /*----------------------------------------------------------------------------
2739 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2740 | half-precision floating-point value, returning the result. After being
2741 | shifted into the proper positions, the three fields are simply added
2742 | together to form the result. This means that any integer portion of `zSig'
2743 | will be added into the exponent. Since a properly normalized significand
2744 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2745 | than the desired result exponent whenever `zSig' is a complete, normalized
2746 | significand.
2747 *----------------------------------------------------------------------------*/
2748 static float16 packFloat16(flag zSign, int16 zExp, uint16_t zSig)
2750 return make_float16(
2751 (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
2754 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
2755 The latter gains extra exponent range by omitting the NaN/Inf encodings. */
2757 float32 float16_to_float32(float16 a, flag ieee STATUS_PARAM)
2759 flag aSign;
2760 int16 aExp;
2761 uint32_t aSig;
2763 aSign = extractFloat16Sign(a);
2764 aExp = extractFloat16Exp(a);
2765 aSig = extractFloat16Frac(a);
2767 if (aExp == 0x1f && ieee) {
2768 if (aSig) {
2769 return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
2771 return packFloat32(aSign, 0xff, aSig << 13);
2773 if (aExp == 0) {
2774 int8 shiftCount;
2776 if (aSig == 0) {
2777 return packFloat32(aSign, 0, 0);
2780 shiftCount = countLeadingZeros32( aSig ) - 21;
2781 aSig = aSig << shiftCount;
2782 aExp = -shiftCount;
2784 return packFloat32( aSign, aExp + 0x70, aSig << 13);
2787 float16 float32_to_float16(float32 a, flag ieee STATUS_PARAM)
2789 flag aSign;
2790 int16 aExp;
2791 uint32_t aSig;
2792 uint32_t mask;
2793 uint32_t increment;
2794 int8 roundingMode;
2795 a = float32_squash_input_denormal(a STATUS_VAR);
2797 aSig = extractFloat32Frac( a );
2798 aExp = extractFloat32Exp( a );
2799 aSign = extractFloat32Sign( a );
2800 if ( aExp == 0xFF ) {
2801 if (aSig) {
2802 /* Input is a NaN */
2803 float16 r = commonNaNToFloat16( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2804 if (!ieee) {
2805 return packFloat16(aSign, 0, 0);
2807 return r;
2809 /* Infinity */
2810 if (!ieee) {
2811 float_raise(float_flag_invalid STATUS_VAR);
2812 return packFloat16(aSign, 0x1f, 0x3ff);
2814 return packFloat16(aSign, 0x1f, 0);
2816 if (aExp == 0 && aSig == 0) {
2817 return packFloat16(aSign, 0, 0);
2819 /* Decimal point between bits 22 and 23. */
2820 aSig |= 0x00800000;
2821 aExp -= 0x7f;
2822 if (aExp < -14) {
2823 mask = 0x00ffffff;
2824 if (aExp >= -24) {
2825 mask >>= 25 + aExp;
2827 } else {
2828 mask = 0x00001fff;
2830 if (aSig & mask) {
2831 float_raise( float_flag_underflow STATUS_VAR );
2832 roundingMode = STATUS(float_rounding_mode);
2833 switch (roundingMode) {
2834 case float_round_nearest_even:
2835 increment = (mask + 1) >> 1;
2836 if ((aSig & mask) == increment) {
2837 increment = aSig & (increment << 1);
2839 break;
2840 case float_round_up:
2841 increment = aSign ? 0 : mask;
2842 break;
2843 case float_round_down:
2844 increment = aSign ? mask : 0;
2845 break;
2846 default: /* round_to_zero */
2847 increment = 0;
2848 break;
2850 aSig += increment;
2851 if (aSig >= 0x01000000) {
2852 aSig >>= 1;
2853 aExp++;
2855 } else if (aExp < -14
2856 && STATUS(float_detect_tininess) == float_tininess_before_rounding) {
2857 float_raise( float_flag_underflow STATUS_VAR);
2860 if (ieee) {
2861 if (aExp > 15) {
2862 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
2863 return packFloat16(aSign, 0x1f, 0);
2865 } else {
2866 if (aExp > 16) {
2867 float_raise(float_flag_invalid | float_flag_inexact STATUS_VAR);
2868 return packFloat16(aSign, 0x1f, 0x3ff);
2871 if (aExp < -24) {
2872 return packFloat16(aSign, 0, 0);
2874 if (aExp < -14) {
2875 aSig >>= -14 - aExp;
2876 aExp = -14;
2878 return packFloat16(aSign, aExp + 14, aSig >> 13);
2881 #ifdef FLOATX80
2883 /*----------------------------------------------------------------------------
2884 | Returns the result of converting the double-precision floating-point value
2885 | `a' to the extended double-precision floating-point format. The conversion
2886 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2887 | Arithmetic.
2888 *----------------------------------------------------------------------------*/
2890 floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
2892 flag aSign;
2893 int16 aExp;
2894 uint64_t aSig;
2896 a = float64_squash_input_denormal(a STATUS_VAR);
2897 aSig = extractFloat64Frac( a );
2898 aExp = extractFloat64Exp( a );
2899 aSign = extractFloat64Sign( a );
2900 if ( aExp == 0x7FF ) {
2901 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2902 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2904 if ( aExp == 0 ) {
2905 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2906 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2908 return
2909 packFloatx80(
2910 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2914 #endif
2916 #ifdef FLOAT128
2918 /*----------------------------------------------------------------------------
2919 | Returns the result of converting the double-precision floating-point value
2920 | `a' to the quadruple-precision floating-point format. The conversion is
2921 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2922 | Arithmetic.
2923 *----------------------------------------------------------------------------*/
2925 float128 float64_to_float128( float64 a STATUS_PARAM )
2927 flag aSign;
2928 int16 aExp;
2929 uint64_t aSig, zSig0, zSig1;
2931 a = float64_squash_input_denormal(a STATUS_VAR);
2932 aSig = extractFloat64Frac( a );
2933 aExp = extractFloat64Exp( a );
2934 aSign = extractFloat64Sign( a );
2935 if ( aExp == 0x7FF ) {
2936 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2937 return packFloat128( aSign, 0x7FFF, 0, 0 );
2939 if ( aExp == 0 ) {
2940 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2941 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2942 --aExp;
2944 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2945 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2949 #endif
2951 /*----------------------------------------------------------------------------
2952 | Rounds the double-precision floating-point value `a' to an integer, and
2953 | returns the result as a double-precision floating-point value. The
2954 | operation is performed according to the IEC/IEEE Standard for Binary
2955 | Floating-Point Arithmetic.
2956 *----------------------------------------------------------------------------*/
2958 float64 float64_round_to_int( float64 a STATUS_PARAM )
2960 flag aSign;
2961 int16 aExp;
2962 uint64_t lastBitMask, roundBitsMask;
2963 int8 roundingMode;
2964 uint64_t z;
2965 a = float64_squash_input_denormal(a STATUS_VAR);
2967 aExp = extractFloat64Exp( a );
2968 if ( 0x433 <= aExp ) {
2969 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2970 return propagateFloat64NaN( a, a STATUS_VAR );
2972 return a;
2974 if ( aExp < 0x3FF ) {
2975 if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
2976 STATUS(float_exception_flags) |= float_flag_inexact;
2977 aSign = extractFloat64Sign( a );
2978 switch ( STATUS(float_rounding_mode) ) {
2979 case float_round_nearest_even:
2980 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2981 return packFloat64( aSign, 0x3FF, 0 );
2983 break;
2984 case float_round_down:
2985 return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
2986 case float_round_up:
2987 return make_float64(
2988 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
2990 return packFloat64( aSign, 0, 0 );
2992 lastBitMask = 1;
2993 lastBitMask <<= 0x433 - aExp;
2994 roundBitsMask = lastBitMask - 1;
2995 z = float64_val(a);
2996 roundingMode = STATUS(float_rounding_mode);
2997 if ( roundingMode == float_round_nearest_even ) {
2998 z += lastBitMask>>1;
2999 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
3001 else if ( roundingMode != float_round_to_zero ) {
3002 if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
3003 z += roundBitsMask;
3006 z &= ~ roundBitsMask;
3007 if ( z != float64_val(a) )
3008 STATUS(float_exception_flags) |= float_flag_inexact;
3009 return make_float64(z);
3013 float64 float64_trunc_to_int( float64 a STATUS_PARAM)
3015 int oldmode;
3016 float64 res;
3017 oldmode = STATUS(float_rounding_mode);
3018 STATUS(float_rounding_mode) = float_round_to_zero;
3019 res = float64_round_to_int(a STATUS_VAR);
3020 STATUS(float_rounding_mode) = oldmode;
3021 return res;
3024 /*----------------------------------------------------------------------------
3025 | Returns the result of adding the absolute values of the double-precision
3026 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
3027 | before being returned. `zSign' is ignored if the result is a NaN.
3028 | The addition is performed according to the IEC/IEEE Standard for Binary
3029 | Floating-Point Arithmetic.
3030 *----------------------------------------------------------------------------*/
3032 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3034 int16 aExp, bExp, zExp;
3035 uint64_t aSig, bSig, zSig;
3036 int16 expDiff;
3038 aSig = extractFloat64Frac( a );
3039 aExp = extractFloat64Exp( a );
3040 bSig = extractFloat64Frac( b );
3041 bExp = extractFloat64Exp( b );
3042 expDiff = aExp - bExp;
3043 aSig <<= 9;
3044 bSig <<= 9;
3045 if ( 0 < expDiff ) {
3046 if ( aExp == 0x7FF ) {
3047 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3048 return a;
3050 if ( bExp == 0 ) {
3051 --expDiff;
3053 else {
3054 bSig |= LIT64( 0x2000000000000000 );
3056 shift64RightJamming( bSig, expDiff, &bSig );
3057 zExp = aExp;
3059 else if ( expDiff < 0 ) {
3060 if ( bExp == 0x7FF ) {
3061 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3062 return packFloat64( zSign, 0x7FF, 0 );
3064 if ( aExp == 0 ) {
3065 ++expDiff;
3067 else {
3068 aSig |= LIT64( 0x2000000000000000 );
3070 shift64RightJamming( aSig, - expDiff, &aSig );
3071 zExp = bExp;
3073 else {
3074 if ( aExp == 0x7FF ) {
3075 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3076 return a;
3078 if ( aExp == 0 ) {
3079 if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
3080 return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3082 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3083 zExp = aExp;
3084 goto roundAndPack;
3086 aSig |= LIT64( 0x2000000000000000 );
3087 zSig = ( aSig + bSig )<<1;
3088 --zExp;
3089 if ( (int64_t) zSig < 0 ) {
3090 zSig = aSig + bSig;
3091 ++zExp;
3093 roundAndPack:
3094 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3098 /*----------------------------------------------------------------------------
3099 | Returns the result of subtracting the absolute values of the double-
3100 | precision floating-point values `a' and `b'. If `zSign' is 1, the
3101 | difference is negated before being returned. `zSign' is ignored if the
3102 | result is a NaN. The subtraction is performed according to the IEC/IEEE
3103 | Standard for Binary Floating-Point Arithmetic.
3104 *----------------------------------------------------------------------------*/
3106 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3108 int16 aExp, bExp, zExp;
3109 uint64_t aSig, bSig, zSig;
3110 int16 expDiff;
3112 aSig = extractFloat64Frac( a );
3113 aExp = extractFloat64Exp( a );
3114 bSig = extractFloat64Frac( b );
3115 bExp = extractFloat64Exp( b );
3116 expDiff = aExp - bExp;
3117 aSig <<= 10;
3118 bSig <<= 10;
3119 if ( 0 < expDiff ) goto aExpBigger;
3120 if ( expDiff < 0 ) goto bExpBigger;
3121 if ( aExp == 0x7FF ) {
3122 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3123 float_raise( float_flag_invalid STATUS_VAR);
3124 return float64_default_nan;
3126 if ( aExp == 0 ) {
3127 aExp = 1;
3128 bExp = 1;
3130 if ( bSig < aSig ) goto aBigger;
3131 if ( aSig < bSig ) goto bBigger;
3132 return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3133 bExpBigger:
3134 if ( bExp == 0x7FF ) {
3135 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3136 return packFloat64( zSign ^ 1, 0x7FF, 0 );
3138 if ( aExp == 0 ) {
3139 ++expDiff;
3141 else {
3142 aSig |= LIT64( 0x4000000000000000 );
3144 shift64RightJamming( aSig, - expDiff, &aSig );
3145 bSig |= LIT64( 0x4000000000000000 );
3146 bBigger:
3147 zSig = bSig - aSig;
3148 zExp = bExp;
3149 zSign ^= 1;
3150 goto normalizeRoundAndPack;
3151 aExpBigger:
3152 if ( aExp == 0x7FF ) {
3153 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3154 return a;
3156 if ( bExp == 0 ) {
3157 --expDiff;
3159 else {
3160 bSig |= LIT64( 0x4000000000000000 );
3162 shift64RightJamming( bSig, expDiff, &bSig );
3163 aSig |= LIT64( 0x4000000000000000 );
3164 aBigger:
3165 zSig = aSig - bSig;
3166 zExp = aExp;
3167 normalizeRoundAndPack:
3168 --zExp;
3169 return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3173 /*----------------------------------------------------------------------------
3174 | Returns the result of adding the double-precision floating-point values `a'
3175 | and `b'. The operation is performed according to the IEC/IEEE Standard for
3176 | Binary Floating-Point Arithmetic.
3177 *----------------------------------------------------------------------------*/
3179 float64 float64_add( float64 a, float64 b STATUS_PARAM )
3181 flag aSign, bSign;
3182 a = float64_squash_input_denormal(a STATUS_VAR);
3183 b = float64_squash_input_denormal(b STATUS_VAR);
3185 aSign = extractFloat64Sign( a );
3186 bSign = extractFloat64Sign( b );
3187 if ( aSign == bSign ) {
3188 return addFloat64Sigs( a, b, aSign STATUS_VAR );
3190 else {
3191 return subFloat64Sigs( a, b, aSign STATUS_VAR );
3196 /*----------------------------------------------------------------------------
3197 | Returns the result of subtracting the double-precision floating-point values
3198 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3199 | for Binary Floating-Point Arithmetic.
3200 *----------------------------------------------------------------------------*/
3202 float64 float64_sub( float64 a, float64 b STATUS_PARAM )
3204 flag aSign, bSign;
3205 a = float64_squash_input_denormal(a STATUS_VAR);
3206 b = float64_squash_input_denormal(b STATUS_VAR);
3208 aSign = extractFloat64Sign( a );
3209 bSign = extractFloat64Sign( b );
3210 if ( aSign == bSign ) {
3211 return subFloat64Sigs( a, b, aSign STATUS_VAR );
3213 else {
3214 return addFloat64Sigs( a, b, aSign STATUS_VAR );
3219 /*----------------------------------------------------------------------------
3220 | Returns the result of multiplying the double-precision floating-point values
3221 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3222 | for Binary Floating-Point Arithmetic.
3223 *----------------------------------------------------------------------------*/
3225 float64 float64_mul( float64 a, float64 b STATUS_PARAM )
3227 flag aSign, bSign, zSign;
3228 int16 aExp, bExp, zExp;
3229 uint64_t aSig, bSig, zSig0, zSig1;
3231 a = float64_squash_input_denormal(a STATUS_VAR);
3232 b = float64_squash_input_denormal(b STATUS_VAR);
3234 aSig = extractFloat64Frac( a );
3235 aExp = extractFloat64Exp( a );
3236 aSign = extractFloat64Sign( a );
3237 bSig = extractFloat64Frac( b );
3238 bExp = extractFloat64Exp( b );
3239 bSign = extractFloat64Sign( b );
3240 zSign = aSign ^ bSign;
3241 if ( aExp == 0x7FF ) {
3242 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3243 return propagateFloat64NaN( a, b STATUS_VAR );
3245 if ( ( bExp | bSig ) == 0 ) {
3246 float_raise( float_flag_invalid STATUS_VAR);
3247 return float64_default_nan;
3249 return packFloat64( zSign, 0x7FF, 0 );
3251 if ( bExp == 0x7FF ) {
3252 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3253 if ( ( aExp | aSig ) == 0 ) {
3254 float_raise( float_flag_invalid STATUS_VAR);
3255 return float64_default_nan;
3257 return packFloat64( zSign, 0x7FF, 0 );
3259 if ( aExp == 0 ) {
3260 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3261 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3263 if ( bExp == 0 ) {
3264 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
3265 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3267 zExp = aExp + bExp - 0x3FF;
3268 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3269 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3270 mul64To128( aSig, bSig, &zSig0, &zSig1 );
3271 zSig0 |= ( zSig1 != 0 );
3272 if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
3273 zSig0 <<= 1;
3274 --zExp;
3276 return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
3280 /*----------------------------------------------------------------------------
3281 | Returns the result of dividing the double-precision floating-point value `a'
3282 | by the corresponding value `b'. The operation is performed according to
3283 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3284 *----------------------------------------------------------------------------*/
3286 float64 float64_div( float64 a, float64 b STATUS_PARAM )
3288 flag aSign, bSign, zSign;
3289 int16 aExp, bExp, zExp;
3290 uint64_t aSig, bSig, zSig;
3291 uint64_t rem0, rem1;
3292 uint64_t term0, term1;
3293 a = float64_squash_input_denormal(a STATUS_VAR);
3294 b = float64_squash_input_denormal(b STATUS_VAR);
3296 aSig = extractFloat64Frac( a );
3297 aExp = extractFloat64Exp( a );
3298 aSign = extractFloat64Sign( a );
3299 bSig = extractFloat64Frac( b );
3300 bExp = extractFloat64Exp( b );
3301 bSign = extractFloat64Sign( b );
3302 zSign = aSign ^ bSign;
3303 if ( aExp == 0x7FF ) {
3304 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3305 if ( bExp == 0x7FF ) {
3306 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3307 float_raise( float_flag_invalid STATUS_VAR);
3308 return float64_default_nan;
3310 return packFloat64( zSign, 0x7FF, 0 );
3312 if ( bExp == 0x7FF ) {
3313 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3314 return packFloat64( zSign, 0, 0 );
3316 if ( bExp == 0 ) {
3317 if ( bSig == 0 ) {
3318 if ( ( aExp | aSig ) == 0 ) {
3319 float_raise( float_flag_invalid STATUS_VAR);
3320 return float64_default_nan;
3322 float_raise( float_flag_divbyzero STATUS_VAR);
3323 return packFloat64( zSign, 0x7FF, 0 );
3325 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3327 if ( aExp == 0 ) {
3328 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3329 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3331 zExp = aExp - bExp + 0x3FD;
3332 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3333 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3334 if ( bSig <= ( aSig + aSig ) ) {
3335 aSig >>= 1;
3336 ++zExp;
3338 zSig = estimateDiv128To64( aSig, 0, bSig );
3339 if ( ( zSig & 0x1FF ) <= 2 ) {
3340 mul64To128( bSig, zSig, &term0, &term1 );
3341 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3342 while ( (int64_t) rem0 < 0 ) {
3343 --zSig;
3344 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3346 zSig |= ( rem1 != 0 );
3348 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3352 /*----------------------------------------------------------------------------
3353 | Returns the remainder of the double-precision floating-point value `a'
3354 | with respect to the corresponding value `b'. The operation is performed
3355 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3356 *----------------------------------------------------------------------------*/
3358 float64 float64_rem( float64 a, float64 b STATUS_PARAM )
3360 flag aSign, zSign;
3361 int16 aExp, bExp, expDiff;
3362 uint64_t aSig, bSig;
3363 uint64_t q, alternateASig;
3364 int64_t sigMean;
3366 a = float64_squash_input_denormal(a STATUS_VAR);
3367 b = float64_squash_input_denormal(b STATUS_VAR);
3368 aSig = extractFloat64Frac( a );
3369 aExp = extractFloat64Exp( a );
3370 aSign = extractFloat64Sign( a );
3371 bSig = extractFloat64Frac( b );
3372 bExp = extractFloat64Exp( b );
3373 if ( aExp == 0x7FF ) {
3374 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3375 return propagateFloat64NaN( a, b STATUS_VAR );
3377 float_raise( float_flag_invalid STATUS_VAR);
3378 return float64_default_nan;
3380 if ( bExp == 0x7FF ) {
3381 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3382 return a;
3384 if ( bExp == 0 ) {
3385 if ( bSig == 0 ) {
3386 float_raise( float_flag_invalid STATUS_VAR);
3387 return float64_default_nan;
3389 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3391 if ( aExp == 0 ) {
3392 if ( aSig == 0 ) return a;
3393 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3395 expDiff = aExp - bExp;
3396 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3397 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3398 if ( expDiff < 0 ) {
3399 if ( expDiff < -1 ) return a;
3400 aSig >>= 1;
3402 q = ( bSig <= aSig );
3403 if ( q ) aSig -= bSig;
3404 expDiff -= 64;
3405 while ( 0 < expDiff ) {
3406 q = estimateDiv128To64( aSig, 0, bSig );
3407 q = ( 2 < q ) ? q - 2 : 0;
3408 aSig = - ( ( bSig>>2 ) * q );
3409 expDiff -= 62;
3411 expDiff += 64;
3412 if ( 0 < expDiff ) {
3413 q = estimateDiv128To64( aSig, 0, bSig );
3414 q = ( 2 < q ) ? q - 2 : 0;
3415 q >>= 64 - expDiff;
3416 bSig >>= 2;
3417 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3419 else {
3420 aSig >>= 2;
3421 bSig >>= 2;
3423 do {
3424 alternateASig = aSig;
3425 ++q;
3426 aSig -= bSig;
3427 } while ( 0 <= (int64_t) aSig );
3428 sigMean = aSig + alternateASig;
3429 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3430 aSig = alternateASig;
3432 zSign = ( (int64_t) aSig < 0 );
3433 if ( zSign ) aSig = - aSig;
3434 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
3438 /*----------------------------------------------------------------------------
3439 | Returns the square root of the double-precision floating-point value `a'.
3440 | The operation is performed according to the IEC/IEEE Standard for Binary
3441 | Floating-Point Arithmetic.
3442 *----------------------------------------------------------------------------*/
3444 float64 float64_sqrt( float64 a STATUS_PARAM )
3446 flag aSign;
3447 int16 aExp, zExp;
3448 uint64_t aSig, zSig, doubleZSig;
3449 uint64_t rem0, rem1, term0, term1;
3450 a = float64_squash_input_denormal(a STATUS_VAR);
3452 aSig = extractFloat64Frac( a );
3453 aExp = extractFloat64Exp( a );
3454 aSign = extractFloat64Sign( a );
3455 if ( aExp == 0x7FF ) {
3456 if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
3457 if ( ! aSign ) return a;
3458 float_raise( float_flag_invalid STATUS_VAR);
3459 return float64_default_nan;
3461 if ( aSign ) {
3462 if ( ( aExp | aSig ) == 0 ) return a;
3463 float_raise( float_flag_invalid STATUS_VAR);
3464 return float64_default_nan;
3466 if ( aExp == 0 ) {
3467 if ( aSig == 0 ) return float64_zero;
3468 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3470 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3471 aSig |= LIT64( 0x0010000000000000 );
3472 zSig = estimateSqrt32( aExp, aSig>>21 );
3473 aSig <<= 9 - ( aExp & 1 );
3474 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3475 if ( ( zSig & 0x1FF ) <= 5 ) {
3476 doubleZSig = zSig<<1;
3477 mul64To128( zSig, zSig, &term0, &term1 );
3478 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3479 while ( (int64_t) rem0 < 0 ) {
3480 --zSig;
3481 doubleZSig -= 2;
3482 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3484 zSig |= ( ( rem0 | rem1 ) != 0 );
3486 return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
3490 /*----------------------------------------------------------------------------
3491 | Returns the binary log of the double-precision floating-point value `a'.
3492 | The operation is performed according to the IEC/IEEE Standard for Binary
3493 | Floating-Point Arithmetic.
3494 *----------------------------------------------------------------------------*/
3495 float64 float64_log2( float64 a STATUS_PARAM )
3497 flag aSign, zSign;
3498 int16 aExp;
3499 uint64_t aSig, aSig0, aSig1, zSig, i;
3500 a = float64_squash_input_denormal(a STATUS_VAR);
3502 aSig = extractFloat64Frac( a );
3503 aExp = extractFloat64Exp( a );
3504 aSign = extractFloat64Sign( a );
3506 if ( aExp == 0 ) {
3507 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
3508 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3510 if ( aSign ) {
3511 float_raise( float_flag_invalid STATUS_VAR);
3512 return float64_default_nan;
3514 if ( aExp == 0x7FF ) {
3515 if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
3516 return a;
3519 aExp -= 0x3FF;
3520 aSig |= LIT64( 0x0010000000000000 );
3521 zSign = aExp < 0;
3522 zSig = (uint64_t)aExp << 52;
3523 for (i = 1LL << 51; i > 0; i >>= 1) {
3524 mul64To128( aSig, aSig, &aSig0, &aSig1 );
3525 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
3526 if ( aSig & LIT64( 0x0020000000000000 ) ) {
3527 aSig >>= 1;
3528 zSig |= i;
3532 if ( zSign )
3533 zSig = -zSig;
3534 return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
3537 /*----------------------------------------------------------------------------
3538 | Returns 1 if the double-precision floating-point value `a' is equal to the
3539 | corresponding value `b', and 0 otherwise. The comparison is performed
3540 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3541 *----------------------------------------------------------------------------*/
3543 int float64_eq( float64 a, float64 b STATUS_PARAM )
3545 uint64_t av, bv;
3546 a = float64_squash_input_denormal(a STATUS_VAR);
3547 b = float64_squash_input_denormal(b STATUS_VAR);
3549 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3550 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3552 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3553 float_raise( float_flag_invalid STATUS_VAR);
3555 return 0;
3557 av = float64_val(a);
3558 bv = float64_val(b);
3559 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
3563 /*----------------------------------------------------------------------------
3564 | Returns 1 if the double-precision floating-point value `a' is less than or
3565 | equal to the corresponding value `b', and 0 otherwise. The comparison is
3566 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3567 | Arithmetic.
3568 *----------------------------------------------------------------------------*/
3570 int float64_le( float64 a, float64 b STATUS_PARAM )
3572 flag aSign, bSign;
3573 uint64_t av, bv;
3574 a = float64_squash_input_denormal(a STATUS_VAR);
3575 b = float64_squash_input_denormal(b STATUS_VAR);
3577 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3578 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3580 float_raise( float_flag_invalid STATUS_VAR);
3581 return 0;
3583 aSign = extractFloat64Sign( a );
3584 bSign = extractFloat64Sign( b );
3585 av = float64_val(a);
3586 bv = float64_val(b);
3587 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
3588 return ( av == bv ) || ( aSign ^ ( av < bv ) );
3592 /*----------------------------------------------------------------------------
3593 | Returns 1 if the double-precision floating-point value `a' is less than
3594 | the corresponding value `b', and 0 otherwise. The comparison is performed
3595 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3596 *----------------------------------------------------------------------------*/
3598 int float64_lt( float64 a, float64 b STATUS_PARAM )
3600 flag aSign, bSign;
3601 uint64_t av, bv;
3603 a = float64_squash_input_denormal(a STATUS_VAR);
3604 b = float64_squash_input_denormal(b STATUS_VAR);
3605 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3606 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3608 float_raise( float_flag_invalid STATUS_VAR);
3609 return 0;
3611 aSign = extractFloat64Sign( a );
3612 bSign = extractFloat64Sign( b );
3613 av = float64_val(a);
3614 bv = float64_val(b);
3615 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
3616 return ( av != bv ) && ( aSign ^ ( av < bv ) );
3620 /*----------------------------------------------------------------------------
3621 | Returns 1 if the double-precision floating-point value `a' is equal to the
3622 | corresponding value `b', and 0 otherwise. The invalid exception is raised
3623 | if either operand is a NaN. Otherwise, the comparison is performed
3624 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3625 *----------------------------------------------------------------------------*/
3627 int float64_eq_signaling( float64 a, float64 b STATUS_PARAM )
3629 uint64_t av, bv;
3630 a = float64_squash_input_denormal(a STATUS_VAR);
3631 b = float64_squash_input_denormal(b STATUS_VAR);
3633 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3634 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3636 float_raise( float_flag_invalid STATUS_VAR);
3637 return 0;
3639 av = float64_val(a);
3640 bv = float64_val(b);
3641 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
3645 /*----------------------------------------------------------------------------
3646 | Returns 1 if the double-precision floating-point value `a' is less than or
3647 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3648 | cause an exception. Otherwise, the comparison is performed according to the
3649 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3650 *----------------------------------------------------------------------------*/
3652 int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
3654 flag aSign, bSign;
3655 uint64_t av, bv;
3656 a = float64_squash_input_denormal(a STATUS_VAR);
3657 b = float64_squash_input_denormal(b STATUS_VAR);
3659 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3660 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3662 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3663 float_raise( float_flag_invalid STATUS_VAR);
3665 return 0;
3667 aSign = extractFloat64Sign( a );
3668 bSign = extractFloat64Sign( b );
3669 av = float64_val(a);
3670 bv = float64_val(b);
3671 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
3672 return ( av == bv ) || ( aSign ^ ( av < bv ) );
3676 /*----------------------------------------------------------------------------
3677 | Returns 1 if the double-precision floating-point value `a' is less than
3678 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3679 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
3680 | Standard for Binary Floating-Point Arithmetic.
3681 *----------------------------------------------------------------------------*/
3683 int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
3685 flag aSign, bSign;
3686 uint64_t av, bv;
3687 a = float64_squash_input_denormal(a STATUS_VAR);
3688 b = float64_squash_input_denormal(b STATUS_VAR);
3690 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3691 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3693 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3694 float_raise( float_flag_invalid STATUS_VAR);
3696 return 0;
3698 aSign = extractFloat64Sign( a );
3699 bSign = extractFloat64Sign( b );
3700 av = float64_val(a);
3701 bv = float64_val(b);
3702 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
3703 return ( av != bv ) && ( aSign ^ ( av < bv ) );
3707 #ifdef FLOATX80
3709 /*----------------------------------------------------------------------------
3710 | Returns the result of converting the extended double-precision floating-
3711 | point value `a' to the 32-bit two's complement integer format. The
3712 | conversion is performed according to the IEC/IEEE Standard for Binary
3713 | Floating-Point Arithmetic---which means in particular that the conversion
3714 | is rounded according to the current rounding mode. If `a' is a NaN, the
3715 | largest positive integer is returned. Otherwise, if the conversion
3716 | overflows, the largest integer with the same sign as `a' is returned.
3717 *----------------------------------------------------------------------------*/
3719 int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
3721 flag aSign;
3722 int32 aExp, shiftCount;
3723 uint64_t aSig;
3725 aSig = extractFloatx80Frac( a );
3726 aExp = extractFloatx80Exp( a );
3727 aSign = extractFloatx80Sign( a );
3728 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
3729 shiftCount = 0x4037 - aExp;
3730 if ( shiftCount <= 0 ) shiftCount = 1;
3731 shift64RightJamming( aSig, shiftCount, &aSig );
3732 return roundAndPackInt32( aSign, aSig STATUS_VAR );
3736 /*----------------------------------------------------------------------------
3737 | Returns the result of converting the extended double-precision floating-
3738 | point value `a' to the 32-bit two's complement integer format. The
3739 | conversion is performed according to the IEC/IEEE Standard for Binary
3740 | Floating-Point Arithmetic, except that the conversion is always rounded
3741 | toward zero. If `a' is a NaN, the largest positive integer is returned.
3742 | Otherwise, if the conversion overflows, the largest integer with the same
3743 | sign as `a' is returned.
3744 *----------------------------------------------------------------------------*/
3746 int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
3748 flag aSign;
3749 int32 aExp, shiftCount;
3750 uint64_t aSig, savedASig;
3751 int32 z;
3753 aSig = extractFloatx80Frac( a );
3754 aExp = extractFloatx80Exp( a );
3755 aSign = extractFloatx80Sign( a );
3756 if ( 0x401E < aExp ) {
3757 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
3758 goto invalid;
3760 else if ( aExp < 0x3FFF ) {
3761 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3762 return 0;
3764 shiftCount = 0x403E - aExp;
3765 savedASig = aSig;
3766 aSig >>= shiftCount;
3767 z = aSig;
3768 if ( aSign ) z = - z;
3769 if ( ( z < 0 ) ^ aSign ) {
3770 invalid:
3771 float_raise( float_flag_invalid STATUS_VAR);
3772 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
3774 if ( ( aSig<<shiftCount ) != savedASig ) {
3775 STATUS(float_exception_flags) |= float_flag_inexact;
3777 return z;
3781 /*----------------------------------------------------------------------------
3782 | Returns the result of converting the extended double-precision floating-
3783 | point value `a' to the 64-bit two's complement integer format. The
3784 | conversion is performed according to the IEC/IEEE Standard for Binary
3785 | Floating-Point Arithmetic---which means in particular that the conversion
3786 | is rounded according to the current rounding mode. If `a' is a NaN,
3787 | the largest positive integer is returned. Otherwise, if the conversion
3788 | overflows, the largest integer with the same sign as `a' is returned.
3789 *----------------------------------------------------------------------------*/
3791 int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
3793 flag aSign;
3794 int32 aExp, shiftCount;
3795 uint64_t aSig, aSigExtra;
3797 aSig = extractFloatx80Frac( a );
3798 aExp = extractFloatx80Exp( a );
3799 aSign = extractFloatx80Sign( a );
3800 shiftCount = 0x403E - aExp;
3801 if ( shiftCount <= 0 ) {
3802 if ( shiftCount ) {
3803 float_raise( float_flag_invalid STATUS_VAR);
3804 if ( ! aSign
3805 || ( ( aExp == 0x7FFF )
3806 && ( aSig != LIT64( 0x8000000000000000 ) ) )
3808 return LIT64( 0x7FFFFFFFFFFFFFFF );
3810 return (int64_t) LIT64( 0x8000000000000000 );
3812 aSigExtra = 0;
3814 else {
3815 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3817 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3821 /*----------------------------------------------------------------------------
3822 | Returns the result of converting the extended double-precision floating-
3823 | point value `a' to the 64-bit two's complement integer format. The
3824 | conversion is performed according to the IEC/IEEE Standard for Binary
3825 | Floating-Point Arithmetic, except that the conversion is always rounded
3826 | toward zero. If `a' is a NaN, the largest positive integer is returned.
3827 | Otherwise, if the conversion overflows, the largest integer with the same
3828 | sign as `a' is returned.
3829 *----------------------------------------------------------------------------*/
3831 int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
3833 flag aSign;
3834 int32 aExp, shiftCount;
3835 uint64_t aSig;
3836 int64 z;
3838 aSig = extractFloatx80Frac( a );
3839 aExp = extractFloatx80Exp( a );
3840 aSign = extractFloatx80Sign( a );
3841 shiftCount = aExp - 0x403E;
3842 if ( 0 <= shiftCount ) {
3843 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3844 if ( ( a.high != 0xC03E ) || aSig ) {
3845 float_raise( float_flag_invalid STATUS_VAR);
3846 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3847 return LIT64( 0x7FFFFFFFFFFFFFFF );
3850 return (int64_t) LIT64( 0x8000000000000000 );
3852 else if ( aExp < 0x3FFF ) {
3853 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3854 return 0;
3856 z = aSig>>( - shiftCount );
3857 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
3858 STATUS(float_exception_flags) |= float_flag_inexact;
3860 if ( aSign ) z = - z;
3861 return z;
3865 /*----------------------------------------------------------------------------
3866 | Returns the result of converting the extended double-precision floating-
3867 | point value `a' to the single-precision floating-point format. The
3868 | conversion is performed according to the IEC/IEEE Standard for Binary
3869 | Floating-Point Arithmetic.
3870 *----------------------------------------------------------------------------*/
3872 float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
3874 flag aSign;
3875 int32 aExp;
3876 uint64_t aSig;
3878 aSig = extractFloatx80Frac( a );
3879 aExp = extractFloatx80Exp( a );
3880 aSign = extractFloatx80Sign( a );
3881 if ( aExp == 0x7FFF ) {
3882 if ( (uint64_t) ( aSig<<1 ) ) {
3883 return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3885 return packFloat32( aSign, 0xFF, 0 );
3887 shift64RightJamming( aSig, 33, &aSig );
3888 if ( aExp || aSig ) aExp -= 0x3F81;
3889 return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
3893 /*----------------------------------------------------------------------------
3894 | Returns the result of converting the extended double-precision floating-
3895 | point value `a' to the double-precision floating-point format. The
3896 | conversion is performed according to the IEC/IEEE Standard for Binary
3897 | Floating-Point Arithmetic.
3898 *----------------------------------------------------------------------------*/
3900 float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
3902 flag aSign;
3903 int32 aExp;
3904 uint64_t aSig, zSig;
3906 aSig = extractFloatx80Frac( a );
3907 aExp = extractFloatx80Exp( a );
3908 aSign = extractFloatx80Sign( a );
3909 if ( aExp == 0x7FFF ) {
3910 if ( (uint64_t) ( aSig<<1 ) ) {
3911 return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3913 return packFloat64( aSign, 0x7FF, 0 );
3915 shift64RightJamming( aSig, 1, &zSig );
3916 if ( aExp || aSig ) aExp -= 0x3C01;
3917 return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
3921 #ifdef FLOAT128
3923 /*----------------------------------------------------------------------------
3924 | Returns the result of converting the extended double-precision floating-
3925 | point value `a' to the quadruple-precision floating-point format. The
3926 | conversion is performed according to the IEC/IEEE Standard for Binary
3927 | Floating-Point Arithmetic.
3928 *----------------------------------------------------------------------------*/
3930 float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
3932 flag aSign;
3933 int16 aExp;
3934 uint64_t aSig, zSig0, zSig1;
3936 aSig = extractFloatx80Frac( a );
3937 aExp = extractFloatx80Exp( a );
3938 aSign = extractFloatx80Sign( a );
3939 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
3940 return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3942 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3943 return packFloat128( aSign, aExp, zSig0, zSig1 );
3947 #endif
3949 /*----------------------------------------------------------------------------
3950 | Rounds the extended double-precision floating-point value `a' to an integer,
3951 | and returns the result as an extended quadruple-precision floating-point
3952 | value. The operation is performed according to the IEC/IEEE Standard for
3953 | Binary Floating-Point Arithmetic.
3954 *----------------------------------------------------------------------------*/
3956 floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
3958 flag aSign;
3959 int32 aExp;
3960 uint64_t lastBitMask, roundBitsMask;
3961 int8 roundingMode;
3962 floatx80 z;
3964 aExp = extractFloatx80Exp( a );
3965 if ( 0x403E <= aExp ) {
3966 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
3967 return propagateFloatx80NaN( a, a STATUS_VAR );
3969 return a;
3971 if ( aExp < 0x3FFF ) {
3972 if ( ( aExp == 0 )
3973 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3974 return a;
3976 STATUS(float_exception_flags) |= float_flag_inexact;
3977 aSign = extractFloatx80Sign( a );
3978 switch ( STATUS(float_rounding_mode) ) {
3979 case float_round_nearest_even:
3980 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
3982 return
3983 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3985 break;
3986 case float_round_down:
3987 return
3988 aSign ?
3989 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3990 : packFloatx80( 0, 0, 0 );
3991 case float_round_up:
3992 return
3993 aSign ? packFloatx80( 1, 0, 0 )
3994 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3996 return packFloatx80( aSign, 0, 0 );
3998 lastBitMask = 1;
3999 lastBitMask <<= 0x403E - aExp;
4000 roundBitsMask = lastBitMask - 1;
4001 z = a;
4002 roundingMode = STATUS(float_rounding_mode);
4003 if ( roundingMode == float_round_nearest_even ) {
4004 z.low += lastBitMask>>1;
4005 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4007 else if ( roundingMode != float_round_to_zero ) {
4008 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
4009 z.low += roundBitsMask;
4012 z.low &= ~ roundBitsMask;
4013 if ( z.low == 0 ) {
4014 ++z.high;
4015 z.low = LIT64( 0x8000000000000000 );
4017 if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
4018 return z;
4022 /*----------------------------------------------------------------------------
4023 | Returns the result of adding the absolute values of the extended double-
4024 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
4025 | negated before being returned. `zSign' is ignored if the result is a NaN.
4026 | The addition is performed according to the IEC/IEEE Standard for Binary
4027 | Floating-Point Arithmetic.
4028 *----------------------------------------------------------------------------*/
4030 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
4032 int32 aExp, bExp, zExp;
4033 uint64_t aSig, bSig, zSig0, zSig1;
4034 int32 expDiff;
4036 aSig = extractFloatx80Frac( a );
4037 aExp = extractFloatx80Exp( a );
4038 bSig = extractFloatx80Frac( b );
4039 bExp = extractFloatx80Exp( b );
4040 expDiff = aExp - bExp;
4041 if ( 0 < expDiff ) {
4042 if ( aExp == 0x7FFF ) {
4043 if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4044 return a;
4046 if ( bExp == 0 ) --expDiff;
4047 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4048 zExp = aExp;
4050 else if ( expDiff < 0 ) {
4051 if ( bExp == 0x7FFF ) {
4052 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4053 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4055 if ( aExp == 0 ) ++expDiff;
4056 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4057 zExp = bExp;
4059 else {
4060 if ( aExp == 0x7FFF ) {
4061 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4062 return propagateFloatx80NaN( a, b STATUS_VAR );
4064 return a;
4066 zSig1 = 0;
4067 zSig0 = aSig + bSig;
4068 if ( aExp == 0 ) {
4069 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4070 goto roundAndPack;
4072 zExp = aExp;
4073 goto shiftRight1;
4075 zSig0 = aSig + bSig;
4076 if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
4077 shiftRight1:
4078 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4079 zSig0 |= LIT64( 0x8000000000000000 );
4080 ++zExp;
4081 roundAndPack:
4082 return
4083 roundAndPackFloatx80(
4084 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4088 /*----------------------------------------------------------------------------
4089 | Returns the result of subtracting the absolute values of the extended
4090 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
4091 | difference is negated before being returned. `zSign' is ignored if the
4092 | result is a NaN. The subtraction is performed according to the IEC/IEEE
4093 | Standard for Binary Floating-Point Arithmetic.
4094 *----------------------------------------------------------------------------*/
4096 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
4098 int32 aExp, bExp, zExp;
4099 uint64_t aSig, bSig, zSig0, zSig1;
4100 int32 expDiff;
4101 floatx80 z;
4103 aSig = extractFloatx80Frac( a );
4104 aExp = extractFloatx80Exp( a );
4105 bSig = extractFloatx80Frac( b );
4106 bExp = extractFloatx80Exp( b );
4107 expDiff = aExp - bExp;
4108 if ( 0 < expDiff ) goto aExpBigger;
4109 if ( expDiff < 0 ) goto bExpBigger;
4110 if ( aExp == 0x7FFF ) {
4111 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4112 return propagateFloatx80NaN( a, b STATUS_VAR );
4114 float_raise( float_flag_invalid STATUS_VAR);
4115 z.low = floatx80_default_nan_low;
4116 z.high = floatx80_default_nan_high;
4117 return z;
4119 if ( aExp == 0 ) {
4120 aExp = 1;
4121 bExp = 1;
4123 zSig1 = 0;
4124 if ( bSig < aSig ) goto aBigger;
4125 if ( aSig < bSig ) goto bBigger;
4126 return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
4127 bExpBigger:
4128 if ( bExp == 0x7FFF ) {
4129 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4130 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
4132 if ( aExp == 0 ) ++expDiff;
4133 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4134 bBigger:
4135 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4136 zExp = bExp;
4137 zSign ^= 1;
4138 goto normalizeRoundAndPack;
4139 aExpBigger:
4140 if ( aExp == 0x7FFF ) {
4141 if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4142 return a;
4144 if ( bExp == 0 ) --expDiff;
4145 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4146 aBigger:
4147 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4148 zExp = aExp;
4149 normalizeRoundAndPack:
4150 return
4151 normalizeRoundAndPackFloatx80(
4152 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4156 /*----------------------------------------------------------------------------
4157 | Returns the result of adding the extended double-precision floating-point
4158 | values `a' and `b'. The operation is performed according to the IEC/IEEE
4159 | Standard for Binary Floating-Point Arithmetic.
4160 *----------------------------------------------------------------------------*/
4162 floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
4164 flag aSign, bSign;
4166 aSign = extractFloatx80Sign( a );
4167 bSign = extractFloatx80Sign( b );
4168 if ( aSign == bSign ) {
4169 return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4171 else {
4172 return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4177 /*----------------------------------------------------------------------------
4178 | Returns the result of subtracting the extended double-precision floating-
4179 | point values `a' and `b'. The operation is performed according to the
4180 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4181 *----------------------------------------------------------------------------*/
4183 floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
4185 flag aSign, bSign;
4187 aSign = extractFloatx80Sign( a );
4188 bSign = extractFloatx80Sign( b );
4189 if ( aSign == bSign ) {
4190 return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4192 else {
4193 return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4198 /*----------------------------------------------------------------------------
4199 | Returns the result of multiplying the extended double-precision floating-
4200 | point values `a' and `b'. The operation is performed according to the
4201 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4202 *----------------------------------------------------------------------------*/
4204 floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
4206 flag aSign, bSign, zSign;
4207 int32 aExp, bExp, zExp;
4208 uint64_t aSig, bSig, zSig0, zSig1;
4209 floatx80 z;
4211 aSig = extractFloatx80Frac( a );
4212 aExp = extractFloatx80Exp( a );
4213 aSign = extractFloatx80Sign( a );
4214 bSig = extractFloatx80Frac( b );
4215 bExp = extractFloatx80Exp( b );
4216 bSign = extractFloatx80Sign( b );
4217 zSign = aSign ^ bSign;
4218 if ( aExp == 0x7FFF ) {
4219 if ( (uint64_t) ( aSig<<1 )
4220 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
4221 return propagateFloatx80NaN( a, b STATUS_VAR );
4223 if ( ( bExp | bSig ) == 0 ) goto invalid;
4224 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4226 if ( bExp == 0x7FFF ) {
4227 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4228 if ( ( aExp | aSig ) == 0 ) {
4229 invalid:
4230 float_raise( float_flag_invalid STATUS_VAR);
4231 z.low = floatx80_default_nan_low;
4232 z.high = floatx80_default_nan_high;
4233 return z;
4235 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4237 if ( aExp == 0 ) {
4238 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4239 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4241 if ( bExp == 0 ) {
4242 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
4243 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4245 zExp = aExp + bExp - 0x3FFE;
4246 mul64To128( aSig, bSig, &zSig0, &zSig1 );
4247 if ( 0 < (int64_t) zSig0 ) {
4248 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
4249 --zExp;
4251 return
4252 roundAndPackFloatx80(
4253 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4257 /*----------------------------------------------------------------------------
4258 | Returns the result of dividing the extended double-precision floating-point
4259 | value `a' by the corresponding value `b'. The operation is performed
4260 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4261 *----------------------------------------------------------------------------*/
4263 floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
4265 flag aSign, bSign, zSign;
4266 int32 aExp, bExp, zExp;
4267 uint64_t aSig, bSig, zSig0, zSig1;
4268 uint64_t rem0, rem1, rem2, term0, term1, term2;
4269 floatx80 z;
4271 aSig = extractFloatx80Frac( a );
4272 aExp = extractFloatx80Exp( a );
4273 aSign = extractFloatx80Sign( a );
4274 bSig = extractFloatx80Frac( b );
4275 bExp = extractFloatx80Exp( b );
4276 bSign = extractFloatx80Sign( b );
4277 zSign = aSign ^ bSign;
4278 if ( aExp == 0x7FFF ) {
4279 if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4280 if ( bExp == 0x7FFF ) {
4281 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4282 goto invalid;
4284 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4286 if ( bExp == 0x7FFF ) {
4287 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4288 return packFloatx80( zSign, 0, 0 );
4290 if ( bExp == 0 ) {
4291 if ( bSig == 0 ) {
4292 if ( ( aExp | aSig ) == 0 ) {
4293 invalid:
4294 float_raise( float_flag_invalid STATUS_VAR);
4295 z.low = floatx80_default_nan_low;
4296 z.high = floatx80_default_nan_high;
4297 return z;
4299 float_raise( float_flag_divbyzero STATUS_VAR);
4300 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4302 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4304 if ( aExp == 0 ) {
4305 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4306 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4308 zExp = aExp - bExp + 0x3FFE;
4309 rem1 = 0;
4310 if ( bSig <= aSig ) {
4311 shift128Right( aSig, 0, 1, &aSig, &rem1 );
4312 ++zExp;
4314 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
4315 mul64To128( bSig, zSig0, &term0, &term1 );
4316 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
4317 while ( (int64_t) rem0 < 0 ) {
4318 --zSig0;
4319 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4321 zSig1 = estimateDiv128To64( rem1, 0, bSig );
4322 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
4323 mul64To128( bSig, zSig1, &term1, &term2 );
4324 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4325 while ( (int64_t) rem1 < 0 ) {
4326 --zSig1;
4327 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
4329 zSig1 |= ( ( rem1 | rem2 ) != 0 );
4331 return
4332 roundAndPackFloatx80(
4333 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4337 /*----------------------------------------------------------------------------
4338 | Returns the remainder of the extended double-precision floating-point value
4339 | `a' with respect to the corresponding value `b'. The operation is performed
4340 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4341 *----------------------------------------------------------------------------*/
4343 floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
4345 flag aSign, zSign;
4346 int32 aExp, bExp, expDiff;
4347 uint64_t aSig0, aSig1, bSig;
4348 uint64_t q, term0, term1, alternateASig0, alternateASig1;
4349 floatx80 z;
4351 aSig0 = extractFloatx80Frac( a );
4352 aExp = extractFloatx80Exp( a );
4353 aSign = extractFloatx80Sign( a );
4354 bSig = extractFloatx80Frac( b );
4355 bExp = extractFloatx80Exp( b );
4356 if ( aExp == 0x7FFF ) {
4357 if ( (uint64_t) ( aSig0<<1 )
4358 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
4359 return propagateFloatx80NaN( a, b STATUS_VAR );
4361 goto invalid;
4363 if ( bExp == 0x7FFF ) {
4364 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4365 return a;
4367 if ( bExp == 0 ) {
4368 if ( bSig == 0 ) {
4369 invalid:
4370 float_raise( float_flag_invalid STATUS_VAR);
4371 z.low = floatx80_default_nan_low;
4372 z.high = floatx80_default_nan_high;
4373 return z;
4375 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4377 if ( aExp == 0 ) {
4378 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
4379 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4381 bSig |= LIT64( 0x8000000000000000 );
4382 zSign = aSign;
4383 expDiff = aExp - bExp;
4384 aSig1 = 0;
4385 if ( expDiff < 0 ) {
4386 if ( expDiff < -1 ) return a;
4387 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
4388 expDiff = 0;
4390 q = ( bSig <= aSig0 );
4391 if ( q ) aSig0 -= bSig;
4392 expDiff -= 64;
4393 while ( 0 < expDiff ) {
4394 q = estimateDiv128To64( aSig0, aSig1, bSig );
4395 q = ( 2 < q ) ? q - 2 : 0;
4396 mul64To128( bSig, q, &term0, &term1 );
4397 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4398 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
4399 expDiff -= 62;
4401 expDiff += 64;
4402 if ( 0 < expDiff ) {
4403 q = estimateDiv128To64( aSig0, aSig1, bSig );
4404 q = ( 2 < q ) ? q - 2 : 0;
4405 q >>= 64 - expDiff;
4406 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
4407 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4408 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
4409 while ( le128( term0, term1, aSig0, aSig1 ) ) {
4410 ++q;
4411 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4414 else {
4415 term1 = 0;
4416 term0 = bSig;
4418 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4419 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4420 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4421 && ( q & 1 ) )
4423 aSig0 = alternateASig0;
4424 aSig1 = alternateASig1;
4425 zSign = ! zSign;
4427 return
4428 normalizeRoundAndPackFloatx80(
4429 80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
4433 /*----------------------------------------------------------------------------
4434 | Returns the square root of the extended double-precision floating-point
4435 | value `a'. The operation is performed according to the IEC/IEEE Standard
4436 | for Binary Floating-Point Arithmetic.
4437 *----------------------------------------------------------------------------*/
4439 floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
4441 flag aSign;
4442 int32 aExp, zExp;
4443 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4444 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4445 floatx80 z;
4447 aSig0 = extractFloatx80Frac( a );
4448 aExp = extractFloatx80Exp( a );
4449 aSign = extractFloatx80Sign( a );
4450 if ( aExp == 0x7FFF ) {
4451 if ( (uint64_t) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
4452 if ( ! aSign ) return a;
4453 goto invalid;
4455 if ( aSign ) {
4456 if ( ( aExp | aSig0 ) == 0 ) return a;
4457 invalid:
4458 float_raise( float_flag_invalid STATUS_VAR);
4459 z.low = floatx80_default_nan_low;
4460 z.high = floatx80_default_nan_high;
4461 return z;
4463 if ( aExp == 0 ) {
4464 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4465 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4467 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4468 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4469 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4470 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4471 doubleZSig0 = zSig0<<1;
4472 mul64To128( zSig0, zSig0, &term0, &term1 );
4473 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4474 while ( (int64_t) rem0 < 0 ) {
4475 --zSig0;
4476 doubleZSig0 -= 2;
4477 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4479 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4480 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4481 if ( zSig1 == 0 ) zSig1 = 1;
4482 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4483 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4484 mul64To128( zSig1, zSig1, &term2, &term3 );
4485 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4486 while ( (int64_t) rem1 < 0 ) {
4487 --zSig1;
4488 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4489 term3 |= 1;
4490 term2 |= doubleZSig0;
4491 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4493 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4495 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4496 zSig0 |= doubleZSig0;
4497 return
4498 roundAndPackFloatx80(
4499 STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
4503 /*----------------------------------------------------------------------------
4504 | Returns 1 if the extended double-precision floating-point value `a' is
4505 | equal to the corresponding value `b', and 0 otherwise. The comparison is
4506 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4507 | Arithmetic.
4508 *----------------------------------------------------------------------------*/
4510 int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
4513 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4514 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4515 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4516 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4518 if ( floatx80_is_signaling_nan( a )
4519 || floatx80_is_signaling_nan( b ) ) {
4520 float_raise( float_flag_invalid STATUS_VAR);
4522 return 0;
4524 return
4525 ( a.low == b.low )
4526 && ( ( a.high == b.high )
4527 || ( ( a.low == 0 )
4528 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
4533 /*----------------------------------------------------------------------------
4534 | Returns 1 if the extended double-precision floating-point value `a' is
4535 | less than or equal to the corresponding value `b', and 0 otherwise. The
4536 | comparison is performed according to the IEC/IEEE Standard for Binary
4537 | Floating-Point Arithmetic.
4538 *----------------------------------------------------------------------------*/
4540 int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
4542 flag aSign, bSign;
4544 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4545 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4546 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4547 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4549 float_raise( float_flag_invalid STATUS_VAR);
4550 return 0;
4552 aSign = extractFloatx80Sign( a );
4553 bSign = extractFloatx80Sign( b );
4554 if ( aSign != bSign ) {
4555 return
4556 aSign
4557 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4558 == 0 );
4560 return
4561 aSign ? le128( b.high, b.low, a.high, a.low )
4562 : le128( a.high, a.low, b.high, b.low );
4566 /*----------------------------------------------------------------------------
4567 | Returns 1 if the extended double-precision floating-point value `a' is
4568 | less than the corresponding value `b', and 0 otherwise. The comparison
4569 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4570 | Arithmetic.
4571 *----------------------------------------------------------------------------*/
4573 int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
4575 flag aSign, bSign;
4577 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4578 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4579 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4580 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4582 float_raise( float_flag_invalid STATUS_VAR);
4583 return 0;
4585 aSign = extractFloatx80Sign( a );
4586 bSign = extractFloatx80Sign( b );
4587 if ( aSign != bSign ) {
4588 return
4589 aSign
4590 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4591 != 0 );
4593 return
4594 aSign ? lt128( b.high, b.low, a.high, a.low )
4595 : lt128( a.high, a.low, b.high, b.low );
4599 /*----------------------------------------------------------------------------
4600 | Returns 1 if the extended double-precision floating-point value `a' is equal
4601 | to the corresponding value `b', and 0 otherwise. The invalid exception is
4602 | raised if either operand is a NaN. Otherwise, the comparison is performed
4603 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4604 *----------------------------------------------------------------------------*/
4606 int floatx80_eq_signaling( floatx80 a, floatx80 b STATUS_PARAM )
4609 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4610 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4611 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4612 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4614 float_raise( float_flag_invalid STATUS_VAR);
4615 return 0;
4617 return
4618 ( a.low == b.low )
4619 && ( ( a.high == b.high )
4620 || ( ( a.low == 0 )
4621 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
4626 /*----------------------------------------------------------------------------
4627 | Returns 1 if the extended double-precision floating-point value `a' is less
4628 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
4629 | do not cause an exception. Otherwise, the comparison is performed according
4630 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4631 *----------------------------------------------------------------------------*/
4633 int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4635 flag aSign, bSign;
4637 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4638 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4639 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4640 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4642 if ( floatx80_is_signaling_nan( a )
4643 || floatx80_is_signaling_nan( b ) ) {
4644 float_raise( float_flag_invalid STATUS_VAR);
4646 return 0;
4648 aSign = extractFloatx80Sign( a );
4649 bSign = extractFloatx80Sign( b );
4650 if ( aSign != bSign ) {
4651 return
4652 aSign
4653 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4654 == 0 );
4656 return
4657 aSign ? le128( b.high, b.low, a.high, a.low )
4658 : le128( a.high, a.low, b.high, b.low );
4662 /*----------------------------------------------------------------------------
4663 | Returns 1 if the extended double-precision floating-point value `a' is less
4664 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
4665 | an exception. Otherwise, the comparison is performed according to the
4666 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4667 *----------------------------------------------------------------------------*/
4669 int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4671 flag aSign, bSign;
4673 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4674 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4675 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4676 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4678 if ( floatx80_is_signaling_nan( a )
4679 || floatx80_is_signaling_nan( b ) ) {
4680 float_raise( float_flag_invalid STATUS_VAR);
4682 return 0;
4684 aSign = extractFloatx80Sign( a );
4685 bSign = extractFloatx80Sign( b );
4686 if ( aSign != bSign ) {
4687 return
4688 aSign
4689 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4690 != 0 );
4692 return
4693 aSign ? lt128( b.high, b.low, a.high, a.low )
4694 : lt128( a.high, a.low, b.high, b.low );
4698 #endif
4700 #ifdef FLOAT128
4702 /*----------------------------------------------------------------------------
4703 | Returns the result of converting the quadruple-precision floating-point
4704 | value `a' to the 32-bit two's complement integer format. The conversion
4705 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4706 | Arithmetic---which means in particular that the conversion is rounded
4707 | according to the current rounding mode. If `a' is a NaN, the largest
4708 | positive integer is returned. Otherwise, if the conversion overflows, the
4709 | largest integer with the same sign as `a' is returned.
4710 *----------------------------------------------------------------------------*/
4712 int32 float128_to_int32( float128 a STATUS_PARAM )
4714 flag aSign;
4715 int32 aExp, shiftCount;
4716 uint64_t aSig0, aSig1;
4718 aSig1 = extractFloat128Frac1( a );
4719 aSig0 = extractFloat128Frac0( a );
4720 aExp = extractFloat128Exp( a );
4721 aSign = extractFloat128Sign( a );
4722 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4723 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4724 aSig0 |= ( aSig1 != 0 );
4725 shiftCount = 0x4028 - aExp;
4726 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4727 return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
4731 /*----------------------------------------------------------------------------
4732 | Returns the result of converting the quadruple-precision floating-point
4733 | value `a' to the 32-bit two's complement integer format. The conversion
4734 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4735 | Arithmetic, except that the conversion is always rounded toward zero. If
4736 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
4737 | conversion overflows, the largest integer with the same sign as `a' is
4738 | returned.
4739 *----------------------------------------------------------------------------*/
4741 int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
4743 flag aSign;
4744 int32 aExp, shiftCount;
4745 uint64_t aSig0, aSig1, savedASig;
4746 int32 z;
4748 aSig1 = extractFloat128Frac1( a );
4749 aSig0 = extractFloat128Frac0( a );
4750 aExp = extractFloat128Exp( a );
4751 aSign = extractFloat128Sign( a );
4752 aSig0 |= ( aSig1 != 0 );
4753 if ( 0x401E < aExp ) {
4754 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4755 goto invalid;
4757 else if ( aExp < 0x3FFF ) {
4758 if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
4759 return 0;
4761 aSig0 |= LIT64( 0x0001000000000000 );
4762 shiftCount = 0x402F - aExp;
4763 savedASig = aSig0;
4764 aSig0 >>= shiftCount;
4765 z = aSig0;
4766 if ( aSign ) z = - z;
4767 if ( ( z < 0 ) ^ aSign ) {
4768 invalid:
4769 float_raise( float_flag_invalid STATUS_VAR);
4770 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4772 if ( ( aSig0<<shiftCount ) != savedASig ) {
4773 STATUS(float_exception_flags) |= float_flag_inexact;
4775 return z;
4779 /*----------------------------------------------------------------------------
4780 | Returns the result of converting the quadruple-precision floating-point
4781 | value `a' to the 64-bit two's complement integer format. The conversion
4782 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4783 | Arithmetic---which means in particular that the conversion is rounded
4784 | according to the current rounding mode. If `a' is a NaN, the largest
4785 | positive integer is returned. Otherwise, if the conversion overflows, the
4786 | largest integer with the same sign as `a' is returned.
4787 *----------------------------------------------------------------------------*/
4789 int64 float128_to_int64( float128 a STATUS_PARAM )
4791 flag aSign;
4792 int32 aExp, shiftCount;
4793 uint64_t aSig0, aSig1;
4795 aSig1 = extractFloat128Frac1( a );
4796 aSig0 = extractFloat128Frac0( a );
4797 aExp = extractFloat128Exp( a );
4798 aSign = extractFloat128Sign( a );
4799 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4800 shiftCount = 0x402F - aExp;
4801 if ( shiftCount <= 0 ) {
4802 if ( 0x403E < aExp ) {
4803 float_raise( float_flag_invalid STATUS_VAR);
4804 if ( ! aSign
4805 || ( ( aExp == 0x7FFF )
4806 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4809 return LIT64( 0x7FFFFFFFFFFFFFFF );
4811 return (int64_t) LIT64( 0x8000000000000000 );
4813 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4815 else {
4816 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4818 return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
4822 /*----------------------------------------------------------------------------
4823 | Returns the result of converting the quadruple-precision floating-point
4824 | value `a' to the 64-bit two's complement integer format. The conversion
4825 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4826 | Arithmetic, except that the conversion is always rounded toward zero.
4827 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
4828 | the conversion overflows, the largest integer with the same sign as `a' is
4829 | returned.
4830 *----------------------------------------------------------------------------*/
4832 int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
4834 flag aSign;
4835 int32 aExp, shiftCount;
4836 uint64_t aSig0, aSig1;
4837 int64 z;
4839 aSig1 = extractFloat128Frac1( a );
4840 aSig0 = extractFloat128Frac0( a );
4841 aExp = extractFloat128Exp( a );
4842 aSign = extractFloat128Sign( a );
4843 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4844 shiftCount = aExp - 0x402F;
4845 if ( 0 < shiftCount ) {
4846 if ( 0x403E <= aExp ) {
4847 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4848 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
4849 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4850 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
4852 else {
4853 float_raise( float_flag_invalid STATUS_VAR);
4854 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4855 return LIT64( 0x7FFFFFFFFFFFFFFF );
4858 return (int64_t) LIT64( 0x8000000000000000 );
4860 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4861 if ( (uint64_t) ( aSig1<<shiftCount ) ) {
4862 STATUS(float_exception_flags) |= float_flag_inexact;
4865 else {
4866 if ( aExp < 0x3FFF ) {
4867 if ( aExp | aSig0 | aSig1 ) {
4868 STATUS(float_exception_flags) |= float_flag_inexact;
4870 return 0;
4872 z = aSig0>>( - shiftCount );
4873 if ( aSig1
4874 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4875 STATUS(float_exception_flags) |= float_flag_inexact;
4878 if ( aSign ) z = - z;
4879 return z;
4883 /*----------------------------------------------------------------------------
4884 | Returns the result of converting the quadruple-precision floating-point
4885 | value `a' to the single-precision floating-point format. The conversion
4886 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4887 | Arithmetic.
4888 *----------------------------------------------------------------------------*/
4890 float32 float128_to_float32( float128 a STATUS_PARAM )
4892 flag aSign;
4893 int32 aExp;
4894 uint64_t aSig0, aSig1;
4895 uint32_t zSig;
4897 aSig1 = extractFloat128Frac1( a );
4898 aSig0 = extractFloat128Frac0( a );
4899 aExp = extractFloat128Exp( a );
4900 aSign = extractFloat128Sign( a );
4901 if ( aExp == 0x7FFF ) {
4902 if ( aSig0 | aSig1 ) {
4903 return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4905 return packFloat32( aSign, 0xFF, 0 );
4907 aSig0 |= ( aSig1 != 0 );
4908 shift64RightJamming( aSig0, 18, &aSig0 );
4909 zSig = aSig0;
4910 if ( aExp || zSig ) {
4911 zSig |= 0x40000000;
4912 aExp -= 0x3F81;
4914 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
4918 /*----------------------------------------------------------------------------
4919 | Returns the result of converting the quadruple-precision floating-point
4920 | value `a' to the double-precision floating-point format. The conversion
4921 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4922 | Arithmetic.
4923 *----------------------------------------------------------------------------*/
4925 float64 float128_to_float64( float128 a STATUS_PARAM )
4927 flag aSign;
4928 int32 aExp;
4929 uint64_t aSig0, aSig1;
4931 aSig1 = extractFloat128Frac1( a );
4932 aSig0 = extractFloat128Frac0( a );
4933 aExp = extractFloat128Exp( a );
4934 aSign = extractFloat128Sign( a );
4935 if ( aExp == 0x7FFF ) {
4936 if ( aSig0 | aSig1 ) {
4937 return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4939 return packFloat64( aSign, 0x7FF, 0 );
4941 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4942 aSig0 |= ( aSig1 != 0 );
4943 if ( aExp || aSig0 ) {
4944 aSig0 |= LIT64( 0x4000000000000000 );
4945 aExp -= 0x3C01;
4947 return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
4951 #ifdef FLOATX80
4953 /*----------------------------------------------------------------------------
4954 | Returns the result of converting the quadruple-precision floating-point
4955 | value `a' to the extended double-precision floating-point format. The
4956 | conversion is performed according to the IEC/IEEE Standard for Binary
4957 | Floating-Point Arithmetic.
4958 *----------------------------------------------------------------------------*/
4960 floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
4962 flag aSign;
4963 int32 aExp;
4964 uint64_t aSig0, aSig1;
4966 aSig1 = extractFloat128Frac1( a );
4967 aSig0 = extractFloat128Frac0( a );
4968 aExp = extractFloat128Exp( a );
4969 aSign = extractFloat128Sign( a );
4970 if ( aExp == 0x7FFF ) {
4971 if ( aSig0 | aSig1 ) {
4972 return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4974 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4976 if ( aExp == 0 ) {
4977 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4978 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4980 else {
4981 aSig0 |= LIT64( 0x0001000000000000 );
4983 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4984 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
4988 #endif
4990 /*----------------------------------------------------------------------------
4991 | Rounds the quadruple-precision floating-point value `a' to an integer, and
4992 | returns the result as a quadruple-precision floating-point value. The
4993 | operation is performed according to the IEC/IEEE Standard for Binary
4994 | Floating-Point Arithmetic.
4995 *----------------------------------------------------------------------------*/
4997 float128 float128_round_to_int( float128 a STATUS_PARAM )
4999 flag aSign;
5000 int32 aExp;
5001 uint64_t lastBitMask, roundBitsMask;
5002 int8 roundingMode;
5003 float128 z;
5005 aExp = extractFloat128Exp( a );
5006 if ( 0x402F <= aExp ) {
5007 if ( 0x406F <= aExp ) {
5008 if ( ( aExp == 0x7FFF )
5009 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
5011 return propagateFloat128NaN( a, a STATUS_VAR );
5013 return a;
5015 lastBitMask = 1;
5016 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
5017 roundBitsMask = lastBitMask - 1;
5018 z = a;
5019 roundingMode = STATUS(float_rounding_mode);
5020 if ( roundingMode == float_round_nearest_even ) {
5021 if ( lastBitMask ) {
5022 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
5023 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
5025 else {
5026 if ( (int64_t) z.low < 0 ) {
5027 ++z.high;
5028 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
5032 else if ( roundingMode != float_round_to_zero ) {
5033 if ( extractFloat128Sign( z )
5034 ^ ( roundingMode == float_round_up ) ) {
5035 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
5038 z.low &= ~ roundBitsMask;
5040 else {
5041 if ( aExp < 0x3FFF ) {
5042 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
5043 STATUS(float_exception_flags) |= float_flag_inexact;
5044 aSign = extractFloat128Sign( a );
5045 switch ( STATUS(float_rounding_mode) ) {
5046 case float_round_nearest_even:
5047 if ( ( aExp == 0x3FFE )
5048 && ( extractFloat128Frac0( a )
5049 | extractFloat128Frac1( a ) )
5051 return packFloat128( aSign, 0x3FFF, 0, 0 );
5053 break;
5054 case float_round_down:
5055 return
5056 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
5057 : packFloat128( 0, 0, 0, 0 );
5058 case float_round_up:
5059 return
5060 aSign ? packFloat128( 1, 0, 0, 0 )
5061 : packFloat128( 0, 0x3FFF, 0, 0 );
5063 return packFloat128( aSign, 0, 0, 0 );
5065 lastBitMask = 1;
5066 lastBitMask <<= 0x402F - aExp;
5067 roundBitsMask = lastBitMask - 1;
5068 z.low = 0;
5069 z.high = a.high;
5070 roundingMode = STATUS(float_rounding_mode);
5071 if ( roundingMode == float_round_nearest_even ) {
5072 z.high += lastBitMask>>1;
5073 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
5074 z.high &= ~ lastBitMask;
5077 else if ( roundingMode != float_round_to_zero ) {
5078 if ( extractFloat128Sign( z )
5079 ^ ( roundingMode == float_round_up ) ) {
5080 z.high |= ( a.low != 0 );
5081 z.high += roundBitsMask;
5084 z.high &= ~ roundBitsMask;
5086 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
5087 STATUS(float_exception_flags) |= float_flag_inexact;
5089 return z;
5093 /*----------------------------------------------------------------------------
5094 | Returns the result of adding the absolute values of the quadruple-precision
5095 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
5096 | before being returned. `zSign' is ignored if the result is a NaN.
5097 | The addition is performed according to the IEC/IEEE Standard for Binary
5098 | Floating-Point Arithmetic.
5099 *----------------------------------------------------------------------------*/
5101 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5103 int32 aExp, bExp, zExp;
5104 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5105 int32 expDiff;
5107 aSig1 = extractFloat128Frac1( a );
5108 aSig0 = extractFloat128Frac0( a );
5109 aExp = extractFloat128Exp( a );
5110 bSig1 = extractFloat128Frac1( b );
5111 bSig0 = extractFloat128Frac0( b );
5112 bExp = extractFloat128Exp( b );
5113 expDiff = aExp - bExp;
5114 if ( 0 < expDiff ) {
5115 if ( aExp == 0x7FFF ) {
5116 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5117 return a;
5119 if ( bExp == 0 ) {
5120 --expDiff;
5122 else {
5123 bSig0 |= LIT64( 0x0001000000000000 );
5125 shift128ExtraRightJamming(
5126 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
5127 zExp = aExp;
5129 else if ( expDiff < 0 ) {
5130 if ( bExp == 0x7FFF ) {
5131 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5132 return packFloat128( zSign, 0x7FFF, 0, 0 );
5134 if ( aExp == 0 ) {
5135 ++expDiff;
5137 else {
5138 aSig0 |= LIT64( 0x0001000000000000 );
5140 shift128ExtraRightJamming(
5141 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
5142 zExp = bExp;
5144 else {
5145 if ( aExp == 0x7FFF ) {
5146 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5147 return propagateFloat128NaN( a, b STATUS_VAR );
5149 return a;
5151 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5152 if ( aExp == 0 ) {
5153 if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
5154 return packFloat128( zSign, 0, zSig0, zSig1 );
5156 zSig2 = 0;
5157 zSig0 |= LIT64( 0x0002000000000000 );
5158 zExp = aExp;
5159 goto shiftRight1;
5161 aSig0 |= LIT64( 0x0001000000000000 );
5162 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5163 --zExp;
5164 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
5165 ++zExp;
5166 shiftRight1:
5167 shift128ExtraRightJamming(
5168 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5169 roundAndPack:
5170 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5174 /*----------------------------------------------------------------------------
5175 | Returns the result of subtracting the absolute values of the quadruple-
5176 | precision floating-point values `a' and `b'. If `zSign' is 1, the
5177 | difference is negated before being returned. `zSign' is ignored if the
5178 | result is a NaN. The subtraction is performed according to the IEC/IEEE
5179 | Standard for Binary Floating-Point Arithmetic.
5180 *----------------------------------------------------------------------------*/
5182 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5184 int32 aExp, bExp, zExp;
5185 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
5186 int32 expDiff;
5187 float128 z;
5189 aSig1 = extractFloat128Frac1( a );
5190 aSig0 = extractFloat128Frac0( a );
5191 aExp = extractFloat128Exp( a );
5192 bSig1 = extractFloat128Frac1( b );
5193 bSig0 = extractFloat128Frac0( b );
5194 bExp = extractFloat128Exp( b );
5195 expDiff = aExp - bExp;
5196 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5197 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
5198 if ( 0 < expDiff ) goto aExpBigger;
5199 if ( expDiff < 0 ) goto bExpBigger;
5200 if ( aExp == 0x7FFF ) {
5201 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5202 return propagateFloat128NaN( a, b STATUS_VAR );
5204 float_raise( float_flag_invalid STATUS_VAR);
5205 z.low = float128_default_nan_low;
5206 z.high = float128_default_nan_high;
5207 return z;
5209 if ( aExp == 0 ) {
5210 aExp = 1;
5211 bExp = 1;
5213 if ( bSig0 < aSig0 ) goto aBigger;
5214 if ( aSig0 < bSig0 ) goto bBigger;
5215 if ( bSig1 < aSig1 ) goto aBigger;
5216 if ( aSig1 < bSig1 ) goto bBigger;
5217 return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
5218 bExpBigger:
5219 if ( bExp == 0x7FFF ) {
5220 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5221 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
5223 if ( aExp == 0 ) {
5224 ++expDiff;
5226 else {
5227 aSig0 |= LIT64( 0x4000000000000000 );
5229 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5230 bSig0 |= LIT64( 0x4000000000000000 );
5231 bBigger:
5232 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
5233 zExp = bExp;
5234 zSign ^= 1;
5235 goto normalizeRoundAndPack;
5236 aExpBigger:
5237 if ( aExp == 0x7FFF ) {
5238 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5239 return a;
5241 if ( bExp == 0 ) {
5242 --expDiff;
5244 else {
5245 bSig0 |= LIT64( 0x4000000000000000 );
5247 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
5248 aSig0 |= LIT64( 0x4000000000000000 );
5249 aBigger:
5250 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5251 zExp = aExp;
5252 normalizeRoundAndPack:
5253 --zExp;
5254 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
5258 /*----------------------------------------------------------------------------
5259 | Returns the result of adding the quadruple-precision floating-point values
5260 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
5261 | for Binary Floating-Point Arithmetic.
5262 *----------------------------------------------------------------------------*/
5264 float128 float128_add( float128 a, float128 b STATUS_PARAM )
5266 flag aSign, bSign;
5268 aSign = extractFloat128Sign( a );
5269 bSign = extractFloat128Sign( b );
5270 if ( aSign == bSign ) {
5271 return addFloat128Sigs( a, b, aSign STATUS_VAR );
5273 else {
5274 return subFloat128Sigs( a, b, aSign STATUS_VAR );
5279 /*----------------------------------------------------------------------------
5280 | Returns the result of subtracting the quadruple-precision floating-point
5281 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5282 | Standard for Binary Floating-Point Arithmetic.
5283 *----------------------------------------------------------------------------*/
5285 float128 float128_sub( float128 a, float128 b STATUS_PARAM )
5287 flag aSign, bSign;
5289 aSign = extractFloat128Sign( a );
5290 bSign = extractFloat128Sign( b );
5291 if ( aSign == bSign ) {
5292 return subFloat128Sigs( a, b, aSign STATUS_VAR );
5294 else {
5295 return addFloat128Sigs( a, b, aSign STATUS_VAR );
5300 /*----------------------------------------------------------------------------
5301 | Returns the result of multiplying the quadruple-precision floating-point
5302 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5303 | Standard for Binary Floating-Point Arithmetic.
5304 *----------------------------------------------------------------------------*/
5306 float128 float128_mul( float128 a, float128 b STATUS_PARAM )
5308 flag aSign, bSign, zSign;
5309 int32 aExp, bExp, zExp;
5310 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
5311 float128 z;
5313 aSig1 = extractFloat128Frac1( a );
5314 aSig0 = extractFloat128Frac0( a );
5315 aExp = extractFloat128Exp( a );
5316 aSign = extractFloat128Sign( a );
5317 bSig1 = extractFloat128Frac1( b );
5318 bSig0 = extractFloat128Frac0( b );
5319 bExp = extractFloat128Exp( b );
5320 bSign = extractFloat128Sign( b );
5321 zSign = aSign ^ bSign;
5322 if ( aExp == 0x7FFF ) {
5323 if ( ( aSig0 | aSig1 )
5324 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5325 return propagateFloat128NaN( a, b STATUS_VAR );
5327 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
5328 return packFloat128( zSign, 0x7FFF, 0, 0 );
5330 if ( bExp == 0x7FFF ) {
5331 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5332 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5333 invalid:
5334 float_raise( float_flag_invalid STATUS_VAR);
5335 z.low = float128_default_nan_low;
5336 z.high = float128_default_nan_high;
5337 return z;
5339 return packFloat128( zSign, 0x7FFF, 0, 0 );
5341 if ( aExp == 0 ) {
5342 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5343 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5345 if ( bExp == 0 ) {
5346 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5347 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5349 zExp = aExp + bExp - 0x4000;
5350 aSig0 |= LIT64( 0x0001000000000000 );
5351 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5352 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5353 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5354 zSig2 |= ( zSig3 != 0 );
5355 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5356 shift128ExtraRightJamming(
5357 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5358 ++zExp;
5360 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5364 /*----------------------------------------------------------------------------
5365 | Returns the result of dividing the quadruple-precision floating-point value
5366 | `a' by the corresponding value `b'. The operation is performed according to
5367 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5368 *----------------------------------------------------------------------------*/
5370 float128 float128_div( float128 a, float128 b STATUS_PARAM )
5372 flag aSign, bSign, zSign;
5373 int32 aExp, bExp, zExp;
5374 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5375 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5376 float128 z;
5378 aSig1 = extractFloat128Frac1( a );
5379 aSig0 = extractFloat128Frac0( a );
5380 aExp = extractFloat128Exp( a );
5381 aSign = extractFloat128Sign( a );
5382 bSig1 = extractFloat128Frac1( b );
5383 bSig0 = extractFloat128Frac0( b );
5384 bExp = extractFloat128Exp( b );
5385 bSign = extractFloat128Sign( b );
5386 zSign = aSign ^ bSign;
5387 if ( aExp == 0x7FFF ) {
5388 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5389 if ( bExp == 0x7FFF ) {
5390 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5391 goto invalid;
5393 return packFloat128( zSign, 0x7FFF, 0, 0 );
5395 if ( bExp == 0x7FFF ) {
5396 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5397 return packFloat128( zSign, 0, 0, 0 );
5399 if ( bExp == 0 ) {
5400 if ( ( bSig0 | bSig1 ) == 0 ) {
5401 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5402 invalid:
5403 float_raise( float_flag_invalid STATUS_VAR);
5404 z.low = float128_default_nan_low;
5405 z.high = float128_default_nan_high;
5406 return z;
5408 float_raise( float_flag_divbyzero STATUS_VAR);
5409 return packFloat128( zSign, 0x7FFF, 0, 0 );
5411 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5413 if ( aExp == 0 ) {
5414 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5415 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5417 zExp = aExp - bExp + 0x3FFD;
5418 shortShift128Left(
5419 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5420 shortShift128Left(
5421 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5422 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5423 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5424 ++zExp;
5426 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5427 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5428 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5429 while ( (int64_t) rem0 < 0 ) {
5430 --zSig0;
5431 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5433 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5434 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5435 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5436 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5437 while ( (int64_t) rem1 < 0 ) {
5438 --zSig1;
5439 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5441 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5443 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5444 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5448 /*----------------------------------------------------------------------------
5449 | Returns the remainder of the quadruple-precision floating-point value `a'
5450 | with respect to the corresponding value `b'. The operation is performed
5451 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5452 *----------------------------------------------------------------------------*/
5454 float128 float128_rem( float128 a, float128 b STATUS_PARAM )
5456 flag aSign, zSign;
5457 int32 aExp, bExp, expDiff;
5458 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5459 uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
5460 int64_t sigMean0;
5461 float128 z;
5463 aSig1 = extractFloat128Frac1( a );
5464 aSig0 = extractFloat128Frac0( a );
5465 aExp = extractFloat128Exp( a );
5466 aSign = extractFloat128Sign( a );
5467 bSig1 = extractFloat128Frac1( b );
5468 bSig0 = extractFloat128Frac0( b );
5469 bExp = extractFloat128Exp( b );
5470 if ( aExp == 0x7FFF ) {
5471 if ( ( aSig0 | aSig1 )
5472 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5473 return propagateFloat128NaN( a, b STATUS_VAR );
5475 goto invalid;
5477 if ( bExp == 0x7FFF ) {
5478 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5479 return a;
5481 if ( bExp == 0 ) {
5482 if ( ( bSig0 | bSig1 ) == 0 ) {
5483 invalid:
5484 float_raise( float_flag_invalid STATUS_VAR);
5485 z.low = float128_default_nan_low;
5486 z.high = float128_default_nan_high;
5487 return z;
5489 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5491 if ( aExp == 0 ) {
5492 if ( ( aSig0 | aSig1 ) == 0 ) return a;
5493 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5495 expDiff = aExp - bExp;
5496 if ( expDiff < -1 ) return a;
5497 shortShift128Left(
5498 aSig0 | LIT64( 0x0001000000000000 ),
5499 aSig1,
5500 15 - ( expDiff < 0 ),
5501 &aSig0,
5502 &aSig1
5504 shortShift128Left(
5505 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5506 q = le128( bSig0, bSig1, aSig0, aSig1 );
5507 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5508 expDiff -= 64;
5509 while ( 0 < expDiff ) {
5510 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5511 q = ( 4 < q ) ? q - 4 : 0;
5512 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5513 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5514 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5515 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5516 expDiff -= 61;
5518 if ( -64 < expDiff ) {
5519 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5520 q = ( 4 < q ) ? q - 4 : 0;
5521 q >>= - expDiff;
5522 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5523 expDiff += 52;
5524 if ( expDiff < 0 ) {
5525 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5527 else {
5528 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5530 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5531 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5533 else {
5534 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5535 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5537 do {
5538 alternateASig0 = aSig0;
5539 alternateASig1 = aSig1;
5540 ++q;
5541 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5542 } while ( 0 <= (int64_t) aSig0 );
5543 add128(
5544 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
5545 if ( ( sigMean0 < 0 )
5546 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5547 aSig0 = alternateASig0;
5548 aSig1 = alternateASig1;
5550 zSign = ( (int64_t) aSig0 < 0 );
5551 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5552 return
5553 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
5557 /*----------------------------------------------------------------------------
5558 | Returns the square root of the quadruple-precision floating-point value `a'.
5559 | The operation is performed according to the IEC/IEEE Standard for Binary
5560 | Floating-Point Arithmetic.
5561 *----------------------------------------------------------------------------*/
5563 float128 float128_sqrt( float128 a STATUS_PARAM )
5565 flag aSign;
5566 int32 aExp, zExp;
5567 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5568 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5569 float128 z;
5571 aSig1 = extractFloat128Frac1( a );
5572 aSig0 = extractFloat128Frac0( a );
5573 aExp = extractFloat128Exp( a );
5574 aSign = extractFloat128Sign( a );
5575 if ( aExp == 0x7FFF ) {
5576 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
5577 if ( ! aSign ) return a;
5578 goto invalid;
5580 if ( aSign ) {
5581 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5582 invalid:
5583 float_raise( float_flag_invalid STATUS_VAR);
5584 z.low = float128_default_nan_low;
5585 z.high = float128_default_nan_high;
5586 return z;
5588 if ( aExp == 0 ) {
5589 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5590 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5592 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5593 aSig0 |= LIT64( 0x0001000000000000 );
5594 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5595 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5596 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5597 doubleZSig0 = zSig0<<1;
5598 mul64To128( zSig0, zSig0, &term0, &term1 );
5599 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5600 while ( (int64_t) rem0 < 0 ) {
5601 --zSig0;
5602 doubleZSig0 -= 2;
5603 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5605 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5606 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5607 if ( zSig1 == 0 ) zSig1 = 1;
5608 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5609 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5610 mul64To128( zSig1, zSig1, &term2, &term3 );
5611 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5612 while ( (int64_t) rem1 < 0 ) {
5613 --zSig1;
5614 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5615 term3 |= 1;
5616 term2 |= doubleZSig0;
5617 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5619 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5621 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5622 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5626 /*----------------------------------------------------------------------------
5627 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5628 | the corresponding value `b', and 0 otherwise. The comparison is performed
5629 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5630 *----------------------------------------------------------------------------*/
5632 int float128_eq( float128 a, float128 b STATUS_PARAM )
5635 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5636 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5637 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5638 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5640 if ( float128_is_signaling_nan( a )
5641 || float128_is_signaling_nan( b ) ) {
5642 float_raise( float_flag_invalid STATUS_VAR);
5644 return 0;
5646 return
5647 ( a.low == b.low )
5648 && ( ( a.high == b.high )
5649 || ( ( a.low == 0 )
5650 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5655 /*----------------------------------------------------------------------------
5656 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5657 | or equal to the corresponding value `b', and 0 otherwise. The comparison
5658 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5659 | Arithmetic.
5660 *----------------------------------------------------------------------------*/
5662 int float128_le( float128 a, float128 b STATUS_PARAM )
5664 flag aSign, bSign;
5666 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5667 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5668 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5669 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5671 float_raise( float_flag_invalid STATUS_VAR);
5672 return 0;
5674 aSign = extractFloat128Sign( a );
5675 bSign = extractFloat128Sign( b );
5676 if ( aSign != bSign ) {
5677 return
5678 aSign
5679 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5680 == 0 );
5682 return
5683 aSign ? le128( b.high, b.low, a.high, a.low )
5684 : le128( a.high, a.low, b.high, b.low );
5688 /*----------------------------------------------------------------------------
5689 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5690 | the corresponding value `b', and 0 otherwise. The comparison is performed
5691 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5692 *----------------------------------------------------------------------------*/
5694 int float128_lt( float128 a, float128 b STATUS_PARAM )
5696 flag aSign, bSign;
5698 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5699 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5700 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5701 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5703 float_raise( float_flag_invalid STATUS_VAR);
5704 return 0;
5706 aSign = extractFloat128Sign( a );
5707 bSign = extractFloat128Sign( b );
5708 if ( aSign != bSign ) {
5709 return
5710 aSign
5711 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5712 != 0 );
5714 return
5715 aSign ? lt128( b.high, b.low, a.high, a.low )
5716 : lt128( a.high, a.low, b.high, b.low );
5720 /*----------------------------------------------------------------------------
5721 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5722 | the corresponding value `b', and 0 otherwise. The invalid exception is
5723 | raised if either operand is a NaN. Otherwise, the comparison is performed
5724 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5725 *----------------------------------------------------------------------------*/
5727 int float128_eq_signaling( float128 a, float128 b STATUS_PARAM )
5730 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5731 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5732 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5733 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5735 float_raise( float_flag_invalid STATUS_VAR);
5736 return 0;
5738 return
5739 ( a.low == b.low )
5740 && ( ( a.high == b.high )
5741 || ( ( a.low == 0 )
5742 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5747 /*----------------------------------------------------------------------------
5748 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5749 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5750 | cause an exception. Otherwise, the comparison is performed according to the
5751 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5752 *----------------------------------------------------------------------------*/
5754 int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
5756 flag aSign, bSign;
5758 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5759 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5760 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5761 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5763 if ( float128_is_signaling_nan( a )
5764 || float128_is_signaling_nan( b ) ) {
5765 float_raise( float_flag_invalid STATUS_VAR);
5767 return 0;
5769 aSign = extractFloat128Sign( a );
5770 bSign = extractFloat128Sign( b );
5771 if ( aSign != bSign ) {
5772 return
5773 aSign
5774 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5775 == 0 );
5777 return
5778 aSign ? le128( b.high, b.low, a.high, a.low )
5779 : le128( a.high, a.low, b.high, b.low );
5783 /*----------------------------------------------------------------------------
5784 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5785 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5786 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
5787 | Standard for Binary Floating-Point Arithmetic.
5788 *----------------------------------------------------------------------------*/
5790 int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
5792 flag aSign, bSign;
5794 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5795 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5796 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5797 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5799 if ( float128_is_signaling_nan( a )
5800 || float128_is_signaling_nan( b ) ) {
5801 float_raise( float_flag_invalid STATUS_VAR);
5803 return 0;
5805 aSign = extractFloat128Sign( a );
5806 bSign = extractFloat128Sign( b );
5807 if ( aSign != bSign ) {
5808 return
5809 aSign
5810 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5811 != 0 );
5813 return
5814 aSign ? lt128( b.high, b.low, a.high, a.low )
5815 : lt128( a.high, a.low, b.high, b.low );
5819 #endif
5821 /* misc functions */
5822 float32 uint32_to_float32( unsigned int a STATUS_PARAM )
5824 return int64_to_float32(a STATUS_VAR);
5827 float64 uint32_to_float64( unsigned int a STATUS_PARAM )
5829 return int64_to_float64(a STATUS_VAR);
5832 unsigned int float32_to_uint32( float32 a STATUS_PARAM )
5834 int64_t v;
5835 unsigned int res;
5837 v = float32_to_int64(a STATUS_VAR);
5838 if (v < 0) {
5839 res = 0;
5840 float_raise( float_flag_invalid STATUS_VAR);
5841 } else if (v > 0xffffffff) {
5842 res = 0xffffffff;
5843 float_raise( float_flag_invalid STATUS_VAR);
5844 } else {
5845 res = v;
5847 return res;
5850 unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
5852 int64_t v;
5853 unsigned int res;
5855 v = float32_to_int64_round_to_zero(a STATUS_VAR);
5856 if (v < 0) {
5857 res = 0;
5858 float_raise( float_flag_invalid STATUS_VAR);
5859 } else if (v > 0xffffffff) {
5860 res = 0xffffffff;
5861 float_raise( float_flag_invalid STATUS_VAR);
5862 } else {
5863 res = v;
5865 return res;
5868 unsigned int float32_to_uint16_round_to_zero( float32 a STATUS_PARAM )
5870 int64_t v;
5871 unsigned int res;
5873 v = float32_to_int64_round_to_zero(a STATUS_VAR);
5874 if (v < 0) {
5875 res = 0;
5876 float_raise( float_flag_invalid STATUS_VAR);
5877 } else if (v > 0xffff) {
5878 res = 0xffff;
5879 float_raise( float_flag_invalid STATUS_VAR);
5880 } else {
5881 res = v;
5883 return res;
5886 unsigned int float64_to_uint32( float64 a STATUS_PARAM )
5888 int64_t v;
5889 unsigned int res;
5891 v = float64_to_int64(a STATUS_VAR);
5892 if (v < 0) {
5893 res = 0;
5894 float_raise( float_flag_invalid STATUS_VAR);
5895 } else if (v > 0xffffffff) {
5896 res = 0xffffffff;
5897 float_raise( float_flag_invalid STATUS_VAR);
5898 } else {
5899 res = v;
5901 return res;
5904 unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
5906 int64_t v;
5907 unsigned int res;
5909 v = float64_to_int64_round_to_zero(a STATUS_VAR);
5910 if (v < 0) {
5911 res = 0;
5912 float_raise( float_flag_invalid STATUS_VAR);
5913 } else if (v > 0xffffffff) {
5914 res = 0xffffffff;
5915 float_raise( float_flag_invalid STATUS_VAR);
5916 } else {
5917 res = v;
5919 return res;
5922 unsigned int float64_to_uint16_round_to_zero( float64 a STATUS_PARAM )
5924 int64_t v;
5925 unsigned int res;
5927 v = float64_to_int64_round_to_zero(a STATUS_VAR);
5928 if (v < 0) {
5929 res = 0;
5930 float_raise( float_flag_invalid STATUS_VAR);
5931 } else if (v > 0xffff) {
5932 res = 0xffff;
5933 float_raise( float_flag_invalid STATUS_VAR);
5934 } else {
5935 res = v;
5937 return res;
5940 /* FIXME: This looks broken. */
5941 uint64_t float64_to_uint64 (float64 a STATUS_PARAM)
5943 int64_t v;
5945 v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
5946 v += float64_val(a);
5947 v = float64_to_int64(make_float64(v) STATUS_VAR);
5949 return v - INT64_MIN;
5952 uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
5954 int64_t v;
5956 v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
5957 v += float64_val(a);
5958 v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR);
5960 return v - INT64_MIN;
5963 #define COMPARE(s, nan_exp) \
5964 INLINE int float ## s ## _compare_internal( float ## s a, float ## s b, \
5965 int is_quiet STATUS_PARAM ) \
5967 flag aSign, bSign; \
5968 uint ## s ## _t av, bv; \
5969 a = float ## s ## _squash_input_denormal(a STATUS_VAR); \
5970 b = float ## s ## _squash_input_denormal(b STATUS_VAR); \
5972 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \
5973 extractFloat ## s ## Frac( a ) ) || \
5974 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \
5975 extractFloat ## s ## Frac( b ) )) { \
5976 if (!is_quiet || \
5977 float ## s ## _is_signaling_nan( a ) || \
5978 float ## s ## _is_signaling_nan( b ) ) { \
5979 float_raise( float_flag_invalid STATUS_VAR); \
5981 return float_relation_unordered; \
5983 aSign = extractFloat ## s ## Sign( a ); \
5984 bSign = extractFloat ## s ## Sign( b ); \
5985 av = float ## s ## _val(a); \
5986 bv = float ## s ## _val(b); \
5987 if ( aSign != bSign ) { \
5988 if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) { \
5989 /* zero case */ \
5990 return float_relation_equal; \
5991 } else { \
5992 return 1 - (2 * aSign); \
5994 } else { \
5995 if (av == bv) { \
5996 return float_relation_equal; \
5997 } else { \
5998 return 1 - 2 * (aSign ^ ( av < bv )); \
6003 int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM ) \
6005 return float ## s ## _compare_internal(a, b, 0 STATUS_VAR); \
6008 int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM ) \
6010 return float ## s ## _compare_internal(a, b, 1 STATUS_VAR); \
6013 COMPARE(32, 0xff)
6014 COMPARE(64, 0x7ff)
6016 INLINE int float128_compare_internal( float128 a, float128 b,
6017 int is_quiet STATUS_PARAM )
6019 flag aSign, bSign;
6021 if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
6022 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
6023 ( ( extractFloat128Exp( b ) == 0x7fff ) &&
6024 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
6025 if (!is_quiet ||
6026 float128_is_signaling_nan( a ) ||
6027 float128_is_signaling_nan( b ) ) {
6028 float_raise( float_flag_invalid STATUS_VAR);
6030 return float_relation_unordered;
6032 aSign = extractFloat128Sign( a );
6033 bSign = extractFloat128Sign( b );
6034 if ( aSign != bSign ) {
6035 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
6036 /* zero case */
6037 return float_relation_equal;
6038 } else {
6039 return 1 - (2 * aSign);
6041 } else {
6042 if (a.low == b.low && a.high == b.high) {
6043 return float_relation_equal;
6044 } else {
6045 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6050 int float128_compare( float128 a, float128 b STATUS_PARAM )
6052 return float128_compare_internal(a, b, 0 STATUS_VAR);
6055 int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
6057 return float128_compare_internal(a, b, 1 STATUS_VAR);
6060 /* min() and max() functions. These can't be implemented as
6061 * 'compare and pick one input' because that would mishandle
6062 * NaNs and +0 vs -0.
6064 #define MINMAX(s, nan_exp) \
6065 INLINE float ## s float ## s ## _minmax(float ## s a, float ## s b, \
6066 int ismin STATUS_PARAM ) \
6068 flag aSign, bSign; \
6069 uint ## s ## _t av, bv; \
6070 a = float ## s ## _squash_input_denormal(a STATUS_VAR); \
6071 b = float ## s ## _squash_input_denormal(b STATUS_VAR); \
6072 if (float ## s ## _is_any_nan(a) || \
6073 float ## s ## _is_any_nan(b)) { \
6074 return propagateFloat ## s ## NaN(a, b STATUS_VAR); \
6076 aSign = extractFloat ## s ## Sign(a); \
6077 bSign = extractFloat ## s ## Sign(b); \
6078 av = float ## s ## _val(a); \
6079 bv = float ## s ## _val(b); \
6080 if (aSign != bSign) { \
6081 if (ismin) { \
6082 return aSign ? a : b; \
6083 } else { \
6084 return aSign ? b : a; \
6086 } else { \
6087 if (ismin) { \
6088 return (aSign ^ (av < bv)) ? a : b; \
6089 } else { \
6090 return (aSign ^ (av < bv)) ? b : a; \
6095 float ## s float ## s ## _min(float ## s a, float ## s b STATUS_PARAM) \
6097 return float ## s ## _minmax(a, b, 1 STATUS_VAR); \
6100 float ## s float ## s ## _max(float ## s a, float ## s b STATUS_PARAM) \
6102 return float ## s ## _minmax(a, b, 0 STATUS_VAR); \
6105 MINMAX(32, 0xff)
6106 MINMAX(64, 0x7ff)
6109 /* Multiply A by 2 raised to the power N. */
6110 float32 float32_scalbn( float32 a, int n STATUS_PARAM )
6112 flag aSign;
6113 int16 aExp;
6114 uint32_t aSig;
6116 a = float32_squash_input_denormal(a STATUS_VAR);
6117 aSig = extractFloat32Frac( a );
6118 aExp = extractFloat32Exp( a );
6119 aSign = extractFloat32Sign( a );
6121 if ( aExp == 0xFF ) {
6122 return a;
6124 if ( aExp != 0 )
6125 aSig |= 0x00800000;
6126 else if ( aSig == 0 )
6127 return a;
6129 aExp += n - 1;
6130 aSig <<= 7;
6131 return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
6134 float64 float64_scalbn( float64 a, int n STATUS_PARAM )
6136 flag aSign;
6137 int16 aExp;
6138 uint64_t aSig;
6140 a = float64_squash_input_denormal(a STATUS_VAR);
6141 aSig = extractFloat64Frac( a );
6142 aExp = extractFloat64Exp( a );
6143 aSign = extractFloat64Sign( a );
6145 if ( aExp == 0x7FF ) {
6146 return a;
6148 if ( aExp != 0 )
6149 aSig |= LIT64( 0x0010000000000000 );
6150 else if ( aSig == 0 )
6151 return a;
6153 aExp += n - 1;
6154 aSig <<= 10;
6155 return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
6158 #ifdef FLOATX80
6159 floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
6161 flag aSign;
6162 int16 aExp;
6163 uint64_t aSig;
6165 aSig = extractFloatx80Frac( a );
6166 aExp = extractFloatx80Exp( a );
6167 aSign = extractFloatx80Sign( a );
6169 if ( aExp == 0x7FF ) {
6170 return a;
6172 if (aExp == 0 && aSig == 0)
6173 return a;
6175 aExp += n;
6176 return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
6177 aSign, aExp, aSig, 0 STATUS_VAR );
6179 #endif
6181 #ifdef FLOAT128
6182 float128 float128_scalbn( float128 a, int n STATUS_PARAM )
6184 flag aSign;
6185 int32 aExp;
6186 uint64_t aSig0, aSig1;
6188 aSig1 = extractFloat128Frac1( a );
6189 aSig0 = extractFloat128Frac0( a );
6190 aExp = extractFloat128Exp( a );
6191 aSign = extractFloat128Sign( a );
6192 if ( aExp == 0x7FFF ) {
6193 return a;
6195 if ( aExp != 0 )
6196 aSig0 |= LIT64( 0x0001000000000000 );
6197 else if ( aSig0 == 0 && aSig1 == 0 )
6198 return a;
6200 aExp += n - 1;
6201 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
6202 STATUS_VAR );
6205 #endif