kvm: use qemu_free consistently
[qemu/stefanha.git] / fpu / softfloat.c
blobbaba1dc44b812c978ba04cb0f37718e4e67acab6
1 /*
2 * QEMU float support
4 * Derived from SoftFloat.
5 */
7 /*============================================================================
9 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
10 Package, Release 2b.
12 Written by John R. Hauser. This work was made possible in part by the
13 International Computer Science Institute, located at Suite 600, 1947 Center
14 Street, Berkeley, California 94704. Funding was partially provided by the
15 National Science Foundation under grant MIP-9311980. The original version
16 of this code was written as part of a project to build a fixed-point vector
17 processor in collaboration with the University of California at Berkeley,
18 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
19 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
20 arithmetic/SoftFloat.html'.
22 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
23 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
24 RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
25 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
26 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
27 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
28 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
29 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
31 Derivative works are acceptable, even for commercial purposes, so long as
32 (1) the source code for the derivative work includes prominent notice that
33 the work is derivative, and (2) the source code includes prominent notice with
34 these four paragraphs for those parts of this code that are retained.
36 =============================================================================*/
38 #include "softfloat.h"
40 /*----------------------------------------------------------------------------
41 | Primitive arithmetic functions, including multi-word arithmetic, and
42 | division and square root approximations. (Can be specialized to target if
43 | desired.)
44 *----------------------------------------------------------------------------*/
45 #include "softfloat-macros.h"
47 /*----------------------------------------------------------------------------
48 | Functions and definitions to determine: (1) whether tininess for underflow
49 | is detected before or after rounding by default, (2) what (if anything)
50 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
51 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
52 | are propagated from function inputs to output. These details are target-
53 | specific.
54 *----------------------------------------------------------------------------*/
55 #include "softfloat-specialize.h"
57 void set_float_rounding_mode(int val STATUS_PARAM)
59 STATUS(float_rounding_mode) = val;
62 void set_float_exception_flags(int val STATUS_PARAM)
64 STATUS(float_exception_flags) = val;
67 #ifdef FLOATX80
68 void set_floatx80_rounding_precision(int val STATUS_PARAM)
70 STATUS(floatx80_rounding_precision) = val;
72 #endif
74 /*----------------------------------------------------------------------------
75 | Returns the fraction bits of the half-precision floating-point value `a'.
76 *----------------------------------------------------------------------------*/
78 INLINE uint32_t extractFloat16Frac(float16 a)
80 return float16_val(a) & 0x3ff;
83 /*----------------------------------------------------------------------------
84 | Returns the exponent bits of the half-precision floating-point value `a'.
85 *----------------------------------------------------------------------------*/
87 INLINE int16 extractFloat16Exp(float16 a)
89 return (float16_val(a) >> 10) & 0x1f;
92 /*----------------------------------------------------------------------------
93 | Returns the sign bit of the single-precision floating-point value `a'.
94 *----------------------------------------------------------------------------*/
96 INLINE flag extractFloat16Sign(float16 a)
98 return float16_val(a)>>15;
101 /*----------------------------------------------------------------------------
102 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
103 | and 7, and returns the properly rounded 32-bit integer corresponding to the
104 | input. If `zSign' is 1, the input is negated before being converted to an
105 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
106 | is simply rounded to an integer, with the inexact exception raised if the
107 | input cannot be represented exactly as an integer. However, if the fixed-
108 | point input is too large, the invalid exception is raised and the largest
109 | positive or negative integer is returned.
110 *----------------------------------------------------------------------------*/
112 static int32 roundAndPackInt32( flag zSign, uint64_t absZ STATUS_PARAM)
114 int8 roundingMode;
115 flag roundNearestEven;
116 int8 roundIncrement, roundBits;
117 int32 z;
119 roundingMode = STATUS(float_rounding_mode);
120 roundNearestEven = ( roundingMode == float_round_nearest_even );
121 roundIncrement = 0x40;
122 if ( ! roundNearestEven ) {
123 if ( roundingMode == float_round_to_zero ) {
124 roundIncrement = 0;
126 else {
127 roundIncrement = 0x7F;
128 if ( zSign ) {
129 if ( roundingMode == float_round_up ) roundIncrement = 0;
131 else {
132 if ( roundingMode == float_round_down ) roundIncrement = 0;
136 roundBits = absZ & 0x7F;
137 absZ = ( absZ + roundIncrement )>>7;
138 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
139 z = absZ;
140 if ( zSign ) z = - z;
141 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
142 float_raise( float_flag_invalid STATUS_VAR);
143 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
145 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
146 return z;
150 /*----------------------------------------------------------------------------
151 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
152 | `absZ1', with binary point between bits 63 and 64 (between the input words),
153 | and returns the properly rounded 64-bit integer corresponding to the input.
154 | If `zSign' is 1, the input is negated before being converted to an integer.
155 | Ordinarily, the fixed-point input is simply rounded to an integer, with
156 | the inexact exception raised if the input cannot be represented exactly as
157 | an integer. However, if the fixed-point input is too large, the invalid
158 | exception is raised and the largest positive or negative integer is
159 | returned.
160 *----------------------------------------------------------------------------*/
162 static int64 roundAndPackInt64( flag zSign, uint64_t absZ0, uint64_t absZ1 STATUS_PARAM)
164 int8 roundingMode;
165 flag roundNearestEven, increment;
166 int64 z;
168 roundingMode = STATUS(float_rounding_mode);
169 roundNearestEven = ( roundingMode == float_round_nearest_even );
170 increment = ( (int64_t) absZ1 < 0 );
171 if ( ! roundNearestEven ) {
172 if ( roundingMode == float_round_to_zero ) {
173 increment = 0;
175 else {
176 if ( zSign ) {
177 increment = ( roundingMode == float_round_down ) && absZ1;
179 else {
180 increment = ( roundingMode == float_round_up ) && absZ1;
184 if ( increment ) {
185 ++absZ0;
186 if ( absZ0 == 0 ) goto overflow;
187 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
189 z = absZ0;
190 if ( zSign ) z = - z;
191 if ( z && ( ( z < 0 ) ^ zSign ) ) {
192 overflow:
193 float_raise( float_flag_invalid STATUS_VAR);
194 return
195 zSign ? (int64_t) LIT64( 0x8000000000000000 )
196 : LIT64( 0x7FFFFFFFFFFFFFFF );
198 if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
199 return z;
203 /*----------------------------------------------------------------------------
204 | Returns the fraction bits of the single-precision floating-point value `a'.
205 *----------------------------------------------------------------------------*/
207 INLINE uint32_t extractFloat32Frac( float32 a )
210 return float32_val(a) & 0x007FFFFF;
214 /*----------------------------------------------------------------------------
215 | Returns the exponent bits of the single-precision floating-point value `a'.
216 *----------------------------------------------------------------------------*/
218 INLINE int16 extractFloat32Exp( float32 a )
221 return ( float32_val(a)>>23 ) & 0xFF;
225 /*----------------------------------------------------------------------------
226 | Returns the sign bit of the single-precision floating-point value `a'.
227 *----------------------------------------------------------------------------*/
229 INLINE flag extractFloat32Sign( float32 a )
232 return float32_val(a)>>31;
236 /*----------------------------------------------------------------------------
237 | If `a' is denormal and we are in flush-to-zero mode then set the
238 | input-denormal exception and return zero. Otherwise just return the value.
239 *----------------------------------------------------------------------------*/
240 static float32 float32_squash_input_denormal(float32 a STATUS_PARAM)
242 if (STATUS(flush_inputs_to_zero)) {
243 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
244 float_raise(float_flag_input_denormal STATUS_VAR);
245 return make_float32(float32_val(a) & 0x80000000);
248 return a;
251 /*----------------------------------------------------------------------------
252 | Normalizes the subnormal single-precision floating-point value represented
253 | by the denormalized significand `aSig'. The normalized exponent and
254 | significand are stored at the locations pointed to by `zExpPtr' and
255 | `zSigPtr', respectively.
256 *----------------------------------------------------------------------------*/
258 static void
259 normalizeFloat32Subnormal( uint32_t aSig, int16 *zExpPtr, uint32_t *zSigPtr )
261 int8 shiftCount;
263 shiftCount = countLeadingZeros32( aSig ) - 8;
264 *zSigPtr = aSig<<shiftCount;
265 *zExpPtr = 1 - shiftCount;
269 /*----------------------------------------------------------------------------
270 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
271 | single-precision floating-point value, returning the result. After being
272 | shifted into the proper positions, the three fields are simply added
273 | together to form the result. This means that any integer portion of `zSig'
274 | will be added into the exponent. Since a properly normalized significand
275 | will have an integer portion equal to 1, the `zExp' input should be 1 less
276 | than the desired result exponent whenever `zSig' is a complete, normalized
277 | significand.
278 *----------------------------------------------------------------------------*/
280 INLINE float32 packFloat32( flag zSign, int16 zExp, uint32_t zSig )
283 return make_float32(
284 ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
288 /*----------------------------------------------------------------------------
289 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
290 | and significand `zSig', and returns the proper single-precision floating-
291 | point value corresponding to the abstract input. Ordinarily, the abstract
292 | value is simply rounded and packed into the single-precision format, with
293 | the inexact exception raised if the abstract input cannot be represented
294 | exactly. However, if the abstract value is too large, the overflow and
295 | inexact exceptions are raised and an infinity or maximal finite value is
296 | returned. If the abstract value is too small, the input value is rounded to
297 | a subnormal number, and the underflow and inexact exceptions are raised if
298 | the abstract input cannot be represented exactly as a subnormal single-
299 | precision floating-point number.
300 | The input significand `zSig' has its binary point between bits 30
301 | and 29, which is 7 bits to the left of the usual location. This shifted
302 | significand must be normalized or smaller. If `zSig' is not normalized,
303 | `zExp' must be 0; in that case, the result returned is a subnormal number,
304 | and it must not require rounding. In the usual case that `zSig' is
305 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
306 | The handling of underflow and overflow follows the IEC/IEEE Standard for
307 | Binary Floating-Point Arithmetic.
308 *----------------------------------------------------------------------------*/
310 static float32 roundAndPackFloat32( flag zSign, int16 zExp, uint32_t zSig STATUS_PARAM)
312 int8 roundingMode;
313 flag roundNearestEven;
314 int8 roundIncrement, roundBits;
315 flag isTiny;
317 roundingMode = STATUS(float_rounding_mode);
318 roundNearestEven = ( roundingMode == float_round_nearest_even );
319 roundIncrement = 0x40;
320 if ( ! roundNearestEven ) {
321 if ( roundingMode == float_round_to_zero ) {
322 roundIncrement = 0;
324 else {
325 roundIncrement = 0x7F;
326 if ( zSign ) {
327 if ( roundingMode == float_round_up ) roundIncrement = 0;
329 else {
330 if ( roundingMode == float_round_down ) roundIncrement = 0;
334 roundBits = zSig & 0x7F;
335 if ( 0xFD <= (uint16_t) zExp ) {
336 if ( ( 0xFD < zExp )
337 || ( ( zExp == 0xFD )
338 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
340 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
341 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
343 if ( zExp < 0 ) {
344 if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
345 isTiny =
346 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
347 || ( zExp < -1 )
348 || ( zSig + roundIncrement < 0x80000000 );
349 shift32RightJamming( zSig, - zExp, &zSig );
350 zExp = 0;
351 roundBits = zSig & 0x7F;
352 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
355 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
356 zSig = ( zSig + roundIncrement )>>7;
357 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
358 if ( zSig == 0 ) zExp = 0;
359 return packFloat32( zSign, zExp, zSig );
363 /*----------------------------------------------------------------------------
364 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
365 | and significand `zSig', and returns the proper single-precision floating-
366 | point value corresponding to the abstract input. This routine is just like
367 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
368 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
369 | floating-point exponent.
370 *----------------------------------------------------------------------------*/
372 static float32
373 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, uint32_t zSig STATUS_PARAM)
375 int8 shiftCount;
377 shiftCount = countLeadingZeros32( zSig ) - 1;
378 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
382 /*----------------------------------------------------------------------------
383 | Returns the fraction bits of the double-precision floating-point value `a'.
384 *----------------------------------------------------------------------------*/
386 INLINE uint64_t extractFloat64Frac( float64 a )
389 return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
393 /*----------------------------------------------------------------------------
394 | Returns the exponent bits of the double-precision floating-point value `a'.
395 *----------------------------------------------------------------------------*/
397 INLINE int16 extractFloat64Exp( float64 a )
400 return ( float64_val(a)>>52 ) & 0x7FF;
404 /*----------------------------------------------------------------------------
405 | Returns the sign bit of the double-precision floating-point value `a'.
406 *----------------------------------------------------------------------------*/
408 INLINE flag extractFloat64Sign( float64 a )
411 return float64_val(a)>>63;
415 /*----------------------------------------------------------------------------
416 | If `a' is denormal and we are in flush-to-zero mode then set the
417 | input-denormal exception and return zero. Otherwise just return the value.
418 *----------------------------------------------------------------------------*/
419 static float64 float64_squash_input_denormal(float64 a STATUS_PARAM)
421 if (STATUS(flush_inputs_to_zero)) {
422 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
423 float_raise(float_flag_input_denormal STATUS_VAR);
424 return make_float64(float64_val(a) & (1ULL << 63));
427 return a;
430 /*----------------------------------------------------------------------------
431 | Normalizes the subnormal double-precision floating-point value represented
432 | by the denormalized significand `aSig'. The normalized exponent and
433 | significand are stored at the locations pointed to by `zExpPtr' and
434 | `zSigPtr', respectively.
435 *----------------------------------------------------------------------------*/
437 static void
438 normalizeFloat64Subnormal( uint64_t aSig, int16 *zExpPtr, uint64_t *zSigPtr )
440 int8 shiftCount;
442 shiftCount = countLeadingZeros64( aSig ) - 11;
443 *zSigPtr = aSig<<shiftCount;
444 *zExpPtr = 1 - shiftCount;
448 /*----------------------------------------------------------------------------
449 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
450 | double-precision floating-point value, returning the result. After being
451 | shifted into the proper positions, the three fields are simply added
452 | together to form the result. This means that any integer portion of `zSig'
453 | will be added into the exponent. Since a properly normalized significand
454 | will have an integer portion equal to 1, the `zExp' input should be 1 less
455 | than the desired result exponent whenever `zSig' is a complete, normalized
456 | significand.
457 *----------------------------------------------------------------------------*/
459 INLINE float64 packFloat64( flag zSign, int16 zExp, uint64_t zSig )
462 return make_float64(
463 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
467 /*----------------------------------------------------------------------------
468 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
469 | and significand `zSig', and returns the proper double-precision floating-
470 | point value corresponding to the abstract input. Ordinarily, the abstract
471 | value is simply rounded and packed into the double-precision format, with
472 | the inexact exception raised if the abstract input cannot be represented
473 | exactly. However, if the abstract value is too large, the overflow and
474 | inexact exceptions are raised and an infinity or maximal finite value is
475 | returned. If the abstract value is too small, the input value is rounded
476 | to a subnormal number, and the underflow and inexact exceptions are raised
477 | if the abstract input cannot be represented exactly as a subnormal double-
478 | precision floating-point number.
479 | The input significand `zSig' has its binary point between bits 62
480 | and 61, which is 10 bits to the left of the usual location. This shifted
481 | significand must be normalized or smaller. If `zSig' is not normalized,
482 | `zExp' must be 0; in that case, the result returned is a subnormal number,
483 | and it must not require rounding. In the usual case that `zSig' is
484 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
485 | The handling of underflow and overflow follows the IEC/IEEE Standard for
486 | Binary Floating-Point Arithmetic.
487 *----------------------------------------------------------------------------*/
489 static float64 roundAndPackFloat64( flag zSign, int16 zExp, uint64_t zSig STATUS_PARAM)
491 int8 roundingMode;
492 flag roundNearestEven;
493 int16 roundIncrement, roundBits;
494 flag isTiny;
496 roundingMode = STATUS(float_rounding_mode);
497 roundNearestEven = ( roundingMode == float_round_nearest_even );
498 roundIncrement = 0x200;
499 if ( ! roundNearestEven ) {
500 if ( roundingMode == float_round_to_zero ) {
501 roundIncrement = 0;
503 else {
504 roundIncrement = 0x3FF;
505 if ( zSign ) {
506 if ( roundingMode == float_round_up ) roundIncrement = 0;
508 else {
509 if ( roundingMode == float_round_down ) roundIncrement = 0;
513 roundBits = zSig & 0x3FF;
514 if ( 0x7FD <= (uint16_t) zExp ) {
515 if ( ( 0x7FD < zExp )
516 || ( ( zExp == 0x7FD )
517 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
519 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
520 return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
522 if ( zExp < 0 ) {
523 if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
524 isTiny =
525 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
526 || ( zExp < -1 )
527 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
528 shift64RightJamming( zSig, - zExp, &zSig );
529 zExp = 0;
530 roundBits = zSig & 0x3FF;
531 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
534 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
535 zSig = ( zSig + roundIncrement )>>10;
536 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
537 if ( zSig == 0 ) zExp = 0;
538 return packFloat64( zSign, zExp, zSig );
542 /*----------------------------------------------------------------------------
543 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
544 | and significand `zSig', and returns the proper double-precision floating-
545 | point value corresponding to the abstract input. This routine is just like
546 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
547 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
548 | floating-point exponent.
549 *----------------------------------------------------------------------------*/
551 static float64
552 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, uint64_t zSig STATUS_PARAM)
554 int8 shiftCount;
556 shiftCount = countLeadingZeros64( zSig ) - 1;
557 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
561 #ifdef FLOATX80
563 /*----------------------------------------------------------------------------
564 | Returns the fraction bits of the extended double-precision floating-point
565 | value `a'.
566 *----------------------------------------------------------------------------*/
568 INLINE uint64_t extractFloatx80Frac( floatx80 a )
571 return a.low;
575 /*----------------------------------------------------------------------------
576 | Returns the exponent bits of the extended double-precision floating-point
577 | value `a'.
578 *----------------------------------------------------------------------------*/
580 INLINE int32 extractFloatx80Exp( floatx80 a )
583 return a.high & 0x7FFF;
587 /*----------------------------------------------------------------------------
588 | Returns the sign bit of the extended double-precision floating-point value
589 | `a'.
590 *----------------------------------------------------------------------------*/
592 INLINE flag extractFloatx80Sign( floatx80 a )
595 return a.high>>15;
599 /*----------------------------------------------------------------------------
600 | Normalizes the subnormal extended double-precision floating-point value
601 | represented by the denormalized significand `aSig'. The normalized exponent
602 | and significand are stored at the locations pointed to by `zExpPtr' and
603 | `zSigPtr', respectively.
604 *----------------------------------------------------------------------------*/
606 static void
607 normalizeFloatx80Subnormal( uint64_t aSig, int32 *zExpPtr, uint64_t *zSigPtr )
609 int8 shiftCount;
611 shiftCount = countLeadingZeros64( aSig );
612 *zSigPtr = aSig<<shiftCount;
613 *zExpPtr = 1 - shiftCount;
617 /*----------------------------------------------------------------------------
618 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
619 | extended double-precision floating-point value, returning the result.
620 *----------------------------------------------------------------------------*/
622 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, uint64_t zSig )
624 floatx80 z;
626 z.low = zSig;
627 z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
628 return z;
632 /*----------------------------------------------------------------------------
633 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
634 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
635 | and returns the proper extended double-precision floating-point value
636 | corresponding to the abstract input. Ordinarily, the abstract value is
637 | rounded and packed into the extended double-precision format, with the
638 | inexact exception raised if the abstract input cannot be represented
639 | exactly. However, if the abstract value is too large, the overflow and
640 | inexact exceptions are raised and an infinity or maximal finite value is
641 | returned. If the abstract value is too small, the input value is rounded to
642 | a subnormal number, and the underflow and inexact exceptions are raised if
643 | the abstract input cannot be represented exactly as a subnormal extended
644 | double-precision floating-point number.
645 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
646 | number of bits as single or double precision, respectively. Otherwise, the
647 | result is rounded to the full precision of the extended double-precision
648 | format.
649 | The input significand must be normalized or smaller. If the input
650 | significand is not normalized, `zExp' must be 0; in that case, the result
651 | returned is a subnormal number, and it must not require rounding. The
652 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
653 | Floating-Point Arithmetic.
654 *----------------------------------------------------------------------------*/
656 static floatx80
657 roundAndPackFloatx80(
658 int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
659 STATUS_PARAM)
661 int8 roundingMode;
662 flag roundNearestEven, increment, isTiny;
663 int64 roundIncrement, roundMask, roundBits;
665 roundingMode = STATUS(float_rounding_mode);
666 roundNearestEven = ( roundingMode == float_round_nearest_even );
667 if ( roundingPrecision == 80 ) goto precision80;
668 if ( roundingPrecision == 64 ) {
669 roundIncrement = LIT64( 0x0000000000000400 );
670 roundMask = LIT64( 0x00000000000007FF );
672 else if ( roundingPrecision == 32 ) {
673 roundIncrement = LIT64( 0x0000008000000000 );
674 roundMask = LIT64( 0x000000FFFFFFFFFF );
676 else {
677 goto precision80;
679 zSig0 |= ( zSig1 != 0 );
680 if ( ! roundNearestEven ) {
681 if ( roundingMode == float_round_to_zero ) {
682 roundIncrement = 0;
684 else {
685 roundIncrement = roundMask;
686 if ( zSign ) {
687 if ( roundingMode == float_round_up ) roundIncrement = 0;
689 else {
690 if ( roundingMode == float_round_down ) roundIncrement = 0;
694 roundBits = zSig0 & roundMask;
695 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
696 if ( ( 0x7FFE < zExp )
697 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
699 goto overflow;
701 if ( zExp <= 0 ) {
702 if ( STATUS(flush_to_zero) ) return packFloatx80( zSign, 0, 0 );
703 isTiny =
704 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
705 || ( zExp < 0 )
706 || ( zSig0 <= zSig0 + roundIncrement );
707 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
708 zExp = 0;
709 roundBits = zSig0 & roundMask;
710 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
711 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
712 zSig0 += roundIncrement;
713 if ( (int64_t) zSig0 < 0 ) zExp = 1;
714 roundIncrement = roundMask + 1;
715 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
716 roundMask |= roundIncrement;
718 zSig0 &= ~ roundMask;
719 return packFloatx80( zSign, zExp, zSig0 );
722 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
723 zSig0 += roundIncrement;
724 if ( zSig0 < roundIncrement ) {
725 ++zExp;
726 zSig0 = LIT64( 0x8000000000000000 );
728 roundIncrement = roundMask + 1;
729 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
730 roundMask |= roundIncrement;
732 zSig0 &= ~ roundMask;
733 if ( zSig0 == 0 ) zExp = 0;
734 return packFloatx80( zSign, zExp, zSig0 );
735 precision80:
736 increment = ( (int64_t) zSig1 < 0 );
737 if ( ! roundNearestEven ) {
738 if ( roundingMode == float_round_to_zero ) {
739 increment = 0;
741 else {
742 if ( zSign ) {
743 increment = ( roundingMode == float_round_down ) && zSig1;
745 else {
746 increment = ( roundingMode == float_round_up ) && zSig1;
750 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
751 if ( ( 0x7FFE < zExp )
752 || ( ( zExp == 0x7FFE )
753 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
754 && increment
757 roundMask = 0;
758 overflow:
759 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
760 if ( ( roundingMode == float_round_to_zero )
761 || ( zSign && ( roundingMode == float_round_up ) )
762 || ( ! zSign && ( roundingMode == float_round_down ) )
764 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
766 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
768 if ( zExp <= 0 ) {
769 isTiny =
770 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
771 || ( zExp < 0 )
772 || ! increment
773 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
774 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
775 zExp = 0;
776 if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
777 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
778 if ( roundNearestEven ) {
779 increment = ( (int64_t) zSig1 < 0 );
781 else {
782 if ( zSign ) {
783 increment = ( roundingMode == float_round_down ) && zSig1;
785 else {
786 increment = ( roundingMode == float_round_up ) && zSig1;
789 if ( increment ) {
790 ++zSig0;
791 zSig0 &=
792 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
793 if ( (int64_t) zSig0 < 0 ) zExp = 1;
795 return packFloatx80( zSign, zExp, zSig0 );
798 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
799 if ( increment ) {
800 ++zSig0;
801 if ( zSig0 == 0 ) {
802 ++zExp;
803 zSig0 = LIT64( 0x8000000000000000 );
805 else {
806 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
809 else {
810 if ( zSig0 == 0 ) zExp = 0;
812 return packFloatx80( zSign, zExp, zSig0 );
816 /*----------------------------------------------------------------------------
817 | Takes an abstract floating-point value having sign `zSign', exponent
818 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
819 | and returns the proper extended double-precision floating-point value
820 | corresponding to the abstract input. This routine is just like
821 | `roundAndPackFloatx80' except that the input significand does not have to be
822 | normalized.
823 *----------------------------------------------------------------------------*/
825 static floatx80
826 normalizeRoundAndPackFloatx80(
827 int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
828 STATUS_PARAM)
830 int8 shiftCount;
832 if ( zSig0 == 0 ) {
833 zSig0 = zSig1;
834 zSig1 = 0;
835 zExp -= 64;
837 shiftCount = countLeadingZeros64( zSig0 );
838 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
839 zExp -= shiftCount;
840 return
841 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
845 #endif
847 #ifdef FLOAT128
849 /*----------------------------------------------------------------------------
850 | Returns the least-significant 64 fraction bits of the quadruple-precision
851 | floating-point value `a'.
852 *----------------------------------------------------------------------------*/
854 INLINE uint64_t extractFloat128Frac1( float128 a )
857 return a.low;
861 /*----------------------------------------------------------------------------
862 | Returns the most-significant 48 fraction bits of the quadruple-precision
863 | floating-point value `a'.
864 *----------------------------------------------------------------------------*/
866 INLINE uint64_t extractFloat128Frac0( float128 a )
869 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
873 /*----------------------------------------------------------------------------
874 | Returns the exponent bits of the quadruple-precision floating-point value
875 | `a'.
876 *----------------------------------------------------------------------------*/
878 INLINE int32 extractFloat128Exp( float128 a )
881 return ( a.high>>48 ) & 0x7FFF;
885 /*----------------------------------------------------------------------------
886 | Returns the sign bit of the quadruple-precision floating-point value `a'.
887 *----------------------------------------------------------------------------*/
889 INLINE flag extractFloat128Sign( float128 a )
892 return a.high>>63;
896 /*----------------------------------------------------------------------------
897 | Normalizes the subnormal quadruple-precision floating-point value
898 | represented by the denormalized significand formed by the concatenation of
899 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
900 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
901 | significand are stored at the location pointed to by `zSig0Ptr', and the
902 | least significant 64 bits of the normalized significand are stored at the
903 | location pointed to by `zSig1Ptr'.
904 *----------------------------------------------------------------------------*/
906 static void
907 normalizeFloat128Subnormal(
908 uint64_t aSig0,
909 uint64_t aSig1,
910 int32 *zExpPtr,
911 uint64_t *zSig0Ptr,
912 uint64_t *zSig1Ptr
915 int8 shiftCount;
917 if ( aSig0 == 0 ) {
918 shiftCount = countLeadingZeros64( aSig1 ) - 15;
919 if ( shiftCount < 0 ) {
920 *zSig0Ptr = aSig1>>( - shiftCount );
921 *zSig1Ptr = aSig1<<( shiftCount & 63 );
923 else {
924 *zSig0Ptr = aSig1<<shiftCount;
925 *zSig1Ptr = 0;
927 *zExpPtr = - shiftCount - 63;
929 else {
930 shiftCount = countLeadingZeros64( aSig0 ) - 15;
931 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
932 *zExpPtr = 1 - shiftCount;
937 /*----------------------------------------------------------------------------
938 | Packs the sign `zSign', the exponent `zExp', and the significand formed
939 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
940 | floating-point value, returning the result. After being shifted into the
941 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
942 | added together to form the most significant 32 bits of the result. This
943 | means that any integer portion of `zSig0' will be added into the exponent.
944 | Since a properly normalized significand will have an integer portion equal
945 | to 1, the `zExp' input should be 1 less than the desired result exponent
946 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
947 | significand.
948 *----------------------------------------------------------------------------*/
950 INLINE float128
951 packFloat128( flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 )
953 float128 z;
955 z.low = zSig1;
956 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
957 return z;
961 /*----------------------------------------------------------------------------
962 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
963 | and extended significand formed by the concatenation of `zSig0', `zSig1',
964 | and `zSig2', and returns the proper quadruple-precision floating-point value
965 | corresponding to the abstract input. Ordinarily, the abstract value is
966 | simply rounded and packed into the quadruple-precision format, with the
967 | inexact exception raised if the abstract input cannot be represented
968 | exactly. However, if the abstract value is too large, the overflow and
969 | inexact exceptions are raised and an infinity or maximal finite value is
970 | returned. If the abstract value is too small, the input value is rounded to
971 | a subnormal number, and the underflow and inexact exceptions are raised if
972 | the abstract input cannot be represented exactly as a subnormal quadruple-
973 | precision floating-point number.
974 | The input significand must be normalized or smaller. If the input
975 | significand is not normalized, `zExp' must be 0; in that case, the result
976 | returned is a subnormal number, and it must not require rounding. In the
977 | usual case that the input significand is normalized, `zExp' must be 1 less
978 | than the ``true'' floating-point exponent. The handling of underflow and
979 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
980 *----------------------------------------------------------------------------*/
982 static float128
983 roundAndPackFloat128(
984 flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1, uint64_t zSig2 STATUS_PARAM)
986 int8 roundingMode;
987 flag roundNearestEven, increment, isTiny;
989 roundingMode = STATUS(float_rounding_mode);
990 roundNearestEven = ( roundingMode == float_round_nearest_even );
991 increment = ( (int64_t) zSig2 < 0 );
992 if ( ! roundNearestEven ) {
993 if ( roundingMode == float_round_to_zero ) {
994 increment = 0;
996 else {
997 if ( zSign ) {
998 increment = ( roundingMode == float_round_down ) && zSig2;
1000 else {
1001 increment = ( roundingMode == float_round_up ) && zSig2;
1005 if ( 0x7FFD <= (uint32_t) zExp ) {
1006 if ( ( 0x7FFD < zExp )
1007 || ( ( zExp == 0x7FFD )
1008 && eq128(
1009 LIT64( 0x0001FFFFFFFFFFFF ),
1010 LIT64( 0xFFFFFFFFFFFFFFFF ),
1011 zSig0,
1012 zSig1
1014 && increment
1017 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
1018 if ( ( roundingMode == float_round_to_zero )
1019 || ( zSign && ( roundingMode == float_round_up ) )
1020 || ( ! zSign && ( roundingMode == float_round_down ) )
1022 return
1023 packFloat128(
1024 zSign,
1025 0x7FFE,
1026 LIT64( 0x0000FFFFFFFFFFFF ),
1027 LIT64( 0xFFFFFFFFFFFFFFFF )
1030 return packFloat128( zSign, 0x7FFF, 0, 0 );
1032 if ( zExp < 0 ) {
1033 if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
1034 isTiny =
1035 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
1036 || ( zExp < -1 )
1037 || ! increment
1038 || lt128(
1039 zSig0,
1040 zSig1,
1041 LIT64( 0x0001FFFFFFFFFFFF ),
1042 LIT64( 0xFFFFFFFFFFFFFFFF )
1044 shift128ExtraRightJamming(
1045 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1046 zExp = 0;
1047 if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
1048 if ( roundNearestEven ) {
1049 increment = ( (int64_t) zSig2 < 0 );
1051 else {
1052 if ( zSign ) {
1053 increment = ( roundingMode == float_round_down ) && zSig2;
1055 else {
1056 increment = ( roundingMode == float_round_up ) && zSig2;
1061 if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
1062 if ( increment ) {
1063 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1064 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1066 else {
1067 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1069 return packFloat128( zSign, zExp, zSig0, zSig1 );
1073 /*----------------------------------------------------------------------------
1074 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1075 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1076 | returns the proper quadruple-precision floating-point value corresponding
1077 | to the abstract input. This routine is just like `roundAndPackFloat128'
1078 | except that the input significand has fewer bits and does not have to be
1079 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1080 | point exponent.
1081 *----------------------------------------------------------------------------*/
1083 static float128
1084 normalizeRoundAndPackFloat128(
1085 flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 STATUS_PARAM)
1087 int8 shiftCount;
1088 uint64_t zSig2;
1090 if ( zSig0 == 0 ) {
1091 zSig0 = zSig1;
1092 zSig1 = 0;
1093 zExp -= 64;
1095 shiftCount = countLeadingZeros64( zSig0 ) - 15;
1096 if ( 0 <= shiftCount ) {
1097 zSig2 = 0;
1098 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1100 else {
1101 shift128ExtraRightJamming(
1102 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1104 zExp -= shiftCount;
1105 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1109 #endif
1111 /*----------------------------------------------------------------------------
1112 | Returns the result of converting the 32-bit two's complement integer `a'
1113 | to the single-precision floating-point format. The conversion is performed
1114 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1115 *----------------------------------------------------------------------------*/
1117 float32 int32_to_float32( int32 a STATUS_PARAM )
1119 flag zSign;
1121 if ( a == 0 ) return float32_zero;
1122 if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1123 zSign = ( a < 0 );
1124 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1128 /*----------------------------------------------------------------------------
1129 | Returns the result of converting the 32-bit two's complement integer `a'
1130 | to the double-precision floating-point format. The conversion is performed
1131 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1132 *----------------------------------------------------------------------------*/
1134 float64 int32_to_float64( int32 a STATUS_PARAM )
1136 flag zSign;
1137 uint32 absA;
1138 int8 shiftCount;
1139 uint64_t zSig;
1141 if ( a == 0 ) return float64_zero;
1142 zSign = ( a < 0 );
1143 absA = zSign ? - a : a;
1144 shiftCount = countLeadingZeros32( absA ) + 21;
1145 zSig = absA;
1146 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1150 #ifdef FLOATX80
1152 /*----------------------------------------------------------------------------
1153 | Returns the result of converting the 32-bit two's complement integer `a'
1154 | to the extended double-precision floating-point format. The conversion
1155 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1156 | Arithmetic.
1157 *----------------------------------------------------------------------------*/
1159 floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1161 flag zSign;
1162 uint32 absA;
1163 int8 shiftCount;
1164 uint64_t zSig;
1166 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1167 zSign = ( a < 0 );
1168 absA = zSign ? - a : a;
1169 shiftCount = countLeadingZeros32( absA ) + 32;
1170 zSig = absA;
1171 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1175 #endif
1177 #ifdef FLOAT128
1179 /*----------------------------------------------------------------------------
1180 | Returns the result of converting the 32-bit two's complement integer `a' to
1181 | the quadruple-precision floating-point format. The conversion is performed
1182 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1183 *----------------------------------------------------------------------------*/
1185 float128 int32_to_float128( int32 a STATUS_PARAM )
1187 flag zSign;
1188 uint32 absA;
1189 int8 shiftCount;
1190 uint64_t zSig0;
1192 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1193 zSign = ( a < 0 );
1194 absA = zSign ? - a : a;
1195 shiftCount = countLeadingZeros32( absA ) + 17;
1196 zSig0 = absA;
1197 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1201 #endif
1203 /*----------------------------------------------------------------------------
1204 | Returns the result of converting the 64-bit two's complement integer `a'
1205 | to the single-precision floating-point format. The conversion is performed
1206 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1207 *----------------------------------------------------------------------------*/
1209 float32 int64_to_float32( int64 a STATUS_PARAM )
1211 flag zSign;
1212 uint64 absA;
1213 int8 shiftCount;
1215 if ( a == 0 ) return float32_zero;
1216 zSign = ( a < 0 );
1217 absA = zSign ? - a : a;
1218 shiftCount = countLeadingZeros64( absA ) - 40;
1219 if ( 0 <= shiftCount ) {
1220 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1222 else {
1223 shiftCount += 7;
1224 if ( shiftCount < 0 ) {
1225 shift64RightJamming( absA, - shiftCount, &absA );
1227 else {
1228 absA <<= shiftCount;
1230 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1235 float32 uint64_to_float32( uint64 a STATUS_PARAM )
1237 int8 shiftCount;
1239 if ( a == 0 ) return float32_zero;
1240 shiftCount = countLeadingZeros64( a ) - 40;
1241 if ( 0 <= shiftCount ) {
1242 return packFloat32( 1 > 0, 0x95 - shiftCount, a<<shiftCount );
1244 else {
1245 shiftCount += 7;
1246 if ( shiftCount < 0 ) {
1247 shift64RightJamming( a, - shiftCount, &a );
1249 else {
1250 a <<= shiftCount;
1252 return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount, a STATUS_VAR );
1256 /*----------------------------------------------------------------------------
1257 | Returns the result of converting the 64-bit two's complement integer `a'
1258 | to the double-precision floating-point format. The conversion is performed
1259 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1260 *----------------------------------------------------------------------------*/
1262 float64 int64_to_float64( int64 a STATUS_PARAM )
1264 flag zSign;
1266 if ( a == 0 ) return float64_zero;
1267 if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
1268 return packFloat64( 1, 0x43E, 0 );
1270 zSign = ( a < 0 );
1271 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1275 float64 uint64_to_float64( uint64 a STATUS_PARAM )
1277 if ( a == 0 ) return float64_zero;
1278 return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR );
1282 #ifdef FLOATX80
1284 /*----------------------------------------------------------------------------
1285 | Returns the result of converting the 64-bit two's complement integer `a'
1286 | to the extended double-precision floating-point format. The conversion
1287 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1288 | Arithmetic.
1289 *----------------------------------------------------------------------------*/
1291 floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1293 flag zSign;
1294 uint64 absA;
1295 int8 shiftCount;
1297 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1298 zSign = ( a < 0 );
1299 absA = zSign ? - a : a;
1300 shiftCount = countLeadingZeros64( absA );
1301 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1305 #endif
1307 #ifdef FLOAT128
1309 /*----------------------------------------------------------------------------
1310 | Returns the result of converting the 64-bit two's complement integer `a' to
1311 | the quadruple-precision floating-point format. The conversion is performed
1312 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1313 *----------------------------------------------------------------------------*/
1315 float128 int64_to_float128( int64 a STATUS_PARAM )
1317 flag zSign;
1318 uint64 absA;
1319 int8 shiftCount;
1320 int32 zExp;
1321 uint64_t zSig0, zSig1;
1323 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1324 zSign = ( a < 0 );
1325 absA = zSign ? - a : a;
1326 shiftCount = countLeadingZeros64( absA ) + 49;
1327 zExp = 0x406E - shiftCount;
1328 if ( 64 <= shiftCount ) {
1329 zSig1 = 0;
1330 zSig0 = absA;
1331 shiftCount -= 64;
1333 else {
1334 zSig1 = absA;
1335 zSig0 = 0;
1337 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1338 return packFloat128( zSign, zExp, zSig0, zSig1 );
1342 #endif
1344 /*----------------------------------------------------------------------------
1345 | Returns the result of converting the single-precision floating-point value
1346 | `a' to the 32-bit two's complement integer format. The conversion is
1347 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1348 | Arithmetic---which means in particular that the conversion is rounded
1349 | according to the current rounding mode. If `a' is a NaN, the largest
1350 | positive integer is returned. Otherwise, if the conversion overflows, the
1351 | largest integer with the same sign as `a' is returned.
1352 *----------------------------------------------------------------------------*/
1354 int32 float32_to_int32( float32 a STATUS_PARAM )
1356 flag aSign;
1357 int16 aExp, shiftCount;
1358 uint32_t aSig;
1359 uint64_t aSig64;
1361 a = float32_squash_input_denormal(a STATUS_VAR);
1362 aSig = extractFloat32Frac( a );
1363 aExp = extractFloat32Exp( a );
1364 aSign = extractFloat32Sign( a );
1365 if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1366 if ( aExp ) aSig |= 0x00800000;
1367 shiftCount = 0xAF - aExp;
1368 aSig64 = aSig;
1369 aSig64 <<= 32;
1370 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1371 return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1375 /*----------------------------------------------------------------------------
1376 | Returns the result of converting the single-precision floating-point value
1377 | `a' to the 32-bit two's complement integer format. The conversion is
1378 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1379 | Arithmetic, except that the conversion is always rounded toward zero.
1380 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1381 | the conversion overflows, the largest integer with the same sign as `a' is
1382 | returned.
1383 *----------------------------------------------------------------------------*/
1385 int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1387 flag aSign;
1388 int16 aExp, shiftCount;
1389 uint32_t aSig;
1390 int32 z;
1391 a = float32_squash_input_denormal(a STATUS_VAR);
1393 aSig = extractFloat32Frac( a );
1394 aExp = extractFloat32Exp( a );
1395 aSign = extractFloat32Sign( a );
1396 shiftCount = aExp - 0x9E;
1397 if ( 0 <= shiftCount ) {
1398 if ( float32_val(a) != 0xCF000000 ) {
1399 float_raise( float_flag_invalid STATUS_VAR);
1400 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1402 return (int32_t) 0x80000000;
1404 else if ( aExp <= 0x7E ) {
1405 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1406 return 0;
1408 aSig = ( aSig | 0x00800000 )<<8;
1409 z = aSig>>( - shiftCount );
1410 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1411 STATUS(float_exception_flags) |= float_flag_inexact;
1413 if ( aSign ) z = - z;
1414 return z;
1418 /*----------------------------------------------------------------------------
1419 | Returns the result of converting the single-precision floating-point value
1420 | `a' to the 16-bit two's complement integer format. The conversion is
1421 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1422 | Arithmetic, except that the conversion is always rounded toward zero.
1423 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1424 | the conversion overflows, the largest integer with the same sign as `a' is
1425 | returned.
1426 *----------------------------------------------------------------------------*/
1428 int16 float32_to_int16_round_to_zero( float32 a STATUS_PARAM )
1430 flag aSign;
1431 int16 aExp, shiftCount;
1432 uint32_t aSig;
1433 int32 z;
1435 aSig = extractFloat32Frac( a );
1436 aExp = extractFloat32Exp( a );
1437 aSign = extractFloat32Sign( a );
1438 shiftCount = aExp - 0x8E;
1439 if ( 0 <= shiftCount ) {
1440 if ( float32_val(a) != 0xC7000000 ) {
1441 float_raise( float_flag_invalid STATUS_VAR);
1442 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1443 return 0x7FFF;
1446 return (int32_t) 0xffff8000;
1448 else if ( aExp <= 0x7E ) {
1449 if ( aExp | aSig ) {
1450 STATUS(float_exception_flags) |= float_flag_inexact;
1452 return 0;
1454 shiftCount -= 0x10;
1455 aSig = ( aSig | 0x00800000 )<<8;
1456 z = aSig>>( - shiftCount );
1457 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1458 STATUS(float_exception_flags) |= float_flag_inexact;
1460 if ( aSign ) {
1461 z = - z;
1463 return z;
1467 /*----------------------------------------------------------------------------
1468 | Returns the result of converting the single-precision floating-point value
1469 | `a' to the 64-bit two's complement integer format. The conversion is
1470 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1471 | Arithmetic---which means in particular that the conversion is rounded
1472 | according to the current rounding mode. If `a' is a NaN, the largest
1473 | positive integer is returned. Otherwise, if the conversion overflows, the
1474 | largest integer with the same sign as `a' is returned.
1475 *----------------------------------------------------------------------------*/
1477 int64 float32_to_int64( float32 a STATUS_PARAM )
1479 flag aSign;
1480 int16 aExp, shiftCount;
1481 uint32_t aSig;
1482 uint64_t aSig64, aSigExtra;
1483 a = float32_squash_input_denormal(a STATUS_VAR);
1485 aSig = extractFloat32Frac( a );
1486 aExp = extractFloat32Exp( a );
1487 aSign = extractFloat32Sign( a );
1488 shiftCount = 0xBE - aExp;
1489 if ( shiftCount < 0 ) {
1490 float_raise( float_flag_invalid STATUS_VAR);
1491 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1492 return LIT64( 0x7FFFFFFFFFFFFFFF );
1494 return (int64_t) LIT64( 0x8000000000000000 );
1496 if ( aExp ) aSig |= 0x00800000;
1497 aSig64 = aSig;
1498 aSig64 <<= 40;
1499 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1500 return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1504 /*----------------------------------------------------------------------------
1505 | Returns the result of converting the single-precision floating-point value
1506 | `a' to the 64-bit two's complement integer format. The conversion is
1507 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1508 | Arithmetic, except that the conversion is always rounded toward zero. If
1509 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1510 | conversion overflows, the largest integer with the same sign as `a' is
1511 | returned.
1512 *----------------------------------------------------------------------------*/
1514 int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1516 flag aSign;
1517 int16 aExp, shiftCount;
1518 uint32_t aSig;
1519 uint64_t aSig64;
1520 int64 z;
1521 a = float32_squash_input_denormal(a STATUS_VAR);
1523 aSig = extractFloat32Frac( a );
1524 aExp = extractFloat32Exp( a );
1525 aSign = extractFloat32Sign( a );
1526 shiftCount = aExp - 0xBE;
1527 if ( 0 <= shiftCount ) {
1528 if ( float32_val(a) != 0xDF000000 ) {
1529 float_raise( float_flag_invalid STATUS_VAR);
1530 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1531 return LIT64( 0x7FFFFFFFFFFFFFFF );
1534 return (int64_t) LIT64( 0x8000000000000000 );
1536 else if ( aExp <= 0x7E ) {
1537 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1538 return 0;
1540 aSig64 = aSig | 0x00800000;
1541 aSig64 <<= 40;
1542 z = aSig64>>( - shiftCount );
1543 if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
1544 STATUS(float_exception_flags) |= float_flag_inexact;
1546 if ( aSign ) z = - z;
1547 return z;
1551 /*----------------------------------------------------------------------------
1552 | Returns the result of converting the single-precision floating-point value
1553 | `a' to the double-precision floating-point format. The conversion is
1554 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1555 | Arithmetic.
1556 *----------------------------------------------------------------------------*/
1558 float64 float32_to_float64( float32 a STATUS_PARAM )
1560 flag aSign;
1561 int16 aExp;
1562 uint32_t aSig;
1563 a = float32_squash_input_denormal(a STATUS_VAR);
1565 aSig = extractFloat32Frac( a );
1566 aExp = extractFloat32Exp( a );
1567 aSign = extractFloat32Sign( a );
1568 if ( aExp == 0xFF ) {
1569 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1570 return packFloat64( aSign, 0x7FF, 0 );
1572 if ( aExp == 0 ) {
1573 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1574 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1575 --aExp;
1577 return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
1581 #ifdef FLOATX80
1583 /*----------------------------------------------------------------------------
1584 | Returns the result of converting the single-precision floating-point value
1585 | `a' to the extended double-precision floating-point format. The conversion
1586 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1587 | Arithmetic.
1588 *----------------------------------------------------------------------------*/
1590 floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1592 flag aSign;
1593 int16 aExp;
1594 uint32_t aSig;
1596 a = float32_squash_input_denormal(a STATUS_VAR);
1597 aSig = extractFloat32Frac( a );
1598 aExp = extractFloat32Exp( a );
1599 aSign = extractFloat32Sign( a );
1600 if ( aExp == 0xFF ) {
1601 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1602 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1604 if ( aExp == 0 ) {
1605 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1606 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1608 aSig |= 0x00800000;
1609 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
1613 #endif
1615 #ifdef FLOAT128
1617 /*----------------------------------------------------------------------------
1618 | Returns the result of converting the single-precision floating-point value
1619 | `a' to the double-precision floating-point format. The conversion is
1620 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1621 | Arithmetic.
1622 *----------------------------------------------------------------------------*/
1624 float128 float32_to_float128( float32 a STATUS_PARAM )
1626 flag aSign;
1627 int16 aExp;
1628 uint32_t aSig;
1630 a = float32_squash_input_denormal(a STATUS_VAR);
1631 aSig = extractFloat32Frac( a );
1632 aExp = extractFloat32Exp( a );
1633 aSign = extractFloat32Sign( a );
1634 if ( aExp == 0xFF ) {
1635 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1636 return packFloat128( aSign, 0x7FFF, 0, 0 );
1638 if ( aExp == 0 ) {
1639 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1640 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1641 --aExp;
1643 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
1647 #endif
1649 /*----------------------------------------------------------------------------
1650 | Rounds the single-precision floating-point value `a' to an integer, and
1651 | returns the result as a single-precision floating-point value. The
1652 | operation is performed according to the IEC/IEEE Standard for Binary
1653 | Floating-Point Arithmetic.
1654 *----------------------------------------------------------------------------*/
1656 float32 float32_round_to_int( float32 a STATUS_PARAM)
1658 flag aSign;
1659 int16 aExp;
1660 uint32_t lastBitMask, roundBitsMask;
1661 int8 roundingMode;
1662 uint32_t z;
1663 a = float32_squash_input_denormal(a STATUS_VAR);
1665 aExp = extractFloat32Exp( a );
1666 if ( 0x96 <= aExp ) {
1667 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1668 return propagateFloat32NaN( a, a STATUS_VAR );
1670 return a;
1672 if ( aExp <= 0x7E ) {
1673 if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
1674 STATUS(float_exception_flags) |= float_flag_inexact;
1675 aSign = extractFloat32Sign( a );
1676 switch ( STATUS(float_rounding_mode) ) {
1677 case float_round_nearest_even:
1678 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1679 return packFloat32( aSign, 0x7F, 0 );
1681 break;
1682 case float_round_down:
1683 return make_float32(aSign ? 0xBF800000 : 0);
1684 case float_round_up:
1685 return make_float32(aSign ? 0x80000000 : 0x3F800000);
1687 return packFloat32( aSign, 0, 0 );
1689 lastBitMask = 1;
1690 lastBitMask <<= 0x96 - aExp;
1691 roundBitsMask = lastBitMask - 1;
1692 z = float32_val(a);
1693 roundingMode = STATUS(float_rounding_mode);
1694 if ( roundingMode == float_round_nearest_even ) {
1695 z += lastBitMask>>1;
1696 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1698 else if ( roundingMode != float_round_to_zero ) {
1699 if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
1700 z += roundBitsMask;
1703 z &= ~ roundBitsMask;
1704 if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1705 return make_float32(z);
1709 /*----------------------------------------------------------------------------
1710 | Returns the result of adding the absolute values of the single-precision
1711 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1712 | before being returned. `zSign' is ignored if the result is a NaN.
1713 | The addition is performed according to the IEC/IEEE Standard for Binary
1714 | Floating-Point Arithmetic.
1715 *----------------------------------------------------------------------------*/
1717 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1719 int16 aExp, bExp, zExp;
1720 uint32_t aSig, bSig, zSig;
1721 int16 expDiff;
1723 aSig = extractFloat32Frac( a );
1724 aExp = extractFloat32Exp( a );
1725 bSig = extractFloat32Frac( b );
1726 bExp = extractFloat32Exp( b );
1727 expDiff = aExp - bExp;
1728 aSig <<= 6;
1729 bSig <<= 6;
1730 if ( 0 < expDiff ) {
1731 if ( aExp == 0xFF ) {
1732 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1733 return a;
1735 if ( bExp == 0 ) {
1736 --expDiff;
1738 else {
1739 bSig |= 0x20000000;
1741 shift32RightJamming( bSig, expDiff, &bSig );
1742 zExp = aExp;
1744 else if ( expDiff < 0 ) {
1745 if ( bExp == 0xFF ) {
1746 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1747 return packFloat32( zSign, 0xFF, 0 );
1749 if ( aExp == 0 ) {
1750 ++expDiff;
1752 else {
1753 aSig |= 0x20000000;
1755 shift32RightJamming( aSig, - expDiff, &aSig );
1756 zExp = bExp;
1758 else {
1759 if ( aExp == 0xFF ) {
1760 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1761 return a;
1763 if ( aExp == 0 ) {
1764 if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
1765 return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1767 zSig = 0x40000000 + aSig + bSig;
1768 zExp = aExp;
1769 goto roundAndPack;
1771 aSig |= 0x20000000;
1772 zSig = ( aSig + bSig )<<1;
1773 --zExp;
1774 if ( (int32_t) zSig < 0 ) {
1775 zSig = aSig + bSig;
1776 ++zExp;
1778 roundAndPack:
1779 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1783 /*----------------------------------------------------------------------------
1784 | Returns the result of subtracting the absolute values of the single-
1785 | precision floating-point values `a' and `b'. If `zSign' is 1, the
1786 | difference is negated before being returned. `zSign' is ignored if the
1787 | result is a NaN. The subtraction is performed according to the IEC/IEEE
1788 | Standard for Binary Floating-Point Arithmetic.
1789 *----------------------------------------------------------------------------*/
1791 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1793 int16 aExp, bExp, zExp;
1794 uint32_t aSig, bSig, zSig;
1795 int16 expDiff;
1797 aSig = extractFloat32Frac( a );
1798 aExp = extractFloat32Exp( a );
1799 bSig = extractFloat32Frac( b );
1800 bExp = extractFloat32Exp( b );
1801 expDiff = aExp - bExp;
1802 aSig <<= 7;
1803 bSig <<= 7;
1804 if ( 0 < expDiff ) goto aExpBigger;
1805 if ( expDiff < 0 ) goto bExpBigger;
1806 if ( aExp == 0xFF ) {
1807 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1808 float_raise( float_flag_invalid STATUS_VAR);
1809 return float32_default_nan;
1811 if ( aExp == 0 ) {
1812 aExp = 1;
1813 bExp = 1;
1815 if ( bSig < aSig ) goto aBigger;
1816 if ( aSig < bSig ) goto bBigger;
1817 return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1818 bExpBigger:
1819 if ( bExp == 0xFF ) {
1820 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1821 return packFloat32( zSign ^ 1, 0xFF, 0 );
1823 if ( aExp == 0 ) {
1824 ++expDiff;
1826 else {
1827 aSig |= 0x40000000;
1829 shift32RightJamming( aSig, - expDiff, &aSig );
1830 bSig |= 0x40000000;
1831 bBigger:
1832 zSig = bSig - aSig;
1833 zExp = bExp;
1834 zSign ^= 1;
1835 goto normalizeRoundAndPack;
1836 aExpBigger:
1837 if ( aExp == 0xFF ) {
1838 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1839 return a;
1841 if ( bExp == 0 ) {
1842 --expDiff;
1844 else {
1845 bSig |= 0x40000000;
1847 shift32RightJamming( bSig, expDiff, &bSig );
1848 aSig |= 0x40000000;
1849 aBigger:
1850 zSig = aSig - bSig;
1851 zExp = aExp;
1852 normalizeRoundAndPack:
1853 --zExp;
1854 return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1858 /*----------------------------------------------------------------------------
1859 | Returns the result of adding the single-precision floating-point values `a'
1860 | and `b'. The operation is performed according to the IEC/IEEE Standard for
1861 | Binary Floating-Point Arithmetic.
1862 *----------------------------------------------------------------------------*/
1864 float32 float32_add( float32 a, float32 b STATUS_PARAM )
1866 flag aSign, bSign;
1867 a = float32_squash_input_denormal(a STATUS_VAR);
1868 b = float32_squash_input_denormal(b STATUS_VAR);
1870 aSign = extractFloat32Sign( a );
1871 bSign = extractFloat32Sign( b );
1872 if ( aSign == bSign ) {
1873 return addFloat32Sigs( a, b, aSign STATUS_VAR);
1875 else {
1876 return subFloat32Sigs( a, b, aSign STATUS_VAR );
1881 /*----------------------------------------------------------------------------
1882 | Returns the result of subtracting the single-precision floating-point values
1883 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1884 | for Binary Floating-Point Arithmetic.
1885 *----------------------------------------------------------------------------*/
1887 float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1889 flag aSign, bSign;
1890 a = float32_squash_input_denormal(a STATUS_VAR);
1891 b = float32_squash_input_denormal(b STATUS_VAR);
1893 aSign = extractFloat32Sign( a );
1894 bSign = extractFloat32Sign( b );
1895 if ( aSign == bSign ) {
1896 return subFloat32Sigs( a, b, aSign STATUS_VAR );
1898 else {
1899 return addFloat32Sigs( a, b, aSign STATUS_VAR );
1904 /*----------------------------------------------------------------------------
1905 | Returns the result of multiplying the single-precision floating-point values
1906 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1907 | for Binary Floating-Point Arithmetic.
1908 *----------------------------------------------------------------------------*/
1910 float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1912 flag aSign, bSign, zSign;
1913 int16 aExp, bExp, zExp;
1914 uint32_t aSig, bSig;
1915 uint64_t zSig64;
1916 uint32_t zSig;
1918 a = float32_squash_input_denormal(a STATUS_VAR);
1919 b = float32_squash_input_denormal(b STATUS_VAR);
1921 aSig = extractFloat32Frac( a );
1922 aExp = extractFloat32Exp( a );
1923 aSign = extractFloat32Sign( a );
1924 bSig = extractFloat32Frac( b );
1925 bExp = extractFloat32Exp( b );
1926 bSign = extractFloat32Sign( b );
1927 zSign = aSign ^ bSign;
1928 if ( aExp == 0xFF ) {
1929 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1930 return propagateFloat32NaN( a, b STATUS_VAR );
1932 if ( ( bExp | bSig ) == 0 ) {
1933 float_raise( float_flag_invalid STATUS_VAR);
1934 return float32_default_nan;
1936 return packFloat32( zSign, 0xFF, 0 );
1938 if ( bExp == 0xFF ) {
1939 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1940 if ( ( aExp | aSig ) == 0 ) {
1941 float_raise( float_flag_invalid STATUS_VAR);
1942 return float32_default_nan;
1944 return packFloat32( zSign, 0xFF, 0 );
1946 if ( aExp == 0 ) {
1947 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1948 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1950 if ( bExp == 0 ) {
1951 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1952 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1954 zExp = aExp + bExp - 0x7F;
1955 aSig = ( aSig | 0x00800000 )<<7;
1956 bSig = ( bSig | 0x00800000 )<<8;
1957 shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
1958 zSig = zSig64;
1959 if ( 0 <= (int32_t) ( zSig<<1 ) ) {
1960 zSig <<= 1;
1961 --zExp;
1963 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1967 /*----------------------------------------------------------------------------
1968 | Returns the result of dividing the single-precision floating-point value `a'
1969 | by the corresponding value `b'. The operation is performed according to the
1970 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1971 *----------------------------------------------------------------------------*/
1973 float32 float32_div( float32 a, float32 b STATUS_PARAM )
1975 flag aSign, bSign, zSign;
1976 int16 aExp, bExp, zExp;
1977 uint32_t aSig, bSig, zSig;
1978 a = float32_squash_input_denormal(a STATUS_VAR);
1979 b = float32_squash_input_denormal(b STATUS_VAR);
1981 aSig = extractFloat32Frac( a );
1982 aExp = extractFloat32Exp( a );
1983 aSign = extractFloat32Sign( a );
1984 bSig = extractFloat32Frac( b );
1985 bExp = extractFloat32Exp( b );
1986 bSign = extractFloat32Sign( b );
1987 zSign = aSign ^ bSign;
1988 if ( aExp == 0xFF ) {
1989 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1990 if ( bExp == 0xFF ) {
1991 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1992 float_raise( float_flag_invalid STATUS_VAR);
1993 return float32_default_nan;
1995 return packFloat32( zSign, 0xFF, 0 );
1997 if ( bExp == 0xFF ) {
1998 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1999 return packFloat32( zSign, 0, 0 );
2001 if ( bExp == 0 ) {
2002 if ( bSig == 0 ) {
2003 if ( ( aExp | aSig ) == 0 ) {
2004 float_raise( float_flag_invalid STATUS_VAR);
2005 return float32_default_nan;
2007 float_raise( float_flag_divbyzero STATUS_VAR);
2008 return packFloat32( zSign, 0xFF, 0 );
2010 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2012 if ( aExp == 0 ) {
2013 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2014 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2016 zExp = aExp - bExp + 0x7D;
2017 aSig = ( aSig | 0x00800000 )<<7;
2018 bSig = ( bSig | 0x00800000 )<<8;
2019 if ( bSig <= ( aSig + aSig ) ) {
2020 aSig >>= 1;
2021 ++zExp;
2023 zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
2024 if ( ( zSig & 0x3F ) == 0 ) {
2025 zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
2027 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2031 /*----------------------------------------------------------------------------
2032 | Returns the remainder of the single-precision floating-point value `a'
2033 | with respect to the corresponding value `b'. The operation is performed
2034 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2035 *----------------------------------------------------------------------------*/
2037 float32 float32_rem( float32 a, float32 b STATUS_PARAM )
2039 flag aSign, zSign;
2040 int16 aExp, bExp, expDiff;
2041 uint32_t aSig, bSig;
2042 uint32_t q;
2043 uint64_t aSig64, bSig64, q64;
2044 uint32_t alternateASig;
2045 int32_t sigMean;
2046 a = float32_squash_input_denormal(a STATUS_VAR);
2047 b = float32_squash_input_denormal(b STATUS_VAR);
2049 aSig = extractFloat32Frac( a );
2050 aExp = extractFloat32Exp( a );
2051 aSign = extractFloat32Sign( a );
2052 bSig = extractFloat32Frac( b );
2053 bExp = extractFloat32Exp( b );
2054 if ( aExp == 0xFF ) {
2055 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2056 return propagateFloat32NaN( a, b STATUS_VAR );
2058 float_raise( float_flag_invalid STATUS_VAR);
2059 return float32_default_nan;
2061 if ( bExp == 0xFF ) {
2062 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2063 return a;
2065 if ( bExp == 0 ) {
2066 if ( bSig == 0 ) {
2067 float_raise( float_flag_invalid STATUS_VAR);
2068 return float32_default_nan;
2070 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2072 if ( aExp == 0 ) {
2073 if ( aSig == 0 ) return a;
2074 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2076 expDiff = aExp - bExp;
2077 aSig |= 0x00800000;
2078 bSig |= 0x00800000;
2079 if ( expDiff < 32 ) {
2080 aSig <<= 8;
2081 bSig <<= 8;
2082 if ( expDiff < 0 ) {
2083 if ( expDiff < -1 ) return a;
2084 aSig >>= 1;
2086 q = ( bSig <= aSig );
2087 if ( q ) aSig -= bSig;
2088 if ( 0 < expDiff ) {
2089 q = ( ( (uint64_t) aSig )<<32 ) / bSig;
2090 q >>= 32 - expDiff;
2091 bSig >>= 2;
2092 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2094 else {
2095 aSig >>= 2;
2096 bSig >>= 2;
2099 else {
2100 if ( bSig <= aSig ) aSig -= bSig;
2101 aSig64 = ( (uint64_t) aSig )<<40;
2102 bSig64 = ( (uint64_t) bSig )<<40;
2103 expDiff -= 64;
2104 while ( 0 < expDiff ) {
2105 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2106 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2107 aSig64 = - ( ( bSig * q64 )<<38 );
2108 expDiff -= 62;
2110 expDiff += 64;
2111 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2112 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2113 q = q64>>( 64 - expDiff );
2114 bSig <<= 6;
2115 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2117 do {
2118 alternateASig = aSig;
2119 ++q;
2120 aSig -= bSig;
2121 } while ( 0 <= (int32_t) aSig );
2122 sigMean = aSig + alternateASig;
2123 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2124 aSig = alternateASig;
2126 zSign = ( (int32_t) aSig < 0 );
2127 if ( zSign ) aSig = - aSig;
2128 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2132 /*----------------------------------------------------------------------------
2133 | Returns the square root of the single-precision floating-point value `a'.
2134 | The operation is performed according to the IEC/IEEE Standard for Binary
2135 | Floating-Point Arithmetic.
2136 *----------------------------------------------------------------------------*/
2138 float32 float32_sqrt( float32 a STATUS_PARAM )
2140 flag aSign;
2141 int16 aExp, zExp;
2142 uint32_t aSig, zSig;
2143 uint64_t rem, term;
2144 a = float32_squash_input_denormal(a STATUS_VAR);
2146 aSig = extractFloat32Frac( a );
2147 aExp = extractFloat32Exp( a );
2148 aSign = extractFloat32Sign( a );
2149 if ( aExp == 0xFF ) {
2150 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2151 if ( ! aSign ) return a;
2152 float_raise( float_flag_invalid STATUS_VAR);
2153 return float32_default_nan;
2155 if ( aSign ) {
2156 if ( ( aExp | aSig ) == 0 ) return a;
2157 float_raise( float_flag_invalid STATUS_VAR);
2158 return float32_default_nan;
2160 if ( aExp == 0 ) {
2161 if ( aSig == 0 ) return float32_zero;
2162 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2164 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2165 aSig = ( aSig | 0x00800000 )<<8;
2166 zSig = estimateSqrt32( aExp, aSig ) + 2;
2167 if ( ( zSig & 0x7F ) <= 5 ) {
2168 if ( zSig < 2 ) {
2169 zSig = 0x7FFFFFFF;
2170 goto roundAndPack;
2172 aSig >>= aExp & 1;
2173 term = ( (uint64_t) zSig ) * zSig;
2174 rem = ( ( (uint64_t) aSig )<<32 ) - term;
2175 while ( (int64_t) rem < 0 ) {
2176 --zSig;
2177 rem += ( ( (uint64_t) zSig )<<1 ) | 1;
2179 zSig |= ( rem != 0 );
2181 shift32RightJamming( zSig, 1, &zSig );
2182 roundAndPack:
2183 return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2187 /*----------------------------------------------------------------------------
2188 | Returns the binary exponential of the single-precision floating-point value
2189 | `a'. The operation is performed according to the IEC/IEEE Standard for
2190 | Binary Floating-Point Arithmetic.
2192 | Uses the following identities:
2194 | 1. -------------------------------------------------------------------------
2195 | x x*ln(2)
2196 | 2 = e
2198 | 2. -------------------------------------------------------------------------
2199 | 2 3 4 5 n
2200 | x x x x x x x
2201 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2202 | 1! 2! 3! 4! 5! n!
2203 *----------------------------------------------------------------------------*/
2205 static const float64 float32_exp2_coefficients[15] =
2207 const_float64( 0x3ff0000000000000ll ), /* 1 */
2208 const_float64( 0x3fe0000000000000ll ), /* 2 */
2209 const_float64( 0x3fc5555555555555ll ), /* 3 */
2210 const_float64( 0x3fa5555555555555ll ), /* 4 */
2211 const_float64( 0x3f81111111111111ll ), /* 5 */
2212 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */
2213 const_float64( 0x3f2a01a01a01a01all ), /* 7 */
2214 const_float64( 0x3efa01a01a01a01all ), /* 8 */
2215 const_float64( 0x3ec71de3a556c734ll ), /* 9 */
2216 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2217 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2218 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2219 const_float64( 0x3de6124613a86d09ll ), /* 13 */
2220 const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2221 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2224 float32 float32_exp2( float32 a STATUS_PARAM )
2226 flag aSign;
2227 int16 aExp;
2228 uint32_t aSig;
2229 float64 r, x, xn;
2230 int i;
2231 a = float32_squash_input_denormal(a STATUS_VAR);
2233 aSig = extractFloat32Frac( a );
2234 aExp = extractFloat32Exp( a );
2235 aSign = extractFloat32Sign( a );
2237 if ( aExp == 0xFF) {
2238 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2239 return (aSign) ? float32_zero : a;
2241 if (aExp == 0) {
2242 if (aSig == 0) return float32_one;
2245 float_raise( float_flag_inexact STATUS_VAR);
2247 /* ******************************* */
2248 /* using float64 for approximation */
2249 /* ******************************* */
2250 x = float32_to_float64(a STATUS_VAR);
2251 x = float64_mul(x, float64_ln2 STATUS_VAR);
2253 xn = x;
2254 r = float64_one;
2255 for (i = 0 ; i < 15 ; i++) {
2256 float64 f;
2258 f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);
2259 r = float64_add(r, f STATUS_VAR);
2261 xn = float64_mul(xn, x STATUS_VAR);
2264 return float64_to_float32(r, status);
2267 /*----------------------------------------------------------------------------
2268 | Returns the binary log of the single-precision floating-point value `a'.
2269 | The operation is performed according to the IEC/IEEE Standard for Binary
2270 | Floating-Point Arithmetic.
2271 *----------------------------------------------------------------------------*/
2272 float32 float32_log2( float32 a STATUS_PARAM )
2274 flag aSign, zSign;
2275 int16 aExp;
2276 uint32_t aSig, zSig, i;
2278 a = float32_squash_input_denormal(a STATUS_VAR);
2279 aSig = extractFloat32Frac( a );
2280 aExp = extractFloat32Exp( a );
2281 aSign = extractFloat32Sign( a );
2283 if ( aExp == 0 ) {
2284 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2285 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2287 if ( aSign ) {
2288 float_raise( float_flag_invalid STATUS_VAR);
2289 return float32_default_nan;
2291 if ( aExp == 0xFF ) {
2292 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2293 return a;
2296 aExp -= 0x7F;
2297 aSig |= 0x00800000;
2298 zSign = aExp < 0;
2299 zSig = aExp << 23;
2301 for (i = 1 << 22; i > 0; i >>= 1) {
2302 aSig = ( (uint64_t)aSig * aSig ) >> 23;
2303 if ( aSig & 0x01000000 ) {
2304 aSig >>= 1;
2305 zSig |= i;
2309 if ( zSign )
2310 zSig = -zSig;
2312 return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2315 /*----------------------------------------------------------------------------
2316 | Returns 1 if the single-precision floating-point value `a' is equal to
2317 | the corresponding value `b', and 0 otherwise. The invalid exception is
2318 | raised if either operand is a NaN. Otherwise, the comparison is performed
2319 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2320 *----------------------------------------------------------------------------*/
2322 int float32_eq( float32 a, float32 b STATUS_PARAM )
2324 uint32_t av, bv;
2325 a = float32_squash_input_denormal(a STATUS_VAR);
2326 b = float32_squash_input_denormal(b STATUS_VAR);
2328 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2329 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2331 float_raise( float_flag_invalid STATUS_VAR);
2332 return 0;
2334 av = float32_val(a);
2335 bv = float32_val(b);
2336 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2339 /*----------------------------------------------------------------------------
2340 | Returns 1 if the single-precision floating-point value `a' is less than
2341 | or equal to the corresponding value `b', and 0 otherwise. The invalid
2342 | exception is raised if either operand is a NaN. The comparison is performed
2343 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2344 *----------------------------------------------------------------------------*/
2346 int float32_le( float32 a, float32 b STATUS_PARAM )
2348 flag aSign, bSign;
2349 uint32_t av, bv;
2350 a = float32_squash_input_denormal(a STATUS_VAR);
2351 b = float32_squash_input_denormal(b STATUS_VAR);
2353 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2354 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2356 float_raise( float_flag_invalid STATUS_VAR);
2357 return 0;
2359 aSign = extractFloat32Sign( a );
2360 bSign = extractFloat32Sign( b );
2361 av = float32_val(a);
2362 bv = float32_val(b);
2363 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2364 return ( av == bv ) || ( aSign ^ ( av < bv ) );
2368 /*----------------------------------------------------------------------------
2369 | Returns 1 if the single-precision floating-point value `a' is less than
2370 | the corresponding value `b', and 0 otherwise. The invalid exception is
2371 | raised if either operand is a NaN. The comparison is performed according
2372 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2373 *----------------------------------------------------------------------------*/
2375 int float32_lt( float32 a, float32 b STATUS_PARAM )
2377 flag aSign, bSign;
2378 uint32_t av, bv;
2379 a = float32_squash_input_denormal(a STATUS_VAR);
2380 b = float32_squash_input_denormal(b STATUS_VAR);
2382 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2383 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2385 float_raise( float_flag_invalid STATUS_VAR);
2386 return 0;
2388 aSign = extractFloat32Sign( a );
2389 bSign = extractFloat32Sign( b );
2390 av = float32_val(a);
2391 bv = float32_val(b);
2392 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2393 return ( av != bv ) && ( aSign ^ ( av < bv ) );
2397 /*----------------------------------------------------------------------------
2398 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2399 | be compared, and 0 otherwise. The invalid exception is raised if either
2400 | operand is a NaN. The comparison is performed according to the IEC/IEEE
2401 | Standard for Binary Floating-Point Arithmetic.
2402 *----------------------------------------------------------------------------*/
2404 int float32_unordered( float32 a, float32 b STATUS_PARAM )
2406 a = float32_squash_input_denormal(a STATUS_VAR);
2407 b = float32_squash_input_denormal(b STATUS_VAR);
2409 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2410 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2412 float_raise( float_flag_invalid STATUS_VAR);
2413 return 1;
2415 return 0;
2418 /*----------------------------------------------------------------------------
2419 | Returns 1 if the single-precision floating-point value `a' is equal to
2420 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2421 | exception. The comparison is performed according to the IEC/IEEE Standard
2422 | for Binary Floating-Point Arithmetic.
2423 *----------------------------------------------------------------------------*/
2425 int float32_eq_quiet( float32 a, float32 b STATUS_PARAM )
2427 a = float32_squash_input_denormal(a STATUS_VAR);
2428 b = float32_squash_input_denormal(b STATUS_VAR);
2430 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2431 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2433 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2434 float_raise( float_flag_invalid STATUS_VAR);
2436 return 0;
2438 return ( float32_val(a) == float32_val(b) ) ||
2439 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2442 /*----------------------------------------------------------------------------
2443 | Returns 1 if the single-precision floating-point value `a' is less than or
2444 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2445 | cause an exception. Otherwise, the comparison is performed according to the
2446 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2447 *----------------------------------------------------------------------------*/
2449 int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2451 flag aSign, bSign;
2452 uint32_t av, bv;
2453 a = float32_squash_input_denormal(a STATUS_VAR);
2454 b = float32_squash_input_denormal(b STATUS_VAR);
2456 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2457 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2459 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2460 float_raise( float_flag_invalid STATUS_VAR);
2462 return 0;
2464 aSign = extractFloat32Sign( a );
2465 bSign = extractFloat32Sign( b );
2466 av = float32_val(a);
2467 bv = float32_val(b);
2468 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2469 return ( av == bv ) || ( aSign ^ ( av < bv ) );
2473 /*----------------------------------------------------------------------------
2474 | Returns 1 if the single-precision floating-point value `a' is less than
2475 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2476 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
2477 | Standard for Binary Floating-Point Arithmetic.
2478 *----------------------------------------------------------------------------*/
2480 int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2482 flag aSign, bSign;
2483 uint32_t av, bv;
2484 a = float32_squash_input_denormal(a STATUS_VAR);
2485 b = float32_squash_input_denormal(b STATUS_VAR);
2487 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2488 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2490 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2491 float_raise( float_flag_invalid STATUS_VAR);
2493 return 0;
2495 aSign = extractFloat32Sign( a );
2496 bSign = extractFloat32Sign( b );
2497 av = float32_val(a);
2498 bv = float32_val(b);
2499 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2500 return ( av != bv ) && ( aSign ^ ( av < bv ) );
2504 /*----------------------------------------------------------------------------
2505 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2506 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
2507 | comparison is performed according to the IEC/IEEE Standard for Binary
2508 | Floating-Point Arithmetic.
2509 *----------------------------------------------------------------------------*/
2511 int float32_unordered_quiet( float32 a, float32 b STATUS_PARAM )
2513 a = float32_squash_input_denormal(a STATUS_VAR);
2514 b = float32_squash_input_denormal(b STATUS_VAR);
2516 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2517 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2519 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2520 float_raise( float_flag_invalid STATUS_VAR);
2522 return 1;
2524 return 0;
2527 /*----------------------------------------------------------------------------
2528 | Returns the result of converting the double-precision floating-point value
2529 | `a' to the 32-bit two's complement integer format. The conversion is
2530 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2531 | Arithmetic---which means in particular that the conversion is rounded
2532 | according to the current rounding mode. If `a' is a NaN, the largest
2533 | positive integer is returned. Otherwise, if the conversion overflows, the
2534 | largest integer with the same sign as `a' is returned.
2535 *----------------------------------------------------------------------------*/
2537 int32 float64_to_int32( float64 a STATUS_PARAM )
2539 flag aSign;
2540 int16 aExp, shiftCount;
2541 uint64_t aSig;
2542 a = float64_squash_input_denormal(a STATUS_VAR);
2544 aSig = extractFloat64Frac( a );
2545 aExp = extractFloat64Exp( a );
2546 aSign = extractFloat64Sign( a );
2547 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2548 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2549 shiftCount = 0x42C - aExp;
2550 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2551 return roundAndPackInt32( aSign, aSig STATUS_VAR );
2555 /*----------------------------------------------------------------------------
2556 | Returns the result of converting the double-precision floating-point value
2557 | `a' to the 32-bit two's complement integer format. The conversion is
2558 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2559 | Arithmetic, except that the conversion is always rounded toward zero.
2560 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2561 | the conversion overflows, the largest integer with the same sign as `a' is
2562 | returned.
2563 *----------------------------------------------------------------------------*/
2565 int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2567 flag aSign;
2568 int16 aExp, shiftCount;
2569 uint64_t aSig, savedASig;
2570 int32 z;
2571 a = float64_squash_input_denormal(a STATUS_VAR);
2573 aSig = extractFloat64Frac( a );
2574 aExp = extractFloat64Exp( a );
2575 aSign = extractFloat64Sign( a );
2576 if ( 0x41E < aExp ) {
2577 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2578 goto invalid;
2580 else if ( aExp < 0x3FF ) {
2581 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2582 return 0;
2584 aSig |= LIT64( 0x0010000000000000 );
2585 shiftCount = 0x433 - aExp;
2586 savedASig = aSig;
2587 aSig >>= shiftCount;
2588 z = aSig;
2589 if ( aSign ) z = - z;
2590 if ( ( z < 0 ) ^ aSign ) {
2591 invalid:
2592 float_raise( float_flag_invalid STATUS_VAR);
2593 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2595 if ( ( aSig<<shiftCount ) != savedASig ) {
2596 STATUS(float_exception_flags) |= float_flag_inexact;
2598 return z;
2602 /*----------------------------------------------------------------------------
2603 | Returns the result of converting the double-precision floating-point value
2604 | `a' to the 16-bit two's complement integer format. The conversion is
2605 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2606 | Arithmetic, except that the conversion is always rounded toward zero.
2607 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2608 | the conversion overflows, the largest integer with the same sign as `a' is
2609 | returned.
2610 *----------------------------------------------------------------------------*/
2612 int16 float64_to_int16_round_to_zero( float64 a STATUS_PARAM )
2614 flag aSign;
2615 int16 aExp, shiftCount;
2616 uint64_t aSig, savedASig;
2617 int32 z;
2619 aSig = extractFloat64Frac( a );
2620 aExp = extractFloat64Exp( a );
2621 aSign = extractFloat64Sign( a );
2622 if ( 0x40E < aExp ) {
2623 if ( ( aExp == 0x7FF ) && aSig ) {
2624 aSign = 0;
2626 goto invalid;
2628 else if ( aExp < 0x3FF ) {
2629 if ( aExp || aSig ) {
2630 STATUS(float_exception_flags) |= float_flag_inexact;
2632 return 0;
2634 aSig |= LIT64( 0x0010000000000000 );
2635 shiftCount = 0x433 - aExp;
2636 savedASig = aSig;
2637 aSig >>= shiftCount;
2638 z = aSig;
2639 if ( aSign ) {
2640 z = - z;
2642 if ( ( (int16_t)z < 0 ) ^ aSign ) {
2643 invalid:
2644 float_raise( float_flag_invalid STATUS_VAR);
2645 return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
2647 if ( ( aSig<<shiftCount ) != savedASig ) {
2648 STATUS(float_exception_flags) |= float_flag_inexact;
2650 return z;
2653 /*----------------------------------------------------------------------------
2654 | Returns the result of converting the double-precision floating-point value
2655 | `a' to the 64-bit two's complement integer format. The conversion is
2656 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2657 | Arithmetic---which means in particular that the conversion is rounded
2658 | according to the current rounding mode. If `a' is a NaN, the largest
2659 | positive integer is returned. Otherwise, if the conversion overflows, the
2660 | largest integer with the same sign as `a' is returned.
2661 *----------------------------------------------------------------------------*/
2663 int64 float64_to_int64( float64 a STATUS_PARAM )
2665 flag aSign;
2666 int16 aExp, shiftCount;
2667 uint64_t aSig, aSigExtra;
2668 a = float64_squash_input_denormal(a STATUS_VAR);
2670 aSig = extractFloat64Frac( a );
2671 aExp = extractFloat64Exp( a );
2672 aSign = extractFloat64Sign( a );
2673 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2674 shiftCount = 0x433 - aExp;
2675 if ( shiftCount <= 0 ) {
2676 if ( 0x43E < aExp ) {
2677 float_raise( float_flag_invalid STATUS_VAR);
2678 if ( ! aSign
2679 || ( ( aExp == 0x7FF )
2680 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2682 return LIT64( 0x7FFFFFFFFFFFFFFF );
2684 return (int64_t) LIT64( 0x8000000000000000 );
2686 aSigExtra = 0;
2687 aSig <<= - shiftCount;
2689 else {
2690 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2692 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2696 /*----------------------------------------------------------------------------
2697 | Returns the result of converting the double-precision floating-point value
2698 | `a' to the 64-bit two's complement integer format. The conversion is
2699 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2700 | Arithmetic, except that the conversion is always rounded toward zero.
2701 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2702 | the conversion overflows, the largest integer with the same sign as `a' is
2703 | returned.
2704 *----------------------------------------------------------------------------*/
2706 int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2708 flag aSign;
2709 int16 aExp, shiftCount;
2710 uint64_t aSig;
2711 int64 z;
2712 a = float64_squash_input_denormal(a STATUS_VAR);
2714 aSig = extractFloat64Frac( a );
2715 aExp = extractFloat64Exp( a );
2716 aSign = extractFloat64Sign( a );
2717 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2718 shiftCount = aExp - 0x433;
2719 if ( 0 <= shiftCount ) {
2720 if ( 0x43E <= aExp ) {
2721 if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
2722 float_raise( float_flag_invalid STATUS_VAR);
2723 if ( ! aSign
2724 || ( ( aExp == 0x7FF )
2725 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2727 return LIT64( 0x7FFFFFFFFFFFFFFF );
2730 return (int64_t) LIT64( 0x8000000000000000 );
2732 z = aSig<<shiftCount;
2734 else {
2735 if ( aExp < 0x3FE ) {
2736 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2737 return 0;
2739 z = aSig>>( - shiftCount );
2740 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
2741 STATUS(float_exception_flags) |= float_flag_inexact;
2744 if ( aSign ) z = - z;
2745 return z;
2749 /*----------------------------------------------------------------------------
2750 | Returns the result of converting the double-precision floating-point value
2751 | `a' to the single-precision floating-point format. The conversion is
2752 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2753 | Arithmetic.
2754 *----------------------------------------------------------------------------*/
2756 float32 float64_to_float32( float64 a STATUS_PARAM )
2758 flag aSign;
2759 int16 aExp;
2760 uint64_t aSig;
2761 uint32_t zSig;
2762 a = float64_squash_input_denormal(a STATUS_VAR);
2764 aSig = extractFloat64Frac( a );
2765 aExp = extractFloat64Exp( a );
2766 aSign = extractFloat64Sign( a );
2767 if ( aExp == 0x7FF ) {
2768 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2769 return packFloat32( aSign, 0xFF, 0 );
2771 shift64RightJamming( aSig, 22, &aSig );
2772 zSig = aSig;
2773 if ( aExp || zSig ) {
2774 zSig |= 0x40000000;
2775 aExp -= 0x381;
2777 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2782 /*----------------------------------------------------------------------------
2783 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2784 | half-precision floating-point value, returning the result. After being
2785 | shifted into the proper positions, the three fields are simply added
2786 | together to form the result. This means that any integer portion of `zSig'
2787 | will be added into the exponent. Since a properly normalized significand
2788 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2789 | than the desired result exponent whenever `zSig' is a complete, normalized
2790 | significand.
2791 *----------------------------------------------------------------------------*/
2792 static float16 packFloat16(flag zSign, int16 zExp, uint16_t zSig)
2794 return make_float16(
2795 (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
2798 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
2799 The latter gains extra exponent range by omitting the NaN/Inf encodings. */
2801 float32 float16_to_float32(float16 a, flag ieee STATUS_PARAM)
2803 flag aSign;
2804 int16 aExp;
2805 uint32_t aSig;
2807 aSign = extractFloat16Sign(a);
2808 aExp = extractFloat16Exp(a);
2809 aSig = extractFloat16Frac(a);
2811 if (aExp == 0x1f && ieee) {
2812 if (aSig) {
2813 return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
2815 return packFloat32(aSign, 0xff, aSig << 13);
2817 if (aExp == 0) {
2818 int8 shiftCount;
2820 if (aSig == 0) {
2821 return packFloat32(aSign, 0, 0);
2824 shiftCount = countLeadingZeros32( aSig ) - 21;
2825 aSig = aSig << shiftCount;
2826 aExp = -shiftCount;
2828 return packFloat32( aSign, aExp + 0x70, aSig << 13);
2831 float16 float32_to_float16(float32 a, flag ieee STATUS_PARAM)
2833 flag aSign;
2834 int16 aExp;
2835 uint32_t aSig;
2836 uint32_t mask;
2837 uint32_t increment;
2838 int8 roundingMode;
2839 a = float32_squash_input_denormal(a STATUS_VAR);
2841 aSig = extractFloat32Frac( a );
2842 aExp = extractFloat32Exp( a );
2843 aSign = extractFloat32Sign( a );
2844 if ( aExp == 0xFF ) {
2845 if (aSig) {
2846 /* Input is a NaN */
2847 float16 r = commonNaNToFloat16( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2848 if (!ieee) {
2849 return packFloat16(aSign, 0, 0);
2851 return r;
2853 /* Infinity */
2854 if (!ieee) {
2855 float_raise(float_flag_invalid STATUS_VAR);
2856 return packFloat16(aSign, 0x1f, 0x3ff);
2858 return packFloat16(aSign, 0x1f, 0);
2860 if (aExp == 0 && aSig == 0) {
2861 return packFloat16(aSign, 0, 0);
2863 /* Decimal point between bits 22 and 23. */
2864 aSig |= 0x00800000;
2865 aExp -= 0x7f;
2866 if (aExp < -14) {
2867 mask = 0x00ffffff;
2868 if (aExp >= -24) {
2869 mask >>= 25 + aExp;
2871 } else {
2872 mask = 0x00001fff;
2874 if (aSig & mask) {
2875 float_raise( float_flag_underflow STATUS_VAR );
2876 roundingMode = STATUS(float_rounding_mode);
2877 switch (roundingMode) {
2878 case float_round_nearest_even:
2879 increment = (mask + 1) >> 1;
2880 if ((aSig & mask) == increment) {
2881 increment = aSig & (increment << 1);
2883 break;
2884 case float_round_up:
2885 increment = aSign ? 0 : mask;
2886 break;
2887 case float_round_down:
2888 increment = aSign ? mask : 0;
2889 break;
2890 default: /* round_to_zero */
2891 increment = 0;
2892 break;
2894 aSig += increment;
2895 if (aSig >= 0x01000000) {
2896 aSig >>= 1;
2897 aExp++;
2899 } else if (aExp < -14
2900 && STATUS(float_detect_tininess) == float_tininess_before_rounding) {
2901 float_raise( float_flag_underflow STATUS_VAR);
2904 if (ieee) {
2905 if (aExp > 15) {
2906 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
2907 return packFloat16(aSign, 0x1f, 0);
2909 } else {
2910 if (aExp > 16) {
2911 float_raise(float_flag_invalid | float_flag_inexact STATUS_VAR);
2912 return packFloat16(aSign, 0x1f, 0x3ff);
2915 if (aExp < -24) {
2916 return packFloat16(aSign, 0, 0);
2918 if (aExp < -14) {
2919 aSig >>= -14 - aExp;
2920 aExp = -14;
2922 return packFloat16(aSign, aExp + 14, aSig >> 13);
2925 #ifdef FLOATX80
2927 /*----------------------------------------------------------------------------
2928 | Returns the result of converting the double-precision floating-point value
2929 | `a' to the extended double-precision floating-point format. The conversion
2930 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2931 | Arithmetic.
2932 *----------------------------------------------------------------------------*/
2934 floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
2936 flag aSign;
2937 int16 aExp;
2938 uint64_t aSig;
2940 a = float64_squash_input_denormal(a STATUS_VAR);
2941 aSig = extractFloat64Frac( a );
2942 aExp = extractFloat64Exp( a );
2943 aSign = extractFloat64Sign( a );
2944 if ( aExp == 0x7FF ) {
2945 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2946 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2948 if ( aExp == 0 ) {
2949 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2950 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2952 return
2953 packFloatx80(
2954 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2958 #endif
2960 #ifdef FLOAT128
2962 /*----------------------------------------------------------------------------
2963 | Returns the result of converting the double-precision floating-point value
2964 | `a' to the quadruple-precision floating-point format. The conversion is
2965 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2966 | Arithmetic.
2967 *----------------------------------------------------------------------------*/
2969 float128 float64_to_float128( float64 a STATUS_PARAM )
2971 flag aSign;
2972 int16 aExp;
2973 uint64_t aSig, zSig0, zSig1;
2975 a = float64_squash_input_denormal(a STATUS_VAR);
2976 aSig = extractFloat64Frac( a );
2977 aExp = extractFloat64Exp( a );
2978 aSign = extractFloat64Sign( a );
2979 if ( aExp == 0x7FF ) {
2980 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2981 return packFloat128( aSign, 0x7FFF, 0, 0 );
2983 if ( aExp == 0 ) {
2984 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2985 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2986 --aExp;
2988 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2989 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2993 #endif
2995 /*----------------------------------------------------------------------------
2996 | Rounds the double-precision floating-point value `a' to an integer, and
2997 | returns the result as a double-precision floating-point value. The
2998 | operation is performed according to the IEC/IEEE Standard for Binary
2999 | Floating-Point Arithmetic.
3000 *----------------------------------------------------------------------------*/
3002 float64 float64_round_to_int( float64 a STATUS_PARAM )
3004 flag aSign;
3005 int16 aExp;
3006 uint64_t lastBitMask, roundBitsMask;
3007 int8 roundingMode;
3008 uint64_t z;
3009 a = float64_squash_input_denormal(a STATUS_VAR);
3011 aExp = extractFloat64Exp( a );
3012 if ( 0x433 <= aExp ) {
3013 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
3014 return propagateFloat64NaN( a, a STATUS_VAR );
3016 return a;
3018 if ( aExp < 0x3FF ) {
3019 if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
3020 STATUS(float_exception_flags) |= float_flag_inexact;
3021 aSign = extractFloat64Sign( a );
3022 switch ( STATUS(float_rounding_mode) ) {
3023 case float_round_nearest_even:
3024 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
3025 return packFloat64( aSign, 0x3FF, 0 );
3027 break;
3028 case float_round_down:
3029 return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
3030 case float_round_up:
3031 return make_float64(
3032 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
3034 return packFloat64( aSign, 0, 0 );
3036 lastBitMask = 1;
3037 lastBitMask <<= 0x433 - aExp;
3038 roundBitsMask = lastBitMask - 1;
3039 z = float64_val(a);
3040 roundingMode = STATUS(float_rounding_mode);
3041 if ( roundingMode == float_round_nearest_even ) {
3042 z += lastBitMask>>1;
3043 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
3045 else if ( roundingMode != float_round_to_zero ) {
3046 if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
3047 z += roundBitsMask;
3050 z &= ~ roundBitsMask;
3051 if ( z != float64_val(a) )
3052 STATUS(float_exception_flags) |= float_flag_inexact;
3053 return make_float64(z);
3057 float64 float64_trunc_to_int( float64 a STATUS_PARAM)
3059 int oldmode;
3060 float64 res;
3061 oldmode = STATUS(float_rounding_mode);
3062 STATUS(float_rounding_mode) = float_round_to_zero;
3063 res = float64_round_to_int(a STATUS_VAR);
3064 STATUS(float_rounding_mode) = oldmode;
3065 return res;
3068 /*----------------------------------------------------------------------------
3069 | Returns the result of adding the absolute values of the double-precision
3070 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
3071 | before being returned. `zSign' is ignored if the result is a NaN.
3072 | The addition is performed according to the IEC/IEEE Standard for Binary
3073 | Floating-Point Arithmetic.
3074 *----------------------------------------------------------------------------*/
3076 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3078 int16 aExp, bExp, zExp;
3079 uint64_t aSig, bSig, zSig;
3080 int16 expDiff;
3082 aSig = extractFloat64Frac( a );
3083 aExp = extractFloat64Exp( a );
3084 bSig = extractFloat64Frac( b );
3085 bExp = extractFloat64Exp( b );
3086 expDiff = aExp - bExp;
3087 aSig <<= 9;
3088 bSig <<= 9;
3089 if ( 0 < expDiff ) {
3090 if ( aExp == 0x7FF ) {
3091 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3092 return a;
3094 if ( bExp == 0 ) {
3095 --expDiff;
3097 else {
3098 bSig |= LIT64( 0x2000000000000000 );
3100 shift64RightJamming( bSig, expDiff, &bSig );
3101 zExp = aExp;
3103 else if ( expDiff < 0 ) {
3104 if ( bExp == 0x7FF ) {
3105 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3106 return packFloat64( zSign, 0x7FF, 0 );
3108 if ( aExp == 0 ) {
3109 ++expDiff;
3111 else {
3112 aSig |= LIT64( 0x2000000000000000 );
3114 shift64RightJamming( aSig, - expDiff, &aSig );
3115 zExp = bExp;
3117 else {
3118 if ( aExp == 0x7FF ) {
3119 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3120 return a;
3122 if ( aExp == 0 ) {
3123 if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
3124 return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3126 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3127 zExp = aExp;
3128 goto roundAndPack;
3130 aSig |= LIT64( 0x2000000000000000 );
3131 zSig = ( aSig + bSig )<<1;
3132 --zExp;
3133 if ( (int64_t) zSig < 0 ) {
3134 zSig = aSig + bSig;
3135 ++zExp;
3137 roundAndPack:
3138 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3142 /*----------------------------------------------------------------------------
3143 | Returns the result of subtracting the absolute values of the double-
3144 | precision floating-point values `a' and `b'. If `zSign' is 1, the
3145 | difference is negated before being returned. `zSign' is ignored if the
3146 | result is a NaN. The subtraction is performed according to the IEC/IEEE
3147 | Standard for Binary Floating-Point Arithmetic.
3148 *----------------------------------------------------------------------------*/
3150 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3152 int16 aExp, bExp, zExp;
3153 uint64_t aSig, bSig, zSig;
3154 int16 expDiff;
3156 aSig = extractFloat64Frac( a );
3157 aExp = extractFloat64Exp( a );
3158 bSig = extractFloat64Frac( b );
3159 bExp = extractFloat64Exp( b );
3160 expDiff = aExp - bExp;
3161 aSig <<= 10;
3162 bSig <<= 10;
3163 if ( 0 < expDiff ) goto aExpBigger;
3164 if ( expDiff < 0 ) goto bExpBigger;
3165 if ( aExp == 0x7FF ) {
3166 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3167 float_raise( float_flag_invalid STATUS_VAR);
3168 return float64_default_nan;
3170 if ( aExp == 0 ) {
3171 aExp = 1;
3172 bExp = 1;
3174 if ( bSig < aSig ) goto aBigger;
3175 if ( aSig < bSig ) goto bBigger;
3176 return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3177 bExpBigger:
3178 if ( bExp == 0x7FF ) {
3179 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3180 return packFloat64( zSign ^ 1, 0x7FF, 0 );
3182 if ( aExp == 0 ) {
3183 ++expDiff;
3185 else {
3186 aSig |= LIT64( 0x4000000000000000 );
3188 shift64RightJamming( aSig, - expDiff, &aSig );
3189 bSig |= LIT64( 0x4000000000000000 );
3190 bBigger:
3191 zSig = bSig - aSig;
3192 zExp = bExp;
3193 zSign ^= 1;
3194 goto normalizeRoundAndPack;
3195 aExpBigger:
3196 if ( aExp == 0x7FF ) {
3197 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3198 return a;
3200 if ( bExp == 0 ) {
3201 --expDiff;
3203 else {
3204 bSig |= LIT64( 0x4000000000000000 );
3206 shift64RightJamming( bSig, expDiff, &bSig );
3207 aSig |= LIT64( 0x4000000000000000 );
3208 aBigger:
3209 zSig = aSig - bSig;
3210 zExp = aExp;
3211 normalizeRoundAndPack:
3212 --zExp;
3213 return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3217 /*----------------------------------------------------------------------------
3218 | Returns the result of adding the double-precision floating-point values `a'
3219 | and `b'. The operation is performed according to the IEC/IEEE Standard for
3220 | Binary Floating-Point Arithmetic.
3221 *----------------------------------------------------------------------------*/
3223 float64 float64_add( float64 a, float64 b STATUS_PARAM )
3225 flag aSign, bSign;
3226 a = float64_squash_input_denormal(a STATUS_VAR);
3227 b = float64_squash_input_denormal(b STATUS_VAR);
3229 aSign = extractFloat64Sign( a );
3230 bSign = extractFloat64Sign( b );
3231 if ( aSign == bSign ) {
3232 return addFloat64Sigs( a, b, aSign STATUS_VAR );
3234 else {
3235 return subFloat64Sigs( a, b, aSign STATUS_VAR );
3240 /*----------------------------------------------------------------------------
3241 | Returns the result of subtracting the double-precision floating-point values
3242 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3243 | for Binary Floating-Point Arithmetic.
3244 *----------------------------------------------------------------------------*/
3246 float64 float64_sub( float64 a, float64 b STATUS_PARAM )
3248 flag aSign, bSign;
3249 a = float64_squash_input_denormal(a STATUS_VAR);
3250 b = float64_squash_input_denormal(b STATUS_VAR);
3252 aSign = extractFloat64Sign( a );
3253 bSign = extractFloat64Sign( b );
3254 if ( aSign == bSign ) {
3255 return subFloat64Sigs( a, b, aSign STATUS_VAR );
3257 else {
3258 return addFloat64Sigs( a, b, aSign STATUS_VAR );
3263 /*----------------------------------------------------------------------------
3264 | Returns the result of multiplying the double-precision floating-point values
3265 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3266 | for Binary Floating-Point Arithmetic.
3267 *----------------------------------------------------------------------------*/
3269 float64 float64_mul( float64 a, float64 b STATUS_PARAM )
3271 flag aSign, bSign, zSign;
3272 int16 aExp, bExp, zExp;
3273 uint64_t aSig, bSig, zSig0, zSig1;
3275 a = float64_squash_input_denormal(a STATUS_VAR);
3276 b = float64_squash_input_denormal(b STATUS_VAR);
3278 aSig = extractFloat64Frac( a );
3279 aExp = extractFloat64Exp( a );
3280 aSign = extractFloat64Sign( a );
3281 bSig = extractFloat64Frac( b );
3282 bExp = extractFloat64Exp( b );
3283 bSign = extractFloat64Sign( b );
3284 zSign = aSign ^ bSign;
3285 if ( aExp == 0x7FF ) {
3286 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3287 return propagateFloat64NaN( a, b STATUS_VAR );
3289 if ( ( bExp | bSig ) == 0 ) {
3290 float_raise( float_flag_invalid STATUS_VAR);
3291 return float64_default_nan;
3293 return packFloat64( zSign, 0x7FF, 0 );
3295 if ( bExp == 0x7FF ) {
3296 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3297 if ( ( aExp | aSig ) == 0 ) {
3298 float_raise( float_flag_invalid STATUS_VAR);
3299 return float64_default_nan;
3301 return packFloat64( zSign, 0x7FF, 0 );
3303 if ( aExp == 0 ) {
3304 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3305 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3307 if ( bExp == 0 ) {
3308 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
3309 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3311 zExp = aExp + bExp - 0x3FF;
3312 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3313 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3314 mul64To128( aSig, bSig, &zSig0, &zSig1 );
3315 zSig0 |= ( zSig1 != 0 );
3316 if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
3317 zSig0 <<= 1;
3318 --zExp;
3320 return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
3324 /*----------------------------------------------------------------------------
3325 | Returns the result of dividing the double-precision floating-point value `a'
3326 | by the corresponding value `b'. The operation is performed according to
3327 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3328 *----------------------------------------------------------------------------*/
3330 float64 float64_div( float64 a, float64 b STATUS_PARAM )
3332 flag aSign, bSign, zSign;
3333 int16 aExp, bExp, zExp;
3334 uint64_t aSig, bSig, zSig;
3335 uint64_t rem0, rem1;
3336 uint64_t term0, term1;
3337 a = float64_squash_input_denormal(a STATUS_VAR);
3338 b = float64_squash_input_denormal(b STATUS_VAR);
3340 aSig = extractFloat64Frac( a );
3341 aExp = extractFloat64Exp( a );
3342 aSign = extractFloat64Sign( a );
3343 bSig = extractFloat64Frac( b );
3344 bExp = extractFloat64Exp( b );
3345 bSign = extractFloat64Sign( b );
3346 zSign = aSign ^ bSign;
3347 if ( aExp == 0x7FF ) {
3348 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3349 if ( bExp == 0x7FF ) {
3350 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3351 float_raise( float_flag_invalid STATUS_VAR);
3352 return float64_default_nan;
3354 return packFloat64( zSign, 0x7FF, 0 );
3356 if ( bExp == 0x7FF ) {
3357 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3358 return packFloat64( zSign, 0, 0 );
3360 if ( bExp == 0 ) {
3361 if ( bSig == 0 ) {
3362 if ( ( aExp | aSig ) == 0 ) {
3363 float_raise( float_flag_invalid STATUS_VAR);
3364 return float64_default_nan;
3366 float_raise( float_flag_divbyzero STATUS_VAR);
3367 return packFloat64( zSign, 0x7FF, 0 );
3369 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3371 if ( aExp == 0 ) {
3372 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3373 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3375 zExp = aExp - bExp + 0x3FD;
3376 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3377 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3378 if ( bSig <= ( aSig + aSig ) ) {
3379 aSig >>= 1;
3380 ++zExp;
3382 zSig = estimateDiv128To64( aSig, 0, bSig );
3383 if ( ( zSig & 0x1FF ) <= 2 ) {
3384 mul64To128( bSig, zSig, &term0, &term1 );
3385 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3386 while ( (int64_t) rem0 < 0 ) {
3387 --zSig;
3388 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3390 zSig |= ( rem1 != 0 );
3392 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3396 /*----------------------------------------------------------------------------
3397 | Returns the remainder of the double-precision floating-point value `a'
3398 | with respect to the corresponding value `b'. The operation is performed
3399 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3400 *----------------------------------------------------------------------------*/
3402 float64 float64_rem( float64 a, float64 b STATUS_PARAM )
3404 flag aSign, zSign;
3405 int16 aExp, bExp, expDiff;
3406 uint64_t aSig, bSig;
3407 uint64_t q, alternateASig;
3408 int64_t sigMean;
3410 a = float64_squash_input_denormal(a STATUS_VAR);
3411 b = float64_squash_input_denormal(b STATUS_VAR);
3412 aSig = extractFloat64Frac( a );
3413 aExp = extractFloat64Exp( a );
3414 aSign = extractFloat64Sign( a );
3415 bSig = extractFloat64Frac( b );
3416 bExp = extractFloat64Exp( b );
3417 if ( aExp == 0x7FF ) {
3418 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3419 return propagateFloat64NaN( a, b STATUS_VAR );
3421 float_raise( float_flag_invalid STATUS_VAR);
3422 return float64_default_nan;
3424 if ( bExp == 0x7FF ) {
3425 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3426 return a;
3428 if ( bExp == 0 ) {
3429 if ( bSig == 0 ) {
3430 float_raise( float_flag_invalid STATUS_VAR);
3431 return float64_default_nan;
3433 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3435 if ( aExp == 0 ) {
3436 if ( aSig == 0 ) return a;
3437 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3439 expDiff = aExp - bExp;
3440 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3441 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3442 if ( expDiff < 0 ) {
3443 if ( expDiff < -1 ) return a;
3444 aSig >>= 1;
3446 q = ( bSig <= aSig );
3447 if ( q ) aSig -= bSig;
3448 expDiff -= 64;
3449 while ( 0 < expDiff ) {
3450 q = estimateDiv128To64( aSig, 0, bSig );
3451 q = ( 2 < q ) ? q - 2 : 0;
3452 aSig = - ( ( bSig>>2 ) * q );
3453 expDiff -= 62;
3455 expDiff += 64;
3456 if ( 0 < expDiff ) {
3457 q = estimateDiv128To64( aSig, 0, bSig );
3458 q = ( 2 < q ) ? q - 2 : 0;
3459 q >>= 64 - expDiff;
3460 bSig >>= 2;
3461 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3463 else {
3464 aSig >>= 2;
3465 bSig >>= 2;
3467 do {
3468 alternateASig = aSig;
3469 ++q;
3470 aSig -= bSig;
3471 } while ( 0 <= (int64_t) aSig );
3472 sigMean = aSig + alternateASig;
3473 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3474 aSig = alternateASig;
3476 zSign = ( (int64_t) aSig < 0 );
3477 if ( zSign ) aSig = - aSig;
3478 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
3482 /*----------------------------------------------------------------------------
3483 | Returns the square root of the double-precision floating-point value `a'.
3484 | The operation is performed according to the IEC/IEEE Standard for Binary
3485 | Floating-Point Arithmetic.
3486 *----------------------------------------------------------------------------*/
3488 float64 float64_sqrt( float64 a STATUS_PARAM )
3490 flag aSign;
3491 int16 aExp, zExp;
3492 uint64_t aSig, zSig, doubleZSig;
3493 uint64_t rem0, rem1, term0, term1;
3494 a = float64_squash_input_denormal(a STATUS_VAR);
3496 aSig = extractFloat64Frac( a );
3497 aExp = extractFloat64Exp( a );
3498 aSign = extractFloat64Sign( a );
3499 if ( aExp == 0x7FF ) {
3500 if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
3501 if ( ! aSign ) return a;
3502 float_raise( float_flag_invalid STATUS_VAR);
3503 return float64_default_nan;
3505 if ( aSign ) {
3506 if ( ( aExp | aSig ) == 0 ) return a;
3507 float_raise( float_flag_invalid STATUS_VAR);
3508 return float64_default_nan;
3510 if ( aExp == 0 ) {
3511 if ( aSig == 0 ) return float64_zero;
3512 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3514 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3515 aSig |= LIT64( 0x0010000000000000 );
3516 zSig = estimateSqrt32( aExp, aSig>>21 );
3517 aSig <<= 9 - ( aExp & 1 );
3518 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3519 if ( ( zSig & 0x1FF ) <= 5 ) {
3520 doubleZSig = zSig<<1;
3521 mul64To128( zSig, zSig, &term0, &term1 );
3522 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3523 while ( (int64_t) rem0 < 0 ) {
3524 --zSig;
3525 doubleZSig -= 2;
3526 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3528 zSig |= ( ( rem0 | rem1 ) != 0 );
3530 return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
3534 /*----------------------------------------------------------------------------
3535 | Returns the binary log of the double-precision floating-point value `a'.
3536 | The operation is performed according to the IEC/IEEE Standard for Binary
3537 | Floating-Point Arithmetic.
3538 *----------------------------------------------------------------------------*/
3539 float64 float64_log2( float64 a STATUS_PARAM )
3541 flag aSign, zSign;
3542 int16 aExp;
3543 uint64_t aSig, aSig0, aSig1, zSig, i;
3544 a = float64_squash_input_denormal(a STATUS_VAR);
3546 aSig = extractFloat64Frac( a );
3547 aExp = extractFloat64Exp( a );
3548 aSign = extractFloat64Sign( a );
3550 if ( aExp == 0 ) {
3551 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
3552 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3554 if ( aSign ) {
3555 float_raise( float_flag_invalid STATUS_VAR);
3556 return float64_default_nan;
3558 if ( aExp == 0x7FF ) {
3559 if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
3560 return a;
3563 aExp -= 0x3FF;
3564 aSig |= LIT64( 0x0010000000000000 );
3565 zSign = aExp < 0;
3566 zSig = (uint64_t)aExp << 52;
3567 for (i = 1LL << 51; i > 0; i >>= 1) {
3568 mul64To128( aSig, aSig, &aSig0, &aSig1 );
3569 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
3570 if ( aSig & LIT64( 0x0020000000000000 ) ) {
3571 aSig >>= 1;
3572 zSig |= i;
3576 if ( zSign )
3577 zSig = -zSig;
3578 return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
3581 /*----------------------------------------------------------------------------
3582 | Returns 1 if the double-precision floating-point value `a' is equal to the
3583 | corresponding value `b', and 0 otherwise. The invalid exception is raised
3584 | if either operand is a NaN. Otherwise, the comparison is performed
3585 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3586 *----------------------------------------------------------------------------*/
3588 int float64_eq( float64 a, float64 b STATUS_PARAM )
3590 uint64_t av, bv;
3591 a = float64_squash_input_denormal(a STATUS_VAR);
3592 b = float64_squash_input_denormal(b STATUS_VAR);
3594 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3595 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3597 float_raise( float_flag_invalid STATUS_VAR);
3598 return 0;
3600 av = float64_val(a);
3601 bv = float64_val(b);
3602 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
3606 /*----------------------------------------------------------------------------
3607 | Returns 1 if the double-precision floating-point value `a' is less than or
3608 | equal to the corresponding value `b', and 0 otherwise. The invalid
3609 | exception is raised if either operand is a NaN. The comparison is performed
3610 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3611 *----------------------------------------------------------------------------*/
3613 int float64_le( float64 a, float64 b STATUS_PARAM )
3615 flag aSign, bSign;
3616 uint64_t av, bv;
3617 a = float64_squash_input_denormal(a STATUS_VAR);
3618 b = float64_squash_input_denormal(b STATUS_VAR);
3620 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3621 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3623 float_raise( float_flag_invalid STATUS_VAR);
3624 return 0;
3626 aSign = extractFloat64Sign( a );
3627 bSign = extractFloat64Sign( b );
3628 av = float64_val(a);
3629 bv = float64_val(b);
3630 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
3631 return ( av == bv ) || ( aSign ^ ( av < bv ) );
3635 /*----------------------------------------------------------------------------
3636 | Returns 1 if the double-precision floating-point value `a' is less than
3637 | the corresponding value `b', and 0 otherwise. The invalid exception is
3638 | raised if either operand is a NaN. The comparison is performed according
3639 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3640 *----------------------------------------------------------------------------*/
3642 int float64_lt( float64 a, float64 b STATUS_PARAM )
3644 flag aSign, bSign;
3645 uint64_t av, bv;
3647 a = float64_squash_input_denormal(a STATUS_VAR);
3648 b = float64_squash_input_denormal(b STATUS_VAR);
3649 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3650 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3652 float_raise( float_flag_invalid STATUS_VAR);
3653 return 0;
3655 aSign = extractFloat64Sign( a );
3656 bSign = extractFloat64Sign( b );
3657 av = float64_val(a);
3658 bv = float64_val(b);
3659 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
3660 return ( av != bv ) && ( aSign ^ ( av < bv ) );
3664 /*----------------------------------------------------------------------------
3665 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
3666 | be compared, and 0 otherwise. The invalid exception is raised if either
3667 | operand is a NaN. The comparison is performed according to the IEC/IEEE
3668 | Standard for Binary Floating-Point Arithmetic.
3669 *----------------------------------------------------------------------------*/
3671 int float64_unordered( float64 a, float64 b STATUS_PARAM )
3673 a = float64_squash_input_denormal(a STATUS_VAR);
3674 b = float64_squash_input_denormal(b STATUS_VAR);
3676 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3677 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3679 float_raise( float_flag_invalid STATUS_VAR);
3680 return 1;
3682 return 0;
3685 /*----------------------------------------------------------------------------
3686 | Returns 1 if the double-precision floating-point value `a' is equal to the
3687 | corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3688 | exception.The comparison is performed according to the IEC/IEEE Standard
3689 | for Binary Floating-Point Arithmetic.
3690 *----------------------------------------------------------------------------*/
3692 int float64_eq_quiet( float64 a, float64 b STATUS_PARAM )
3694 uint64_t av, bv;
3695 a = float64_squash_input_denormal(a STATUS_VAR);
3696 b = float64_squash_input_denormal(b STATUS_VAR);
3698 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3699 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3701 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3702 float_raise( float_flag_invalid STATUS_VAR);
3704 return 0;
3706 av = float64_val(a);
3707 bv = float64_val(b);
3708 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
3712 /*----------------------------------------------------------------------------
3713 | Returns 1 if the double-precision floating-point value `a' is less than or
3714 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3715 | cause an exception. Otherwise, the comparison is performed according to the
3716 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3717 *----------------------------------------------------------------------------*/
3719 int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
3721 flag aSign, bSign;
3722 uint64_t av, bv;
3723 a = float64_squash_input_denormal(a STATUS_VAR);
3724 b = float64_squash_input_denormal(b STATUS_VAR);
3726 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3727 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3729 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3730 float_raise( float_flag_invalid STATUS_VAR);
3732 return 0;
3734 aSign = extractFloat64Sign( a );
3735 bSign = extractFloat64Sign( b );
3736 av = float64_val(a);
3737 bv = float64_val(b);
3738 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
3739 return ( av == bv ) || ( aSign ^ ( av < bv ) );
3743 /*----------------------------------------------------------------------------
3744 | Returns 1 if the double-precision floating-point value `a' is less than
3745 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3746 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
3747 | Standard for Binary Floating-Point Arithmetic.
3748 *----------------------------------------------------------------------------*/
3750 int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
3752 flag aSign, bSign;
3753 uint64_t av, bv;
3754 a = float64_squash_input_denormal(a STATUS_VAR);
3755 b = float64_squash_input_denormal(b STATUS_VAR);
3757 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3758 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3760 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3761 float_raise( float_flag_invalid STATUS_VAR);
3763 return 0;
3765 aSign = extractFloat64Sign( a );
3766 bSign = extractFloat64Sign( b );
3767 av = float64_val(a);
3768 bv = float64_val(b);
3769 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
3770 return ( av != bv ) && ( aSign ^ ( av < bv ) );
3774 /*----------------------------------------------------------------------------
3775 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
3776 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
3777 | comparison is performed according to the IEC/IEEE Standard for Binary
3778 | Floating-Point Arithmetic.
3779 *----------------------------------------------------------------------------*/
3781 int float64_unordered_quiet( float64 a, float64 b STATUS_PARAM )
3783 a = float64_squash_input_denormal(a STATUS_VAR);
3784 b = float64_squash_input_denormal(b STATUS_VAR);
3786 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3787 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3789 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3790 float_raise( float_flag_invalid STATUS_VAR);
3792 return 1;
3794 return 0;
3797 #ifdef FLOATX80
3799 /*----------------------------------------------------------------------------
3800 | Returns the result of converting the extended double-precision floating-
3801 | point value `a' to the 32-bit two's complement integer format. The
3802 | conversion is performed according to the IEC/IEEE Standard for Binary
3803 | Floating-Point Arithmetic---which means in particular that the conversion
3804 | is rounded according to the current rounding mode. If `a' is a NaN, the
3805 | largest positive integer is returned. Otherwise, if the conversion
3806 | overflows, the largest integer with the same sign as `a' is returned.
3807 *----------------------------------------------------------------------------*/
3809 int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
3811 flag aSign;
3812 int32 aExp, shiftCount;
3813 uint64_t aSig;
3815 aSig = extractFloatx80Frac( a );
3816 aExp = extractFloatx80Exp( a );
3817 aSign = extractFloatx80Sign( a );
3818 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
3819 shiftCount = 0x4037 - aExp;
3820 if ( shiftCount <= 0 ) shiftCount = 1;
3821 shift64RightJamming( aSig, shiftCount, &aSig );
3822 return roundAndPackInt32( aSign, aSig STATUS_VAR );
3826 /*----------------------------------------------------------------------------
3827 | Returns the result of converting the extended double-precision floating-
3828 | point value `a' to the 32-bit two's complement integer format. The
3829 | conversion is performed according to the IEC/IEEE Standard for Binary
3830 | Floating-Point Arithmetic, except that the conversion is always rounded
3831 | toward zero. If `a' is a NaN, the largest positive integer is returned.
3832 | Otherwise, if the conversion overflows, the largest integer with the same
3833 | sign as `a' is returned.
3834 *----------------------------------------------------------------------------*/
3836 int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
3838 flag aSign;
3839 int32 aExp, shiftCount;
3840 uint64_t aSig, savedASig;
3841 int32 z;
3843 aSig = extractFloatx80Frac( a );
3844 aExp = extractFloatx80Exp( a );
3845 aSign = extractFloatx80Sign( a );
3846 if ( 0x401E < aExp ) {
3847 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
3848 goto invalid;
3850 else if ( aExp < 0x3FFF ) {
3851 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3852 return 0;
3854 shiftCount = 0x403E - aExp;
3855 savedASig = aSig;
3856 aSig >>= shiftCount;
3857 z = aSig;
3858 if ( aSign ) z = - z;
3859 if ( ( z < 0 ) ^ aSign ) {
3860 invalid:
3861 float_raise( float_flag_invalid STATUS_VAR);
3862 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
3864 if ( ( aSig<<shiftCount ) != savedASig ) {
3865 STATUS(float_exception_flags) |= float_flag_inexact;
3867 return z;
3871 /*----------------------------------------------------------------------------
3872 | Returns the result of converting the extended double-precision floating-
3873 | point value `a' to the 64-bit two's complement integer format. The
3874 | conversion is performed according to the IEC/IEEE Standard for Binary
3875 | Floating-Point Arithmetic---which means in particular that the conversion
3876 | is rounded according to the current rounding mode. If `a' is a NaN,
3877 | the largest positive integer is returned. Otherwise, if the conversion
3878 | overflows, the largest integer with the same sign as `a' is returned.
3879 *----------------------------------------------------------------------------*/
3881 int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
3883 flag aSign;
3884 int32 aExp, shiftCount;
3885 uint64_t aSig, aSigExtra;
3887 aSig = extractFloatx80Frac( a );
3888 aExp = extractFloatx80Exp( a );
3889 aSign = extractFloatx80Sign( a );
3890 shiftCount = 0x403E - aExp;
3891 if ( shiftCount <= 0 ) {
3892 if ( shiftCount ) {
3893 float_raise( float_flag_invalid STATUS_VAR);
3894 if ( ! aSign
3895 || ( ( aExp == 0x7FFF )
3896 && ( aSig != LIT64( 0x8000000000000000 ) ) )
3898 return LIT64( 0x7FFFFFFFFFFFFFFF );
3900 return (int64_t) LIT64( 0x8000000000000000 );
3902 aSigExtra = 0;
3904 else {
3905 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3907 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3911 /*----------------------------------------------------------------------------
3912 | Returns the result of converting the extended double-precision floating-
3913 | point value `a' to the 64-bit two's complement integer format. The
3914 | conversion is performed according to the IEC/IEEE Standard for Binary
3915 | Floating-Point Arithmetic, except that the conversion is always rounded
3916 | toward zero. If `a' is a NaN, the largest positive integer is returned.
3917 | Otherwise, if the conversion overflows, the largest integer with the same
3918 | sign as `a' is returned.
3919 *----------------------------------------------------------------------------*/
3921 int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
3923 flag aSign;
3924 int32 aExp, shiftCount;
3925 uint64_t aSig;
3926 int64 z;
3928 aSig = extractFloatx80Frac( a );
3929 aExp = extractFloatx80Exp( a );
3930 aSign = extractFloatx80Sign( a );
3931 shiftCount = aExp - 0x403E;
3932 if ( 0 <= shiftCount ) {
3933 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3934 if ( ( a.high != 0xC03E ) || aSig ) {
3935 float_raise( float_flag_invalid STATUS_VAR);
3936 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3937 return LIT64( 0x7FFFFFFFFFFFFFFF );
3940 return (int64_t) LIT64( 0x8000000000000000 );
3942 else if ( aExp < 0x3FFF ) {
3943 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3944 return 0;
3946 z = aSig>>( - shiftCount );
3947 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
3948 STATUS(float_exception_flags) |= float_flag_inexact;
3950 if ( aSign ) z = - z;
3951 return z;
3955 /*----------------------------------------------------------------------------
3956 | Returns the result of converting the extended double-precision floating-
3957 | point value `a' to the single-precision floating-point format. The
3958 | conversion is performed according to the IEC/IEEE Standard for Binary
3959 | Floating-Point Arithmetic.
3960 *----------------------------------------------------------------------------*/
3962 float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
3964 flag aSign;
3965 int32 aExp;
3966 uint64_t aSig;
3968 aSig = extractFloatx80Frac( a );
3969 aExp = extractFloatx80Exp( a );
3970 aSign = extractFloatx80Sign( a );
3971 if ( aExp == 0x7FFF ) {
3972 if ( (uint64_t) ( aSig<<1 ) ) {
3973 return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3975 return packFloat32( aSign, 0xFF, 0 );
3977 shift64RightJamming( aSig, 33, &aSig );
3978 if ( aExp || aSig ) aExp -= 0x3F81;
3979 return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
3983 /*----------------------------------------------------------------------------
3984 | Returns the result of converting the extended double-precision floating-
3985 | point value `a' to the double-precision floating-point format. The
3986 | conversion is performed according to the IEC/IEEE Standard for Binary
3987 | Floating-Point Arithmetic.
3988 *----------------------------------------------------------------------------*/
3990 float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
3992 flag aSign;
3993 int32 aExp;
3994 uint64_t aSig, zSig;
3996 aSig = extractFloatx80Frac( a );
3997 aExp = extractFloatx80Exp( a );
3998 aSign = extractFloatx80Sign( a );
3999 if ( aExp == 0x7FFF ) {
4000 if ( (uint64_t) ( aSig<<1 ) ) {
4001 return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4003 return packFloat64( aSign, 0x7FF, 0 );
4005 shift64RightJamming( aSig, 1, &zSig );
4006 if ( aExp || aSig ) aExp -= 0x3C01;
4007 return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
4011 #ifdef FLOAT128
4013 /*----------------------------------------------------------------------------
4014 | Returns the result of converting the extended double-precision floating-
4015 | point value `a' to the quadruple-precision floating-point format. The
4016 | conversion is performed according to the IEC/IEEE Standard for Binary
4017 | Floating-Point Arithmetic.
4018 *----------------------------------------------------------------------------*/
4020 float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
4022 flag aSign;
4023 int16 aExp;
4024 uint64_t aSig, zSig0, zSig1;
4026 aSig = extractFloatx80Frac( a );
4027 aExp = extractFloatx80Exp( a );
4028 aSign = extractFloatx80Sign( a );
4029 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4030 return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4032 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4033 return packFloat128( aSign, aExp, zSig0, zSig1 );
4037 #endif
4039 /*----------------------------------------------------------------------------
4040 | Rounds the extended double-precision floating-point value `a' to an integer,
4041 | and returns the result as an extended quadruple-precision floating-point
4042 | value. The operation is performed according to the IEC/IEEE Standard for
4043 | Binary Floating-Point Arithmetic.
4044 *----------------------------------------------------------------------------*/
4046 floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
4048 flag aSign;
4049 int32 aExp;
4050 uint64_t lastBitMask, roundBitsMask;
4051 int8 roundingMode;
4052 floatx80 z;
4054 aExp = extractFloatx80Exp( a );
4055 if ( 0x403E <= aExp ) {
4056 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4057 return propagateFloatx80NaN( a, a STATUS_VAR );
4059 return a;
4061 if ( aExp < 0x3FFF ) {
4062 if ( ( aExp == 0 )
4063 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4064 return a;
4066 STATUS(float_exception_flags) |= float_flag_inexact;
4067 aSign = extractFloatx80Sign( a );
4068 switch ( STATUS(float_rounding_mode) ) {
4069 case float_round_nearest_even:
4070 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4072 return
4073 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4075 break;
4076 case float_round_down:
4077 return
4078 aSign ?
4079 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4080 : packFloatx80( 0, 0, 0 );
4081 case float_round_up:
4082 return
4083 aSign ? packFloatx80( 1, 0, 0 )
4084 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4086 return packFloatx80( aSign, 0, 0 );
4088 lastBitMask = 1;
4089 lastBitMask <<= 0x403E - aExp;
4090 roundBitsMask = lastBitMask - 1;
4091 z = a;
4092 roundingMode = STATUS(float_rounding_mode);
4093 if ( roundingMode == float_round_nearest_even ) {
4094 z.low += lastBitMask>>1;
4095 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4097 else if ( roundingMode != float_round_to_zero ) {
4098 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
4099 z.low += roundBitsMask;
4102 z.low &= ~ roundBitsMask;
4103 if ( z.low == 0 ) {
4104 ++z.high;
4105 z.low = LIT64( 0x8000000000000000 );
4107 if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
4108 return z;
4112 /*----------------------------------------------------------------------------
4113 | Returns the result of adding the absolute values of the extended double-
4114 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
4115 | negated before being returned. `zSign' is ignored if the result is a NaN.
4116 | The addition is performed according to the IEC/IEEE Standard for Binary
4117 | Floating-Point Arithmetic.
4118 *----------------------------------------------------------------------------*/
4120 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
4122 int32 aExp, bExp, zExp;
4123 uint64_t aSig, bSig, zSig0, zSig1;
4124 int32 expDiff;
4126 aSig = extractFloatx80Frac( a );
4127 aExp = extractFloatx80Exp( a );
4128 bSig = extractFloatx80Frac( b );
4129 bExp = extractFloatx80Exp( b );
4130 expDiff = aExp - bExp;
4131 if ( 0 < expDiff ) {
4132 if ( aExp == 0x7FFF ) {
4133 if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4134 return a;
4136 if ( bExp == 0 ) --expDiff;
4137 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4138 zExp = aExp;
4140 else if ( expDiff < 0 ) {
4141 if ( bExp == 0x7FFF ) {
4142 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4143 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4145 if ( aExp == 0 ) ++expDiff;
4146 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4147 zExp = bExp;
4149 else {
4150 if ( aExp == 0x7FFF ) {
4151 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4152 return propagateFloatx80NaN( a, b STATUS_VAR );
4154 return a;
4156 zSig1 = 0;
4157 zSig0 = aSig + bSig;
4158 if ( aExp == 0 ) {
4159 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4160 goto roundAndPack;
4162 zExp = aExp;
4163 goto shiftRight1;
4165 zSig0 = aSig + bSig;
4166 if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
4167 shiftRight1:
4168 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4169 zSig0 |= LIT64( 0x8000000000000000 );
4170 ++zExp;
4171 roundAndPack:
4172 return
4173 roundAndPackFloatx80(
4174 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4178 /*----------------------------------------------------------------------------
4179 | Returns the result of subtracting the absolute values of the extended
4180 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
4181 | difference is negated before being returned. `zSign' is ignored if the
4182 | result is a NaN. The subtraction is performed according to the IEC/IEEE
4183 | Standard for Binary Floating-Point Arithmetic.
4184 *----------------------------------------------------------------------------*/
4186 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
4188 int32 aExp, bExp, zExp;
4189 uint64_t aSig, bSig, zSig0, zSig1;
4190 int32 expDiff;
4191 floatx80 z;
4193 aSig = extractFloatx80Frac( a );
4194 aExp = extractFloatx80Exp( a );
4195 bSig = extractFloatx80Frac( b );
4196 bExp = extractFloatx80Exp( b );
4197 expDiff = aExp - bExp;
4198 if ( 0 < expDiff ) goto aExpBigger;
4199 if ( expDiff < 0 ) goto bExpBigger;
4200 if ( aExp == 0x7FFF ) {
4201 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4202 return propagateFloatx80NaN( a, b STATUS_VAR );
4204 float_raise( float_flag_invalid STATUS_VAR);
4205 z.low = floatx80_default_nan_low;
4206 z.high = floatx80_default_nan_high;
4207 return z;
4209 if ( aExp == 0 ) {
4210 aExp = 1;
4211 bExp = 1;
4213 zSig1 = 0;
4214 if ( bSig < aSig ) goto aBigger;
4215 if ( aSig < bSig ) goto bBigger;
4216 return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
4217 bExpBigger:
4218 if ( bExp == 0x7FFF ) {
4219 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4220 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
4222 if ( aExp == 0 ) ++expDiff;
4223 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4224 bBigger:
4225 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4226 zExp = bExp;
4227 zSign ^= 1;
4228 goto normalizeRoundAndPack;
4229 aExpBigger:
4230 if ( aExp == 0x7FFF ) {
4231 if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4232 return a;
4234 if ( bExp == 0 ) --expDiff;
4235 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4236 aBigger:
4237 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4238 zExp = aExp;
4239 normalizeRoundAndPack:
4240 return
4241 normalizeRoundAndPackFloatx80(
4242 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4246 /*----------------------------------------------------------------------------
4247 | Returns the result of adding the extended double-precision floating-point
4248 | values `a' and `b'. The operation is performed according to the IEC/IEEE
4249 | Standard for Binary Floating-Point Arithmetic.
4250 *----------------------------------------------------------------------------*/
4252 floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
4254 flag aSign, bSign;
4256 aSign = extractFloatx80Sign( a );
4257 bSign = extractFloatx80Sign( b );
4258 if ( aSign == bSign ) {
4259 return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4261 else {
4262 return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4267 /*----------------------------------------------------------------------------
4268 | Returns the result of subtracting the extended double-precision floating-
4269 | point values `a' and `b'. The operation is performed according to the
4270 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4271 *----------------------------------------------------------------------------*/
4273 floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
4275 flag aSign, bSign;
4277 aSign = extractFloatx80Sign( a );
4278 bSign = extractFloatx80Sign( b );
4279 if ( aSign == bSign ) {
4280 return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4282 else {
4283 return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4288 /*----------------------------------------------------------------------------
4289 | Returns the result of multiplying the extended double-precision floating-
4290 | point values `a' and `b'. The operation is performed according to the
4291 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4292 *----------------------------------------------------------------------------*/
4294 floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
4296 flag aSign, bSign, zSign;
4297 int32 aExp, bExp, zExp;
4298 uint64_t aSig, bSig, zSig0, zSig1;
4299 floatx80 z;
4301 aSig = extractFloatx80Frac( a );
4302 aExp = extractFloatx80Exp( a );
4303 aSign = extractFloatx80Sign( a );
4304 bSig = extractFloatx80Frac( b );
4305 bExp = extractFloatx80Exp( b );
4306 bSign = extractFloatx80Sign( b );
4307 zSign = aSign ^ bSign;
4308 if ( aExp == 0x7FFF ) {
4309 if ( (uint64_t) ( aSig<<1 )
4310 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
4311 return propagateFloatx80NaN( a, b STATUS_VAR );
4313 if ( ( bExp | bSig ) == 0 ) goto invalid;
4314 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4316 if ( bExp == 0x7FFF ) {
4317 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4318 if ( ( aExp | aSig ) == 0 ) {
4319 invalid:
4320 float_raise( float_flag_invalid STATUS_VAR);
4321 z.low = floatx80_default_nan_low;
4322 z.high = floatx80_default_nan_high;
4323 return z;
4325 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4327 if ( aExp == 0 ) {
4328 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4329 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4331 if ( bExp == 0 ) {
4332 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
4333 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4335 zExp = aExp + bExp - 0x3FFE;
4336 mul64To128( aSig, bSig, &zSig0, &zSig1 );
4337 if ( 0 < (int64_t) zSig0 ) {
4338 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
4339 --zExp;
4341 return
4342 roundAndPackFloatx80(
4343 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4347 /*----------------------------------------------------------------------------
4348 | Returns the result of dividing the extended double-precision floating-point
4349 | value `a' by the corresponding value `b'. The operation is performed
4350 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4351 *----------------------------------------------------------------------------*/
4353 floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
4355 flag aSign, bSign, zSign;
4356 int32 aExp, bExp, zExp;
4357 uint64_t aSig, bSig, zSig0, zSig1;
4358 uint64_t rem0, rem1, rem2, term0, term1, term2;
4359 floatx80 z;
4361 aSig = extractFloatx80Frac( a );
4362 aExp = extractFloatx80Exp( a );
4363 aSign = extractFloatx80Sign( a );
4364 bSig = extractFloatx80Frac( b );
4365 bExp = extractFloatx80Exp( b );
4366 bSign = extractFloatx80Sign( b );
4367 zSign = aSign ^ bSign;
4368 if ( aExp == 0x7FFF ) {
4369 if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4370 if ( bExp == 0x7FFF ) {
4371 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4372 goto invalid;
4374 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4376 if ( bExp == 0x7FFF ) {
4377 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4378 return packFloatx80( zSign, 0, 0 );
4380 if ( bExp == 0 ) {
4381 if ( bSig == 0 ) {
4382 if ( ( aExp | aSig ) == 0 ) {
4383 invalid:
4384 float_raise( float_flag_invalid STATUS_VAR);
4385 z.low = floatx80_default_nan_low;
4386 z.high = floatx80_default_nan_high;
4387 return z;
4389 float_raise( float_flag_divbyzero STATUS_VAR);
4390 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4392 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4394 if ( aExp == 0 ) {
4395 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4396 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4398 zExp = aExp - bExp + 0x3FFE;
4399 rem1 = 0;
4400 if ( bSig <= aSig ) {
4401 shift128Right( aSig, 0, 1, &aSig, &rem1 );
4402 ++zExp;
4404 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
4405 mul64To128( bSig, zSig0, &term0, &term1 );
4406 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
4407 while ( (int64_t) rem0 < 0 ) {
4408 --zSig0;
4409 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4411 zSig1 = estimateDiv128To64( rem1, 0, bSig );
4412 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
4413 mul64To128( bSig, zSig1, &term1, &term2 );
4414 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4415 while ( (int64_t) rem1 < 0 ) {
4416 --zSig1;
4417 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
4419 zSig1 |= ( ( rem1 | rem2 ) != 0 );
4421 return
4422 roundAndPackFloatx80(
4423 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4427 /*----------------------------------------------------------------------------
4428 | Returns the remainder of the extended double-precision floating-point value
4429 | `a' with respect to the corresponding value `b'. The operation is performed
4430 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4431 *----------------------------------------------------------------------------*/
4433 floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
4435 flag aSign, zSign;
4436 int32 aExp, bExp, expDiff;
4437 uint64_t aSig0, aSig1, bSig;
4438 uint64_t q, term0, term1, alternateASig0, alternateASig1;
4439 floatx80 z;
4441 aSig0 = extractFloatx80Frac( a );
4442 aExp = extractFloatx80Exp( a );
4443 aSign = extractFloatx80Sign( a );
4444 bSig = extractFloatx80Frac( b );
4445 bExp = extractFloatx80Exp( b );
4446 if ( aExp == 0x7FFF ) {
4447 if ( (uint64_t) ( aSig0<<1 )
4448 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
4449 return propagateFloatx80NaN( a, b STATUS_VAR );
4451 goto invalid;
4453 if ( bExp == 0x7FFF ) {
4454 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4455 return a;
4457 if ( bExp == 0 ) {
4458 if ( bSig == 0 ) {
4459 invalid:
4460 float_raise( float_flag_invalid STATUS_VAR);
4461 z.low = floatx80_default_nan_low;
4462 z.high = floatx80_default_nan_high;
4463 return z;
4465 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4467 if ( aExp == 0 ) {
4468 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
4469 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4471 bSig |= LIT64( 0x8000000000000000 );
4472 zSign = aSign;
4473 expDiff = aExp - bExp;
4474 aSig1 = 0;
4475 if ( expDiff < 0 ) {
4476 if ( expDiff < -1 ) return a;
4477 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
4478 expDiff = 0;
4480 q = ( bSig <= aSig0 );
4481 if ( q ) aSig0 -= bSig;
4482 expDiff -= 64;
4483 while ( 0 < expDiff ) {
4484 q = estimateDiv128To64( aSig0, aSig1, bSig );
4485 q = ( 2 < q ) ? q - 2 : 0;
4486 mul64To128( bSig, q, &term0, &term1 );
4487 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4488 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
4489 expDiff -= 62;
4491 expDiff += 64;
4492 if ( 0 < expDiff ) {
4493 q = estimateDiv128To64( aSig0, aSig1, bSig );
4494 q = ( 2 < q ) ? q - 2 : 0;
4495 q >>= 64 - expDiff;
4496 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
4497 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4498 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
4499 while ( le128( term0, term1, aSig0, aSig1 ) ) {
4500 ++q;
4501 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4504 else {
4505 term1 = 0;
4506 term0 = bSig;
4508 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4509 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4510 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4511 && ( q & 1 ) )
4513 aSig0 = alternateASig0;
4514 aSig1 = alternateASig1;
4515 zSign = ! zSign;
4517 return
4518 normalizeRoundAndPackFloatx80(
4519 80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
4523 /*----------------------------------------------------------------------------
4524 | Returns the square root of the extended double-precision floating-point
4525 | value `a'. The operation is performed according to the IEC/IEEE Standard
4526 | for Binary Floating-Point Arithmetic.
4527 *----------------------------------------------------------------------------*/
4529 floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
4531 flag aSign;
4532 int32 aExp, zExp;
4533 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4534 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4535 floatx80 z;
4537 aSig0 = extractFloatx80Frac( a );
4538 aExp = extractFloatx80Exp( a );
4539 aSign = extractFloatx80Sign( a );
4540 if ( aExp == 0x7FFF ) {
4541 if ( (uint64_t) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
4542 if ( ! aSign ) return a;
4543 goto invalid;
4545 if ( aSign ) {
4546 if ( ( aExp | aSig0 ) == 0 ) return a;
4547 invalid:
4548 float_raise( float_flag_invalid STATUS_VAR);
4549 z.low = floatx80_default_nan_low;
4550 z.high = floatx80_default_nan_high;
4551 return z;
4553 if ( aExp == 0 ) {
4554 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4555 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4557 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4558 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4559 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4560 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4561 doubleZSig0 = zSig0<<1;
4562 mul64To128( zSig0, zSig0, &term0, &term1 );
4563 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4564 while ( (int64_t) rem0 < 0 ) {
4565 --zSig0;
4566 doubleZSig0 -= 2;
4567 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4569 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4570 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4571 if ( zSig1 == 0 ) zSig1 = 1;
4572 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4573 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4574 mul64To128( zSig1, zSig1, &term2, &term3 );
4575 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4576 while ( (int64_t) rem1 < 0 ) {
4577 --zSig1;
4578 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4579 term3 |= 1;
4580 term2 |= doubleZSig0;
4581 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4583 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4585 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4586 zSig0 |= doubleZSig0;
4587 return
4588 roundAndPackFloatx80(
4589 STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
4593 /*----------------------------------------------------------------------------
4594 | Returns 1 if the extended double-precision floating-point value `a' is equal
4595 | to the corresponding value `b', and 0 otherwise. The invalid exception is
4596 | raised if either operand is a NaN. Otherwise, the comparison is performed
4597 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4598 *----------------------------------------------------------------------------*/
4600 int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
4603 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4604 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4605 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4606 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4608 float_raise( float_flag_invalid STATUS_VAR);
4609 return 0;
4611 return
4612 ( a.low == b.low )
4613 && ( ( a.high == b.high )
4614 || ( ( a.low == 0 )
4615 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
4620 /*----------------------------------------------------------------------------
4621 | Returns 1 if the extended double-precision floating-point value `a' is
4622 | less than or equal to the corresponding value `b', and 0 otherwise. The
4623 | invalid exception is raised if either operand is a NaN. The comparison is
4624 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4625 | Arithmetic.
4626 *----------------------------------------------------------------------------*/
4628 int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
4630 flag aSign, bSign;
4632 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4633 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4634 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4635 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4637 float_raise( float_flag_invalid STATUS_VAR);
4638 return 0;
4640 aSign = extractFloatx80Sign( a );
4641 bSign = extractFloatx80Sign( b );
4642 if ( aSign != bSign ) {
4643 return
4644 aSign
4645 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4646 == 0 );
4648 return
4649 aSign ? le128( b.high, b.low, a.high, a.low )
4650 : le128( a.high, a.low, b.high, b.low );
4654 /*----------------------------------------------------------------------------
4655 | Returns 1 if the extended double-precision floating-point value `a' is
4656 | less than the corresponding value `b', and 0 otherwise. The invalid
4657 | exception is raised if either operand is a NaN. The comparison is performed
4658 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4659 *----------------------------------------------------------------------------*/
4661 int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
4663 flag aSign, bSign;
4665 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4666 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4667 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4668 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4670 float_raise( float_flag_invalid STATUS_VAR);
4671 return 0;
4673 aSign = extractFloatx80Sign( a );
4674 bSign = extractFloatx80Sign( b );
4675 if ( aSign != bSign ) {
4676 return
4677 aSign
4678 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4679 != 0 );
4681 return
4682 aSign ? lt128( b.high, b.low, a.high, a.low )
4683 : lt128( a.high, a.low, b.high, b.low );
4687 /*----------------------------------------------------------------------------
4688 | Returns 1 if the extended double-precision floating-point values `a' and `b'
4689 | cannot be compared, and 0 otherwise. The invalid exception is raised if
4690 | either operand is a NaN. The comparison is performed according to the
4691 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4692 *----------------------------------------------------------------------------*/
4693 int floatx80_unordered( floatx80 a, floatx80 b STATUS_PARAM )
4695 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4696 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4697 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4698 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4700 float_raise( float_flag_invalid STATUS_VAR);
4701 return 1;
4703 return 0;
4706 /*----------------------------------------------------------------------------
4707 | Returns 1 if the extended double-precision floating-point value `a' is
4708 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4709 | cause an exception. The comparison is performed according to the IEC/IEEE
4710 | Standard for Binary Floating-Point Arithmetic.
4711 *----------------------------------------------------------------------------*/
4713 int floatx80_eq_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4716 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4717 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4718 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4719 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4721 if ( floatx80_is_signaling_nan( a )
4722 || floatx80_is_signaling_nan( b ) ) {
4723 float_raise( float_flag_invalid STATUS_VAR);
4725 return 0;
4727 return
4728 ( a.low == b.low )
4729 && ( ( a.high == b.high )
4730 || ( ( a.low == 0 )
4731 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
4736 /*----------------------------------------------------------------------------
4737 | Returns 1 if the extended double-precision floating-point value `a' is less
4738 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
4739 | do not cause an exception. Otherwise, the comparison is performed according
4740 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4741 *----------------------------------------------------------------------------*/
4743 int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4745 flag aSign, bSign;
4747 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4748 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4749 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4750 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4752 if ( floatx80_is_signaling_nan( a )
4753 || floatx80_is_signaling_nan( b ) ) {
4754 float_raise( float_flag_invalid STATUS_VAR);
4756 return 0;
4758 aSign = extractFloatx80Sign( a );
4759 bSign = extractFloatx80Sign( b );
4760 if ( aSign != bSign ) {
4761 return
4762 aSign
4763 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4764 == 0 );
4766 return
4767 aSign ? le128( b.high, b.low, a.high, a.low )
4768 : le128( a.high, a.low, b.high, b.low );
4772 /*----------------------------------------------------------------------------
4773 | Returns 1 if the extended double-precision floating-point value `a' is less
4774 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
4775 | an exception. Otherwise, the comparison is performed according to the
4776 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4777 *----------------------------------------------------------------------------*/
4779 int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4781 flag aSign, bSign;
4783 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4784 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4785 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4786 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4788 if ( floatx80_is_signaling_nan( a )
4789 || floatx80_is_signaling_nan( b ) ) {
4790 float_raise( float_flag_invalid STATUS_VAR);
4792 return 0;
4794 aSign = extractFloatx80Sign( a );
4795 bSign = extractFloatx80Sign( b );
4796 if ( aSign != bSign ) {
4797 return
4798 aSign
4799 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4800 != 0 );
4802 return
4803 aSign ? lt128( b.high, b.low, a.high, a.low )
4804 : lt128( a.high, a.low, b.high, b.low );
4808 /*----------------------------------------------------------------------------
4809 | Returns 1 if the extended double-precision floating-point values `a' and `b'
4810 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
4811 | The comparison is performed according to the IEC/IEEE Standard for Binary
4812 | Floating-Point Arithmetic.
4813 *----------------------------------------------------------------------------*/
4814 int floatx80_unordered_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4816 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4817 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4818 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4819 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4821 if ( floatx80_is_signaling_nan( a )
4822 || floatx80_is_signaling_nan( b ) ) {
4823 float_raise( float_flag_invalid STATUS_VAR);
4825 return 1;
4827 return 0;
4830 #endif
4832 #ifdef FLOAT128
4834 /*----------------------------------------------------------------------------
4835 | Returns the result of converting the quadruple-precision floating-point
4836 | value `a' to the 32-bit two's complement integer format. The conversion
4837 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4838 | Arithmetic---which means in particular that the conversion is rounded
4839 | according to the current rounding mode. If `a' is a NaN, the largest
4840 | positive integer is returned. Otherwise, if the conversion overflows, the
4841 | largest integer with the same sign as `a' is returned.
4842 *----------------------------------------------------------------------------*/
4844 int32 float128_to_int32( float128 a STATUS_PARAM )
4846 flag aSign;
4847 int32 aExp, shiftCount;
4848 uint64_t aSig0, aSig1;
4850 aSig1 = extractFloat128Frac1( a );
4851 aSig0 = extractFloat128Frac0( a );
4852 aExp = extractFloat128Exp( a );
4853 aSign = extractFloat128Sign( a );
4854 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4855 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4856 aSig0 |= ( aSig1 != 0 );
4857 shiftCount = 0x4028 - aExp;
4858 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4859 return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
4863 /*----------------------------------------------------------------------------
4864 | Returns the result of converting the quadruple-precision floating-point
4865 | value `a' to the 32-bit two's complement integer format. The conversion
4866 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4867 | Arithmetic, except that the conversion is always rounded toward zero. If
4868 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
4869 | conversion overflows, the largest integer with the same sign as `a' is
4870 | returned.
4871 *----------------------------------------------------------------------------*/
4873 int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
4875 flag aSign;
4876 int32 aExp, shiftCount;
4877 uint64_t aSig0, aSig1, savedASig;
4878 int32 z;
4880 aSig1 = extractFloat128Frac1( a );
4881 aSig0 = extractFloat128Frac0( a );
4882 aExp = extractFloat128Exp( a );
4883 aSign = extractFloat128Sign( a );
4884 aSig0 |= ( aSig1 != 0 );
4885 if ( 0x401E < aExp ) {
4886 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4887 goto invalid;
4889 else if ( aExp < 0x3FFF ) {
4890 if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
4891 return 0;
4893 aSig0 |= LIT64( 0x0001000000000000 );
4894 shiftCount = 0x402F - aExp;
4895 savedASig = aSig0;
4896 aSig0 >>= shiftCount;
4897 z = aSig0;
4898 if ( aSign ) z = - z;
4899 if ( ( z < 0 ) ^ aSign ) {
4900 invalid:
4901 float_raise( float_flag_invalid STATUS_VAR);
4902 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4904 if ( ( aSig0<<shiftCount ) != savedASig ) {
4905 STATUS(float_exception_flags) |= float_flag_inexact;
4907 return z;
4911 /*----------------------------------------------------------------------------
4912 | Returns the result of converting the quadruple-precision floating-point
4913 | value `a' to the 64-bit two's complement integer format. The conversion
4914 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4915 | Arithmetic---which means in particular that the conversion is rounded
4916 | according to the current rounding mode. If `a' is a NaN, the largest
4917 | positive integer is returned. Otherwise, if the conversion overflows, the
4918 | largest integer with the same sign as `a' is returned.
4919 *----------------------------------------------------------------------------*/
4921 int64 float128_to_int64( float128 a STATUS_PARAM )
4923 flag aSign;
4924 int32 aExp, shiftCount;
4925 uint64_t aSig0, aSig1;
4927 aSig1 = extractFloat128Frac1( a );
4928 aSig0 = extractFloat128Frac0( a );
4929 aExp = extractFloat128Exp( a );
4930 aSign = extractFloat128Sign( a );
4931 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4932 shiftCount = 0x402F - aExp;
4933 if ( shiftCount <= 0 ) {
4934 if ( 0x403E < aExp ) {
4935 float_raise( float_flag_invalid STATUS_VAR);
4936 if ( ! aSign
4937 || ( ( aExp == 0x7FFF )
4938 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4941 return LIT64( 0x7FFFFFFFFFFFFFFF );
4943 return (int64_t) LIT64( 0x8000000000000000 );
4945 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4947 else {
4948 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4950 return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
4954 /*----------------------------------------------------------------------------
4955 | Returns the result of converting the quadruple-precision floating-point
4956 | value `a' to the 64-bit two's complement integer format. The conversion
4957 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4958 | Arithmetic, except that the conversion is always rounded toward zero.
4959 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
4960 | the conversion overflows, the largest integer with the same sign as `a' is
4961 | returned.
4962 *----------------------------------------------------------------------------*/
4964 int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
4966 flag aSign;
4967 int32 aExp, shiftCount;
4968 uint64_t aSig0, aSig1;
4969 int64 z;
4971 aSig1 = extractFloat128Frac1( a );
4972 aSig0 = extractFloat128Frac0( a );
4973 aExp = extractFloat128Exp( a );
4974 aSign = extractFloat128Sign( a );
4975 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4976 shiftCount = aExp - 0x402F;
4977 if ( 0 < shiftCount ) {
4978 if ( 0x403E <= aExp ) {
4979 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4980 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
4981 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4982 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
4984 else {
4985 float_raise( float_flag_invalid STATUS_VAR);
4986 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4987 return LIT64( 0x7FFFFFFFFFFFFFFF );
4990 return (int64_t) LIT64( 0x8000000000000000 );
4992 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4993 if ( (uint64_t) ( aSig1<<shiftCount ) ) {
4994 STATUS(float_exception_flags) |= float_flag_inexact;
4997 else {
4998 if ( aExp < 0x3FFF ) {
4999 if ( aExp | aSig0 | aSig1 ) {
5000 STATUS(float_exception_flags) |= float_flag_inexact;
5002 return 0;
5004 z = aSig0>>( - shiftCount );
5005 if ( aSig1
5006 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
5007 STATUS(float_exception_flags) |= float_flag_inexact;
5010 if ( aSign ) z = - z;
5011 return z;
5015 /*----------------------------------------------------------------------------
5016 | Returns the result of converting the quadruple-precision floating-point
5017 | value `a' to the single-precision floating-point format. The conversion
5018 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5019 | Arithmetic.
5020 *----------------------------------------------------------------------------*/
5022 float32 float128_to_float32( float128 a STATUS_PARAM )
5024 flag aSign;
5025 int32 aExp;
5026 uint64_t aSig0, aSig1;
5027 uint32_t zSig;
5029 aSig1 = extractFloat128Frac1( a );
5030 aSig0 = extractFloat128Frac0( a );
5031 aExp = extractFloat128Exp( a );
5032 aSign = extractFloat128Sign( a );
5033 if ( aExp == 0x7FFF ) {
5034 if ( aSig0 | aSig1 ) {
5035 return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5037 return packFloat32( aSign, 0xFF, 0 );
5039 aSig0 |= ( aSig1 != 0 );
5040 shift64RightJamming( aSig0, 18, &aSig0 );
5041 zSig = aSig0;
5042 if ( aExp || zSig ) {
5043 zSig |= 0x40000000;
5044 aExp -= 0x3F81;
5046 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
5050 /*----------------------------------------------------------------------------
5051 | Returns the result of converting the quadruple-precision floating-point
5052 | value `a' to the double-precision floating-point format. The conversion
5053 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5054 | Arithmetic.
5055 *----------------------------------------------------------------------------*/
5057 float64 float128_to_float64( float128 a STATUS_PARAM )
5059 flag aSign;
5060 int32 aExp;
5061 uint64_t aSig0, aSig1;
5063 aSig1 = extractFloat128Frac1( a );
5064 aSig0 = extractFloat128Frac0( a );
5065 aExp = extractFloat128Exp( a );
5066 aSign = extractFloat128Sign( a );
5067 if ( aExp == 0x7FFF ) {
5068 if ( aSig0 | aSig1 ) {
5069 return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5071 return packFloat64( aSign, 0x7FF, 0 );
5073 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5074 aSig0 |= ( aSig1 != 0 );
5075 if ( aExp || aSig0 ) {
5076 aSig0 |= LIT64( 0x4000000000000000 );
5077 aExp -= 0x3C01;
5079 return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
5083 #ifdef FLOATX80
5085 /*----------------------------------------------------------------------------
5086 | Returns the result of converting the quadruple-precision floating-point
5087 | value `a' to the extended double-precision floating-point format. The
5088 | conversion is performed according to the IEC/IEEE Standard for Binary
5089 | Floating-Point Arithmetic.
5090 *----------------------------------------------------------------------------*/
5092 floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
5094 flag aSign;
5095 int32 aExp;
5096 uint64_t aSig0, aSig1;
5098 aSig1 = extractFloat128Frac1( a );
5099 aSig0 = extractFloat128Frac0( a );
5100 aExp = extractFloat128Exp( a );
5101 aSign = extractFloat128Sign( a );
5102 if ( aExp == 0x7FFF ) {
5103 if ( aSig0 | aSig1 ) {
5104 return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5106 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5108 if ( aExp == 0 ) {
5109 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5110 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5112 else {
5113 aSig0 |= LIT64( 0x0001000000000000 );
5115 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
5116 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
5120 #endif
5122 /*----------------------------------------------------------------------------
5123 | Rounds the quadruple-precision floating-point value `a' to an integer, and
5124 | returns the result as a quadruple-precision floating-point value. The
5125 | operation is performed according to the IEC/IEEE Standard for Binary
5126 | Floating-Point Arithmetic.
5127 *----------------------------------------------------------------------------*/
5129 float128 float128_round_to_int( float128 a STATUS_PARAM )
5131 flag aSign;
5132 int32 aExp;
5133 uint64_t lastBitMask, roundBitsMask;
5134 int8 roundingMode;
5135 float128 z;
5137 aExp = extractFloat128Exp( a );
5138 if ( 0x402F <= aExp ) {
5139 if ( 0x406F <= aExp ) {
5140 if ( ( aExp == 0x7FFF )
5141 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
5143 return propagateFloat128NaN( a, a STATUS_VAR );
5145 return a;
5147 lastBitMask = 1;
5148 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
5149 roundBitsMask = lastBitMask - 1;
5150 z = a;
5151 roundingMode = STATUS(float_rounding_mode);
5152 if ( roundingMode == float_round_nearest_even ) {
5153 if ( lastBitMask ) {
5154 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
5155 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
5157 else {
5158 if ( (int64_t) z.low < 0 ) {
5159 ++z.high;
5160 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
5164 else if ( roundingMode != float_round_to_zero ) {
5165 if ( extractFloat128Sign( z )
5166 ^ ( roundingMode == float_round_up ) ) {
5167 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
5170 z.low &= ~ roundBitsMask;
5172 else {
5173 if ( aExp < 0x3FFF ) {
5174 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
5175 STATUS(float_exception_flags) |= float_flag_inexact;
5176 aSign = extractFloat128Sign( a );
5177 switch ( STATUS(float_rounding_mode) ) {
5178 case float_round_nearest_even:
5179 if ( ( aExp == 0x3FFE )
5180 && ( extractFloat128Frac0( a )
5181 | extractFloat128Frac1( a ) )
5183 return packFloat128( aSign, 0x3FFF, 0, 0 );
5185 break;
5186 case float_round_down:
5187 return
5188 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
5189 : packFloat128( 0, 0, 0, 0 );
5190 case float_round_up:
5191 return
5192 aSign ? packFloat128( 1, 0, 0, 0 )
5193 : packFloat128( 0, 0x3FFF, 0, 0 );
5195 return packFloat128( aSign, 0, 0, 0 );
5197 lastBitMask = 1;
5198 lastBitMask <<= 0x402F - aExp;
5199 roundBitsMask = lastBitMask - 1;
5200 z.low = 0;
5201 z.high = a.high;
5202 roundingMode = STATUS(float_rounding_mode);
5203 if ( roundingMode == float_round_nearest_even ) {
5204 z.high += lastBitMask>>1;
5205 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
5206 z.high &= ~ lastBitMask;
5209 else if ( roundingMode != float_round_to_zero ) {
5210 if ( extractFloat128Sign( z )
5211 ^ ( roundingMode == float_round_up ) ) {
5212 z.high |= ( a.low != 0 );
5213 z.high += roundBitsMask;
5216 z.high &= ~ roundBitsMask;
5218 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
5219 STATUS(float_exception_flags) |= float_flag_inexact;
5221 return z;
5225 /*----------------------------------------------------------------------------
5226 | Returns the result of adding the absolute values of the quadruple-precision
5227 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
5228 | before being returned. `zSign' is ignored if the result is a NaN.
5229 | The addition is performed according to the IEC/IEEE Standard for Binary
5230 | Floating-Point Arithmetic.
5231 *----------------------------------------------------------------------------*/
5233 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5235 int32 aExp, bExp, zExp;
5236 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5237 int32 expDiff;
5239 aSig1 = extractFloat128Frac1( a );
5240 aSig0 = extractFloat128Frac0( a );
5241 aExp = extractFloat128Exp( a );
5242 bSig1 = extractFloat128Frac1( b );
5243 bSig0 = extractFloat128Frac0( b );
5244 bExp = extractFloat128Exp( b );
5245 expDiff = aExp - bExp;
5246 if ( 0 < expDiff ) {
5247 if ( aExp == 0x7FFF ) {
5248 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5249 return a;
5251 if ( bExp == 0 ) {
5252 --expDiff;
5254 else {
5255 bSig0 |= LIT64( 0x0001000000000000 );
5257 shift128ExtraRightJamming(
5258 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
5259 zExp = aExp;
5261 else if ( expDiff < 0 ) {
5262 if ( bExp == 0x7FFF ) {
5263 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5264 return packFloat128( zSign, 0x7FFF, 0, 0 );
5266 if ( aExp == 0 ) {
5267 ++expDiff;
5269 else {
5270 aSig0 |= LIT64( 0x0001000000000000 );
5272 shift128ExtraRightJamming(
5273 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
5274 zExp = bExp;
5276 else {
5277 if ( aExp == 0x7FFF ) {
5278 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5279 return propagateFloat128NaN( a, b STATUS_VAR );
5281 return a;
5283 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5284 if ( aExp == 0 ) {
5285 if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
5286 return packFloat128( zSign, 0, zSig0, zSig1 );
5288 zSig2 = 0;
5289 zSig0 |= LIT64( 0x0002000000000000 );
5290 zExp = aExp;
5291 goto shiftRight1;
5293 aSig0 |= LIT64( 0x0001000000000000 );
5294 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5295 --zExp;
5296 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
5297 ++zExp;
5298 shiftRight1:
5299 shift128ExtraRightJamming(
5300 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5301 roundAndPack:
5302 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5306 /*----------------------------------------------------------------------------
5307 | Returns the result of subtracting the absolute values of the quadruple-
5308 | precision floating-point values `a' and `b'. If `zSign' is 1, the
5309 | difference is negated before being returned. `zSign' is ignored if the
5310 | result is a NaN. The subtraction is performed according to the IEC/IEEE
5311 | Standard for Binary Floating-Point Arithmetic.
5312 *----------------------------------------------------------------------------*/
5314 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5316 int32 aExp, bExp, zExp;
5317 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
5318 int32 expDiff;
5319 float128 z;
5321 aSig1 = extractFloat128Frac1( a );
5322 aSig0 = extractFloat128Frac0( a );
5323 aExp = extractFloat128Exp( a );
5324 bSig1 = extractFloat128Frac1( b );
5325 bSig0 = extractFloat128Frac0( b );
5326 bExp = extractFloat128Exp( b );
5327 expDiff = aExp - bExp;
5328 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5329 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
5330 if ( 0 < expDiff ) goto aExpBigger;
5331 if ( expDiff < 0 ) goto bExpBigger;
5332 if ( aExp == 0x7FFF ) {
5333 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5334 return propagateFloat128NaN( a, b STATUS_VAR );
5336 float_raise( float_flag_invalid STATUS_VAR);
5337 z.low = float128_default_nan_low;
5338 z.high = float128_default_nan_high;
5339 return z;
5341 if ( aExp == 0 ) {
5342 aExp = 1;
5343 bExp = 1;
5345 if ( bSig0 < aSig0 ) goto aBigger;
5346 if ( aSig0 < bSig0 ) goto bBigger;
5347 if ( bSig1 < aSig1 ) goto aBigger;
5348 if ( aSig1 < bSig1 ) goto bBigger;
5349 return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
5350 bExpBigger:
5351 if ( bExp == 0x7FFF ) {
5352 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5353 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
5355 if ( aExp == 0 ) {
5356 ++expDiff;
5358 else {
5359 aSig0 |= LIT64( 0x4000000000000000 );
5361 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5362 bSig0 |= LIT64( 0x4000000000000000 );
5363 bBigger:
5364 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
5365 zExp = bExp;
5366 zSign ^= 1;
5367 goto normalizeRoundAndPack;
5368 aExpBigger:
5369 if ( aExp == 0x7FFF ) {
5370 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5371 return a;
5373 if ( bExp == 0 ) {
5374 --expDiff;
5376 else {
5377 bSig0 |= LIT64( 0x4000000000000000 );
5379 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
5380 aSig0 |= LIT64( 0x4000000000000000 );
5381 aBigger:
5382 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5383 zExp = aExp;
5384 normalizeRoundAndPack:
5385 --zExp;
5386 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
5390 /*----------------------------------------------------------------------------
5391 | Returns the result of adding the quadruple-precision floating-point values
5392 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
5393 | for Binary Floating-Point Arithmetic.
5394 *----------------------------------------------------------------------------*/
5396 float128 float128_add( float128 a, float128 b STATUS_PARAM )
5398 flag aSign, bSign;
5400 aSign = extractFloat128Sign( a );
5401 bSign = extractFloat128Sign( b );
5402 if ( aSign == bSign ) {
5403 return addFloat128Sigs( a, b, aSign STATUS_VAR );
5405 else {
5406 return subFloat128Sigs( a, b, aSign STATUS_VAR );
5411 /*----------------------------------------------------------------------------
5412 | Returns the result of subtracting the quadruple-precision floating-point
5413 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5414 | Standard for Binary Floating-Point Arithmetic.
5415 *----------------------------------------------------------------------------*/
5417 float128 float128_sub( float128 a, float128 b STATUS_PARAM )
5419 flag aSign, bSign;
5421 aSign = extractFloat128Sign( a );
5422 bSign = extractFloat128Sign( b );
5423 if ( aSign == bSign ) {
5424 return subFloat128Sigs( a, b, aSign STATUS_VAR );
5426 else {
5427 return addFloat128Sigs( a, b, aSign STATUS_VAR );
5432 /*----------------------------------------------------------------------------
5433 | Returns the result of multiplying the quadruple-precision floating-point
5434 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5435 | Standard for Binary Floating-Point Arithmetic.
5436 *----------------------------------------------------------------------------*/
5438 float128 float128_mul( float128 a, float128 b STATUS_PARAM )
5440 flag aSign, bSign, zSign;
5441 int32 aExp, bExp, zExp;
5442 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
5443 float128 z;
5445 aSig1 = extractFloat128Frac1( a );
5446 aSig0 = extractFloat128Frac0( a );
5447 aExp = extractFloat128Exp( a );
5448 aSign = extractFloat128Sign( a );
5449 bSig1 = extractFloat128Frac1( b );
5450 bSig0 = extractFloat128Frac0( b );
5451 bExp = extractFloat128Exp( b );
5452 bSign = extractFloat128Sign( b );
5453 zSign = aSign ^ bSign;
5454 if ( aExp == 0x7FFF ) {
5455 if ( ( aSig0 | aSig1 )
5456 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5457 return propagateFloat128NaN( a, b STATUS_VAR );
5459 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
5460 return packFloat128( zSign, 0x7FFF, 0, 0 );
5462 if ( bExp == 0x7FFF ) {
5463 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5464 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5465 invalid:
5466 float_raise( float_flag_invalid STATUS_VAR);
5467 z.low = float128_default_nan_low;
5468 z.high = float128_default_nan_high;
5469 return z;
5471 return packFloat128( zSign, 0x7FFF, 0, 0 );
5473 if ( aExp == 0 ) {
5474 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5475 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5477 if ( bExp == 0 ) {
5478 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5479 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5481 zExp = aExp + bExp - 0x4000;
5482 aSig0 |= LIT64( 0x0001000000000000 );
5483 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5484 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5485 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5486 zSig2 |= ( zSig3 != 0 );
5487 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5488 shift128ExtraRightJamming(
5489 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5490 ++zExp;
5492 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5496 /*----------------------------------------------------------------------------
5497 | Returns the result of dividing the quadruple-precision floating-point value
5498 | `a' by the corresponding value `b'. The operation is performed according to
5499 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5500 *----------------------------------------------------------------------------*/
5502 float128 float128_div( float128 a, float128 b STATUS_PARAM )
5504 flag aSign, bSign, zSign;
5505 int32 aExp, bExp, zExp;
5506 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5507 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5508 float128 z;
5510 aSig1 = extractFloat128Frac1( a );
5511 aSig0 = extractFloat128Frac0( a );
5512 aExp = extractFloat128Exp( a );
5513 aSign = extractFloat128Sign( a );
5514 bSig1 = extractFloat128Frac1( b );
5515 bSig0 = extractFloat128Frac0( b );
5516 bExp = extractFloat128Exp( b );
5517 bSign = extractFloat128Sign( b );
5518 zSign = aSign ^ bSign;
5519 if ( aExp == 0x7FFF ) {
5520 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5521 if ( bExp == 0x7FFF ) {
5522 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5523 goto invalid;
5525 return packFloat128( zSign, 0x7FFF, 0, 0 );
5527 if ( bExp == 0x7FFF ) {
5528 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5529 return packFloat128( zSign, 0, 0, 0 );
5531 if ( bExp == 0 ) {
5532 if ( ( bSig0 | bSig1 ) == 0 ) {
5533 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5534 invalid:
5535 float_raise( float_flag_invalid STATUS_VAR);
5536 z.low = float128_default_nan_low;
5537 z.high = float128_default_nan_high;
5538 return z;
5540 float_raise( float_flag_divbyzero STATUS_VAR);
5541 return packFloat128( zSign, 0x7FFF, 0, 0 );
5543 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5545 if ( aExp == 0 ) {
5546 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5547 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5549 zExp = aExp - bExp + 0x3FFD;
5550 shortShift128Left(
5551 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5552 shortShift128Left(
5553 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5554 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5555 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5556 ++zExp;
5558 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5559 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5560 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5561 while ( (int64_t) rem0 < 0 ) {
5562 --zSig0;
5563 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5565 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5566 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5567 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5568 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5569 while ( (int64_t) rem1 < 0 ) {
5570 --zSig1;
5571 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5573 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5575 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5576 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5580 /*----------------------------------------------------------------------------
5581 | Returns the remainder of the quadruple-precision floating-point value `a'
5582 | with respect to the corresponding value `b'. The operation is performed
5583 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5584 *----------------------------------------------------------------------------*/
5586 float128 float128_rem( float128 a, float128 b STATUS_PARAM )
5588 flag aSign, zSign;
5589 int32 aExp, bExp, expDiff;
5590 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5591 uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
5592 int64_t sigMean0;
5593 float128 z;
5595 aSig1 = extractFloat128Frac1( a );
5596 aSig0 = extractFloat128Frac0( a );
5597 aExp = extractFloat128Exp( a );
5598 aSign = extractFloat128Sign( a );
5599 bSig1 = extractFloat128Frac1( b );
5600 bSig0 = extractFloat128Frac0( b );
5601 bExp = extractFloat128Exp( b );
5602 if ( aExp == 0x7FFF ) {
5603 if ( ( aSig0 | aSig1 )
5604 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5605 return propagateFloat128NaN( a, b STATUS_VAR );
5607 goto invalid;
5609 if ( bExp == 0x7FFF ) {
5610 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5611 return a;
5613 if ( bExp == 0 ) {
5614 if ( ( bSig0 | bSig1 ) == 0 ) {
5615 invalid:
5616 float_raise( float_flag_invalid STATUS_VAR);
5617 z.low = float128_default_nan_low;
5618 z.high = float128_default_nan_high;
5619 return z;
5621 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5623 if ( aExp == 0 ) {
5624 if ( ( aSig0 | aSig1 ) == 0 ) return a;
5625 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5627 expDiff = aExp - bExp;
5628 if ( expDiff < -1 ) return a;
5629 shortShift128Left(
5630 aSig0 | LIT64( 0x0001000000000000 ),
5631 aSig1,
5632 15 - ( expDiff < 0 ),
5633 &aSig0,
5634 &aSig1
5636 shortShift128Left(
5637 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5638 q = le128( bSig0, bSig1, aSig0, aSig1 );
5639 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5640 expDiff -= 64;
5641 while ( 0 < expDiff ) {
5642 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5643 q = ( 4 < q ) ? q - 4 : 0;
5644 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5645 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5646 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5647 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5648 expDiff -= 61;
5650 if ( -64 < expDiff ) {
5651 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5652 q = ( 4 < q ) ? q - 4 : 0;
5653 q >>= - expDiff;
5654 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5655 expDiff += 52;
5656 if ( expDiff < 0 ) {
5657 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5659 else {
5660 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5662 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5663 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5665 else {
5666 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5667 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5669 do {
5670 alternateASig0 = aSig0;
5671 alternateASig1 = aSig1;
5672 ++q;
5673 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5674 } while ( 0 <= (int64_t) aSig0 );
5675 add128(
5676 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
5677 if ( ( sigMean0 < 0 )
5678 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5679 aSig0 = alternateASig0;
5680 aSig1 = alternateASig1;
5682 zSign = ( (int64_t) aSig0 < 0 );
5683 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5684 return
5685 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
5689 /*----------------------------------------------------------------------------
5690 | Returns the square root of the quadruple-precision floating-point value `a'.
5691 | The operation is performed according to the IEC/IEEE Standard for Binary
5692 | Floating-Point Arithmetic.
5693 *----------------------------------------------------------------------------*/
5695 float128 float128_sqrt( float128 a STATUS_PARAM )
5697 flag aSign;
5698 int32 aExp, zExp;
5699 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5700 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5701 float128 z;
5703 aSig1 = extractFloat128Frac1( a );
5704 aSig0 = extractFloat128Frac0( a );
5705 aExp = extractFloat128Exp( a );
5706 aSign = extractFloat128Sign( a );
5707 if ( aExp == 0x7FFF ) {
5708 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
5709 if ( ! aSign ) return a;
5710 goto invalid;
5712 if ( aSign ) {
5713 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5714 invalid:
5715 float_raise( float_flag_invalid STATUS_VAR);
5716 z.low = float128_default_nan_low;
5717 z.high = float128_default_nan_high;
5718 return z;
5720 if ( aExp == 0 ) {
5721 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5722 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5724 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5725 aSig0 |= LIT64( 0x0001000000000000 );
5726 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5727 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5728 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5729 doubleZSig0 = zSig0<<1;
5730 mul64To128( zSig0, zSig0, &term0, &term1 );
5731 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5732 while ( (int64_t) rem0 < 0 ) {
5733 --zSig0;
5734 doubleZSig0 -= 2;
5735 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5737 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5738 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5739 if ( zSig1 == 0 ) zSig1 = 1;
5740 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5741 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5742 mul64To128( zSig1, zSig1, &term2, &term3 );
5743 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5744 while ( (int64_t) rem1 < 0 ) {
5745 --zSig1;
5746 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5747 term3 |= 1;
5748 term2 |= doubleZSig0;
5749 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5751 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5753 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5754 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5758 /*----------------------------------------------------------------------------
5759 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5760 | the corresponding value `b', and 0 otherwise. The invalid exception is
5761 | raised if either operand is a NaN. Otherwise, the comparison is performed
5762 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5763 *----------------------------------------------------------------------------*/
5765 int float128_eq( float128 a, float128 b STATUS_PARAM )
5768 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5769 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5770 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5771 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5773 float_raise( float_flag_invalid STATUS_VAR);
5774 return 0;
5776 return
5777 ( a.low == b.low )
5778 && ( ( a.high == b.high )
5779 || ( ( a.low == 0 )
5780 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5785 /*----------------------------------------------------------------------------
5786 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5787 | or equal to the corresponding value `b', and 0 otherwise. The invalid
5788 | exception is raised if either operand is a NaN. The comparison is performed
5789 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5790 *----------------------------------------------------------------------------*/
5792 int float128_le( float128 a, float128 b STATUS_PARAM )
5794 flag aSign, bSign;
5796 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5797 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5798 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5799 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5801 float_raise( float_flag_invalid STATUS_VAR);
5802 return 0;
5804 aSign = extractFloat128Sign( a );
5805 bSign = extractFloat128Sign( b );
5806 if ( aSign != bSign ) {
5807 return
5808 aSign
5809 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5810 == 0 );
5812 return
5813 aSign ? le128( b.high, b.low, a.high, a.low )
5814 : le128( a.high, a.low, b.high, b.low );
5818 /*----------------------------------------------------------------------------
5819 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5820 | the corresponding value `b', and 0 otherwise. The invalid exception is
5821 | raised if either operand is a NaN. The comparison is performed according
5822 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5823 *----------------------------------------------------------------------------*/
5825 int float128_lt( float128 a, float128 b STATUS_PARAM )
5827 flag aSign, bSign;
5829 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5830 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5831 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5832 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5834 float_raise( float_flag_invalid STATUS_VAR);
5835 return 0;
5837 aSign = extractFloat128Sign( a );
5838 bSign = extractFloat128Sign( b );
5839 if ( aSign != bSign ) {
5840 return
5841 aSign
5842 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5843 != 0 );
5845 return
5846 aSign ? lt128( b.high, b.low, a.high, a.low )
5847 : lt128( a.high, a.low, b.high, b.low );
5851 /*----------------------------------------------------------------------------
5852 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
5853 | be compared, and 0 otherwise. The invalid exception is raised if either
5854 | operand is a NaN. The comparison is performed according to the IEC/IEEE
5855 | Standard for Binary Floating-Point Arithmetic.
5856 *----------------------------------------------------------------------------*/
5858 int float128_unordered( float128 a, float128 b STATUS_PARAM )
5860 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5861 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5862 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5863 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5865 float_raise( float_flag_invalid STATUS_VAR);
5866 return 1;
5868 return 0;
5871 /*----------------------------------------------------------------------------
5872 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5873 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5874 | exception. The comparison is performed according to the IEC/IEEE Standard
5875 | for Binary Floating-Point Arithmetic.
5876 *----------------------------------------------------------------------------*/
5878 int float128_eq_quiet( float128 a, float128 b STATUS_PARAM )
5881 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5882 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5883 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5884 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5886 if ( float128_is_signaling_nan( a )
5887 || float128_is_signaling_nan( b ) ) {
5888 float_raise( float_flag_invalid STATUS_VAR);
5890 return 0;
5892 return
5893 ( a.low == b.low )
5894 && ( ( a.high == b.high )
5895 || ( ( a.low == 0 )
5896 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5901 /*----------------------------------------------------------------------------
5902 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5903 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5904 | cause an exception. Otherwise, the comparison is performed according to the
5905 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5906 *----------------------------------------------------------------------------*/
5908 int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
5910 flag aSign, bSign;
5912 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5913 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5914 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5915 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5917 if ( float128_is_signaling_nan( a )
5918 || float128_is_signaling_nan( b ) ) {
5919 float_raise( float_flag_invalid STATUS_VAR);
5921 return 0;
5923 aSign = extractFloat128Sign( a );
5924 bSign = extractFloat128Sign( b );
5925 if ( aSign != bSign ) {
5926 return
5927 aSign
5928 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5929 == 0 );
5931 return
5932 aSign ? le128( b.high, b.low, a.high, a.low )
5933 : le128( a.high, a.low, b.high, b.low );
5937 /*----------------------------------------------------------------------------
5938 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5939 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5940 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
5941 | Standard for Binary Floating-Point Arithmetic.
5942 *----------------------------------------------------------------------------*/
5944 int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
5946 flag aSign, bSign;
5948 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5949 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5950 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5951 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5953 if ( float128_is_signaling_nan( a )
5954 || float128_is_signaling_nan( b ) ) {
5955 float_raise( float_flag_invalid STATUS_VAR);
5957 return 0;
5959 aSign = extractFloat128Sign( a );
5960 bSign = extractFloat128Sign( b );
5961 if ( aSign != bSign ) {
5962 return
5963 aSign
5964 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5965 != 0 );
5967 return
5968 aSign ? lt128( b.high, b.low, a.high, a.low )
5969 : lt128( a.high, a.low, b.high, b.low );
5973 /*----------------------------------------------------------------------------
5974 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
5975 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
5976 | comparison is performed according to the IEC/IEEE Standard for Binary
5977 | Floating-Point Arithmetic.
5978 *----------------------------------------------------------------------------*/
5980 int float128_unordered_quiet( float128 a, float128 b STATUS_PARAM )
5982 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5983 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5984 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5985 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5987 if ( float128_is_signaling_nan( a )
5988 || float128_is_signaling_nan( b ) ) {
5989 float_raise( float_flag_invalid STATUS_VAR);
5991 return 1;
5993 return 0;
5996 #endif
5998 /* misc functions */
5999 float32 uint32_to_float32( unsigned int a STATUS_PARAM )
6001 return int64_to_float32(a STATUS_VAR);
6004 float64 uint32_to_float64( unsigned int a STATUS_PARAM )
6006 return int64_to_float64(a STATUS_VAR);
6009 unsigned int float32_to_uint32( float32 a STATUS_PARAM )
6011 int64_t v;
6012 unsigned int res;
6014 v = float32_to_int64(a STATUS_VAR);
6015 if (v < 0) {
6016 res = 0;
6017 float_raise( float_flag_invalid STATUS_VAR);
6018 } else if (v > 0xffffffff) {
6019 res = 0xffffffff;
6020 float_raise( float_flag_invalid STATUS_VAR);
6021 } else {
6022 res = v;
6024 return res;
6027 unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
6029 int64_t v;
6030 unsigned int res;
6032 v = float32_to_int64_round_to_zero(a STATUS_VAR);
6033 if (v < 0) {
6034 res = 0;
6035 float_raise( float_flag_invalid STATUS_VAR);
6036 } else if (v > 0xffffffff) {
6037 res = 0xffffffff;
6038 float_raise( float_flag_invalid STATUS_VAR);
6039 } else {
6040 res = v;
6042 return res;
6045 unsigned int float32_to_uint16_round_to_zero( float32 a STATUS_PARAM )
6047 int64_t v;
6048 unsigned int res;
6050 v = float32_to_int64_round_to_zero(a STATUS_VAR);
6051 if (v < 0) {
6052 res = 0;
6053 float_raise( float_flag_invalid STATUS_VAR);
6054 } else if (v > 0xffff) {
6055 res = 0xffff;
6056 float_raise( float_flag_invalid STATUS_VAR);
6057 } else {
6058 res = v;
6060 return res;
6063 unsigned int float64_to_uint32( float64 a STATUS_PARAM )
6065 int64_t v;
6066 unsigned int res;
6068 v = float64_to_int64(a STATUS_VAR);
6069 if (v < 0) {
6070 res = 0;
6071 float_raise( float_flag_invalid STATUS_VAR);
6072 } else if (v > 0xffffffff) {
6073 res = 0xffffffff;
6074 float_raise( float_flag_invalid STATUS_VAR);
6075 } else {
6076 res = v;
6078 return res;
6081 unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
6083 int64_t v;
6084 unsigned int res;
6086 v = float64_to_int64_round_to_zero(a STATUS_VAR);
6087 if (v < 0) {
6088 res = 0;
6089 float_raise( float_flag_invalid STATUS_VAR);
6090 } else if (v > 0xffffffff) {
6091 res = 0xffffffff;
6092 float_raise( float_flag_invalid STATUS_VAR);
6093 } else {
6094 res = v;
6096 return res;
6099 unsigned int float64_to_uint16_round_to_zero( float64 a STATUS_PARAM )
6101 int64_t v;
6102 unsigned int res;
6104 v = float64_to_int64_round_to_zero(a STATUS_VAR);
6105 if (v < 0) {
6106 res = 0;
6107 float_raise( float_flag_invalid STATUS_VAR);
6108 } else if (v > 0xffff) {
6109 res = 0xffff;
6110 float_raise( float_flag_invalid STATUS_VAR);
6111 } else {
6112 res = v;
6114 return res;
6117 /* FIXME: This looks broken. */
6118 uint64_t float64_to_uint64 (float64 a STATUS_PARAM)
6120 int64_t v;
6122 v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
6123 v += float64_val(a);
6124 v = float64_to_int64(make_float64(v) STATUS_VAR);
6126 return v - INT64_MIN;
6129 uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
6131 int64_t v;
6133 v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
6134 v += float64_val(a);
6135 v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR);
6137 return v - INT64_MIN;
6140 #define COMPARE(s, nan_exp) \
6141 INLINE int float ## s ## _compare_internal( float ## s a, float ## s b, \
6142 int is_quiet STATUS_PARAM ) \
6144 flag aSign, bSign; \
6145 uint ## s ## _t av, bv; \
6146 a = float ## s ## _squash_input_denormal(a STATUS_VAR); \
6147 b = float ## s ## _squash_input_denormal(b STATUS_VAR); \
6149 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \
6150 extractFloat ## s ## Frac( a ) ) || \
6151 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \
6152 extractFloat ## s ## Frac( b ) )) { \
6153 if (!is_quiet || \
6154 float ## s ## _is_signaling_nan( a ) || \
6155 float ## s ## _is_signaling_nan( b ) ) { \
6156 float_raise( float_flag_invalid STATUS_VAR); \
6158 return float_relation_unordered; \
6160 aSign = extractFloat ## s ## Sign( a ); \
6161 bSign = extractFloat ## s ## Sign( b ); \
6162 av = float ## s ## _val(a); \
6163 bv = float ## s ## _val(b); \
6164 if ( aSign != bSign ) { \
6165 if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) { \
6166 /* zero case */ \
6167 return float_relation_equal; \
6168 } else { \
6169 return 1 - (2 * aSign); \
6171 } else { \
6172 if (av == bv) { \
6173 return float_relation_equal; \
6174 } else { \
6175 return 1 - 2 * (aSign ^ ( av < bv )); \
6180 int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM ) \
6182 return float ## s ## _compare_internal(a, b, 0 STATUS_VAR); \
6185 int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM ) \
6187 return float ## s ## _compare_internal(a, b, 1 STATUS_VAR); \
6190 COMPARE(32, 0xff)
6191 COMPARE(64, 0x7ff)
6193 INLINE int floatx80_compare_internal( floatx80 a, floatx80 b,
6194 int is_quiet STATUS_PARAM )
6196 flag aSign, bSign;
6198 if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
6199 ( extractFloatx80Frac( a )<<1 ) ) ||
6200 ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
6201 ( extractFloatx80Frac( b )<<1 ) )) {
6202 if (!is_quiet ||
6203 floatx80_is_signaling_nan( a ) ||
6204 floatx80_is_signaling_nan( b ) ) {
6205 float_raise( float_flag_invalid STATUS_VAR);
6207 return float_relation_unordered;
6209 aSign = extractFloatx80Sign( a );
6210 bSign = extractFloatx80Sign( b );
6211 if ( aSign != bSign ) {
6213 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
6214 ( ( a.low | b.low ) == 0 ) ) {
6215 /* zero case */
6216 return float_relation_equal;
6217 } else {
6218 return 1 - (2 * aSign);
6220 } else {
6221 if (a.low == b.low && a.high == b.high) {
6222 return float_relation_equal;
6223 } else {
6224 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6229 int floatx80_compare( floatx80 a, floatx80 b STATUS_PARAM )
6231 return floatx80_compare_internal(a, b, 0 STATUS_VAR);
6234 int floatx80_compare_quiet( floatx80 a, floatx80 b STATUS_PARAM )
6236 return floatx80_compare_internal(a, b, 1 STATUS_VAR);
6239 INLINE int float128_compare_internal( float128 a, float128 b,
6240 int is_quiet STATUS_PARAM )
6242 flag aSign, bSign;
6244 if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
6245 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
6246 ( ( extractFloat128Exp( b ) == 0x7fff ) &&
6247 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
6248 if (!is_quiet ||
6249 float128_is_signaling_nan( a ) ||
6250 float128_is_signaling_nan( b ) ) {
6251 float_raise( float_flag_invalid STATUS_VAR);
6253 return float_relation_unordered;
6255 aSign = extractFloat128Sign( a );
6256 bSign = extractFloat128Sign( b );
6257 if ( aSign != bSign ) {
6258 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
6259 /* zero case */
6260 return float_relation_equal;
6261 } else {
6262 return 1 - (2 * aSign);
6264 } else {
6265 if (a.low == b.low && a.high == b.high) {
6266 return float_relation_equal;
6267 } else {
6268 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6273 int float128_compare( float128 a, float128 b STATUS_PARAM )
6275 return float128_compare_internal(a, b, 0 STATUS_VAR);
6278 int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
6280 return float128_compare_internal(a, b, 1 STATUS_VAR);
6283 /* min() and max() functions. These can't be implemented as
6284 * 'compare and pick one input' because that would mishandle
6285 * NaNs and +0 vs -0.
6287 #define MINMAX(s, nan_exp) \
6288 INLINE float ## s float ## s ## _minmax(float ## s a, float ## s b, \
6289 int ismin STATUS_PARAM ) \
6291 flag aSign, bSign; \
6292 uint ## s ## _t av, bv; \
6293 a = float ## s ## _squash_input_denormal(a STATUS_VAR); \
6294 b = float ## s ## _squash_input_denormal(b STATUS_VAR); \
6295 if (float ## s ## _is_any_nan(a) || \
6296 float ## s ## _is_any_nan(b)) { \
6297 return propagateFloat ## s ## NaN(a, b STATUS_VAR); \
6299 aSign = extractFloat ## s ## Sign(a); \
6300 bSign = extractFloat ## s ## Sign(b); \
6301 av = float ## s ## _val(a); \
6302 bv = float ## s ## _val(b); \
6303 if (aSign != bSign) { \
6304 if (ismin) { \
6305 return aSign ? a : b; \
6306 } else { \
6307 return aSign ? b : a; \
6309 } else { \
6310 if (ismin) { \
6311 return (aSign ^ (av < bv)) ? a : b; \
6312 } else { \
6313 return (aSign ^ (av < bv)) ? b : a; \
6318 float ## s float ## s ## _min(float ## s a, float ## s b STATUS_PARAM) \
6320 return float ## s ## _minmax(a, b, 1 STATUS_VAR); \
6323 float ## s float ## s ## _max(float ## s a, float ## s b STATUS_PARAM) \
6325 return float ## s ## _minmax(a, b, 0 STATUS_VAR); \
6328 MINMAX(32, 0xff)
6329 MINMAX(64, 0x7ff)
6332 /* Multiply A by 2 raised to the power N. */
6333 float32 float32_scalbn( float32 a, int n STATUS_PARAM )
6335 flag aSign;
6336 int16_t aExp;
6337 uint32_t aSig;
6339 a = float32_squash_input_denormal(a STATUS_VAR);
6340 aSig = extractFloat32Frac( a );
6341 aExp = extractFloat32Exp( a );
6342 aSign = extractFloat32Sign( a );
6344 if ( aExp == 0xFF ) {
6345 if ( aSig ) {
6346 return propagateFloat32NaN( a, a STATUS_VAR );
6348 return a;
6350 if ( aExp != 0 )
6351 aSig |= 0x00800000;
6352 else if ( aSig == 0 )
6353 return a;
6355 if (n > 0x200) {
6356 n = 0x200;
6357 } else if (n < -0x200) {
6358 n = -0x200;
6361 aExp += n - 1;
6362 aSig <<= 7;
6363 return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
6366 float64 float64_scalbn( float64 a, int n STATUS_PARAM )
6368 flag aSign;
6369 int16_t aExp;
6370 uint64_t aSig;
6372 a = float64_squash_input_denormal(a STATUS_VAR);
6373 aSig = extractFloat64Frac( a );
6374 aExp = extractFloat64Exp( a );
6375 aSign = extractFloat64Sign( a );
6377 if ( aExp == 0x7FF ) {
6378 if ( aSig ) {
6379 return propagateFloat64NaN( a, a STATUS_VAR );
6381 return a;
6383 if ( aExp != 0 )
6384 aSig |= LIT64( 0x0010000000000000 );
6385 else if ( aSig == 0 )
6386 return a;
6388 if (n > 0x1000) {
6389 n = 0x1000;
6390 } else if (n < -0x1000) {
6391 n = -0x1000;
6394 aExp += n - 1;
6395 aSig <<= 10;
6396 return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
6399 #ifdef FLOATX80
6400 floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
6402 flag aSign;
6403 int32_t aExp;
6404 uint64_t aSig;
6406 aSig = extractFloatx80Frac( a );
6407 aExp = extractFloatx80Exp( a );
6408 aSign = extractFloatx80Sign( a );
6410 if ( aExp == 0x7FFF ) {
6411 if ( aSig<<1 ) {
6412 return propagateFloatx80NaN( a, a STATUS_VAR );
6414 return a;
6417 if (aExp == 0 && aSig == 0)
6418 return a;
6420 if (n > 0x10000) {
6421 n = 0x10000;
6422 } else if (n < -0x10000) {
6423 n = -0x10000;
6426 aExp += n;
6427 return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
6428 aSign, aExp, aSig, 0 STATUS_VAR );
6430 #endif
6432 #ifdef FLOAT128
6433 float128 float128_scalbn( float128 a, int n STATUS_PARAM )
6435 flag aSign;
6436 int32_t aExp;
6437 uint64_t aSig0, aSig1;
6439 aSig1 = extractFloat128Frac1( a );
6440 aSig0 = extractFloat128Frac0( a );
6441 aExp = extractFloat128Exp( a );
6442 aSign = extractFloat128Sign( a );
6443 if ( aExp == 0x7FFF ) {
6444 if ( aSig0 | aSig1 ) {
6445 return propagateFloat128NaN( a, a STATUS_VAR );
6447 return a;
6449 if ( aExp != 0 )
6450 aSig0 |= LIT64( 0x0001000000000000 );
6451 else if ( aSig0 == 0 && aSig1 == 0 )
6452 return a;
6454 if (n > 0x10000) {
6455 n = 0x10000;
6456 } else if (n < -0x10000) {
6457 n = -0x10000;
6460 aExp += n - 1;
6461 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
6462 STATUS_VAR );
6465 #endif