softfloat: Use uint32 consistently
[qemu.git] / fpu / softfloat.c
blob2b2008502c57f6e1c5d7cccbb73df684143ab76b
1 /*
2 * QEMU float support
4 * Derived from SoftFloat.
5 */
7 /*============================================================================
9 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
10 Package, Release 2b.
12 Written by John R. Hauser. This work was made possible in part by the
13 International Computer Science Institute, located at Suite 600, 1947 Center
14 Street, Berkeley, California 94704. Funding was partially provided by the
15 National Science Foundation under grant MIP-9311980. The original version
16 of this code was written as part of a project to build a fixed-point vector
17 processor in collaboration with the University of California at Berkeley,
18 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
19 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
20 arithmetic/SoftFloat.html'.
22 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
23 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
24 RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
25 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
26 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
27 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
28 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
29 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
31 Derivative works are acceptable, even for commercial purposes, so long as
32 (1) the source code for the derivative work includes prominent notice that
33 the work is derivative, and (2) the source code includes prominent notice with
34 these four paragraphs for those parts of this code that are retained.
36 =============================================================================*/
38 #include "softfloat.h"
40 /*----------------------------------------------------------------------------
41 | Primitive arithmetic functions, including multi-word arithmetic, and
42 | division and square root approximations. (Can be specialized to target if
43 | desired.)
44 *----------------------------------------------------------------------------*/
45 #include "softfloat-macros.h"
47 /*----------------------------------------------------------------------------
48 | Functions and definitions to determine: (1) whether tininess for underflow
49 | is detected before or after rounding by default, (2) what (if anything)
50 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
51 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
52 | are propagated from function inputs to output. These details are target-
53 | specific.
54 *----------------------------------------------------------------------------*/
55 #include "softfloat-specialize.h"
57 void set_float_rounding_mode(int val STATUS_PARAM)
59 STATUS(float_rounding_mode) = val;
62 void set_float_exception_flags(int val STATUS_PARAM)
64 STATUS(float_exception_flags) = val;
67 void set_floatx80_rounding_precision(int val STATUS_PARAM)
69 STATUS(floatx80_rounding_precision) = val;
72 /*----------------------------------------------------------------------------
73 | Returns the fraction bits of the half-precision floating-point value `a'.
74 *----------------------------------------------------------------------------*/
76 INLINE uint32_t extractFloat16Frac(float16 a)
78 return float16_val(a) & 0x3ff;
81 /*----------------------------------------------------------------------------
82 | Returns the exponent bits of the half-precision floating-point value `a'.
83 *----------------------------------------------------------------------------*/
85 INLINE int16 extractFloat16Exp(float16 a)
87 return (float16_val(a) >> 10) & 0x1f;
90 /*----------------------------------------------------------------------------
91 | Returns the sign bit of the single-precision floating-point value `a'.
92 *----------------------------------------------------------------------------*/
94 INLINE flag extractFloat16Sign(float16 a)
96 return float16_val(a)>>15;
99 /*----------------------------------------------------------------------------
100 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
101 | and 7, and returns the properly rounded 32-bit integer corresponding to the
102 | input. If `zSign' is 1, the input is negated before being converted to an
103 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
104 | is simply rounded to an integer, with the inexact exception raised if the
105 | input cannot be represented exactly as an integer. However, if the fixed-
106 | point input is too large, the invalid exception is raised and the largest
107 | positive or negative integer is returned.
108 *----------------------------------------------------------------------------*/
110 static int32 roundAndPackInt32( flag zSign, uint64_t absZ STATUS_PARAM)
112 int8 roundingMode;
113 flag roundNearestEven;
114 int8 roundIncrement, roundBits;
115 int32 z;
117 roundingMode = STATUS(float_rounding_mode);
118 roundNearestEven = ( roundingMode == float_round_nearest_even );
119 roundIncrement = 0x40;
120 if ( ! roundNearestEven ) {
121 if ( roundingMode == float_round_to_zero ) {
122 roundIncrement = 0;
124 else {
125 roundIncrement = 0x7F;
126 if ( zSign ) {
127 if ( roundingMode == float_round_up ) roundIncrement = 0;
129 else {
130 if ( roundingMode == float_round_down ) roundIncrement = 0;
134 roundBits = absZ & 0x7F;
135 absZ = ( absZ + roundIncrement )>>7;
136 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
137 z = absZ;
138 if ( zSign ) z = - z;
139 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
140 float_raise( float_flag_invalid STATUS_VAR);
141 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
143 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
144 return z;
148 /*----------------------------------------------------------------------------
149 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
150 | `absZ1', with binary point between bits 63 and 64 (between the input words),
151 | and returns the properly rounded 64-bit integer corresponding to the input.
152 | If `zSign' is 1, the input is negated before being converted to an integer.
153 | Ordinarily, the fixed-point input is simply rounded to an integer, with
154 | the inexact exception raised if the input cannot be represented exactly as
155 | an integer. However, if the fixed-point input is too large, the invalid
156 | exception is raised and the largest positive or negative integer is
157 | returned.
158 *----------------------------------------------------------------------------*/
160 static int64 roundAndPackInt64( flag zSign, uint64_t absZ0, uint64_t absZ1 STATUS_PARAM)
162 int8 roundingMode;
163 flag roundNearestEven, increment;
164 int64 z;
166 roundingMode = STATUS(float_rounding_mode);
167 roundNearestEven = ( roundingMode == float_round_nearest_even );
168 increment = ( (int64_t) absZ1 < 0 );
169 if ( ! roundNearestEven ) {
170 if ( roundingMode == float_round_to_zero ) {
171 increment = 0;
173 else {
174 if ( zSign ) {
175 increment = ( roundingMode == float_round_down ) && absZ1;
177 else {
178 increment = ( roundingMode == float_round_up ) && absZ1;
182 if ( increment ) {
183 ++absZ0;
184 if ( absZ0 == 0 ) goto overflow;
185 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
187 z = absZ0;
188 if ( zSign ) z = - z;
189 if ( z && ( ( z < 0 ) ^ zSign ) ) {
190 overflow:
191 float_raise( float_flag_invalid STATUS_VAR);
192 return
193 zSign ? (int64_t) LIT64( 0x8000000000000000 )
194 : LIT64( 0x7FFFFFFFFFFFFFFF );
196 if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
197 return z;
201 /*----------------------------------------------------------------------------
202 | Returns the fraction bits of the single-precision floating-point value `a'.
203 *----------------------------------------------------------------------------*/
205 INLINE uint32_t extractFloat32Frac( float32 a )
208 return float32_val(a) & 0x007FFFFF;
212 /*----------------------------------------------------------------------------
213 | Returns the exponent bits of the single-precision floating-point value `a'.
214 *----------------------------------------------------------------------------*/
216 INLINE int16 extractFloat32Exp( float32 a )
219 return ( float32_val(a)>>23 ) & 0xFF;
223 /*----------------------------------------------------------------------------
224 | Returns the sign bit of the single-precision floating-point value `a'.
225 *----------------------------------------------------------------------------*/
227 INLINE flag extractFloat32Sign( float32 a )
230 return float32_val(a)>>31;
234 /*----------------------------------------------------------------------------
235 | If `a' is denormal and we are in flush-to-zero mode then set the
236 | input-denormal exception and return zero. Otherwise just return the value.
237 *----------------------------------------------------------------------------*/
238 static float32 float32_squash_input_denormal(float32 a STATUS_PARAM)
240 if (STATUS(flush_inputs_to_zero)) {
241 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
242 float_raise(float_flag_input_denormal STATUS_VAR);
243 return make_float32(float32_val(a) & 0x80000000);
246 return a;
249 /*----------------------------------------------------------------------------
250 | Normalizes the subnormal single-precision floating-point value represented
251 | by the denormalized significand `aSig'. The normalized exponent and
252 | significand are stored at the locations pointed to by `zExpPtr' and
253 | `zSigPtr', respectively.
254 *----------------------------------------------------------------------------*/
256 static void
257 normalizeFloat32Subnormal( uint32_t aSig, int16 *zExpPtr, uint32_t *zSigPtr )
259 int8 shiftCount;
261 shiftCount = countLeadingZeros32( aSig ) - 8;
262 *zSigPtr = aSig<<shiftCount;
263 *zExpPtr = 1 - shiftCount;
267 /*----------------------------------------------------------------------------
268 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
269 | single-precision floating-point value, returning the result. After being
270 | shifted into the proper positions, the three fields are simply added
271 | together to form the result. This means that any integer portion of `zSig'
272 | will be added into the exponent. Since a properly normalized significand
273 | will have an integer portion equal to 1, the `zExp' input should be 1 less
274 | than the desired result exponent whenever `zSig' is a complete, normalized
275 | significand.
276 *----------------------------------------------------------------------------*/
278 INLINE float32 packFloat32( flag zSign, int16 zExp, uint32_t zSig )
281 return make_float32(
282 ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
286 /*----------------------------------------------------------------------------
287 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
288 | and significand `zSig', and returns the proper single-precision floating-
289 | point value corresponding to the abstract input. Ordinarily, the abstract
290 | value is simply rounded and packed into the single-precision format, with
291 | the inexact exception raised if the abstract input cannot be represented
292 | exactly. However, if the abstract value is too large, the overflow and
293 | inexact exceptions are raised and an infinity or maximal finite value is
294 | returned. If the abstract value is too small, the input value is rounded to
295 | a subnormal number, and the underflow and inexact exceptions are raised if
296 | the abstract input cannot be represented exactly as a subnormal single-
297 | precision floating-point number.
298 | The input significand `zSig' has its binary point between bits 30
299 | and 29, which is 7 bits to the left of the usual location. This shifted
300 | significand must be normalized or smaller. If `zSig' is not normalized,
301 | `zExp' must be 0; in that case, the result returned is a subnormal number,
302 | and it must not require rounding. In the usual case that `zSig' is
303 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
304 | The handling of underflow and overflow follows the IEC/IEEE Standard for
305 | Binary Floating-Point Arithmetic.
306 *----------------------------------------------------------------------------*/
308 static float32 roundAndPackFloat32( flag zSign, int16 zExp, uint32_t zSig STATUS_PARAM)
310 int8 roundingMode;
311 flag roundNearestEven;
312 int8 roundIncrement, roundBits;
313 flag isTiny;
315 roundingMode = STATUS(float_rounding_mode);
316 roundNearestEven = ( roundingMode == float_round_nearest_even );
317 roundIncrement = 0x40;
318 if ( ! roundNearestEven ) {
319 if ( roundingMode == float_round_to_zero ) {
320 roundIncrement = 0;
322 else {
323 roundIncrement = 0x7F;
324 if ( zSign ) {
325 if ( roundingMode == float_round_up ) roundIncrement = 0;
327 else {
328 if ( roundingMode == float_round_down ) roundIncrement = 0;
332 roundBits = zSig & 0x7F;
333 if ( 0xFD <= (uint16_t) zExp ) {
334 if ( ( 0xFD < zExp )
335 || ( ( zExp == 0xFD )
336 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
338 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
339 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
341 if ( zExp < 0 ) {
342 if (STATUS(flush_to_zero)) {
343 float_raise(float_flag_output_denormal STATUS_VAR);
344 return packFloat32(zSign, 0, 0);
346 isTiny =
347 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
348 || ( zExp < -1 )
349 || ( zSig + roundIncrement < 0x80000000 );
350 shift32RightJamming( zSig, - zExp, &zSig );
351 zExp = 0;
352 roundBits = zSig & 0x7F;
353 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
356 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
357 zSig = ( zSig + roundIncrement )>>7;
358 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
359 if ( zSig == 0 ) zExp = 0;
360 return packFloat32( zSign, zExp, zSig );
364 /*----------------------------------------------------------------------------
365 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
366 | and significand `zSig', and returns the proper single-precision floating-
367 | point value corresponding to the abstract input. This routine is just like
368 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
369 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
370 | floating-point exponent.
371 *----------------------------------------------------------------------------*/
373 static float32
374 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, uint32_t zSig STATUS_PARAM)
376 int8 shiftCount;
378 shiftCount = countLeadingZeros32( zSig ) - 1;
379 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
383 /*----------------------------------------------------------------------------
384 | Returns the fraction bits of the double-precision floating-point value `a'.
385 *----------------------------------------------------------------------------*/
387 INLINE uint64_t extractFloat64Frac( float64 a )
390 return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
394 /*----------------------------------------------------------------------------
395 | Returns the exponent bits of the double-precision floating-point value `a'.
396 *----------------------------------------------------------------------------*/
398 INLINE int16 extractFloat64Exp( float64 a )
401 return ( float64_val(a)>>52 ) & 0x7FF;
405 /*----------------------------------------------------------------------------
406 | Returns the sign bit of the double-precision floating-point value `a'.
407 *----------------------------------------------------------------------------*/
409 INLINE flag extractFloat64Sign( float64 a )
412 return float64_val(a)>>63;
416 /*----------------------------------------------------------------------------
417 | If `a' is denormal and we are in flush-to-zero mode then set the
418 | input-denormal exception and return zero. Otherwise just return the value.
419 *----------------------------------------------------------------------------*/
420 static float64 float64_squash_input_denormal(float64 a STATUS_PARAM)
422 if (STATUS(flush_inputs_to_zero)) {
423 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
424 float_raise(float_flag_input_denormal STATUS_VAR);
425 return make_float64(float64_val(a) & (1ULL << 63));
428 return a;
431 /*----------------------------------------------------------------------------
432 | Normalizes the subnormal double-precision floating-point value represented
433 | by the denormalized significand `aSig'. The normalized exponent and
434 | significand are stored at the locations pointed to by `zExpPtr' and
435 | `zSigPtr', respectively.
436 *----------------------------------------------------------------------------*/
438 static void
439 normalizeFloat64Subnormal( uint64_t aSig, int16 *zExpPtr, uint64_t *zSigPtr )
441 int8 shiftCount;
443 shiftCount = countLeadingZeros64( aSig ) - 11;
444 *zSigPtr = aSig<<shiftCount;
445 *zExpPtr = 1 - shiftCount;
449 /*----------------------------------------------------------------------------
450 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
451 | double-precision floating-point value, returning the result. After being
452 | shifted into the proper positions, the three fields are simply added
453 | together to form the result. This means that any integer portion of `zSig'
454 | will be added into the exponent. Since a properly normalized significand
455 | will have an integer portion equal to 1, the `zExp' input should be 1 less
456 | than the desired result exponent whenever `zSig' is a complete, normalized
457 | significand.
458 *----------------------------------------------------------------------------*/
460 INLINE float64 packFloat64( flag zSign, int16 zExp, uint64_t zSig )
463 return make_float64(
464 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
468 /*----------------------------------------------------------------------------
469 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
470 | and significand `zSig', and returns the proper double-precision floating-
471 | point value corresponding to the abstract input. Ordinarily, the abstract
472 | value is simply rounded and packed into the double-precision format, with
473 | the inexact exception raised if the abstract input cannot be represented
474 | exactly. However, if the abstract value is too large, the overflow and
475 | inexact exceptions are raised and an infinity or maximal finite value is
476 | returned. If the abstract value is too small, the input value is rounded
477 | to a subnormal number, and the underflow and inexact exceptions are raised
478 | if the abstract input cannot be represented exactly as a subnormal double-
479 | precision floating-point number.
480 | The input significand `zSig' has its binary point between bits 62
481 | and 61, which is 10 bits to the left of the usual location. This shifted
482 | significand must be normalized or smaller. If `zSig' is not normalized,
483 | `zExp' must be 0; in that case, the result returned is a subnormal number,
484 | and it must not require rounding. In the usual case that `zSig' is
485 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
486 | The handling of underflow and overflow follows the IEC/IEEE Standard for
487 | Binary Floating-Point Arithmetic.
488 *----------------------------------------------------------------------------*/
490 static float64 roundAndPackFloat64( flag zSign, int16 zExp, uint64_t zSig STATUS_PARAM)
492 int8 roundingMode;
493 flag roundNearestEven;
494 int16 roundIncrement, roundBits;
495 flag isTiny;
497 roundingMode = STATUS(float_rounding_mode);
498 roundNearestEven = ( roundingMode == float_round_nearest_even );
499 roundIncrement = 0x200;
500 if ( ! roundNearestEven ) {
501 if ( roundingMode == float_round_to_zero ) {
502 roundIncrement = 0;
504 else {
505 roundIncrement = 0x3FF;
506 if ( zSign ) {
507 if ( roundingMode == float_round_up ) roundIncrement = 0;
509 else {
510 if ( roundingMode == float_round_down ) roundIncrement = 0;
514 roundBits = zSig & 0x3FF;
515 if ( 0x7FD <= (uint16_t) zExp ) {
516 if ( ( 0x7FD < zExp )
517 || ( ( zExp == 0x7FD )
518 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
520 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
521 return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
523 if ( zExp < 0 ) {
524 if (STATUS(flush_to_zero)) {
525 float_raise(float_flag_output_denormal STATUS_VAR);
526 return packFloat64(zSign, 0, 0);
528 isTiny =
529 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
530 || ( zExp < -1 )
531 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
532 shift64RightJamming( zSig, - zExp, &zSig );
533 zExp = 0;
534 roundBits = zSig & 0x3FF;
535 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
538 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
539 zSig = ( zSig + roundIncrement )>>10;
540 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
541 if ( zSig == 0 ) zExp = 0;
542 return packFloat64( zSign, zExp, zSig );
546 /*----------------------------------------------------------------------------
547 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
548 | and significand `zSig', and returns the proper double-precision floating-
549 | point value corresponding to the abstract input. This routine is just like
550 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
551 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
552 | floating-point exponent.
553 *----------------------------------------------------------------------------*/
555 static float64
556 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, uint64_t zSig STATUS_PARAM)
558 int8 shiftCount;
560 shiftCount = countLeadingZeros64( zSig ) - 1;
561 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
565 /*----------------------------------------------------------------------------
566 | Returns the fraction bits of the extended double-precision floating-point
567 | value `a'.
568 *----------------------------------------------------------------------------*/
570 INLINE uint64_t extractFloatx80Frac( floatx80 a )
573 return a.low;
577 /*----------------------------------------------------------------------------
578 | Returns the exponent bits of the extended double-precision floating-point
579 | value `a'.
580 *----------------------------------------------------------------------------*/
582 INLINE int32 extractFloatx80Exp( floatx80 a )
585 return a.high & 0x7FFF;
589 /*----------------------------------------------------------------------------
590 | Returns the sign bit of the extended double-precision floating-point value
591 | `a'.
592 *----------------------------------------------------------------------------*/
594 INLINE flag extractFloatx80Sign( floatx80 a )
597 return a.high>>15;
601 /*----------------------------------------------------------------------------
602 | Normalizes the subnormal extended double-precision floating-point value
603 | represented by the denormalized significand `aSig'. The normalized exponent
604 | and significand are stored at the locations pointed to by `zExpPtr' and
605 | `zSigPtr', respectively.
606 *----------------------------------------------------------------------------*/
608 static void
609 normalizeFloatx80Subnormal( uint64_t aSig, int32 *zExpPtr, uint64_t *zSigPtr )
611 int8 shiftCount;
613 shiftCount = countLeadingZeros64( aSig );
614 *zSigPtr = aSig<<shiftCount;
615 *zExpPtr = 1 - shiftCount;
619 /*----------------------------------------------------------------------------
620 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
621 | extended double-precision floating-point value, returning the result.
622 *----------------------------------------------------------------------------*/
624 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, uint64_t zSig )
626 floatx80 z;
628 z.low = zSig;
629 z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
630 return z;
634 /*----------------------------------------------------------------------------
635 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
636 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
637 | and returns the proper extended double-precision floating-point value
638 | corresponding to the abstract input. Ordinarily, the abstract value is
639 | rounded and packed into the extended double-precision format, with the
640 | inexact exception raised if the abstract input cannot be represented
641 | exactly. However, if the abstract value is too large, the overflow and
642 | inexact exceptions are raised and an infinity or maximal finite value is
643 | returned. If the abstract value is too small, the input value is rounded to
644 | a subnormal number, and the underflow and inexact exceptions are raised if
645 | the abstract input cannot be represented exactly as a subnormal extended
646 | double-precision floating-point number.
647 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
648 | number of bits as single or double precision, respectively. Otherwise, the
649 | result is rounded to the full precision of the extended double-precision
650 | format.
651 | The input significand must be normalized or smaller. If the input
652 | significand is not normalized, `zExp' must be 0; in that case, the result
653 | returned is a subnormal number, and it must not require rounding. The
654 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
655 | Floating-Point Arithmetic.
656 *----------------------------------------------------------------------------*/
658 static floatx80
659 roundAndPackFloatx80(
660 int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
661 STATUS_PARAM)
663 int8 roundingMode;
664 flag roundNearestEven, increment, isTiny;
665 int64 roundIncrement, roundMask, roundBits;
667 roundingMode = STATUS(float_rounding_mode);
668 roundNearestEven = ( roundingMode == float_round_nearest_even );
669 if ( roundingPrecision == 80 ) goto precision80;
670 if ( roundingPrecision == 64 ) {
671 roundIncrement = LIT64( 0x0000000000000400 );
672 roundMask = LIT64( 0x00000000000007FF );
674 else if ( roundingPrecision == 32 ) {
675 roundIncrement = LIT64( 0x0000008000000000 );
676 roundMask = LIT64( 0x000000FFFFFFFFFF );
678 else {
679 goto precision80;
681 zSig0 |= ( zSig1 != 0 );
682 if ( ! roundNearestEven ) {
683 if ( roundingMode == float_round_to_zero ) {
684 roundIncrement = 0;
686 else {
687 roundIncrement = roundMask;
688 if ( zSign ) {
689 if ( roundingMode == float_round_up ) roundIncrement = 0;
691 else {
692 if ( roundingMode == float_round_down ) roundIncrement = 0;
696 roundBits = zSig0 & roundMask;
697 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
698 if ( ( 0x7FFE < zExp )
699 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
701 goto overflow;
703 if ( zExp <= 0 ) {
704 if (STATUS(flush_to_zero)) {
705 float_raise(float_flag_output_denormal STATUS_VAR);
706 return packFloatx80(zSign, 0, 0);
708 isTiny =
709 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
710 || ( zExp < 0 )
711 || ( zSig0 <= zSig0 + roundIncrement );
712 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
713 zExp = 0;
714 roundBits = zSig0 & roundMask;
715 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
716 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
717 zSig0 += roundIncrement;
718 if ( (int64_t) zSig0 < 0 ) zExp = 1;
719 roundIncrement = roundMask + 1;
720 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
721 roundMask |= roundIncrement;
723 zSig0 &= ~ roundMask;
724 return packFloatx80( zSign, zExp, zSig0 );
727 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
728 zSig0 += roundIncrement;
729 if ( zSig0 < roundIncrement ) {
730 ++zExp;
731 zSig0 = LIT64( 0x8000000000000000 );
733 roundIncrement = roundMask + 1;
734 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
735 roundMask |= roundIncrement;
737 zSig0 &= ~ roundMask;
738 if ( zSig0 == 0 ) zExp = 0;
739 return packFloatx80( zSign, zExp, zSig0 );
740 precision80:
741 increment = ( (int64_t) zSig1 < 0 );
742 if ( ! roundNearestEven ) {
743 if ( roundingMode == float_round_to_zero ) {
744 increment = 0;
746 else {
747 if ( zSign ) {
748 increment = ( roundingMode == float_round_down ) && zSig1;
750 else {
751 increment = ( roundingMode == float_round_up ) && zSig1;
755 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
756 if ( ( 0x7FFE < zExp )
757 || ( ( zExp == 0x7FFE )
758 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
759 && increment
762 roundMask = 0;
763 overflow:
764 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
765 if ( ( roundingMode == float_round_to_zero )
766 || ( zSign && ( roundingMode == float_round_up ) )
767 || ( ! zSign && ( roundingMode == float_round_down ) )
769 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
771 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
773 if ( zExp <= 0 ) {
774 isTiny =
775 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
776 || ( zExp < 0 )
777 || ! increment
778 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
779 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
780 zExp = 0;
781 if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
782 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
783 if ( roundNearestEven ) {
784 increment = ( (int64_t) zSig1 < 0 );
786 else {
787 if ( zSign ) {
788 increment = ( roundingMode == float_round_down ) && zSig1;
790 else {
791 increment = ( roundingMode == float_round_up ) && zSig1;
794 if ( increment ) {
795 ++zSig0;
796 zSig0 &=
797 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
798 if ( (int64_t) zSig0 < 0 ) zExp = 1;
800 return packFloatx80( zSign, zExp, zSig0 );
803 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
804 if ( increment ) {
805 ++zSig0;
806 if ( zSig0 == 0 ) {
807 ++zExp;
808 zSig0 = LIT64( 0x8000000000000000 );
810 else {
811 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
814 else {
815 if ( zSig0 == 0 ) zExp = 0;
817 return packFloatx80( zSign, zExp, zSig0 );
821 /*----------------------------------------------------------------------------
822 | Takes an abstract floating-point value having sign `zSign', exponent
823 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
824 | and returns the proper extended double-precision floating-point value
825 | corresponding to the abstract input. This routine is just like
826 | `roundAndPackFloatx80' except that the input significand does not have to be
827 | normalized.
828 *----------------------------------------------------------------------------*/
830 static floatx80
831 normalizeRoundAndPackFloatx80(
832 int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
833 STATUS_PARAM)
835 int8 shiftCount;
837 if ( zSig0 == 0 ) {
838 zSig0 = zSig1;
839 zSig1 = 0;
840 zExp -= 64;
842 shiftCount = countLeadingZeros64( zSig0 );
843 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
844 zExp -= shiftCount;
845 return
846 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
850 /*----------------------------------------------------------------------------
851 | Returns the least-significant 64 fraction bits of the quadruple-precision
852 | floating-point value `a'.
853 *----------------------------------------------------------------------------*/
855 INLINE uint64_t extractFloat128Frac1( float128 a )
858 return a.low;
862 /*----------------------------------------------------------------------------
863 | Returns the most-significant 48 fraction bits of the quadruple-precision
864 | floating-point value `a'.
865 *----------------------------------------------------------------------------*/
867 INLINE uint64_t extractFloat128Frac0( float128 a )
870 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
874 /*----------------------------------------------------------------------------
875 | Returns the exponent bits of the quadruple-precision floating-point value
876 | `a'.
877 *----------------------------------------------------------------------------*/
879 INLINE int32 extractFloat128Exp( float128 a )
882 return ( a.high>>48 ) & 0x7FFF;
886 /*----------------------------------------------------------------------------
887 | Returns the sign bit of the quadruple-precision floating-point value `a'.
888 *----------------------------------------------------------------------------*/
890 INLINE flag extractFloat128Sign( float128 a )
893 return a.high>>63;
897 /*----------------------------------------------------------------------------
898 | Normalizes the subnormal quadruple-precision floating-point value
899 | represented by the denormalized significand formed by the concatenation of
900 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
901 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
902 | significand are stored at the location pointed to by `zSig0Ptr', and the
903 | least significant 64 bits of the normalized significand are stored at the
904 | location pointed to by `zSig1Ptr'.
905 *----------------------------------------------------------------------------*/
907 static void
908 normalizeFloat128Subnormal(
909 uint64_t aSig0,
910 uint64_t aSig1,
911 int32 *zExpPtr,
912 uint64_t *zSig0Ptr,
913 uint64_t *zSig1Ptr
916 int8 shiftCount;
918 if ( aSig0 == 0 ) {
919 shiftCount = countLeadingZeros64( aSig1 ) - 15;
920 if ( shiftCount < 0 ) {
921 *zSig0Ptr = aSig1>>( - shiftCount );
922 *zSig1Ptr = aSig1<<( shiftCount & 63 );
924 else {
925 *zSig0Ptr = aSig1<<shiftCount;
926 *zSig1Ptr = 0;
928 *zExpPtr = - shiftCount - 63;
930 else {
931 shiftCount = countLeadingZeros64( aSig0 ) - 15;
932 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
933 *zExpPtr = 1 - shiftCount;
938 /*----------------------------------------------------------------------------
939 | Packs the sign `zSign', the exponent `zExp', and the significand formed
940 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
941 | floating-point value, returning the result. After being shifted into the
942 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
943 | added together to form the most significant 32 bits of the result. This
944 | means that any integer portion of `zSig0' will be added into the exponent.
945 | Since a properly normalized significand will have an integer portion equal
946 | to 1, the `zExp' input should be 1 less than the desired result exponent
947 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
948 | significand.
949 *----------------------------------------------------------------------------*/
951 INLINE float128
952 packFloat128( flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 )
954 float128 z;
956 z.low = zSig1;
957 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
958 return z;
962 /*----------------------------------------------------------------------------
963 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
964 | and extended significand formed by the concatenation of `zSig0', `zSig1',
965 | and `zSig2', and returns the proper quadruple-precision floating-point value
966 | corresponding to the abstract input. Ordinarily, the abstract value is
967 | simply rounded and packed into the quadruple-precision format, with the
968 | inexact exception raised if the abstract input cannot be represented
969 | exactly. However, if the abstract value is too large, the overflow and
970 | inexact exceptions are raised and an infinity or maximal finite value is
971 | returned. If the abstract value is too small, the input value is rounded to
972 | a subnormal number, and the underflow and inexact exceptions are raised if
973 | the abstract input cannot be represented exactly as a subnormal quadruple-
974 | precision floating-point number.
975 | The input significand must be normalized or smaller. If the input
976 | significand is not normalized, `zExp' must be 0; in that case, the result
977 | returned is a subnormal number, and it must not require rounding. In the
978 | usual case that the input significand is normalized, `zExp' must be 1 less
979 | than the ``true'' floating-point exponent. The handling of underflow and
980 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
981 *----------------------------------------------------------------------------*/
983 static float128
984 roundAndPackFloat128(
985 flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1, uint64_t zSig2 STATUS_PARAM)
987 int8 roundingMode;
988 flag roundNearestEven, increment, isTiny;
990 roundingMode = STATUS(float_rounding_mode);
991 roundNearestEven = ( roundingMode == float_round_nearest_even );
992 increment = ( (int64_t) zSig2 < 0 );
993 if ( ! roundNearestEven ) {
994 if ( roundingMode == float_round_to_zero ) {
995 increment = 0;
997 else {
998 if ( zSign ) {
999 increment = ( roundingMode == float_round_down ) && zSig2;
1001 else {
1002 increment = ( roundingMode == float_round_up ) && zSig2;
1006 if ( 0x7FFD <= (uint32_t) zExp ) {
1007 if ( ( 0x7FFD < zExp )
1008 || ( ( zExp == 0x7FFD )
1009 && eq128(
1010 LIT64( 0x0001FFFFFFFFFFFF ),
1011 LIT64( 0xFFFFFFFFFFFFFFFF ),
1012 zSig0,
1013 zSig1
1015 && increment
1018 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
1019 if ( ( roundingMode == float_round_to_zero )
1020 || ( zSign && ( roundingMode == float_round_up ) )
1021 || ( ! zSign && ( roundingMode == float_round_down ) )
1023 return
1024 packFloat128(
1025 zSign,
1026 0x7FFE,
1027 LIT64( 0x0000FFFFFFFFFFFF ),
1028 LIT64( 0xFFFFFFFFFFFFFFFF )
1031 return packFloat128( zSign, 0x7FFF, 0, 0 );
1033 if ( zExp < 0 ) {
1034 if (STATUS(flush_to_zero)) {
1035 float_raise(float_flag_output_denormal STATUS_VAR);
1036 return packFloat128(zSign, 0, 0, 0);
1038 isTiny =
1039 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
1040 || ( zExp < -1 )
1041 || ! increment
1042 || lt128(
1043 zSig0,
1044 zSig1,
1045 LIT64( 0x0001FFFFFFFFFFFF ),
1046 LIT64( 0xFFFFFFFFFFFFFFFF )
1048 shift128ExtraRightJamming(
1049 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1050 zExp = 0;
1051 if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
1052 if ( roundNearestEven ) {
1053 increment = ( (int64_t) zSig2 < 0 );
1055 else {
1056 if ( zSign ) {
1057 increment = ( roundingMode == float_round_down ) && zSig2;
1059 else {
1060 increment = ( roundingMode == float_round_up ) && zSig2;
1065 if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
1066 if ( increment ) {
1067 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1068 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1070 else {
1071 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1073 return packFloat128( zSign, zExp, zSig0, zSig1 );
1077 /*----------------------------------------------------------------------------
1078 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1079 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1080 | returns the proper quadruple-precision floating-point value corresponding
1081 | to the abstract input. This routine is just like `roundAndPackFloat128'
1082 | except that the input significand has fewer bits and does not have to be
1083 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1084 | point exponent.
1085 *----------------------------------------------------------------------------*/
1087 static float128
1088 normalizeRoundAndPackFloat128(
1089 flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 STATUS_PARAM)
1091 int8 shiftCount;
1092 uint64_t zSig2;
1094 if ( zSig0 == 0 ) {
1095 zSig0 = zSig1;
1096 zSig1 = 0;
1097 zExp -= 64;
1099 shiftCount = countLeadingZeros64( zSig0 ) - 15;
1100 if ( 0 <= shiftCount ) {
1101 zSig2 = 0;
1102 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1104 else {
1105 shift128ExtraRightJamming(
1106 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1108 zExp -= shiftCount;
1109 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1113 /*----------------------------------------------------------------------------
1114 | Returns the result of converting the 32-bit two's complement integer `a'
1115 | to the single-precision floating-point format. The conversion is performed
1116 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1117 *----------------------------------------------------------------------------*/
1119 float32 int32_to_float32( int32 a STATUS_PARAM )
1121 flag zSign;
1123 if ( a == 0 ) return float32_zero;
1124 if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1125 zSign = ( a < 0 );
1126 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1130 /*----------------------------------------------------------------------------
1131 | Returns the result of converting the 32-bit two's complement integer `a'
1132 | to the double-precision floating-point format. The conversion is performed
1133 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1134 *----------------------------------------------------------------------------*/
1136 float64 int32_to_float64( int32 a STATUS_PARAM )
1138 flag zSign;
1139 uint32 absA;
1140 int8 shiftCount;
1141 uint64_t zSig;
1143 if ( a == 0 ) return float64_zero;
1144 zSign = ( a < 0 );
1145 absA = zSign ? - a : a;
1146 shiftCount = countLeadingZeros32( absA ) + 21;
1147 zSig = absA;
1148 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1152 /*----------------------------------------------------------------------------
1153 | Returns the result of converting the 32-bit two's complement integer `a'
1154 | to the extended double-precision floating-point format. The conversion
1155 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1156 | Arithmetic.
1157 *----------------------------------------------------------------------------*/
1159 floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1161 flag zSign;
1162 uint32 absA;
1163 int8 shiftCount;
1164 uint64_t zSig;
1166 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1167 zSign = ( a < 0 );
1168 absA = zSign ? - a : a;
1169 shiftCount = countLeadingZeros32( absA ) + 32;
1170 zSig = absA;
1171 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1175 /*----------------------------------------------------------------------------
1176 | Returns the result of converting the 32-bit two's complement integer `a' to
1177 | the quadruple-precision floating-point format. The conversion is performed
1178 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1179 *----------------------------------------------------------------------------*/
1181 float128 int32_to_float128( int32 a STATUS_PARAM )
1183 flag zSign;
1184 uint32 absA;
1185 int8 shiftCount;
1186 uint64_t zSig0;
1188 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1189 zSign = ( a < 0 );
1190 absA = zSign ? - a : a;
1191 shiftCount = countLeadingZeros32( absA ) + 17;
1192 zSig0 = absA;
1193 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1197 /*----------------------------------------------------------------------------
1198 | Returns the result of converting the 64-bit two's complement integer `a'
1199 | to the single-precision floating-point format. The conversion is performed
1200 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1201 *----------------------------------------------------------------------------*/
1203 float32 int64_to_float32( int64 a STATUS_PARAM )
1205 flag zSign;
1206 uint64 absA;
1207 int8 shiftCount;
1209 if ( a == 0 ) return float32_zero;
1210 zSign = ( a < 0 );
1211 absA = zSign ? - a : a;
1212 shiftCount = countLeadingZeros64( absA ) - 40;
1213 if ( 0 <= shiftCount ) {
1214 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1216 else {
1217 shiftCount += 7;
1218 if ( shiftCount < 0 ) {
1219 shift64RightJamming( absA, - shiftCount, &absA );
1221 else {
1222 absA <<= shiftCount;
1224 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1229 float32 uint64_to_float32( uint64 a STATUS_PARAM )
1231 int8 shiftCount;
1233 if ( a == 0 ) return float32_zero;
1234 shiftCount = countLeadingZeros64( a ) - 40;
1235 if ( 0 <= shiftCount ) {
1236 return packFloat32( 1 > 0, 0x95 - shiftCount, a<<shiftCount );
1238 else {
1239 shiftCount += 7;
1240 if ( shiftCount < 0 ) {
1241 shift64RightJamming( a, - shiftCount, &a );
1243 else {
1244 a <<= shiftCount;
1246 return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount, a STATUS_VAR );
1250 /*----------------------------------------------------------------------------
1251 | Returns the result of converting the 64-bit two's complement integer `a'
1252 | to the double-precision floating-point format. The conversion is performed
1253 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1254 *----------------------------------------------------------------------------*/
1256 float64 int64_to_float64( int64 a STATUS_PARAM )
1258 flag zSign;
1260 if ( a == 0 ) return float64_zero;
1261 if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
1262 return packFloat64( 1, 0x43E, 0 );
1264 zSign = ( a < 0 );
1265 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1269 float64 uint64_to_float64( uint64 a STATUS_PARAM )
1271 if ( a == 0 ) return float64_zero;
1272 return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR );
1276 /*----------------------------------------------------------------------------
1277 | Returns the result of converting the 64-bit two's complement integer `a'
1278 | to the extended double-precision floating-point format. The conversion
1279 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1280 | Arithmetic.
1281 *----------------------------------------------------------------------------*/
1283 floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1285 flag zSign;
1286 uint64 absA;
1287 int8 shiftCount;
1289 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1290 zSign = ( a < 0 );
1291 absA = zSign ? - a : a;
1292 shiftCount = countLeadingZeros64( absA );
1293 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1297 /*----------------------------------------------------------------------------
1298 | Returns the result of converting the 64-bit two's complement integer `a' to
1299 | the quadruple-precision floating-point format. The conversion is performed
1300 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1301 *----------------------------------------------------------------------------*/
1303 float128 int64_to_float128( int64 a STATUS_PARAM )
1305 flag zSign;
1306 uint64 absA;
1307 int8 shiftCount;
1308 int32 zExp;
1309 uint64_t zSig0, zSig1;
1311 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1312 zSign = ( a < 0 );
1313 absA = zSign ? - a : a;
1314 shiftCount = countLeadingZeros64( absA ) + 49;
1315 zExp = 0x406E - shiftCount;
1316 if ( 64 <= shiftCount ) {
1317 zSig1 = 0;
1318 zSig0 = absA;
1319 shiftCount -= 64;
1321 else {
1322 zSig1 = absA;
1323 zSig0 = 0;
1325 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1326 return packFloat128( zSign, zExp, zSig0, zSig1 );
1330 /*----------------------------------------------------------------------------
1331 | Returns the result of converting the single-precision floating-point value
1332 | `a' to the 32-bit two's complement integer format. The conversion is
1333 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1334 | Arithmetic---which means in particular that the conversion is rounded
1335 | according to the current rounding mode. If `a' is a NaN, the largest
1336 | positive integer is returned. Otherwise, if the conversion overflows, the
1337 | largest integer with the same sign as `a' is returned.
1338 *----------------------------------------------------------------------------*/
1340 int32 float32_to_int32( float32 a STATUS_PARAM )
1342 flag aSign;
1343 int16 aExp, shiftCount;
1344 uint32_t aSig;
1345 uint64_t aSig64;
1347 a = float32_squash_input_denormal(a STATUS_VAR);
1348 aSig = extractFloat32Frac( a );
1349 aExp = extractFloat32Exp( a );
1350 aSign = extractFloat32Sign( a );
1351 if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1352 if ( aExp ) aSig |= 0x00800000;
1353 shiftCount = 0xAF - aExp;
1354 aSig64 = aSig;
1355 aSig64 <<= 32;
1356 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1357 return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1361 /*----------------------------------------------------------------------------
1362 | Returns the result of converting the single-precision floating-point value
1363 | `a' to the 32-bit two's complement integer format. The conversion is
1364 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1365 | Arithmetic, except that the conversion is always rounded toward zero.
1366 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1367 | the conversion overflows, the largest integer with the same sign as `a' is
1368 | returned.
1369 *----------------------------------------------------------------------------*/
1371 int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1373 flag aSign;
1374 int16 aExp, shiftCount;
1375 uint32_t aSig;
1376 int32 z;
1377 a = float32_squash_input_denormal(a STATUS_VAR);
1379 aSig = extractFloat32Frac( a );
1380 aExp = extractFloat32Exp( a );
1381 aSign = extractFloat32Sign( a );
1382 shiftCount = aExp - 0x9E;
1383 if ( 0 <= shiftCount ) {
1384 if ( float32_val(a) != 0xCF000000 ) {
1385 float_raise( float_flag_invalid STATUS_VAR);
1386 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1388 return (int32_t) 0x80000000;
1390 else if ( aExp <= 0x7E ) {
1391 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1392 return 0;
1394 aSig = ( aSig | 0x00800000 )<<8;
1395 z = aSig>>( - shiftCount );
1396 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1397 STATUS(float_exception_flags) |= float_flag_inexact;
1399 if ( aSign ) z = - z;
1400 return z;
1404 /*----------------------------------------------------------------------------
1405 | Returns the result of converting the single-precision floating-point value
1406 | `a' to the 16-bit two's complement integer format. The conversion is
1407 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1408 | Arithmetic, except that the conversion is always rounded toward zero.
1409 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1410 | the conversion overflows, the largest integer with the same sign as `a' is
1411 | returned.
1412 *----------------------------------------------------------------------------*/
1414 int16 float32_to_int16_round_to_zero( float32 a STATUS_PARAM )
1416 flag aSign;
1417 int16 aExp, shiftCount;
1418 uint32_t aSig;
1419 int32 z;
1421 aSig = extractFloat32Frac( a );
1422 aExp = extractFloat32Exp( a );
1423 aSign = extractFloat32Sign( a );
1424 shiftCount = aExp - 0x8E;
1425 if ( 0 <= shiftCount ) {
1426 if ( float32_val(a) != 0xC7000000 ) {
1427 float_raise( float_flag_invalid STATUS_VAR);
1428 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1429 return 0x7FFF;
1432 return (int32_t) 0xffff8000;
1434 else if ( aExp <= 0x7E ) {
1435 if ( aExp | aSig ) {
1436 STATUS(float_exception_flags) |= float_flag_inexact;
1438 return 0;
1440 shiftCount -= 0x10;
1441 aSig = ( aSig | 0x00800000 )<<8;
1442 z = aSig>>( - shiftCount );
1443 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1444 STATUS(float_exception_flags) |= float_flag_inexact;
1446 if ( aSign ) {
1447 z = - z;
1449 return z;
1453 /*----------------------------------------------------------------------------
1454 | Returns the result of converting the single-precision floating-point value
1455 | `a' to the 64-bit two's complement integer format. The conversion is
1456 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1457 | Arithmetic---which means in particular that the conversion is rounded
1458 | according to the current rounding mode. If `a' is a NaN, the largest
1459 | positive integer is returned. Otherwise, if the conversion overflows, the
1460 | largest integer with the same sign as `a' is returned.
1461 *----------------------------------------------------------------------------*/
1463 int64 float32_to_int64( float32 a STATUS_PARAM )
1465 flag aSign;
1466 int16 aExp, shiftCount;
1467 uint32_t aSig;
1468 uint64_t aSig64, aSigExtra;
1469 a = float32_squash_input_denormal(a STATUS_VAR);
1471 aSig = extractFloat32Frac( a );
1472 aExp = extractFloat32Exp( a );
1473 aSign = extractFloat32Sign( a );
1474 shiftCount = 0xBE - aExp;
1475 if ( shiftCount < 0 ) {
1476 float_raise( float_flag_invalid STATUS_VAR);
1477 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1478 return LIT64( 0x7FFFFFFFFFFFFFFF );
1480 return (int64_t) LIT64( 0x8000000000000000 );
1482 if ( aExp ) aSig |= 0x00800000;
1483 aSig64 = aSig;
1484 aSig64 <<= 40;
1485 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1486 return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1490 /*----------------------------------------------------------------------------
1491 | Returns the result of converting the single-precision floating-point value
1492 | `a' to the 64-bit two's complement integer format. The conversion is
1493 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1494 | Arithmetic, except that the conversion is always rounded toward zero. If
1495 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1496 | conversion overflows, the largest integer with the same sign as `a' is
1497 | returned.
1498 *----------------------------------------------------------------------------*/
1500 int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1502 flag aSign;
1503 int16 aExp, shiftCount;
1504 uint32_t aSig;
1505 uint64_t aSig64;
1506 int64 z;
1507 a = float32_squash_input_denormal(a STATUS_VAR);
1509 aSig = extractFloat32Frac( a );
1510 aExp = extractFloat32Exp( a );
1511 aSign = extractFloat32Sign( a );
1512 shiftCount = aExp - 0xBE;
1513 if ( 0 <= shiftCount ) {
1514 if ( float32_val(a) != 0xDF000000 ) {
1515 float_raise( float_flag_invalid STATUS_VAR);
1516 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1517 return LIT64( 0x7FFFFFFFFFFFFFFF );
1520 return (int64_t) LIT64( 0x8000000000000000 );
1522 else if ( aExp <= 0x7E ) {
1523 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1524 return 0;
1526 aSig64 = aSig | 0x00800000;
1527 aSig64 <<= 40;
1528 z = aSig64>>( - shiftCount );
1529 if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
1530 STATUS(float_exception_flags) |= float_flag_inexact;
1532 if ( aSign ) z = - z;
1533 return z;
1537 /*----------------------------------------------------------------------------
1538 | Returns the result of converting the single-precision floating-point value
1539 | `a' to the double-precision floating-point format. The conversion is
1540 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1541 | Arithmetic.
1542 *----------------------------------------------------------------------------*/
1544 float64 float32_to_float64( float32 a STATUS_PARAM )
1546 flag aSign;
1547 int16 aExp;
1548 uint32_t aSig;
1549 a = float32_squash_input_denormal(a STATUS_VAR);
1551 aSig = extractFloat32Frac( a );
1552 aExp = extractFloat32Exp( a );
1553 aSign = extractFloat32Sign( a );
1554 if ( aExp == 0xFF ) {
1555 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1556 return packFloat64( aSign, 0x7FF, 0 );
1558 if ( aExp == 0 ) {
1559 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1560 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1561 --aExp;
1563 return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
1567 /*----------------------------------------------------------------------------
1568 | Returns the result of converting the single-precision floating-point value
1569 | `a' to the extended double-precision floating-point format. The conversion
1570 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1571 | Arithmetic.
1572 *----------------------------------------------------------------------------*/
1574 floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1576 flag aSign;
1577 int16 aExp;
1578 uint32_t aSig;
1580 a = float32_squash_input_denormal(a STATUS_VAR);
1581 aSig = extractFloat32Frac( a );
1582 aExp = extractFloat32Exp( a );
1583 aSign = extractFloat32Sign( a );
1584 if ( aExp == 0xFF ) {
1585 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1586 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1588 if ( aExp == 0 ) {
1589 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1590 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1592 aSig |= 0x00800000;
1593 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
1597 /*----------------------------------------------------------------------------
1598 | Returns the result of converting the single-precision floating-point value
1599 | `a' to the double-precision floating-point format. The conversion is
1600 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1601 | Arithmetic.
1602 *----------------------------------------------------------------------------*/
1604 float128 float32_to_float128( float32 a STATUS_PARAM )
1606 flag aSign;
1607 int16 aExp;
1608 uint32_t aSig;
1610 a = float32_squash_input_denormal(a STATUS_VAR);
1611 aSig = extractFloat32Frac( a );
1612 aExp = extractFloat32Exp( a );
1613 aSign = extractFloat32Sign( a );
1614 if ( aExp == 0xFF ) {
1615 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1616 return packFloat128( aSign, 0x7FFF, 0, 0 );
1618 if ( aExp == 0 ) {
1619 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1620 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1621 --aExp;
1623 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
1627 /*----------------------------------------------------------------------------
1628 | Rounds the single-precision floating-point value `a' to an integer, and
1629 | returns the result as a single-precision floating-point value. The
1630 | operation is performed according to the IEC/IEEE Standard for Binary
1631 | Floating-Point Arithmetic.
1632 *----------------------------------------------------------------------------*/
1634 float32 float32_round_to_int( float32 a STATUS_PARAM)
1636 flag aSign;
1637 int16 aExp;
1638 uint32_t lastBitMask, roundBitsMask;
1639 int8 roundingMode;
1640 uint32_t z;
1641 a = float32_squash_input_denormal(a STATUS_VAR);
1643 aExp = extractFloat32Exp( a );
1644 if ( 0x96 <= aExp ) {
1645 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1646 return propagateFloat32NaN( a, a STATUS_VAR );
1648 return a;
1650 if ( aExp <= 0x7E ) {
1651 if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
1652 STATUS(float_exception_flags) |= float_flag_inexact;
1653 aSign = extractFloat32Sign( a );
1654 switch ( STATUS(float_rounding_mode) ) {
1655 case float_round_nearest_even:
1656 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1657 return packFloat32( aSign, 0x7F, 0 );
1659 break;
1660 case float_round_down:
1661 return make_float32(aSign ? 0xBF800000 : 0);
1662 case float_round_up:
1663 return make_float32(aSign ? 0x80000000 : 0x3F800000);
1665 return packFloat32( aSign, 0, 0 );
1667 lastBitMask = 1;
1668 lastBitMask <<= 0x96 - aExp;
1669 roundBitsMask = lastBitMask - 1;
1670 z = float32_val(a);
1671 roundingMode = STATUS(float_rounding_mode);
1672 if ( roundingMode == float_round_nearest_even ) {
1673 z += lastBitMask>>1;
1674 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1676 else if ( roundingMode != float_round_to_zero ) {
1677 if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
1678 z += roundBitsMask;
1681 z &= ~ roundBitsMask;
1682 if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1683 return make_float32(z);
1687 /*----------------------------------------------------------------------------
1688 | Returns the result of adding the absolute values of the single-precision
1689 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1690 | before being returned. `zSign' is ignored if the result is a NaN.
1691 | The addition is performed according to the IEC/IEEE Standard for Binary
1692 | Floating-Point Arithmetic.
1693 *----------------------------------------------------------------------------*/
1695 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1697 int16 aExp, bExp, zExp;
1698 uint32_t aSig, bSig, zSig;
1699 int16 expDiff;
1701 aSig = extractFloat32Frac( a );
1702 aExp = extractFloat32Exp( a );
1703 bSig = extractFloat32Frac( b );
1704 bExp = extractFloat32Exp( b );
1705 expDiff = aExp - bExp;
1706 aSig <<= 6;
1707 bSig <<= 6;
1708 if ( 0 < expDiff ) {
1709 if ( aExp == 0xFF ) {
1710 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1711 return a;
1713 if ( bExp == 0 ) {
1714 --expDiff;
1716 else {
1717 bSig |= 0x20000000;
1719 shift32RightJamming( bSig, expDiff, &bSig );
1720 zExp = aExp;
1722 else if ( expDiff < 0 ) {
1723 if ( bExp == 0xFF ) {
1724 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1725 return packFloat32( zSign, 0xFF, 0 );
1727 if ( aExp == 0 ) {
1728 ++expDiff;
1730 else {
1731 aSig |= 0x20000000;
1733 shift32RightJamming( aSig, - expDiff, &aSig );
1734 zExp = bExp;
1736 else {
1737 if ( aExp == 0xFF ) {
1738 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1739 return a;
1741 if ( aExp == 0 ) {
1742 if (STATUS(flush_to_zero)) {
1743 if (aSig | bSig) {
1744 float_raise(float_flag_output_denormal STATUS_VAR);
1746 return packFloat32(zSign, 0, 0);
1748 return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1750 zSig = 0x40000000 + aSig + bSig;
1751 zExp = aExp;
1752 goto roundAndPack;
1754 aSig |= 0x20000000;
1755 zSig = ( aSig + bSig )<<1;
1756 --zExp;
1757 if ( (int32_t) zSig < 0 ) {
1758 zSig = aSig + bSig;
1759 ++zExp;
1761 roundAndPack:
1762 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1766 /*----------------------------------------------------------------------------
1767 | Returns the result of subtracting the absolute values of the single-
1768 | precision floating-point values `a' and `b'. If `zSign' is 1, the
1769 | difference is negated before being returned. `zSign' is ignored if the
1770 | result is a NaN. The subtraction is performed according to the IEC/IEEE
1771 | Standard for Binary Floating-Point Arithmetic.
1772 *----------------------------------------------------------------------------*/
1774 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1776 int16 aExp, bExp, zExp;
1777 uint32_t aSig, bSig, zSig;
1778 int16 expDiff;
1780 aSig = extractFloat32Frac( a );
1781 aExp = extractFloat32Exp( a );
1782 bSig = extractFloat32Frac( b );
1783 bExp = extractFloat32Exp( b );
1784 expDiff = aExp - bExp;
1785 aSig <<= 7;
1786 bSig <<= 7;
1787 if ( 0 < expDiff ) goto aExpBigger;
1788 if ( expDiff < 0 ) goto bExpBigger;
1789 if ( aExp == 0xFF ) {
1790 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1791 float_raise( float_flag_invalid STATUS_VAR);
1792 return float32_default_nan;
1794 if ( aExp == 0 ) {
1795 aExp = 1;
1796 bExp = 1;
1798 if ( bSig < aSig ) goto aBigger;
1799 if ( aSig < bSig ) goto bBigger;
1800 return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1801 bExpBigger:
1802 if ( bExp == 0xFF ) {
1803 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1804 return packFloat32( zSign ^ 1, 0xFF, 0 );
1806 if ( aExp == 0 ) {
1807 ++expDiff;
1809 else {
1810 aSig |= 0x40000000;
1812 shift32RightJamming( aSig, - expDiff, &aSig );
1813 bSig |= 0x40000000;
1814 bBigger:
1815 zSig = bSig - aSig;
1816 zExp = bExp;
1817 zSign ^= 1;
1818 goto normalizeRoundAndPack;
1819 aExpBigger:
1820 if ( aExp == 0xFF ) {
1821 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1822 return a;
1824 if ( bExp == 0 ) {
1825 --expDiff;
1827 else {
1828 bSig |= 0x40000000;
1830 shift32RightJamming( bSig, expDiff, &bSig );
1831 aSig |= 0x40000000;
1832 aBigger:
1833 zSig = aSig - bSig;
1834 zExp = aExp;
1835 normalizeRoundAndPack:
1836 --zExp;
1837 return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1841 /*----------------------------------------------------------------------------
1842 | Returns the result of adding the single-precision floating-point values `a'
1843 | and `b'. The operation is performed according to the IEC/IEEE Standard for
1844 | Binary Floating-Point Arithmetic.
1845 *----------------------------------------------------------------------------*/
1847 float32 float32_add( float32 a, float32 b STATUS_PARAM )
1849 flag aSign, bSign;
1850 a = float32_squash_input_denormal(a STATUS_VAR);
1851 b = float32_squash_input_denormal(b STATUS_VAR);
1853 aSign = extractFloat32Sign( a );
1854 bSign = extractFloat32Sign( b );
1855 if ( aSign == bSign ) {
1856 return addFloat32Sigs( a, b, aSign STATUS_VAR);
1858 else {
1859 return subFloat32Sigs( a, b, aSign STATUS_VAR );
1864 /*----------------------------------------------------------------------------
1865 | Returns the result of subtracting the single-precision floating-point values
1866 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1867 | for Binary Floating-Point Arithmetic.
1868 *----------------------------------------------------------------------------*/
1870 float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1872 flag aSign, bSign;
1873 a = float32_squash_input_denormal(a STATUS_VAR);
1874 b = float32_squash_input_denormal(b STATUS_VAR);
1876 aSign = extractFloat32Sign( a );
1877 bSign = extractFloat32Sign( b );
1878 if ( aSign == bSign ) {
1879 return subFloat32Sigs( a, b, aSign STATUS_VAR );
1881 else {
1882 return addFloat32Sigs( a, b, aSign STATUS_VAR );
1887 /*----------------------------------------------------------------------------
1888 | Returns the result of multiplying the single-precision floating-point values
1889 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1890 | for Binary Floating-Point Arithmetic.
1891 *----------------------------------------------------------------------------*/
1893 float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1895 flag aSign, bSign, zSign;
1896 int16 aExp, bExp, zExp;
1897 uint32_t aSig, bSig;
1898 uint64_t zSig64;
1899 uint32_t zSig;
1901 a = float32_squash_input_denormal(a STATUS_VAR);
1902 b = float32_squash_input_denormal(b STATUS_VAR);
1904 aSig = extractFloat32Frac( a );
1905 aExp = extractFloat32Exp( a );
1906 aSign = extractFloat32Sign( a );
1907 bSig = extractFloat32Frac( b );
1908 bExp = extractFloat32Exp( b );
1909 bSign = extractFloat32Sign( b );
1910 zSign = aSign ^ bSign;
1911 if ( aExp == 0xFF ) {
1912 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1913 return propagateFloat32NaN( a, b STATUS_VAR );
1915 if ( ( bExp | bSig ) == 0 ) {
1916 float_raise( float_flag_invalid STATUS_VAR);
1917 return float32_default_nan;
1919 return packFloat32( zSign, 0xFF, 0 );
1921 if ( bExp == 0xFF ) {
1922 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1923 if ( ( aExp | aSig ) == 0 ) {
1924 float_raise( float_flag_invalid STATUS_VAR);
1925 return float32_default_nan;
1927 return packFloat32( zSign, 0xFF, 0 );
1929 if ( aExp == 0 ) {
1930 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1931 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1933 if ( bExp == 0 ) {
1934 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1935 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1937 zExp = aExp + bExp - 0x7F;
1938 aSig = ( aSig | 0x00800000 )<<7;
1939 bSig = ( bSig | 0x00800000 )<<8;
1940 shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
1941 zSig = zSig64;
1942 if ( 0 <= (int32_t) ( zSig<<1 ) ) {
1943 zSig <<= 1;
1944 --zExp;
1946 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1950 /*----------------------------------------------------------------------------
1951 | Returns the result of dividing the single-precision floating-point value `a'
1952 | by the corresponding value `b'. The operation is performed according to the
1953 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1954 *----------------------------------------------------------------------------*/
1956 float32 float32_div( float32 a, float32 b STATUS_PARAM )
1958 flag aSign, bSign, zSign;
1959 int16 aExp, bExp, zExp;
1960 uint32_t aSig, bSig, zSig;
1961 a = float32_squash_input_denormal(a STATUS_VAR);
1962 b = float32_squash_input_denormal(b STATUS_VAR);
1964 aSig = extractFloat32Frac( a );
1965 aExp = extractFloat32Exp( a );
1966 aSign = extractFloat32Sign( a );
1967 bSig = extractFloat32Frac( b );
1968 bExp = extractFloat32Exp( b );
1969 bSign = extractFloat32Sign( b );
1970 zSign = aSign ^ bSign;
1971 if ( aExp == 0xFF ) {
1972 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1973 if ( bExp == 0xFF ) {
1974 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1975 float_raise( float_flag_invalid STATUS_VAR);
1976 return float32_default_nan;
1978 return packFloat32( zSign, 0xFF, 0 );
1980 if ( bExp == 0xFF ) {
1981 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1982 return packFloat32( zSign, 0, 0 );
1984 if ( bExp == 0 ) {
1985 if ( bSig == 0 ) {
1986 if ( ( aExp | aSig ) == 0 ) {
1987 float_raise( float_flag_invalid STATUS_VAR);
1988 return float32_default_nan;
1990 float_raise( float_flag_divbyzero STATUS_VAR);
1991 return packFloat32( zSign, 0xFF, 0 );
1993 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1995 if ( aExp == 0 ) {
1996 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1997 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1999 zExp = aExp - bExp + 0x7D;
2000 aSig = ( aSig | 0x00800000 )<<7;
2001 bSig = ( bSig | 0x00800000 )<<8;
2002 if ( bSig <= ( aSig + aSig ) ) {
2003 aSig >>= 1;
2004 ++zExp;
2006 zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
2007 if ( ( zSig & 0x3F ) == 0 ) {
2008 zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
2010 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2014 /*----------------------------------------------------------------------------
2015 | Returns the remainder of the single-precision floating-point value `a'
2016 | with respect to the corresponding value `b'. The operation is performed
2017 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2018 *----------------------------------------------------------------------------*/
2020 float32 float32_rem( float32 a, float32 b STATUS_PARAM )
2022 flag aSign, zSign;
2023 int16 aExp, bExp, expDiff;
2024 uint32_t aSig, bSig;
2025 uint32_t q;
2026 uint64_t aSig64, bSig64, q64;
2027 uint32_t alternateASig;
2028 int32_t sigMean;
2029 a = float32_squash_input_denormal(a STATUS_VAR);
2030 b = float32_squash_input_denormal(b STATUS_VAR);
2032 aSig = extractFloat32Frac( a );
2033 aExp = extractFloat32Exp( a );
2034 aSign = extractFloat32Sign( a );
2035 bSig = extractFloat32Frac( b );
2036 bExp = extractFloat32Exp( b );
2037 if ( aExp == 0xFF ) {
2038 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2039 return propagateFloat32NaN( a, b STATUS_VAR );
2041 float_raise( float_flag_invalid STATUS_VAR);
2042 return float32_default_nan;
2044 if ( bExp == 0xFF ) {
2045 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2046 return a;
2048 if ( bExp == 0 ) {
2049 if ( bSig == 0 ) {
2050 float_raise( float_flag_invalid STATUS_VAR);
2051 return float32_default_nan;
2053 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2055 if ( aExp == 0 ) {
2056 if ( aSig == 0 ) return a;
2057 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2059 expDiff = aExp - bExp;
2060 aSig |= 0x00800000;
2061 bSig |= 0x00800000;
2062 if ( expDiff < 32 ) {
2063 aSig <<= 8;
2064 bSig <<= 8;
2065 if ( expDiff < 0 ) {
2066 if ( expDiff < -1 ) return a;
2067 aSig >>= 1;
2069 q = ( bSig <= aSig );
2070 if ( q ) aSig -= bSig;
2071 if ( 0 < expDiff ) {
2072 q = ( ( (uint64_t) aSig )<<32 ) / bSig;
2073 q >>= 32 - expDiff;
2074 bSig >>= 2;
2075 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2077 else {
2078 aSig >>= 2;
2079 bSig >>= 2;
2082 else {
2083 if ( bSig <= aSig ) aSig -= bSig;
2084 aSig64 = ( (uint64_t) aSig )<<40;
2085 bSig64 = ( (uint64_t) bSig )<<40;
2086 expDiff -= 64;
2087 while ( 0 < expDiff ) {
2088 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2089 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2090 aSig64 = - ( ( bSig * q64 )<<38 );
2091 expDiff -= 62;
2093 expDiff += 64;
2094 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2095 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2096 q = q64>>( 64 - expDiff );
2097 bSig <<= 6;
2098 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2100 do {
2101 alternateASig = aSig;
2102 ++q;
2103 aSig -= bSig;
2104 } while ( 0 <= (int32_t) aSig );
2105 sigMean = aSig + alternateASig;
2106 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2107 aSig = alternateASig;
2109 zSign = ( (int32_t) aSig < 0 );
2110 if ( zSign ) aSig = - aSig;
2111 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2115 /*----------------------------------------------------------------------------
2116 | Returns the square root of the single-precision floating-point value `a'.
2117 | The operation is performed according to the IEC/IEEE Standard for Binary
2118 | Floating-Point Arithmetic.
2119 *----------------------------------------------------------------------------*/
2121 float32 float32_sqrt( float32 a STATUS_PARAM )
2123 flag aSign;
2124 int16 aExp, zExp;
2125 uint32_t aSig, zSig;
2126 uint64_t rem, term;
2127 a = float32_squash_input_denormal(a STATUS_VAR);
2129 aSig = extractFloat32Frac( a );
2130 aExp = extractFloat32Exp( a );
2131 aSign = extractFloat32Sign( a );
2132 if ( aExp == 0xFF ) {
2133 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2134 if ( ! aSign ) return a;
2135 float_raise( float_flag_invalid STATUS_VAR);
2136 return float32_default_nan;
2138 if ( aSign ) {
2139 if ( ( aExp | aSig ) == 0 ) return a;
2140 float_raise( float_flag_invalid STATUS_VAR);
2141 return float32_default_nan;
2143 if ( aExp == 0 ) {
2144 if ( aSig == 0 ) return float32_zero;
2145 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2147 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2148 aSig = ( aSig | 0x00800000 )<<8;
2149 zSig = estimateSqrt32( aExp, aSig ) + 2;
2150 if ( ( zSig & 0x7F ) <= 5 ) {
2151 if ( zSig < 2 ) {
2152 zSig = 0x7FFFFFFF;
2153 goto roundAndPack;
2155 aSig >>= aExp & 1;
2156 term = ( (uint64_t) zSig ) * zSig;
2157 rem = ( ( (uint64_t) aSig )<<32 ) - term;
2158 while ( (int64_t) rem < 0 ) {
2159 --zSig;
2160 rem += ( ( (uint64_t) zSig )<<1 ) | 1;
2162 zSig |= ( rem != 0 );
2164 shift32RightJamming( zSig, 1, &zSig );
2165 roundAndPack:
2166 return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2170 /*----------------------------------------------------------------------------
2171 | Returns the binary exponential of the single-precision floating-point value
2172 | `a'. The operation is performed according to the IEC/IEEE Standard for
2173 | Binary Floating-Point Arithmetic.
2175 | Uses the following identities:
2177 | 1. -------------------------------------------------------------------------
2178 | x x*ln(2)
2179 | 2 = e
2181 | 2. -------------------------------------------------------------------------
2182 | 2 3 4 5 n
2183 | x x x x x x x
2184 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2185 | 1! 2! 3! 4! 5! n!
2186 *----------------------------------------------------------------------------*/
2188 static const float64 float32_exp2_coefficients[15] =
2190 const_float64( 0x3ff0000000000000ll ), /* 1 */
2191 const_float64( 0x3fe0000000000000ll ), /* 2 */
2192 const_float64( 0x3fc5555555555555ll ), /* 3 */
2193 const_float64( 0x3fa5555555555555ll ), /* 4 */
2194 const_float64( 0x3f81111111111111ll ), /* 5 */
2195 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */
2196 const_float64( 0x3f2a01a01a01a01all ), /* 7 */
2197 const_float64( 0x3efa01a01a01a01all ), /* 8 */
2198 const_float64( 0x3ec71de3a556c734ll ), /* 9 */
2199 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2200 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2201 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2202 const_float64( 0x3de6124613a86d09ll ), /* 13 */
2203 const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2204 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2207 float32 float32_exp2( float32 a STATUS_PARAM )
2209 flag aSign;
2210 int16 aExp;
2211 uint32_t aSig;
2212 float64 r, x, xn;
2213 int i;
2214 a = float32_squash_input_denormal(a STATUS_VAR);
2216 aSig = extractFloat32Frac( a );
2217 aExp = extractFloat32Exp( a );
2218 aSign = extractFloat32Sign( a );
2220 if ( aExp == 0xFF) {
2221 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2222 return (aSign) ? float32_zero : a;
2224 if (aExp == 0) {
2225 if (aSig == 0) return float32_one;
2228 float_raise( float_flag_inexact STATUS_VAR);
2230 /* ******************************* */
2231 /* using float64 for approximation */
2232 /* ******************************* */
2233 x = float32_to_float64(a STATUS_VAR);
2234 x = float64_mul(x, float64_ln2 STATUS_VAR);
2236 xn = x;
2237 r = float64_one;
2238 for (i = 0 ; i < 15 ; i++) {
2239 float64 f;
2241 f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);
2242 r = float64_add(r, f STATUS_VAR);
2244 xn = float64_mul(xn, x STATUS_VAR);
2247 return float64_to_float32(r, status);
2250 /*----------------------------------------------------------------------------
2251 | Returns the binary log of the single-precision floating-point value `a'.
2252 | The operation is performed according to the IEC/IEEE Standard for Binary
2253 | Floating-Point Arithmetic.
2254 *----------------------------------------------------------------------------*/
2255 float32 float32_log2( float32 a STATUS_PARAM )
2257 flag aSign, zSign;
2258 int16 aExp;
2259 uint32_t aSig, zSig, i;
2261 a = float32_squash_input_denormal(a STATUS_VAR);
2262 aSig = extractFloat32Frac( a );
2263 aExp = extractFloat32Exp( a );
2264 aSign = extractFloat32Sign( a );
2266 if ( aExp == 0 ) {
2267 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2268 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2270 if ( aSign ) {
2271 float_raise( float_flag_invalid STATUS_VAR);
2272 return float32_default_nan;
2274 if ( aExp == 0xFF ) {
2275 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2276 return a;
2279 aExp -= 0x7F;
2280 aSig |= 0x00800000;
2281 zSign = aExp < 0;
2282 zSig = aExp << 23;
2284 for (i = 1 << 22; i > 0; i >>= 1) {
2285 aSig = ( (uint64_t)aSig * aSig ) >> 23;
2286 if ( aSig & 0x01000000 ) {
2287 aSig >>= 1;
2288 zSig |= i;
2292 if ( zSign )
2293 zSig = -zSig;
2295 return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2298 /*----------------------------------------------------------------------------
2299 | Returns 1 if the single-precision floating-point value `a' is equal to
2300 | the corresponding value `b', and 0 otherwise. The invalid exception is
2301 | raised if either operand is a NaN. Otherwise, the comparison is performed
2302 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2303 *----------------------------------------------------------------------------*/
2305 int float32_eq( float32 a, float32 b STATUS_PARAM )
2307 uint32_t av, bv;
2308 a = float32_squash_input_denormal(a STATUS_VAR);
2309 b = float32_squash_input_denormal(b STATUS_VAR);
2311 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2312 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2314 float_raise( float_flag_invalid STATUS_VAR);
2315 return 0;
2317 av = float32_val(a);
2318 bv = float32_val(b);
2319 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2322 /*----------------------------------------------------------------------------
2323 | Returns 1 if the single-precision floating-point value `a' is less than
2324 | or equal to the corresponding value `b', and 0 otherwise. The invalid
2325 | exception is raised if either operand is a NaN. The comparison is performed
2326 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2327 *----------------------------------------------------------------------------*/
2329 int float32_le( float32 a, float32 b STATUS_PARAM )
2331 flag aSign, bSign;
2332 uint32_t av, bv;
2333 a = float32_squash_input_denormal(a STATUS_VAR);
2334 b = float32_squash_input_denormal(b STATUS_VAR);
2336 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2337 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2339 float_raise( float_flag_invalid STATUS_VAR);
2340 return 0;
2342 aSign = extractFloat32Sign( a );
2343 bSign = extractFloat32Sign( b );
2344 av = float32_val(a);
2345 bv = float32_val(b);
2346 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2347 return ( av == bv ) || ( aSign ^ ( av < bv ) );
2351 /*----------------------------------------------------------------------------
2352 | Returns 1 if the single-precision floating-point value `a' is less than
2353 | the corresponding value `b', and 0 otherwise. The invalid exception is
2354 | raised if either operand is a NaN. The comparison is performed according
2355 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2356 *----------------------------------------------------------------------------*/
2358 int float32_lt( float32 a, float32 b STATUS_PARAM )
2360 flag aSign, bSign;
2361 uint32_t av, bv;
2362 a = float32_squash_input_denormal(a STATUS_VAR);
2363 b = float32_squash_input_denormal(b STATUS_VAR);
2365 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2366 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2368 float_raise( float_flag_invalid STATUS_VAR);
2369 return 0;
2371 aSign = extractFloat32Sign( a );
2372 bSign = extractFloat32Sign( b );
2373 av = float32_val(a);
2374 bv = float32_val(b);
2375 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2376 return ( av != bv ) && ( aSign ^ ( av < bv ) );
2380 /*----------------------------------------------------------------------------
2381 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2382 | be compared, and 0 otherwise. The invalid exception is raised if either
2383 | operand is a NaN. The comparison is performed according to the IEC/IEEE
2384 | Standard for Binary Floating-Point Arithmetic.
2385 *----------------------------------------------------------------------------*/
2387 int float32_unordered( float32 a, float32 b STATUS_PARAM )
2389 a = float32_squash_input_denormal(a STATUS_VAR);
2390 b = float32_squash_input_denormal(b STATUS_VAR);
2392 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2393 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2395 float_raise( float_flag_invalid STATUS_VAR);
2396 return 1;
2398 return 0;
2401 /*----------------------------------------------------------------------------
2402 | Returns 1 if the single-precision floating-point value `a' is equal to
2403 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2404 | exception. The comparison is performed according to the IEC/IEEE Standard
2405 | for Binary Floating-Point Arithmetic.
2406 *----------------------------------------------------------------------------*/
2408 int float32_eq_quiet( float32 a, float32 b STATUS_PARAM )
2410 a = float32_squash_input_denormal(a STATUS_VAR);
2411 b = float32_squash_input_denormal(b STATUS_VAR);
2413 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2414 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2416 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2417 float_raise( float_flag_invalid STATUS_VAR);
2419 return 0;
2421 return ( float32_val(a) == float32_val(b) ) ||
2422 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2425 /*----------------------------------------------------------------------------
2426 | Returns 1 if the single-precision floating-point value `a' is less than or
2427 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2428 | cause an exception. Otherwise, the comparison is performed according to the
2429 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2430 *----------------------------------------------------------------------------*/
2432 int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2434 flag aSign, bSign;
2435 uint32_t av, bv;
2436 a = float32_squash_input_denormal(a STATUS_VAR);
2437 b = float32_squash_input_denormal(b STATUS_VAR);
2439 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2440 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2442 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2443 float_raise( float_flag_invalid STATUS_VAR);
2445 return 0;
2447 aSign = extractFloat32Sign( a );
2448 bSign = extractFloat32Sign( b );
2449 av = float32_val(a);
2450 bv = float32_val(b);
2451 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2452 return ( av == bv ) || ( aSign ^ ( av < bv ) );
2456 /*----------------------------------------------------------------------------
2457 | Returns 1 if the single-precision floating-point value `a' is less than
2458 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2459 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
2460 | Standard for Binary Floating-Point Arithmetic.
2461 *----------------------------------------------------------------------------*/
2463 int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2465 flag aSign, bSign;
2466 uint32_t av, bv;
2467 a = float32_squash_input_denormal(a STATUS_VAR);
2468 b = float32_squash_input_denormal(b STATUS_VAR);
2470 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2471 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2473 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2474 float_raise( float_flag_invalid STATUS_VAR);
2476 return 0;
2478 aSign = extractFloat32Sign( a );
2479 bSign = extractFloat32Sign( b );
2480 av = float32_val(a);
2481 bv = float32_val(b);
2482 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2483 return ( av != bv ) && ( aSign ^ ( av < bv ) );
2487 /*----------------------------------------------------------------------------
2488 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2489 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
2490 | comparison is performed according to the IEC/IEEE Standard for Binary
2491 | Floating-Point Arithmetic.
2492 *----------------------------------------------------------------------------*/
2494 int float32_unordered_quiet( float32 a, float32 b STATUS_PARAM )
2496 a = float32_squash_input_denormal(a STATUS_VAR);
2497 b = float32_squash_input_denormal(b STATUS_VAR);
2499 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2500 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2502 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2503 float_raise( float_flag_invalid STATUS_VAR);
2505 return 1;
2507 return 0;
2510 /*----------------------------------------------------------------------------
2511 | Returns the result of converting the double-precision floating-point value
2512 | `a' to the 32-bit two's complement integer format. The conversion is
2513 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2514 | Arithmetic---which means in particular that the conversion is rounded
2515 | according to the current rounding mode. If `a' is a NaN, the largest
2516 | positive integer is returned. Otherwise, if the conversion overflows, the
2517 | largest integer with the same sign as `a' is returned.
2518 *----------------------------------------------------------------------------*/
2520 int32 float64_to_int32( float64 a STATUS_PARAM )
2522 flag aSign;
2523 int16 aExp, shiftCount;
2524 uint64_t aSig;
2525 a = float64_squash_input_denormal(a STATUS_VAR);
2527 aSig = extractFloat64Frac( a );
2528 aExp = extractFloat64Exp( a );
2529 aSign = extractFloat64Sign( a );
2530 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2531 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2532 shiftCount = 0x42C - aExp;
2533 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2534 return roundAndPackInt32( aSign, aSig STATUS_VAR );
2538 /*----------------------------------------------------------------------------
2539 | Returns the result of converting the double-precision floating-point value
2540 | `a' to the 32-bit two's complement integer format. The conversion is
2541 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2542 | Arithmetic, except that the conversion is always rounded toward zero.
2543 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2544 | the conversion overflows, the largest integer with the same sign as `a' is
2545 | returned.
2546 *----------------------------------------------------------------------------*/
2548 int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2550 flag aSign;
2551 int16 aExp, shiftCount;
2552 uint64_t aSig, savedASig;
2553 int32 z;
2554 a = float64_squash_input_denormal(a STATUS_VAR);
2556 aSig = extractFloat64Frac( a );
2557 aExp = extractFloat64Exp( a );
2558 aSign = extractFloat64Sign( a );
2559 if ( 0x41E < aExp ) {
2560 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2561 goto invalid;
2563 else if ( aExp < 0x3FF ) {
2564 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2565 return 0;
2567 aSig |= LIT64( 0x0010000000000000 );
2568 shiftCount = 0x433 - aExp;
2569 savedASig = aSig;
2570 aSig >>= shiftCount;
2571 z = aSig;
2572 if ( aSign ) z = - z;
2573 if ( ( z < 0 ) ^ aSign ) {
2574 invalid:
2575 float_raise( float_flag_invalid STATUS_VAR);
2576 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2578 if ( ( aSig<<shiftCount ) != savedASig ) {
2579 STATUS(float_exception_flags) |= float_flag_inexact;
2581 return z;
2585 /*----------------------------------------------------------------------------
2586 | Returns the result of converting the double-precision floating-point value
2587 | `a' to the 16-bit two's complement integer format. The conversion is
2588 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2589 | Arithmetic, except that the conversion is always rounded toward zero.
2590 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2591 | the conversion overflows, the largest integer with the same sign as `a' is
2592 | returned.
2593 *----------------------------------------------------------------------------*/
2595 int16 float64_to_int16_round_to_zero( float64 a STATUS_PARAM )
2597 flag aSign;
2598 int16 aExp, shiftCount;
2599 uint64_t aSig, savedASig;
2600 int32 z;
2602 aSig = extractFloat64Frac( a );
2603 aExp = extractFloat64Exp( a );
2604 aSign = extractFloat64Sign( a );
2605 if ( 0x40E < aExp ) {
2606 if ( ( aExp == 0x7FF ) && aSig ) {
2607 aSign = 0;
2609 goto invalid;
2611 else if ( aExp < 0x3FF ) {
2612 if ( aExp || aSig ) {
2613 STATUS(float_exception_flags) |= float_flag_inexact;
2615 return 0;
2617 aSig |= LIT64( 0x0010000000000000 );
2618 shiftCount = 0x433 - aExp;
2619 savedASig = aSig;
2620 aSig >>= shiftCount;
2621 z = aSig;
2622 if ( aSign ) {
2623 z = - z;
2625 if ( ( (int16_t)z < 0 ) ^ aSign ) {
2626 invalid:
2627 float_raise( float_flag_invalid STATUS_VAR);
2628 return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
2630 if ( ( aSig<<shiftCount ) != savedASig ) {
2631 STATUS(float_exception_flags) |= float_flag_inexact;
2633 return z;
2636 /*----------------------------------------------------------------------------
2637 | Returns the result of converting the double-precision floating-point value
2638 | `a' to the 64-bit two's complement integer format. The conversion is
2639 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2640 | Arithmetic---which means in particular that the conversion is rounded
2641 | according to the current rounding mode. If `a' is a NaN, the largest
2642 | positive integer is returned. Otherwise, if the conversion overflows, the
2643 | largest integer with the same sign as `a' is returned.
2644 *----------------------------------------------------------------------------*/
2646 int64 float64_to_int64( float64 a STATUS_PARAM )
2648 flag aSign;
2649 int16 aExp, shiftCount;
2650 uint64_t aSig, aSigExtra;
2651 a = float64_squash_input_denormal(a STATUS_VAR);
2653 aSig = extractFloat64Frac( a );
2654 aExp = extractFloat64Exp( a );
2655 aSign = extractFloat64Sign( a );
2656 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2657 shiftCount = 0x433 - aExp;
2658 if ( shiftCount <= 0 ) {
2659 if ( 0x43E < aExp ) {
2660 float_raise( float_flag_invalid STATUS_VAR);
2661 if ( ! aSign
2662 || ( ( aExp == 0x7FF )
2663 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2665 return LIT64( 0x7FFFFFFFFFFFFFFF );
2667 return (int64_t) LIT64( 0x8000000000000000 );
2669 aSigExtra = 0;
2670 aSig <<= - shiftCount;
2672 else {
2673 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2675 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2679 /*----------------------------------------------------------------------------
2680 | Returns the result of converting the double-precision floating-point value
2681 | `a' to the 64-bit two's complement integer format. The conversion is
2682 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2683 | Arithmetic, except that the conversion is always rounded toward zero.
2684 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2685 | the conversion overflows, the largest integer with the same sign as `a' is
2686 | returned.
2687 *----------------------------------------------------------------------------*/
2689 int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2691 flag aSign;
2692 int16 aExp, shiftCount;
2693 uint64_t aSig;
2694 int64 z;
2695 a = float64_squash_input_denormal(a STATUS_VAR);
2697 aSig = extractFloat64Frac( a );
2698 aExp = extractFloat64Exp( a );
2699 aSign = extractFloat64Sign( a );
2700 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2701 shiftCount = aExp - 0x433;
2702 if ( 0 <= shiftCount ) {
2703 if ( 0x43E <= aExp ) {
2704 if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
2705 float_raise( float_flag_invalid STATUS_VAR);
2706 if ( ! aSign
2707 || ( ( aExp == 0x7FF )
2708 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2710 return LIT64( 0x7FFFFFFFFFFFFFFF );
2713 return (int64_t) LIT64( 0x8000000000000000 );
2715 z = aSig<<shiftCount;
2717 else {
2718 if ( aExp < 0x3FE ) {
2719 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2720 return 0;
2722 z = aSig>>( - shiftCount );
2723 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
2724 STATUS(float_exception_flags) |= float_flag_inexact;
2727 if ( aSign ) z = - z;
2728 return z;
2732 /*----------------------------------------------------------------------------
2733 | Returns the result of converting the double-precision floating-point value
2734 | `a' to the single-precision floating-point format. The conversion is
2735 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2736 | Arithmetic.
2737 *----------------------------------------------------------------------------*/
2739 float32 float64_to_float32( float64 a STATUS_PARAM )
2741 flag aSign;
2742 int16 aExp;
2743 uint64_t aSig;
2744 uint32_t zSig;
2745 a = float64_squash_input_denormal(a STATUS_VAR);
2747 aSig = extractFloat64Frac( a );
2748 aExp = extractFloat64Exp( a );
2749 aSign = extractFloat64Sign( a );
2750 if ( aExp == 0x7FF ) {
2751 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2752 return packFloat32( aSign, 0xFF, 0 );
2754 shift64RightJamming( aSig, 22, &aSig );
2755 zSig = aSig;
2756 if ( aExp || zSig ) {
2757 zSig |= 0x40000000;
2758 aExp -= 0x381;
2760 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2765 /*----------------------------------------------------------------------------
2766 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2767 | half-precision floating-point value, returning the result. After being
2768 | shifted into the proper positions, the three fields are simply added
2769 | together to form the result. This means that any integer portion of `zSig'
2770 | will be added into the exponent. Since a properly normalized significand
2771 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2772 | than the desired result exponent whenever `zSig' is a complete, normalized
2773 | significand.
2774 *----------------------------------------------------------------------------*/
2775 static float16 packFloat16(flag zSign, int16 zExp, uint16_t zSig)
2777 return make_float16(
2778 (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
2781 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
2782 The latter gains extra exponent range by omitting the NaN/Inf encodings. */
2784 float32 float16_to_float32(float16 a, flag ieee STATUS_PARAM)
2786 flag aSign;
2787 int16 aExp;
2788 uint32_t aSig;
2790 aSign = extractFloat16Sign(a);
2791 aExp = extractFloat16Exp(a);
2792 aSig = extractFloat16Frac(a);
2794 if (aExp == 0x1f && ieee) {
2795 if (aSig) {
2796 return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
2798 return packFloat32(aSign, 0xff, aSig << 13);
2800 if (aExp == 0) {
2801 int8 shiftCount;
2803 if (aSig == 0) {
2804 return packFloat32(aSign, 0, 0);
2807 shiftCount = countLeadingZeros32( aSig ) - 21;
2808 aSig = aSig << shiftCount;
2809 aExp = -shiftCount;
2811 return packFloat32( aSign, aExp + 0x70, aSig << 13);
2814 float16 float32_to_float16(float32 a, flag ieee STATUS_PARAM)
2816 flag aSign;
2817 int16 aExp;
2818 uint32_t aSig;
2819 uint32_t mask;
2820 uint32_t increment;
2821 int8 roundingMode;
2822 a = float32_squash_input_denormal(a STATUS_VAR);
2824 aSig = extractFloat32Frac( a );
2825 aExp = extractFloat32Exp( a );
2826 aSign = extractFloat32Sign( a );
2827 if ( aExp == 0xFF ) {
2828 if (aSig) {
2829 /* Input is a NaN */
2830 float16 r = commonNaNToFloat16( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2831 if (!ieee) {
2832 return packFloat16(aSign, 0, 0);
2834 return r;
2836 /* Infinity */
2837 if (!ieee) {
2838 float_raise(float_flag_invalid STATUS_VAR);
2839 return packFloat16(aSign, 0x1f, 0x3ff);
2841 return packFloat16(aSign, 0x1f, 0);
2843 if (aExp == 0 && aSig == 0) {
2844 return packFloat16(aSign, 0, 0);
2846 /* Decimal point between bits 22 and 23. */
2847 aSig |= 0x00800000;
2848 aExp -= 0x7f;
2849 if (aExp < -14) {
2850 mask = 0x00ffffff;
2851 if (aExp >= -24) {
2852 mask >>= 25 + aExp;
2854 } else {
2855 mask = 0x00001fff;
2857 if (aSig & mask) {
2858 float_raise( float_flag_underflow STATUS_VAR );
2859 roundingMode = STATUS(float_rounding_mode);
2860 switch (roundingMode) {
2861 case float_round_nearest_even:
2862 increment = (mask + 1) >> 1;
2863 if ((aSig & mask) == increment) {
2864 increment = aSig & (increment << 1);
2866 break;
2867 case float_round_up:
2868 increment = aSign ? 0 : mask;
2869 break;
2870 case float_round_down:
2871 increment = aSign ? mask : 0;
2872 break;
2873 default: /* round_to_zero */
2874 increment = 0;
2875 break;
2877 aSig += increment;
2878 if (aSig >= 0x01000000) {
2879 aSig >>= 1;
2880 aExp++;
2882 } else if (aExp < -14
2883 && STATUS(float_detect_tininess) == float_tininess_before_rounding) {
2884 float_raise( float_flag_underflow STATUS_VAR);
2887 if (ieee) {
2888 if (aExp > 15) {
2889 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
2890 return packFloat16(aSign, 0x1f, 0);
2892 } else {
2893 if (aExp > 16) {
2894 float_raise(float_flag_invalid | float_flag_inexact STATUS_VAR);
2895 return packFloat16(aSign, 0x1f, 0x3ff);
2898 if (aExp < -24) {
2899 return packFloat16(aSign, 0, 0);
2901 if (aExp < -14) {
2902 aSig >>= -14 - aExp;
2903 aExp = -14;
2905 return packFloat16(aSign, aExp + 14, aSig >> 13);
2908 /*----------------------------------------------------------------------------
2909 | Returns the result of converting the double-precision floating-point value
2910 | `a' to the extended double-precision floating-point format. The conversion
2911 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2912 | Arithmetic.
2913 *----------------------------------------------------------------------------*/
2915 floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
2917 flag aSign;
2918 int16 aExp;
2919 uint64_t aSig;
2921 a = float64_squash_input_denormal(a STATUS_VAR);
2922 aSig = extractFloat64Frac( a );
2923 aExp = extractFloat64Exp( a );
2924 aSign = extractFloat64Sign( a );
2925 if ( aExp == 0x7FF ) {
2926 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2927 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2929 if ( aExp == 0 ) {
2930 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2931 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2933 return
2934 packFloatx80(
2935 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2939 /*----------------------------------------------------------------------------
2940 | Returns the result of converting the double-precision floating-point value
2941 | `a' to the quadruple-precision floating-point format. The conversion is
2942 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2943 | Arithmetic.
2944 *----------------------------------------------------------------------------*/
2946 float128 float64_to_float128( float64 a STATUS_PARAM )
2948 flag aSign;
2949 int16 aExp;
2950 uint64_t aSig, zSig0, zSig1;
2952 a = float64_squash_input_denormal(a STATUS_VAR);
2953 aSig = extractFloat64Frac( a );
2954 aExp = extractFloat64Exp( a );
2955 aSign = extractFloat64Sign( a );
2956 if ( aExp == 0x7FF ) {
2957 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2958 return packFloat128( aSign, 0x7FFF, 0, 0 );
2960 if ( aExp == 0 ) {
2961 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2962 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2963 --aExp;
2965 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2966 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2970 /*----------------------------------------------------------------------------
2971 | Rounds the double-precision floating-point value `a' to an integer, and
2972 | returns the result as a double-precision floating-point value. The
2973 | operation is performed according to the IEC/IEEE Standard for Binary
2974 | Floating-Point Arithmetic.
2975 *----------------------------------------------------------------------------*/
2977 float64 float64_round_to_int( float64 a STATUS_PARAM )
2979 flag aSign;
2980 int16 aExp;
2981 uint64_t lastBitMask, roundBitsMask;
2982 int8 roundingMode;
2983 uint64_t z;
2984 a = float64_squash_input_denormal(a STATUS_VAR);
2986 aExp = extractFloat64Exp( a );
2987 if ( 0x433 <= aExp ) {
2988 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2989 return propagateFloat64NaN( a, a STATUS_VAR );
2991 return a;
2993 if ( aExp < 0x3FF ) {
2994 if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
2995 STATUS(float_exception_flags) |= float_flag_inexact;
2996 aSign = extractFloat64Sign( a );
2997 switch ( STATUS(float_rounding_mode) ) {
2998 case float_round_nearest_even:
2999 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
3000 return packFloat64( aSign, 0x3FF, 0 );
3002 break;
3003 case float_round_down:
3004 return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
3005 case float_round_up:
3006 return make_float64(
3007 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
3009 return packFloat64( aSign, 0, 0 );
3011 lastBitMask = 1;
3012 lastBitMask <<= 0x433 - aExp;
3013 roundBitsMask = lastBitMask - 1;
3014 z = float64_val(a);
3015 roundingMode = STATUS(float_rounding_mode);
3016 if ( roundingMode == float_round_nearest_even ) {
3017 z += lastBitMask>>1;
3018 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
3020 else if ( roundingMode != float_round_to_zero ) {
3021 if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
3022 z += roundBitsMask;
3025 z &= ~ roundBitsMask;
3026 if ( z != float64_val(a) )
3027 STATUS(float_exception_flags) |= float_flag_inexact;
3028 return make_float64(z);
3032 float64 float64_trunc_to_int( float64 a STATUS_PARAM)
3034 int oldmode;
3035 float64 res;
3036 oldmode = STATUS(float_rounding_mode);
3037 STATUS(float_rounding_mode) = float_round_to_zero;
3038 res = float64_round_to_int(a STATUS_VAR);
3039 STATUS(float_rounding_mode) = oldmode;
3040 return res;
3043 /*----------------------------------------------------------------------------
3044 | Returns the result of adding the absolute values of the double-precision
3045 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
3046 | before being returned. `zSign' is ignored if the result is a NaN.
3047 | The addition is performed according to the IEC/IEEE Standard for Binary
3048 | Floating-Point Arithmetic.
3049 *----------------------------------------------------------------------------*/
3051 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3053 int16 aExp, bExp, zExp;
3054 uint64_t aSig, bSig, zSig;
3055 int16 expDiff;
3057 aSig = extractFloat64Frac( a );
3058 aExp = extractFloat64Exp( a );
3059 bSig = extractFloat64Frac( b );
3060 bExp = extractFloat64Exp( b );
3061 expDiff = aExp - bExp;
3062 aSig <<= 9;
3063 bSig <<= 9;
3064 if ( 0 < expDiff ) {
3065 if ( aExp == 0x7FF ) {
3066 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3067 return a;
3069 if ( bExp == 0 ) {
3070 --expDiff;
3072 else {
3073 bSig |= LIT64( 0x2000000000000000 );
3075 shift64RightJamming( bSig, expDiff, &bSig );
3076 zExp = aExp;
3078 else if ( expDiff < 0 ) {
3079 if ( bExp == 0x7FF ) {
3080 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3081 return packFloat64( zSign, 0x7FF, 0 );
3083 if ( aExp == 0 ) {
3084 ++expDiff;
3086 else {
3087 aSig |= LIT64( 0x2000000000000000 );
3089 shift64RightJamming( aSig, - expDiff, &aSig );
3090 zExp = bExp;
3092 else {
3093 if ( aExp == 0x7FF ) {
3094 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3095 return a;
3097 if ( aExp == 0 ) {
3098 if (STATUS(flush_to_zero)) {
3099 if (aSig | bSig) {
3100 float_raise(float_flag_output_denormal STATUS_VAR);
3102 return packFloat64(zSign, 0, 0);
3104 return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3106 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3107 zExp = aExp;
3108 goto roundAndPack;
3110 aSig |= LIT64( 0x2000000000000000 );
3111 zSig = ( aSig + bSig )<<1;
3112 --zExp;
3113 if ( (int64_t) zSig < 0 ) {
3114 zSig = aSig + bSig;
3115 ++zExp;
3117 roundAndPack:
3118 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3122 /*----------------------------------------------------------------------------
3123 | Returns the result of subtracting the absolute values of the double-
3124 | precision floating-point values `a' and `b'. If `zSign' is 1, the
3125 | difference is negated before being returned. `zSign' is ignored if the
3126 | result is a NaN. The subtraction is performed according to the IEC/IEEE
3127 | Standard for Binary Floating-Point Arithmetic.
3128 *----------------------------------------------------------------------------*/
3130 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3132 int16 aExp, bExp, zExp;
3133 uint64_t aSig, bSig, zSig;
3134 int16 expDiff;
3136 aSig = extractFloat64Frac( a );
3137 aExp = extractFloat64Exp( a );
3138 bSig = extractFloat64Frac( b );
3139 bExp = extractFloat64Exp( b );
3140 expDiff = aExp - bExp;
3141 aSig <<= 10;
3142 bSig <<= 10;
3143 if ( 0 < expDiff ) goto aExpBigger;
3144 if ( expDiff < 0 ) goto bExpBigger;
3145 if ( aExp == 0x7FF ) {
3146 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3147 float_raise( float_flag_invalid STATUS_VAR);
3148 return float64_default_nan;
3150 if ( aExp == 0 ) {
3151 aExp = 1;
3152 bExp = 1;
3154 if ( bSig < aSig ) goto aBigger;
3155 if ( aSig < bSig ) goto bBigger;
3156 return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3157 bExpBigger:
3158 if ( bExp == 0x7FF ) {
3159 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3160 return packFloat64( zSign ^ 1, 0x7FF, 0 );
3162 if ( aExp == 0 ) {
3163 ++expDiff;
3165 else {
3166 aSig |= LIT64( 0x4000000000000000 );
3168 shift64RightJamming( aSig, - expDiff, &aSig );
3169 bSig |= LIT64( 0x4000000000000000 );
3170 bBigger:
3171 zSig = bSig - aSig;
3172 zExp = bExp;
3173 zSign ^= 1;
3174 goto normalizeRoundAndPack;
3175 aExpBigger:
3176 if ( aExp == 0x7FF ) {
3177 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3178 return a;
3180 if ( bExp == 0 ) {
3181 --expDiff;
3183 else {
3184 bSig |= LIT64( 0x4000000000000000 );
3186 shift64RightJamming( bSig, expDiff, &bSig );
3187 aSig |= LIT64( 0x4000000000000000 );
3188 aBigger:
3189 zSig = aSig - bSig;
3190 zExp = aExp;
3191 normalizeRoundAndPack:
3192 --zExp;
3193 return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3197 /*----------------------------------------------------------------------------
3198 | Returns the result of adding the double-precision floating-point values `a'
3199 | and `b'. The operation is performed according to the IEC/IEEE Standard for
3200 | Binary Floating-Point Arithmetic.
3201 *----------------------------------------------------------------------------*/
3203 float64 float64_add( float64 a, float64 b STATUS_PARAM )
3205 flag aSign, bSign;
3206 a = float64_squash_input_denormal(a STATUS_VAR);
3207 b = float64_squash_input_denormal(b STATUS_VAR);
3209 aSign = extractFloat64Sign( a );
3210 bSign = extractFloat64Sign( b );
3211 if ( aSign == bSign ) {
3212 return addFloat64Sigs( a, b, aSign STATUS_VAR );
3214 else {
3215 return subFloat64Sigs( a, b, aSign STATUS_VAR );
3220 /*----------------------------------------------------------------------------
3221 | Returns the result of subtracting the double-precision floating-point values
3222 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3223 | for Binary Floating-Point Arithmetic.
3224 *----------------------------------------------------------------------------*/
3226 float64 float64_sub( float64 a, float64 b STATUS_PARAM )
3228 flag aSign, bSign;
3229 a = float64_squash_input_denormal(a STATUS_VAR);
3230 b = float64_squash_input_denormal(b STATUS_VAR);
3232 aSign = extractFloat64Sign( a );
3233 bSign = extractFloat64Sign( b );
3234 if ( aSign == bSign ) {
3235 return subFloat64Sigs( a, b, aSign STATUS_VAR );
3237 else {
3238 return addFloat64Sigs( a, b, aSign STATUS_VAR );
3243 /*----------------------------------------------------------------------------
3244 | Returns the result of multiplying the double-precision floating-point values
3245 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3246 | for Binary Floating-Point Arithmetic.
3247 *----------------------------------------------------------------------------*/
3249 float64 float64_mul( float64 a, float64 b STATUS_PARAM )
3251 flag aSign, bSign, zSign;
3252 int16 aExp, bExp, zExp;
3253 uint64_t aSig, bSig, zSig0, zSig1;
3255 a = float64_squash_input_denormal(a STATUS_VAR);
3256 b = float64_squash_input_denormal(b STATUS_VAR);
3258 aSig = extractFloat64Frac( a );
3259 aExp = extractFloat64Exp( a );
3260 aSign = extractFloat64Sign( a );
3261 bSig = extractFloat64Frac( b );
3262 bExp = extractFloat64Exp( b );
3263 bSign = extractFloat64Sign( b );
3264 zSign = aSign ^ bSign;
3265 if ( aExp == 0x7FF ) {
3266 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3267 return propagateFloat64NaN( a, b STATUS_VAR );
3269 if ( ( bExp | bSig ) == 0 ) {
3270 float_raise( float_flag_invalid STATUS_VAR);
3271 return float64_default_nan;
3273 return packFloat64( zSign, 0x7FF, 0 );
3275 if ( bExp == 0x7FF ) {
3276 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3277 if ( ( aExp | aSig ) == 0 ) {
3278 float_raise( float_flag_invalid STATUS_VAR);
3279 return float64_default_nan;
3281 return packFloat64( zSign, 0x7FF, 0 );
3283 if ( aExp == 0 ) {
3284 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3285 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3287 if ( bExp == 0 ) {
3288 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
3289 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3291 zExp = aExp + bExp - 0x3FF;
3292 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3293 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3294 mul64To128( aSig, bSig, &zSig0, &zSig1 );
3295 zSig0 |= ( zSig1 != 0 );
3296 if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
3297 zSig0 <<= 1;
3298 --zExp;
3300 return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
3304 /*----------------------------------------------------------------------------
3305 | Returns the result of dividing the double-precision floating-point value `a'
3306 | by the corresponding value `b'. The operation is performed according to
3307 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3308 *----------------------------------------------------------------------------*/
3310 float64 float64_div( float64 a, float64 b STATUS_PARAM )
3312 flag aSign, bSign, zSign;
3313 int16 aExp, bExp, zExp;
3314 uint64_t aSig, bSig, zSig;
3315 uint64_t rem0, rem1;
3316 uint64_t term0, term1;
3317 a = float64_squash_input_denormal(a STATUS_VAR);
3318 b = float64_squash_input_denormal(b STATUS_VAR);
3320 aSig = extractFloat64Frac( a );
3321 aExp = extractFloat64Exp( a );
3322 aSign = extractFloat64Sign( a );
3323 bSig = extractFloat64Frac( b );
3324 bExp = extractFloat64Exp( b );
3325 bSign = extractFloat64Sign( b );
3326 zSign = aSign ^ bSign;
3327 if ( aExp == 0x7FF ) {
3328 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3329 if ( bExp == 0x7FF ) {
3330 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3331 float_raise( float_flag_invalid STATUS_VAR);
3332 return float64_default_nan;
3334 return packFloat64( zSign, 0x7FF, 0 );
3336 if ( bExp == 0x7FF ) {
3337 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3338 return packFloat64( zSign, 0, 0 );
3340 if ( bExp == 0 ) {
3341 if ( bSig == 0 ) {
3342 if ( ( aExp | aSig ) == 0 ) {
3343 float_raise( float_flag_invalid STATUS_VAR);
3344 return float64_default_nan;
3346 float_raise( float_flag_divbyzero STATUS_VAR);
3347 return packFloat64( zSign, 0x7FF, 0 );
3349 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3351 if ( aExp == 0 ) {
3352 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3353 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3355 zExp = aExp - bExp + 0x3FD;
3356 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3357 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3358 if ( bSig <= ( aSig + aSig ) ) {
3359 aSig >>= 1;
3360 ++zExp;
3362 zSig = estimateDiv128To64( aSig, 0, bSig );
3363 if ( ( zSig & 0x1FF ) <= 2 ) {
3364 mul64To128( bSig, zSig, &term0, &term1 );
3365 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3366 while ( (int64_t) rem0 < 0 ) {
3367 --zSig;
3368 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3370 zSig |= ( rem1 != 0 );
3372 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3376 /*----------------------------------------------------------------------------
3377 | Returns the remainder of the double-precision floating-point value `a'
3378 | with respect to the corresponding value `b'. The operation is performed
3379 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3380 *----------------------------------------------------------------------------*/
3382 float64 float64_rem( float64 a, float64 b STATUS_PARAM )
3384 flag aSign, zSign;
3385 int16 aExp, bExp, expDiff;
3386 uint64_t aSig, bSig;
3387 uint64_t q, alternateASig;
3388 int64_t sigMean;
3390 a = float64_squash_input_denormal(a STATUS_VAR);
3391 b = float64_squash_input_denormal(b STATUS_VAR);
3392 aSig = extractFloat64Frac( a );
3393 aExp = extractFloat64Exp( a );
3394 aSign = extractFloat64Sign( a );
3395 bSig = extractFloat64Frac( b );
3396 bExp = extractFloat64Exp( b );
3397 if ( aExp == 0x7FF ) {
3398 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3399 return propagateFloat64NaN( a, b STATUS_VAR );
3401 float_raise( float_flag_invalid STATUS_VAR);
3402 return float64_default_nan;
3404 if ( bExp == 0x7FF ) {
3405 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3406 return a;
3408 if ( bExp == 0 ) {
3409 if ( bSig == 0 ) {
3410 float_raise( float_flag_invalid STATUS_VAR);
3411 return float64_default_nan;
3413 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3415 if ( aExp == 0 ) {
3416 if ( aSig == 0 ) return a;
3417 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3419 expDiff = aExp - bExp;
3420 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3421 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3422 if ( expDiff < 0 ) {
3423 if ( expDiff < -1 ) return a;
3424 aSig >>= 1;
3426 q = ( bSig <= aSig );
3427 if ( q ) aSig -= bSig;
3428 expDiff -= 64;
3429 while ( 0 < expDiff ) {
3430 q = estimateDiv128To64( aSig, 0, bSig );
3431 q = ( 2 < q ) ? q - 2 : 0;
3432 aSig = - ( ( bSig>>2 ) * q );
3433 expDiff -= 62;
3435 expDiff += 64;
3436 if ( 0 < expDiff ) {
3437 q = estimateDiv128To64( aSig, 0, bSig );
3438 q = ( 2 < q ) ? q - 2 : 0;
3439 q >>= 64 - expDiff;
3440 bSig >>= 2;
3441 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3443 else {
3444 aSig >>= 2;
3445 bSig >>= 2;
3447 do {
3448 alternateASig = aSig;
3449 ++q;
3450 aSig -= bSig;
3451 } while ( 0 <= (int64_t) aSig );
3452 sigMean = aSig + alternateASig;
3453 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3454 aSig = alternateASig;
3456 zSign = ( (int64_t) aSig < 0 );
3457 if ( zSign ) aSig = - aSig;
3458 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
3462 /*----------------------------------------------------------------------------
3463 | Returns the square root of the double-precision floating-point value `a'.
3464 | The operation is performed according to the IEC/IEEE Standard for Binary
3465 | Floating-Point Arithmetic.
3466 *----------------------------------------------------------------------------*/
3468 float64 float64_sqrt( float64 a STATUS_PARAM )
3470 flag aSign;
3471 int16 aExp, zExp;
3472 uint64_t aSig, zSig, doubleZSig;
3473 uint64_t rem0, rem1, term0, term1;
3474 a = float64_squash_input_denormal(a STATUS_VAR);
3476 aSig = extractFloat64Frac( a );
3477 aExp = extractFloat64Exp( a );
3478 aSign = extractFloat64Sign( a );
3479 if ( aExp == 0x7FF ) {
3480 if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
3481 if ( ! aSign ) return a;
3482 float_raise( float_flag_invalid STATUS_VAR);
3483 return float64_default_nan;
3485 if ( aSign ) {
3486 if ( ( aExp | aSig ) == 0 ) return a;
3487 float_raise( float_flag_invalid STATUS_VAR);
3488 return float64_default_nan;
3490 if ( aExp == 0 ) {
3491 if ( aSig == 0 ) return float64_zero;
3492 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3494 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3495 aSig |= LIT64( 0x0010000000000000 );
3496 zSig = estimateSqrt32( aExp, aSig>>21 );
3497 aSig <<= 9 - ( aExp & 1 );
3498 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3499 if ( ( zSig & 0x1FF ) <= 5 ) {
3500 doubleZSig = zSig<<1;
3501 mul64To128( zSig, zSig, &term0, &term1 );
3502 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3503 while ( (int64_t) rem0 < 0 ) {
3504 --zSig;
3505 doubleZSig -= 2;
3506 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3508 zSig |= ( ( rem0 | rem1 ) != 0 );
3510 return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
3514 /*----------------------------------------------------------------------------
3515 | Returns the binary log of the double-precision floating-point value `a'.
3516 | The operation is performed according to the IEC/IEEE Standard for Binary
3517 | Floating-Point Arithmetic.
3518 *----------------------------------------------------------------------------*/
3519 float64 float64_log2( float64 a STATUS_PARAM )
3521 flag aSign, zSign;
3522 int16 aExp;
3523 uint64_t aSig, aSig0, aSig1, zSig, i;
3524 a = float64_squash_input_denormal(a STATUS_VAR);
3526 aSig = extractFloat64Frac( a );
3527 aExp = extractFloat64Exp( a );
3528 aSign = extractFloat64Sign( a );
3530 if ( aExp == 0 ) {
3531 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
3532 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3534 if ( aSign ) {
3535 float_raise( float_flag_invalid STATUS_VAR);
3536 return float64_default_nan;
3538 if ( aExp == 0x7FF ) {
3539 if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
3540 return a;
3543 aExp -= 0x3FF;
3544 aSig |= LIT64( 0x0010000000000000 );
3545 zSign = aExp < 0;
3546 zSig = (uint64_t)aExp << 52;
3547 for (i = 1LL << 51; i > 0; i >>= 1) {
3548 mul64To128( aSig, aSig, &aSig0, &aSig1 );
3549 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
3550 if ( aSig & LIT64( 0x0020000000000000 ) ) {
3551 aSig >>= 1;
3552 zSig |= i;
3556 if ( zSign )
3557 zSig = -zSig;
3558 return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
3561 /*----------------------------------------------------------------------------
3562 | Returns 1 if the double-precision floating-point value `a' is equal to the
3563 | corresponding value `b', and 0 otherwise. The invalid exception is raised
3564 | if either operand is a NaN. Otherwise, the comparison is performed
3565 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3566 *----------------------------------------------------------------------------*/
3568 int float64_eq( float64 a, float64 b STATUS_PARAM )
3570 uint64_t av, bv;
3571 a = float64_squash_input_denormal(a STATUS_VAR);
3572 b = float64_squash_input_denormal(b STATUS_VAR);
3574 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3575 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3577 float_raise( float_flag_invalid STATUS_VAR);
3578 return 0;
3580 av = float64_val(a);
3581 bv = float64_val(b);
3582 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
3586 /*----------------------------------------------------------------------------
3587 | Returns 1 if the double-precision floating-point value `a' is less than or
3588 | equal to the corresponding value `b', and 0 otherwise. The invalid
3589 | exception is raised if either operand is a NaN. The comparison is performed
3590 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3591 *----------------------------------------------------------------------------*/
3593 int float64_le( float64 a, float64 b STATUS_PARAM )
3595 flag aSign, bSign;
3596 uint64_t av, bv;
3597 a = float64_squash_input_denormal(a STATUS_VAR);
3598 b = float64_squash_input_denormal(b STATUS_VAR);
3600 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3601 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3603 float_raise( float_flag_invalid STATUS_VAR);
3604 return 0;
3606 aSign = extractFloat64Sign( a );
3607 bSign = extractFloat64Sign( b );
3608 av = float64_val(a);
3609 bv = float64_val(b);
3610 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
3611 return ( av == bv ) || ( aSign ^ ( av < bv ) );
3615 /*----------------------------------------------------------------------------
3616 | Returns 1 if the double-precision floating-point value `a' is less than
3617 | the corresponding value `b', and 0 otherwise. The invalid exception is
3618 | raised if either operand is a NaN. The comparison is performed according
3619 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3620 *----------------------------------------------------------------------------*/
3622 int float64_lt( float64 a, float64 b STATUS_PARAM )
3624 flag aSign, bSign;
3625 uint64_t av, bv;
3627 a = float64_squash_input_denormal(a STATUS_VAR);
3628 b = float64_squash_input_denormal(b STATUS_VAR);
3629 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3630 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3632 float_raise( float_flag_invalid STATUS_VAR);
3633 return 0;
3635 aSign = extractFloat64Sign( a );
3636 bSign = extractFloat64Sign( b );
3637 av = float64_val(a);
3638 bv = float64_val(b);
3639 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
3640 return ( av != bv ) && ( aSign ^ ( av < bv ) );
3644 /*----------------------------------------------------------------------------
3645 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
3646 | be compared, and 0 otherwise. The invalid exception is raised if either
3647 | operand is a NaN. The comparison is performed according to the IEC/IEEE
3648 | Standard for Binary Floating-Point Arithmetic.
3649 *----------------------------------------------------------------------------*/
3651 int float64_unordered( float64 a, float64 b STATUS_PARAM )
3653 a = float64_squash_input_denormal(a STATUS_VAR);
3654 b = float64_squash_input_denormal(b STATUS_VAR);
3656 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3657 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3659 float_raise( float_flag_invalid STATUS_VAR);
3660 return 1;
3662 return 0;
3665 /*----------------------------------------------------------------------------
3666 | Returns 1 if the double-precision floating-point value `a' is equal to the
3667 | corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3668 | exception.The comparison is performed according to the IEC/IEEE Standard
3669 | for Binary Floating-Point Arithmetic.
3670 *----------------------------------------------------------------------------*/
3672 int float64_eq_quiet( float64 a, float64 b STATUS_PARAM )
3674 uint64_t av, bv;
3675 a = float64_squash_input_denormal(a STATUS_VAR);
3676 b = float64_squash_input_denormal(b STATUS_VAR);
3678 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3679 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3681 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3682 float_raise( float_flag_invalid STATUS_VAR);
3684 return 0;
3686 av = float64_val(a);
3687 bv = float64_val(b);
3688 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
3692 /*----------------------------------------------------------------------------
3693 | Returns 1 if the double-precision floating-point value `a' is less than or
3694 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3695 | cause an exception. Otherwise, the comparison is performed according to the
3696 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3697 *----------------------------------------------------------------------------*/
3699 int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
3701 flag aSign, bSign;
3702 uint64_t av, bv;
3703 a = float64_squash_input_denormal(a STATUS_VAR);
3704 b = float64_squash_input_denormal(b STATUS_VAR);
3706 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3707 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3709 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3710 float_raise( float_flag_invalid STATUS_VAR);
3712 return 0;
3714 aSign = extractFloat64Sign( a );
3715 bSign = extractFloat64Sign( b );
3716 av = float64_val(a);
3717 bv = float64_val(b);
3718 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
3719 return ( av == bv ) || ( aSign ^ ( av < bv ) );
3723 /*----------------------------------------------------------------------------
3724 | Returns 1 if the double-precision floating-point value `a' is less than
3725 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3726 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
3727 | Standard for Binary Floating-Point Arithmetic.
3728 *----------------------------------------------------------------------------*/
3730 int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
3732 flag aSign, bSign;
3733 uint64_t av, bv;
3734 a = float64_squash_input_denormal(a STATUS_VAR);
3735 b = float64_squash_input_denormal(b STATUS_VAR);
3737 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3738 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3740 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3741 float_raise( float_flag_invalid STATUS_VAR);
3743 return 0;
3745 aSign = extractFloat64Sign( a );
3746 bSign = extractFloat64Sign( b );
3747 av = float64_val(a);
3748 bv = float64_val(b);
3749 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
3750 return ( av != bv ) && ( aSign ^ ( av < bv ) );
3754 /*----------------------------------------------------------------------------
3755 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
3756 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
3757 | comparison is performed according to the IEC/IEEE Standard for Binary
3758 | Floating-Point Arithmetic.
3759 *----------------------------------------------------------------------------*/
3761 int float64_unordered_quiet( float64 a, float64 b STATUS_PARAM )
3763 a = float64_squash_input_denormal(a STATUS_VAR);
3764 b = float64_squash_input_denormal(b STATUS_VAR);
3766 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3767 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3769 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3770 float_raise( float_flag_invalid STATUS_VAR);
3772 return 1;
3774 return 0;
3777 /*----------------------------------------------------------------------------
3778 | Returns the result of converting the extended double-precision floating-
3779 | point value `a' to the 32-bit two's complement integer format. The
3780 | conversion is performed according to the IEC/IEEE Standard for Binary
3781 | Floating-Point Arithmetic---which means in particular that the conversion
3782 | is rounded according to the current rounding mode. If `a' is a NaN, the
3783 | largest positive integer is returned. Otherwise, if the conversion
3784 | overflows, the largest integer with the same sign as `a' is returned.
3785 *----------------------------------------------------------------------------*/
3787 int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
3789 flag aSign;
3790 int32 aExp, shiftCount;
3791 uint64_t aSig;
3793 aSig = extractFloatx80Frac( a );
3794 aExp = extractFloatx80Exp( a );
3795 aSign = extractFloatx80Sign( a );
3796 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
3797 shiftCount = 0x4037 - aExp;
3798 if ( shiftCount <= 0 ) shiftCount = 1;
3799 shift64RightJamming( aSig, shiftCount, &aSig );
3800 return roundAndPackInt32( aSign, aSig STATUS_VAR );
3804 /*----------------------------------------------------------------------------
3805 | Returns the result of converting the extended double-precision floating-
3806 | point value `a' to the 32-bit two's complement integer format. The
3807 | conversion is performed according to the IEC/IEEE Standard for Binary
3808 | Floating-Point Arithmetic, except that the conversion is always rounded
3809 | toward zero. If `a' is a NaN, the largest positive integer is returned.
3810 | Otherwise, if the conversion overflows, the largest integer with the same
3811 | sign as `a' is returned.
3812 *----------------------------------------------------------------------------*/
3814 int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
3816 flag aSign;
3817 int32 aExp, shiftCount;
3818 uint64_t aSig, savedASig;
3819 int32 z;
3821 aSig = extractFloatx80Frac( a );
3822 aExp = extractFloatx80Exp( a );
3823 aSign = extractFloatx80Sign( a );
3824 if ( 0x401E < aExp ) {
3825 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
3826 goto invalid;
3828 else if ( aExp < 0x3FFF ) {
3829 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3830 return 0;
3832 shiftCount = 0x403E - aExp;
3833 savedASig = aSig;
3834 aSig >>= shiftCount;
3835 z = aSig;
3836 if ( aSign ) z = - z;
3837 if ( ( z < 0 ) ^ aSign ) {
3838 invalid:
3839 float_raise( float_flag_invalid STATUS_VAR);
3840 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
3842 if ( ( aSig<<shiftCount ) != savedASig ) {
3843 STATUS(float_exception_flags) |= float_flag_inexact;
3845 return z;
3849 /*----------------------------------------------------------------------------
3850 | Returns the result of converting the extended double-precision floating-
3851 | point value `a' to the 64-bit two's complement integer format. The
3852 | conversion is performed according to the IEC/IEEE Standard for Binary
3853 | Floating-Point Arithmetic---which means in particular that the conversion
3854 | is rounded according to the current rounding mode. If `a' is a NaN,
3855 | the largest positive integer is returned. Otherwise, if the conversion
3856 | overflows, the largest integer with the same sign as `a' is returned.
3857 *----------------------------------------------------------------------------*/
3859 int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
3861 flag aSign;
3862 int32 aExp, shiftCount;
3863 uint64_t aSig, aSigExtra;
3865 aSig = extractFloatx80Frac( a );
3866 aExp = extractFloatx80Exp( a );
3867 aSign = extractFloatx80Sign( a );
3868 shiftCount = 0x403E - aExp;
3869 if ( shiftCount <= 0 ) {
3870 if ( shiftCount ) {
3871 float_raise( float_flag_invalid STATUS_VAR);
3872 if ( ! aSign
3873 || ( ( aExp == 0x7FFF )
3874 && ( aSig != LIT64( 0x8000000000000000 ) ) )
3876 return LIT64( 0x7FFFFFFFFFFFFFFF );
3878 return (int64_t) LIT64( 0x8000000000000000 );
3880 aSigExtra = 0;
3882 else {
3883 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3885 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3889 /*----------------------------------------------------------------------------
3890 | Returns the result of converting the extended double-precision floating-
3891 | point value `a' to the 64-bit two's complement integer format. The
3892 | conversion is performed according to the IEC/IEEE Standard for Binary
3893 | Floating-Point Arithmetic, except that the conversion is always rounded
3894 | toward zero. If `a' is a NaN, the largest positive integer is returned.
3895 | Otherwise, if the conversion overflows, the largest integer with the same
3896 | sign as `a' is returned.
3897 *----------------------------------------------------------------------------*/
3899 int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
3901 flag aSign;
3902 int32 aExp, shiftCount;
3903 uint64_t aSig;
3904 int64 z;
3906 aSig = extractFloatx80Frac( a );
3907 aExp = extractFloatx80Exp( a );
3908 aSign = extractFloatx80Sign( a );
3909 shiftCount = aExp - 0x403E;
3910 if ( 0 <= shiftCount ) {
3911 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3912 if ( ( a.high != 0xC03E ) || aSig ) {
3913 float_raise( float_flag_invalid STATUS_VAR);
3914 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3915 return LIT64( 0x7FFFFFFFFFFFFFFF );
3918 return (int64_t) LIT64( 0x8000000000000000 );
3920 else if ( aExp < 0x3FFF ) {
3921 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3922 return 0;
3924 z = aSig>>( - shiftCount );
3925 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
3926 STATUS(float_exception_flags) |= float_flag_inexact;
3928 if ( aSign ) z = - z;
3929 return z;
3933 /*----------------------------------------------------------------------------
3934 | Returns the result of converting the extended double-precision floating-
3935 | point value `a' to the single-precision floating-point format. The
3936 | conversion is performed according to the IEC/IEEE Standard for Binary
3937 | Floating-Point Arithmetic.
3938 *----------------------------------------------------------------------------*/
3940 float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
3942 flag aSign;
3943 int32 aExp;
3944 uint64_t aSig;
3946 aSig = extractFloatx80Frac( a );
3947 aExp = extractFloatx80Exp( a );
3948 aSign = extractFloatx80Sign( a );
3949 if ( aExp == 0x7FFF ) {
3950 if ( (uint64_t) ( aSig<<1 ) ) {
3951 return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3953 return packFloat32( aSign, 0xFF, 0 );
3955 shift64RightJamming( aSig, 33, &aSig );
3956 if ( aExp || aSig ) aExp -= 0x3F81;
3957 return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
3961 /*----------------------------------------------------------------------------
3962 | Returns the result of converting the extended double-precision floating-
3963 | point value `a' to the double-precision floating-point format. The
3964 | conversion is performed according to the IEC/IEEE Standard for Binary
3965 | Floating-Point Arithmetic.
3966 *----------------------------------------------------------------------------*/
3968 float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
3970 flag aSign;
3971 int32 aExp;
3972 uint64_t aSig, zSig;
3974 aSig = extractFloatx80Frac( a );
3975 aExp = extractFloatx80Exp( a );
3976 aSign = extractFloatx80Sign( a );
3977 if ( aExp == 0x7FFF ) {
3978 if ( (uint64_t) ( aSig<<1 ) ) {
3979 return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3981 return packFloat64( aSign, 0x7FF, 0 );
3983 shift64RightJamming( aSig, 1, &zSig );
3984 if ( aExp || aSig ) aExp -= 0x3C01;
3985 return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
3989 /*----------------------------------------------------------------------------
3990 | Returns the result of converting the extended double-precision floating-
3991 | point value `a' to the quadruple-precision floating-point format. The
3992 | conversion is performed according to the IEC/IEEE Standard for Binary
3993 | Floating-Point Arithmetic.
3994 *----------------------------------------------------------------------------*/
3996 float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
3998 flag aSign;
3999 int16 aExp;
4000 uint64_t aSig, zSig0, zSig1;
4002 aSig = extractFloatx80Frac( a );
4003 aExp = extractFloatx80Exp( a );
4004 aSign = extractFloatx80Sign( a );
4005 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4006 return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4008 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4009 return packFloat128( aSign, aExp, zSig0, zSig1 );
4013 /*----------------------------------------------------------------------------
4014 | Rounds the extended double-precision floating-point value `a' to an integer,
4015 | and returns the result as an extended quadruple-precision floating-point
4016 | value. The operation is performed according to the IEC/IEEE Standard for
4017 | Binary Floating-Point Arithmetic.
4018 *----------------------------------------------------------------------------*/
4020 floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
4022 flag aSign;
4023 int32 aExp;
4024 uint64_t lastBitMask, roundBitsMask;
4025 int8 roundingMode;
4026 floatx80 z;
4028 aExp = extractFloatx80Exp( a );
4029 if ( 0x403E <= aExp ) {
4030 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4031 return propagateFloatx80NaN( a, a STATUS_VAR );
4033 return a;
4035 if ( aExp < 0x3FFF ) {
4036 if ( ( aExp == 0 )
4037 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4038 return a;
4040 STATUS(float_exception_flags) |= float_flag_inexact;
4041 aSign = extractFloatx80Sign( a );
4042 switch ( STATUS(float_rounding_mode) ) {
4043 case float_round_nearest_even:
4044 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4046 return
4047 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4049 break;
4050 case float_round_down:
4051 return
4052 aSign ?
4053 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4054 : packFloatx80( 0, 0, 0 );
4055 case float_round_up:
4056 return
4057 aSign ? packFloatx80( 1, 0, 0 )
4058 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4060 return packFloatx80( aSign, 0, 0 );
4062 lastBitMask = 1;
4063 lastBitMask <<= 0x403E - aExp;
4064 roundBitsMask = lastBitMask - 1;
4065 z = a;
4066 roundingMode = STATUS(float_rounding_mode);
4067 if ( roundingMode == float_round_nearest_even ) {
4068 z.low += lastBitMask>>1;
4069 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4071 else if ( roundingMode != float_round_to_zero ) {
4072 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
4073 z.low += roundBitsMask;
4076 z.low &= ~ roundBitsMask;
4077 if ( z.low == 0 ) {
4078 ++z.high;
4079 z.low = LIT64( 0x8000000000000000 );
4081 if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
4082 return z;
4086 /*----------------------------------------------------------------------------
4087 | Returns the result of adding the absolute values of the extended double-
4088 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
4089 | negated before being returned. `zSign' is ignored if the result is a NaN.
4090 | The addition is performed according to the IEC/IEEE Standard for Binary
4091 | Floating-Point Arithmetic.
4092 *----------------------------------------------------------------------------*/
4094 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
4096 int32 aExp, bExp, zExp;
4097 uint64_t aSig, bSig, zSig0, zSig1;
4098 int32 expDiff;
4100 aSig = extractFloatx80Frac( a );
4101 aExp = extractFloatx80Exp( a );
4102 bSig = extractFloatx80Frac( b );
4103 bExp = extractFloatx80Exp( b );
4104 expDiff = aExp - bExp;
4105 if ( 0 < expDiff ) {
4106 if ( aExp == 0x7FFF ) {
4107 if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4108 return a;
4110 if ( bExp == 0 ) --expDiff;
4111 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4112 zExp = aExp;
4114 else if ( expDiff < 0 ) {
4115 if ( bExp == 0x7FFF ) {
4116 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4117 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4119 if ( aExp == 0 ) ++expDiff;
4120 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4121 zExp = bExp;
4123 else {
4124 if ( aExp == 0x7FFF ) {
4125 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4126 return propagateFloatx80NaN( a, b STATUS_VAR );
4128 return a;
4130 zSig1 = 0;
4131 zSig0 = aSig + bSig;
4132 if ( aExp == 0 ) {
4133 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4134 goto roundAndPack;
4136 zExp = aExp;
4137 goto shiftRight1;
4139 zSig0 = aSig + bSig;
4140 if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
4141 shiftRight1:
4142 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4143 zSig0 |= LIT64( 0x8000000000000000 );
4144 ++zExp;
4145 roundAndPack:
4146 return
4147 roundAndPackFloatx80(
4148 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4152 /*----------------------------------------------------------------------------
4153 | Returns the result of subtracting the absolute values of the extended
4154 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
4155 | difference is negated before being returned. `zSign' is ignored if the
4156 | result is a NaN. The subtraction is performed according to the IEC/IEEE
4157 | Standard for Binary Floating-Point Arithmetic.
4158 *----------------------------------------------------------------------------*/
4160 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
4162 int32 aExp, bExp, zExp;
4163 uint64_t aSig, bSig, zSig0, zSig1;
4164 int32 expDiff;
4165 floatx80 z;
4167 aSig = extractFloatx80Frac( a );
4168 aExp = extractFloatx80Exp( a );
4169 bSig = extractFloatx80Frac( b );
4170 bExp = extractFloatx80Exp( b );
4171 expDiff = aExp - bExp;
4172 if ( 0 < expDiff ) goto aExpBigger;
4173 if ( expDiff < 0 ) goto bExpBigger;
4174 if ( aExp == 0x7FFF ) {
4175 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4176 return propagateFloatx80NaN( a, b STATUS_VAR );
4178 float_raise( float_flag_invalid STATUS_VAR);
4179 z.low = floatx80_default_nan_low;
4180 z.high = floatx80_default_nan_high;
4181 return z;
4183 if ( aExp == 0 ) {
4184 aExp = 1;
4185 bExp = 1;
4187 zSig1 = 0;
4188 if ( bSig < aSig ) goto aBigger;
4189 if ( aSig < bSig ) goto bBigger;
4190 return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
4191 bExpBigger:
4192 if ( bExp == 0x7FFF ) {
4193 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4194 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
4196 if ( aExp == 0 ) ++expDiff;
4197 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4198 bBigger:
4199 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4200 zExp = bExp;
4201 zSign ^= 1;
4202 goto normalizeRoundAndPack;
4203 aExpBigger:
4204 if ( aExp == 0x7FFF ) {
4205 if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4206 return a;
4208 if ( bExp == 0 ) --expDiff;
4209 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4210 aBigger:
4211 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4212 zExp = aExp;
4213 normalizeRoundAndPack:
4214 return
4215 normalizeRoundAndPackFloatx80(
4216 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4220 /*----------------------------------------------------------------------------
4221 | Returns the result of adding the extended double-precision floating-point
4222 | values `a' and `b'. The operation is performed according to the IEC/IEEE
4223 | Standard for Binary Floating-Point Arithmetic.
4224 *----------------------------------------------------------------------------*/
4226 floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
4228 flag aSign, bSign;
4230 aSign = extractFloatx80Sign( a );
4231 bSign = extractFloatx80Sign( b );
4232 if ( aSign == bSign ) {
4233 return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4235 else {
4236 return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4241 /*----------------------------------------------------------------------------
4242 | Returns the result of subtracting the extended double-precision floating-
4243 | point values `a' and `b'. The operation is performed according to the
4244 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4245 *----------------------------------------------------------------------------*/
4247 floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
4249 flag aSign, bSign;
4251 aSign = extractFloatx80Sign( a );
4252 bSign = extractFloatx80Sign( b );
4253 if ( aSign == bSign ) {
4254 return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4256 else {
4257 return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4262 /*----------------------------------------------------------------------------
4263 | Returns the result of multiplying the extended double-precision floating-
4264 | point values `a' and `b'. The operation is performed according to the
4265 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4266 *----------------------------------------------------------------------------*/
4268 floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
4270 flag aSign, bSign, zSign;
4271 int32 aExp, bExp, zExp;
4272 uint64_t aSig, bSig, zSig0, zSig1;
4273 floatx80 z;
4275 aSig = extractFloatx80Frac( a );
4276 aExp = extractFloatx80Exp( a );
4277 aSign = extractFloatx80Sign( a );
4278 bSig = extractFloatx80Frac( b );
4279 bExp = extractFloatx80Exp( b );
4280 bSign = extractFloatx80Sign( b );
4281 zSign = aSign ^ bSign;
4282 if ( aExp == 0x7FFF ) {
4283 if ( (uint64_t) ( aSig<<1 )
4284 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
4285 return propagateFloatx80NaN( a, b STATUS_VAR );
4287 if ( ( bExp | bSig ) == 0 ) goto invalid;
4288 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4290 if ( bExp == 0x7FFF ) {
4291 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4292 if ( ( aExp | aSig ) == 0 ) {
4293 invalid:
4294 float_raise( float_flag_invalid STATUS_VAR);
4295 z.low = floatx80_default_nan_low;
4296 z.high = floatx80_default_nan_high;
4297 return z;
4299 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4301 if ( aExp == 0 ) {
4302 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4303 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4305 if ( bExp == 0 ) {
4306 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
4307 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4309 zExp = aExp + bExp - 0x3FFE;
4310 mul64To128( aSig, bSig, &zSig0, &zSig1 );
4311 if ( 0 < (int64_t) zSig0 ) {
4312 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
4313 --zExp;
4315 return
4316 roundAndPackFloatx80(
4317 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4321 /*----------------------------------------------------------------------------
4322 | Returns the result of dividing the extended double-precision floating-point
4323 | value `a' by the corresponding value `b'. The operation is performed
4324 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4325 *----------------------------------------------------------------------------*/
4327 floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
4329 flag aSign, bSign, zSign;
4330 int32 aExp, bExp, zExp;
4331 uint64_t aSig, bSig, zSig0, zSig1;
4332 uint64_t rem0, rem1, rem2, term0, term1, term2;
4333 floatx80 z;
4335 aSig = extractFloatx80Frac( a );
4336 aExp = extractFloatx80Exp( a );
4337 aSign = extractFloatx80Sign( a );
4338 bSig = extractFloatx80Frac( b );
4339 bExp = extractFloatx80Exp( b );
4340 bSign = extractFloatx80Sign( b );
4341 zSign = aSign ^ bSign;
4342 if ( aExp == 0x7FFF ) {
4343 if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4344 if ( bExp == 0x7FFF ) {
4345 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4346 goto invalid;
4348 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4350 if ( bExp == 0x7FFF ) {
4351 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4352 return packFloatx80( zSign, 0, 0 );
4354 if ( bExp == 0 ) {
4355 if ( bSig == 0 ) {
4356 if ( ( aExp | aSig ) == 0 ) {
4357 invalid:
4358 float_raise( float_flag_invalid STATUS_VAR);
4359 z.low = floatx80_default_nan_low;
4360 z.high = floatx80_default_nan_high;
4361 return z;
4363 float_raise( float_flag_divbyzero STATUS_VAR);
4364 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4366 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4368 if ( aExp == 0 ) {
4369 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4370 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4372 zExp = aExp - bExp + 0x3FFE;
4373 rem1 = 0;
4374 if ( bSig <= aSig ) {
4375 shift128Right( aSig, 0, 1, &aSig, &rem1 );
4376 ++zExp;
4378 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
4379 mul64To128( bSig, zSig0, &term0, &term1 );
4380 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
4381 while ( (int64_t) rem0 < 0 ) {
4382 --zSig0;
4383 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4385 zSig1 = estimateDiv128To64( rem1, 0, bSig );
4386 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
4387 mul64To128( bSig, zSig1, &term1, &term2 );
4388 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4389 while ( (int64_t) rem1 < 0 ) {
4390 --zSig1;
4391 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
4393 zSig1 |= ( ( rem1 | rem2 ) != 0 );
4395 return
4396 roundAndPackFloatx80(
4397 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4401 /*----------------------------------------------------------------------------
4402 | Returns the remainder of the extended double-precision floating-point value
4403 | `a' with respect to the corresponding value `b'. The operation is performed
4404 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4405 *----------------------------------------------------------------------------*/
4407 floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
4409 flag aSign, zSign;
4410 int32 aExp, bExp, expDiff;
4411 uint64_t aSig0, aSig1, bSig;
4412 uint64_t q, term0, term1, alternateASig0, alternateASig1;
4413 floatx80 z;
4415 aSig0 = extractFloatx80Frac( a );
4416 aExp = extractFloatx80Exp( a );
4417 aSign = extractFloatx80Sign( a );
4418 bSig = extractFloatx80Frac( b );
4419 bExp = extractFloatx80Exp( b );
4420 if ( aExp == 0x7FFF ) {
4421 if ( (uint64_t) ( aSig0<<1 )
4422 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
4423 return propagateFloatx80NaN( a, b STATUS_VAR );
4425 goto invalid;
4427 if ( bExp == 0x7FFF ) {
4428 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4429 return a;
4431 if ( bExp == 0 ) {
4432 if ( bSig == 0 ) {
4433 invalid:
4434 float_raise( float_flag_invalid STATUS_VAR);
4435 z.low = floatx80_default_nan_low;
4436 z.high = floatx80_default_nan_high;
4437 return z;
4439 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4441 if ( aExp == 0 ) {
4442 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
4443 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4445 bSig |= LIT64( 0x8000000000000000 );
4446 zSign = aSign;
4447 expDiff = aExp - bExp;
4448 aSig1 = 0;
4449 if ( expDiff < 0 ) {
4450 if ( expDiff < -1 ) return a;
4451 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
4452 expDiff = 0;
4454 q = ( bSig <= aSig0 );
4455 if ( q ) aSig0 -= bSig;
4456 expDiff -= 64;
4457 while ( 0 < expDiff ) {
4458 q = estimateDiv128To64( aSig0, aSig1, bSig );
4459 q = ( 2 < q ) ? q - 2 : 0;
4460 mul64To128( bSig, q, &term0, &term1 );
4461 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4462 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
4463 expDiff -= 62;
4465 expDiff += 64;
4466 if ( 0 < expDiff ) {
4467 q = estimateDiv128To64( aSig0, aSig1, bSig );
4468 q = ( 2 < q ) ? q - 2 : 0;
4469 q >>= 64 - expDiff;
4470 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
4471 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4472 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
4473 while ( le128( term0, term1, aSig0, aSig1 ) ) {
4474 ++q;
4475 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4478 else {
4479 term1 = 0;
4480 term0 = bSig;
4482 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4483 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4484 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4485 && ( q & 1 ) )
4487 aSig0 = alternateASig0;
4488 aSig1 = alternateASig1;
4489 zSign = ! zSign;
4491 return
4492 normalizeRoundAndPackFloatx80(
4493 80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
4497 /*----------------------------------------------------------------------------
4498 | Returns the square root of the extended double-precision floating-point
4499 | value `a'. The operation is performed according to the IEC/IEEE Standard
4500 | for Binary Floating-Point Arithmetic.
4501 *----------------------------------------------------------------------------*/
4503 floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
4505 flag aSign;
4506 int32 aExp, zExp;
4507 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4508 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4509 floatx80 z;
4511 aSig0 = extractFloatx80Frac( a );
4512 aExp = extractFloatx80Exp( a );
4513 aSign = extractFloatx80Sign( a );
4514 if ( aExp == 0x7FFF ) {
4515 if ( (uint64_t) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
4516 if ( ! aSign ) return a;
4517 goto invalid;
4519 if ( aSign ) {
4520 if ( ( aExp | aSig0 ) == 0 ) return a;
4521 invalid:
4522 float_raise( float_flag_invalid STATUS_VAR);
4523 z.low = floatx80_default_nan_low;
4524 z.high = floatx80_default_nan_high;
4525 return z;
4527 if ( aExp == 0 ) {
4528 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4529 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4531 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4532 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4533 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4534 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4535 doubleZSig0 = zSig0<<1;
4536 mul64To128( zSig0, zSig0, &term0, &term1 );
4537 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4538 while ( (int64_t) rem0 < 0 ) {
4539 --zSig0;
4540 doubleZSig0 -= 2;
4541 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4543 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4544 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4545 if ( zSig1 == 0 ) zSig1 = 1;
4546 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4547 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4548 mul64To128( zSig1, zSig1, &term2, &term3 );
4549 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4550 while ( (int64_t) rem1 < 0 ) {
4551 --zSig1;
4552 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4553 term3 |= 1;
4554 term2 |= doubleZSig0;
4555 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4557 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4559 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4560 zSig0 |= doubleZSig0;
4561 return
4562 roundAndPackFloatx80(
4563 STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
4567 /*----------------------------------------------------------------------------
4568 | Returns 1 if the extended double-precision floating-point value `a' is equal
4569 | to the corresponding value `b', and 0 otherwise. The invalid exception is
4570 | raised if either operand is a NaN. Otherwise, the comparison is performed
4571 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4572 *----------------------------------------------------------------------------*/
4574 int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
4577 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4578 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4579 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4580 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4582 float_raise( float_flag_invalid STATUS_VAR);
4583 return 0;
4585 return
4586 ( a.low == b.low )
4587 && ( ( a.high == b.high )
4588 || ( ( a.low == 0 )
4589 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
4594 /*----------------------------------------------------------------------------
4595 | Returns 1 if the extended double-precision floating-point value `a' is
4596 | less than or equal to the corresponding value `b', and 0 otherwise. The
4597 | invalid exception is raised if either operand is a NaN. The comparison is
4598 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4599 | Arithmetic.
4600 *----------------------------------------------------------------------------*/
4602 int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
4604 flag aSign, bSign;
4606 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4607 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4608 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4609 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4611 float_raise( float_flag_invalid STATUS_VAR);
4612 return 0;
4614 aSign = extractFloatx80Sign( a );
4615 bSign = extractFloatx80Sign( b );
4616 if ( aSign != bSign ) {
4617 return
4618 aSign
4619 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4620 == 0 );
4622 return
4623 aSign ? le128( b.high, b.low, a.high, a.low )
4624 : le128( a.high, a.low, b.high, b.low );
4628 /*----------------------------------------------------------------------------
4629 | Returns 1 if the extended double-precision floating-point value `a' is
4630 | less than the corresponding value `b', and 0 otherwise. The invalid
4631 | exception is raised if either operand is a NaN. The comparison is performed
4632 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4633 *----------------------------------------------------------------------------*/
4635 int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
4637 flag aSign, bSign;
4639 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4640 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4641 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4642 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4644 float_raise( float_flag_invalid STATUS_VAR);
4645 return 0;
4647 aSign = extractFloatx80Sign( a );
4648 bSign = extractFloatx80Sign( b );
4649 if ( aSign != bSign ) {
4650 return
4651 aSign
4652 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4653 != 0 );
4655 return
4656 aSign ? lt128( b.high, b.low, a.high, a.low )
4657 : lt128( a.high, a.low, b.high, b.low );
4661 /*----------------------------------------------------------------------------
4662 | Returns 1 if the extended double-precision floating-point values `a' and `b'
4663 | cannot be compared, and 0 otherwise. The invalid exception is raised if
4664 | either operand is a NaN. The comparison is performed according to the
4665 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4666 *----------------------------------------------------------------------------*/
4667 int floatx80_unordered( floatx80 a, floatx80 b STATUS_PARAM )
4669 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4670 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4671 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4672 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4674 float_raise( float_flag_invalid STATUS_VAR);
4675 return 1;
4677 return 0;
4680 /*----------------------------------------------------------------------------
4681 | Returns 1 if the extended double-precision floating-point value `a' is
4682 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4683 | cause an exception. The comparison is performed according to the IEC/IEEE
4684 | Standard for Binary Floating-Point Arithmetic.
4685 *----------------------------------------------------------------------------*/
4687 int floatx80_eq_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4690 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4691 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4692 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4693 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4695 if ( floatx80_is_signaling_nan( a )
4696 || floatx80_is_signaling_nan( b ) ) {
4697 float_raise( float_flag_invalid STATUS_VAR);
4699 return 0;
4701 return
4702 ( a.low == b.low )
4703 && ( ( a.high == b.high )
4704 || ( ( a.low == 0 )
4705 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
4710 /*----------------------------------------------------------------------------
4711 | Returns 1 if the extended double-precision floating-point value `a' is less
4712 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
4713 | do not cause an exception. Otherwise, the comparison is performed according
4714 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4715 *----------------------------------------------------------------------------*/
4717 int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4719 flag aSign, bSign;
4721 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4722 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4723 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4724 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4726 if ( floatx80_is_signaling_nan( a )
4727 || floatx80_is_signaling_nan( b ) ) {
4728 float_raise( float_flag_invalid STATUS_VAR);
4730 return 0;
4732 aSign = extractFloatx80Sign( a );
4733 bSign = extractFloatx80Sign( b );
4734 if ( aSign != bSign ) {
4735 return
4736 aSign
4737 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4738 == 0 );
4740 return
4741 aSign ? le128( b.high, b.low, a.high, a.low )
4742 : le128( a.high, a.low, b.high, b.low );
4746 /*----------------------------------------------------------------------------
4747 | Returns 1 if the extended double-precision floating-point value `a' is less
4748 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
4749 | an exception. Otherwise, the comparison is performed according to the
4750 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4751 *----------------------------------------------------------------------------*/
4753 int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4755 flag aSign, bSign;
4757 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4758 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4759 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4760 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4762 if ( floatx80_is_signaling_nan( a )
4763 || floatx80_is_signaling_nan( b ) ) {
4764 float_raise( float_flag_invalid STATUS_VAR);
4766 return 0;
4768 aSign = extractFloatx80Sign( a );
4769 bSign = extractFloatx80Sign( b );
4770 if ( aSign != bSign ) {
4771 return
4772 aSign
4773 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4774 != 0 );
4776 return
4777 aSign ? lt128( b.high, b.low, a.high, a.low )
4778 : lt128( a.high, a.low, b.high, b.low );
4782 /*----------------------------------------------------------------------------
4783 | Returns 1 if the extended double-precision floating-point values `a' and `b'
4784 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
4785 | The comparison is performed according to the IEC/IEEE Standard for Binary
4786 | Floating-Point Arithmetic.
4787 *----------------------------------------------------------------------------*/
4788 int floatx80_unordered_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4790 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4791 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4792 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4793 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4795 if ( floatx80_is_signaling_nan( a )
4796 || floatx80_is_signaling_nan( b ) ) {
4797 float_raise( float_flag_invalid STATUS_VAR);
4799 return 1;
4801 return 0;
4804 /*----------------------------------------------------------------------------
4805 | Returns the result of converting the quadruple-precision floating-point
4806 | value `a' to the 32-bit two's complement integer format. The conversion
4807 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4808 | Arithmetic---which means in particular that the conversion is rounded
4809 | according to the current rounding mode. If `a' is a NaN, the largest
4810 | positive integer is returned. Otherwise, if the conversion overflows, the
4811 | largest integer with the same sign as `a' is returned.
4812 *----------------------------------------------------------------------------*/
4814 int32 float128_to_int32( float128 a STATUS_PARAM )
4816 flag aSign;
4817 int32 aExp, shiftCount;
4818 uint64_t aSig0, aSig1;
4820 aSig1 = extractFloat128Frac1( a );
4821 aSig0 = extractFloat128Frac0( a );
4822 aExp = extractFloat128Exp( a );
4823 aSign = extractFloat128Sign( a );
4824 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4825 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4826 aSig0 |= ( aSig1 != 0 );
4827 shiftCount = 0x4028 - aExp;
4828 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4829 return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
4833 /*----------------------------------------------------------------------------
4834 | Returns the result of converting the quadruple-precision floating-point
4835 | value `a' to the 32-bit two's complement integer format. The conversion
4836 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4837 | Arithmetic, except that the conversion is always rounded toward zero. If
4838 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
4839 | conversion overflows, the largest integer with the same sign as `a' is
4840 | returned.
4841 *----------------------------------------------------------------------------*/
4843 int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
4845 flag aSign;
4846 int32 aExp, shiftCount;
4847 uint64_t aSig0, aSig1, savedASig;
4848 int32 z;
4850 aSig1 = extractFloat128Frac1( a );
4851 aSig0 = extractFloat128Frac0( a );
4852 aExp = extractFloat128Exp( a );
4853 aSign = extractFloat128Sign( a );
4854 aSig0 |= ( aSig1 != 0 );
4855 if ( 0x401E < aExp ) {
4856 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4857 goto invalid;
4859 else if ( aExp < 0x3FFF ) {
4860 if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
4861 return 0;
4863 aSig0 |= LIT64( 0x0001000000000000 );
4864 shiftCount = 0x402F - aExp;
4865 savedASig = aSig0;
4866 aSig0 >>= shiftCount;
4867 z = aSig0;
4868 if ( aSign ) z = - z;
4869 if ( ( z < 0 ) ^ aSign ) {
4870 invalid:
4871 float_raise( float_flag_invalid STATUS_VAR);
4872 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4874 if ( ( aSig0<<shiftCount ) != savedASig ) {
4875 STATUS(float_exception_flags) |= float_flag_inexact;
4877 return z;
4881 /*----------------------------------------------------------------------------
4882 | Returns the result of converting the quadruple-precision floating-point
4883 | value `a' to the 64-bit two's complement integer format. The conversion
4884 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4885 | Arithmetic---which means in particular that the conversion is rounded
4886 | according to the current rounding mode. If `a' is a NaN, the largest
4887 | positive integer is returned. Otherwise, if the conversion overflows, the
4888 | largest integer with the same sign as `a' is returned.
4889 *----------------------------------------------------------------------------*/
4891 int64 float128_to_int64( float128 a STATUS_PARAM )
4893 flag aSign;
4894 int32 aExp, shiftCount;
4895 uint64_t aSig0, aSig1;
4897 aSig1 = extractFloat128Frac1( a );
4898 aSig0 = extractFloat128Frac0( a );
4899 aExp = extractFloat128Exp( a );
4900 aSign = extractFloat128Sign( a );
4901 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4902 shiftCount = 0x402F - aExp;
4903 if ( shiftCount <= 0 ) {
4904 if ( 0x403E < aExp ) {
4905 float_raise( float_flag_invalid STATUS_VAR);
4906 if ( ! aSign
4907 || ( ( aExp == 0x7FFF )
4908 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4911 return LIT64( 0x7FFFFFFFFFFFFFFF );
4913 return (int64_t) LIT64( 0x8000000000000000 );
4915 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4917 else {
4918 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4920 return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
4924 /*----------------------------------------------------------------------------
4925 | Returns the result of converting the quadruple-precision floating-point
4926 | value `a' to the 64-bit two's complement integer format. The conversion
4927 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4928 | Arithmetic, except that the conversion is always rounded toward zero.
4929 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
4930 | the conversion overflows, the largest integer with the same sign as `a' is
4931 | returned.
4932 *----------------------------------------------------------------------------*/
4934 int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
4936 flag aSign;
4937 int32 aExp, shiftCount;
4938 uint64_t aSig0, aSig1;
4939 int64 z;
4941 aSig1 = extractFloat128Frac1( a );
4942 aSig0 = extractFloat128Frac0( a );
4943 aExp = extractFloat128Exp( a );
4944 aSign = extractFloat128Sign( a );
4945 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4946 shiftCount = aExp - 0x402F;
4947 if ( 0 < shiftCount ) {
4948 if ( 0x403E <= aExp ) {
4949 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4950 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
4951 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4952 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
4954 else {
4955 float_raise( float_flag_invalid STATUS_VAR);
4956 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4957 return LIT64( 0x7FFFFFFFFFFFFFFF );
4960 return (int64_t) LIT64( 0x8000000000000000 );
4962 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4963 if ( (uint64_t) ( aSig1<<shiftCount ) ) {
4964 STATUS(float_exception_flags) |= float_flag_inexact;
4967 else {
4968 if ( aExp < 0x3FFF ) {
4969 if ( aExp | aSig0 | aSig1 ) {
4970 STATUS(float_exception_flags) |= float_flag_inexact;
4972 return 0;
4974 z = aSig0>>( - shiftCount );
4975 if ( aSig1
4976 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4977 STATUS(float_exception_flags) |= float_flag_inexact;
4980 if ( aSign ) z = - z;
4981 return z;
4985 /*----------------------------------------------------------------------------
4986 | Returns the result of converting the quadruple-precision floating-point
4987 | value `a' to the single-precision floating-point format. The conversion
4988 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4989 | Arithmetic.
4990 *----------------------------------------------------------------------------*/
4992 float32 float128_to_float32( float128 a STATUS_PARAM )
4994 flag aSign;
4995 int32 aExp;
4996 uint64_t aSig0, aSig1;
4997 uint32_t zSig;
4999 aSig1 = extractFloat128Frac1( a );
5000 aSig0 = extractFloat128Frac0( a );
5001 aExp = extractFloat128Exp( a );
5002 aSign = extractFloat128Sign( a );
5003 if ( aExp == 0x7FFF ) {
5004 if ( aSig0 | aSig1 ) {
5005 return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5007 return packFloat32( aSign, 0xFF, 0 );
5009 aSig0 |= ( aSig1 != 0 );
5010 shift64RightJamming( aSig0, 18, &aSig0 );
5011 zSig = aSig0;
5012 if ( aExp || zSig ) {
5013 zSig |= 0x40000000;
5014 aExp -= 0x3F81;
5016 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
5020 /*----------------------------------------------------------------------------
5021 | Returns the result of converting the quadruple-precision floating-point
5022 | value `a' to the double-precision floating-point format. The conversion
5023 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5024 | Arithmetic.
5025 *----------------------------------------------------------------------------*/
5027 float64 float128_to_float64( float128 a STATUS_PARAM )
5029 flag aSign;
5030 int32 aExp;
5031 uint64_t aSig0, aSig1;
5033 aSig1 = extractFloat128Frac1( a );
5034 aSig0 = extractFloat128Frac0( a );
5035 aExp = extractFloat128Exp( a );
5036 aSign = extractFloat128Sign( a );
5037 if ( aExp == 0x7FFF ) {
5038 if ( aSig0 | aSig1 ) {
5039 return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5041 return packFloat64( aSign, 0x7FF, 0 );
5043 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5044 aSig0 |= ( aSig1 != 0 );
5045 if ( aExp || aSig0 ) {
5046 aSig0 |= LIT64( 0x4000000000000000 );
5047 aExp -= 0x3C01;
5049 return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
5053 /*----------------------------------------------------------------------------
5054 | Returns the result of converting the quadruple-precision floating-point
5055 | value `a' to the extended double-precision floating-point format. The
5056 | conversion is performed according to the IEC/IEEE Standard for Binary
5057 | Floating-Point Arithmetic.
5058 *----------------------------------------------------------------------------*/
5060 floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
5062 flag aSign;
5063 int32 aExp;
5064 uint64_t aSig0, aSig1;
5066 aSig1 = extractFloat128Frac1( a );
5067 aSig0 = extractFloat128Frac0( a );
5068 aExp = extractFloat128Exp( a );
5069 aSign = extractFloat128Sign( a );
5070 if ( aExp == 0x7FFF ) {
5071 if ( aSig0 | aSig1 ) {
5072 return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5074 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5076 if ( aExp == 0 ) {
5077 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5078 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5080 else {
5081 aSig0 |= LIT64( 0x0001000000000000 );
5083 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
5084 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
5088 /*----------------------------------------------------------------------------
5089 | Rounds the quadruple-precision floating-point value `a' to an integer, and
5090 | returns the result as a quadruple-precision floating-point value. The
5091 | operation is performed according to the IEC/IEEE Standard for Binary
5092 | Floating-Point Arithmetic.
5093 *----------------------------------------------------------------------------*/
5095 float128 float128_round_to_int( float128 a STATUS_PARAM )
5097 flag aSign;
5098 int32 aExp;
5099 uint64_t lastBitMask, roundBitsMask;
5100 int8 roundingMode;
5101 float128 z;
5103 aExp = extractFloat128Exp( a );
5104 if ( 0x402F <= aExp ) {
5105 if ( 0x406F <= aExp ) {
5106 if ( ( aExp == 0x7FFF )
5107 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
5109 return propagateFloat128NaN( a, a STATUS_VAR );
5111 return a;
5113 lastBitMask = 1;
5114 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
5115 roundBitsMask = lastBitMask - 1;
5116 z = a;
5117 roundingMode = STATUS(float_rounding_mode);
5118 if ( roundingMode == float_round_nearest_even ) {
5119 if ( lastBitMask ) {
5120 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
5121 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
5123 else {
5124 if ( (int64_t) z.low < 0 ) {
5125 ++z.high;
5126 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
5130 else if ( roundingMode != float_round_to_zero ) {
5131 if ( extractFloat128Sign( z )
5132 ^ ( roundingMode == float_round_up ) ) {
5133 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
5136 z.low &= ~ roundBitsMask;
5138 else {
5139 if ( aExp < 0x3FFF ) {
5140 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
5141 STATUS(float_exception_flags) |= float_flag_inexact;
5142 aSign = extractFloat128Sign( a );
5143 switch ( STATUS(float_rounding_mode) ) {
5144 case float_round_nearest_even:
5145 if ( ( aExp == 0x3FFE )
5146 && ( extractFloat128Frac0( a )
5147 | extractFloat128Frac1( a ) )
5149 return packFloat128( aSign, 0x3FFF, 0, 0 );
5151 break;
5152 case float_round_down:
5153 return
5154 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
5155 : packFloat128( 0, 0, 0, 0 );
5156 case float_round_up:
5157 return
5158 aSign ? packFloat128( 1, 0, 0, 0 )
5159 : packFloat128( 0, 0x3FFF, 0, 0 );
5161 return packFloat128( aSign, 0, 0, 0 );
5163 lastBitMask = 1;
5164 lastBitMask <<= 0x402F - aExp;
5165 roundBitsMask = lastBitMask - 1;
5166 z.low = 0;
5167 z.high = a.high;
5168 roundingMode = STATUS(float_rounding_mode);
5169 if ( roundingMode == float_round_nearest_even ) {
5170 z.high += lastBitMask>>1;
5171 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
5172 z.high &= ~ lastBitMask;
5175 else if ( roundingMode != float_round_to_zero ) {
5176 if ( extractFloat128Sign( z )
5177 ^ ( roundingMode == float_round_up ) ) {
5178 z.high |= ( a.low != 0 );
5179 z.high += roundBitsMask;
5182 z.high &= ~ roundBitsMask;
5184 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
5185 STATUS(float_exception_flags) |= float_flag_inexact;
5187 return z;
5191 /*----------------------------------------------------------------------------
5192 | Returns the result of adding the absolute values of the quadruple-precision
5193 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
5194 | before being returned. `zSign' is ignored if the result is a NaN.
5195 | The addition is performed according to the IEC/IEEE Standard for Binary
5196 | Floating-Point Arithmetic.
5197 *----------------------------------------------------------------------------*/
5199 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5201 int32 aExp, bExp, zExp;
5202 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5203 int32 expDiff;
5205 aSig1 = extractFloat128Frac1( a );
5206 aSig0 = extractFloat128Frac0( a );
5207 aExp = extractFloat128Exp( a );
5208 bSig1 = extractFloat128Frac1( b );
5209 bSig0 = extractFloat128Frac0( b );
5210 bExp = extractFloat128Exp( b );
5211 expDiff = aExp - bExp;
5212 if ( 0 < expDiff ) {
5213 if ( aExp == 0x7FFF ) {
5214 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5215 return a;
5217 if ( bExp == 0 ) {
5218 --expDiff;
5220 else {
5221 bSig0 |= LIT64( 0x0001000000000000 );
5223 shift128ExtraRightJamming(
5224 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
5225 zExp = aExp;
5227 else if ( expDiff < 0 ) {
5228 if ( bExp == 0x7FFF ) {
5229 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5230 return packFloat128( zSign, 0x7FFF, 0, 0 );
5232 if ( aExp == 0 ) {
5233 ++expDiff;
5235 else {
5236 aSig0 |= LIT64( 0x0001000000000000 );
5238 shift128ExtraRightJamming(
5239 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
5240 zExp = bExp;
5242 else {
5243 if ( aExp == 0x7FFF ) {
5244 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5245 return propagateFloat128NaN( a, b STATUS_VAR );
5247 return a;
5249 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5250 if ( aExp == 0 ) {
5251 if (STATUS(flush_to_zero)) {
5252 if (zSig0 | zSig1) {
5253 float_raise(float_flag_output_denormal STATUS_VAR);
5255 return packFloat128(zSign, 0, 0, 0);
5257 return packFloat128( zSign, 0, zSig0, zSig1 );
5259 zSig2 = 0;
5260 zSig0 |= LIT64( 0x0002000000000000 );
5261 zExp = aExp;
5262 goto shiftRight1;
5264 aSig0 |= LIT64( 0x0001000000000000 );
5265 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5266 --zExp;
5267 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
5268 ++zExp;
5269 shiftRight1:
5270 shift128ExtraRightJamming(
5271 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5272 roundAndPack:
5273 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5277 /*----------------------------------------------------------------------------
5278 | Returns the result of subtracting the absolute values of the quadruple-
5279 | precision floating-point values `a' and `b'. If `zSign' is 1, the
5280 | difference is negated before being returned. `zSign' is ignored if the
5281 | result is a NaN. The subtraction is performed according to the IEC/IEEE
5282 | Standard for Binary Floating-Point Arithmetic.
5283 *----------------------------------------------------------------------------*/
5285 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5287 int32 aExp, bExp, zExp;
5288 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
5289 int32 expDiff;
5290 float128 z;
5292 aSig1 = extractFloat128Frac1( a );
5293 aSig0 = extractFloat128Frac0( a );
5294 aExp = extractFloat128Exp( a );
5295 bSig1 = extractFloat128Frac1( b );
5296 bSig0 = extractFloat128Frac0( b );
5297 bExp = extractFloat128Exp( b );
5298 expDiff = aExp - bExp;
5299 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5300 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
5301 if ( 0 < expDiff ) goto aExpBigger;
5302 if ( expDiff < 0 ) goto bExpBigger;
5303 if ( aExp == 0x7FFF ) {
5304 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5305 return propagateFloat128NaN( a, b STATUS_VAR );
5307 float_raise( float_flag_invalid STATUS_VAR);
5308 z.low = float128_default_nan_low;
5309 z.high = float128_default_nan_high;
5310 return z;
5312 if ( aExp == 0 ) {
5313 aExp = 1;
5314 bExp = 1;
5316 if ( bSig0 < aSig0 ) goto aBigger;
5317 if ( aSig0 < bSig0 ) goto bBigger;
5318 if ( bSig1 < aSig1 ) goto aBigger;
5319 if ( aSig1 < bSig1 ) goto bBigger;
5320 return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
5321 bExpBigger:
5322 if ( bExp == 0x7FFF ) {
5323 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5324 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
5326 if ( aExp == 0 ) {
5327 ++expDiff;
5329 else {
5330 aSig0 |= LIT64( 0x4000000000000000 );
5332 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5333 bSig0 |= LIT64( 0x4000000000000000 );
5334 bBigger:
5335 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
5336 zExp = bExp;
5337 zSign ^= 1;
5338 goto normalizeRoundAndPack;
5339 aExpBigger:
5340 if ( aExp == 0x7FFF ) {
5341 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5342 return a;
5344 if ( bExp == 0 ) {
5345 --expDiff;
5347 else {
5348 bSig0 |= LIT64( 0x4000000000000000 );
5350 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
5351 aSig0 |= LIT64( 0x4000000000000000 );
5352 aBigger:
5353 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5354 zExp = aExp;
5355 normalizeRoundAndPack:
5356 --zExp;
5357 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
5361 /*----------------------------------------------------------------------------
5362 | Returns the result of adding the quadruple-precision floating-point values
5363 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
5364 | for Binary Floating-Point Arithmetic.
5365 *----------------------------------------------------------------------------*/
5367 float128 float128_add( float128 a, float128 b STATUS_PARAM )
5369 flag aSign, bSign;
5371 aSign = extractFloat128Sign( a );
5372 bSign = extractFloat128Sign( b );
5373 if ( aSign == bSign ) {
5374 return addFloat128Sigs( a, b, aSign STATUS_VAR );
5376 else {
5377 return subFloat128Sigs( a, b, aSign STATUS_VAR );
5382 /*----------------------------------------------------------------------------
5383 | Returns the result of subtracting the quadruple-precision floating-point
5384 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5385 | Standard for Binary Floating-Point Arithmetic.
5386 *----------------------------------------------------------------------------*/
5388 float128 float128_sub( float128 a, float128 b STATUS_PARAM )
5390 flag aSign, bSign;
5392 aSign = extractFloat128Sign( a );
5393 bSign = extractFloat128Sign( b );
5394 if ( aSign == bSign ) {
5395 return subFloat128Sigs( a, b, aSign STATUS_VAR );
5397 else {
5398 return addFloat128Sigs( a, b, aSign STATUS_VAR );
5403 /*----------------------------------------------------------------------------
5404 | Returns the result of multiplying the quadruple-precision floating-point
5405 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5406 | Standard for Binary Floating-Point Arithmetic.
5407 *----------------------------------------------------------------------------*/
5409 float128 float128_mul( float128 a, float128 b STATUS_PARAM )
5411 flag aSign, bSign, zSign;
5412 int32 aExp, bExp, zExp;
5413 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
5414 float128 z;
5416 aSig1 = extractFloat128Frac1( a );
5417 aSig0 = extractFloat128Frac0( a );
5418 aExp = extractFloat128Exp( a );
5419 aSign = extractFloat128Sign( a );
5420 bSig1 = extractFloat128Frac1( b );
5421 bSig0 = extractFloat128Frac0( b );
5422 bExp = extractFloat128Exp( b );
5423 bSign = extractFloat128Sign( b );
5424 zSign = aSign ^ bSign;
5425 if ( aExp == 0x7FFF ) {
5426 if ( ( aSig0 | aSig1 )
5427 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5428 return propagateFloat128NaN( a, b STATUS_VAR );
5430 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
5431 return packFloat128( zSign, 0x7FFF, 0, 0 );
5433 if ( bExp == 0x7FFF ) {
5434 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5435 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5436 invalid:
5437 float_raise( float_flag_invalid STATUS_VAR);
5438 z.low = float128_default_nan_low;
5439 z.high = float128_default_nan_high;
5440 return z;
5442 return packFloat128( zSign, 0x7FFF, 0, 0 );
5444 if ( aExp == 0 ) {
5445 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5446 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5448 if ( bExp == 0 ) {
5449 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5450 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5452 zExp = aExp + bExp - 0x4000;
5453 aSig0 |= LIT64( 0x0001000000000000 );
5454 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5455 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5456 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5457 zSig2 |= ( zSig3 != 0 );
5458 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5459 shift128ExtraRightJamming(
5460 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5461 ++zExp;
5463 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5467 /*----------------------------------------------------------------------------
5468 | Returns the result of dividing the quadruple-precision floating-point value
5469 | `a' by the corresponding value `b'. The operation is performed according to
5470 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5471 *----------------------------------------------------------------------------*/
5473 float128 float128_div( float128 a, float128 b STATUS_PARAM )
5475 flag aSign, bSign, zSign;
5476 int32 aExp, bExp, zExp;
5477 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5478 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5479 float128 z;
5481 aSig1 = extractFloat128Frac1( a );
5482 aSig0 = extractFloat128Frac0( a );
5483 aExp = extractFloat128Exp( a );
5484 aSign = extractFloat128Sign( a );
5485 bSig1 = extractFloat128Frac1( b );
5486 bSig0 = extractFloat128Frac0( b );
5487 bExp = extractFloat128Exp( b );
5488 bSign = extractFloat128Sign( b );
5489 zSign = aSign ^ bSign;
5490 if ( aExp == 0x7FFF ) {
5491 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5492 if ( bExp == 0x7FFF ) {
5493 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5494 goto invalid;
5496 return packFloat128( zSign, 0x7FFF, 0, 0 );
5498 if ( bExp == 0x7FFF ) {
5499 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5500 return packFloat128( zSign, 0, 0, 0 );
5502 if ( bExp == 0 ) {
5503 if ( ( bSig0 | bSig1 ) == 0 ) {
5504 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5505 invalid:
5506 float_raise( float_flag_invalid STATUS_VAR);
5507 z.low = float128_default_nan_low;
5508 z.high = float128_default_nan_high;
5509 return z;
5511 float_raise( float_flag_divbyzero STATUS_VAR);
5512 return packFloat128( zSign, 0x7FFF, 0, 0 );
5514 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5516 if ( aExp == 0 ) {
5517 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5518 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5520 zExp = aExp - bExp + 0x3FFD;
5521 shortShift128Left(
5522 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5523 shortShift128Left(
5524 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5525 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5526 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5527 ++zExp;
5529 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5530 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5531 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5532 while ( (int64_t) rem0 < 0 ) {
5533 --zSig0;
5534 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5536 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5537 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5538 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5539 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5540 while ( (int64_t) rem1 < 0 ) {
5541 --zSig1;
5542 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5544 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5546 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5547 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5551 /*----------------------------------------------------------------------------
5552 | Returns the remainder of the quadruple-precision floating-point value `a'
5553 | with respect to the corresponding value `b'. The operation is performed
5554 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5555 *----------------------------------------------------------------------------*/
5557 float128 float128_rem( float128 a, float128 b STATUS_PARAM )
5559 flag aSign, zSign;
5560 int32 aExp, bExp, expDiff;
5561 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5562 uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
5563 int64_t sigMean0;
5564 float128 z;
5566 aSig1 = extractFloat128Frac1( a );
5567 aSig0 = extractFloat128Frac0( a );
5568 aExp = extractFloat128Exp( a );
5569 aSign = extractFloat128Sign( a );
5570 bSig1 = extractFloat128Frac1( b );
5571 bSig0 = extractFloat128Frac0( b );
5572 bExp = extractFloat128Exp( b );
5573 if ( aExp == 0x7FFF ) {
5574 if ( ( aSig0 | aSig1 )
5575 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5576 return propagateFloat128NaN( a, b STATUS_VAR );
5578 goto invalid;
5580 if ( bExp == 0x7FFF ) {
5581 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5582 return a;
5584 if ( bExp == 0 ) {
5585 if ( ( bSig0 | bSig1 ) == 0 ) {
5586 invalid:
5587 float_raise( float_flag_invalid STATUS_VAR);
5588 z.low = float128_default_nan_low;
5589 z.high = float128_default_nan_high;
5590 return z;
5592 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5594 if ( aExp == 0 ) {
5595 if ( ( aSig0 | aSig1 ) == 0 ) return a;
5596 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5598 expDiff = aExp - bExp;
5599 if ( expDiff < -1 ) return a;
5600 shortShift128Left(
5601 aSig0 | LIT64( 0x0001000000000000 ),
5602 aSig1,
5603 15 - ( expDiff < 0 ),
5604 &aSig0,
5605 &aSig1
5607 shortShift128Left(
5608 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5609 q = le128( bSig0, bSig1, aSig0, aSig1 );
5610 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5611 expDiff -= 64;
5612 while ( 0 < expDiff ) {
5613 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5614 q = ( 4 < q ) ? q - 4 : 0;
5615 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5616 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5617 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5618 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5619 expDiff -= 61;
5621 if ( -64 < expDiff ) {
5622 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5623 q = ( 4 < q ) ? q - 4 : 0;
5624 q >>= - expDiff;
5625 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5626 expDiff += 52;
5627 if ( expDiff < 0 ) {
5628 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5630 else {
5631 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5633 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5634 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5636 else {
5637 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5638 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5640 do {
5641 alternateASig0 = aSig0;
5642 alternateASig1 = aSig1;
5643 ++q;
5644 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5645 } while ( 0 <= (int64_t) aSig0 );
5646 add128(
5647 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
5648 if ( ( sigMean0 < 0 )
5649 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5650 aSig0 = alternateASig0;
5651 aSig1 = alternateASig1;
5653 zSign = ( (int64_t) aSig0 < 0 );
5654 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5655 return
5656 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
5660 /*----------------------------------------------------------------------------
5661 | Returns the square root of the quadruple-precision floating-point value `a'.
5662 | The operation is performed according to the IEC/IEEE Standard for Binary
5663 | Floating-Point Arithmetic.
5664 *----------------------------------------------------------------------------*/
5666 float128 float128_sqrt( float128 a STATUS_PARAM )
5668 flag aSign;
5669 int32 aExp, zExp;
5670 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5671 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5672 float128 z;
5674 aSig1 = extractFloat128Frac1( a );
5675 aSig0 = extractFloat128Frac0( a );
5676 aExp = extractFloat128Exp( a );
5677 aSign = extractFloat128Sign( a );
5678 if ( aExp == 0x7FFF ) {
5679 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
5680 if ( ! aSign ) return a;
5681 goto invalid;
5683 if ( aSign ) {
5684 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5685 invalid:
5686 float_raise( float_flag_invalid STATUS_VAR);
5687 z.low = float128_default_nan_low;
5688 z.high = float128_default_nan_high;
5689 return z;
5691 if ( aExp == 0 ) {
5692 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5693 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5695 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5696 aSig0 |= LIT64( 0x0001000000000000 );
5697 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5698 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5699 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5700 doubleZSig0 = zSig0<<1;
5701 mul64To128( zSig0, zSig0, &term0, &term1 );
5702 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5703 while ( (int64_t) rem0 < 0 ) {
5704 --zSig0;
5705 doubleZSig0 -= 2;
5706 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5708 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5709 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5710 if ( zSig1 == 0 ) zSig1 = 1;
5711 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5712 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5713 mul64To128( zSig1, zSig1, &term2, &term3 );
5714 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5715 while ( (int64_t) rem1 < 0 ) {
5716 --zSig1;
5717 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5718 term3 |= 1;
5719 term2 |= doubleZSig0;
5720 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5722 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5724 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5725 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5729 /*----------------------------------------------------------------------------
5730 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5731 | the corresponding value `b', and 0 otherwise. The invalid exception is
5732 | raised if either operand is a NaN. Otherwise, the comparison is performed
5733 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5734 *----------------------------------------------------------------------------*/
5736 int float128_eq( float128 a, float128 b STATUS_PARAM )
5739 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5740 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5741 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5742 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5744 float_raise( float_flag_invalid STATUS_VAR);
5745 return 0;
5747 return
5748 ( a.low == b.low )
5749 && ( ( a.high == b.high )
5750 || ( ( a.low == 0 )
5751 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5756 /*----------------------------------------------------------------------------
5757 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5758 | or equal to the corresponding value `b', and 0 otherwise. The invalid
5759 | exception is raised if either operand is a NaN. The comparison is performed
5760 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5761 *----------------------------------------------------------------------------*/
5763 int float128_le( float128 a, float128 b STATUS_PARAM )
5765 flag aSign, bSign;
5767 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5768 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5769 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5770 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5772 float_raise( float_flag_invalid STATUS_VAR);
5773 return 0;
5775 aSign = extractFloat128Sign( a );
5776 bSign = extractFloat128Sign( b );
5777 if ( aSign != bSign ) {
5778 return
5779 aSign
5780 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5781 == 0 );
5783 return
5784 aSign ? le128( b.high, b.low, a.high, a.low )
5785 : le128( a.high, a.low, b.high, b.low );
5789 /*----------------------------------------------------------------------------
5790 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5791 | the corresponding value `b', and 0 otherwise. The invalid exception is
5792 | raised if either operand is a NaN. The comparison is performed according
5793 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5794 *----------------------------------------------------------------------------*/
5796 int float128_lt( float128 a, float128 b STATUS_PARAM )
5798 flag aSign, bSign;
5800 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5801 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5802 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5803 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5805 float_raise( float_flag_invalid STATUS_VAR);
5806 return 0;
5808 aSign = extractFloat128Sign( a );
5809 bSign = extractFloat128Sign( b );
5810 if ( aSign != bSign ) {
5811 return
5812 aSign
5813 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5814 != 0 );
5816 return
5817 aSign ? lt128( b.high, b.low, a.high, a.low )
5818 : lt128( a.high, a.low, b.high, b.low );
5822 /*----------------------------------------------------------------------------
5823 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
5824 | be compared, and 0 otherwise. The invalid exception is raised if either
5825 | operand is a NaN. The comparison is performed according to the IEC/IEEE
5826 | Standard for Binary Floating-Point Arithmetic.
5827 *----------------------------------------------------------------------------*/
5829 int float128_unordered( float128 a, float128 b STATUS_PARAM )
5831 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5832 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5833 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5834 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5836 float_raise( float_flag_invalid STATUS_VAR);
5837 return 1;
5839 return 0;
5842 /*----------------------------------------------------------------------------
5843 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5844 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5845 | exception. The comparison is performed according to the IEC/IEEE Standard
5846 | for Binary Floating-Point Arithmetic.
5847 *----------------------------------------------------------------------------*/
5849 int float128_eq_quiet( float128 a, float128 b STATUS_PARAM )
5852 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5853 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5854 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5855 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5857 if ( float128_is_signaling_nan( a )
5858 || float128_is_signaling_nan( b ) ) {
5859 float_raise( float_flag_invalid STATUS_VAR);
5861 return 0;
5863 return
5864 ( a.low == b.low )
5865 && ( ( a.high == b.high )
5866 || ( ( a.low == 0 )
5867 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5872 /*----------------------------------------------------------------------------
5873 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5874 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5875 | cause an exception. Otherwise, the comparison is performed according to the
5876 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5877 *----------------------------------------------------------------------------*/
5879 int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
5881 flag aSign, bSign;
5883 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5884 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5885 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5886 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5888 if ( float128_is_signaling_nan( a )
5889 || float128_is_signaling_nan( b ) ) {
5890 float_raise( float_flag_invalid STATUS_VAR);
5892 return 0;
5894 aSign = extractFloat128Sign( a );
5895 bSign = extractFloat128Sign( b );
5896 if ( aSign != bSign ) {
5897 return
5898 aSign
5899 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5900 == 0 );
5902 return
5903 aSign ? le128( b.high, b.low, a.high, a.low )
5904 : le128( a.high, a.low, b.high, b.low );
5908 /*----------------------------------------------------------------------------
5909 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5910 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5911 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
5912 | Standard for Binary Floating-Point Arithmetic.
5913 *----------------------------------------------------------------------------*/
5915 int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
5917 flag aSign, bSign;
5919 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5920 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5921 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5922 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5924 if ( float128_is_signaling_nan( a )
5925 || float128_is_signaling_nan( b ) ) {
5926 float_raise( float_flag_invalid STATUS_VAR);
5928 return 0;
5930 aSign = extractFloat128Sign( a );
5931 bSign = extractFloat128Sign( b );
5932 if ( aSign != bSign ) {
5933 return
5934 aSign
5935 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5936 != 0 );
5938 return
5939 aSign ? lt128( b.high, b.low, a.high, a.low )
5940 : lt128( a.high, a.low, b.high, b.low );
5944 /*----------------------------------------------------------------------------
5945 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
5946 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
5947 | comparison is performed according to the IEC/IEEE Standard for Binary
5948 | Floating-Point Arithmetic.
5949 *----------------------------------------------------------------------------*/
5951 int float128_unordered_quiet( float128 a, float128 b STATUS_PARAM )
5953 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5954 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5955 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5956 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5958 if ( float128_is_signaling_nan( a )
5959 || float128_is_signaling_nan( b ) ) {
5960 float_raise( float_flag_invalid STATUS_VAR);
5962 return 1;
5964 return 0;
5967 /* misc functions */
5968 float32 uint32_to_float32( uint32 a STATUS_PARAM )
5970 return int64_to_float32(a STATUS_VAR);
5973 float64 uint32_to_float64( uint32 a STATUS_PARAM )
5975 return int64_to_float64(a STATUS_VAR);
5978 uint32 float32_to_uint32( float32 a STATUS_PARAM )
5980 int64_t v;
5981 uint32 res;
5983 v = float32_to_int64(a STATUS_VAR);
5984 if (v < 0) {
5985 res = 0;
5986 float_raise( float_flag_invalid STATUS_VAR);
5987 } else if (v > 0xffffffff) {
5988 res = 0xffffffff;
5989 float_raise( float_flag_invalid STATUS_VAR);
5990 } else {
5991 res = v;
5993 return res;
5996 uint32 float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
5998 int64_t v;
5999 uint32 res;
6001 v = float32_to_int64_round_to_zero(a STATUS_VAR);
6002 if (v < 0) {
6003 res = 0;
6004 float_raise( float_flag_invalid STATUS_VAR);
6005 } else if (v > 0xffffffff) {
6006 res = 0xffffffff;
6007 float_raise( float_flag_invalid STATUS_VAR);
6008 } else {
6009 res = v;
6011 return res;
6014 uint16 float32_to_uint16_round_to_zero( float32 a STATUS_PARAM )
6016 int64_t v;
6017 uint16 res;
6019 v = float32_to_int64_round_to_zero(a STATUS_VAR);
6020 if (v < 0) {
6021 res = 0;
6022 float_raise( float_flag_invalid STATUS_VAR);
6023 } else if (v > 0xffff) {
6024 res = 0xffff;
6025 float_raise( float_flag_invalid STATUS_VAR);
6026 } else {
6027 res = v;
6029 return res;
6032 uint32 float64_to_uint32( float64 a STATUS_PARAM )
6034 int64_t v;
6035 uint32 res;
6037 v = float64_to_int64(a STATUS_VAR);
6038 if (v < 0) {
6039 res = 0;
6040 float_raise( float_flag_invalid STATUS_VAR);
6041 } else if (v > 0xffffffff) {
6042 res = 0xffffffff;
6043 float_raise( float_flag_invalid STATUS_VAR);
6044 } else {
6045 res = v;
6047 return res;
6050 uint32 float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
6052 int64_t v;
6053 uint32 res;
6055 v = float64_to_int64_round_to_zero(a STATUS_VAR);
6056 if (v < 0) {
6057 res = 0;
6058 float_raise( float_flag_invalid STATUS_VAR);
6059 } else if (v > 0xffffffff) {
6060 res = 0xffffffff;
6061 float_raise( float_flag_invalid STATUS_VAR);
6062 } else {
6063 res = v;
6065 return res;
6068 uint16 float64_to_uint16_round_to_zero( float64 a STATUS_PARAM )
6070 int64_t v;
6071 uint16 res;
6073 v = float64_to_int64_round_to_zero(a STATUS_VAR);
6074 if (v < 0) {
6075 res = 0;
6076 float_raise( float_flag_invalid STATUS_VAR);
6077 } else if (v > 0xffff) {
6078 res = 0xffff;
6079 float_raise( float_flag_invalid STATUS_VAR);
6080 } else {
6081 res = v;
6083 return res;
6086 /* FIXME: This looks broken. */
6087 uint64_t float64_to_uint64 (float64 a STATUS_PARAM)
6089 int64_t v;
6091 v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
6092 v += float64_val(a);
6093 v = float64_to_int64(make_float64(v) STATUS_VAR);
6095 return v - INT64_MIN;
6098 uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
6100 int64_t v;
6102 v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
6103 v += float64_val(a);
6104 v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR);
6106 return v - INT64_MIN;
6109 #define COMPARE(s, nan_exp) \
6110 INLINE int float ## s ## _compare_internal( float ## s a, float ## s b, \
6111 int is_quiet STATUS_PARAM ) \
6113 flag aSign, bSign; \
6114 uint ## s ## _t av, bv; \
6115 a = float ## s ## _squash_input_denormal(a STATUS_VAR); \
6116 b = float ## s ## _squash_input_denormal(b STATUS_VAR); \
6118 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \
6119 extractFloat ## s ## Frac( a ) ) || \
6120 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \
6121 extractFloat ## s ## Frac( b ) )) { \
6122 if (!is_quiet || \
6123 float ## s ## _is_signaling_nan( a ) || \
6124 float ## s ## _is_signaling_nan( b ) ) { \
6125 float_raise( float_flag_invalid STATUS_VAR); \
6127 return float_relation_unordered; \
6129 aSign = extractFloat ## s ## Sign( a ); \
6130 bSign = extractFloat ## s ## Sign( b ); \
6131 av = float ## s ## _val(a); \
6132 bv = float ## s ## _val(b); \
6133 if ( aSign != bSign ) { \
6134 if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) { \
6135 /* zero case */ \
6136 return float_relation_equal; \
6137 } else { \
6138 return 1 - (2 * aSign); \
6140 } else { \
6141 if (av == bv) { \
6142 return float_relation_equal; \
6143 } else { \
6144 return 1 - 2 * (aSign ^ ( av < bv )); \
6149 int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM ) \
6151 return float ## s ## _compare_internal(a, b, 0 STATUS_VAR); \
6154 int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM ) \
6156 return float ## s ## _compare_internal(a, b, 1 STATUS_VAR); \
6159 COMPARE(32, 0xff)
6160 COMPARE(64, 0x7ff)
6162 INLINE int floatx80_compare_internal( floatx80 a, floatx80 b,
6163 int is_quiet STATUS_PARAM )
6165 flag aSign, bSign;
6167 if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
6168 ( extractFloatx80Frac( a )<<1 ) ) ||
6169 ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
6170 ( extractFloatx80Frac( b )<<1 ) )) {
6171 if (!is_quiet ||
6172 floatx80_is_signaling_nan( a ) ||
6173 floatx80_is_signaling_nan( b ) ) {
6174 float_raise( float_flag_invalid STATUS_VAR);
6176 return float_relation_unordered;
6178 aSign = extractFloatx80Sign( a );
6179 bSign = extractFloatx80Sign( b );
6180 if ( aSign != bSign ) {
6182 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
6183 ( ( a.low | b.low ) == 0 ) ) {
6184 /* zero case */
6185 return float_relation_equal;
6186 } else {
6187 return 1 - (2 * aSign);
6189 } else {
6190 if (a.low == b.low && a.high == b.high) {
6191 return float_relation_equal;
6192 } else {
6193 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6198 int floatx80_compare( floatx80 a, floatx80 b STATUS_PARAM )
6200 return floatx80_compare_internal(a, b, 0 STATUS_VAR);
6203 int floatx80_compare_quiet( floatx80 a, floatx80 b STATUS_PARAM )
6205 return floatx80_compare_internal(a, b, 1 STATUS_VAR);
6208 INLINE int float128_compare_internal( float128 a, float128 b,
6209 int is_quiet STATUS_PARAM )
6211 flag aSign, bSign;
6213 if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
6214 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
6215 ( ( extractFloat128Exp( b ) == 0x7fff ) &&
6216 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
6217 if (!is_quiet ||
6218 float128_is_signaling_nan( a ) ||
6219 float128_is_signaling_nan( b ) ) {
6220 float_raise( float_flag_invalid STATUS_VAR);
6222 return float_relation_unordered;
6224 aSign = extractFloat128Sign( a );
6225 bSign = extractFloat128Sign( b );
6226 if ( aSign != bSign ) {
6227 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
6228 /* zero case */
6229 return float_relation_equal;
6230 } else {
6231 return 1 - (2 * aSign);
6233 } else {
6234 if (a.low == b.low && a.high == b.high) {
6235 return float_relation_equal;
6236 } else {
6237 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6242 int float128_compare( float128 a, float128 b STATUS_PARAM )
6244 return float128_compare_internal(a, b, 0 STATUS_VAR);
6247 int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
6249 return float128_compare_internal(a, b, 1 STATUS_VAR);
6252 /* min() and max() functions. These can't be implemented as
6253 * 'compare and pick one input' because that would mishandle
6254 * NaNs and +0 vs -0.
6256 #define MINMAX(s, nan_exp) \
6257 INLINE float ## s float ## s ## _minmax(float ## s a, float ## s b, \
6258 int ismin STATUS_PARAM ) \
6260 flag aSign, bSign; \
6261 uint ## s ## _t av, bv; \
6262 a = float ## s ## _squash_input_denormal(a STATUS_VAR); \
6263 b = float ## s ## _squash_input_denormal(b STATUS_VAR); \
6264 if (float ## s ## _is_any_nan(a) || \
6265 float ## s ## _is_any_nan(b)) { \
6266 return propagateFloat ## s ## NaN(a, b STATUS_VAR); \
6268 aSign = extractFloat ## s ## Sign(a); \
6269 bSign = extractFloat ## s ## Sign(b); \
6270 av = float ## s ## _val(a); \
6271 bv = float ## s ## _val(b); \
6272 if (aSign != bSign) { \
6273 if (ismin) { \
6274 return aSign ? a : b; \
6275 } else { \
6276 return aSign ? b : a; \
6278 } else { \
6279 if (ismin) { \
6280 return (aSign ^ (av < bv)) ? a : b; \
6281 } else { \
6282 return (aSign ^ (av < bv)) ? b : a; \
6287 float ## s float ## s ## _min(float ## s a, float ## s b STATUS_PARAM) \
6289 return float ## s ## _minmax(a, b, 1 STATUS_VAR); \
6292 float ## s float ## s ## _max(float ## s a, float ## s b STATUS_PARAM) \
6294 return float ## s ## _minmax(a, b, 0 STATUS_VAR); \
6297 MINMAX(32, 0xff)
6298 MINMAX(64, 0x7ff)
6301 /* Multiply A by 2 raised to the power N. */
6302 float32 float32_scalbn( float32 a, int n STATUS_PARAM )
6304 flag aSign;
6305 int16_t aExp;
6306 uint32_t aSig;
6308 a = float32_squash_input_denormal(a STATUS_VAR);
6309 aSig = extractFloat32Frac( a );
6310 aExp = extractFloat32Exp( a );
6311 aSign = extractFloat32Sign( a );
6313 if ( aExp == 0xFF ) {
6314 if ( aSig ) {
6315 return propagateFloat32NaN( a, a STATUS_VAR );
6317 return a;
6319 if ( aExp != 0 )
6320 aSig |= 0x00800000;
6321 else if ( aSig == 0 )
6322 return a;
6324 if (n > 0x200) {
6325 n = 0x200;
6326 } else if (n < -0x200) {
6327 n = -0x200;
6330 aExp += n - 1;
6331 aSig <<= 7;
6332 return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
6335 float64 float64_scalbn( float64 a, int n STATUS_PARAM )
6337 flag aSign;
6338 int16_t aExp;
6339 uint64_t aSig;
6341 a = float64_squash_input_denormal(a STATUS_VAR);
6342 aSig = extractFloat64Frac( a );
6343 aExp = extractFloat64Exp( a );
6344 aSign = extractFloat64Sign( a );
6346 if ( aExp == 0x7FF ) {
6347 if ( aSig ) {
6348 return propagateFloat64NaN( a, a STATUS_VAR );
6350 return a;
6352 if ( aExp != 0 )
6353 aSig |= LIT64( 0x0010000000000000 );
6354 else if ( aSig == 0 )
6355 return a;
6357 if (n > 0x1000) {
6358 n = 0x1000;
6359 } else if (n < -0x1000) {
6360 n = -0x1000;
6363 aExp += n - 1;
6364 aSig <<= 10;
6365 return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
6368 floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
6370 flag aSign;
6371 int32_t aExp;
6372 uint64_t aSig;
6374 aSig = extractFloatx80Frac( a );
6375 aExp = extractFloatx80Exp( a );
6376 aSign = extractFloatx80Sign( a );
6378 if ( aExp == 0x7FFF ) {
6379 if ( aSig<<1 ) {
6380 return propagateFloatx80NaN( a, a STATUS_VAR );
6382 return a;
6385 if (aExp == 0 && aSig == 0)
6386 return a;
6388 if (n > 0x10000) {
6389 n = 0x10000;
6390 } else if (n < -0x10000) {
6391 n = -0x10000;
6394 aExp += n;
6395 return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
6396 aSign, aExp, aSig, 0 STATUS_VAR );
6399 float128 float128_scalbn( float128 a, int n STATUS_PARAM )
6401 flag aSign;
6402 int32_t aExp;
6403 uint64_t aSig0, aSig1;
6405 aSig1 = extractFloat128Frac1( a );
6406 aSig0 = extractFloat128Frac0( a );
6407 aExp = extractFloat128Exp( a );
6408 aSign = extractFloat128Sign( a );
6409 if ( aExp == 0x7FFF ) {
6410 if ( aSig0 | aSig1 ) {
6411 return propagateFloat128NaN( a, a STATUS_VAR );
6413 return a;
6415 if ( aExp != 0 )
6416 aSig0 |= LIT64( 0x0001000000000000 );
6417 else if ( aSig0 == 0 && aSig1 == 0 )
6418 return a;
6420 if (n > 0x10000) {
6421 n = 0x10000;
6422 } else if (n < -0x10000) {
6423 n = -0x10000;
6426 aExp += n - 1;
6427 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
6428 STATUS_VAR );